def identify_language(self, document, default_lang=None):
    # Extract ngrams
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)

    # Create frequency distributions
    doc_fdist = FreqDist(unigrams + bigrams)
    predicted_lang = default_lang
    max_sim = 0.5
    for k, v in self._prototypes.items():
        sim = cosineOnDicts(v, doc_fdist, self._union)
        if sim > max_sim:
            max_sim = sim
            predicted_lang = k

    return predicted_lang

def get_score(self, document, lang):
    # Extract ngrams
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create frequency distributions
    doc_fdist = FreqDist(unigrams + bigrams)
    sim = cosineOnDicts(self._prototypes[lang], doc_fdist, self._union)
    return sim
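# Note: identify_language()/get_score() above rely on a cosineOnDicts() helper that
# is not shown in this snippet. A minimal sketch of what it presumably does
# (cosine similarity between two frequency distributions over a shared key set;
# the signature is an assumption):
import math

def cosineOnDicts(a, b, union):
    # Dot product and norms computed over the combined vocabulary `union`.
    dot = sum(a.get(k, 0) * b.get(k, 0) for k in union)
    norm_a = math.sqrt(sum(a.get(k, 0) ** 2 for k in union))
    norm_b = math.sqrt(sum(b.get(k, 0) ** 2 for k in union))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)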
def find_version(text):
    digit_pattern = r"(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)"
    pattern = "\s?[vV]ersion\s?" + digit_pattern
    pattern += "| [vV]er\s?\.?\s?" + digit_pattern
    pattern += "| [vV]\s?\.?\s?" + digit_pattern
    version_matches = regexp_tokenize(text, pattern)
    pattern = digit_pattern + "$"
    versions = []
    for version in version_matches:
        matches = regexp_tokenize(version, pattern)
        for match in matches:
            versions.append(match)
    return versions
def are_duplicates(doc1, doc2):
    if len(doc1) > 50 and len(doc2) > 50 and  not are_duplicates(doc1[:50], doc2[:50]): 
        return False
    txt_tokens_1 = regexp_tokenize(doc1, pattern_words)
    txt_tokens_2 = regexp_tokenize(doc2, pattern_words)
    ngrams_1 = txt_tokens_1 + generate_ngrams(txt_tokens_1, 2)
    ngrams_2 = txt_tokens_2 + generate_ngrams(txt_tokens_2, 2)
    overlap = len([w for w in ngrams_1 if w in ngrams_2])
    score = (2.0 * overlap) / (len(ngrams_1) + len(ngrams_2) + 1)
    return score > 0.8
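# are_duplicates() uses a generate_ngrams() helper that is not included in this
# snippet; a minimal sketch under the assumption that it joins n consecutive tokens:
def generate_ngrams(tokens, n):
    # e.g. generate_ngrams(['a', 'b', 'c'], 2) -> ['a b', 'b c']
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]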
def main():
    args = argument_parser.main()
    global sql
    sql = SQLConnector(host=args.host, port=args.port, user=args.user, passwd=args.password, db=args.db)
    global bing
    bing = BingSearch()
    global new_software
    new_software = NewSoftware()
    global possible_tags
    possible_tags = []
    mongo = MongoConnector(host=args.H, db=args.db)
    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print "No tweets left to analyse"
            break

        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()
                # text = "Version 2 Microsoft just released MS Office ver 3.20.2 for 99 cent 100c 10ps 13pence 10 pence"

                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()

                versions = find_version(text)

                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                # print words
                prices = find_price(words)

                pos_ = pos(words)
                ngram = ngrams(words, 5)

                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)
                    if tweet_id in possible_tags:
                        print tweet_id
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print tweet
                            print tagged_tweet
                            print
                            # mongo.insert(tagged_tweet)
                        else:
                            print tweet, "No software"
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError, e:
                    # This will allow the tweet to be tagged again at a later stage
                    print tweet_id + ":", e
                    print tweet
                    print
def word_split(text):
    """
    Split a text into words. Returns a list of the lowercase word tokens.
    """
    a = regexp_tokenize(text.lower().strip(), pattern=r'\w+') 
    return a
Example #7
    def simhash(raw_text):
        """Compute the simhash value for a string."""
        fdist = FreqDist()
        for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
            fdist.inc(word.lower())

        v = [0] * 128

        for word in fdist:
            projection = bitarray()
            projection.fromstring(hashlib.md5(word).digest())
            #print "\tw:%s, %d" % (word, fdist[word])
            #print "\t\t 128 bit hash: " + str(b)

            for i in xrange(128):
                if projection[i]:
                    v[i] += fdist.get(word)
                else:
                    v[i] -= fdist.get(word)


        hash_val = bitarray(128)
        hash_val.setall(False)

        for i in xrange(128):
            if v[i] > 0:
                hash_val[i] = True
        return hash_val
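# Simhash fingerprints are normally compared by Hamming distance; a short hedged
# sketch (assuming two 128-bit bitarray values as returned by simhash() above,
# and an illustrative threshold of 3 differing bits):
def simhash_near_duplicates(hash_a, hash_b, threshold=3):
    # XOR leaves a 1 wherever the fingerprints differ; count those bits.
    return (hash_a ^ hash_b).count(True) <= threshold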
Example #8
def tokenizeList(tokenList):
    # remove stop words, punctuation & stem words to create tokens out of phrases and names
    tokenized_list = []

    for item in tokenList:
        tokenized = regexp_tokenize(item.lower(), "[\w']+")
        for word in tokenized:
            if word not in english_stops:
                stemmed = stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation)
                if not stemmed.isalpha():
                    if stemmed.isdigit():
                        stemmed = 'NUMBER'
                        tokenized_list.append(stemmed)
                    elif stemmed.isalnum():
                        stemmed = 'ALPHANUM'
                        tokenized_list.append(stemmed)
                else:
                    tokenized_list.append(stemmed)
        '''
        filtered = [word for word in tokenized if word not in english_stops]
        stemmed  = [stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None,string.punctuation) for word in filtered]
        stemmed  = [word for word in stemmed if word !='']
        tokenized_list.extend(stemmed)
        '''

    return tokenized_list
Example #9
    def __init__(self, sentence):
        self.sentence = sentence

        self.forms = []
        for s in tuple(open(FORMS, "r")):  # read the user_forms from file
            self.forms.append([w for w in regexp_tokenize(s, "[-\w]+") if w.isalnum()])

        if self.is_valid():
            self.tokens = regexp_tokenize(self.sentence, "(\\$)|[-\w]+")  # tokenizing with regex
            self.stop_words = set(stop.words("english"))  # filtering tokens words to remove
            self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
            self.spell_checked = self.spell_check()
            self.tags = pos_tag(self.spell_checked, tagset="universal")  # speech tagging (identification)
            print(self.tags)
            self.digits = self.get_digits()
            self.user_form = self.get_user_form()
def getTokenizedQueries():
	queriesFileName = "../cacm.query"

	f = open(queriesFileName, 'r')
	i = 0
	queriesList = {}
	isText = False
	for lineWithEnter in f:
		line = lineWithEnter[:-1]

		if len(line) == 0:
			continue
		elif line[0] == '<' or (line[0] == ' ' and len(line) == 1):
			isText = False
			continue
		else:
			if not isText:
				isText = True
				queriesList[i] = ""
				queriesList[i] += line
				i += 1
			else:
				queriesList[i - 1] += " "
				queriesList[i - 1] += line
			# print line

	tokenizedQueriesList = {}
	for q in queriesList:
		tokenizedQueriesList[q] = regexp_tokenize(queriesList[q], pattern='[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+')

	return tokenizedQueriesList
Example #11
	def tag_and_tokenize(self,file):
		'''Tokenizes, chunks and tags the text; the bulk of the script's work (time) is done here'''
		self.text = get_docx_text(file)
		self.sentences = ""
		print("Tokenize and tagging...")
		self.sentences = regexp_tokenize(self.text, pattern='\w+|\$[\d\.]+|\S+')
		self.sentences = [st.tag(self.sentences)]
		print("Tagging done")
Example #12
def words(text, splitContractions=False, contractChars = ["'"]):
    '''uses a regexpTokenizer to tokenize text to words. If splitContractions is true,
    the regex pattern is [\w]+ so that contractions are split, e.g. "I can't" -> ['I','can','t'],
    otherwise the regex pattern is [\w']+ so that contractions are not split, i.e. "I can't" -> ['I', "can't"]
    Additional contraction characters, e.g. a hyphen, can be added by overriding the contractChars arg'''
    if splitContractions:
        pat = "[\w]+"
    else:
        pat = "[\w{0}]+".format(reduce(lambda x,y: x+y, contractChars, ""))
    return regexp_tokenize(text, pat, discard_empty=True)
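# A quick usage check of the two modes described in the docstring (note that on
# Python 3 the function above also needs `from functools import reduce`):
print(words("I can't do that"))                          # ['I', "can't", 'do', 'that']
print(words("I can't do that", splitContractions=True))  # ['I', 'can', 't', 'do', 'that']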
Example #13
def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)

            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item,request.POST.get("token_"+str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
                symbols = unique_list(word for sent in corpus for (word,tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)

            return render(request, 'tagger/index.html', {'corpus': corpus})

        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})

            else:
                address = request.POST.get("address")

            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,', )

            if tokens:
                pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()

                tagged = hmm.tag(tokens)

                tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()

                return render(request, 'tagger/index.html', {'address': address,
                                                              'tokens': tokens,
                                                              'tagged': tagged,
                                                              'tags': sorted(tags.items(), key=operator.itemgetter(1)) })

    return render(request, 'tagger/index.html', {})
def getReviews(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder,filename),'r') as src:
                review = src.read()
                words = regexp_tokenize(review,"\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    return reviews
 def _tokenize_content(self):
     tokenized_content = []
     raw_content = self._clean_content()
     content_sents = sent_tokenize(raw_content)
     content_words_by_sents = map(lambda sent: word_tokenize(sent), content_sents)
     stopwords = regexp_tokenize(STOPWORDS, "[\w']+")
     extra_puncts = ['),', ').', '%),', '%).', '):', '()', '://', '>.', '.;', '...', '/>.']
     puncts = list(punctuation) + extra_puncts
     stopwords.extend(puncts)
     for sent in content_words_by_sents:
         clean_sent = [word for word in sent if word not in stopwords]
         tokenized_content.append(clean_sent)
     return tokenized_content
def get_features(review,polarity):
    features = {}
    uniqueWords = 0
    personalRatio = 0
    personal = 0
    misspelt = 0
    hotelName = 0
    personalPronouns = ["i","me","we","our","ours","mine"]
    sentences = sent_tokenize(review)
    sent = nltk.word_tokenize(review)

    s = len(sentences)
    wordsR = regexp_tokenize(review,"\w+")
    for x in wordsR:
        if x in personalPronouns:
            personal+=1
        #if x not in set(words.words()):
            #misspelt+=1
        if x in hotels:
            hotelName+=1
    w = len(wordsR)
    unique = len(set(wordsR))
    uniqueWords+=unique
    review = review.replace(" ","")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for i in range(len(review)):
        if review[i].isupper():
            cap+=1
        if review[i] == '$':
            features['dollar'] = True
    ari =4.71*(float(c)/w)+0.5*(float(w)/s)-21.43
    capRatio = c/float(s)
    personalRatio += float(personal)/w
    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    ngrams = get_bigrams(review,'x')
    sentiments = get_sentimentFeatures(review,'x')
    for x in ngrams.keys():
        features[x] = ngrams[x]
    for x in sentiments.keys():
        features[x] = sentiments[x]
    features['misspelt'] = misspelt
    return features
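# The `ari` computed above is the Automated Readability Index,
#   ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43;
# a tiny worked example with illustrative counts:
chars, n_words, n_sents = 250.0, 50.0, 4.0
print(round(4.71 * (chars / n_words) + 0.5 * (n_words / n_sents) - 21.43, 2))  # 8.37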
def tokenize_text(page_text):
    """
    Tokenizes text using NLTK and regEx   
    """

    pattern = r"""(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]"""

    tokens = regexp_tokenize(page_text.strip().lower(), pattern)
    tokens = [cleanup(w) for w in tokens]

    tokens = [w for w in tokens if ((len(w) > 1) and (money(w) or alpha_num(w)))]

    tokens = [LMTZR.lemmatize(w) for w in tokens]

    return tokens
def tokenize_text(page_text):
    '''
    Tokenizes text using NLTK and regEx   
    '''

    pattern = r'''(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]'''
    remove_list = ["[", "]", "{", "}", "(", ")", 
              "'", ".", "..", "...", ",", "?", "!", 
              "/", "\"", "\"", ";", ":", "-", "�", "_", "�", "�", 
              "`", "~", "@", "$", "^", "|", "#", "=", "*", "?"];
    ## converting to lower case may affect the performance
    tokens = regexp_tokenize(page_text, pattern)

    ## Removes unnecessary words 
    wt = [w for w in tokens if ((w not in remove_list) and (len(w) > 1))];        

    return wt;
def countW(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder,filename),'r') as src:
                review = src.read()
                words = regexp_tokenize(review,"\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    unique = set(unique)
    uniqueR = []
    for w in unique:
        if w not in stopwords.words('english'):
            uniqueR.append(w)
    print (len(set(uniqueR)))
def calculateAGARI(rootdir):
    avgARI = 0
    count = 0
    uniqueWords = 0
    personalRatio = 0
    dollarCount = 0
    personalPronouns = ["i","me","we","our","ours","mine"]
    hotelName = 0
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                personal = 0
                sentences = sent_tokenize(review)
                s = len(sentences)
                capitals = 0
                words = regexp_tokenize(review,"\w+")
                for x in words:
                    if x in personalPronouns:
                        personal+=1
                    if x in hotels:
                        hotelName+=1
                w = len(words)
                unique = len(set(words))
                uniqueWords+=unique
                review = review.replace(" ","")
                flag = "f"
                for i in range(len(review)):
                    if review[i].isupper():
                        capitals+=1
                    if review[i] == '$':
                        flag = "t"
                if flag=="t":
                    dollarCount+=1
                c = len(review)
                ari =4.71*(float(c)/w)+0.5*(float(w)/s)-21.43
                avgARI += ari
                count += 1
                personalRatio += float(personal)/w
                #print(nltk.ne_chunk(review))
    print("\n"+rootdir)
    print("ARI : "+str(float(avgARI/count)))
    print("Unique words"+" "+str(uniqueWords/float(count)))
    print("Ratio personal : "+str(personalRatio/float(count)))
    print("DollarCount :"+str(dollarCount))
Example #21
    def _tag(self, tweet):
        tweet_id = str(tweet[0])
        original = tweet[1].decode('utf-8', 'ignore')
        text = original.lower().replace('#','').strip()
        #text = "download 60 hundred pounds 72 million $800 billion pounds holiday havoc v2 in itunes for free 99"

        urls = find_url(original)
        for url in urls:
            text = text.replace(url.lower(), "").strip()

        word_freqs = word_frequencies(text)
        #print word_freqs

        versions = find_version(text)

        words = regexp_tokenize(text, pattern=r'\w+([.,]\w+)*|\S+')
        prices = find_price(words)

        five_gram = self._create_ngram(tokenized=words, gram_length=5)

        tagged_tweet = self._ngram_tagger(five_gram, tweet_id)
        tagged_tweet.add('sentiment', tweet[2])
        tagged_tweet.add('tweet', original)
        tagged_tweet.add('url', urls)
        tagged_tweet.add('version', versions)
        tagged_tweet.add('price', prices)

        if tagged_tweet.contains('software_name'):
            query = {'software_name':tagged_tweet.get('software_name')}
            words = {}
            for w in word_freqs:
                words['words.'+w] = word_freqs[w]
            #print query, words
            self._mongo.update_freqs(query,words)

        return tagged_tweet
import nltk
from nltk.tokenize import regexp_tokenize
import math 
import collections
from operator import itemgetter 

if __name__ == '__main__':
	# preparing files for read & write
	wordsContent = open ('words.txt', 'rU')
	documentContent = open ('documents.txt', 'rU')
	numDocsContainWordContent = open('numDocsContainWord.txt', "w", 0)

	# read words into a wordList
	wordList = []
	for line in wordsContent:
		tabs = line.split('\t')
		wordList.append(tabs[0])
	
	# read documents into a documentDict
	documentDict = {}
	for line in documentContent:
		tabs = line.split('\t')
		documentDict[tabs[0]] = regexp_tokenize(tabs[1], "[\w'#@]+")
		
	# save numDocsContainWord into a file
	for word in wordList:
		count = 0
		for documentId in documentDict.keys():
			if (word in documentDict[documentId]):
				count += 1
		numDocsContainWordContent.write("%s\t%s\n" % (word, count))
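# The per-word document counts written above are typically converted into IDF
# weights, idf(w) = log(N / df(w)); a hedged follow-up sketch reusing the
# structures built in this script:
import math
idf = {}
num_docs = len(documentDict)
for word in wordList:
    df = sum(1 for doc in documentDict.values() if word in doc)
    idf[word] = math.log(num_docs / (1.0 + df))  # +1 guards against zero document frequency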
 def _tokenize_document(self, document):
     return regexp_tokenize(document, pattern_words)
Example #24
def renltk_tokenize(text):
    text = clean_text(text)
    words = regexp_tokenize(text, pattern='\s+', gaps=True)
    return words
def tokenize_grafs(text):
    return regexp_tokenize(text, r'\<\/p\>', gaps=True)
Example #26
from nltk.tokenize import regexp_tokenize
str_in = input()
x = int(input())
result = regexp_tokenize(str_in, r"[A-Za-z]+")
print(result[x])
Example #27
def _separate_text_into_words(text: str) -> List[str]:
    regex = r'\w+'
    return regexp_tokenize(text, regex)
print(re.match(pattern2, sentences[0]))

#%%
#                       Regex with NLTK tokenization
# =============================================================================

tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning', 'Thanks @datacamp :) #nlp #python'
]

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"

# Use the pattern on the first tweet in the tweets list
regexp_tokenize(tweets[0], pattern1)

# Write a pattern that matches both mentions and hashtags
pattern2 = r"([@]\w+)"

# Use the pattern on the last tweet in the tweets list
regexp_tokenize(tweets[-1], pattern2)

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

#%%
#                           Non-ascii tokenization
# =============================================================================
def is_spanish(document):
	stopwords = nltk.corpus.stopwords.words('spanish') 
	for word in regexp_tokenize(document, "[\w#@]+"):
		if word in stopwords:
			return True
	return False
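# The same stopword-overlap idea generalises to a best-match guess across all
# languages in NLTK's stopword corpus; a hedged sketch (guess_language is a
# hypothetical helper, not part of the original code):
def guess_language(document):
    tokens = set(regexp_tokenize(document.lower(), "[\w#@]+"))
    best_lang, best_overlap = None, 0
    for lang in nltk.corpus.stopwords.fileids():
        overlap = len(tokens & set(nltk.corpus.stopwords.words(lang)))
        if overlap > best_overlap:
            best_lang, best_overlap = lang, overlap
    return best_lang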
Example #30
# -*- coding: utf-8 -*-
"""
Created on 2018/6/17

@author: Samuel
@Desc: 
@dependence: Noting
"""
input_str = "Hi everyone! Hola gr8 &*$"
print(input_str.split())
from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize
output_str = word_tokenize(input_str)
print('word_tokenize: ')
print(output_str)
output_str = regexp_tokenize(input_str, pattern='\w+')
print('regexp_tokenize: ')
print(output_str)
output_str = regexp_tokenize(input_str, pattern='\d+')
print('regexp_tokenize: ')
print(output_str)
output_str = wordpunct_tokenize(input_str)
print('wordpunct_tokenize: ')
print(output_str)
output_str = blankline_tokenize(input_str)
print('blankline_tokenize: ')
print(output_str)
Example #31
from nltk.tokenize import sent_tokenize
from nltk.tokenize import regexp_tokenize

sentence = sent_tokenize(input())[int(input())]
print(regexp_tokenize(sentence, "[A-Za-z']+"))
# part-1
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# part-2
# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern1)
print(hashtags)

# part-3
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"([@|#]\w+)"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[2], pattern2)
print(mentions_hashtags)

# part-4
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords

with open('wotw.txt', 'r') as file:
    data = file.read()

tokens = [word.lower() for word in regexp_tokenize(data, '\w+')]
stoplist = stopwords.words('english')
without_stops = [word for word in tokens if word not in stoplist]

freq_dist = FreqDist(without_stops)

print('Number of words: %s' % len(freq_dist))
for key in freq_dist.keys():
    print(key, freq_dist[key])

print(freq_dist.most_common(10))
print(freq_dist.most_common()[-10:])

dist_1 = [item[0] for item in freq_dist.items() if item[1] == 1]
print(len(dist_1), dist_1)
from nltk.tokenize import regexp_tokenize

text = input()
n = int(input())
print(regexp_tokenize(text, "[A-Za-z']+")[n])
Example #35
import nltk
from nltk.tokenize import regexp_tokenize

sent="Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+'))
Example #36
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

with open('sentence1.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')

sentences = sent_tokenize(data)

from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

pst = PorterStemmer()
lst = LancasterStemmer()
wnl = WordNetLemmatizer()

print("Stemming / lemmatization results")
for token in regexp_tokenize(sentences[0], pattern='\w+'):
    print(token, pst.stem(token), lst.stem(token), wnl.lemmatize(token))
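# Note that WordNetLemmatizer defaults to treating every token as a noun;
# passing a part-of-speech hint usually gives better lemmas, e.g.:
print(wnl.lemmatize('running'))           # running
print(wnl.lemmatize('running', pos='v'))  # run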
Example #37
import nltk
from nltk.tokenize import regexp_tokenize
from operator import itemgetter 

if __name__ == '__main__':
	content = open ('10.txt','rU')
	output = open('wordlist10.txt', "w", 0) 

	words = {}
	for line in content:
		for word in regexp_tokenize(line, "[\w#@]+"):	# we define ours words to contain ', # and @
				words[word] = words.get(word, 0) + 1
	
	for item in sorted(words.items(), key=itemgetter(1), reverse=True): 
		output.write ("%s\t%s\n" % (item[0], item[1]))
Example #38
'''

from nltk.tokenize import regexp_tokenize
from nltk import ne_chunk
from nltk import pos_tag
from nltk import download

download('maxent_ne_chunker')
download('words')

#Of course, NLTK ships with well-polished English support by default. Spanish, not so much.
frase = "Steven could be the main character, but Peridot is the coolest. \
        Stevonnie too, you must like Stevonnie. If not, please go to live \
        to New Jersey and leave Beach City"

frases_separadas = regexp_tokenize(frase, '\w+')
frases_tag = pos_tag(frases_separadas)
print("NLTK NER: ", ne_chunk(frases_tag, binary=False))
''' We can also use a Stanford NER, or at least a version of it
    that should be better than NLTK's.
'''

from nltk.tag.stanford import StanfordNERTagger
stanford_ner = StanfordNERTagger(
    'assets/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    'assets/stanford-ner/stanford-ner.jar')
#As in 04, here we pass it the whole text, without splitting anything;
#it will do that for us.
print("Stanford NER: ", stanford_ner.tag(frases_separadas))
''' Remember that here we were looking for names, entities, places, organizations...
    In this case, Stanford recognizes these values better, since, as you will see, for
Example #39
import nltk
from nltk.tokenize import regexp_tokenize
sent="Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+'))


Example #40
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# hashtags
pattern1 = r"#\w+"
regexp_tokenize(tweets[0], pattern1)

# mentions and hashtags
pattern2 = r"([@#]\w+)"
regexp_tokenize(tweets[-1], pattern2)

tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

import nltk

from nltk.tokenize import RegexpTokenizer, regexp_tokenize, BlanklineTokenizer, WhitespaceTokenizer, LineTokenizer, SpaceTokenizer
from nltk.tokenize.util import spans_to_relative, string_span_tokenize


text = "Don't hesitate to ask questions"
text2 = "This is a breaking news.\n A godzilla has been discovered in Tokyo city."
tokenizer = RegexpTokenizer('[\w]+')

print tokenizer.tokenize(text)

print regexp_tokenize(text, pattern='\w+|\$[\d\.]+\S+')

# Tokenize whitespace
tokenizer = RegexpTokenizer('\s+', gaps=True)
print tokenizer.tokenize(text)

# Select only words starting with capital letters
capt = RegexpTokenizer('[A-Z]\w+')
print capt.tokenize(text2)

print BlanklineTokenizer().tokenize(text2)

print WhitespaceTokenizer().tokenize(text2)

print LineTokenizer(blanklines='keep').tokenize(text2)
print LineTokenizer(blanklines='discard').tokenize(text2)

# SpaceTokenizer works similar to .split('')
print SpaceTokenizer().tokenize(text2)
def getAmenitiesTokens(cell_val):
    return regexp_tokenize(cell_val, "([\w\s\d']+), ")
Example #43
rgx_wrd_digit
re.findall(rgx_wrd_digit, 'he has 11 cats')

# [A-Za-z\-\.]+  ==== matches 'My-Website.com'
# groups are different
# (a-z)  matches "a-z"

tweet0 = r'This is the best #nlp exercise ive found online! #python'
print(tweet0)

from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# hash tags
pattern1 = r"#\w+"
regexp_tokenize(tweet0, pattern1)

tweet2 = r'Thanks @datacamp :) #nlp #python'
# mentions and hashtags
pattern2 = r"([#|@]\w+)"
regexp_tokenize(tweet2, pattern2)

tknzr = TweetTokenizer()
lst_tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning', 'Thanks @datacamp :) #nlp #python'
]

all_tokens = [tknzr.tokenize(t) for t in lst_tweets]
all_tokens
# Import the necessary modules
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import regexp_tokenize

tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning', 'Thanks @datacamp :) #nlp #python'
]

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"

# Use the pattern on the first tweet in the tweets list
print(regexp_tokenize(tweets[0], pattern1))

# Write a pattern that matches both mentions and hashtags
pattern2 = r"([#|@]\w+)"

# Use the pattern on the last tweet in the tweets list
print(regexp_tokenize(tweets[-1], pattern2))

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

######################################################
from nltk.tokenize import word_tokenize

german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'
# Tokenize and print all words in german_text
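# The snippet breaks off here; a minimal completion of the step requested above:
print(word_tokenize(german_text))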
Example #45
#1.b Write a pattern to match sentence endings:
print(re.findall("[.?!]",my_string))

#1.c Split my_string on sentence endings and print the result
print(re.split(r"[.?!]", my_string))
#1.d Find all capitalized words in my_string and print the result
print(re.findall(r"[A-Z]\w+",my_string))
#1.e Split my_string on spaces and print the result
print(re.split(r"\s",my_string))
#1.f Find all digits in my_string and print the result
print(re.findall(r"\d+", my_string))
#2.b Tokenize all the sentences using the sent_tokenize() function.
r1 = sent_tokenize(my_string)
print(r1)
#2.c Tokenize the fourth sentence, which you can access as sentences[3], using the word_tokenize() function.
print(word_tokenize(r1[3]))
#2.d Find the unique tokens by using word_tokenize() and then converting it into a set using set().
r2 = word_tokenize(my_string)
r3 = set(r2)
print(r3)
my_string_2 = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"
string = re.split(r"#\d\w+\?!",my_string_2)
print(string)
tweets=['I’ve retrieved the best #nlp exercise  for the class till now! It is done #python',      '#NLP is super cool!   #learning', 'Thanks @analytics :) #nlp #python']
pattern1 = r"#\w+"
print(regexp_tokenize(tweets[1],pattern1))
pattern2 = r"[#|@]\w+"
print(regexp_tokenize(tweets[2],pattern2))
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
Example #46
    def build_vocab(self, unlabeled_data, labeled_data, embedding_size,
                    max_seq_num, max_seq_len):
        sentences = []
        words = []
        if unlabeled_data is not None:
            for (u, v) in unlabeled_data.items():
                try:
                    results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", v)
                    results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", dd)
                    results = re.compile(
                        r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                        re.S)
                    dd = results.sub(" <website> ", dd)
                    sents = sentence_tokenize(dd)
                    for j in range(0, len(sents)):
                        a = regexp_tokenize(transform_format(sents[j]),
                                            self.pattern)
                        temp = []
                        for k in range(0, len(a)):
                            if a[k] not in self.english_punctuations and check_ack_word(
                                    a[k]) == 1:
                                if a[k].isdigit():
                                    a[k] = '<number>'
                                elif a[k][0] == '$':
                                    a[k] = '<money>'
                                elif a[k][-1] == '%':
                                    a[k] = '<percentage>'
                                temp.append(a[k].lower())
                                words.append(a[k].lower())
                        if len(temp) > 0:
                            sentences.append(temp)
                except:
                    #print(u,v)
                    #exit()
                    pass

        if labeled_data is not None:
            for (u, v) in labeled_data.items():
                for i in range(0, len(v[0])):
                    v[0][i] = str(v[0][i])
                    try:
                        results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*',
                                             re.S)
                        dd = results.sub(" <website> ", v[0][i])
                        results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*',
                                             re.S)
                        dd = results.sub(" <website> ", dd)
                        results = re.compile(
                            r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                            re.S)
                        dd = results.sub(" <website> ", dd)
                    except:
                        print(u, v)
                        print(v[0][i])
                        exit()
                    a = regexp_tokenize(transform_format(dd), self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(
                                a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())
                            words.append(a[k].lower())
                    if len(temp) > 0:
                        sentences.append(temp)

        word_frequency = {}
        for i in range(0, len(words)):
            if words[i] in word_frequency:
                word_frequency[words[i]] += 1
            else:
                word_frequency[words[i]] = 1

        self.model = gensim.models.Word2Vec(sentences,
                                            size=embedding_size,
                                            window=5,
                                            min_count=1,
                                            iter=20,
                                            negative=50)

        x = 4
        self.word2id['<pad>'] = 0
        self.id2word[0] = '<pad>'
        self.word2id['<sos>'] = 2
        self.id2word[2] = '<sos>'
        self.word2id['<eos>'] = 3
        self.id2word[3] = '<eos>'

        self.unk_count = 0

        for i in range(0, len(sentences)):
            for j in range(0, len(sentences[i])):
                if word_frequency[sentences[i][j].lower()] >= 2:
                    if sentences[i][j].lower() in self.model:
                        if sentences[i][j].lower() in self.word2id:
                            pass
                        else:
                            self.word2id[sentences[i][j].lower()] = x
                            self.id2word[x] = sentences[i][j].lower()
                            x = x + 1
                else:
                    self.word2id['<unk>'] = 1
                    self.id2word[1] = '<unk>'
                    self.unk_count += 1
Example #47
from nltk.tokenize import regexp_tokenize

sentence = input()
index_of_word = int(input())

pattern = "[A-z]+"

res = regexp_tokenize(sentence, pattern)
print(res[index_of_word])
Example #48
    def load_data(self, labeled_data, ids):
        self.message = {}

        labels_esit = []

        for i in ids:
            sentences = []
            labels = []
            doc_len = []
            sent_len = []

            sents, l = labeled_data[i]

            for j in range(0, len(sents)):

                sents[j] = str(sents[j])

                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", sents[j])
                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                    re.S)
                dd = results.sub(" <website> ", dd)

                a = regexp_tokenize(transform_format(dd), self.pattern)

                temp = []
                for k in range(0, len(a)):
                    if a[k] not in self.english_punctuations and check_ack_word(
                            a[k]) == 1:
                        if a[k].isdigit():
                            a[k] = '<number>'
                        elif a[k][0] == '$':
                            a[k] = '<money>'
                        elif a[k][-1] == '%':
                            a[k] = '<percentage>'
                        temp.append(a[k].lower())

                if len(temp) > 0:
                    temp_ = ['<sos>']
                    for k in range(0, min(len(temp), self.max_seq_len - 2)):
                        temp_.append(temp[k])
                    temp_.append('<eos>')
                    sentences.append(temp_)
                    labels.append(self.lookup_label_id(l[j]))

                    labels_esit.append(self.lookup_label_id(l[j]))

                    sent_len.append(len(temp_) - 1)

            doc_len.append(len(sents) - 1)

            self.message[i] = (sentences, labels, sent_len, doc_len)

        x_d = set()
        for (u, v) in self.label_set.items():
            x_d.add(v)
        x_d = np.array(list(x_d))

        self.kde.fit(np.array(labels_esit)[:, None])
        self.dist = self.kde.score_samples(x_d[:, None])

        self.esit_dist = F.softmax(torch.tensor(self.dist), dim=-1)
Example #49
def tokenize_sentence(sentence):
    #return nltk.word_tokenize(sentence.lower())
    return regexp_tokenize(sentence.lower(), "[\w']+")
Example #50
    def load_data(self, unlabeled_data, ids):
        self.message = {}
        self.ids = []
        self.data_num = 0

        for i in ids:
            try:
                sentences = []
                labels = []
                doc = unlabeled_data[i]

                doc_len = []
                sent_len = []

                doc += '.'

                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", doc)
                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)',
                    re.S)
                dd = results.sub(" <website> ", dd)

                sents = sentence_tokenize(dd)

                # print(sents)

                for j in range(0, len(sents)):
                    a = regexp_tokenize(transform_format(sents[j]),
                                        self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(
                                a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())

                    if len(temp) > 0:
                        temp_ = ['<sos>']
                        for k in range(0, min(len(temp),
                                              self.max_seq_len - 2)):
                            temp_.append(temp[k])
                        temp_.append('<eos>')
                        sentences.append(temp_)
                        labels.append(10)
                        sent_len.append(len(temp_) - 1)

                doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))

                self.message[i] = (sentences[:self.max_seq_num],
                                   labels[:self.max_seq_num],
                                   sent_len[:self.max_seq_num], doc_len)
                self.ids.append(i)

            except:
                if str(doc) != "nan":
                    print(doc)
                pass
Example #51
from nltk.tokenize import regexp_tokenize


def readfile(filename):
    messages = [line.rstrip() for line in open('SMSSpamCollection.txt')]
    # print (input)
    return messages


filestopwords = open("stopwords.txt").read()


def casefolding(input):
    text = []
    for row in input:
        text.append(row)
    text = [x.lower() for x in text]
    return text


messages = readfile('SMSSpamCollection.txt')
print(messages)
print(casefolding(messages))

import glob

filename = []   # processed file names (assumed; not initialised in the original snippet)
data = []       # token lists, one per file (assumed)
training_files = glob.glob('SMSSpamCollection.txt')
for file_name in training_files:
    text = open(file_name, encoding="UTF-8").read()
    filename.append(file_name.split("\\")[1])
    tokens = regexp_tokenize(text, r'[A-Za-z]{3,}')
    data.append(tokens)
Example #52
def tokenize(s):
    if type(s) is not unicode:
        s = s.decode('utf8')
    return regexp_tokenize(s, pattern='[^\W_]+|\S')
 # Store most common words and ngrams for later comparison of texts
 words_most_common = []
 ngrams_most_common = []
 
 # Load stopwords if necessary
 if params.s:
     stopwords = [sw.strip() for sw in params.s.readlines()]
     params.s.close()
 
 #Process documents    
 for f in params.files:
     txt = f.read().lower()
     f.close()
     
     # Tokenize
     txt_tokens = regexp_tokenize(txt, pattern_words)
     if params.s:
         txt_tokens = [token for token in txt_tokens if token not in stopwords]                 
     
     # Extract ngrams
     unigrams = regexp_tokenize(txt, pattern_unigrams)
     bigrams = regexp_tokenize(txt, pattern_bigrams) 
     
     #Create frequency distributions    
     fdist_words = FreqDist(txt_tokens)
     fdist_ngrams = FreqDist(unigrams + bigrams)
     
     # Store most common words and ngrams for later comparison of texts
     words_most_common.append([k for (k,_) in fdist_words.most_common(params.n)])
     ngrams_most_common.append([k for (k,_) in fdist_ngrams.most_common(params.m)])
     outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
def get_tri_grams(corpus):
    words = regexp_tokenize(corpus, "[\\S]+")
    tri_grams = [[f'{words[i]} {words[i + 1]}', words[i + 2]]
                 for i in range(len(words) - 2)]
    return tri_grams
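# A quick usage check of the sliding window above:
print(get_tri_grams("a b c d"))  # [['a b', 'c'], ['b c', 'd']]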
def is_portuguese(document):
	stopwords = nltk.corpus.stopwords.words('portuguese') 
	for word in regexp_tokenize(document, "[\w#@]+"):
		if word in stopwords:
			return True
	return False
Example #56
    text_file = f.read()

text_file = text_file.decode('utf-8').strip()
# splits sentences
from nltk.tokenize import sent_tokenize
tokens = sent_tokenize(text_file)
# print tokens

# splits words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text_file)
# print tokens

# whitespace tokenizer
from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(text_file, '\s+', gaps=True)
# print tokenizer

from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = tokenizer
# print [word for word in words if word not in english_stops]

#look up words and print synset
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print syn.name()
print syn.definition()
print syn.hypernyms()
print syn.hypernyms()[0].hyponyms()
print syn.root_hypernyms()
Example #57
__author__ = 'Peiman'
import nltk
#nltk.download()

#from nltk.book import *
x = 'welcome to my first sentiment analysis and natural language processing learning codes!.'

from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(x, r'\w+')  # regexp_tokenize needs the text as well as the pattern

tokenized = nltk.word_tokenize(x)
tagged = nltk.pos_tag(tokenized)
print tokenized
print tagged
from nltk.tokenize import regexp_tokenize

text = input()
print(regexp_tokenize(text, r"[A-Za-z'\-]+"))
    def prepare_data(self, data=False, re_train=False):

        flag = True
        if data == False:
            a = open('manner.xml').readlines()
            sent = []
            for k in a:
                k = k.lower()
                st = k.find('<subject>')
                if st == -1:
                    continue
                end = k.find('</subject>')
                sent.append(k[st + 9:end - 1])
                data = sent
                flag = False
        #print data[0:5]
        sentence = ["%s %s %s" % (self.START, x, self.END) for x in data]
        tokenize_sent = [
            regexp_tokenize(x, pattern='\w+|\$[\d\.]+|\S+') for x in sentence
        ]

        freq = nltk.FreqDist(itertools.chain(*tokenize_sent))
        print('found', len(freq), 'unique words')
        if self.vocab_size > len(freq):
            self.vocab_size = len(freq)
        self.vocab = freq.most_common(self.vocab_size - 3)
        index_to_word = [x[0] for x in self.vocab]
        index_to_word.append(self.unk_token)
        index_to_word.append(self.START)
        index_to_word.append(self.END)

        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

        for i, sent in enumerate(tokenize_sent):
            tokenize_sent[i] = [
                w if w in word_to_index else self.unk_token for w in sent
            ]

        self.char_indices = word_to_index
        self.indices_char = index_to_word

        if re_train == True or flag == True:
            sentences = []
            next_chars = []
            sentences_f = []
            sentences_b = []
            next_chars_f = []
            next_chars_b = []

            for sent in tokenize_sent:
                temp = [self.START for i in range(self.step)]
                flag = False
                for word in sent:
                    temp.remove(temp[0])
                    temp.append(word)
                    if flag == True:
                        next_chars_f.append(word)
                    if word != self.END:
                        temp1 = []
                        for i in temp:
                            temp1.append(i)
                        sentences_f.append(temp1)
                    flag = True

            for sent in tokenize_sent:
                temp = [self.END for i in range(self.step)]
                flag = False
                for word in sent[::-1]:
                    temp.remove(temp[0])
                    temp.append(word)
                    if flag == True:
                        next_chars_b.append(word)
                    if word != self.START:
                        temp1 = []
                        for i in temp:
                            temp1.append(i)
                        sentences_b.append(temp1)
                    flag = True

            print('preparing forward backward windows...')

            sentences, next_chars = [], []
            sentences.extend(sentences_f)
            sentences.extend(sentences_b)
            next_chars.extend(next_chars_f)
            next_chars.extend(next_chars_b)

            X_data = []
            for i in sentences:
                temp = []
                for j in i:
                    temp.append(word_to_index[j])
                X_data.append(temp)

            y_data = []
            for i in next_chars:
                y_data.append(self.char_indices[i])
            #y_train = np_utils.to_categorical(y_data, vocab_size)
            y_train = np.zeros((len(sentences), self.vocab_size),
                               dtype=np.bool)
            #X_train = sequence.pad_sequences(X_data, maxlen=maxlen)

            for i in range(len(y_data)):
                y_train[i][y_data[i]] = True

            self.X_data = X_data
            self.y_data = y_train
Example #60
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[\w']+")
print(tokenizer.tokenize("Can't is a contraction."))

# or
from nltk.tokenize import regexp_tokenize
print(regexp_tokenize("Can't is a contraction.", "[\w']+"))

# use gaps
tokenizer = RegexpTokenizer('\s+', gaps=True)
print(tokenizer.tokenize("Can't is a contraction."))
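# Expected output, for reference:
#   ["Can't", 'is', 'a', 'contraction']   <- RegexpTokenizer("[\w']+") keeps the contraction together
#   ["Can't", 'is', 'a', 'contraction']   <- regexp_tokenize with the same pattern
#   ["Can't", 'is', 'a', 'contraction.']  <- gap tokenization on whitespace keeps the final period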