def ngrams(self, ns=[2, 3, 5]):
    _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
    for n in ns:
        ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
        ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
        self.features.update(ngf)
        self.features.update(ngfp)
def update_freqs(self, doc_text, id_str):
    for bigram in ngrams(doc_text, 2):
        k = bigram[0] + u"_" + bigram[1]
        self.bicount.update([k])
        self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
    for trigram in ngrams(doc_text, 3):
        k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
        self.tricount.update([k])
        self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]
def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    pairs = list(iter_product(t1, t2, repeat=1))
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)
def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    # collect all 2- to 5-grams of the input
    grams = (list(ngrams(string, 2)) + list(ngrams(string, 3)) +
             list(ngrams(string, 4)) + list(ngrams(string, 5)))
    SIZE = 1024
    vec = zeros((SIZE,))
    # hash each n-gram into a fixed-size count vector
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)
def build_ngram(source):
    ngram_set = {}
    for key, value in source.items():
        ngram = []
        for line in value:
            if IS_PAD:
                # NLTK 3 takes left_pad_symbol/right_pad_symbol instead of the old pad_symbol argument
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL,
                                         pad_left=True, pad_right=True,
                                         left_pad_symbol='SSS', right_pad_symbol='SSS'))
            else:
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
        ngram_set[key] = ngram
    return ngram_set
def read_data(type):
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)
            meta = read_file(metapath)
            if text is not None:
                count += 1
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])
                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))
                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))
                # dependency parsing is disabled:
                # data[c][i]['dependencies'] = dependency_parse(text[0])
                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])
        data[c]['sequences'] = gen_sequences(data[c])
    data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
    data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
    # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)
    return data
def extract_ngrams(self, memes):
    for meme_type in memes:
        for meme in memes[meme_type]:
            top_unigrams = meme[0]
            bottom_unigrams = meme[1]
            all_unigrams = top_unigrams + bottom_unigrams
            # wrap the generators in list() so they can be concatenated
            top_bigrams = list(ngrams(meme[0], 2))
            bottom_bigrams = list(ngrams(meme[1], 2))
            all_bigrams = top_bigrams + bottom_bigrams
            # `key` was undefined in the original; the meme type is the most plausible intent
            self.add_ngrams(meme_type, top_unigrams, bottom_unigrams, all_unigrams,
                            top_bigrams, bottom_bigrams, all_bigrams)
def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    arr = np.zeros((w, h), np.float32)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except KeyError:
                # out-of-vocabulary tokens: leave the cell at 0
                pass
    return arr
def generate_location_vector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                # sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for doc_word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                    # any change in the lines below should be replicated in corpus.py also
                    symbols = ".,[]();:<>+=&+%!@#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs equal-length strings
                    replace = maketrans(symbols, whitespace)
                    doc_word = word_location[1].translate(replace)
                    doc_word = doc_word.lstrip()
                    doc_word = doc_word.rstrip()
                    if len(doc_word) > 1 and not len(doc_word) > 16:
                        self.doc_words.append(doc_word)
                # wrap the n-gram generators in list() so len() works
                doc_bigrams = list(bigrams(words))
                if not len(doc_bigrams) < 1:
                    doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                    for bi_gram in doc_bigrams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                doc_trigrams = list(trigrams(words))
                if not len(doc_trigrams) < 1:
                    doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                    for tri_gram in doc_trigrams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                doc_fourgrams = list(ngrams(words, 4))
                if not len(doc_fourgrams) < 1:
                    doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                    for four_gram in doc_fourgrams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                doc_fivegrams = list(ngrams(words, 5))
                if not len(doc_fivegrams) < 1:
                    doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                    for five_gram in doc_fivegrams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
        else:
            for subtree in range(0, len(branch)):
                LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
def lookup_phrases(sentence, noun_types, ignore_case=False):
    # wrap in list() so the trigram/bigram/unigram generators can be concatenated
    phrases = list(ngrams(sentence, 3)) + list(ngrams(sentence, 2)) + list(ngrams(sentence, 1))
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)
def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    bigs = list(nltk.ngrams(text, NGRAM))
    print('totally', len(bigs), 'ngrams')
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    topwords = [w for w, _ in freqdist.most_common(cutoff)]
    # print(len(topwords), 'topwords:', topwords[:30], freqdist[topwords[0]], freqdist[topwords[1]])

    # document frequencies: per document, per "les" id, and per time bucket
    df, df_les, df_time, tfidf = {}, {}, {}, {}
    for doc_id, doc_text in docs.items():
        words = list(nltk.ngrams(doc_text, NGRAM))
        les_id, time_id = doc_id.split(':')
        time_id = time_id.replace('.csv', '')[0:8]
        for w in words:
            df.setdefault(w, set()).add(doc_id)
            df_les.setdefault(w, set()).add(les_id)
            df_time.setdefault(w, set()).add(time_id)

    _cutoff = 10000
    _topwords = [w for w, _ in freqdist.most_common(_cutoff)]
    df0, df1, df2 = {}, {}, {}
    for w in _topwords:
        df0[w] = len(df.get(w, ()))
        df1[w] = len(df_les.get(w, ()))
        df2[w] = len(df_time.get(w, ()))
        tfidf[w] = freqdist[w] / (1 + df0[w])

    # sort n-grams in decreasing order of tf-idf value
    sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)[:cutoff]
    topwords = [w for w, s in sortedwords]
    sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
    sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
    sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
    print('TF-IDF topwords:')
    print(len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]])
    print(sortedwords0[:30])
    print(sortedwords1[:30])
    print(sortedwords2[:30])
    return topwords, freqdist, df0, df1, df2
def __call__(self, words):
    grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
    positives = [
        (i, len(gram), gram)
        for i, gram in enumerate(grams)
        if self.colls[len(gram)][gram]
    ]
    if not positives:
        return words
    positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
    matches, covered = self.__non_overlapping(positives)
    unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
    catted = sorted(matches + unigrams)
    # materialize the zip so it can be indexed (Python 3)
    return list(zip(*catted))[1]
def generateLocationVector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                # sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                    symbols = ",[]();:<>+=&+%!@#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs equal-length strings
                    replace = maketrans(symbols, whitespace)
                    spec_word = word_location[1].translate(replace)
                    spec_word = spec_word.lstrip()
                    spec_word = spec_word.rstrip()
                    if len(spec_word) > 1 and not len(spec_word) > 16:
                        self.spec_words.append(spec_word)
                # wrap the n-gram generators in list() so len() works
                bi_grams = list(bigrams(words))
                if not len(bi_grams) < 1:
                    for bi_gram in bi_grams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                tri_grams = list(trigrams(words))
                if not len(tri_grams) < 1:
                    for tri_gram in tri_grams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                four_grams = list(ngrams(words, 4))
                if not len(four_grams) < 1:
                    for four_gram in four_grams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                five_grams = list(ngrams(words, 5))
                if not len(five_grams) < 1:
                    for five_gram in five_grams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
        else:
            for subtree in range(0, len(branch)):
                Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
def jacquard_fivegram(query):
    final = []
    # the query's 5-grams only need to be computed once
    q_fivegram = set(nltk.ngrams(query, 5))
    for a in open('enwiktionary.a.list'):
        a = a.rstrip()
        fivegram = set(nltk.ngrams(a, 5))
        intersect = q_fivegram.intersection(fivegram)
        union = q_fivegram.union(fivegram)
        # guard against an empty union (both strings shorter than 5 characters)
        sim = float(len(intersect)) / len(union) if union else 0.0
        final.append([a, sim])
    final_sorted = sorted(final, key=lambda pair: pair[1], reverse=True)
    print(final_sorted[:10])
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
    split_text = text.split()
    if len(split_text) < shingle_length:
        raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))
    self.minhash = []
    # materialize the shingles once; a bare generator would be exhausted after the first pass
    self.shingles = list(ngrams(split_text, shingle_length))
    for hash_seed in generate_random_seeds(minhash_size, random_seed):
        min_value = float('inf')
        for shingle in self.shingles:
            value = mmh3.hash(' '.join(shingle), hash_seed)
            min_value = min(min_value, value)
        self.minhash.append(min_value)
def train(self, words, tagged=False):
    if tagged is True:
        tags = [word_tag[1] for word_tag in words]
        self.ngrams = list(nltk.ngrams(tags, self.n))
    else:
        # text = nltk.word_tokenize(words)
        tagged_words = nltk.pos_tag(words)
        universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
        self.ngrams = list(nltk.ngrams(universal_tags, self.n))
    self.frequencies = nltk.FreqDist(self.ngrams)
    self.probs_ng = nltk.MLEProbDist(self.frequencies)
    print(self.probs_ng)
def count_alliteration(tokens):
    allit_instances = []
    # ignore punctuation and stopwords
    tokens = [token for token in tokens if not (is_punctuation(token) or is_stopword(token))]
    bigrams = nltk.ngrams(tokens, 2)
    for one, two in bigrams:
        if has_alliteration(one, two):
            allit_instances.append((one, two))
    trigrams = nltk.ngrams(tokens, 3)
    for one, two, three in trigrams:
        # the "not" avoids double counting
        if has_alliteration(one, three) and not has_alliteration(one, two):
            allit_instances.append((one, two, three))
    return len(allit_instances)
def calc_precision(n, translation, reference):
    total = 0
    correct = 0
    for i in range(min(len(translation), len(reference))):
        # materialize the n-grams; len() and membership tests need lists, not generators
        tra_ngrams = list(nltk.ngrams(translation[i].split(), n))
        ref_ngrams = list(nltk.ngrams(reference[i].split(), n))
        total += min(len(ref_ngrams), len(tra_ngrams))
        for ng in tra_ngrams:
            if ng in ref_ngrams:
                correct += 1
    print("total: " + str(total) + ", correct: " + str(correct))
    if total == 0:
        return 0
    precision = float(correct) / total
    return precision
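# A minimal usage sketch for calc_precision above (hypothetical sentences): it
# computes an n-gram precision over aligned translation/reference pairs, the
# building block of BLEU-style scores. Assumes nltk is importable as in the
# function itself.
translation = ["the cat sat on the mat", "a dog barked"]
reference = ["the cat is on the mat", "the dog barked"]
for n in (1, 2):
    p = calc_precision(n, translation, reference)
    print("{}-gram precision: {:.2f}".format(n, p))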
def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """
    dates = []

    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates
def generate(starting_point='i', crp=nltk.corpus.brown, ngram=2):
    # use the corpus passed in as `crp` (the original hard-coded brown here)
    words = crp.words(categories='news')
    gram_pairs = nltk.ngrams([w.lower() for w in words], ngram)
    cdf = nltk.ConditionalFreqDist(gram_pairs)
    word = starting_point.lower()
    result = [word]
    while word not in [".", "?", "!", "'", ";", "`", "``"]:
        prev_word = result[-1]
        for new_word in cdf[word]:
            # avoid repeating a word seen in the second half of the output so far
            if new_word not in result[-len(result) // 2:]:
                prev_phrase = [prev_word, new_word]
                if ' '.join(prev_phrase) not in ' '.join(result):
                    word = new_word
        if word == result[-1]:
            break
        result.append(word)
    return ' '.join(result)
def main():
    text = []
    with open("development.set", 'r') as filedata:
        for line in filedata:
            l = line.split()
            if len(l) >= 6:
                text.append([l[0], l[1], l[2], l[3], l[4]])
    posTagger(text)
    entityTagger()
    # print(wiki_lookup("Barack Obama", "PERSON"))
    # class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                    'stanford-ner/stanford-ner.jar')
    # print(class3.tag(["Barack Obama"]))
    # print(wordNettTagger("Barack Obama"))
    words = []
    with open("pos.tagged", 'r') as filedata:
        for line in filedata:
            l = line.split()
            if l[5] == "NN" or l[5] == "NNP":
                words.append(l[4])
    bigram_list = nltk.ngrams(words, 2)
    tagged_bigrams = ngramTagger(bigram_list)
    tagChecker(tagged_bigrams)
    locationCheck()
    wikification()
def POS_Ngram(N, example_set, i):
    N_grams = dict()
    count = 0
    for para in example_set:
        if i == 0:
            # get the first sentence
            tokens = word_tokenize(para.first)
        else:
            # get the ith sentence
            para.order_sentence()
            tokens = word_tokenize(para.ordered_sentences[i - 1])
            # tokens = word_tokenize(para.scrambled_sentences[int(para.correct_order[i-1]) - 1])
        tagset = None
        tokens = _pos_tag(tokens, tagset, tagger)
        tags = [x[1] for x in tokens]  # take POS tags only
        n_tags = list(ngrams(tags, N))
        for tag_set in n_tags:
            count += 1
            if tag_set in N_grams:
                N_grams[tag_set] += 1
            else:
                N_grams[tag_set] = 1  # first occurrence of tag_set
    # Normalize N-gram counts by the total number of N-grams for this set of sentences
    for ngram, num in N_grams.items():
        N_grams[ngram] = num / count
    return N_grams
def clean_up_txt(self, txt):
    # strip EOL, apostrophes, numbers, HTML, all other punctuation, and then
    # break into sentences
    ptn1 = re.compile(r"""\ba\b|\ban\b|\bthe\b|\band\b|\bthat\b|\bthis\b|
        \bto\b|\bas\b|\bfor\b|\bof\b|\bin\b|\byou\b|\byour\b|\bbut\b|
        \bwith\b|\bon\b|\bis\b|\bby\b|\bfrom\b|\btheir\b|\bit\b|\bits\b|
        \btheir\b|\bor\b|\bat\b|\bwhich\b|\bcan\b|\binc\b|\bhas\b|\bhave\b|
        \balso\b|\bthan\b|\ball\b|\bbe\b|\bthey\b|\bwas\b|\bsuch\b|
        \binto\b""", re.X)
    # HTML character references
    ptn2 = re.compile(r'\&#[0-9A-F]{4};')
    # words beginning with digits -- get rid of the digits
    ptn3 = re.compile(r'\b[0-9]+')
    # end of clause or sentence: turn !?:; into periods
    ptn4 = re.compile(r'[!\?:;]')
    # other punctuation: get rid of it
    ptn5 = re.compile(r'[\"$\(\)&\/,]')
    # break into sentences
    ptn6 = re.compile(r'\.[ ]+(?=[A-Z])')
    TAG_RE = re.compile(r'<[^>]+>')

    # decode back to str after dropping non-ASCII characters (Python 3)
    txt = TAG_RE.sub("", txt.replace("\n", " ").encode('ascii', 'ignore').decode('ascii')
                     .replace('\\/', '/').replace("'", ""))
    txt = ptn5.sub(" ", ptn4.sub(".", ptn3.sub(" ", ptn2.sub("", txt))))
    sents = ptn6.split(txt)
    grams = set([])
    for sent in sents:
        new_sent = ptn1.sub("", sent.lower().replace(".", " ")).split()
        # generate n-grams
        for n in range(2, self.max_ngrams + 1):
            grams.update(set(ngrams(new_sent, n)))
    return grams
def __fromcursor__(self):
    self.data = []
    for document in c['Body'][self.source].find({
        'term': self.term,
        'date': {'$gt': self.start_date, '$lt': self.stop_date},
        'str_type': self.str_type.__name__,
        'n': self.n
    }, {
        'documents': 1
    }, no_cursor_timeout=True):
        for _id in document['documents']:
            comment = get_comment(_id, self.source)
            gram_list = []
            for ngram in ngrams(comment[self.str_type.__name__], self.n):
                gram_list.append(Gram(ngram).term)
            if self.position:
                # the original used a bare `position` and a no-op `+ 1` here
                loc = gram_list.index(self.term) + self.position
                self[gram_list[loc]] += 1
            else:
                gram_list.remove(self.term)
                for gram in gram_list:
                    self[gram] += 1
    try:
        self * (sum(self) ** -1)
    except ZeroDivisionError:
        raise ValueError("No comments with term {} found".format(self.term))
    self.__tocollection__()
def generate_ngrams(line):
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # first word in sentence -> do not attempt to link, could be wrong (Apple)
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:
                    # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]  # get types
                    tokens_plain.append('<dbpedia:' + uri + '>')
                    i = j - 1
                    break
            i += 1
        for n in range(1, N + 1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
def mapper(self, _, line):
    N = 4
    filename = mrjob.compat.jobconf_from_env('map.input.file')
    filename = ntpath.basename(filename)
    # the file name looks like "595F_1852_01_01_0102.txt"; drop the trailing
    # page-number part "_0102.txt" to obtain "595F_1852_01_01"
    fname = filename[:filename.rfind("_")]
    text = self.progPunctuation.sub(' ', line)
    tokens = text.split()
    toks = [w for w in tokens if self.progContainsALetterOrNumber.search(w)]
    for n in range(1, N + 1):
        d = {}
        for ng in nltk.ngrams(toks, n):
            ngram = " ".join(ng)
            if ngram in d:
                d[ngram] += 1
            else:
                d[ngram] = 1
        for w, freq in d.items():
            yield (fname, n), (w, freq)
def ngrams_sentences(sentences, n):
    ngram_sentences = []
    for sentence in sentences:
        # pad each sentence with n-1 start/end markers
        for i in range(n - 1):
            sentence = ['start{}'.format(i)] + sentence + ['end{}'.format(i)]
        ngram_sentences.append([gram for gram in ngrams(sentence, n)])
    return ngram_sentences
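# Minimal usage sketch for ngrams_sentences above (hypothetical token lists):
# each sentence is padded with n-1 start/end markers before the n-grams are taken.
# Assumes `ngrams` is nltk.util.ngrams, as in the function.
from nltk import ngrams

sentences = [["the", "cat", "sat"], ["dogs", "bark"]]
print(ngrams_sentences(sentences, 2))
# -> [[('start0', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', 'end0')],
#     [('start0', 'dogs'), ('dogs', 'bark'), ('bark', 'end0')]]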
def build_LM(in_file):
    """
    Build language models for each label.
    Each line in in_file contains a label and a URL separated by a tab.
    """
    print('building language models...')
    LM = {"malaysian": Counter(), "indonesian": Counter(), "tamil": Counter()}
    # totals before smoothing
    total_count = {"malaysian": 0, "indonesian": 0, "tamil": 0}
    num_4gram = {"malaysian": 0, "indonesian": 0, "tamil": 0}

    with open(in_file, 'r') as f:
        # scan line by line to collect 4-gram counts
        for line in f:
            label = line.split()[0]  # the label
            text = rm_nonalphabet_char(line.split(' ', 1)[1].lower())  # the text after the label
            fourgram = nltk.ngrams(text, 4)
            fourgramDist = nltk.FreqDist(fourgram)
            LM[label] += fourgramDist

    # add unseen 4-grams from the other languages to each language with count 0
    for label in LM.keys():
        other_labels = [k for k in LM.keys() if k != label]
        for other in other_labels:
            # all 4-grams present in LM[label] but absent from LM[other], given value 0
            add_dict = {k: 0 for k in LM[label].keys() if k not in LM[other].keys()}
            LM[other].update(add_dict)

    # totals before smoothing
    total_count["malaysian"] = sum(LM["malaysian"].values())
    total_count["indonesian"] = sum(LM["indonesian"].values())
    total_count["tamil"] = sum(LM["tamil"].values())

    # number of unique 4-grams per label before smoothing
    num_4gram["malaysian"] = len(LM["malaysian"])
    num_4gram["indonesian"] = len(LM["indonesian"])
    num_4gram["tamil"] = len(LM["tamil"])

    # add-one smoothing, converted to log base-10 probabilities
    for label in LM.keys():
        for k in LM[label].keys():
            LM[label][k] = log(LM[label][k] + 1, 10) - log(total_count[label] + num_4gram[label], 10)
    return LM
def computeProbability(n, tags, sluice, table):
    result = 0.0
    count = 0
    # get n-grams and iterate over them to see how many match in the table
    for ngram in nltk.ngrams(tags, n):
        count += 1
        if sluice not in table[str(n)]:
            # random.choice needs a sequence, so materialize the dict keys
            sluiceKey = random.choice(list(table[str(n)].keys()))
            while sluiceKey == "key":
                sluiceKey = random.choice(list(table[str(n)].keys()))
            result += table[str(n)][sluiceKey].get(" ".join(ngram), 0.0)
        else:
            result += table[str(n)][sluice].get(" ".join(ngram), 0.0)
    # return a pseudo-probability: the average probability of all
    # n-grams in this sentence, always between 0 and 1
    if count == 0:
        return 0.0
    return result / count
def __call__(self, feat_string):
    # generate features!
    base_feats = feat_string.split(TOKENSEP)

    # make n-grams from lemmatized words
    lemmas = [bf for bf in base_feats
              if bf.startswith("LEMM:") and bf[5:].lower() not in self.stop_words]
    for n in range(self.min_ngram, self.max_ngram + 1):
        for ng in nltk.ngrams(lemmas, n):
            # drop the "LEMM:" prefix by slicing (str.strip would remove characters, not the prefix)
            s = ' '.join(tok[5:] for tok in ng)
            if len(ng) > 1:
                yield s
            else:
                # only strip stop words for 1-grams
                if s not in self.stop_words:
                    yield s

    # use the original words, but not stop words
    # for bf in [bf for bf in base_feats if bf.startswith("ORIG:") and bf[5:].lower() not in self.stop_words]:
    #     yield bf

    # this would overlap with ORIG:, but since ORIG is not used it works
    for bf in [bf for bf in base_feats if bf.startswith("NNP:")]:
        yield bf

    # for bf in [bf for bf in base_feats if bf.startswith("TENSE:")]:
    #     yield bf
    # for bf in [bf for bf in base_feats if bf.startswith("SYN:")]:
    #     yield bf

    # wiki categories
    for bf in [bf for bf in base_feats if bf.startswith("WIKICAT:")]:
        yield bf
def getFreqDist(text, n):
    ngramsObject = nk.ngrams(text, n)
    freqDist = nk.FreqDist(ngramsObject)
    return freqDist
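# Minimal usage sketch for getFreqDist above, assuming `nk` is the nltk alias
# used in the function and `text` is a pre-tokenized list of words.
import nltk as nk

fd = getFreqDist("to be or not to be".split(), 2)
print(fd.most_common(3))
# e.g. [(('to', 'be'), 2), (('be', 'or'), 1), (('or', 'not'), 1)]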
# Using lemmatization, apply lemmatization on the remaining words
lemmatizer = WordNetLemmatizer()
frm_lemma = []
for word in frm_word:
    fr_lema = lemmatizer.lemmatize(word.lower())
    frm_lemma.append(fr_lema)
print("\n -----------Lemmatization---------- ")
print(frm_lemma)

frm_pos = pos_tag(frm_lemma)

print("--------------BIGRAM-------------")
n = 2
gram = []
bigrams = ngrams(frm_lemma, n)
for grams in bigrams:
    gram.append(grams)
print(gram)

str1 = " ".join(str(x) for x, y in frm_pos)
str1_word = word_tokenize(str1)

print("--------Bi-grams with word frequency----------")
fdist1 = nltk.FreqDist(gram)
top_fiv = fdist1.most_common()
top_five = fdist1.most_common(5)
top = sorted(top_fiv, key=itemgetter(0))
print(top)
print('---------Top 5 bi-gram frequencies with counts--------')
print(top_five)

sent1 = sent_tokenize(frm)
def addTurn(self, turn):
    """
    Adds a turn to this tracker
    :param turn: The turn to process and add
    :return: A hypothesis of the current state of the dialog
    """
    hyps = copy.deepcopy(self.hyps)
    goal_stats = defaultdict(lambda: defaultdict(float))

    # Obtain the best hypothesis from the ASR module
    best_asr_hyp = turn['input']["live"]['asr-hyps'][0]["asr-hyp"]

    # English stopword set plus punctuation
    stop = stopwords.words('english') + list(string.punctuation)

    # Tokenize the best hypothesis
    tkns = word_tokenize(best_asr_hyp)

    # Remove stop words and also shingle the tokens (word bigrams)
    processed_hyp = [word for word in tkns if word not in stop] + \
                    [tup[0] + " " + tup[1] for tup in ngrams(tkns, 2)]

    # Manually map "moderately"/"affordable" to "moderate" and "cheaper" to "cheap"
    for idx, word in enumerate(processed_hyp):
        if word == "moderately" or word == "affordable":
            processed_hyp[idx] = "moderate"
        if word == "cheaper":
            processed_hyp[idx] = "cheap"

    if processed_hyp:
        # Obtain the ontology information
        pricerange_options = self.ontology["informable"]["pricerange"]
        food_options = self.ontology["informable"]["food"]
        area_options = self.ontology["informable"]["area"]

        state_updated = False

        # SIMPLE matching:
        # iterate through all the words in the best ASR hypothesis;
        # if a word is present in the ontology, update that slot with the word
        for hyp_word in processed_hyp:
            if hyp_word in food_options:
                goal_stats["food"][hyp_word] += 1.0
                state_updated = True
            if hyp_word in area_options:
                goal_stats["area"][hyp_word] += 1.0
                state_updated = True
            if hyp_word in pricerange_options:
                goal_stats["pricerange"][hyp_word] += 1.0
                state_updated = True

        # If simple matching was not able to match anything, fall back to BERT with cosine similarity
        if not state_updated:
            # Use BERT to encode all the words in the sentence
            encoded_hyp = np.array(self.bc.encode(processed_hyp))

            # Cosine similarity between this encoding and the encoded knowledge base
            cosine_sim = cosine_similarity(encoded_hyp, self.encoded_kb)
            for idx, sub_arr in enumerate(cosine_sim):
                # For every word in the sentence, find the KB word that maximizes the cosine similarity
                argmax_index = np.argmax(sub_arr)

                # Assume anything below 0.97 is probably a mistake
                # (not many cases reach 0.97 cosine similarity, maybe none actually)
                if sub_arr[argmax_index] >= 0.97:
                    kb_word = self.knowledge_base[argmax_index]
                    print(f"BERT: Word in query: {processed_hyp[idx]} \t matched with {kb_word}")
                    if kb_word in food_options:
                        goal_stats["food"][kb_word] += 1.0
                    if kb_word in area_options:
                        goal_stats["area"][kb_word] += 1.0
                    if kb_word in pricerange_options:
                        goal_stats["pricerange"][kb_word] += 1.0

    super(BertTracker, self).fill_goal_labels(goal_stats, hyps)
    super(BertTracker, self).fill_joint_goals(hyps)
    self.hyps = hyps
    return self.hyps
from nltk.book import *
from nltk import FreqDist
from nltk import bigrams
from nltk import ngrams

print(len(text6) / len(set(text6)))
fdist = FreqDist(text6)
result = fdist.most_common(20)
print(result)

# avoid shadowing the imported bigrams() function
bigram_pairs = bigrams(text6)
bigramsDist = FreqDist(bigram_pairs)
print(bigramsDist[('Sir', 'Robin')])

fourgrams = ngrams(text6, 4)
for fourgram in fourgrams:
    if fourgram[0] == 'coconut':
        print(fourgram)
for i, book in enumerate(books):
    parts = [file for file in filenames if book in file]
    parts.sort()
    for part in parts:
        with open(f"./outputs/sentences_extractor/{part}", "r") as file:
            for j, line in enumerate(file):
                line = re.sub(r",|;|:", "", line)
                line = line.lower()
                line = line.strip()
                if "ngrams" in args.case:
                    n_gram = int(args.case.replace("ngrams", ""))
                    line_parts = [
                        reduce(lambda acc, x: acc + " " + x, group)
                        for group in nltk.ngrams(line.split(), n_gram)
                    ]
                else:
                    line_parts = line.split(" ")
                for expression in expressions.index:
                    count = line_parts.count(expression.lower())
                    if count > 0:
                        # use "_" so the outer loop variable `i` is not shadowed
                        compilator_lines[expression][i] += [total_lines + j + 1 for _ in range(count)]
        total_lines += j + 1
        print(f"{part} | {total_lines}")
    sentences_break.append(total_lines)

with open(f'data/proust-sentences_{args.case}.json', 'w') as fp:
text = soup.p.contents[0]
text_1 = text.lower()
text_2 = re.sub(r'\W', ' ', text_1)

from nltk import word_tokenize
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams

text_3 = word_tokenize(text_2)
text_3_bi = bigrams(text_3)
text_3_tri = trigrams(text_3)
text_3_n = ngrams(text_3, 4)

# urlopen returns bytes in Python 3, so decode before splitting
stop_words = urlopen(
    'http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop'
).read().decode('utf-8').split('\n')

# we can then identify the stop words and eliminate them from the list;
# this is a very simple for loop that checks the list
text_4 = [x for x in text_3 if x not in stop_words]

# you can check what was removed with:
text_rem = [x for x in text_3 if x not in text_4]

# we're going to use a similar format to apply various stemming/lemmatizing/synonym algorithms
def cleaner(line):
    strippedList = re.sub(r'[^a-zA-Z ]+', ' ', line.replace("'", "")).lower().replace(
        "advertisement", "").replace("\t", " ").strip().replace("\n", " ")
    strippedList = ' '.join([
        word for word in strippedList.split()
        if word not in stopwords.words('english')
    ])
    return strippedList


# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = cleaner(line).strip()
    # split the line into N-gram "words"
    bigrams = ngrams(line.split(), N)
    words = [" ".join(grams) for grams in bigrams]
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        # tab-delimited; the trivial word count is 1
        print('%s\t%s' % (word, 1))
def getBigrams(tokens):
    LOGGER.debug("Bigrams...")
    return [g for g in ngrams(tokens, 2)]
    except Exception as e:
        print(str(e))


process_content()

# lemmatizing the data
print("lemmatizing:")
lem = []
for w in wordtokens:
    lem.append(lemmatizer.lemmatize(w))
print(lem)

# n-grams of the data (note: these are 5-grams despite the "trigram" label)
print("trigram")
print("")
sent = "i am studying in umkc which is a good university"
text = word_tokenize(sent)
trigram = ngrams(text, 5)
for t in trigram:
    print(t)

# Named Entity Recognition of the text
print("Named Entity Recognition:")
NER = []
NER.append(ne_chunk(pos_tag(fileread)))
print(NER)
vocab = list(set(flatten(corpus)))
print(len(vocab))

word2index = {'<UNK>': 0}
for i, v in enumerate(vocab):
    if word2index.get(v) is None:
        word2index[v] = i + 1
index2word = {v: k for k, v in word2index.items()}

## Context and centers
WINDOW_SIZE = 5
win_pairs = flatten([
    list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE,
                     WINDOW_SIZE * 2 + 1))
    for c in corpus
])

train_data = []
for word_pair in win_pairs:
    for i in range(WINDOW_SIZE * 2 + 1):
        # skip the center position itself and padded dummy contexts
        # (the original used `and`, which never triggers the skip)
        if i == WINDOW_SIZE or word_pair[i] == '<DUMMY>':
            continue
        train_data.append((word_pair[WINDOW_SIZE], word_pair[i]))

X_tensor = []
y_tensor = []
for data in train_data:
    X_tensor.append(prepare_word(data[0], word2index).view(1, -1))
    y_tensor.append(prepare_word(data[1], word2index).view(1, -1))
from nltk import FreqDist
from nltk import Text
from nltk.book import text6
from nltk import ngrams

# frequency of the top ten tokens
# fdist = FreqDist(text6)
# print(fdist.most_common(10))

# number of occurrences of the bigram ('Sir', 'Robin')
bigram_dist = FreqDist(ngrams(text6, 2))
print(bigram_dist[('Sir', 'Robin')])
# lemmas
[x.lemma_ for x in doc]

# POS tags
[x.tag_ for x in doc]

# N-grams
from nltk import ngrams
from collections import Counter

# get n-gram counts for the first ~50 documents
grams = []
for i, row in df.iterrows():
    tokens = row['text'].lower().split()  # get tokens
    for n in range(2, 4):
        grams += list(ngrams(tokens, n))  # get bigrams and trigrams
    if i > 50:
        break

Counter(grams).most_common()[:8]  # most frequent n-grams

# Tokenizers
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
    min_df=0.001,        # keep terms that appear in at least 0.1% of docs
    max_df=0.8,
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 3))
X = vec.fit_transform(df['text'])  # save the vectors
def transform(self, texts: List[str], annotations: List[str] = []) -> np.ndarray:
    stopwords_enabled = 'stopwords' in self.features
    bigrams_enabled = 'bigrams' in self.features
    trigrams_enabled = 'trigrams' in self.features
    rare_pos_enabled = 'rare_pos_tags' in self.features
    annotation_enabled = 'annotation' in self.features

    vector_length = (stopwords_enabled * len(self.stopwords)) + \
                    (trigrams_enabled * len(self.trigrams_by_frequency)) + \
                    (bigrams_enabled * len(self.bigrams_by_frequency)) + \
                    (rare_pos_enabled * len(self.rare_pos_tags_by_frequency)) + \
                    (annotation_enabled * len(self.annotations))

    matrix = np.empty([len(texts), vector_length])
    for index, text in enumerate(texts):
        tokens = word_tokenize(text)
        vector = np.zeros(vector_length)

        if stopwords_enabled:
            for j, stopword in enumerate(self.stopwords['text']):
                c = tokens.count(stopword)
                vector[j] += c

        if bigrams_enabled:
            offset = (stopwords_enabled * len(self.stopwords))
            bigrams = list(ngrams(text, 2))
            for k, bigram in enumerate(self.bigrams_by_frequency):
                c = bigrams.count(bigram)
                vector[offset + k] += c

        if trigrams_enabled:
            offset = (stopwords_enabled * len(self.stopwords)) + (
                bigrams_enabled * len(self.bigrams_by_frequency))
            trigrams = list(ngrams(text, 3))
            for l, trigram in enumerate(self.trigrams_by_frequency):
                c = trigrams.count(trigram)
                vector[offset + l] += c

        if rare_pos_enabled:
            offset = (stopwords_enabled * len(self.stopwords)) \
                + (bigrams_enabled * len(self.bigrams_by_frequency)) \
                + (trigrams_enabled * len(self.trigrams_by_frequency))
            pos_bigrams_per_text = list(ngrams(pos_tag(tokens), 2))
            for m, pos in enumerate(self.rare_pos_tags_by_frequency):
                c = pos_bigrams_per_text.count(pos)
                vector[offset + m] += c

        if annotation_enabled:
            offset = (stopwords_enabled * len(self.stopwords)) \
                + (bigrams_enabled * len(self.bigrams_by_frequency)) \
                + (trigrams_enabled * len(self.trigrams_by_frequency)) \
                + (rare_pos_enabled * len(self.rare_pos_tags_by_frequency))
            matches = get_matches(annotations[index])
            for n, annotation in enumerate(self.annotations['text']):
                c = matches.count(annotation)
                vector[offset + n] += c

        matrix[index] = vector
    return matrix
def extract_bigrams(texts: List[str]) -> List[Tuple[str]]:
    bigrams = chain(*[ngrams(text, 2) for text in texts])
    bigram_frequency = Counter(bigrams)
    return [freq[0] for freq in bigram_frequency.most_common(100)]
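# Minimal usage sketch for extract_bigrams above. Note that ngrams(text, 2)
# iterates over each raw string, so this returns the 100 most common
# *character* bigrams; pass pre-tokenized lists instead if word bigrams are
# wanted. Assumes chain, Counter, ngrams, and the typing imports are available
# as in the function's module.
print(extract_bigrams(["banana", "bandana"])[:3])
# e.g. [('a', 'n'), ('n', 'a'), ('b', 'a')]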
def extract_features(document):
    n_gram = 3
    ngram_vocab = nltk.ngrams(document, n_gram)
    features = {ng: True for ng in ngram_vocab}
    return features
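# Minimal usage sketch for extract_features above (hypothetical document):
# with a tokenized document it yields a bag of "present" trigram features,
# in the style expected by NLTK's NaiveBayesClassifier.
feats = extract_features("the quick brown fox jumps".split())
# -> {('the', 'quick', 'brown'): True, ('quick', 'brown', 'fox'): True,
#     ('brown', 'fox', 'jumps'): True}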
b_values = []
for item in data:
    text = functions.clean_text(item['full_text'])
    text = functions.give_emoji_free_text(text)
    text = functions.additional_remove(text)
    tokens = WhitespaceTokenizer().tokenize(text)
    tokens = [x.lower() for x in tokens]
    for t in tokens:
        if t in B:
            b_values.append((item['id_str'], item['created_at']))
            res.append(item['user']['screen_name'] + '\t' + item['id_str'] + '\t' + text + '\t' + item['created_at'])
    for i in ngrams(tokens, 2):
        if ' '.join(i) in A:
            a_values.append((item['id'], item['created_at']))
            res.append(item['user']['screen_name'] + '\t' + item['id_str'] + '\t' + text + '\t' + item['created_at'])

a_values_counter += len(a_values)
b_values_counter += len(b_values)
if len(b_values) > 0 or len(a_values) > 0:
    temp['B_values'] = b_values
    temp['A_values'] = a_values
    file_name = file_name.split('.', -1)[0]
    result[file_name] = temp
sys.stdout.write('\r%d/%d' % (counter, num_of_files))

print("\nnumber of tweets;", tweets_counter)
        f['article'] = text  # update list json, add article content
    return files


result = get_aritcle('data')
texts = [t['article'] for t in result]
len(texts)

clean_ts = clean_text(result, 'data')

from nltk import ngrams, tokenize

token = tokenize.word_tokenize(' '.join(clean_ts))
ngm = ngrams(token, 2)
grams = list(set(token)) + [' '.join(n) for n in ngm]

import tensorflow as tf
import tensorflow_hub as hub

model_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
embed = hub.Module(model_url)
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeding = session.run(embed(grams))
from nltk import ngrams, ne_chunk, wordpunct_tokenize, pos_tag

with open('output.txt', 'r', encoding='utf-8') as f:
    raw = f.read()

# Tokenization
wtokens = nltk.word_tokenize(raw)
words = [word.lower() for word in wtokens if word.isalpha()]
print(words)

# Adding tags
print(nltk.pos_tag(words))

lStem = LancasterStemmer()
print("Lancaster Stemming :----------------------------------------------------- \n")
for tok in words:
    print(lStem.stem(str(tok)))

lemmatizer = WordNetLemmatizer()
print("Lemmatization ------------------------------------------------------------:\n")
for tok in words:
    print(lemmatizer.lemmatize(str(tok)))

print("Trigrams --------------------------------------------:\n")
trigram = []
trigram.append(list(ngrams(words, 3)))
print(trigram)

print("NER-------------------------------------\n")
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(words)))))
def get_ngrams(text, n):
    n_grams = nltk.ngrams(word_tokenize(text), n)
    return [' '.join(grams) for grams in n_grams]
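# Minimal usage sketch for get_ngrams above, assuming nltk and word_tokenize
# are imported as in the function (word_tokenize needs the 'punkt' data).
print(get_ngrams("the quick brown fox", 2))
# -> ['the quick', 'quick brown', 'brown fox']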
with open('logos.txt', 'r', encoding="latin1") as myfile:
    my_string = myfile.read().replace('\n', '')

exclude = set(string.punctuation)
# use a new name so the `string` module is not shadowed
cleaned = ''.join(ch for ch in my_string if ch not in exclude)
tokens = word_tokenize(cleaned)
text = nltk.Text(tokens)

# array is the tuple of n-grams, array2 is the count of appearances,
# array1 is the joined tuples; array1 & array2 can be zipped into a DataFrame
array = []
array2 = []

# length of n-grams is the second argument to nltk.ngrams(tokens, length_goes_here)
bgs = nltk.ngrams(tokens, 2)
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    if v > 10:
        array.append(k)
        array2.append(v)

array1 = []
for i in range(len(array)):
    x = ' '.join(map(str, array[i]))
    array1.append(x)

df = pd.DataFrame({
    'phrase': array1,
    'count': array2
}).sort_values(by="count", ascending=False)
def makeMainList():
    """AUDIO_FILE = "/Users/kushamaharshi/Desktop/TERM PROJECT!/tp1/lastSavedFile.wav"
    r = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        audio = r.record(source)  # read the entire audio file
    # Speech recognition using Google Speech Recognition
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        testString = r.recognize_google(audio)
        print("You said: " + testString)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))"""
    # above code from uberi, github

    """testString = "Species are different varieties of animals. They are formed over a long time. They can interbreed. Just need some words here. This happens only if you act like a bad person. If you have more money, due to increased sense of security, then you automatically have more happiness. They have differences in color, size, strength, gender roles, etcetera. There are three ways to do this. first, we can charge. Still talking about first here. still going bleh. Second, we can dance. Droning about second here. still going bleh. Hello people of earth. Third, we can sing. still going bleh. Singing like shit. Going on about the same old thing. Okay, moving on. Life is good. This can be done in four ways: swimming, charging, dancing, liking, making and bathing, if for whatever reason. This is a nice thing to do. Although there is a lot of controversy about this issue, it is still not acted upon by the government. Whenever I go on a walk, I like to get some food, milk and pizza. testString = "there are four types of biological diversity first is species diversity every ecosystem contains a unique collection of species all interacting with each other secondly genetic diversity describes how closely related the members of one species are in a given ecosystem third consider ecosystem diversity a region may have several ecosystems, or it may have one wide expanses of oceans or deserts would be examples of regions with low ecological diversity fourth is functional diversity understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it okay let’s move on by examining the similarities and differences of different lineages that are related, scientists can determine most likely when the species diverged and evolved compared to when the common ancestor was around since biological species concept is dependent upon reproductive isolation of reproducing species it cannot necessarily be applied to a species that reproduces asexually the lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce the five types of species interactions are predation competition parasitism mutualism and commensalism to conclude showing a bit of math here five hundred seventy six is twenty four times twenty four"""

    testString = "Biological diversity in an environment is indicated by numbers of different species of plants and animals. Essentially you could say that there are four types of biological diversity. First is species diversity. Every ecosystem contains a unique collection of species, all interacting with each other. Secondly, genetic diversity describes how closely related the members of one species are in a given ecosystem. Third consider ecosystem diversity. A region may have several ecosystems, or it may have one. Wide expanses of oceans or deserts would be examples of regions with low ecological diversity. Fourth is functional diversity. Understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it. Okay, let’s move on. By examining the similarities, likes and differences of different lineages that are related, scientists, researchers and explorers can determine most likely when the species diverged and evolved compared to when the common ancestor was around. Since biological species concept is dependent upon reproductive isolation of reproducing species, it cannot necessarily be applied to a species that reproduces asexually. The lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce. Species have five types: predation, competition, parasitism, mutualism and commensalism. To conclude,showing a bit of math here. Also note that two thousand seventy four added to sixty four is not equal to twenty four times twenty four right. I would also like to say this project was made possible thanks to the wonderful support of my TP mentor and all the faculty and staff of 15-112. Subsequently, I had super fun doing this!"

    testString1 = "biological diversity in an environment as indicated by numbers of different species of plants and animals essentially you could say that there are four types of biological diversity first is species diversity every ecosystem contains a unique collection of species all interacting with each other secondly genetic diversity describes how closely related the members of one species are in a given ecosystem third consider ecosystem diversity a region may have several ecosystems, or it may have one wide expanses of oceans or deserts would be examples of regions with low ecological diversity fourth is functional diversity understanding an ecosystem’s functional diversity can be useful to ecologists trying to conserve or restore damaged it okay let’s move on by examining the similarities and differences of different lineages that are related, scientists can determine most likely when the species diverged and evolved compared to when the common ancestor was around since biological species concept is dependent upon reproductive isolation of reproducing species it cannot necessarily be applied to a species that reproduces asexually the lineage species concept does not have that restraint and therefore can be used to explain simpler species that do not need a partner to reproduce the five types of species interactions are: predation, competition parasitism mutualism and commensalism to conclude showing a bit of math here. five hundred seventy six is twenty four times twenty four"

    testString = convertToSymbols(testString)

    """data = {'text': testString}
    req = requests.post('http://bark.phon.ioc.ee/punctuator', data=data)
    punctuatedString = req.text
    print(punctuatedString)"""
    punctuatedString = testString

    tstart = sent_tokenize(punctuatedString)
    sstart = []
    toRemove = []
    for sent in tstart:
        # print("sent start")
        tg = nltk.word_tokenize(sent)
        for j in range(6, 3, -1):
            # print(str(j) + "grams")
            aGram = ngrams(tg, j)
            for i in aGram:
                # print(i)
                boolNFU = newFindUseless(i)
                if boolNFU:
                    toRemove.append(i)
    # print("toREMOVE: ", toRemove)
    for remPhrase in toRemove:
        stringCon = ' '.join(remPhrase) + " "
        punctuatedString = punctuatedString.replace(stringCon, "")
    # punctuatedString = convertToSymbols(punctuatedString)

    finalPOSArray = tagPreprocess(punctuatedString)
    # print("fpA: ", finalPOSArray)

    global bifBubbles, bifIndices, bifJumps, bifPrecSents
    bifBubbles, bifIndices, bifJumps, bifPrecSents = checkBubBifA(finalPOSArray)
    # print("bifBubbles: ", bifBubbles)

    testStrsentp = sent_tokenize(punctuatedString)
    testStrsentp = [nltk.word_tokenize(sent) for sent in testStrsentp]
    for aSentence in testStrsentp:
        finalLabelledArray.append(labeler(aSentence))

    indexi = 0
    while indexi <= len(testStrsentp) - 1:
        sentence = testStrsentp[indexi]
        if indexi in bifIndices:
            # print("BIFHERE")
            bifIndex = bifIndices.index(indexi)
            # print("bifindex: ", bifIndex)
            bifYChange = getBifYChange(bifBubbles[bifIndex])
            bifYActual = getBifYMax(bifBubbles[bifIndex])
            # print("bify: ", bifYChange)
            curBubble = Bubble("bif", sentence, getColorProper(indexi), bifIndex, bifYChange, bifYActual)
            indexi += 1
            # print("bj", bifJumps)
            indexi += bifJumps[bifIndex]
        else:
            chunkedSent = chunker(labeler(sentence))
            for chunk in chunkedSent:
                type, sent = chunk[0], chunk[1]
                curBubble = Bubble(type, sent, getColorProper(indexi))
            indexi += 1
# from nltk.tokenize import blankline_tokenize
# AI_Blank = blankline_tokenize(AI)
# print(len(AI_Blank))

from nltk.util import bigrams, trigrams, ngrams

string = "Topic sentences are similar to mini thesis statements. Like a thesis statement, a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph. Like the thesis statement, a topic sentence has a unifying function. But a thesis statement or topic sentence alone doesn’t guarantee unity. An essay is unified if all the paragraphs relate to the thesis, whereas a paragraph is unified if all the sentences relate to the topic sentence. Note: Not all paragraphs need topic sentences. In particular, opening and closing paragraphs, which serve different functions from body paragraphs, generally don’t have topic sentences."
quote_token = nltk.word_tokenize(string)
quotes_bigram = list(nltk.bigrams(quote_token))
# print(quotes_bigram)
quotes_trigram = list(nltk.trigrams(quote_token))
# print(quotes_trigram)
quotes_ngram = list(nltk.ngrams(quote_token, 4))
# print(quotes_ngram)

from nltk.stem import PorterStemmer
pst = PorterStemmer()
# print(pst.stem("Having"))
# words_to_stem = ["give", "giving", "given", "gave"]
# for words in words_to_stem:
#     print(words, ":", pst.stem(words))

from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
print(lst.stem("Having"))
# words_to_stem = ["give", "giving", "given", "gave"]
for t in training:
    length_of_trianing = len(training)
    print(" ", countert_training + 1, "/", length_of_trianing)
    countert_training = countert_training + 1
    filename = str(n) + "gramsfor" + t
    fw = open(filename, "a")
    print(" working on file", t, "...", end="")
    fr = open(t, "r")
    text = fr.read()
    fr.close()  # the original was missing the call parentheses
    print("done")
    data = []
    count = []
    print(" finding", n, "grams for", t, "...", end="")
    # time1 = time()
    all_grams = ngrams(text.split(), n)
    # print(time() - time1)
    print("done")
    print(" counting the frequency of", n, "grams in", t, "...", end="")
    s_counter = 0
    data = []
    point_int = {}
    count = []
    index_count = 0
    # for grams in all_grams:
    #     if grams in data:  # increasing count
    #         ind = data.index(grams)
    #         count[ind] = count[ind] + 1
def Ex_gram(_data, num):
    data = " ".join(_data)
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [" ".join(grams) for grams in n_grams]
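# Minimal usage sketch for Ex_gram above (hypothetical input): the strings in
# _data are joined, tokenized, and returned as space-joined n-grams. Assumes
# nltk and ngrams are imported as in the function.
print(Ex_gram(["the cat", "sat down"], 2))
# -> ['the cat', 'cat sat', 'sat down']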
def parse_snips_intent(self):
    """
    Parse original data.json into Snips NLU Engine training data in yaml format.
    Convert into a yaml file through the command line prompt:
    'snips-nlu generate-dataset en input-yaml-file > output-json-file'
    """
    # Get original data.json in DataFrame
    data_df = DataProcessing(f"{getcwd()}/data_lake/{self.json_arg}").retrieve_process_json()

    # Get list of unique intents
    intent_list = list(set(data_df["Intent"]))

    # Load SpaCy NLP large corpus
    spacy_nlp_engine = load('en_core_web_lg')

    # Init yaml object
    yaml = ruamel.yaml.YAML()
    # Set explicit start to True
    yaml.explicit_start = True

    # Parse by intents
    for intent_name in intent_list:
        # "yes" and "no" are reserved values in yaml files; to avoid parsing
        # errors, an "s" is appended to those intent names.
        if intent_name == "yes" or intent_name == "no":
            intent_dict = {"type": "intent", "name": f"{intent_name}s"}
        else:
            intent_dict = {"type": "intent", "name": intent_name}

        # Init lists for slots + utterances
        slots_value_list = []
        utt_value_list = []

        # Subset current intent data
        subset_data = data_df[data_df["Intent"] == intent_name].reset_index(drop=True)

        # Get current intent queries
        intent_query_words = list(subset_data["Query"])

        # Get the 4-grams and convert into a list
        word_ngrams = (pd.Series(ngrams(intent_query_words, 4))).to_list()

        # Randomly sample 80% of each intent as training phrases for the NLU engine
        sample_ngrams = sample(word_ngrams, int(len(subset_data) * 0.8))

        # Start parsing each query
        for phrases in sample_ngrams:
            # Join phrases back into one single sentence
            full_text = " ".join(phrases)

            # Parse entities of the text through the SpaCy NLP engine
            parse_phrases = spacy_nlp_engine(full_text)

            # Set slots
            if len(parse_phrases.ents) > 0:
                # Get entity label and text, if any
                for nlp_entity in parse_phrases.ents:
                    entity_label = nlp_entity.label_
                    entity_text = nlp_entity.text

                    # Construct "slot" for name and entity
                    slot_entities = {"name": entity_label, "entity": entity_label}

                    # Replace text with entity label
                    full_text = full_text.replace(entity_text, f"[{entity_label}]({entity_text})")

                    # Store "utterances" from the n-gram
                    utt_value_list.append(full_text)

                    # Store unique "slots"
                    if slot_entities not in slots_value_list:
                        slots_value_list.append(slot_entities)

        # Set slots in intent dictionary
        if len(slots_value_list) > 0:
            intent_dict["slots"] = slots_value_list

        # Set utterances in intent dictionary
        if len(utt_value_list) > 0:
            intent_dict["utterances"] = utt_value_list
        # If there are no utterances found, use the original n-grams
        else:
            intent_dict["utterances"] = [" ".join(gram) for gram in sample_ngrams]

        # Append into output yaml
        with open(f"{getcwd()}/data_lake/intent_ngram.yaml", "a") as file:
            yaml.dump(intent_dict, file)
import nltk
from time import time
from nltk import ngrams

timestamp = time()
a = [1, 2, 3, 4, 5, 6, 7, 8, 1, 2]
single_grams = ngrams(a, 3)

# `data` is the list of distinct n-grams, `count` holds their frequencies,
# and `point_int` maps each n-gram to its index in those lists
data = []
point_int = {}
count = []
index_count = 0
for grams in single_grams:
    try:
        if point_int[grams] >= 0:
            passing = point_int[grams]
    except KeyError:
        passing = -1
    if passing >= 0:
        ind = passing
        count[ind] = count[ind] + 1
    else:
        data.append(grams)
        count.append(1)
        point_int[grams] = index_count
        index_count = index_count + 1

print(point_int)
print(count)
print(time() - timestamp)
print(data)
# Generate and filter the tokens of each file
data[file]['tokens'] = tokenize.word_tokenize(text)
data[file]['tokens'] = [
    t.lower() for t in data[file]['tokens'] if t.lower() not in stopwords
]

# Compute token frequency data
data[file]['freq_tokens'] = nltk.FreqDist(data[file]['tokens'])

# The 15 most frequent tokens
top15 = data[file]['freq_tokens'].most_common(15)
data[file]['freq_tokens_top15'] = top15

# Compute bigram frequency data
bigram = ngrams(data[file]['tokens'], 2)
data[file]['freq_bigrams'] = nltk.FreqDist(bigram)

# The 15 most frequent bigrams
top15 = data[file]['freq_bigrams'].most_common(15)
data[file]['freq_bigrams_top15'] = top15

# Frequency data for 4-grams containing the word "life"
quadrigram = [ng for ng in ngrams(data[file]['tokens'], 4) if 'life' in ng]
data[file]['freq_quadrigrams_life'] = nltk.FreqDist(quadrigram)

# Print the frequency of the words 'the' and 'that'
print('\n{:20s} {:35s} {}'.format('File', 'Token', 'Frequency'))
for word in ['the', 'that']:
    freq = data[file]['freq_tokens'][word]
    print('{:20s} {:35s} {:03}'.format(file, word, freq))
# SnowballStemmer
sStem = SnowballStemmer('english')
print("Snowball Stemming : \n")
for i in tokens[0:50]:
    print(sStem.stem(str(i)))

# PorterStemmer
pStem = PorterStemmer()
print("Porter Stemming : \n")
for i in tokens[0:50]:
    print(pStem.stem(str(i)))

# POS tagging
print("Part of Speech Tagging :\n", pos_tag(word_tokenize(text)))

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatization :\n")
for tok in tokens[0:50]:
    print(lemmatizer.lemmatize(str(tok)))

# Trigrams (note: applying ngrams to each token string yields character trigrams)
print("Trigrams :\n")
trigram = []
for x in tokens[0:20]:
    trigram.append(list(ngrams(x, 3)))
print(trigram)

# Named Entity Recognition
print("NER : \n", ne_chunk(pos_tag(wordpunct_tokenize(str(tokens)))))
def getUnigrams(tokens):
    LOGGER.debug("Unigrams...")
    return [g for g in ngrams(tokens, 1)]
def compute(sent: str, k: int) -> 'DistMetric':
    token_set = set()
    for token in ngrams(sent.split(), k):
        token_set.add(token)
    return DistMetric(len(token_set))
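# Minimal usage sketch for compute above, assuming DistMetric simply wraps the
# distinct k-gram count (the usual distinct-n diversity metric for generated
# text) and that compute is callable as a plain function, e.g. a @staticmethod.
metric = compute("the cat the cat sat", 2)
# 3 distinct bigrams: ('the', 'cat'), ('cat', 'the'), ('cat', 'sat')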