def generate_trigrams(self):
    """Append the scored corpus trigrams to ``self.trigrams``.

    Trigram counts are collected for ``self.corpus_tokens`` and
    ``self.contrast_tokens``; both unfiltered distributions are handed to
    ``self.generate_corpus``. Trigrams seen fewer than 3 times are then
    dropped and the remaining corpus trigrams are scored with
    ``self.compute_tf_idf``. Every trigram stored under a non-zero key of
    the result is appended to ``self.trigrams`` as one string in which each
    word is followed by a single space (so the string ends with a space).
    """
    corpus_finder = TrigramCollocationFinder.from_words(self.corpus_tokens)
    corpus_size = corpus_finder.N
    contrast_finder = TrigramCollocationFinder.from_words(
        self.contrast_tokens)
    # The reference corpus is built before rare trigrams are filtered out.
    reference = self.generate_corpus(corpus_finder.ngram_fd,
                                     contrast_finder.ngram_fd)
    corpus_finder.apply_freq_filter(3)
    contrast_finder.apply_freq_filter(3)
    scores = self.compute_tf_idf(corpus_finder.ngram_fd, corpus_size,
                                 reference)
    # presumably `scores` maps a score to the trigrams sharing it -- TODO confirm
    for score in scores:
        if score == 0.0:
            continue
        for trigram in scores[score]:
            self.trigrams.append(''.join(word + ' ' for word in trigram))
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find, print and return the top PMI-ranked n-gram collocations.

    Args:
        words: Token sequence the collocation finder is built from.
        sents: Sentences passed to ``NgramCollocationExtender`` (used only
            when ``n >= 4``).
        n: Size of the n-grams. 2 -> bigrams, 3 -> trigrams, >= 4 ->
            trigrams extended through ``NgramCollocationExtender``.
        support: Minimum frequency an n-gram needs to be kept.
        topK: Number of best n-grams requested from the finder.

    Returns:
        The n-grams that were printed with ``print_ngrams``.

    Raises:
        ValueError: If ``n`` is not 2, 3 or >= 4. (Previously this surfaced
            as an ``UnboundLocalError`` because no finder was created.)
    """
    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        # The current collocation measure is PMI.
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support / 3,
                                              0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    if n == 2:
        finder_cls, measures_cls = (BigramCollocationFinder,
                                    BigramAssocMeasures)
    elif n == 3:
        finder_cls, measures_cls = (TrigramCollocationFinder,
                                    TrigramAssocMeasures)
    else:
        raise ValueError("n must be 2, 3 or >= 4, got %r" % (n,))

    finder = finder_cls.from_words(words)
    ngram_measures = measures_cls()
    finder.apply_freq_filter(support)
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
    print_ngrams(pmi_ngrams)
    return pmi_ngrams
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find, print and return the top PMI-ranked n-gram collocations.

    Args:
        words: Token sequence the collocation finder is built from.
        sents: Sentences passed to ``NgramCollocationExtender`` (used only
            when ``n >= 4``).
        n: Size of the n-grams. 2 -> bigrams, 3 -> trigrams, >= 4 ->
            trigrams extended through ``NgramCollocationExtender``.
        support: Minimum frequency an n-gram needs to be kept.
        topK: Number of best n-grams requested from the finder.

    Returns:
        The n-grams that were printed with ``print_ngrams``.

    Raises:
        ValueError: If ``n`` is not 2, 3 or >= 4. (Previously this surfaced
            as an ``UnboundLocalError`` because no finder was created.)
    """
    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        # The current collocation measure is PMI.
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support/3,
                                              0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    elif n == 3:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
    else:
        raise ValueError("n must be 2, 3 or >= 4, got %r" % (n,))

    finder.apply_freq_filter(support)
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
    print_ngrams(pmi_ngrams)
    return pmi_ngrams
def best_ngrams(words, top_n=1000, min_freq=100):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very
    scalable!

    Trigrams are ranked with the chi-squared measure, bigrams with PMI.
    Every token is passed through `re.escape` before it is placed in the
    pattern, so tokens containing regex metacharacters (e.g. "c++") are
    matched literally instead of corrupting the alternation.

    Return the detected ngrams as a pair of compiled regular expressions
    `(bigram_pattern, trigram_pattern)`, for their faster detection later
    on. If no n-gram survives the filter the pattern is `()`, which matches
    the empty string.
    """
    def compile_alternation(ngrams):
        # Escape token by token so the separating space stays a plain space.
        alternatives = [' '.join(re.escape(tok) for tok in gram)
                        for gram in ngrams]
        return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    logging.info("%i trigrams found: %s...", len(trigrams),
                 [' '.join(w) for w in trigrams[:20]])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    logging.info("%i bigrams found: %s...", len(bigrams),
                 [' '.join(w) for w in bigrams[:20]])

    pat_gram2 = compile_alternation(bigrams)
    pat_gram3 = compile_alternation(trigrams)
    return pat_gram2, pat_gram3
def __init__(self, words, sentences, language):
    """Compute summary statistics for a tokenised text.

    Args:
        words: Sequence of tokens of the whole text.
        sentences: Sequence of sentences; each sentence is joined with
            spaces when stored, so token lists are the expected shape
            (plain strings keep working as before).
        language: Language name passed to ``stopwords.words``.

    Attributes set:
        num_words, unique_words, num_sentences: Basic counts.
        average_sentence_length: Rounded words-per-sentence ratio
            (0 when there are no sentences).
        lexical_diversity: Rounded ``num_words / unique_words`` ratio
            (0 when there are no words).
        fifty_first_words: The 50 most common tokens.
        hundreds_nsw: The 300 most common tokens that are not stop words.
        fifty_collocations / fifty_collocations3: Top 50 PMI-ranked
            bigrams / trigrams occurring at least 10 times.
        stcs_width_words: Space-joined sentences mentioning
            "malheureusement".
    """
    marker = "malheureusement"

    def mentions_marker(sent):
        # Strings keep the original substring test; token lists (which have
        # no .lower()) are checked token by token instead of crashing.
        if hasattr(sent, 'lower'):
            return marker in sent.lower()
        return any(token.lower() == marker for token in sent)

    self.num_words = len(words)
    self.unique_words = len(set(words))
    self.num_sentences = len(sentences)
    # Guard both ratios so an empty text does not raise ZeroDivisionError.
    self.average_sentence_length = (
        round(self.num_words / self.num_sentences)
        if self.num_sentences else 0)
    self.lexical_diversity = (
        round(self.num_words / self.unique_words)
        if self.unique_words else 0)

    fdist = FreqDist(words)
    # A set makes the per-token membership test constant time.
    stop_words = set(stopwords.words(language))
    not_stopwords = [w for w in words if w not in stop_words]
    fdist2 = FreqDist(not_stopwords)
    self.fifty_first_words = fdist.most_common(50)
    self.hundreds_nsw = fdist2.most_common(300)

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(10)
    self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(10)
    self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

    self.stcs_width_words = [' '.join(sent) for sent in sentences
                             if mentions_marker(sent)]
def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very
    scalable!

    Trigrams are ranked with the chi-squared measure, bigrams with PMI.
    Every token is passed through `re.escape` before it is placed in the
    pattern, so tokens containing regex metacharacters are matched
    literally instead of corrupting the alternation.

    Return the detected ngrams as a pair of compiled regular expressions
    `(bigram_pattern, trigram_pattern)`, for their faster detection later
    on. The compiled bigram pattern is also printed to stdout.
    """
    def compile_alternation(ngrams):
        # Escape token by token so the separating space stays a plain space.
        alternatives = [' '.join(re.escape(tok) for tok in gram)
                        for gram in ngrams]
        return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    logging.info("%i trigrams found: %s...", len(trigrams),
                 [' '.join(w) for w in trigrams[:20]])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    logging.info("%i bigrams found: %s...", len(bigrams),
                 [' '.join(w) for w in bigrams[:20]])

    pat_gram2 = compile_alternation(bigrams)
    pat_gram3 = compile_alternation(trigrams)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(pat_gram2)
    return pat_gram2, pat_gram3
def find_ngrams(data, PATH_SW=None, ntopbg=10, ntoptg=10):
    '''Find top occurring bigrams and trigrams in a corpus

    Parameters
        data: list of strings (each string in list is a document in corpus)
        PATH_SW: path to stop words file, one word per line. When None
                 (the default) no stop-word filtering is applied; the
                 previous code crashed trying to open None.
        ntopbg: how many bigrams to return
        ntoptg: how many trigrams to return
    Returns
        topbg: list of tuples containing top bigrams
        toptg: list of tuples containing top trigrams

    Both rankings use the likelihood-ratio measure. Trigrams seen fewer
    than 3 times are discarded; bigrams have no frequency threshold.
    '''
    long_string = ' '.join(data)
    tokenizer = RegexpTokenizer(r'[\w]+')
    words = tokenizer.tokenize(long_string)

    bef = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)

    if PATH_SW is not None:
        with open(PATH_SW, 'r') as f:
            # Remove all whitespace (including the newline) from each line.
            stopset = set(re.sub(r'\s', '', line) for line in f)

        def filter_stops(w):
            return w in stopset

        bef.apply_word_filter(filter_stops)
        tcf.apply_word_filter(filter_stops)

    tcf.apply_freq_filter(3)
    topbg = bef.nbest(BigramAssocMeasures.likelihood_ratio, ntopbg)
    toptg = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, ntoptg)
    return topbg, toptg
def get_frequencies(self, desc):
    """Print and return the most frequent words, bigrams and trigrams.

    Args:
        desc: Text to analyse; it is split with ``word_tokenize``.

    Returns:
        ``[single, bigrm, trigrm]`` where ``single`` is the 20 most common
        ``(word, count)`` pairs, ``bigrm`` the 15 best bigrams and
        ``trigrm`` the 10 best trigrams (likelihood-ratio ranking).
        English stop words and tokens shorter than 3 characters are
        excluded everywhere. Each list is also printed under a banner.
    """
    stopset = set(stopwords.words('english'))

    def filter_stops(w):
        return len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    # print(...) with one argument behaves the same on Python 2 and 3.
    print('------gram--------')
    words_to_count = [word for word in words if not filter_stops(word)]
    single = Counter(words_to_count).most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    # Only keep trigrams that appear at least 3 times.
    tcf.apply_freq_filter(3)
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    matches = [single, bigrm, trigrm]
    return matches
def create_trigram_finder(tokenized_docs, should_filter=False):
    """Build a trigram collocation finder over a set of documents.

    Args:
        tokenized_docs: Iterable of documents handed to
            ``TrigramCollocationFinder.from_documents``.
        should_filter: When true, every document is first passed through
            ``trigram_prep``.

    Returns:
        The ``TrigramCollocationFinder`` built from the documents.
    """
    documents = tokenized_docs
    if should_filter:
        documents = [trigram_prep(document) for document in tokenized_docs]
    return TrigramCollocationFinder.from_documents(documents)
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Write the top PMI-ranked collocations of a text file to another file.

    Args:
        inp: Path of the input text file.
        outp: Path of the output file; one collocation per line, words
            separated by tabs.
        freq_filter: Minimum n-gram frequency (converted with ``int``).
        results: Number of collocations to write (converted with ``int``).
        coll_type: ``'bigram'`` selects bigrams; any other value selects
            trigrams.
        pos: The string ``'true'`` means the input is space separated
            ``word/TAG`` tokens, of which only the part before the first
            ``/`` is kept; otherwise the text is sentence- and
            word-tokenised with NLTK.

    Trigram results used to crash on a two-element unpacking; each n-gram
    is now written with as many tab separated columns as it has words.
    Tokens without a ``/`` in POS mode are kept as they are instead of
    raising ``ValueError``.
    """
    pos = bool(pos == 'true')

    with open(inp, 'r') as fd:
        content = fd.read()

    all_words = []
    if pos:
        # The element after the last space is dropped, as before.
        tagged_tokens = content.split(' ')[:-1]
        all_words = [token.partition('/')[0].strip(' ').strip('\n')
                     for token in tagged_tokens]
    else:
        for sent in nltk.sent_tokenize(content):
            all_words += nltk.word_tokenize(sent)

    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))

    # score the ngrams and get the first N
    colls = finder.score_ngrams(measures.pmi)[:int(results)]

    with open(outp, 'w') as output:
        for ngram, _score in colls:
            output.write("\t".join(ngram) + "\n")
def generate_trigrams(tokens):
    """Return the 10 best trigram collocations of ``tokens``.

    Words shorter than 3 characters, or whose lowercase form is in the
    module-level ``stoplist``, disqualify a trigram. Ranking uses the
    likelihood-ratio measure.
    """
    def is_noise(word):
        return len(word) < 3 or word.lower() in stoplist

    measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens, window_size=3)
    finder.apply_word_filter(is_noise)
    # A threshold of 1 keeps every trigram that occurs at all.
    finder.apply_freq_filter(1)
    return finder.nbest(measures.likelihood_ratio, 10)
def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams), from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words

    top_n: int
        Number of collocations to retrieve from the stream of words.
        Trigrams are ranked by the chi-squared measure and bigrams by PMI.
        Default is 10000

    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.

    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each one alternation of the retained n-grams. Tokens
        are escaped with ``re.escape`` so regex metacharacters inside a
        token are matched literally.
    """
    def compile_alternation(ngrams):
        # Escape token by token so the separating space stays a plain space.
        alternatives = [' '.join(re.escape(tok) for tok in gram)
                        for gram in ngrams]
        return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)

    tcf = TrigramCollocationFinder.from_words(stream)
    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    logging.info("%i trigrams found: %s...", len(trigrams),
                 [' '.join(w) for w in trigrams[:20]])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    logging.info("%i bigrams found: %s...", len(bigrams),
                 [' '.join(w) for w in bigrams[:20]])

    bigrams_patterns = compile_alternation(bigrams)
    trigrams_patterns = compile_alternation(trigrams)
    return bigrams_patterns, trigrams_patterns
def get_trigrams(self, size):
    """Write the ``size`` most frequent trigrams of ``self.word_set`` to CSV.

    The file is named ``<disease_type>-trigram-freq-<size>.csv``. When that
    name contains ``'training'`` it is recorded in
    ``self.full_training_trigram_filename``, otherwise in
    ``self.full_test_trigram_filename``. (The training name used to be
    stored in a local variable only, so it never reached the instance.)

    Rows are ``[trigram, count]`` ordered by descending count, ties broken
    by the trigram itself.

    Args:
        size: Number of trigrams to write.

    Returns:
        ``(training_filename, test_filename)``; an entry is ``None`` when
        that attribute has not been set on the instance.
    """
    file_name = self.disease_type + '-trigram-freq-' + str(size)
    csv_name = file_name + '.csv'
    if 'training' in file_name:
        self.full_training_trigram_filename = csv_name
    else:
        self.full_test_trigram_filename = csv_name

    finder = TrigramCollocationFinder.from_words(self.word_set)
    sorted_trigrams = sorted(
        finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:size]

    # The context manager guarantees the rows are flushed and the file closed.
    with open(csv_name, 'w') as handle:
        file_trigrams = csv.writer(handle)
        for trigram_tuple, count in sorted_trigrams:
            # NOTE(review): encoding to UTF-8 bytes suits Python 2; on
            # Python 3 the cells would show b'...' literals -- confirm the
            # target interpreter.
            file_trigrams.writerow([
                type(trigram_tuple)(x.encode('utf-8') for x in trigram_tuple),
                count
            ])

    return (getattr(self, 'full_training_trigram_filename', None),
            getattr(self, 'full_test_trigram_filename', None))
def get_frequencies(self, desc):
    """Print and return the most frequent words, bigrams and trigrams.

    Args:
        desc: Text to analyse; it is split with ``word_tokenize``.

    Returns:
        ``[single, bigrm, trigrm]`` where ``single`` is the 20 most common
        ``(word, count)`` pairs, ``bigrm`` the 15 best bigrams and
        ``trigrm`` the 10 best trigrams (likelihood-ratio ranking).
        English stop words and tokens shorter than 3 characters are
        excluded everywhere. Each list is also printed under a banner.
    """
    stopset = set(stopwords.words('english'))

    def filter_stops(w):
        return len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    # print(...) with one argument behaves the same on Python 2 and 3.
    print('------gram--------')
    words_to_count = [word for word in words if not filter_stops(word)]
    c = Counter(words_to_count)
    single = c.most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    # Only keep trigrams that appear at least 3 times.
    tcf.apply_freq_filter(3)
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    return [single, bigrm, trigrm]
def predict(test_string, models):
    """Score ``test_string`` against every model and rank the models.

    The string is cleaned with ``pre_processing`` and turned into bigram,
    trigram and quadgram counts. For each n-gram found in a model, that
    model's running score grows by ``freq * 10000 / total_ngram``.

    Args:
        test_string: Text to classify.
        models: Sequence whose items are indexed as ``[0]`` name, ``[1]``
            container of n-grams (supports ``in`` and indexing) and ``[2]``
            total n-gram count.

    Returns:
        ``(0, message)`` when the highest accumulated score is zero,
        otherwise ``(1, ranking)`` where ``ranking`` is the list of
        ``(normalize_score(score, all_scores), model_name)`` pairs sorted
        in descending order.
    """
    # clean string
    test_string = pre_processing(test_string)
    bi_test = BigramCollocationFinder.from_words(test_string)
    tri_test = TrigramCollocationFinder.from_words(test_string)
    quad_test = QuadgramCollocationFinder.from_words(test_string)
    # One flat list of (ngram, count) pairs covering all three sizes.
    final_test = list(bi_test.ngram_fd.items()) + list(
        tri_test.ngram_fd.items()) + list(quad_test.ngram_fd.items())

    model_name = []
    for model in models:
        model_name.append(model[0])

    # One accumulator per model, in the same order as `models`.
    freq_sum = np.zeros(len(models))
    for ngram, freq in final_test:
        exists = 0
        for i, lang_model in enumerate(models):
            lang = lang_model[0]
            model = lang_model[1]
            total_ngram = lang_model[2]

            if ngram in model:
                if DEBUG: print("Found", ngram, model[ngram], lang, total_ngram)
                # normalizing to prevent freq/total to be zero
                freq_sum[i] = freq_sum[i] + (freq * 10000) / total_ngram
                # NOTE(review): this binds `exist`, not `exists`, so the
                # flag tested below is never set and the `+= 1` always
                # runs. Confirm the intended smoothing (and the intended
                # placement of the check below) before changing it.
                exist = 1
            if not exists:
                freq_sum[i] += 1

    # NOTE(review): max_val and index are computed but not used below.
    max_val = freq_sum.max()
    index = freq_sum.argmax()
    if not max(freq_sum):
        if DEBUG: print("[ERROR] Invalid string. String: {}".format(test_string))
        return 0, "Hmm, I do not know this word. Please try other words."

    # get highest score and normalize it to be between 0,1}
    _max = 0  # NOTE(review): unused
    freq_to_model = list(zip(freq_sum, model_name))
    scores = [x for x, y in freq_to_model]
    normalized_scores_name = [(normalize_score(f, scores), m)
                              for f, m in freq_to_model]
    sorted_score_model = sorted(normalized_scores_name, reverse=True)
    if DEBUG: print("[DEBUG] Frequency to model: {}".format(freq_to_model))
    if DEBUG: print("[DEBUG] Scores: {}".format(scores))
    if DEBUG: print("[DEBUG] Normalized scores name: {}".format(
        normalized_scores_name))
    if DEBUG: print("[DEBUG] Reverse sorted score model: {}".format(
        sorted_score_model))
    return 1, sorted_score_model
def create_tri_collocations(features_words, document_preprocess):
    """Return preprocessed trigram collocations of the movie-review corpus.

    The 1000 best PMI-ranked trigrams (minimum frequency 3) of
    ``movie_reviews.words()`` are passed through each callable of
    ``document_preprocess`` in turn: every word is replaced by the
    callable's result and a trigram is dropped as soon as one of its words
    maps to a falsy value.

    Args:
        features_words: Not used by this function.
        document_preprocess: Iterable of one-argument callables.

    Returns:
        List of 3-tuples of preprocessed words.
    """
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    collocations = finder.nbest(trigram_measures.pmi, 1000)

    for transform in document_preprocess:
        surviving = []
        for first, second, third in collocations:
            if transform(first) and transform(second) and transform(third):
                surviving.append(
                    (transform(first), transform(second), transform(third)))
        collocations = surviving
    return collocations
def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams), from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words

    top_n: int
        Number of collocations to retrieve from the stream of words.
        Trigrams are ranked by the chi-squared measure and bigrams by PMI.
        Default is 10000

    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.

    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each one alternation of the retained n-grams. Tokens
        are escaped with ``re.escape`` so regex metacharacters inside a
        token are matched literally.
    """
    def as_pattern(ngrams):
        # Escape token by token so the separating space stays a plain space.
        escaped = (' '.join(re.escape(tok) for tok in gram)
                   for gram in ngrams)
        return re.compile('(%s)' % '|'.join(escaped), re.UNICODE)

    tcf = TrigramCollocationFinder.from_words(stream)
    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    logging.info("%i trigrams found: %s...", len(trigrams),
                 [' '.join(w) for w in trigrams[:20]])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    logging.info("%i bigrams found: %s...", len(bigrams),
                 [' '.join(w) for w in bigrams[:20]])

    return as_pattern(bigrams), as_pattern(trigrams)
def train_language(language, training_path):
    """Build and store the character n-gram model of one language.

    ``filter_words`` fills a list from ``training_path``; its items are
    concatenated (with one leading space) into a single string, so the
    collocation finders iterate over characters. Bigrams, trigrams and
    quadgrams occurring at least ``FREQ_FILTER`` times are each sorted by
    descending count and concatenated in that order. The result is saved
    to ``MODELS_PATH + language + '.npy'`` and a confirmation is printed.

    Args:
        language: Name used for the output file.
        training_path: Location handed to ``filter_words``.
    """
    words = []
    filter_words(training_path, words)
    seq = ' ' + ''.join(words)

    def by_count(item):
        return item[1]

    final_model = []
    finder_classes = (BigramCollocationFinder, TrigramCollocationFinder,
                      QuadgramCollocationFinder)
    for finder_class in finder_classes:
        finder = finder_class.from_words(seq)
        finder.apply_freq_filter(FREQ_FILTER)
        final_model = final_model + sorted(
            finder.ngram_fd.items(), key=by_count, reverse=True)

    # NOTE(review): the saved list mixes tuples and ints; check that the
    # installed NumPy still accepts such ragged input in np.save.
    target = MODELS_PATH + language + '.npy'
    np.save(target, final_model)
    print("Language model for {} stored at {}".format(language, target))
def _get_trigrams(words, top_n, min_freq):
    """Compile a regex that matches the most salient trigrams of ``words``.

    Args:
        words: Iterable of tokens.
        top_n: Number of trigrams to keep, ranked by the chi-squared
            measure.
        min_freq: Minimum number of occurrences of a trigram.

    Returns:
        Compiled ``re.UNICODE`` pattern: one capturing group with the
        space-joined trigrams as alternatives. Tokens are escaped with
        ``re.escape`` so regex metacharacters inside a token are matched
        literally. With no surviving trigram the pattern is ``()``, which
        matches the empty string.
    """
    tcf = TrigramCollocationFinder.from_words(iter(words))
    tcf.apply_freq_filter(min_freq)
    # Escape token by token so the separating space stays a plain space.
    trigrams = [
        ' '.join(re.escape(tok) for tok in w)
        for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    ]
    return re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)
def tagged_trigram_collocation(document):
    """Print the trigrams of ``document`` whose POS tags fit a noun pattern.

    The text is tokenised and POS-tagged, tokens are lower-cased, and
    blank tokens, English stop words and tokens of 3 characters or fewer
    are removed. A trigram is kept only when the tags of its first two
    words are in the conjunction/noun/preposition/adjective group and the
    tag of its last word is a noun tag. Each kept trigram is printed as
    its three ``(word, tag)`` pairs followed by its count.
    """
    leading_tags = ('CC', 'NN', 'NNS', 'NNP', 'NNPS', 'IN', 'JJ', 'JJR',
                    'JJS')
    final_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    allowed_tags = (leading_tags, leading_tags, final_tags)

    def violates_pattern(first, second, third):
        # The finder removes every trigram for which this returns True.
        members = (first, second, third)
        return not all(tagged[1] in tags
                       for tagged, tags in zip(members, allowed_tags))

    # Tokenise, then attach a part-of-speech tag to every token.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(document))
    # English stop words.
    stop_words = stopwords.words("english")

    kept = []
    for pair in tagged_tokens:
        # Lower-case the word, keep its tag.
        lowered = (pair[0].lower(), pair[1])
        if (lowered[0].strip() and lowered[0] not in stop_words
                and len(lowered[0]) > 3):
            kept.append(lowered)

    trigram_finder = TrigramCollocationFinder.from_words(kept)
    trigram_finder.apply_ngram_filter(violates_pattern)
    for (first, second, third), count in trigram_finder.ngram_fd.items():
        print(first, second, third, count)
def trigramFeats(thesewords, n=100):
    """Build a presence-feature dict from word triples and their trigrams.

    ``thesewords`` is consumed three items at a time; each group becomes
    one string of the three words separated by single spaces (missing
    words at the end are replaced by empty strings). Collocation trigrams
    are then computed over these strings.

    Args:
        thesewords: Iterable of word strings.
        n: Used both as the minimum trigram frequency and as the number of
            best trigrams (likelihood-ratio ranking) to keep.

    Returns:
        Dict mapping every group string and every selected trigram to
        ``True``.
    """
    stream = iter(thesewords)
    groups = []
    for head in stream:
        middle = next(stream, '')
        tail = next(stream, '')
        groups.append(head + " " + middle + " " + tail)

    finder = TrigramCollocationFinder.from_words(groups)
    finder.apply_freq_filter(n)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, n)
    return dict.fromkeys(itertools.chain(groups, best), True)
def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500,
            freq=1):
    """Count the best trigram collocations of ``words``.

    Args:
        words: Sized sequence of hashable word strings.
        score_fn: Association measure used to rank trigrams
            (likelihood ratio by default).
        n: Number of best trigrams to keep.
        freq: Minimum frequency a trigram needs to be considered.

    Returns:
        Dict mapping the concatenation of a trigram's three words (no
        separator) to the number of selected trigrams producing that key.
        Empty when ``words`` is empty or has fewer than 3 distinct words.
    """
    if len(words) <= 0:
        return {}
    # Fewer than three distinct words: nothing meaningful to score.
    if len(set(words)) < 3:
        return {}

    # Turn the text into three-word collocations.
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_freq_filter(freq)
    # Keep the `n` highest ranked trigrams according to `score_fn`.
    trigrams = trigram_finder.nbest(score_fn, n)

    res = {}
    for s in trigrams:
        key = s[0] + s[1] + s[2]
        # dict.get replaces the Python-2-only dict.has_key lookup.
        res[key] = res.get(key, 0) + 1
    return res
def set_trigramas(self, freq=2, best=20):
    """Store the best PMI-ranked trigrams of ``self.palavras``.

    Trigrams containing a Portuguese stop word or a word shorter than 3
    characters are discarded, as are trigrams seen fewer than ``freq``
    times. The result is assigned to ``self.trigramas``.

    Args:
        freq: Minimum trigram frequency.
        best: Number of trigrams to keep.
    """
    portuguese_stops = set(stopwords.words('portuguese'))

    def is_filtered(word):
        return len(word) < 3 or word in portuguese_stops

    finder = TrigramCollocationFinder.from_words(self.palavras)
    finder.apply_word_filter(is_filtered)
    finder.apply_freq_filter(freq)
    self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
def extract_trigrams(self, sent):
    """Return presence features for the known trigrams found in ``sent``.

    The sentence is run through ``self._preprocess_sent``; up to 10000
    PMI-ranked trigrams are joined with spaces and intersected with
    ``self._trigrams_set``.

    Returns:
        Dict mapping each matching trigram string to ``True``.
    """
    tokens = self._preprocess_sent(sent)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    candidates = set()
    for trigram in finder.nbest(measures.pmi, 10000):
        candidates.add(' '.join(trigram))
    known = candidates & self._trigrams_set
    return dict.fromkeys(known, True)
def calc_trigrams(text, min_freq=50):
    """Append the trigram frequencies of ``text`` to ``trigram_list``.

    Every item of ``text`` is lower-cased, trigrams seen fewer than
    ``min_freq`` times are dropped, and the remaining
    ``(trigram, count)`` items are appended as one entry to the
    module-level ``trigram_list``.

    Returns:
        The module-level ``trigram_list`` (it accumulates across calls).
    """
    lowered = [token.lower() for token in text]
    finder = TrigramCollocationFinder.from_words(lowered)
    finder.apply_freq_filter(min_freq)
    trigram_list.append(finder.ngram_fd.items())
    return trigram_list
def set_trigramas(self,freq=2,best=20):
    """Store the best PMI-ranked trigrams of ``self.palavras``.

    Trigrams containing a Portuguese stop word or a word shorter than 3
    characters are discarded, as are trigrams seen fewer than ``freq``
    times. The result is assigned to ``self.trigramas``.

    Args:
        freq: Minimum trigram frequency.
        best: Number of trigrams to keep.
    """
    ignored = set(stopwords.words('portuguese'))

    def should_drop(token):
        if len(token) < 3:
            return True
        return token in ignored

    finder = TrigramCollocationFinder.from_words(self.palavras)
    finder.apply_word_filter(should_drop)
    finder.apply_freq_filter(freq)
    self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq,
                            n=trigram_feature_number):
    """Return presence features for the ``n`` best trigrams of ``words``.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used for ranking (chi-squared by
            default).
        n: Number of trigrams to keep.

    Returns:
        Dict mapping each selected trigram tuple to ``True``.
    """
    finder = TrigramCollocationFinder.from_words(words)
    return dict.fromkeys(finder.nbest(score_fn, n), True)
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=100):
    """Return the best trigrams of a review followed by bigram features.

    ``words`` is normalised with ``text_process`` and its ``n`` best
    trigrams are selected with ``score_fn`` (chi-squared by default),
    e.g. "The book is good" yields (the book is), (book is good).

    Returns:
        List of the selected trigram tuples followed by the result of
        ``bigram_word_feats(review_class)``.
    """
    tokens = text_process(words)
    finder = TrigramCollocationFinder.from_words(tokens)
    top_trigrams = finder.nbest(score_fn, n)
    # NOTE(review): `review_class` is not a parameter of this function; it
    # is resolved from the enclosing module -- confirm that `words` was not
    # intended here.
    bigram_features = bigram_word_feats(review_class)
    return list(top_trigrams) + bigram_features
def extract_trigrams(self, sent):
    """Return presence features for the known trigrams found in ``sent``.

    The sentence is run through ``self._preprocess_sent``; up to 10000
    PMI-ranked trigrams are joined with spaces and intersected with
    ``self._trigrams_set``.

    Returns:
        Dict mapping each matching trigram string to ``True``.
    """
    prepared = self._preprocess_sent(sent)
    pmi = TrigramAssocMeasures().pmi
    ranked = TrigramCollocationFinder.from_words(prepared).nbest(pmi, 10000)
    joined = {' '.join(parts) for parts in ranked}
    features = {}
    for phrase in joined & self._trigrams_set:
        features[phrase] = True
    return features
def tri(text):
    """Print and return the trigrams of ``text`` seen at least 30 times.

    The text is split with ``word_tokenize``. The ``(trigram, count)``
    items and their number are printed before being returned.

    The former ``finder.nbest(pmi, 200)`` call was removed: its result was
    discarded, so it only cost a full scoring pass.

    Returns:
        The ``items()`` of the filtered trigram frequency distribution.
    """
    finder = TrigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(30)
    # Fetch the items once and reuse them for printing and returning.
    items = finder.ngram_fd.items()
    print(items)
    print(len(items))
    return items
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None): """collects bigrams and trigrams from collection of documents. Input to collocation tokenizer. bigrams are pairs of words that recur in the collection; trigrams are triplets. Parameters ---------- raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str)) body of documents to examine top_n : int limit results to this many entries min_length : int Minimum length of any single word min_freqs : iterable of int threshold of when to consider a pair of words as a recognized n-gram, starting with bigrams. stopwords : None or iterable of str Collection of words to ignore as tokens Examples -------- >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2]) >>> patterns[0].pattern u'(frank swank|swank tank|sassy unicorns)' >>> patterns[1].pattern u'(frank swank tank)' """ from nltk.collocations import TrigramCollocationFinder from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures # generator of documents, turn each element to its list of words doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords) for doc_id, doc_text in raw_corpus) # generator, concatenate (chain) all words into a single sequence, lazily words = itertools.chain.from_iterable(doc_texts) tcf = TrigramCollocationFinder.from_words(iter(words)) bcf = tcf.bigram_finder() bcf.apply_freq_filter(min_freqs[0]) bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)] tcf.apply_freq_filter(min_freqs[1]) trigrams = [ ' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n) ] bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE) trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE) return bigrams_patterns, trigrams_patterns
def create_word_features(words):
    """Return presence features for the words and their scored trigrams.

    Trigrams of ``words`` are scored by raw frequency; each resulting
    ``(trigram, score)`` pair becomes a key alongside every single word.

    Returns:
        Dict mapping each word and each ``(trigram, score)`` pair to
        ``True``.
    """
    measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(words)
    scored_trigrams = finder.score_ngrams(measures.raw_freq)
    return dict.fromkeys(itertools.chain(words, scored_trigrams), True)
def best_n_trigrams(self, n, method="pmi"):
    """Return the ``n`` best trigrams of ``self.get_word_lst()``.

    Args:
        n: Number of trigrams to return.
        method: Ranking measure, ``"pmi"`` or ``"raw_freq"``.

    Returns:
        List of the ``n`` highest ranked trigram tuples.

    Raises:
        ValueError: If ``method`` is not a supported measure. (An unknown
            method used to return ``None`` silently.)
    """
    supported = ("pmi", "raw_freq")
    if method not in supported:
        raise ValueError(
            "method must be one of %r, got %r" % (supported, method))
    trigram_measures = TrigramAssocMeasures()
    tokens = self.get_word_lst()
    finder = TrigramCollocationFinder.from_words(tokens)
    return finder.nbest(getattr(trigram_measures, method), n)
def get_top_trigrams(corpus, top_n=100):
    '''
    Most frequent tri-gram detection.

    Each item of `corpus` is split on whitespace and treated as one
    document; the `top_n` trigrams with the highest raw frequency are
    returned as tuples.
    '''
    documents = [text.split() for text in corpus]
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_documents(documents)
    top_trigrams = finder.nbest(measures.raw_freq, top_n)
    return top_trigrams
def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq,
                            n=200):
    """Combine trigram, bigram and single-word presence features.

    The ``n`` best trigrams of ``words`` (ranked with ``score_fn``) are
    mapped to ``True``; the dict is then updated with
    ``best_bigram_word_feats(words)`` and ``best_word_feats(words)``, in
    that order.

    Returns:
        The merged feature dict.
    """
    finder = TrigramCollocationFinder.from_words(words)
    features = dict.fromkeys(finder.nbest(score_fn, n), True)
    for extra in (best_bigram_word_feats, best_word_feats):
        features.update(extra(words))
    return features
def bag_of_ngram_words(words, bscore_fn=BigramAssocMeasures.chi_sq,
                       tscore_fn=TrigramAssocMeasures.chi_sq, n=200):
    """Build a bag of words from tokens plus their best bigrams/trigrams.

    Args:
        words: List of tokens (it is concatenated with the n-gram lists).
        bscore_fn: Measure used to rank bigrams.
        tscore_fn: Measure used to rank trigrams.
        n: Number of bigrams and of trigrams to keep.

    Returns:
        Whatever ``bag_of_words`` returns for the combined list.
    """
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(
        bscore_fn, n)
    top_trigrams = TrigramCollocationFinder.from_words(words).nbest(
        tscore_fn, n)
    combined = words + top_bigrams + top_trigrams
    return bag_of_words(combined)
def getTrigram(haystack):
    """Return the 4 best trigram collocations of a text.

    The text is split with ``WordPunctTokenizer``. Trigrams containing an
    English stop word or a token shorter than 3 characters are discarded;
    the rest are ranked with the likelihood-ratio measure.
    """
    english_stops = set(stopwords.words('english'))

    def is_filtered(token):
        return len(token) < 3 or token in english_stops

    tokens = WordPunctTokenizer().tokenize(haystack)
    finder = TrigramCollocationFinder.from_words(tokens)
    finder.apply_word_filter(is_filtered)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
def getTrigrams(self):
    """Return the 6 best trigram collocations of ``self.text``.

    The text is tokenised with ``nltk.word_tokenize`` and lower-cased.
    Trigrams containing an English stop word or a token shorter than 3
    characters are discarded; the rest are ranked with the
    likelihood-ratio measure.
    """
    english_stops = set(stopwords.words('english'))

    def is_filtered(token):
        return len(token) < 3 or token in english_stops

    lowered = [token.lower() for token in nltk.word_tokenize(self.text)]
    finder = TrigramCollocationFinder.from_words(lowered)
    finder.apply_word_filter(is_filtered)
    # A threshold of 1 keeps every trigram that occurs at all.
    finder.apply_freq_filter(1)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Return presence features for words and their best trigrams.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used to rank trigrams.
        n: Number of trigrams to keep.

    Returns:
        Dict mapping every word and every selected trigram to ``True``.
        If scoring fails, a message is printed and only the word features
        are returned.
    """
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        # Deliberate best effort, but no longer a bare `except:` that would
        # also swallow KeyboardInterrupt/SystemExit.
        # NOTE(review): narrow further once the failing case is identified.
        print("lost trigrams %s" % (words,))
        return dict.fromkeys(words, True)
    return dict.fromkeys(itertools.chain(words, trigrams), True)
def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
    """Create multi-word expressions by learning a corpus located in a corpus directory.

    Testing setting up mwes with custom path and setting it up twice (correct when no exception):

    >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
    >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
    >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
    >>> 'custom mwe' not in mwes
    True
    >>> 'custom mwe' in clusterer.mwes
    True

    Args:
        trigram_nbest(int): Number of highest ranked trigrams to acquire.
        bigram_nbest(int): Number of highest ranked bigrams to acquire.
    Returns:
        list: List of multi-word expressions learned in this call
            (trigrams first, then bigrams). ``self.mwes`` is replaced by
            the union of its previous content and this list.
    """
    if self.corpus is None:
        raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

    def is_plain(word):
        # Reject words containing digits or punctuation characters.
        for ch in word:
            if ch.isdigit() or ch in string.punctuation:
                return False
        return True

    # Text processing before bigrams and trigrams are calculated: keep
    # plain words only and lowercase them. (Named-entity chunking is not
    # used because it takes too much time.)
    words = [w.lower() for w in self.corpus.words() if is_plain(w)]

    trigram_finder = TrigramCollocationFinder.from_words(words)
    bigram_finder = BigramCollocationFinder.from_words(words)
    best_trigrams = trigram_finder.nbest(TrigramAssocMeasures().pmi,
                                         trigram_nbest)
    best_bigrams = bigram_finder.nbest(BigramAssocMeasures().pmi,
                                       bigram_nbest)
    mwes = best_trigrams + best_bigrams

    # Union of the existing and the new expressions. Existing entries that
    # are lists are converted to tuples first so that they are hashable.
    known = set()
    for mwe in self.mwes:
        known.add(tuple(mwe) if isinstance(mwe, list) else mwe)
    self.mwes = list(known | set(mwes))
    return mwes
def print_top_trig_collocs(word, pd_series, tokenizer, frac_corpus = 0.1, stopwords = gen_stop_words):
    """Print the frequent trigram collocations that contain ``word``.

    Args:
        word: Token that must be part of a trigram for it to be printed.
        pd_series: Object providing ``to_list()`` and ``len()``; each item
            is one document.
        tokenizer: Object whose ``tokenize`` method splits a document.
        frac_corpus: A trigram must occur at least
            ``round(frac_corpus * len(pd_series))`` times.
        stopwords: Not used by this function.

    Up to 100000 trigrams, ranked by likelihood ratio, are examined.
    """
    documents = [tokenizer.tokenize(text) for text in pd_series.to_list()]
    min_count = round(frac_corpus*len(pd_series))
    finder = TrigramCollocationFinder.from_documents(documents)
    finder.apply_freq_filter(min_count)
    for candidate in finder.nbest(trigram_measures.likelihood_ratio, 100000):
        if word not in candidate:
            continue
        print(candidate)
def collocations(data, col='text', n_gram='bigram'):
    """Return the 10 best collocations found in a text column.

    All values of ``data[col]`` are joined with spaces, lower-cased and
    split on whitespace; n-grams are ranked with the likelihood-ratio
    measure.

    Args:
        data: Mapping/frame whose ``data[col]`` provides ``tolist()``.
        col: Name of the text column.
        n_gram: ``'bigram'`` or ``'trigram'``.

    Returns:
        List of the 10 best n-gram tuples. (The result used to be computed
        and then dropped, so callers always received ``None``.)

    Raises:
        ValueError: If ``n_gram`` is neither ``'bigram'`` nor ``'trigram'``.
    """
    if n_gram not in ('bigram', 'trigram'):
        raise ValueError(
            "n_gram must be 'bigram' or 'trigram', got %r" % (n_gram,))

    fulltext = ' '.join(data[col].tolist()).lower()
    tokens = fulltext.split()
    if n_gram == 'bigram':
        collocation = BigramCollocationFinder.from_words(tokens)
        return collocation.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    collocation = TrigramCollocationFinder.from_words(tokens)
    return collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
def collocation_finder(self, n_gram_total, n_gram_filter_word):
    """Find the best trigrams of ``self.filtered_desc`` containing a word.

    Args:
        n_gram_total: Number of trigrams to keep.
        n_gram_filter_word: Only trigrams containing this token survive.

    Returns:
        The likelihood-ratio ranked trigrams, which are also stored in
        ``self.collocation_scores``.
    """
    tokens = word_tokenize(self.filtered_desc)
    finder = TrigramCollocationFinder.from_words(tokens)

    def lacks_target(*ngram_words):
        # Trigrams for which this returns True are removed by the finder.
        return n_gram_filter_word not in ngram_words

    finder.apply_ngram_filter(lacks_target)
    self.collocation_scores = finder.nbest(
        TrigramAssocMeasures.likelihood_ratio, n_gram_total)
    return self.collocation_scores
def get_trigrams(filelocation, ratio):
    '''Return the best trigram collocations of a webtext file.

    In addition to BigramCollocationFinder, there's also
    TrigramCollocationFinder, which finds triplets instead of pairs.

    Words are lower-cased. Trigrams containing an English stop word or a
    word shorter than 3 characters, and trigrams seen fewer than 3 times,
    are discarded. `ratio` is the number of trigrams returned, ranked by
    likelihood ratio.
    '''
    english_stops = set(stopwords.words('english'))

    def is_filtered(word):
        return len(word) < 3 or word in english_stops

    lowered = [word.lower() for word in webtext.words(filelocation)]
    finder = TrigramCollocationFinder.from_words(lowered)
    finder.apply_word_filter(is_filtered)
    finder.apply_freq_filter(3)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, ratio)
def trigrama(tokens):
    """Print the 30 best PMI-ranked trigrams of every token sequence.

    Args:
        tokens: Sequence of groups; each group is iterated and every item
            in it is handed to ``trigram_finder.from_words``.

    After each printout the function sleeps for three seconds so the
    output can be read before the next one appears.
    """
    trigram_medidas = trigram()
    # Direct iteration replaces the Python-2-only xrange(len(...)) loop.
    for group in tokens:
        for j in group:
            finder = trigram_finder.from_words(j)
            # A threshold of 1 keeps every trigram that occurs at all.
            finder.apply_freq_filter(1)
            # Call form prints identically on Python 2 and Python 3.
            print(finder.nbest(trigram_medidas.pmi, 30))
            # Wait three seconds before showing the next trigram list.
            time.sleep(3)
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Return presence features for words and their best trigrams.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used to rank trigrams.
        n: Number of trigrams to keep.

    Returns:
        Dict mapping every word and every selected trigram to ``True``.
    """
    finder = TrigramCollocationFinder.from_words(words)
    best = finder.nbest(score_fn, n)
    features = {}
    for ngram in itertools.chain(words, best):
        features[ngram] = True
    return features
def trigrams(words, max_trigrams=100):
    """Yield the most frequent trigrams that are not already in ``tg``.

    Args:
        words: Sequence of tokens.
        max_trigrams: Number of top raw-frequency trigrams to examine.

    Yields:
        Trigram tuples, most frequent first. A trigram whose lemmatised
        word list is found in the module-level ``tg`` is reported on
        stdout and skipped.

    Being a generator, nothing (including the banner) runs until the
    first item is requested.
    """
    # Call form with one argument prints identically on Python 2 and 3.
    print("Extracting trigrams")
    trigram_finder = TrigramCollocationFinder.from_words(words)
    scored = trigram_finder.score_ngrams(trigram_measures.raw_freq)
    for trigram, score in scored[:max_trigrams]:
        l_trigram = [lmtzr.lemmatize(p) for p in trigram]
        # NOTE(review): `l_trigram` is a list; this only matches if `tg`
        # stores lists -- confirm the element type of `tg`.
        if l_trigram in tg:
            print("Common trigram %s" % (trigram,))
            continue
        yield trigram
def find_collocations(text_series):
    """Score all bigrams and trigrams of a token series and pickle them.

    Args:
        text_series: Iterable of token lists; they are flattened into one
            token sequence.

    The likelihood-ratio scored n-grams are written to ``bigrams.pkl`` and
    ``trigrams.pkl`` in the current working directory. Nothing is returned.
    """
    flat_tokens = []
    for token_list in text_series:
        for token in token_list:
            flat_tokens.append(token)

    bigram_scorer = BigramAssocMeasures().likelihood_ratio
    trigram_scorer = TrigramAssocMeasures().likelihood_ratio
    bigram_finder = BigramCollocationFinder.from_words(flat_tokens)
    trigram_finder = TrigramCollocationFinder.from_words(flat_tokens)
    # Both score lists are computed before any file is touched.
    outputs = (
        ('bigrams.pkl', bigram_finder.score_ngrams(bigram_scorer)),
        ('trigrams.pkl', trigram_finder.score_ngrams(trigram_scorer)),
    )
    for path, scored in outputs:
        with open(path, 'wb') as fid:
            cPickle.dump(scored, fid)
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None): """collects bigrams and trigrams from collection of documents. Input to collocation tokenizer. bigrams are pairs of words that recur in the collection; trigrams are triplets. Parameters ---------- raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str)) body of documents to examine top_n : int limit results to this many entries min_length : int Minimum length of any single word min_freqs : iterable of int threshold of when to consider a pair of words as a recognized n-gram, starting with bigrams. stopwords : None or iterable of str Collection of words to ignore as tokens Examples -------- >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2]) >>> patterns[0].pattern u'(frank swank|swank tank|sassy unicorns)' >>> patterns[1].pattern u'(frank swank tank)' """ from nltk.collocations import TrigramCollocationFinder from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures # generator of documents, turn each element to its list of words doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords) for doc_id, doc_text in raw_corpus) # generator, concatenate (chain) all words into a single sequence, lazily words = itertools.chain.from_iterable(doc_texts) tcf = TrigramCollocationFinder.from_words(iter(words)) bcf = tcf.bigram_finder() bcf.apply_freq_filter(min_freqs[0]) bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)] tcf.apply_freq_filter(min_freqs[1]) trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)] bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE) trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE) return bigrams_patterns, trigrams_patterns
def best_ngrams(words, top_n=1000, min_freq=100):
    """Return regex patterns matching the most salient bigrams and trigrams.

    Trigrams are ranked by chi-square and bigrams by PMI; in both cases the
    frequency filter ``min_freq`` is applied first and at most ``top_n``
    n-grams are kept. The number of n-grams found and the first ten of each
    kind are logged at INFO level.

    Args:
        words: sequence of word tokens to mine for collocations.
        top_n: maximum number of bigrams and of trigrams to keep.
        min_freq: frequency threshold passed to the finders' frequency filter.

    Returns:
        A tuple ``(pat_gram2, pat_gram3)`` of compiled regular expressions;
        each is an alternation of the selected n-grams (words joined by a
        single space). The n-gram text is regex-escaped so it matches
        literally. If no n-gram is selected the pattern is ``()``, which
        matches the empty string.
    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info('%i trigrams found: %s...', len(trigrams), trigrams[:10])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info('%i bigrams found: %s...', len(bigrams), bigrams[:10])

    # Escape each phrase: a token containing regex metacharacters (e.g. "|",
    # "(", "+") would otherwise change or invalidate the alternation.
    pat_gram2 = re.compile('(%s)' % '|'.join(re.escape(b) for b in bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(re.escape(t) for t in trigrams), re.UNICODE)
    return pat_gram2, pat_gram3
def tri_collocations(tokens, num=20):
    """Return the ``num`` best trigram collocations by likelihood ratio.

    Args:
        tokens: iterable of sentences, each an iterable of word strings; the
            sentences are flattened into one word list.
        num: maximum number of trigrams to return.

    Returns:
        A list of word triples. The frequency filter is applied with a
        threshold of 3, and a trigram is rejected when its first or last word
        is shorter than 3 characters or is an English stop word
        (case-insensitive), or when the three words together have fewer than
        11 characters.
    """
    from nltk.corpus import stopwords

    # A frozenset makes the per-trigram membership tests O(1); the original
    # searched the stop-word list linearly for every candidate trigram.
    ignored_words = frozenset(stopwords.words('english'))
    word_list = [word for sent in tokens for word in sent]

    def is_uninformative(w1, w2, w3):
        return (len(w1) < 3
                or len(w3) < 3
                or (len(w1) + len(w2) + len(w3)) < 11
                or w1.lower() in ignored_words
                or w3.lower() in ignored_words)

    finder = TrigramCollocationFinder.from_words(word_list)
    finder.apply_freq_filter(3)
    finder.apply_ngram_filter(is_uninformative)

    trigram_measures = TrigramAssocMeasures()
    return finder.nbest(trigram_measures.likelihood_ratio, num)
def get_trigrams(sentences, freq_filter):
    '''
    Extract the trigrams of a corpus and return them as a sorted list.

    The sentences are joined with single spaces, lower-cased and tokenized
    with word_tokenize. The finder's frequency filter is applied with the
    threshold ``freq_filter``. The remaining trigrams are scored with PMI,
    but the scores are discarded: the returned list is ordered by plain
    sorting of the trigram tuples, not by score.
    '''
    corpus_text = " ".join(sentences).lower()
    finder = TrigramCollocationFinder.from_words(word_tokenize(corpus_text))

    # Drop rare trigrams before scoring.
    finder.apply_freq_filter(freq_filter)

    scored_trigrams = finder.score_ngrams(TrigramAssocMeasures().pmi)
    trigrams = [trigram for trigram, _score in scored_trigrams]
    trigrams.sort()
    return trigrams
def find_collocations(text_series):
    """Compute likelihood-ratio scores for bigrams and trigrams and save them.

    Every token of every token list in ``text_series`` is collected into a
    single sequence, from which NLTK bigram and trigram collocation finders
    are built. The scored n-grams are pickled to ``bigrams.pkl`` and
    ``trigrams.pkl`` in the current working directory. Nothing is returned.

    NOTE: the original carried a disabled pre-processing step
    (``text_series.map(custom_tokenizer)``) for using stemmed tokens; it is
    still not applied here.
    """
    # Flatten the per-entry token lists into one token stream.
    all_tokens = []
    for doc_tokens in text_series:
        for tok in doc_tokens:
            all_tokens.append(tok)

    bigram_scorer = BigramAssocMeasures().likelihood_ratio
    trigram_scorer = TrigramAssocMeasures().likelihood_ratio

    bigram_scores = BigramCollocationFinder.from_words(all_tokens).score_ngrams(bigram_scorer)
    trigram_scores = TrigramCollocationFinder.from_words(all_tokens).score_ngrams(trigram_scorer)

    # Persist both score lists.
    with open('bigrams.pkl', 'wb') as bigram_file:
        cPickle.dump(bigram_scores, bigram_file)
    with open('trigrams.pkl', 'wb') as trigram_file:
        cPickle.dump(trigram_scores, trigram_file)
def nGrams(string, corpus, number, clean=True):
    """Extract the top bigrams and trigrams of ``string`` and log them to CSV.

    The text is tokenized with ``WordPunctTokenizer``; tokens shorter than two
    characters or consisting only of digits are excluded through the finders'
    word filter. The ``number`` best bigrams and trigrams by likelihood ratio
    are each joined into space-separated phrases.

    One record is appended to ``db\\cyttron-keywords.csv`` in the form
    ``"<bigrams, comma-joined>";"<trigrams, comma-joined>"`` followed by a
    newline, and both phrase lists are printed.

    Args:
        string: text to analyse.
        corpus: unused; kept for interface compatibility.
        number: how many bigrams and how many trigrams to keep.
        clean: currently has no filtering effect (see note in the body).

    Returns:
        None.
    """
    words = WordPunctTokenizer().tokenize(string)
    # NOTE(review): the original had two identical branches for clean == True
    # and clean == False, both of which only lower-case the tokens; a value
    # equal to neither leaves the case untouched. That behaviour is preserved.
    # Presumably `clean` was meant to toggle stop-word removal -- confirm
    # before implementing.
    if clean in (True, False):
        words = [word.lower() for word in words]

    def is_noise(word):
        return len(word) < 2 or word.isdigit()

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(is_noise)
    biList = [" ".join(gram)
              for gram in bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)]

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(is_noise)
    triList = [" ".join(gram)
               for gram in tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)]

    # Same path as before ('\c' is not an escape sequence, so the backslash
    # was always literal); `with` guarantees the handle is closed on error.
    with open('db\\cyttron-keywords.csv', 'a') as csv_file:
        csv_file.write('"' + ','.join(biList) + '";')
        csv_file.write('"' + ','.join(triList) + '"\n')

    print(biList)
    print(triList)
def process(self, document):
    """Score every trigram of a document under several association measures.

    Args:
        document: mapping whose ``'tokens'`` entry holds the token sequence
            to analyse.

    Returns:
        A dict with two entries:

        * ``'trigram_rank'`` -- a ``defaultdict(list)`` mapping each trigram
          to the scores appended in the order the metrics were evaluated.
          Position ``i`` corresponds to ``metrics[i]`` only if every measure
          yields a score for that trigram -- TODO confirm for all measures.
        * ``'metrics'`` -- the list of measure names that were evaluated.
    """
    metrics = ['chi_sq', 'jaccard', 'likelihood_ratio', 'mi_like', 'pmi',
               'poisson_stirling', 'raw_freq', 'student_t']
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])

    # `list` as the factory (instead of a lambda) is the idiomatic form and
    # keeps the returned mapping picklable.
    trigram_rank = defaultdict(list)
    for metric in metrics:
        score_fn = getattr(trigram_measures, metric)
        for trigram, score in trigram_finder.score_ngrams(score_fn):
            trigram_rank[trigram].append(score)

    return {'trigram_rank': trigram_rank, 'metrics': metrics}
def calc_trigram_collocation_set(string, regexp, boolStem):
    """Return the strongest trigram collocations of ``string`` as joined tokens.

    Args:
        string: text to analyse.
        regexp: pattern handed to ``nltk.regexp_tokenize`` to split the text.
        boolStem: when truthy, the tokens are passed through
            ``Util.applyStem`` before collocations are computed.

    Returns:
        A list of strings, one per selected trigram, with the three words
        joined by ``~``. The best ``len(tokens) // 10`` trigrams by chi-square
        are selected.
    """
    tokens = nltk.regexp_tokenize(string, regexp)
    if boolStem:
        tokens = Util.applyStem(tokens)

    trigram_collocation_finder = TrigramCollocationFinder.from_words(tokens)
    # NOTE: frequency filters (5, then 2) were tried here in the original and
    # left disabled; no frequency filter is applied.

    # Floor division keeps the count an int on Python 3 as well; a float
    # count breaks the slice inside nbest().
    top_n = len(tokens) // 10
    trigrams = trigram_collocation_finder.nbest(TrigramAssocMeasures.chi_sq, top_n)

    return ['~'.join(trigram) for trigram in trigrams]
def find_collocations(words):
    """
    Find trigram and bigram collocations in text.

    Words shorter than three characters, or whose lower-cased form is in the
    module's ``_stopset``, are excluded through the finders' word filter.

    Args:
        words - an array of tokenized words.

    Returns:
        A list of space-joined collocation strings: first up to 10 trigrams,
        then up to 25 bigrams, each group ranked by raw frequency. The two
        groups are concatenated, not merged into one global ranking.
    """
    def ignore_words(w):
        return len(w) < 3 or w.lower() in _stopset

    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_word_filter(ignore_words)
    collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 10)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_word_filter(ignore_words)
    collocations += bigram_finder.nbest(BigramAssocMeasures.raw_freq, 25)

    # A comprehension returns a real list on both Python 2 and 3; map() is a
    # one-shot iterator on Python 3, contradicting the documented return type.
    return [' '.join(w) for w in collocations]
def extract_top_collocations(json_cleaned, return_top_n=10, use_trigrams=False):
    """Return the n-grams with the highest PMI across a collection of tweets.

    Args:
        json_cleaned: iterable of mappings, each with a ``"tweet"`` entry
            holding the tweet text.
        return_top_n: maximum number of n-grams to return.
        use_trigrams: when truthy, look for trigrams; otherwise bigrams.

    Returns:
        The list produced by the finder's ``nbest`` for the PMI measure,
        after applying the frequency filter with a threshold of 3.
    """
    # Pick the finder and its matching measures in one place.
    if use_trigrams:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder_class = TrigramCollocationFinder
    else:
        measures = nltk.collocations.BigramAssocMeasures()
        finder_class = BigramCollocationFinder

    # All tweets are joined by newlines and split into terms in one pass.
    tweet_texts = [item["tweet"] for item in json_cleaned]
    terms = tweet_as_terms("\n".join(tweet_texts))

    finder = finder_class.from_words(terms)
    finder.apply_freq_filter(3)
    return finder.nbest(measures.pmi, return_top_n)
def best_ngrams(words, top_n, min_freq):
    """
    This function has been extracted from a Europython 2014 tutorial about
    topic modelling given by Radim Rehurek and modified for this particular
    project.

    Extract `top_n` most salient collocations (bigrams and trigrams), from a
    stream of words. The frequency filter `min_freq` is applied to both
    finders before ranking. Trigrams are ranked by chi-square, bigrams by PMI.

    This fnc uses NLTK for the collocation detection itself -- not very
    scalable!

    Side effects: the selected bigrams and trigrams are written, one per
    line, to `bigrams.txt` and `trigrams.txt` in the current working
    directory (overwriting them), and a summary is logged at INFO level.

    Return the detected ngrams as two compiled regular expressions
    `(pat_gram2, pat_gram3)`, for their faster detection later on. The n-gram
    text is regex-escaped in the patterns so it matches literally; the text
    written to the files is not escaped.
    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Write collocations to two files to be read by the preprocess program.
    # `with` closes the handles even if a write fails.
    with open('bigrams.txt', 'w') as f1:
        f1.writelines(["{0}\n".format(item) for item in bigrams])
    with open('trigrams.txt', 'w') as f2:
        f2.writelines(["{0}\n".format(item) for item in trigrams])

    # Escape each phrase so regex metacharacters inside a token cannot alter
    # or invalidate the alternation.
    pat_gram2 = re.compile('(%s)' % '|'.join(re.escape(b) for b in bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(re.escape(t) for t in trigrams), re.UNICODE)
    return pat_gram2, pat_gram3
def TriGram(self, text):
    '''
    @param text: iterable of sentences, each an iterable of items whose first
    element is the word (described by the original author as POS tagged text).

    Uses the NLTK collocation finder to pick trigram collocations: words
    rejected by self.filter_stop are excluded, the frequency filter is applied
    with a threshold of 1, and the 20 best trigrams by chi-square are kept.

    @return: A list with the result of self.pos.POSTag(trigram, s=True) for
    each selected trigram.
    '''
    words = [item[0] for sentence in text for item in sentence]

    finder = TrigramCollocationFinder.from_words(words)
    finder.apply_word_filter(self.filter_stop)
    finder.apply_freq_filter(1)
    best_trigrams = finder.nbest(TrigramAssocMeasures.chi_sq, 20)

    return [self.pos.POSTag(trigram, s=True) for trigram in best_trigrams]