def create_trigram_finder(tokenized_docs, should_filter=False):
    """Build a ``TrigramCollocationFinder`` from tokenized documents.

    Args:
        tokenized_docs: Iterable of documents handed to
            ``TrigramCollocationFinder.from_documents``.
        should_filter: When truthy, each document is first passed through
            ``trigram_prep``; otherwise the documents are used unchanged.

    Returns:
        The finder built from the (optionally pre-processed) documents.
    """
    documents = (
        [trigram_prep(doc) for doc in tokenized_docs]
        if should_filter
        else tokenized_docs
    )
    return TrigramCollocationFinder.from_documents(documents)
def get_top_trigrams(corpus, top_n=100):
    """Return the ``top_n`` best trigrams of ``corpus`` scored by raw frequency.

    Args:
        corpus: Iterable of strings; each one is split on whitespace to
            form one document.
        top_n: Number of trigrams requested from ``nbest``.

    Returns:
        The result of ``finder.nbest`` using the ``raw_freq`` measure.
    """
    tokenized_docs = [item.split() for item in corpus]
    finder = TrigramCollocationFinder.from_documents(tokenized_docs)
    scorer = TrigramAssocMeasures().raw_freq
    return finder.nbest(scorer, top_n)
def print_top_trig_collocs(word, pd_series, tokenizer, frac_corpus=0.1,
                           stopwords=gen_stop_words):
    """Print every high-ranking trigram collocation that contains ``word``.

    Args:
        word: Token that must appear in a trigram for it to be printed.
        pd_series: Object exposing ``to_list()`` and ``len()``; each element
            is tokenized into one document.
        tokenizer: Object exposing ``tokenize(text)``.
        frac_corpus: Multiplied by ``len(pd_series)`` and rounded to obtain
            the value passed to ``apply_freq_filter``.
        stopwords: Accepted for interface compatibility; not used in the body.

    Returns:
        None. Matching trigrams are written to stdout.
    """
    documents = [tokenizer.tokenize(text) for text in pd_series.to_list()]
    finder = TrigramCollocationFinder.from_documents(documents)

    min_frequency = round(frac_corpus * len(pd_series))
    finder.apply_freq_filter(min_frequency)

    # Relies on the module-level ``trigram_measures`` object.
    ranked = finder.nbest(trigram_measures.likelihood_ratio, 100000)
    for candidate in ranked:
        if word not in candidate:
            continue
        print(candidate)
def retrieve_top_trigrams_collocations(corpus, top=5, measure='pmi'):
    """Return the ``top`` best trigram collocations of ``corpus``.

    Args:
        corpus: Iterable of strings; each one is split on whitespace to
            form one document.
        top: Number of trigrams requested from ``nbest``.
        measure: ``'pmi'`` to score with ``TrigramAssocMeasures.pmi`` or
            ``'frequency'`` to score with ``raw_freq``.

    Returns:
        The result of ``finder.nbest`` for the selected measure.

    Raises:
        ValueError: If ``measure`` is neither ``'pmi'`` nor ``'frequency'``.
    """
    tokenized_docs = [item.split() for item in corpus]
    finder = TrigramCollocationFinder.from_documents(tokenized_docs)
    measures = TrigramAssocMeasures()

    # Pick the scoring function first, then rank once.
    if measure == 'pmi':
        scorer = measures.pmi
    elif measure == 'frequency':
        scorer = measures.raw_freq
    else:
        raise ValueError('Type of measure is unknown!')

    return finder.nbest(scorer, top)
def compute_ngrams_count(text_corpus, out_p, n=20):
    """Plot the ``n`` most common uni-, bi-, tri- and quadgrams of a corpus.

    Args:
        text_corpus: Iterable of documents, each an iterable of sentence
            strings; every sentence is tokenized with ``word_tokenize``.
        out_p: Forwarded unchanged to ``util.plot_bar_chart_grid``.
        n: Number of most common n-grams kept for each n-gram order.

    Returns:
        None. The result is the chart produced by
        ``util.plot_bar_chart_grid``.
    """
    print("Compute ngrams count...")
    list_of_tokens = [
        word_tokenize(sentence)
        for document in text_corpus
        for sentence in document
    ]

    # Unigram
    custom_sw = [".", "[", "]", ","]
    # A set makes each membership test O(1); with a list every token was
    # compared against the whole stop-word list.
    sw = set(stopwords.words("english") + custom_sw)
    tokens = [w for w in util.flatten_one_level(list_of_tokens) if w not in sw]
    uni_mc = FreqDist(tokens).most_common(n)

    # Bigram
    bi = BigramCollocationFinder.from_documents(list_of_tokens)
    bi_mc = bi.ngram_fd.most_common(n)

    # Trigram
    tri = TrigramCollocationFinder.from_documents(list_of_tokens)
    tri_mc = tri.ngram_fd.most_common(n)

    # Quadgram
    quad = QuadgramCollocationFinder.from_documents(list_of_tokens)
    quad_mc = quad.ngram_fd.most_common(n)

    # Plot
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for idx, most_common in enumerate(data):
        # Unigram keys are plain tokens; higher orders are token tuples
        # that are joined into one label.
        labels = [key if idx == 0 else " ".join(key) for key, _ in most_common]
        counts = [count for _, count in most_common]
        # Both series are reversed, so the most frequent entry comes last.
        x.append(labels[::-1])
        y.append(counts[::-1])

    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p,
                             sup_title_font_size=16, tick_font_size=14,
                             title_font_size=14, h_size=5, w_size=5,
                             rotate=True)
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10) from nltk.collocations import BigramCollocationFinder from nltk.collocations import BigramAssocMeasures finder = BigramCollocationFinder.from_documents( [item.split() for item in norm_alice]) bigram_measures = BigramAssocMeasures() finder.nbest(bigram_measures.raw_freq, 10) finder.nbest(bigram_measures.pmi, 10) from nltk.collocations import TrigramCollocationFinder from nltk.collocations import TrigramAssocMeasures finder = TrigramCollocationFinder.from_documents( [item.split() for item in norm_alice]) trigram_measures = TrigramAssocMeasures() finder.nbest(trigram_measures.raw_freq, 10) finder.nbest(trigram_measures.pmi, 10) toy_text = """ Elephants are large mammals of the family Elephantidae and the order Proboscidea. Two species are traditionally recognised, the African elephant and the Asian elephant. Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male African elephants are the largest extant terrestrial animals. All elephants have a long trunk used for many purposes, particularly breathing, lifting water and grasping objects. Their incisors grow into tusks, which can serve as weapons and as tools for moving objects and digging. Elephants' large ear flaps help to control their body temperature. Their pillar-like legs can
def find_trigrams(sentences, n_ngrams):
    """Return the ``n_ngrams`` best trigrams ranked by likelihood ratio.

    Args:
        sentences: Documents passed to
            ``TrigramCollocationFinder.from_documents``.
        n_ngrams: Number of trigrams requested from ``nbest``.

    Returns:
        The result of ``nbest`` using
        ``TrigramAssocMeasures.likelihood_ratio`` as the scoring function.
    """
    finder = TrigramCollocationFinder.from_documents(sentences)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, n_ngrams)
corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

# Whitespace-tokenize every document once; both finders use the same input.
tokenized_corpus = [item.split() for item in corpus]

finder = BigramCollocationFinder.from_documents(tokenized_corpus)
bigram_measures = BigramAssocMeasures()
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(tokenized_corpus)
trigram_measures = TrigramAssocMeasures()
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print(get_top_ngrams(corpus, ngram_val=2, limit=10))

from feature_extractors import build_feature_matrix
import networkx
import numpy as np
import matplotlib
from normalization import normalize_corpus

norm = normalize_corpus(corpus)
# construct weighted document term matrix
vec, dt_matrix = build_feature_matrix(norm, feature_type='tfidf')
from nltk.collocations import BigramCollocationFinder from nltk.collocations import BigramAssocMeasures finder = BigramCollocationFinder.from_documents([item.split() for item in norm_alice]) bigram_measures = BigramAssocMeasures() finder.nbest(bigram_measures.raw_freq, 10) finder.nbest(bigram_measures.pmi, 10) from nltk.collocations import TrigramCollocationFinder from nltk.collocations import TrigramAssocMeasures finder = TrigramCollocationFinder.from_documents([item.split() for item in norm_alice]) trigram_measures = TrigramAssocMeasures() finder.nbest(trigram_measures.raw_freq, 10) finder.nbest(trigram_measures.pmi, 10) toy_text = """ Elephants are large mammals of the family Elephantidae and the order Proboscidea. Two species are traditionally recognised, the African elephant and the Asian elephant. Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male African elephants are the largest extant terrestrial animals. All elephants have a long trunk used for many purposes, particularly breathing, lifting water and grasping objects. Their incisors grow into tusks, which can serve as weapons and as tools
def process(text: str, num_1_grams: int = 100, num_2_grams: int = 100,
            num_3_grams: int = 100, num_4_grams: int = 100,
            min_chars: int = 3, max_chars: int = 30):
    """Extract keyword n-grams (n = 1..4) from ``text``.

    Args:
        text: Raw text; it is split into sentences by ``get_sentences``.
        num_1_grams: Passed to ``df_top`` as ``num`` for 1-grams; a falsy
            value skips 1-gram extraction.
        num_2_grams: Same, for 2-grams.
        num_3_grams: Same, for 3-grams.
        num_4_grams: Same, for 4-grams.
        min_chars: Forwarded to ``df_top`` as ``min_char``.
        max_chars: Forwarded to ``df_top`` as ``max_char``.

    Returns:
        A list of four lists, indexed by n - 1. Each holds the ``'entry'``
        column of the matching ``df_top`` result, or stays empty when that
        order was skipped or ``df_top`` returned ``None``.

    Raises:
        AssertionError: With message ``'Not enough words'`` when no sentence
            is left after de-duplication.
    """
    def top_entries(scored, limit, token_filter):
        # Rank scored n-grams through df_top and return the 'entry' column.
        frame = df_top(tuples=scored, num=limit, token_filter=token_filter,
                       min_char=min_chars, max_char=max_chars)
        return [] if frame is None else frame['entry'].tolist()

    # Keep only sentences that occur exactly once in the text.
    all_sentences = get_sentences(text)
    occurrences = collections.Counter(all_sentences)
    unique_sentences = [s for s in all_sentences if occurrences[s] == 1]

    # Tokenize, drop non-alphabetic tokens and lowercase the rest.
    tokenized = [
        [token.lower() for token in RE_TOKEN.split(sentence) if is_alpha(token)]
        for sentence in unique_sentences
    ]

    # Two views of the sentences:
    # a. 1-grams use only tokens accepted by filter_token, with RE_POSS
    #    matches removed.
    # b. Longer n-grams use every token, so that stopwords and short tokens
    #    can still add information next to other words.
    sents_a = [
        [RE_POSS.sub('', token) for token in sentence if filter_token(token)]
        for sentence in tokenized
    ]
    sents_b = tokenized
    assert sents_a and sents_b, 'Not enough words'

    counter = collections.Counter(
        token for sentence in sents_a for token in sentence
    )

    res = [[], [], [], []]

    if num_1_grams:
        # One-element tuples keep 1-grams in the same shape as n-grams
        # with n > 1.
        unigram_scores = [((token, ), count) for token, count in counter.items()]
        res[0] = top_entries(unigram_scores, num_1_grams, filter_1_grams)

    if num_2_grams:
        bigram_finder = BigramCollocationFinder.from_documents(sents_b)
        res[1] = top_entries(
            bigram_finder.score_ngrams(BigramAssocMeasures.raw_freq),
            num_2_grams, filter_2_grams)

    if num_3_grams:
        trigram_finder = TrigramCollocationFinder.from_documents(sents_b)
        res[2] = top_entries(
            trigram_finder.score_ngrams(TrigramAssocMeasures.raw_freq),
            num_3_grams, filter_3_grams)

    if num_4_grams:
        quadgram_finder = QuadgramCollocationFinder.from_documents(sents_b)
        res[3] = top_entries(
            quadgram_finder.score_ngrams(QuadgramAssocMeasures.raw_freq),
            num_4_grams, filter_4_grams)

    return res