def _chiSq(self, depGraphList, word1, word2):
    """Score how strongly two words are associated using NLTK's chi-square.

    Only ``depGraphList[0]`` is inspected. Each of its entries is indexed at
    positions 0 and 1, and those two values are compared with the words.

    Args:
        depGraphList: sequence whose first item is the list of entries.
        word1: first word of the pair.
        word2: second word of the pair.

    Returns:
        The chi-square score as a non-negative float.
    """
    measures = BigramAssocMeasures()
    edges = depGraphList[0]
    total = len(edges)

    def mentions(word, edge):
        return word == edge[0] or word == edge[1]

    # n11: entries that mention both words.
    n11 = sum(
        1 for edge in edges
        if mentions(word1, edge) and mentions(word2, edge)
    )
    # n12 / n21: entries that mention each word, minus one.
    # NOTE(review): the "- 1" presumably assumes the pair co-occurs exactly
    # once -- confirm against the caller.
    n12 = sum(1 for edge in edges if mentions(word1, edge)) - 1
    n21 = sum(1 for edge in edges if mentions(word2, edge)) - 1

    # Marginal totals handed to NLTK: (n11 + n21, n11 + n12).
    marginals = (n11 + n21, n11 + n12)

    # X^2 = [N(n11 * n22 - n12 * n21)^2] / [n1. * n2. * n.1 * n.2]
    x2 = float(measures.chi_sq(n11, marginals, total))
    return -x2 if x2 < 0 else x2
def generate_least_frequent_words_wordcloud(title, text_content):
    """Save a word cloud of the least frequent bigrams above a minimum score.

    Bigrams of ``text_content`` are scored by raw frequency and sorted in
    ascending order. Leading entries scoring below the fixed threshold are
    skipped, and up to ``WC_max_words`` of the entries that follow are drawn.
    Nothing is drawn or written when no bigram qualifies.

    Args:
        title: figure title; its slug also names the output PNG.
        text_content: sequence of tokens to collect bigrams from.

    Relies on ``WC_max_words``, ``WC_height``, ``WC_width``, ``slugify``,
    ``plt`` and ``WordCloud`` being available at module level.
    """
    min_score = 0.000265

    finder = BigramCollocationFinder.from_words(text_content)
    scored = finder.score_ngrams(BigramAssocMeasures().raw_freq)
    ascending = sorted(scored, key=itemgetter(1))

    # The previous loops were bounded by len(...) - 1, which silently dropped
    # the final (highest-scoring) bigram; the full list is considered here.
    start = 0
    while start < len(ascending) and ascending[start][1] < min_score:
        start += 1

    frequencies = {
        '_'.join(ngram): score
        for ngram, score in ascending[start:start + WC_max_words]
    }

    wordCloud = WordCloud(max_words=WC_max_words, height=WC_height,
                          width=WC_width)
    if frequencies:
        slug = slugify(title)
        wordCloud.generate_from_frequencies(frequencies)
        plt.title(slug)
        plt.imshow(wordCloud, interpolation='bilinear')
        plt.axis("off")
        wordCloud.to_file("bigram/least_frequent_words/" + slug + ".png")
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find, print and return the top PMI collocations of order ``n``.

    Args:
        words: token sequence the collocation finder is built from.
        sents: sentences passed to ``NgramCollocationExtender`` when n >= 4.
        n: n-gram order. 2 uses bigrams and 3 uses trigrams. 4 or more starts
            from trigrams and passes them through the extender.
        support: minimum frequency an n-gram needs to be kept.
        topK: number of best-scoring n-grams to take.

    Returns:
        The list of n-grams that was printed.

    Raises:
        ValueError: if ``n`` is not 2, 3 or a value >= 4. Previously this
            failed with an unbound-local error on ``finder``.
    """
    if n < 4 and n not in (2, 3):
        raise ValueError(
            'n must be 2, 3 or at least 4, got {!r}'.format(n))

    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    else:
        # Orders 3 and above all start from trigram candidates.
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()

    finder.apply_freq_filter(support)
    # The collocation measure currently in use is PMI.
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    if n >= 4:
        ext_ngrams = NgramCollocationExtender(
            pmi_ngrams, sents, support / 3, 0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    print_ngrams(pmi_ngrams)
    return pmi_ngrams
def bigrams(corpus):
    """Return the five highest-PMI bigrams that occur at least three times.

    Args:
        corpus: sequence of tokens.
    """
    collocation_finder = BigramCollocationFinder.from_words(corpus)
    # Drop bigrams seen fewer than three times before ranking.
    collocation_finder.apply_freq_filter(3)
    return collocation_finder.nbest(BigramAssocMeasures().pmi, 5)
def bigram_cloud(toks):
    """Show a word cloud of the bigrams in ``toks`` sized by raw frequency.

    Each bigram is labelled by joining its two words with an underscore.

    Args:
        toks: sequence of tokens.
    """
    measures = BigramAssocMeasures()
    ranked = sorted(
        BigramCollocationFinder.from_words(toks).score_ngrams(
            measures.raw_freq),
        key=itemgetter(1),
        reverse=True,
    )
    frequencies = {'_'.join(pair): freq for pair, freq in ranked}

    cloud = WordCloud(max_words=100, height=500, width=1000)
    cloud.generate_from_frequencies(frequencies)

    plt.title('Most frequently occurring bigrams connected with an underscore_')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def N_collocations_in_text(text, N, min_freq):
    """Return the ``N`` highest-PMI bigrams seen at least ``min_freq`` times.

    Tokens are lower-cased before the bigrams are collected.

    Args:
        text: sequence of string tokens.
        N: number of collocations to return.
        min_freq: minimum number of occurrences for a bigram to be kept.
    """
    lowered_tokens = [token.lower() for token in text]
    collocation_finder = BigramCollocationFinder.from_words(lowered_tokens)
    collocation_finder.apply_freq_filter(min_freq)
    pmi = BigramAssocMeasures().pmi
    return collocation_finder.nbest(pmi, N)
def bigram_collocation_finder_with_log_likelihood_ratio(tokens, window_size=2):
    '''Return bigram collocations scored with the log-likelihood ratio.

    Accepts either a flat list of tokens or a list of sentences that are
    themselves lists of tokens; the shape is detected from the first element.

    Parameters
    -----------
    tokens: a list of tokens or list of sentences that are list of tokens
    window_size: the window size of the collocation, by default 2

    Returns
    -------
    bigram_collocations: list of (bigram, likelihood-ratio score) tuples as
        produced by ``score_ngrams``; empty when ``tokens`` is empty
    '''
    bigram_measures = BigramAssocMeasures()
    # Check the length before peeking at tokens[0] so that an empty input
    # produces an empty result instead of an IndexError.
    if len(tokens) > 0 and isinstance(tokens[0], list):
        # Sentences are padded and chained so that no bigram spans a sentence
        # boundary; this replaces plain from_documents(tokens) so that
        # window_size is honoured.
        padded_words = BigramCollocationFinder._build_new_documents(
            tokens, window_size, pad_right=True)
        finder = BigramCollocationFinder.from_words(
            padded_words, window_size=window_size)
    else:
        finder = BigramCollocationFinder.from_words(
            tokens, window_size=window_size)
    return finder.score_ngrams(bigram_measures.likelihood_ratio)
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Write the top PMI collocations of a text file as tab-separated lines.

    Args:
        inp: path of the input text file.
        outp: path of the output file; one collocation per line, with its
            words separated by tabs.
        freq_filter: minimum n-gram frequency (converted with ``int``).
        results: number of collocations to write (converted with ``int``).
        coll_type: ``'bigram'`` for bigrams; any other value selects trigrams.
        pos: the string ``'true'`` when the input is space-separated
            ``word/TAG`` tokens; any other value means plain text that is
            sentence- and word-tokenised with NLTK.
    """
    pos = bool(pos == 'true')
    with open(inp, 'r') as fd:
        raw = fd.read()

    if pos:
        # Drop the trailing fragment after the last space, then keep only the
        # part of each token before its first '/'.
        tagged = raw.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in tagged]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        all_words = []
        for sent in nltk.sent_tokenize(raw):
            all_words += nltk.word_tokenize(sent)

    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)

    finder.apply_freq_filter(int(freq_filter))
    # Score the n-grams and keep the first N.
    colls = finder.score_ngrams(measures.pmi)[:int(results)]

    with open(outp, 'w') as output:
        for ngram, _score in colls:
            # Join all words of the n-gram: unpacking into exactly two names
            # raised ValueError for every trigram.
            output.write("\t".join(ngram) + "\n")
def cal(finder, _TOP_NUM, total_bigrams, annotations):
    """Print the accuracy of the top likelihood-ratio bigrams.

    The system output is the ``_TOP_NUM`` best bigrams of ``finder``. A true
    positive is a system bigram found in ``annotations``. A true negative is
    a bigram of ``total_bigrams`` that is neither annotated nor selected by
    the system. The value printed is
    ``(TP + TN) / (_TOP_NUM + number of unselected bigrams)``.

    Args:
        finder: collocation finder offering ``nbest``.
        _TOP_NUM: number of system bigrams to take.
        total_bigrams: iterable of ``(word1, word2)`` pairs under evaluation.
        annotations: container of gold-standard bigram strings.
    """
    bigram_measures = BigramAssocMeasures()
    system_bigrams = finder.nbest(bigram_measures.likelihood_ratio, _TOP_NUM)
    selected = set(system_bigrams)

    # NOTE(review): negatives are looked up in `annotations` with the two
    # words concatenated directly, positives with a space between them. Both
    # forms are kept as they were -- confirm which key format is correct.
    truth_no_related = [
        x + '' + y for x, y in total_bigrams if x + '' + y not in annotations
    ]

    # nbest() yields tuples, so membership has to be tested with the tuple.
    # Testing the joined string never matched and marked every bigram as
    # "not selected".
    system_no_related = [
        x + '' + y for x, y in total_bigrams if (x, y) not in selected
    ]

    rejected = set(system_no_related)
    TN = sum(1 for w in truth_no_related if w in rejected)
    TP = sum(1 for x, y in system_bigrams if x + ' ' + y in annotations)

    print('%s' % ((TP + TN) / (_TOP_NUM + len(system_no_related))))
def test():
    """Print the 50 best likelihood-ratio bigrams of a sample abstract.

    The sample text is run through ``preprocessing`` and each resulting bigram
    is printed as two space-separated words.
    """
    text = """ LTE single-card dual-standby multi-mode terminal and method for processing concurrency of its CS service and PS service The present invention is applicable to the field of communications technologies, and provides an method, the method includes: when a CS service and PS service of a local LTE single-card dual-standby multi-mode terminal are concurrent, detecting, by a local LTE single-card dual-standby multi-mode terminal, whether a peer communication terminal that is performing voice communication with it is in a voice silent period; when detecting that the peer communication terminal is not in the voice silent period, receiving, by the local LTE single-card dual-standby multi-mode terminal, downlink data in an LTE system, and suspending, by the local LTE single-card dual-standby multi-mode terminal, sending of uplink data in the LTE system at the same time; and when detecting that the peer communication terminal is in the voice silent period, sending the uplink data and receiving the downlink data, by the local LTE single-card dual-standby multi-mode terminal, in the LTE system. """
    measures = BigramAssocMeasures()
    # No frequency filter is applied: every bigram of the sample competes.
    collocation_finder = BigramCollocationFinder.from_words(
        preprocessing(text))
    top_pairs = collocation_finder.nbest(measures.likelihood_ratio, 50)
    for first, second in top_pairs:
        print(first + ' ' + second)
def get_keyword_collocations(corpus, keyword, windowsize=10, numresults=10):
    '''Print the top collocates of ``keyword`` found in ``corpus``.

    The corpus string is tokenised, bigrams are collected within a sliding
    window, and only pairs containing the keyword are kept. Words shorter
    than two characters and English stop words are ignored. The remaining
    pairs are ranked by Student's t score and the partner word of each of the
    best pairs is printed on one line.

    Args:
        corpus: text to search, as a single string.
        keyword: the word whose collocates are wanted.
        windowsize: window size used when pairing words.
        numresults: number of collocations to print.
    '''
    # convert the corpus (a string) into a list of words
    tokens = word_tokenize(corpus)
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)

    # A set gives constant-time stop-word checks; the filter runs per word.
    ignored_words = set(stopwords.words('english'))
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    # discard every pair that does not contain the keyword
    finder.apply_ngram_filter(lambda *w: keyword not in w)

    # Other available measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq,
    # .phi_sq, .fisher, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)

    print("Top collocations for ", str(keyword), ":")
    # For each pair keep the word that is not the keyword.
    partners = [v if k == keyword else k for k, v in results]
    collocations = ''.join(word + ' ' for word in partners)
    print(collocations, '\n')
def nlp_process(text):
    """Print word-frequency and bigram-association statistics for ``text``.

    The text is split into ``\\w+`` tokens, lower-cased and stripped of
    English stop words. The function then prints the single most common word,
    the ten most common words, and the ten highest-PMI bigrams that occur at
    least three times. See http://www.nltk.org/howto/collocations.html

    Args:
        text: the raw text to analyse.
    """
    # Tokenize the string, dropping punctuation, and lower-case every token.
    tokenizer = RegexpTokenizer(r'\w+')
    lowered = [token.lower() for token in tokenizer.tokenize(text)]

    stop_words = set(stopwords.words('english'))
    stop_removed = [word for word in lowered if word not in stop_words]

    if not stop_removed:
        # most_common(1)[0] below would raise IndexError on an empty sample.
        print('No words left in the sampled text after stop-word removal.')
        return

    # Count the frequency for words
    fdist1 = FreqDist(stop_removed)
    top_word, top_freq = fdist1.most_common(1)[0]
    print("the most common word '{top_word}' occours {top_freq} times in the sampled text.".format(top_word = top_word, top_freq = top_freq))
    print(f'The most common 10 words are: {fdist1.most_common(10)}')

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(stop_removed)
    finder.apply_freq_filter(3)
    # Reuse the prepared set: reloading the stop-word list inside the filter
    # repeated that work for every word tested.
    finder.apply_word_filter(lambda x: x in stop_words)
    print(f'The most correlated words are: {finder.nbest(bigram_measures.pmi, 10)}')
def get_collocation(keywords, source):
    """
    Score the words co-occurring with ``keywords`` by likelihood ratio.

    A pickled finder is loaded from ``finder_<source>_trimmed.sav`` and passed
    to ``bidirection_score_ngrams`` together with a filter that rejects pairs
    whose first word does not contain ``keywords`` or whose second word is
    shorter than two characters.

    Returns a dataframe with columns ``Collocation``, ``Score``, ``Keyword``
    and ``Source``, de-duplicated on ``Collocation`` and limited to the first
    80 rows. When scoring yields nothing usable, a single row of NaN values
    is returned instead.
    """
    bgm = BigramAssocMeasures()

    def word_filter(w1, w2):
        return keywords not in w1 or len(w2) < 2

    filename = f"finder_{source}_trimmed.sav"
    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # finder files that this project produced itself.
    with open(filename, 'rb') as handle:
        finder = pickle.load(handle)

    try:
        scorelist = bidirection_score_ngrams(
            finder, bgm.likelihood_ratio, word_filter)
        word_pairs, scores = zip(*scorelist)
        key, asso = zip(*word_pairs)
        df = pd.DataFrame(np.array([asso, scores]).transpose(),
                          columns=['Collocation', 'Score'])
    except Exception:
        # For cases where no word collocations are found. np.nan is used
        # because the np.NaN alias no longer exists in NumPy 2.
        df = pd.DataFrame(np.array([[np.nan, np.nan]]),
                          columns=['Collocation', 'Score'])

    df['Keyword'] = keywords
    df['Source'] = source
    return df.drop_duplicates('Collocation').reset_index(drop=True)[:80]
def jieba_feature(number):
    """Select the ``number`` most informative words as boolean features.

    Word lists are read from ``./data/pos_cut.pkl`` and
    ``./data/neg_cut.pkl``. Each word is scored by the sum of its chi-square
    statistic against the positive words and against the negative words, and
    the ``number`` best-scoring words are kept.

    Args:
        number: feature dimension, i.e. how many words to keep; a value to
            tune until results are best.

    Returns:
        dict mapping each selected word to ``True``.
    """
    def load_words(path):
        # Flatten the pickled collection of word collections into one list;
        # the with-block closes the file that was previously left open.
        with open(path, 'rb') as handle:
            return [item for items in pickle.load(handle) for item in items]

    pos_words = load_words('./data/pos_cut.pkl')
    neg_words = load_words('./data/neg_cut.pkl')

    word_fd = FreqDist()  # frequency of every word
    # word frequencies within the positive and within the negative texts
    cond_word_fd = ConditionalFreqDist()
    for word in pos_words:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg_words:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive words
    neg_word_count = cond_word_fd['neg'].N()  # number of negative words
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}  # each word with its information score
    for word, freq in word_fd.items():
        # Chi-square statistic for the positive side; mutual information or
        # another statistic could be used here instead.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)
        # Likewise for the negative side.
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count),
            total_word_count)
        # A word's score is the sum of both chi-square statistics.
        word_scores[word] = pos_score + neg_score

    # Sort by score, highest first, and keep the top `number` words.
    best_vals = sorted(
        word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = {w for w, s in best_vals}
    return {word: True for word in best_words}
def extract_bigrams(self, sent):
    """Return a feature dict of the known bigrams that occur in ``sent``.

    The sentence is preprocessed with ``self._preprocess_sent``. Up to 10000
    of its best PMI bigrams are joined with a space and intersected with
    ``self._bigrams_set``.

    Returns:
        dict mapping each matching bigram string to ``True``.
    """
    tokens = self._preprocess_sent(sent)
    measures = BigramAssocMeasures()
    collocation_finder = BigramCollocationFinder.from_words(tokens)
    candidates = {
        ' '.join(pair)
        for pair in collocation_finder.nbest(measures.pmi, 10000)
    }
    known = candidates & self._bigrams_set
    return dict.fromkeys(known, True)
def get_top_bigrams(text, n):
    """Return the ``n`` most frequent bigrams of ``text`` as strings.

    The text is tokenised with ``clean_and_tokenize_text``. Bigrams seen fewer
    than twice are ignored, and each result is its two words joined by a
    space.
    """
    measures = BigramAssocMeasures()
    tokens = clean_and_tokenize_text(text)
    collocation_finder = BigramCollocationFinder.from_words(tokens)
    collocation_finder.apply_freq_filter(2)

    phrases = []
    for pair in collocation_finder.nbest(measures.raw_freq, n):
        phrases.append(' '.join(pair))
    return phrases
def collocs(text):
    """Print the 15 highest-PMI bigrams that occur at least twice.

    Args:
        text: iterable of strings; they are joined with spaces and tokenised
            as a single document.
    """
    joined = " ".join(text)
    measures = BigramAssocMeasures()
    collocation_finder = BigramCollocationFinder.from_documents(
        [nltk.word_tokenize(joined)])
    collocation_finder.apply_freq_filter(2)
    for pair in collocation_finder.nbest(measures.pmi, 15):
        print(pair)
def get_top_bigrams(corpus, top_n=100):
    '''
    Return the ``top_n`` most frequent bigrams of a corpus.

    Each item of ``corpus`` is split on whitespace and treated as one
    document.
    '''
    tokenized_docs = [document.split() for document in corpus]
    collocation_finder = BigramCollocationFinder.from_documents(tokenized_docs)
    raw_frequency = BigramAssocMeasures().raw_freq
    return collocation_finder.nbest(raw_frequency, top_n)
def get_collocations(tokens, n_collocations=None):
    """Return bigrams of ``tokens`` in sorted order, limited in number.

    Args:
        tokens: sequence of tokens.
        n_collocations: maximum number of bigrams to return. ``None`` keeps
            the historical limit of 100. The argument used to be ignored.

    Returns:
        list of bigram tuples, sorted by the tuples themselves and truncated
        to the limit.
    """
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

    limit = 100 if n_collocations is None else n_collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    # NOTE(review): bigrams are ordered by their words, not by score, before
    # truncation -- confirm that is the intended selection.
    return sorted(bigram for bigram, _score in scored)[:limit]
def collocations(words):
    """Count the ten best likelihood-ratio bigrams of ``words``.

    Returns:
        A ``defaultdict(int)`` (not a plain dict) keyed by bigram tuple.
    """
    collocation_finder = BigramCollocationFinder.from_words(words)
    top_pairs = collocation_finder.nbest(
        BigramAssocMeasures().likelihood_ratio, 10)

    counts = defaultdict(int)
    for pair in top_pairs:
        counts[pair] = counts[pair] + 1
    return counts
def bigram_colwise(col):
    """Return up to four of the most frequent bigrams in ``col``.

    Only tokens of at least four characters are paired. Each bigram is
    returned as its two words joined by an underscore, most frequent first.

    Args:
        col: text to tokenise.
    """
    long_tokens = [tok for tok in word_tokenize(col) if len(tok) >= 4]
    collocation_finder = BigramCollocationFinder.from_words(long_tokens)
    scored = collocation_finder.score_ngrams(BigramAssocMeasures().raw_freq)
    by_frequency = sorted(scored, key=itemgetter(1), reverse=True)

    # A dict keeps the first-seen order of labels while merging duplicates.
    labels = {'_'.join(pair): freq for pair, freq in by_frequency}
    return list(labels)[:4]
def retrieve_top_bigrams_collocations(corpus, top=5, measure='pmi'):
    """Return the ``top`` best bigrams of a corpus under the chosen measure.

    Args:
        corpus: iterable of strings; each is split on whitespace and treated
            as one document.
        top: number of bigrams to return.
        measure: ``'pmi'`` or ``'frequency'``.

    Raises:
        ValueError: if ``measure`` is neither of the supported names.
    """
    documents = [item.split() for item in corpus]
    collocation_finder = BigramCollocationFinder.from_documents(documents)
    measures = BigramAssocMeasures()

    if measure == 'pmi':
        scorer = measures.pmi
    elif measure == 'frequency':
        scorer = measures.raw_freq
    else:
        raise ValueError('Type of measure is unknown!')

    return collocation_finder.nbest(scorer, top)
def common_collocations(text, occurences=20):
    """Return the top bigrams and trigrams of ``text`` as phrases.

    For bigrams and then trigrams, the ``occurences`` best n-grams by
    Student's t score are taken, ignoring words shorter than two characters.
    Each n-gram is joined with spaces.

    Returns:
        list of phrase strings, bigrams first, then trigrams.
    """
    tokens = word_tokenize(text)
    configurations = (
        (BigramAssocMeasures(), BigramCollocationFinder, 2),
        (TrigramAssocMeasures(), TrigramCollocationFinder, 3),
    )

    phrases = []
    for scorer, finder_cls, size in configurations:
        collocation_finder = finder_cls.from_words(tokens, window_size=size)
        collocation_finder.apply_word_filter(lambda w: len(w) < 2)
        collocation_finder.apply_freq_filter(1)
        best = collocation_finder.nbest(scorer.student_t, occurences)
        phrases.extend([" ".join(gram) for gram in best])
    return phrases
def getBigramFeatures(documents, stopwords):
    """Return up to 1000 bigram features ranked by raw frequency.

    Args:
        documents: iterable of ``(email, category)`` pairs where ``email`` is
            an iterable of word strings.
        stopwords: container of words to exclude.

    Returns:
        list of bigram tuples.
    """
    # Lower-case every word of every document into one flat list.
    all_words_list = []
    for email, _category in documents:
        for word in email:
            all_words_list.append(word.lower())

    collocation_finder = BigramCollocationFinder.from_words(all_words_list)
    collocation_finder.apply_word_filter(alpha_filter)
    collocation_finder.apply_word_filter(lambda w: w in stopwords)

    scored = collocation_finder.score_ngrams(BigramAssocMeasures().raw_freq)
    return [ngram for ngram, _score in scored[:1000]]
def bigrams(unigram_stats, bigram_stats, measure="pmi", freq_filter=20):
    """Score bigrams from precomputed frequency distributions.

    Args:
        unigram_stats (FreqDist): word frequencies.
        bigram_stats (FreqDist): bigram frequencies.
        measure (str): name of an attribute of BigramAssocMeasures, such as
            "pmi" or "student_t".
        freq_filter (int): minimum number of occurrences for a bigram to be
            considered.

    Returns:
        The result of ``score_ngrams`` for the selected measure.
    """
    collocation_finder = BigramCollocationFinder(unigram_stats, bigram_stats)
    collocation_finder.apply_freq_filter(freq_filter)
    score_fn = getattr(BigramAssocMeasures(), measure)
    return collocation_finder.score_ngrams(score_fn)
def extract_bigrams(titles, stopwords):
    """Return high-PMI bigrams from titles, excluding any with a stop word.

    Titles are split on single spaces into one word list. Bigrams occurring
    fewer than five times are dropped, and the 1000 best by PMI are then
    filtered against ``stopwords``.

    Returns:
        list of bigram tuples in which neither word is a stop word.
    """
    words = []
    for title in titles:
        words.extend(title.split(' '))

    collocation_finder = BigramCollocationFinder.from_words(words)
    collocation_finder.apply_freq_filter(5)
    candidates = collocation_finder.nbest(BigramAssocMeasures().pmi, 1000)

    return [
        pair for pair in candidates
        if pair[0] not in stopwords and pair[1] not in stopwords
    ]
def collocations(self, words):
    '''
    Returns a frequency distribution of collocations: the ten best
    likelihood-ratio bigrams of ``words``, each mapped to its count,
    as a defaultdict(int).
    NOT CURRENTLY IN USE
    '''
    collocation_finder = BigramCollocationFinder.from_words(words)
    top_pairs = collocation_finder.nbest(
        BigramAssocMeasures().likelihood_ratio, 10)

    counts = defaultdict(int)
    for pair in top_pairs:
        counts[pair] = counts[pair] + 1
    return counts
def create_wordCloud_dict_bigrams(text_content, bad_bigrams=()):
    """Map each bigram of ``text_content`` to its raw frequency.

    Args:
        text_content: sequence of tokens.
        bad_bigrams: iterable of space-joined bigram strings to leave out of
            the result. The default is an immutable empty tuple rather than a
            shared mutable list.

    Returns:
        dict keyed by ``"word1 word2"`` with the raw-frequency score as the
        value, in the order produced by ``score_ngrams``.
    """
    finder = BigramCollocationFinder.from_words(text_content)
    scored = finder.score_ngrams(BigramAssocMeasures().raw_freq)

    word_dict = {' '.join(ngram): score for ngram, score in scored}
    for bad_bigram in bad_bigrams:
        word_dict.pop(bad_bigram, None)
    return word_dict
def pmi_jumlah(text1, text2):
    """Return the summed log association score of two texts' bigrams.

    Both texts are split on whitespace. Words of more than two characters
    that are not English stop words are lower-cased, the two word lists are
    concatenated, and the natural log of each bigram's ``mi_like`` score is
    summed.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        float sum of ``math.log(score)`` over all scored bigrams.
    """
    stopwords_ = set(stopwords.words('english'))

    def content_words(text):
        # Compare the lower-cased form with the stop-word set; checking the
        # original casing let capitalised stop words such as "The" through.
        return [
            word.lower() for word in text.split()
            if len(word) > 2 and word.lower() not in stopwords_
        ]

    finder = BigramCollocationFinder.from_words(
        content_words(text1) + content_words(text2))
    # NOTE(review): the measure used is mi_like, not pmi, despite the
    # function name.
    score = BigramAssocMeasures().mi_like
    return sum(math.log(value) for _bigram, value in finder.score_ngrams(score))
def _write_ngram_counts(out_path: str, scored_ngrams, num_tokens: int) -> None:
    """Write one line per n-gram: absolute count, a tab, the joined words."""
    with open(out_path, 'w') as out_file:
        for ngram, relative_freq in scored_ngrams:
            absolute_freq = relative_freq * num_tokens
            ngram_str = ' '.join(ngram)
            out_file.write(f'{absolute_freq:.0f}\t{ngram_str}\n')


def compute_collocation(corpora_dir: str, session: int, party: str,
                        num_chunks: int, bigram_out_path: str,
                        trigram_out_path: str, discard_tokens: Set[str],
                        stop_words: Set[str], min_frequency: int) -> None:
    """
    Count frequent bigrams and trigrams of a chunked corpus and write them out.

    The corpus files ``<session>_<party><chunk_index>.txt`` under
    ``corpora_dir`` are tokenised; discarded tokens and pure-digit tokens are
    removed and the rest lower-cased. N-grams seen fewer than
    ``min_frequency`` times are dropped. Each output line holds the absolute
    count and the n-gram, separated by a tab.

    discard_tokens should be a subset of stop_words. This is used for a
    heuristic to filter trigrams, where the second word is permitted to be a
    stop word (e.g. "freedom of speech") but not a discarded token
    (e.g. "I yield to"). The first and third words can never be a stop word.
    """
    tokenized_corpus: List[str] = []
    for chunk_index in range(num_chunks):
        corpus_path = os.path.join(corpora_dir,
                                   f'{session}_{party}{chunk_index}.txt')
        with open(corpus_path) as corpus_file:
            raw_text = corpus_file.read()
        # Extending straight from the generator needs no per-chunk list and
        # no `del`, so nothing is left unbound when num_chunks is 0.
        tokenized_corpus.extend(
            t.lower() for t in nltk.tokenize.word_tokenize(raw_text)
            if t not in discard_tokens and not t.isdigit())

    bigram_finder = BigramCollocationFinder.from_words(tokenized_corpus)
    bigram_finder.apply_freq_filter(min_frequency)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)

    trigram_finder = TrigramCollocationFinder.from_words(tokenized_corpus)
    trigram_finder.apply_freq_filter(min_frequency)
    trigram_finder.apply_ngram_filter(lambda w1, w2, w3: (
        w1 in stop_words) or (w3 in stop_words) or (w2 in discard_tokens))
    trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)

    # Relative frequencies are scaled by the token count to recover counts.
    num_tokens = len(tokenized_corpus)
    _write_ngram_counts(bigram_out_path, bigrams, num_tokens)
    _write_ngram_counts(trigram_out_path, trigrams, num_tokens)
def get_keyword_collocations(tokens, keyword, windowsize=10, numresults=35):
    '''Collect the top collocates of ``keyword`` from a token list.

    Bigrams are gathered within a sliding window and only pairs containing
    the keyword are kept. Words shorter than two characters and English stop
    words are ignored. The pairs are ranked by Student's t score and the best
    ``numresults`` are recorded.

    Results are appended to the names ``freq``, ``r``, ``score`` and
    ``collocate``. These are not defined in this function and are presumably
    module-level lists -- confirm. A header and the keyword's total count are
    printed.

    Args:
        tokens: list of word tokens (``tokens.count`` is called on it).
        keyword: the word whose collocates are wanted.
        windowsize: window size used when pairing words.
        numresults: number of top-scoring pairs to record.
    '''
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)

    # A set gives constant-time stop-word checks; the filter runs per word.
    ignored_words = set(stopwords.words('english'))
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    # discard every pair that does not contain the keyword
    finder.apply_ngram_filter(lambda *w: keyword not in w)

    # Other available measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq,
    # .phi_sq, .fisher, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.score_ngrams(bigram_measures.student_t)[:numresults]

    # Raw count of each selected pair. A direct lookup replaces the nested
    # scan over a sorted copy of ngram_fd and gives the same counts.
    for ngram, _t_score in results:
        freq.append(finder.ngram_fd[ngram])

    for ngram, _t_score in results:
        r.append(ngram)

    print("Top collocations for ", str(keyword), ":")
    print('total occurences of' + ' ' + keyword + ':' + ' ',
          tokens.count(keyword))

    for _ngram, t_score in results:
        score.append(t_score)

    # NOTE(review): this walks all of `r`, so pairs left over from earlier
    # calls are processed again unless the caller clears the lists -- confirm.
    for k, v in r:
        # keep whichever word of the pair is not the keyword
        collocate.append(k if k != keyword else v)