def Mixup3Feature(word_list, bi_method=BigramAssocMeasures.chi_sq,
                  bi_n=2000, tri_method=TrigramAssocMeasures.chi_sq, tri_n=1000):
    # Guard against the error nbest raises when word_list contains only one distinct word
    if len(set(word_list)) != 1:
        bigram_list = BigramCollocationFinder.from_words(word_list)
        top_bigram = bigram_list.nbest(bi_method, bi_n)
        trigram_list = TrigramCollocationFinder.from_words(word_list)
        top_trigram = trigram_list.nbest(tri_method, tri_n)
        return UniGramFeature(word_list + top_bigram + top_trigram)
    else:
        trigram_list = TrigramCollocationFinder.from_words(word_list)
        top_trigram = trigram_list.nbest(tri_method, tri_n)
        return UniGramFeature(word_list + top_trigram)
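A minimal usage sketch for the function above. UniGramFeature is defined elsewhere in the original project, so a hypothetical stand-in is used here, and the token list is made up.

from nltk.collocations import (BigramCollocationFinder, BigramAssocMeasures,
                               TrigramCollocationFinder, TrigramAssocMeasures)

def UniGramFeature(items):
    # hypothetical stand-in: map every token / n-gram tuple to a boolean feature
    return dict((item, True) for item in items)

tokens = "the quick brown fox jumps over the lazy dog".split() * 3  # toy data
features = Mixup3Feature(tokens, bi_n=10, tri_n=5)
print(list(features)[:5])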
def association(self, measure='pmi'):
    if measure in self._association_dict:
        return self._association_dict[measure]
    ngrams = [self.ngram]
    collocs = {}
    for ngram in ngrams:
        self.ngram = ngram
        dist = self._get_freq_distributions()
        if len(self.ngram) == 2:
            finder = BigramCollocationFinder(*dist)
            measures = getattr(bigram_measures, measure)
        else:
            finder = TrigramCollocationFinder(*dist)
            measures = getattr(trigram_measures, measure)
        try:
            collocs = finder.score_ngrams(measures)
            collocs = dict((x[0][self.edit_pos], (i, x[1]))
                           for i, x in enumerate(collocs))
        except Exception as e:
            print('Exception in pmi_preps', e)
            print(self)
            print(dist)
            collocs = {}
        self._association_dict[measure] = collocs
        if collocs:
            return collocs
    return collocs
def get_trigrams(words):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(words)
    # Restrict trigrams to those that appear at least three times
    finder.apply_freq_filter(3)
    return sorted(finder.ngram_fd.items())
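A quick way to exercise get_trigrams on a toy token list (made up here); only trigrams seen at least three times survive the frequency filter.

import nltk
from nltk.collocations import TrigramCollocationFinder

words = ("to be or not to be that is the question " * 3).split()  # toy tokens
for trigram, count in get_trigrams(words):
    print(count, trigram)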
def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)
    document_words = set(words_in_document)
    features = dict([(ngram, True)
                     for ngram in itertools.chain(words_in_document, trigrams)
                     if len(ngram) >= 2 and ngram not in stopwords])
    # for word in word_features:
    #     features['contains(%s)' % word] = (word in document_words)
    #     # features['count(%s)' % word] = (document.count(word))
    return features
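A hedged usage sketch. The stopwords set (and the commented-out word_features) live elsewhere in the original module, so a tiny hypothetical stop list is supplied and the document tokens are made up.

import itertools
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

stopwords = {"the", "of", "and"}  # hypothetical stand-in for the module's stop list
doc = "new york city is part of new york state and new york city is large".split()
feats = document_features(doc, n=10)
print(len(feats), "features")
for feat in list(feats)[:5]:
    print(feat)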
def grams():
    from nltk import BigramCollocationFinder, BigramAssocMeasures, TrigramAssocMeasures, TrigramCollocationFinder
    words = get_words()
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(40)
    bigrams = finder.nbest(bigram_measures.pmi, 500)
    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(100)
    trigrams = finder3.nbest(trigram_measures.pmi, 300)
    combos2 = [combo2[0] + " " + combo2[1] for combo2 in bigrams]
    combos3 = [combo3[0] + " " + combo3[1] + " " + combo3[2] for combo3 in trigrams]
    return combos2, combos3
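A sketch of how grams() might be driven. get_words() is not shown in the snippet, so a stand-in returning a repeated toy sentence is assumed; the repetition is only there so the bigram/trigram frequency filters (40 and 100) leave something behind.

def get_words():
    # hypothetical stand-in for the project's real loader/tokenizer
    return ("machine learning models need training data " * 120).split()

combos2, combos3 = grams()
print(combos2[:5])
print(combos3[:5])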
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

ret_axes: Axes = pivot_by_length_frequency[3:].plot(kind='barh')
ret_axes.plot(pivot_by_length_frequency['frequency'].iloc[3:], list(ret_axes.get_yticks()))
ret_axes.set_xlabel("Bin wise cumulative frequencies by word length")
ret_axes.set_title("Whole corpus")
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

bigrams = BigramCollocationFinder.from_words(corpus)
trigrams = TrigramCollocationFinder.from_words(corpus)

phone_nums = {}

def is_phone_number(phone_str: str) -> bool:
    phone_regex_1 = r'(?:\+1(?P<delim>(\.|-)?))([0-9]{3})(?P=delim)([0-9]{3})(?P=delim)([0-9]{4})'
    phone_regex_2 = r'([0-9]{3})(?P<delim>(\.|-)?)([0-9]{3})(?P=delim)([0-9]{4})'
    if phone_str.startswith('+1'):
        match = re.fullmatch(phone_regex_1, phone_str)
        if not match:
            return False
        num = '-'.join(match.groups()[2:])
        phone_nums[num] = phone_nums.get(num, 0) + 1
    else:
        match = re.fullmatch(phone_regex_2, phone_str)
one_grams = calculate_n_grams(words, 1)
one_grams_freq = calculate_n_grams_frequency(one_grams, words, 1)

# Calculate MI
be_mi = calculate_mi(be_grams_freq, 2, one_grams_freq, len(words), 30)
print()
print(be_mi)
three_mi = calculate_mi(three_grams_freq, 3, one_grams_freq, len(words), 30)
print()
print(three_mi)

# Check against NLTK's PMI-based collocation finders
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
text = nltk.Text(words)
finder_bi = BigramCollocationFinder.from_words(text)
finder_thr = TrigramCollocationFinder.from_words(text)
print()
be_best = finder_bi.nbest(bigram_measures.pmi, 30)
print(be_best)
print()
tri_best = finder_thr.nbest(trigram_measures.pmi, 30)
print(tri_best)
finally:
    f.close()

tokens = WhitespaceTokenizer().tokenize(content)
content = ""

# 1-5 stages
# res_bigrams = dict(bigrams(tokens))
# print(set(res_bigrams.keys()))
# print(res_bigrams)
# print(f"Number of bigrams: {len(res_bigrams)}")
# find_dict = BigramCollocationFinder.from_words(tokens).ngram_fd
# head = random.choice(list(finder.word_fd.keys()))
# head = random.choice(list(set(res_bigrams.keys())))
# head = random.choice(tokens)
# print(head, end=" ")
# print(find_dict.keys())

find_dict3 = dict(TrigramCollocationFinder.from_words(tokens).ngram_fd)
result = {}
for i in range(10):
    j = 0
    while True:
        if j == 0:
            head3 = random.choice(list(find_dict3.keys()))
            head = (head3[0], head3[1])
            while re.match(r"[A-Z]\w*[^.!?]$", head[0]) is None:
                head3 = random.choice(list(find_dict3.keys()))
                head = (head3[0], head3[1])
            print(f"{head[0]} {head[1]}", end=" ")
        result.clear()
        for (_, __, ___) in find_dict3.keys():
            if (_, __) == head:
                result[(___)] = find_dict3[(_, __, ___)]
finder = BigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(BigramAssocMeasures.raw_freq)

# Write two-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")

# sorted(bigram for bigram, score in scored)
mostFrequentBigramsList = finder.nbest(BigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentBigramsList:
    output_file1.write(str(item) + "\n")

# Find trigram collocations
finder = TrigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(TrigramAssocMeasures.raw_freq)

# Write three-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")

mostFrequentTrigramsList = finder.nbest(TrigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentTrigramsList:
    output_file2.write(str(item) + "\n")

log_file.close()
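For reference, a tiny self-contained illustration of what score_ngrams(raw_freq) yields before it is written to the log files above; the tokens are made up.

from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

toy_words = "strong black coffee , strong black coffee , weak green tea".split()
toy_finder = TrigramCollocationFinder.from_words(toy_words)
for ngram, score in toy_finder.score_ngrams(TrigramAssocMeasures.raw_freq)[:5]:
    print(round(score, 3), ngram)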
def TriGramFeature(word_list, method=TrigramAssocMeasures.chi_sq, n=1000):
    trigram_list = TrigramCollocationFinder.from_words(word_list)
    top_trigram = trigram_list.nbest(method, n)
    return UniGramFeature(top_trigram)
def run_wordcloud_model(entry_id, mode):
    # Extract paragraph and header text from the given json entry and build a word cloud from it
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        # do topic extraction on paragraph and header text
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens
                      if (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]
        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # set up and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # set up and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)
            # By default finder.score_ngrams is sorted, but don't rely on that behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # It maps each n-gram (joined with underscores) to its score, e.g.
            # word_dict = {'bigram_A': 0.000697411,
            #              'bigram_B': 0.000524882}
            word_dict = {}
            # Join the n-gram into a contiguous string for the dictionary key
            # and set the key to the scored value.
            for ngram, score in scoredList:
                word_dict['_'.join(ngram)] = score
            # print('dic', word_dict)
        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {'wordcloud_results_' + mode: []}})
        print("vocabulary is empty")
        return "Vocabulary is empty"
    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words
    mycol.update_one({'_id': entry_id},
                     {'$set': {'wordcloud_results_' + mode: word_cloud_results}})
    print("Successfully extended the data entry with wordcloud results", entry_id)

# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
def get_wc_results(text, mode):
    try:
        # do topic extraction on paragraph and header text
        h_p_data = text
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens
                      if (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]
        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # set up and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # set up and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)
            # By default finder.score_ngrams is sorted, but don't rely on that behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # It maps each n-gram (joined with underscores) to its score, e.g.
            # word_dict = {'bigram_A': 0.000697411,
            #              'bigram_B': 0.000524882}
            word_dict = {}
            # Join the n-gram into a contiguous string for the dictionary key
            # and set the key to the scored value.
            for ngram, score in scoredList:
                word_dict['_'.join(ngram)] = score
            print('dic', word_dict)
        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        return []
    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    return word_cloud_results
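A reduced, self-contained sketch of the 'tri' branch above: score trigrams by raw frequency and feed them to WordCloud.generate_from_frequencies. The sample tokens are made up, and the lemmatization / stop-token filtering steps are omitted.

from operator import itemgetter
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
from wordcloud import WordCloud

tokens = ("ground water monitoring " * 10 + "soil vapour survey " * 5).split()
finder = TrigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(TrigramAssocMeasures().raw_freq)
word_dict = {'_'.join(ngram): score
             for ngram, score in sorted(scored, key=itemgetter(1), reverse=True)}
wc = WordCloud(background_color="white", max_words=100).generate_from_frequencies(word_dict)
wc.to_file("trigram_cloud.png")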