Esempio n. 1
0
def Mixup3Feature(word_list, bi_method=BigramAssocMeasures.chi_sq,\
 bi_n=2000, tri_method=TrigramAssocMeasures.chi_sq, tri_n=1000):
	"""Build mixed unigram + bigram + trigram features for word_list.

	The top bi_n bigrams and top tri_n trigrams (ranked by the given
	association measures) are appended to the word list before the
	unigram feature extraction.
	"""
	trigram_finder = TrigramCollocationFinder.from_words(word_list)
	best_trigrams = trigram_finder.nbest(tri_method, tri_n)
	# Guard: when word_list contains only one distinct word, calling
	# nbest on the bigram finder raises, so skip bigram features then.
	# (Original comment translated from Chinese.)
	if len(set(word_list)) == 1:
		return UniGramFeature(word_list + best_trigrams)
	bigram_finder = BigramCollocationFinder.from_words(word_list)
	best_bigrams = bigram_finder.nbest(bi_method, bi_n)
	return UniGramFeature(word_list + best_bigrams + best_trigrams)
Esempio n. 2
0
    def association(self, measure='pmi'):
        """Score collocations of self.ngram with the given association measure.

        Results are cached per measure name in self._association_dict.
        Returns a dict mapping the token at self.edit_pos to a
        (rank, score) pair; empty dict when scoring fails.
        """
        # EAFP cache lookup: return a previously computed result if present.
        try:
            return self._association_dict[measure]
        except KeyError:
            pass

        collocs = {}
        # NOTE(review): only the current ngram is scored; the list form is
        # kept from the original, presumably for future multi-ngram use.
        for candidate in [self.ngram]:
            self.ngram = candidate
            dist = self._get_freq_distributions()

            # Pick the finder/measure pair matching the ngram arity.
            if len(self.ngram) == 2:
                finder = BigramCollocationFinder(*dist)
                measure_fn = getattr(bigram_measures, measure)
            else:
                finder = TrigramCollocationFinder(*dist)
                measure_fn = getattr(trigram_measures, measure)

            try:
                scored = finder.score_ngrams(measure_fn)
                collocs = {entry[0][self.edit_pos]: (rank, entry[1])
                           for rank, entry in enumerate(scored)}
            except Exception as e:
                # Best-effort: log the failure and fall back to an empty map.
                print('Exception in pmi_preps', e)
                print(self)
                print(dist)
                collocs = {}
            self._association_dict[measure] = collocs
            if collocs:
                return collocs
        return collocs
Esempio n. 3
0
def get_trigrams(words):
    """Return sorted (trigram, count) pairs for trigrams seen >= 3 times.

    words: an iterable of tokens fed to TrigramCollocationFinder.
    """
    finder = TrigramCollocationFinder.from_words(words)

    # Restrict trigrams to those that appear at least three times.
    finder.apply_freq_filter(3)

    # Fix: removed the unused local `trigram_measures` — an
    # nltk TrigramAssocMeasures instance was created but never used.
    return sorted(finder.ngram_fd.items())
Esempio n. 4
0
def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    """Build a boolean feature dict from a document's tokens and top trigrams.

    The top n trigrams (by score_fn) are mixed with the raw tokens; every
    surviving ngram maps to True.  Unigrams shorter than 2 characters are
    dropped by the length check (trigram tuples always have len 3).
    """
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)

    # Fix: removed the unused local `document_words` and the dead
    # commented-out word_features loop.
    # NOTE(review): the `ngram not in stopwords` test also compares trigram
    # tuples against `stopwords` — presumably a set of strings, so trigrams
    # are never filtered by it; confirm that is intended.
    return {ngram: True
            for ngram in itertools.chain(words_in_document, trigrams)
            if len(ngram) >= 2 and ngram not in stopwords}
Esempio n. 5
0
def grams():
    """Mine PMI-ranked collocations from get_words().

    Returns a pair (bigram_phrases, trigram_phrases) where each phrase is
    the ngram's tokens joined by single spaces.
    """
    from nltk import BigramCollocationFinder, BigramAssocMeasures, TrigramAssocMeasures, TrigramCollocationFinder

    words = get_words()

    # Bigrams: keep the 500 highest-PMI pairs seen at least 40 times.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(40)
    top_bigrams = bigram_finder.nbest(BigramAssocMeasures().pmi, 500)

    # Trigrams: keep the 300 highest-PMI triples seen at least 100 times.
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_freq_filter(100)
    top_trigrams = trigram_finder.nbest(TrigramAssocMeasures().pmi, 300)

    combos2 = [" ".join(pair) for pair in top_bigrams]
    combos3 = [" ".join(triple) for triple in top_trigrams]
    return combos2, combos3
# Finish styling the previous chart (its creation is outside this view).
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

# Horizontal bar chart of word-length bins, skipping the first three rows.
# NOTE(review): assumes pivot_by_length_frequency is a pandas DataFrame with
# a 'frequency' column — defined earlier in the file; confirm.
ret_axes: Axes = pivot_by_length_frequency[3:].plot(kind='barh')
ret_axes.plot(pivot_by_length_frequency['frequency'].iloc[3:],
              list(ret_axes.get_yticks()))
ret_axes.set_xlabel("Bin wise cumulative frequencies by word length")
ret_axes.set_title("Whole corpus")
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

# Collocation finders over the full corpus (used further below).
bigrams = BigramCollocationFinder.from_words(corpus)
trigrams = TrigramCollocationFinder.from_words(corpus)

# Tally of normalized phone numbers, filled in by is_phone_number().
phone_nums = {}

def is_phone_number(phone_str: str) -> bool:
    """Validate *phone_str* as a US phone number and tally matches.

    Matching numbers are counted (normalized to XXX-XXX-XXXX) in the
    module-level ``phone_nums`` dict.  NOTE(review): this definition is
    truncated in the current view — the non-'+1' branch continues below.
    """
    # (?P<delim>(\.|-)?) captures the separator once; each (?P=delim)
    # backreference forces the SAME separator between later digit groups.
    phone_regex_1 = r'(?:\+1(?P<delim>(\.|-)?))([0-9]{3})(?P=delim)([0-9]{3})(?P=delim)([0-9]{4})'
    phone_regex_2 = r'([0-9]{3})(?P<delim>(\.|-)?)([0-9]{3})(?P=delim)([0-9]{4})'
    if phone_str.startswith('+1'):
        match = re.fullmatch(phone_regex_1, phone_str)
        if not match:
            return False
        # groups()[2:] are the three digit groups (skips the delim groups).
        num = '-'.join(match.groups()[2:])
        phone_nums[num] = phone_nums.get(num, 0) + 1
    else:
        match = re.fullmatch(phone_regex_2, phone_str)
Esempio n. 7
0
# Unigram counts and frequencies via project helpers (defined elsewhere).
one_grams = calculate_n_grams(words, 1)
one_grams_freq = calculate_n_grams_frequency(one_grams, words, 1)

# Calculate MI
# NOTE(review): be_grams_freq / three_grams_freq come from earlier in the
# file; presumably bigram and trigram frequency tables — confirm.

be_mi = calculate_mi(be_grams_freq, 2, one_grams_freq, len(words), 30)
print()
print(be_mi)

three_mi = calculate_mi(three_grams_freq, 3, one_grams_freq, len(words), 30)
print()
print(three_mi)

# Check
# Cross-check the hand-rolled MI results against nltk's PMI ranking.

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

text = nltk.Text(words)

finder_bi = BigramCollocationFinder.from_words(text)
finder_thr = TrigramCollocationFinder.from_words(text)

# Top 30 bigrams by PMI.
print()
be_best = finder_bi.nbest(bigram_measures.pmi, 30)
print(be_best)

# Top 30 trigrams by PMI.
print()
tri_best = finder_thr.nbest(trigram_measures.pmi, 30)
print(tri_best)
Esempio n. 8
0
     finally:
         f.close()
 tokens = WhitespaceTokenizer().tokenize(content)
 content = ""
 # 1-5 stages
 # res_bigrams = dict(bigrams(tokens))
 # print(set(res_bigrams.keys()))
 # print(res_bigrams)
 # print(f"Number of bigrams: {len(res_bigrams)}")
 # find_dict = BigramCollocationFinder.from_words(tokens).ngram_fd
 # head = random.choice(list(finder.word_fd.keys()))
 # head = random.choice(list(set(res_bigrams.keys())))
 # head = random.choice(tokens)
 # print(head, end=" ")
 # print(find_dict.keys())
 find_dict3 = dict(TrigramCollocationFinder.from_words(tokens).ngram_fd)
 result = {}
 for i in range(10):
     j = 0
     while True:
         if j == 0:
             head3 = random.choice(list(find_dict3.keys()))
             head = (head3[0], head3[1])
             while re.match(r"[A-Z]\w*[^.!?]$", head[0]) is None:
                 head3 = random.choice(list(find_dict3.keys()))
                 head = (head3[0], head3[1])
             print(f"{head[0]} {head[1]}", end=" ")
         result.clear()
         for (_, __, ___) in find_dict3.keys():
             if (_, __) == head:
                 result[(___)] = find_dict3[(_, __, ___)]
# Score all bigrams by raw frequency.
finder = BigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(BigramAssocMeasures.raw_freq)

# Write two-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")

#sorted(bigram for bigram, score in scored)
# Keep only the 150 most frequent bigrams.
mostFrequentBigramsList = finder.nbest(BigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentBigramsList:
    output_file1.write(str(item) + "\n")

# Find trigram collocations
finder = TrigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(TrigramAssocMeasures.raw_freq)

# Write three-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")

# Keep only the 150 most frequent trigrams.
mostFrequentTrigramsList = finder.nbest(TrigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentTrigramsList:
    output_file2.write(str(item) + "\n")

# NOTE(review): output_file1/output_file2 are not closed here — presumably
# closed later in the file or managed by a `with`; confirm.
log_file.close()
Esempio n. 10
0
def TriGramFeature(word_list, method=TrigramAssocMeasures.chi_sq, n=1000):
	"""Extract the top-n trigram collocations and wrap them as unigram features."""
	finder = TrigramCollocationFinder.from_words(word_list)
	best_trigrams = finder.nbest(method, n)
	return UniGramFeature(best_trigrams)
Esempio n. 11
0
def run_wordcloud_model(
    entry_id, mode
):  # this will extract paragraph and header text from given json file and extract the topics from that
    """Generate word-cloud results for one DB entry and store them back.

    entry_id: Mongo _id of the document to process.
    mode: 'single' (unigram cloud), 'bi' (bigram frequencies) or
          'tri' (trigram frequencies).
    Returns "Vocabulary is empty" on failure, otherwise None after
    updating the entry with 'wordcloud_results_<mode>'.
    """
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        # Topic extraction runs over both paragraph and header text.
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white",
                              max_words=100,
                              contour_width=3,
                              contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        data_lemmatized = lemmatization(data_words,
                                        allowed_postags=['NOUN', 'ADJ'])
        all_tokens = [j for i in data_lemmatized for j in i]
        # Drop generic filler words that would otherwise dominate the cloud.
        all_tokens = [
            value for value in all_tokens
            if value not in ('other', 'day', 'thing', 'last')
        ]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            # Score bigrams/trigrams by raw frequency.
            # NOTE(review): an unknown mode leaves `scored` undefined; the
            # resulting NameError is swallowed by the broad except below.
            if mode == 'bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # score_ngrams is usually sorted already, but don't rely on that.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)

            # Map 'word1_word2[_word3]' -> score for the word cloud.
            # BUG FIX: the old loop used range(listLen - 1) and silently
            # dropped the lowest-scoring ngram; include every entry.
            word_dict = {'_'.join(gram): score for gram, score in scoredList}

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        # Best-effort: record an empty result so downstream code can proceed.
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'wordcloud_results_' + mode: []
                         }})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()

    # Collect [word, relative_frequency] pairs from the fitted cloud.
    # (Removed the unused `wordcloud_words` accumulator.)
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
    print(word_cloud_results)

    mycol.update_one(
        {'_id': entry_id},
        {'$set': {
            'wordcloud_results_' + mode: word_cloud_results
        }})
    print("Successfully extended the data entry with wordcloud results",
          entry_id)


# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
Esempio n. 12
0
def get_wc_results(text,mode):
    """Generate word-cloud results for raw text.

    text: iterable of sentences/strings fed to sent_to_words.
    mode: 'single' (unigram cloud), 'bi' (bigram frequencies) or
          'tri' (trigram frequencies).
    Returns a list of [word, relative_frequency] pairs, or [] on failure.
    """
    try:
        h_p_data = text  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white", max_words=100, contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        all_tokens = [j for i in data_lemmatized for j in i]
        # Drop generic filler words that would otherwise dominate the cloud.
        all_tokens = [value for value in all_tokens
                      if value not in ('other', 'day', 'thing', 'last')]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            # Score bigrams/trigrams by raw frequency.
            # NOTE(review): an unknown mode leaves `scored` undefined; the
            # resulting NameError is swallowed by the broad except below.
            if mode=='bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode =='tri':
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # score_ngrams is usually sorted already, but don't rely on that.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)

            # Map 'word1_word2[_word3]' -> score for the word cloud.
            # BUG FIX: the old loop used range(listLen-1) and silently
            # dropped the lowest-scoring ngram; include every entry.
            word_dict = {'_'.join(gram): score for gram, score in scoredList}
            print('dic',word_dict)

        if mode=='single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        # Best-effort: an empty result signals failure to the caller.
        print("cannot make word cloud for empty text")
        return []

    # Visualize the word cloud
    wordcloud.to_image()

    # Collect [word, relative_frequency] pairs from the fitted cloud.
    # (Removed the unused `wordcloud_words` accumulator.)
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res,wordcloud.words_[each_res]])
    print(word_cloud_results)
    return word_cloud_results