コード例 #1
0
ファイル: text.py プロジェクト: blake-irons/codePoem
    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        Results are cached on the instance; the cache is rebuilt whenever
        ``num`` or ``window_size`` differs from the previous call.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        cache_is_valid = (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if not cache_is_valid:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = finder.nbest(measures.likelihood_ratio, num)
        return [" ".join(pair) for pair in self._collocations]
コード例 #2
0
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        Results are cached on the instance and rebuilt only when ``num`` or
        ``window_size`` changes.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        cache_is_valid = (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if not cache_is_valid:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = finder.nbest(measures.likelihood_ratio, num)
        joined = [' '.join(pair) for pair in self._collocations]
        print(tokenwrap(joined, separator="; "))
コード例 #3
0
    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        """
        needs_rebuild = not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if needs_rebuild:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = list(
                finder.nbest(measures.likelihood_ratio, num)
            )
        return self._collocations
コード例 #4
0
 def collocations(self, duanyu_num=20, window_size=2):
     """
     Return noun keywords plus noun-noun collocations from the text.

     Builds the top *duanyu_num* ("duanyu" = phrase) bigram collocations by
     likelihood ratio, keeps only phrases whose every word is tagged as a
     noun, and appends them to the list of standalone noun tokens longer
     than 4 characters.

     :param duanyu_num: maximum number of collocations to score.
     :param window_size: number of tokens spanned by a collocation.
     :return: list of noun words followed by noun-noun phrases, with
         sentence punctuation stripped.
     """
     # FIX: raw string — '\.' / '\?' in a plain string are invalid escape
     # sequences (SyntaxWarning in modern Python). Compile once instead of
     # re-parsing the pattern on every substitution.
     punct_re = re.compile(r'\.|\?|!|…')
     noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

     finder = BigramCollocationFinder.from_words(self.tokens, window_size)
     finder.apply_freq_filter(2)
     finder.apply_word_filter(
         lambda w: len(w) < 3 or w.lower() in self.ignored_words)
     bigram_measures = BigramAssocMeasures()
     self._collocations = finder.nbest(bigram_measures.likelihood_ratio,
                                       duanyu_num)
     cizus = [w1 + ' ' + w2 for w1, w2 in self._collocations]

     tag_word = pos_tag(self.tokens)
     tag_word_map = dict(tag_word)

     # Keep a phrase only if every word is a noun, contains no '.' and is
     # not a stopword (replaces the original flag-variable loop).
     cizu_NN = [
         punct_re.sub('', cizu) for cizu in cizus
         if all(tag_word_map[word] in noun_tags
                and word.find('.') == -1
                and word not in self.ignored_words
                for word in cizu.split())
     ]

     # Standalone noun tokens: longer than 4 chars, not stopwords, no '.'.
     text_n_list = [
         punct_re.sub('', word) for word, tag in tag_word
         if len(word) > 4 and word not in self.ignored_words
         and tag in noun_tags and word.find('.') == -1
     ]
     return text_n_list + cizu_NN
コード例 #5
0
 def get_collocations(self):
     """Return the 40 best bigram collocations (likelihood ratio) from
     ``self.text_array``, skipping stopwords, short words and bigrams seen
     fewer than 3 times."""
     stop_list = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array, 2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(
         lambda token: len(token) < 3 or token.lower() in stop_list)
     return finder.nbest(BigramAssocMeasures().likelihood_ratio, 40)
コード例 #6
0
def ShowCollocations():
	"""Tokenize the contents of the *resultsbox* widget, drop stopwords and
	non-alphabetic tokens, then write the top-10 bigram collocations (by PMI,
	minimum frequency 3) into the *text* widget."""
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	# NOTE(review): the Trigram* imports above are never used in this function.
	# Tokenizer: abbreviations, hyphen/apostrophe words, numbers/percentages,
	# ellipses, and single punctuation characters.
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	# Lowercase and keep only alphabetic non-stopword tokens.
	# NOTE(review): 'stopwords' is assumed to be a word collection defined
	# elsewhere in this module (not the nltk.corpus reader) — confirm.
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	# One collocation pair per line: "first second".
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
コード例 #7
0
 def best_bigram_word_feats(self,
                            words,
                            score_fn=BigramAssocMeasures.chi_sq,
                            n=1000):
     """
     Score every bigram in *words* by likelihood ratio and return a feature
     dict mapping ``"w1 w2"`` -> score. Also stores the scored bigrams on
     ``self.bigrams`` as a side effect.

     NOTE(review): *score_fn* and *n* are currently unused — scoring always
     uses ``likelihood_ratio`` over all bigrams (see the commented-out
     ``nbest`` call). They are kept to preserve the caller-facing interface.
     """
     bgm = BigramAssocMeasures()
     bigram_finder = BigramCollocationFinder.from_words(words)
     self.bigrams = bigram_finder.score_ngrams(bgm.likelihood_ratio)
     # self.bigrams = bigram_finder.nbest(score_fn, n)
     # Dict comprehension instead of dict([...]) — same result, idiomatic.
     d = {' '.join(bigram): s for bigram, s in self.bigrams}
     # d.update(self.best_word_feats(words))
     return d
コード例 #8
0
def bi_collocations(tokens, num=20):
    """Return the top *num* bigram collocations (likelihood ratio) from
    *tokens*, a list of sentences where each sentence is a list of words.
    Stopwords, short words and rare pairs are filtered out."""
    from nltk.corpus import stopwords
    stop_list = stopwords.words('english')

    # Flatten sentence-tokenized input into a single word stream.
    word_stream = [word for sentence in tokens for word in sentence]
    finder = BigramCollocationFinder.from_words(word_stream, 2)
    finder.apply_freq_filter(3)

    def _reject(w1, w2):
        # length=2 want to keep e.g. rf pulse
        if len(w1) < 3 or len(w2) < 3:
            return True
        if len(w1) + len(w2) < 8:
            return True
        return w1.lower() in stop_list or w2.lower() in stop_list

    finder.apply_ngram_filter(_reject)
    measures = BigramAssocMeasures()
    return finder.nbest(measures.likelihood_ratio, num)
コード例 #9
0
def findtopbigrams(bigrams, word_fd, settings):
    """
    Find the top bigrams by the association measure named in *settings*.

    :param bigrams: iterable of bigram tuples.
    :param word_fd: FreqDist of individual words (for the finder).
    :param settings: dict with 'nkey' (how many bigrams) and 'measure'
        ('LR', 'PMI', 'CHISQ', 'STUDT'; anything else = raw frequency).
    :return: (list of (bigram, count) pairs sorted by count descending,
        the bigram FreqDist, a warning string — empty if no fallback).
    """
    nkey = settings['nkey']
    measure = settings['measure']

    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)

    warning = ""

    # Dispatch table replaces the original duplicated if/elif chain.
    measure_fns = {
        "LR": bigram_measures.likelihood_ratio,
        "PMI": bigram_measures.pmi,
        "CHISQ": bigram_measures.chi_sq,
        "STUDT": bigram_measures.student_t,
    }

    if measure in measure_fns:
        try:
            top_bigrams = finder.nbest(measure_fns[measure], nkey)
        # FIX: narrowed from bare 'except:' which also swallowed
        # KeyboardInterrupt / SystemExit.
        except Exception:
            warning = "Problem with %s measure. Default to simple frequency (RAW setting)" % measure
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    else:
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)

    # score bigrams using LR or similar measure but more helpful to end user
    # to see raw counts + explain measure used in tool tip
    top_set = set(top_bigrams)  # O(1) membership instead of O(n) list scans
    top_bg_with_count = sorted(
        [(bg, count) for (bg, count) in finder.ngram_fd.items() if bg in top_set],
        key=lambda bgcount: -bgcount[1])
    # Drop hapax pairs and degenerate self-pairs like ('the', 'the').
    top_bigrams = [(bg, count) for (bg, count) in top_bg_with_count
                   if count > 1 and bg[0] != bg[1]]
    return top_bigrams, bigram_fd, warning
コード例 #10
0
ファイル: collocate.py プロジェクト: joshoe/CompLing_II
def collocRecursively(corp,interp,constructor,threshhold,addUnrelated,addBigram,filters=None):
	"""
	Repeatedly merge adjacent token pairs whose likelihood-ratio score
	exceeds *threshhold* [sic], recursing until a full pass makes no merge.

	:param corp: sequence of tokens (the working corpus).
	:param interp: callable mapping a corpus item to the key used for the
	    score lookup.
	:param constructor: builds a collocation finder from *corp*
	    (e.g. ``BigramCollocationFinder.from_words``).
	:param threshhold: minimum likelihood-ratio score required to merge.
	:param addUnrelated: callback(newCorp, item) appending a lone item.
	:param addBigram: callback(newCorp, (w1, w2)) appending a merged pair.
	:param filters: optional filters forwarded to ``applyFilters``.
	:return: the corpus once no adjacent pair scores above the threshold.
	"""
	bgFinder = constructor(corp)
	if filters:
		bgFinder = applyFilters(bgFinder,filters)
	# Score every bigram once per pass; dict for O(1) lookup below.
	bgScores = {bg:score for bg,score in bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)}
	# Debug output: the six best-scoring pairs of this pass.
	print(sorted(list(bgScores.items()),key=lambda tup: tup[1])[-6:])
	idx = 0
	N = len(corp)
	newCorp = list()
	flag = False  # did this pass merge at least one pair?
	while idx < N-1:
		bg = (corp[idx],corp[idx+1])
		# Unknown pairs default to score 0 (never above a positive threshold).
		if bgScores.get((interp(bg[0]),interp(bg[1])),0) > threshhold:
			addBigram(newCorp,bg)
			idx += 2  # consume both tokens of the merged pair
			flag = True
		else:
			addUnrelated(newCorp,bg[0])
			idx += 1
	if idx == N-1:
		# Trailing token that was never the left element of a pair.
		addUnrelated(newCorp,corp[idx])
	if flag:
		# At least one merge happened: rescore the shorter corpus and recurse.
		return collocRecursively(newCorp, interp, constructor, threshhold, addUnrelated, addBigram, filters)
	return newCorp
コード例 #11
0
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
# FIX: the original variable was misleadingly named 'unigrams' although
# ngrams(..., 4) yields 4-grams (quadgrams).
quadgrams = ngrams(alpino.words(), 4)
print(quadgrams)
# for i in quadgrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Reject tokens shorter than 3 characters or that are stopwords.
stops_filter = lambda w: len(w) < 3 or w in stop_words
tokens = [t.lower() for t in webtext.words('grail.txt')]
# FIX: renamed from 'words' — this is a collocation finder, not a word list.
webtext_finder = BigramCollocationFinder.from_words(tokens)
print(webtext_finder)
webtext_finder.apply_word_filter(stops_filter)
# Top 5 bigrams by likelihood ratio.
res = webtext_finder.nbest(BigramAssocMeasures.likelihood_ratio, 5)
print(res)

# Generate bigrams with a collocation finder over a small sample sentence.
import nltk
text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))