コード例 #1
0
ファイル: text.py プロジェクト: blake-irons/codePoem
    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        Results are cached on the instance; the cache is rebuilt whenever
        ``num`` or ``window_size`` differs from the previous call.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        cache_is_valid = (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if not cache_is_valid:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = finder.nbest(measures.likelihood_ratio, num)
        return [" ".join(pair) for pair in self._collocations]
コード例 #2
0
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        Results are cached on the instance and rebuilt only when ``num`` or
        ``window_size`` changes.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        cache_is_valid = (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if not cache_is_valid:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = finder.nbest(measures.likelihood_ratio, num)
        joined = [' '.join(pair) for pair in self._collocations]
        print(tokenwrap(joined, separator="; "))
コード例 #3
0
    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        """
        needs_rebuild = not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        )
        if needs_rebuild:
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            stop_list = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            # Ignore rare pairs, short tokens and stopwords.
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda token: len(token) < 3 or token.lower() in stop_list
            )
            measures = BigramAssocMeasures()
            self._collocations = list(
                finder.nbest(measures.likelihood_ratio, num)
            )
        return self._collocations
コード例 #4
0
 def collocations(self, duanyu_num=20, window_size=2):
     """
     Return noun keywords plus noun-noun collocations from the text.

     Builds the top *duanyu_num* ("duanyu" = phrase) bigram collocations by
     likelihood ratio, keeps only phrases whose every word is tagged as a
     noun, and appends them to the list of standalone noun tokens longer
     than 4 characters.

     :param duanyu_num: maximum number of collocations to score.
     :param window_size: number of tokens spanned by a collocation.
     :return: list of noun words followed by noun-noun phrases, with
         sentence punctuation stripped.
     """
     # FIX: raw string — '\.' / '\?' in a plain string are invalid escape
     # sequences (SyntaxWarning in modern Python). Compile once instead of
     # re-parsing the pattern on every substitution.
     punct_re = re.compile(r'\.|\?|!|…')
     noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

     finder = BigramCollocationFinder.from_words(self.tokens, window_size)
     finder.apply_freq_filter(2)
     finder.apply_word_filter(
         lambda w: len(w) < 3 or w.lower() in self.ignored_words)
     bigram_measures = BigramAssocMeasures()
     self._collocations = finder.nbest(bigram_measures.likelihood_ratio,
                                       duanyu_num)
     cizus = [w1 + ' ' + w2 for w1, w2 in self._collocations]

     tag_word = pos_tag(self.tokens)
     tag_word_map = dict(tag_word)

     # Keep a phrase only if every word is a noun, contains no '.' and is
     # not a stopword (replaces the original flag-variable loop).
     cizu_NN = [
         punct_re.sub('', cizu) for cizu in cizus
         if all(tag_word_map[word] in noun_tags
                and word.find('.') == -1
                and word not in self.ignored_words
                for word in cizu.split())
     ]

     # Standalone noun tokens: longer than 4 chars, not stopwords, no '.'.
     text_n_list = [
         punct_re.sub('', word) for word, tag in tag_word
         if len(word) > 4 and word not in self.ignored_words
         and tag in noun_tags and word.find('.') == -1
     ]
     return text_n_list + cizu_NN
コード例 #5
0
 def get_collocations(self):
     """Return the 40 best bigram collocations (likelihood ratio) from
     ``self.text_array``, skipping stopwords, short words and bigrams seen
     fewer than 3 times."""
     stop_list = stopwords.words('english')
     finder = BigramCollocationFinder.from_words(self.text_array, 2)
     finder.apply_freq_filter(3)
     finder.apply_word_filter(
         lambda token: len(token) < 3 or token.lower() in stop_list)
     return finder.nbest(BigramAssocMeasures().likelihood_ratio, 40)
コード例 #6
0
def ShowCollocations():
	"""Tokenize the contents of the *resultsbox* widget, drop stopwords and
	non-alphabetic tokens, then write the top-10 bigram collocations (by PMI,
	minimum frequency 3) into the *text* widget."""
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	# NOTE(review): the Trigram* imports above are never used in this function.
	# Tokenizer: abbreviations, hyphen/apostrophe words, numbers/percentages,
	# ellipses, and single punctuation characters.
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	# Lowercase and keep only alphabetic non-stopword tokens.
	# NOTE(review): 'stopwords' is assumed to be a word collection defined
	# elsewhere in this module (not the nltk.corpus reader) — confirm.
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	# One collocation pair per line: "first second".
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
コード例 #7
0
 def best_bigram_word_feats(self,
                            words,
                            score_fn=BigramAssocMeasures.chi_sq,
                            n=1000):
     """
     Score every bigram in *words* by likelihood ratio and return a feature
     dict mapping ``"w1 w2"`` -> score. Also stores the scored bigrams on
     ``self.bigrams`` as a side effect.

     NOTE(review): *score_fn* and *n* are currently unused — scoring always
     uses ``likelihood_ratio`` over all bigrams (see the commented-out
     ``nbest`` call). They are kept to preserve the caller-facing interface.
     """
     bgm = BigramAssocMeasures()
     bigram_finder = BigramCollocationFinder.from_words(words)
     self.bigrams = bigram_finder.score_ngrams(bgm.likelihood_ratio)
     # self.bigrams = bigram_finder.nbest(score_fn, n)
     # Dict comprehension instead of dict([...]) — same result, idiomatic.
     d = {' '.join(bigram): s for bigram, s in self.bigrams}
     # d.update(self.best_word_feats(words))
     return d
コード例 #8
0
def bi_collocations(tokens, num=20):
    """Return the top *num* bigram collocations (likelihood ratio) from
    *tokens*, a list of sentences where each sentence is a list of words.
    Stopwords, short words and rare pairs are filtered out."""
    from nltk.corpus import stopwords
    stop_list = stopwords.words('english')

    # Flatten sentence-tokenized input into a single word stream.
    word_stream = [word for sentence in tokens for word in sentence]
    finder = BigramCollocationFinder.from_words(word_stream, 2)
    finder.apply_freq_filter(3)

    def _reject(w1, w2):
        # length=2 want to keep e.g. rf pulse
        if len(w1) < 3 or len(w2) < 3:
            return True
        if len(w1) + len(w2) < 8:
            return True
        return w1.lower() in stop_list or w2.lower() in stop_list

    finder.apply_ngram_filter(_reject)
    measures = BigramAssocMeasures()
    return finder.nbest(measures.likelihood_ratio, num)
コード例 #9
0
def findtopbigrams(bigrams, word_fd, settings):
    """
    Find the top bigrams by the association measure named in *settings*.

    :param bigrams: iterable of bigram tuples.
    :param word_fd: FreqDist of individual words (for the finder).
    :param settings: dict with 'nkey' (how many bigrams) and 'measure'
        ('LR', 'PMI', 'CHISQ', 'STUDT'; anything else = raw frequency).
    :return: (list of (bigram, count) pairs sorted by count descending,
        the bigram FreqDist, a warning string — empty if no fallback).
    """
    nkey = settings['nkey']
    measure = settings['measure']

    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)

    warning = ""

    # Dispatch table replaces the original duplicated if/elif chain.
    measure_fns = {
        "LR": bigram_measures.likelihood_ratio,
        "PMI": bigram_measures.pmi,
        "CHISQ": bigram_measures.chi_sq,
        "STUDT": bigram_measures.student_t,
    }

    if measure in measure_fns:
        try:
            top_bigrams = finder.nbest(measure_fns[measure], nkey)
        # FIX: narrowed from bare 'except:' which also swallowed
        # KeyboardInterrupt / SystemExit.
        except Exception:
            warning = "Problem with %s measure. Default to simple frequency (RAW setting)" % measure
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    else:
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)

    # score bigrams using LR or similar measure but more helpful to end user
    # to see raw counts + explain measure used in tool tip
    top_set = set(top_bigrams)  # O(1) membership instead of O(n) list scans
    top_bg_with_count = sorted(
        [(bg, count) for (bg, count) in finder.ngram_fd.items() if bg in top_set],
        key=lambda bgcount: -bgcount[1])
    # Drop hapax pairs and degenerate self-pairs like ('the', 'the').
    top_bigrams = [(bg, count) for (bg, count) in top_bg_with_count
                   if count > 1 and bg[0] != bg[1]]
    return top_bigrams, bigram_fd, warning
コード例 #10
0
ファイル: collocate.py プロジェクト: joshoe/CompLing_II
def collocRecursively(corp,interp,constructor,threshhold,addUnrelated,addBigram,filters=None):
	"""
	Repeatedly merge adjacent token pairs whose likelihood-ratio score
	exceeds *threshhold* [sic], recursing until a full pass makes no merge.

	:param corp: sequence of tokens (the working corpus).
	:param interp: callable mapping a corpus item to the key used for the
	    score lookup.
	:param constructor: builds a collocation finder from *corp*
	    (e.g. ``BigramCollocationFinder.from_words``).
	:param threshhold: minimum likelihood-ratio score required to merge.
	:param addUnrelated: callback(newCorp, item) appending a lone item.
	:param addBigram: callback(newCorp, (w1, w2)) appending a merged pair.
	:param filters: optional filters forwarded to ``applyFilters``.
	:return: the corpus once no adjacent pair scores above the threshold.
	"""
	bgFinder = constructor(corp)
	if filters:
		bgFinder = applyFilters(bgFinder,filters)
	# Score every bigram once per pass; dict for O(1) lookup below.
	bgScores = {bg:score for bg,score in bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)}
	# Debug output: the six best-scoring pairs of this pass.
	print(sorted(list(bgScores.items()),key=lambda tup: tup[1])[-6:])
	idx = 0
	N = len(corp)
	newCorp = list()
	flag = False  # did this pass merge at least one pair?
	while idx < N-1:
		bg = (corp[idx],corp[idx+1])
		# Unknown pairs default to score 0 (never above a positive threshold).
		if bgScores.get((interp(bg[0]),interp(bg[1])),0) > threshhold:
			addBigram(newCorp,bg)
			idx += 2  # consume both tokens of the merged pair
			flag = True
		else:
			addUnrelated(newCorp,bg[0])
			idx += 1
	if idx == N-1:
		# Trailing token that was never the left element of a pair.
		addUnrelated(newCorp,corp[idx])
	if flag:
		# At least one merge happened: rescore the shorter corpus and recurse.
		return collocRecursively(newCorp, interp, constructor, threshhold, addUnrelated, addBigram, filters)
	return newCorp
コード例 #11
0
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
# FIX: the original variable was misleadingly named 'unigrams' although
# ngrams(..., 4) yields 4-grams (quadgrams).
quadgrams = ngrams(alpino.words(), 4)
print(quadgrams)
# for i in quadgrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Reject tokens shorter than 3 characters or that are stopwords.
stops_filter = lambda w: len(w) < 3 or w in stop_words
tokens = [t.lower() for t in webtext.words('grail.txt')]
# FIX: renamed from 'words' — this is a collocation finder, not a word list.
webtext_finder = BigramCollocationFinder.from_words(tokens)
print(webtext_finder)
webtext_finder.apply_word_filter(stops_filter)
# Top 5 bigrams by likelihood ratio.
res = webtext_finder.nbest(BigramAssocMeasures.likelihood_ratio, 5)
print(res)

# Generate bigrams with a collocation finder over a small sample sentence.
import nltk
text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))