def findtoptrigrams(trigrams, word_fd, bigram_fd, settings):
    """Pick the best-scoring trigrams and report them with their raw counts.

    The trigrams are ranked with the association measure named in
    ``settings['measure']``; the ``settings['nkey']`` best ones are kept.
    The ranking score itself is not returned: each surviving trigram is
    paired with its raw frequency instead, because a count is easier for
    an end user to read than an association score.

    Args:
        trigrams: Iterable of 3-tuples of words.
        word_fd: Unigram frequency distribution passed to the finder.
        bigram_fd: Bigram frequency distribution passed to the finder.
        settings: Mapping providing ``'nkey'`` (how many trigrams to keep)
            and ``'measure'`` (``"LR"``, ``"PMI"``, ``"CHISQ"`` or
            ``"STUDT"``; any other value selects raw frequency).

    Returns:
        A ``(top_trigrams, warning)`` tuple. ``top_trigrams`` is a list of
        ``(trigram, count)`` pairs sorted by descending count, restricted
        to trigrams seen more than once whose first two words differ.
        ``warning`` is an empty string, or a message saying that the
        requested measure failed and raw frequency was used instead.
    """
    nkey = settings['nkey']
    measure = settings['measure']

    trigram_measures = TrigramAssocMeasures()
    trigram_fd = FreqDist(trigrams)
    # "Wildcard" pairs: first and third word of each trigram, middle ignored.
    wild_fd = FreqDist([(t[0], t[2]) for t in trigrams])
    finder = TrigramCollocationFinder(word_fd, bigram_fd, wild_fd, trigram_fd)

    # Setting name -> name of the scoring method on TrigramAssocMeasures.
    scorer_names = {
        "LR": "likelihood_ratio",
        "PMI": "pmi",
        "CHISQ": "chi_sq",
        "STUDT": "student_t",
    }

    warning = ""
    top_trigrams = None
    scorer_name = scorer_names.get(measure)
    if scorer_name is not None:
        try:
            top_trigrams = finder.nbest(
                getattr(trigram_measures, scorer_name), nkey)
        except Exception:
            # Best effort: a failing measure must not abort the analysis.
            # Only Exception is caught so Ctrl-C / SystemExit still propagate.
            warning = ("Problem with {} measure. Default to simple frequency "
                       "(RAW setting)").format(measure)
            print(warning)
    if top_trigrams is None:
        # Unknown measure name, or the requested measure raised.
        top_trigrams = finder.nbest(trigram_measures.raw_freq, nkey)

    # Score trigrams using LR or a similar measure, but show raw counts to
    # the end user (the measure used is explained separately in a tool tip).
    selected = set(top_trigrams)  # O(1) membership instead of a list scan
    top_with_counts = [
        (tg, count)
        for tg, count in finder.ngram_fd.items()
        if tg in selected and count > 1 and tg[0] != tg[1]
    ]
    # Stable sort: ties keep the frequency distribution's iteration order.
    top_with_counts.sort(key=lambda tg_count: -tg_count[1])
    return top_with_counts, warning
def trigram_finder(self):
    """Build a trigram collocation finder from this object's distributions.

    Returns:
        A ``TrigramCollocationFinder`` constructed from ``self.word_fd``
        and ``self.trigram_fd``.
    """
    # NOTE(review): NLTK's TrigramCollocationFinder takes four arguments
    # (word, bigram, wildcard and trigram distributions); confirm that the
    # class in scope here really accepts just these two.
    finder = TrigramCollocationFinder(self.word_fd, self.trigram_fd)
    return finder
default='models/trigram.pkl', help='Path to trigrams.') args = parser.parse_args() print(args) unigrams = load_ngrams(args.unigram, 1) bigrams = load_ngrams(args.bigram, 2) if args.n == 3: trigrams = load_ngrams(args.trigram, 3) wildcards = defaultdict(int) for key in trigrams.keys(): wildcards[(key[0], key[2])] += trigrams.get(key) wildcards = FreqDist(wildcards) ngram_measures = nltk.collocations.TrigramAssocMeasures() finder = TrigramCollocationFinder(unigrams, bigrams, wildcards, trigrams) else: ngram_measures = nltk.collocations.BigramAssocMeasures() finder = BigramCollocationFinder(unigrams, bigrams) if args.freq_filter > 0: finder.apply_freq_filter(args.freq_filter) if args.word_filter: finder.apply_ngram_filter(lambda *words: not any( re.match(args.word_filter, word) for word in words)) scored = finder.score_ngrams(eval("ngram_measures." + args.measure)) if args.top: scored = scored[:args.top]