def bigrams(unigram_stats, bigram_stats, measure="pmi", freq_filter=20):
    """Score bigram collocations with a named NLTK association measure.

    Args:
        unigram_stats (FreqDist): word frequencies handed to the finder.
        bigram_stats (FreqDist): bigram frequencies handed to the finder.
        measure (str): name of the scoring method looked up on a
            ``BigramAssocMeasures`` instance, e.g. "pmi" or "student_t".
        freq_filter (int): minimum number of occurrences a bigram needs
            in order to be considered.

    Returns:
        The result of ``BigramCollocationFinder.score_ngrams`` for the
        chosen measure, i.e. the list of scored bigrams.

    Raises:
        AttributeError: if ``measure`` is not an attribute of
            ``BigramAssocMeasures``.
    """
    collocation_finder = BigramCollocationFinder(unigram_stats, bigram_stats)
    collocation_finder.apply_freq_filter(freq_filter)
    # The measure is resolved by name only after the filter has been applied.
    score_fn = getattr(BigramAssocMeasures(), measure)
    return collocation_finder.score_ngrams(score_fn)
# Exploratory collocation / concordance dump for the corpus.
# Relies on `overall_text`, `master_tokens`, `nltk` and `sys` being defined
# earlier in the file.
# Single-argument print(...) is used throughout: it produces the same output
# under Python 2's print statement and Python 3's print function.
print("---------- 100 collocations -----------")
overall_text.collocations(num=100)
print("---------- ---------------- -----------")
# Text.concordance() writes its matches to stdout itself and returns None,
# so wrapping it in `print` only appended a stray "None" line.
overall_text.concordance('Imperium')
index = nltk.text.ConcordanceIndex(master_tokens, key=lambda s: s.lower())

# NOTE(review): the script stops here; everything below is unreachable
# until this exit is removed.
sys.exit(0)

# These imports are deliberately left below the exit: hoisting them would
# execute `from nltk import bigrams` and rebind any earlier `bigrams` name.
from nltk import bigrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder

# http://nltk.googlecode.com/svn/trunk/doc/howto/collocations.html
# http://stackoverflow.com/questions/9151326/python-nltk-find-collocations-without-dot-separated-words
bigram_measures = collocations.BigramAssocMeasures()
word_fd = FreqDist(master_tokens)
bigram_fd = FreqDist(bigrams(master_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
# Optional punctuation filter, currently disabled:
#finder.apply_word_filter(lambda w: w in ('.', ','))

# Keep only collocations that occur 3+ times.
finder.apply_freq_filter(3)
scored = finder.score_ngrams(bigram_measures.raw_freq)
# Alternative output, currently disabled:
#print sorted(bigram for bigram, score in scored)
print("=========================================")
# Take the 200 highest raw-frequency bigrams, then list them in reverse
# sorted order of the bigram tuples themselves.
print(sorted(finder.nbest(bigram_measures.raw_freq, 200), reverse=True))