def bigrams(unigram_stats, bigram_stats, measure="pmi", freq_filter=20):
    """Score bigrams with an NLTK association measure.

    Args:
        unigram_stats (FreqDist): unigram frequency counts.
        bigram_stats (FreqDist): bigram frequency counts.
        measure (str): name of a ``BigramAssocMeasures`` attribute,
            e.g. "pmi" or "student_t".
        freq_filter (int): drop bigrams occurring fewer than this
            many times before scoring.

    Returns:
        list: (bigram, score) pairs as produced by ``score_ngrams``.
    """
    collocation_finder = BigramCollocationFinder(unigram_stats, bigram_stats)
    collocation_finder.apply_freq_filter(freq_filter)
    scorer = getattr(BigramAssocMeasures(), measure)
    return collocation_finder.score_ngrams(scorer)
def load_ppmi(monograms, bigrams, pickle=None, cache=True):
    """Compute a positive-PMI table for the given n-grams, with pickle caching.

    Args:
        monograms: iterable of unigrams (fed to FreqDist).
        bigrams: iterable of bigram tuples (fed to FreqDist).
        pickle: cache identifier handed to ``get_path``.
        cache (bool): when True and the cache file exists, load and
            return it instead of recomputing.

    Returns:
        dict: maps each bigram tuple to max(PMI, 0.0).
    """
    path = get_path(pickle)
    if cache and isfile(path):
        return pickle_load(path)
    measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder(FreqDist(monograms), FreqDist(bigrams))
    # Clamp negative PMI to zero -> "positive PMI" (PPMI).
    ppmi = {pair: max(score, 0.0)
            for pair, score in finder.score_ngrams(measures.pmi)}
    print("Caching ppmi of length", len(ppmi))
    pickle_dump(ppmi, path)
    return ppmi
def getTopFeaturesForClass(documents, noOfFeaturesPerClass=10):
    '''Rank features per class by PMI with the class label.

    Feature values are in integer. [{document vector}, classId]
    '''
    featuresByClass = defaultdict(list)
    # Unigram counts: each feature repeated `count` times across all docs.
    word_fd = nltk.FreqDist(feature
                            for vector, _ in documents
                            for feature, count in vector.iteritems()
                            for _ in range(count))
    # Class ids must also appear in the unigram distribution, because the
    # "bigrams" below pair (feature, classId).
    for _, clusterId in documents:
        if clusterId not in word_fd:
            word_fd[clusterId] = 0
        word_fd[clusterId] += 1
    # (feature, classId) co-occurrence counts, weighted by feature count.
    bigram_fd = nltk.FreqDist((feature, classId)
                              for vector, classId in documents
                              for feature, count in vector.iteritems()
                              for _ in range(count))
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    pmi = nltk.collocations.BigramAssocMeasures().pmi
    # score_ngrams yields highest-scoring pairs first, so each per-class
    # list below is already in best-first order.
    for (feature, classId), score in finder.score_ngrams(pmi):
        featuresByClass[classId].append((feature, score))
    return [(classId, feats[:noOfFeaturesPerClass])
            for classId, feats in featuresByClass.iteritems()]
def getTopFeaturesForClass(documents, noOfFeaturesPerClass=10):
    '''
    Feature values are in integer. [{document vector}, classId]
    '''
    # NOTE(review): duplicate of the identically named function defined
    # earlier in this file; at import time this later definition shadows
    # the earlier one. Confirm which copy is intended before removing one.
    classToFeaturesMap = defaultdict(list)
    # Unigram counts: each feature repeated `count` times across all docs.
    word_fd = nltk.FreqDist(feature for doc in documents for feature, count in doc[0].iteritems() for i in range(count))
    # Class ids must also appear in the unigram distribution because the
    # "bigrams" below pair (feature, classId).
    for document, clusterId in documents:
        if clusterId not in word_fd:
            word_fd[clusterId] = 0
        word_fd[clusterId] += 1
    # (feature, classId) co-occurrence counts, weighted by feature count.
    bigram_fd = nltk.FreqDist((feature, doc[1]) for doc in documents for feature, count in doc[0].iteritems() for i in range(count))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    # PMI between each feature and each class label; score_ngrams yields
    # highest-scoring pairs first.
    scored = finder.score_ngrams(bigram_measures.pmi)
    for (feature, classId), score in scored:
        classToFeaturesMap[classId].append((feature, score))
    returnData = []
    # Per-class lists inherit the global score order, so the first
    # noOfFeaturesPerClass entries are that class's best features.
    for classId, features in classToFeaturesMap.iteritems():
        returnData.append((classId, features[:noOfFeaturesPerClass]))
    return returnData
from nltk.collocations import BigramAssocMeasures
from nltk import FreqDist
from nltk import bigrams
from nltk.metrics import spearman

# Score verb-noun bigrams from the court corpus with PMI and Student's t,
# write the top 10 of each to disk, and compare the two rankings.
analyzer = MorphAnalyzer()
corpus = pd.read_csv("court-V-N.csv", header=None)
measures = BigramAssocMeasures()


def tagger(token):
    """Pair a token with the POS tag of its first pymorphy parse."""
    return (token, analyzer.parse(token.lower().strip())[0].tag.POS)


tagged_corpus = corpus.applymap(tagger).drop(0, axis=1)
with open("gold_standard.txt", "r") as infile:
    standard = [tuple(x.split()) for x in infile.readlines()]
wfd = FreqDist(tagged_corpus.values.flatten())
bfd = FreqDist(bigrams(tagged_corpus.values.flatten()))
finder_1 = BigramCollocationFinder(wfd, bfd)


def keep_infinitives(scored):
    """From scored ngrams keep bigrams whose first word is tagged INFN,
    stripping scores and POS tags.

    (Was a lambda bound to the name ``filter``, which shadowed the builtin.)
    """
    return [tuple(z[0] for z in y[0]) for y in scored if y[0][0][1] == "INFN"]


scored_pmi = keep_infinitives(finder_1.score_ngrams(measures.pmi))
scored_student = keep_infinitives(finder_1.score_ngrams(measures.student_t))
# score_ngrams yields highest-scoring pairs first, so slicing gives the top 10.
pmi_top = scored_pmi[:10]
student_top = scored_student[:10]
for name, top in [("pmi_top10.txt", pmi_top), ("student_top10.txt", student_top)]:
    with open(name, "w") as outfile:
        joined = [" ".join(x) + "\n" for x in top]
        outfile.writelines(joined)
print(spearman.spearman_correlation(pmi_top, student_top))
print("Done")
# Print the 100 strongest collocations and a concordance for 'Imperium'
# from the corpus-wide Text object (Python 2 script fragment; relies on
# overall_text / master_tokens / nltk / sys defined earlier in the file).
print "---------- 100 collocations -----------"
overall_text.collocations(num=100)
print "---------- ---------------- -----------"
# NOTE(review): Text.concordance prints its own output; the value being
# printed here is presumably None — confirm and drop the outer `print`.
print overall_text.concordance('Imperium')
# Case-insensitive concordance index over the raw token list.
index = nltk.text.ConcordanceIndex(master_tokens, key=lambda s:s.lower())
# NOTE(review): this exit makes everything below unreachable dead code —
# looks like a deliberate debugging cutoff; confirm before removing.
sys.exit(0)
from nltk import bigrams
from nltk import collocations
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder
# http://nltk.googlecode.com/svn/trunk/doc/howto/collocations.html
# http://stackoverflow.com/questions/9151326/python-nltk-find-collocations-without-dot-separated-words
bigram_measures = collocations.BigramAssocMeasures()
# Unigram and bigram frequency distributions over the full token stream.
word_fd = FreqDist(master_tokens)
bigram_fd = FreqDist(bigrams(master_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
#finder.apply_word_filter(lambda w: w in ('.', ','))
# only when collocation occurs 3+ times
finder.apply_freq_filter(3)
# Rank bigrams by raw frequency.
scored = finder.score_ngrams(bigram_measures.raw_freq)
#print sorted(bigram for bigram, score in scored)
print "========================================="
print sorted(finder.nbest(bigram_measures.raw_freq,200),reverse=True)