def demo(scorer=None, compare_scorer=None):
    """Find bigram collocations in the files of the WebText corpus.

    For each WebText file, print the 15 best-scoring bigrams under
    *scorer*, then the Spearman rank correlation between *scorer* and
    *compare_scorer* over all scored bigrams.

    :param scorer: bigram association measure used for ranking;
        defaults to ``BigramAssocMeasures.likelihood_ratio``.
    :param compare_scorer: measure correlated against *scorer*;
        defaults to ``BigramAssocMeasures.raw_freq``.
    """
    # Imports are local so the demo only pulls in nltk when actually run.
    from nltk.metrics import (
        BigramAssocMeasures,
        spearman_correlation,
        ranks_from_scores,
    )

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk import corpus

    ignored_words = corpus.stopwords.words('english')
    # Drop very short words and English stopwords from candidate bigrams.
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    # `fileids()` is the modern corpus-reader API (matches its use
    # elsewhere in this file); avoids shadowing the builtin `file`.
    for fileid in corpus.webtext.fileids():
        words = [word.lower() for word in corpus.webtext.words(fileid)]
        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)  # ignore bigrams occurring fewer than 3 times
        cf.apply_word_filter(word_filter)
        print(fileid)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(
                ranks_from_scores(cf.score_ngrams(scorer)),
                ranks_from_scores(cf.score_ngrams(compare_scorer)))))
# NOTE(review): this is a byte-for-byte duplicate of the `demo` defined just
# above; at import time this second definition silently replaces the first.
# Consider deleting one of the two.
def demo(scorer=None, compare_scorer=None):
    """Find bigram collocations in the files of the WebText corpus.

    Prints, per WebText file, the top 15 bigrams ranked by *scorer* and
    the Spearman correlation of *scorer* against *compare_scorer*.

    :param scorer: ranking measure (default: likelihood ratio).
    :param compare_scorer: comparison measure (default: raw frequency).
    """
    from nltk.metrics import (
        BigramAssocMeasures,
        spearman_correlation,
        ranks_from_scores,
    )

    scorer = scorer or BigramAssocMeasures.likelihood_ratio
    compare_scorer = compare_scorer or BigramAssocMeasures.raw_freq

    from nltk import corpus

    ignored_words = corpus.stopwords.words('english')
    # Filter out short words and English stopwords.
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for fileid in corpus.webtext.fileids():  # modern API; don't shadow `file`
        words = [word.lower() for word in corpus.webtext.words(fileid)]
        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)  # require at least 3 occurrences
        cf.apply_word_filter(word_filter)
        print(fileid)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(
                ranks_from_scores(cf.score_ngrams(scorer)),
                ranks_from_scores(cf.score_ngrams(compare_scorer)))))
# Top-level Python 3 variant of the collocation demo: ranks bigrams per
# WebText file by likelihood ratio and prints the Spearman correlation
# against raw frequency.
# NOTE(review): `BigramAssocMeasures`, `stopwords`, `webtext`,
# `BigramCollocationFinder`, `spearman_correlation` and `ranks_from_scores`
# are assumed to be imported earlier in the file — not visible in this chunk.
# NOTE(review): the ''' below opens a string literal whose closing quotes are
# not visible here; everything after it appears to be a disabled scratchpad
# (Python 2 `print` statements, commented-out experiments). Confirm the file
# terminates that string, otherwise the module will not parse.
scorer = BigramAssocMeasures.likelihood_ratio compare_scorer = BigramAssocMeasures.raw_freq ignored_words = stopwords.words('english') word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words for file in webtext.fileids(): words = [word.lower() for word in webtext.words(file)] cf = BigramCollocationFinder.from_words(words) cf.apply_freq_filter(3) cf.apply_word_filter(word_filter) print(file) print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)]) print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, spearman_correlation( ranks_from_scores(cf.score_ngrams(scorer)), ranks_from_scores(cf.score_ngrams(compare_scorer))))) ''' #from nltk.util import bigrams bigram_measures=BigramAssocMeasures() trigram_measure=BigramAssocMeasures() finder=BigramCollocationFinder.from_words("grail.txt") print finder.nbest() #filter_stopwords = lambda x:len(x) < 3 or x in stopwords.words("english") #print filter_stopwords #words= (w.lower() for w in webtext.words("grail.txt")) #bcf = BigramCollocationFinder.from_words(words) #bcf.apply_word_filter(filter_stopwords) #print bcf.nbest(BigramAssocMeasures.likelihood_ratio,10) #print list(bigrams(['more','is','said','than','done']))