Example #1
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.collocations import BigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk import corpus

    ignored_words = corpus.stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in corpus.webtext.fileids():
        words = [word.lower() for word in corpus.webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)            # drop bigrams seen fewer than 3 times
        cf.apply_word_filter(word_filter)  # drop short words and stopwords

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(
                ranks_from_scores(cf.score_ngrams(scorer)),
                ranks_from_scores(cf.score_ngrams(compare_scorer)))))
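The function accepts any association measure from BigramAssocMeasures, so a different scorer can be swapped in for the likelihood-ratio default. A minimal sketch of how it might be called, assuming the webtext and stopwords corpora have been fetched with nltk.download():

from nltk.metrics import BigramAssocMeasures

# Rank bigrams by pointwise mutual information instead of likelihood ratio;
# the comparison measure stays at the raw-frequency default.
demo(scorer=BigramAssocMeasures.pmi)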
Example #2
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.collocations import BigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk import corpus

    ignored_words = corpus.stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in corpus.webtext.fileids():
        words = [word.lower() for word in corpus.webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)            # drop bigrams seen fewer than 3 times
        cf.apply_word_filter(word_filter)  # drop short words and stopwords

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (
            compare_scorer.__name__,
            spearman_correlation(
                ranks_from_scores(cf.score_ngrams(scorer)),
                ranks_from_scores(cf.score_ngrams(compare_scorer)))))
    def _scoresToRanks(self, rankdict):
        # Sort the (item, score) pairs by descending score, then convert them
        # to (item, rank) pairs; requires `from operator import itemgetter`.
        scored_items = sorted(rankdict.items(), key=itemgetter(1), reverse=True)
        ranked_items = list(ranks_from_scores(scored_items))
        return ranked_items
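For reference, ranks_from_scores (importable from nltk.metrics, as in the demo above) turns a sequence of (item, score) pairs, already sorted by decreasing score, into (item, rank) pairs, letting items with (near-)equal scores share a rank. A small sketch with made-up scores:

from nltk.metrics import ranks_from_scores

scored = [('world cup', 9.1), ('new york', 7.4), ('hot dog', 7.4), ('the of', 1.2)]
for item, rank in ranks_from_scores(scored):
    # 'new york' and 'hot dog' have equal scores, so they share a rank
    print(item, rank)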
# Module-level version of the same demo loop (imports added so it runs standalone).
from nltk.corpus import stopwords, webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

scorer = BigramAssocMeasures.likelihood_ratio
compare_scorer = BigramAssocMeasures.raw_freq
ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

for file in webtext.fileids():
    words = [word.lower() for word in webtext.words(file)]
    cf = BigramCollocationFinder.from_words(words)
    cf.apply_freq_filter(3)
    cf.apply_word_filter(word_filter)
    print(file)
    print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
    print('\t Correlation to %s: %0.4f' % (
        compare_scorer.__name__,
        spearman_correlation(
            ranks_from_scores(cf.score_ngrams(scorer)),
            ranks_from_scores(cf.score_ngrams(compare_scorer)))))
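The correlation line above is just Spearman's rank correlation between the two measures' rankings of the same bigrams. A toy sketch with hand-made scores (values illustrative only):

from nltk.metrics import spearman_correlation, ranks_from_scores

by_lr = [('world cup', 120.0), ('new york', 88.0), ('hot dog', 15.0)]
by_freq = [('new york', 30), ('world cup', 25), ('hot dog', 4)]

# Closer to 1.0 means the two measures order the bigrams more similarly.
print(spearman_correlation(ranks_from_scores(by_lr),
                           ranks_from_scores(by_freq)))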
        
'''
Scratch/experimental collocation code, left disabled:

#from nltk.util import bigrams
bigram_measures = BigramAssocMeasures()
trigram_measure = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words("grail.txt")
print finder.nbest()
#filter_stopwords = lambda x: len(x) < 3 or x in stopwords.words("english")
#print filter_stopwords
#words = (w.lower() for w in webtext.words("grail.txt"))
#bcf = BigramCollocationFinder.from_words(words)
#bcf.apply_word_filter(filter_stopwords)
#print bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)
#print list(bigrams(['more','is','said','than','done']))
'''
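A runnable sketch of what the disabled scratch above seems to be aiming at: likelihood-ratio collocations in webtext's 'grail.txt' with short words and stopwords filtered out, assuming the webtext and stopwords corpora are downloaded.

from nltk.corpus import webtext, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.util import bigrams

stops = set(stopwords.words('english'))
filter_stopwords = lambda w: len(w) < 3 or w.lower() in stops

# Pass the corpus *words* to from_words(), not the filename.
words = (w.lower() for w in webtext.words('grail.txt'))
bcf = BigramCollocationFinder.from_words(words)
bcf.apply_word_filter(filter_stopwords)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10))

# bigrams() itself just pairs up adjacent tokens:
print(list(bigrams(['more', 'is', 'said', 'than', 'done'])))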