コード例 #1
0
def findtoptrigrams(trigrams, word_fd, bigram_fd, settings):
    """Return the best-scoring trigrams with their raw counts, plus a warning.

    The trigrams are ranked with the association measure named in
    ``settings['measure']`` and the ``settings['nkey']`` best ones are kept.
    If the requested measure raises, ranking falls back to raw frequency and
    the returned warning says so. The selected trigrams are then reported
    with their raw counts rather than their scores.

    Args:
        trigrams: Iterable of 3-item word tuples. It is materialised once
            inside the function, so a one-shot generator is safe to pass.
        word_fd: Unigram frequency distribution handed to the finder.
        bigram_fd: Bigram frequency distribution handed to the finder.
        settings: Mapping providing ``'nkey'`` (how many trigrams to keep)
            and ``'measure'`` (``"LR"``, ``"PMI"``, ``"CHISQ"`` or
            ``"STUDT"``; any other value selects raw frequency).

    Returns:
        A ``(top_trigrams, warning)`` tuple. ``top_trigrams`` is a list of
        ``(trigram, count)`` pairs sorted by descending count, limited to
        trigrams seen more than once whose first two words differ.
        ``warning`` is ``""`` unless the requested measure failed.
    """
    nkey = settings['nkey']
    measure = settings['measure']

    # The input is traversed twice below (trigram counts and wildcard
    # counts); a generator would be exhausted after the first pass and leave
    # the wildcard distribution empty.
    if not isinstance(trigrams, (list, tuple)):
        trigrams = list(trigrams)

    trigram_measures = TrigramAssocMeasures()
    trigram_fd = FreqDist(trigrams)
    # "Wildcard" pairs: first and third word with the middle one ignored.
    wild_fd = FreqDist((t[0], t[2]) for t in trigrams)
    finder = TrigramCollocationFinder(word_fd, bigram_fd, wild_fd, trigram_fd)

    # Setting value -> name of the scoring method on the measures object.
    scorer_names = {
        "LR": "likelihood_ratio",
        "PMI": "pmi",
        "CHISQ": "chi_sq",
        "STUDT": "student_t",
    }

    warning = ""
    top_trigrams = None
    scorer_name = scorer_names.get(measure)
    if scorer_name is not None:
        try:
            top_trigrams = finder.nbest(
                getattr(trigram_measures, scorer_name), nkey)
        except Exception:
            # Deliberate best-effort: any scoring failure degrades to raw
            # frequency instead of aborting. Unlike a bare ``except`` this
            # lets KeyboardInterrupt/SystemExit propagate.
            warning = ("Problem with %s measure. "
                       "Default to simple frequency (RAW setting)" % measure)
            print(warning)

    if top_trigrams is None:
        top_trigrams = finder.nbest(trigram_measures.raw_freq, nkey)

    # Trigrams are scored with LR or a similar measure, but the end user is
    # better served by raw counts (the measure is explained in a tool tip).
    selected = set(top_trigrams)  # O(1) membership instead of a list scan
    counted = [(tg, count) for tg, count in finder.ngram_fd.items()
               if tg in selected]
    counted.sort(key=lambda pair: -pair[1])

    # Drop hapaxes and trigrams that start with a repeated word.
    top_trigrams = [(tg, count) for tg, count in counted
                    if count > 1 and tg[0] != tg[1]]
    return top_trigrams, warning
コード例 #2
0
 def trigram_finder(self):
     """Build and return a TrigramCollocationFinder.

     The finder is constructed from this object's ``word_fd`` and
     ``trigram_fd`` attributes.
     """
     # NOTE(review): NLTK's TrigramCollocationFinder constructor takes four
     # distributions (word, bigram, wildcard, trigram). If that is the class
     # in scope here, this two-argument call raises TypeError -- confirm
     # which TrigramCollocationFinder this file imports.
     finder = TrigramCollocationFinder(self.word_fd, self.trigram_fd)
     return finder
コード例 #3
0
                        default='models/trigram.pkl',
                        help='Path to trigrams.')
    args = parser.parse_args()
    print(args)

    unigrams = load_ngrams(args.unigram, 1)
    bigrams = load_ngrams(args.bigram, 2)

    if args.n == 3:
        trigrams = load_ngrams(args.trigram, 3)
        wildcards = defaultdict(int)
        for key in trigrams.keys():
            wildcards[(key[0], key[2])] += trigrams.get(key)
        wildcards = FreqDist(wildcards)
        ngram_measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder(unigrams, bigrams, wildcards,
                                          trigrams)
    else:
        ngram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder(unigrams, bigrams)

    if args.freq_filter > 0:
        finder.apply_freq_filter(args.freq_filter)

    if args.word_filter:
        finder.apply_ngram_filter(lambda *words: not any(
            re.match(args.word_filter, word) for word in words))

    scored = finder.score_ngrams(eval("ngram_measures." + args.measure))

    if args.top:
        scored = scored[:args.top]