Ejemplo n.º 1
0
    candidate_scores = [(score, remove_str_postags(cand)) for score, cand in candidate_scores]

    stoplist = binom_stoplist(0.2)  # 0.5 buen valor
    # stoplist = log_likelihood_stoplist(5)
    candidate_scores = stoplist_filter(candidate_scores, stoplist)

    return candidate_scores


if __name__ == "__main__":
    terms = load_terms()

    domain_sents = load_analysis()
    train_terms, test_corpus, test_terms = split_train_test(domain_sents, terms)

    candidates = main(train_terms, test_corpus)
    candidates = [word for word, score in candidates]

    print "[C]", len(candidates)
    print "[T]", len(set(candidates).intersection(set(test_terms)))
    print "======"
    precision, recall = evaluation.precision_recall(test_terms, candidates)
    print "[P]", round(precision, 3)
    print "[R]", round(recall, 3)
    print "======"
    precision_by_segment = evaluation.precision_by_segments(test_terms, candidates, 4)
    for i, seg_precision in enumerate(precision_by_segment):
        print "[%s] %s" % (i + 1, round(seg_precision, 3))
    recall_list, precision_list = evaluation.precision_at_recall_values(test_terms, candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)
Ejemplo n.º 2
0
        for c in sorted_candidates:
            newc = '%.5f\t%s' % (c[1], c[0])
            new_cands.append(newc)
        f.write('\n'.join(new_cands).encode('utf-8'))

    sorted_candidates = [cand for cand, score in sorted_candidates]
    print '\nC-VALUE'
    print '========'
    print '[C]', len(sorted_candidates)
    print '[T]', len(set(sorted_candidates).intersection(set(terms)))
    print '========'
    precision, recall = evaluation.precision_recall(terms, sorted_candidates)
    print '[P]', round(precision, 3)
    print '[R]', round(recall, 3)
    print '========'
    precision_by_segment = evaluation.precision_by_segments(
        terms, sorted_candidates, 4)
    for i, seg_precision in enumerate(precision_by_segment):
        print '[%s] %s' % (i, round(seg_precision, 3))

    recall_list, precision_list = evaluation.precision_at_recall_values(
        terms, sorted_candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)

    cvalue_top = [c for c in sorted_candidates[:int(len(candidates) * 0.2)]]
    context_words = make_contextword_weight_dict(cvalue_top, domain_corpus,
                                                 ['NC', 'AQ', 'VM'], 5)
    ncvalue_output = calc_ncvalue(candidates, domain_corpus, context_words,
                                  ['NC', 'AQ', 'VM'], 5)
    sorted_ncvalue = [(cand, score) for cand, score in sorted(
        ncvalue_output.items(), key=lambda x: x[1], reverse=True)]
Ejemplo n.º 3
0
def main(loglike_threshold):

    term_corp = load_terms()

    anal_corp = load_analysis()
    random.shuffle(anal_corp)

    train_corp = anal_corp[:int(0.5*len(anal_corp))]
    train_terms = []
    for sent in train_corp:
        for term in term_corp:
            if term.lower() in ' '.join(sent).lower():
                train_terms.append(term)
    train_terms = [t for t in train_terms if t]

    test_corp = anal_corp[int(0.5*len(anal_corp)):]
    test_terms = []
    for sent in test_corp:
        for term in term_corp:
            if term.lower() in ' '.join(sent).lower():
                test_terms.append(term)
    test_terms = [remove_str_postags(t) for t in test_terms if t]

    term_model = make_term_model(train_terms)

    gen_corp = load_general()
    gen_model = make_general_model(gen_corp)

    candidate_scores = []

    pos_patterns = term_model['pos_freq'].keys()
    for pos_seq in pos_patterns:
        syn_coef = calc_syntactic_coef(pos_seq, term_model)
        chunks = chunk_sents(pos_seq, test_corp)
        chunk_freq_dict = defaultdict(int)
        for chnk in chunks:
            chunk_freq_dict[chnk] += 1

        accepted_phrases = chunk_freq_dict

        for candidate in accepted_phrases.keys():
            cand_freq = chunk_freq_dict[candidate]
            lex_coef = calc_lexical_coef(candidate, term_model, gen_model)
            morph_coef = calc_morph_coef(candidate, term_model, gen_model)
            candidate_coef = cand_freq * syn_coef * lex_coef * morph_coef
            candidate_scores.append((candidate_coef, candidate),)
    candidate_scores = sorted(candidate_scores, reverse=True)

    candidates = [cand for score, cand in candidate_scores]
    stripped_candidates = []
    for cand in candidates:
        new_cand = remove_str_postags(cand)
        stripped_candidates.append(new_cand)
    stoplist = loglike_stoplist(loglike_threshold)
    accepted_candidates, rejected_candidates = \
        filter_out(stripped_candidates, stoplist)

    precision, recall = \
        evaluation.precision_recall(test_terms, accepted_candidates)
    print '\nNAZAR'
    print '=========='
    print '[P]', round(precision, 3)
    print '[R]', round(recall, 3)
    print '=========='
    precision_by_segment = evaluation.precision_by_segments(
        test_terms, accepted_candidates, 4)
    for i, seg_precision in enumerate(precision_by_segment):
        print '[%s] %s' % (i, round(seg_precision, 3))
    recall_list, precision_list = evaluation.precision_at_recall_values(
        test_terms, accepted_candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)

    return candidates
Ejemplo n.º 4
0
        for c in sorted_candidates:
            newc = '%.5f\t%s' % (c[1], c[0])
            new_cands.append(newc)
        f.write('\n'.join(new_cands).encode('utf-8'))

    sorted_candidates = [cand for cand, score in sorted_candidates]
    print '\nC-VALUE'
    print '========'
    print '[C]', len(sorted_candidates)
    print '[T]', len(set(sorted_candidates).intersection(set(terms)))
    print '========'
    precision, recall = evaluation.precision_recall(terms, sorted_candidates)
    print '[P]', round(precision, 3)
    print '[R]', round(recall, 3)
    print '========'
    precision_by_segment = evaluation.precision_by_segments(
        terms, sorted_candidates, 4)
    for i, seg_precision in enumerate(precision_by_segment):
        print '[%s] %s' % (i, round(seg_precision, 3))

    recall_list, precision_list = evaluation.precision_at_recall_values(
        terms, sorted_candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)

    cvalue_top = [c for c in sorted_candidates[:int(len(candidates) * 0.2)]]
    context_words = make_contextword_weight_dict(
        cvalue_top, domain_corpus, ['NC', 'AQ', 'VM'], 5)
    ncvalue_output = calc_ncvalue(
        candidates, domain_corpus, context_words, ['NC', 'AQ', 'VM'], 5)
    sorted_ncvalue = [(cand, score) for cand, score in sorted(
        ncvalue_output.items(), key=lambda x: x[1], reverse=True)]
Ejemplo n.º 5
0
    #stoplist = log_likelihood_stoplist(5)
    candidate_scores = stoplist_filter(candidate_scores, stoplist)

    return candidate_scores


if __name__ == '__main__':
    terms = load_terms()

    domain_sents = load_analysis()
    train_terms, test_corpus, test_terms = \
        split_train_test(domain_sents, terms)

    candidates = main(train_terms, test_corpus)
    candidates = [word for word, score in candidates]

    print '[C]', len(candidates)
    print '[T]', len(set(candidates).intersection(set(test_terms)))
    print '======'
    precision, recall = evaluation.precision_recall(test_terms, candidates)
    print '[P]', round(precision, 3)
    print '[R]', round(recall, 3)
    print '======'
    precision_by_segment = evaluation.precision_by_segments(
        test_terms, candidates, 4)
    for i, seg_precision in enumerate(precision_by_segment):
        print '[%s] %s' % (i + 1, round(seg_precision, 3))
    recall_list, precision_list = evaluation.precision_at_recall_values(
        test_terms, candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)
Ejemplo n.º 6
0
def main(pattern, min_freq, loglike_threshold, min_cvalue, use_ncval=False,
         cval_top=0.2):
    # STEP 1.
    # POS-tagged corpus.
    domain = load_domain()

    # STEP 2.
    # Extract matching patterns above frequency threshold.
    phrase_freq = chunk_sents(domain, pattern, min_freq)
    # Remove POS tags from candidates.
    phrase_freq = remove_dict_postags(phrase_freq)
    # Remove candidates with words in stoplist.
    stoplist = loglike_stoplist(loglike_threshold)
    accepted_phrases, discarded_phrases = filter_out(phrase_freq, stoplist)
    # Order candidates first by number of words, then by frequency.
    sorted_phrases = build_sorted_phrases(accepted_phrases)

    # STEP 3.
    # Calculate C-value, discard if C-value below threshold.
    cvalue_output = calc_cvalue(sorted_phrases, min_cvalue)

    cvalue_candidates = [c[0] for c in cvalue_output]
    known_terms = load_terms()

    if use_ncval is True:
        cvalue_top = [c for c in
                      cvalue_candidates[0:int(len(cvalue_candidates)
                                              * cval_top)]]
        context_word_weights = make_contextword_weight_dict(
            cvalue_top, domain, ['NC', 'AQ', 'VM'], 5)
        ncvalue_output = calc_ncvalue(
            cvalue_output, domain, context_word_weights, ['NC', 'AQ', 'VM'], 5)

        ncvalue_candidates = [c[0] for c in ncvalue_output]
        precision, recall = \
            evaluation.precision_recall(known_terms, ncvalue_candidates)
        print '\nNC-VALUE'
        print '=========='
        print 'PRECISION:', round(precision, 3)
        print 'RECALL:', round(recall, 3)
        print '=========='
        precision_by_segment = evaluation.precision_by_segments(
            known_terms, ncvalue_candidates, 4)
        for i, seg_precision in enumerate(precision_by_segment):
            print '[%s] %s' % (i, round(seg_precision, 3))
        recall_list, precision_list = evaluation.precision_at_recall_values(
            known_terms, cvalue_candidates)
        evaluation.plot_precision_at_recall_values(recall_list, precision_list)
        results = ncvalue_candidates

    else:
        precision, recall = \
            evaluation.precision_recall(known_terms, cvalue_candidates)
        print '\nC-VALUE'
        print '=========='
        print '[P]', round(precision, 3)
        print '[R]', round(recall, 3)
        print '=========='
        precision_by_segment = evaluation.precision_by_segments(
            known_terms, cvalue_candidates, 4)
        for i, seg_precision in enumerate(precision_by_segment):
            print '[%s] %s' % (i, round(seg_precision, 3))
        recall_list, precision_list = evaluation.precision_at_recall_values(
            known_terms, cvalue_candidates)
        evaluation.plot_precision_at_recall_values(recall_list, precision_list)
        results = cvalue_candidates

    return results