def get_features(in_file, idf_enabled=False):
    """Build the feature table for the pairs found in *in_file*.

    Loads the lexical and syntactic views of ``in_file``, runs each scorer
    over them and collects the resulting values per key, in this order:
    word match, simple negation, tree edit distance, number match,
    1-, 2- and 3-gram ``bleu`` over lemmas with synonyms, and 2-gram
    ``bleu`` over lemmas without synonyms.  The reference attributes
    returned by ``get_attributes_pair`` ("task and entailment" according
    to the original comment) are appended to every row last.

    NOTE(review): another ``get_features`` definition appears later in this
    source; if both live in the same module the later one replaces this
    one at import time -- confirm which is meant to be used.

    Args:
        in_file: Input handed unchanged to ``load_xml.get_pairs``,
            ``create_tree.generate_syntax_tree`` and ``get_attributes_pair``
            (presumably a path to the XML file -- confirm against callers).
        idf_enabled: When true, ``generate_idf_score`` is run on the lexical
            data first and the flag is forwarded to ``lexical.word_match``.

    Returns:
        A ``defaultdict(list)`` mapping every key produced by the scorers to
        its feature values followed by the reference attributes for
        ``str(key)``.
    """
    print("loading xml...")
    lexical_tree = load_xml.get_pairs(in_file)
    syntax_tree = create_tree.generate_syntax_tree(in_file)
    print("done loading")

    if idf_enabled:
        generate_idf_score(lexical_tree)

    print("parsing reference")
    ref = get_attributes_pair(in_file)

    print("extracting features")
    features = defaultdict(list)

    def collect(pairs):
        # Every scorer yields (key, value) pairs; file each value under its
        # key so all rows end up with their columns in the same order.
        for key, value in pairs:
            features[key].append(value)

    # word matching
    collect(lexical.word_match(lexical_tree, idf_enabled=idf_enabled))

    # simple negation
    collect(lexical.get_simple_negations(lexical_tree))

    # tree edit distance
    collect(syntactic.tree_edit_distance(syntax_tree))

    # number match
    collect(lexical.number_match(lexical_tree))

    # 1-, 2- and 3-gram with synonyms of lemmas.
    # NOTE(review): idf_enabled is hard-coded to True for the bleu features,
    # independent of this function's idf_enabled argument -- kept as found.
    for n in (1, 2, 3):
        collect(bleu(lexical_tree, n=n, idf_enabled=True, lemma=True,
                     synonyms=True))

    # 2-gram without synonyms (the unused local ``memory = {}`` that used to
    # sit here had no effect and was dropped).
    collect(bleu(lexical_tree, n=2, idf_enabled=True, lemma=True,
                 synonyms=False))

    # appending task and entailment
    # Only the row lists are mutated, never the key set, so iterating the
    # mapping directly is safe; items() works on both Python 2 and 3.
    for key, row in features.items():
        row.extend(ref[str(key)])

    return features
def get_features(in_file, idf_enabled=False):
    """Collect lexical and syntactic features for every pair in *in_file*.

    The scorers run one after another and each contributes one value per
    key, in this order: word match, lemma match, lemma bigram ``bleu``,
    lemma match (again, see note below), simple negation, tree edit
    distance.  Afterwards the reference attributes obtained from
    ``get_attributes_pair`` are appended to each row.

    Args:
        in_file: Input forwarded to ``load_xml.get_pairs``,
            ``create_tree.generate_syntax_tree`` and ``get_attributes_pair``.
        idf_enabled: If true, ``generate_idf_score`` is applied to the
            lexical data and the flag is passed on to the word-match and
            bigram scorers.

    Returns:
        A ``defaultdict(list)`` keyed by whatever keys the scorers yield;
        each value is the list of feature values followed by the reference
        attributes looked up under ``str(key)``.
    """
    print("loading xml...")
    pairs = load_xml.get_pairs(in_file)
    parsed = create_tree.generate_syntax_tree(in_file)
    print("done loading")

    if idf_enabled:
        generate_idf_score(pairs)

    print("parsing reference")
    reference = get_attributes_pair(in_file)

    print("extracting features")
    table = defaultdict(list)

    # Scorers are wrapped in thunks so each one is evaluated only when its
    # turn comes, keeping the original compute-then-append ordering.
    scorers = (
        # word matching
        lambda: lexical.word_match(pairs, idf_enabled=idf_enabled),
        # lemma matching
        lambda: lexical.lemma_match(pairs),
        # bigram matching (lemma)
        lambda: lexical.bleu(pairs, n=2, return_only_n=2,
                             idf_enabled=idf_enabled, lemma=True),
        # lemma + POS matching
        # NOTE(review): this slot calls lemma_match a second time, so the
        # column duplicates the lemma-matching one; a POS-aware scorer was
        # presumably intended -- confirm before changing.
        lambda: lexical.lemma_match(pairs),
        # simple negation
        lambda: lexical.get_simple_negations(pairs),
        # tree edit distance
        lambda: syntactic.tree_edit_distance(parsed),
    )
    for run_scorer in scorers:
        for key, value in run_scorer():
            table[key].append(value)

    # appending task and entailment
    for key in list(table):
        table[key].extend(reference[str(key)])

    return table
def main(tree, output, method, threshold, find_best, n=4, idf_enabled=False):
    """Run one method end to end: load data, score, classify, evaluate.

    Args:
        tree: Input handed to the loaders (``load_xml.get_pairs`` /
            ``create_tree.generate_syntax_tree``) and later to ``evaluate``;
            presumably the path of the XML file -- confirm against callers.
        output: Destination passed to ``write_features``, ``write``,
            ``evaluate`` and the selected method.
        method: Name used both to pick the loading strategy and as the key
            into ``METHODS``.
        threshold: Forwarded to ``classify_results``.
        find_best: If true, run ``find_best_threshold`` instead of a single
            classification pass.
        n: Forwarded to the selected method / threshold search.
        idf_enabled: If true, ``generate_idf_score`` is run on the lexical
            pairs before scoring, and the flag is forwarded downstream.

    Returns:
        None.  Results are written to ``output`` and the accuracy is printed.
    """
    lexical_methods = ("word", "lemma", "bleu")
    syntactic_methods = ("print_ted", "ted")
    knn_methods = ("knn", "knn-xv")

    # ``pair`` is (loaded data, original input).
    # NOTE(review): for a method matching none of the branches below, ``pair``
    # stays the raw argument and is indexed as-is further down, exactly as
    # the original code did.
    pair = tree

    # load xml and idf
    if method in lexical_methods:
        print("Loading xmlfile")
        pair = (load_xml.get_pairs(tree), tree)
        print("done.")
        if idf_enabled:
            generate_idf_score(pair[0])
    elif method in syntactic_methods:
        print("Loading xmlfile")
        pair = (create_tree.generate_syntax_tree(tree), tree)
        print("done.")
        if idf_enabled:
            # IDF scores are computed from the lexical pairs, so they are
            # loaded separately from the original input here.
            generate_idf_score(load_xml.get_pairs(pair[1]))
    elif method == "features":
        # Feature export only: write the table and stop.
        features = get_features(tree, idf_enabled)
        write_features(output, features)
        return
    elif method in knn_methods:
        pair = (tree, tree)

    # run methods
    if find_best:
        find_best_threshold(pair[0], METHODS[method], pair[1], output,
                            n=n, idf_enabled=idf_enabled)
        return

    if method in knn_methods:
        # The kNN methods read their input from a features file on disk.
        features = get_features(pair[0], idf_enabled=idf_enabled)
        write_features("features.tab", features)
        results = METHODS[method](None, outfile="features.tab")
    else:
        results = METHODS[method](pair[0], n=n, idf_enabled=idf_enabled,
                                  output=output)

    if method == "print_ted":
        # Nothing is classified for print_ted; stop after the method ran.
        return

    classification = classify_results(results, threshold)
    print("writing output")
    write(classification, output)
    print("Accuracy = %.4f" % evaluate(pair[1], output))