def find_best_threshold(tree, method, input_file, output_file, n=4, idf_enabled=False):
    """Sweep the decision threshold from 0.01 to 1.00 and report the best one.

    The scores are computed once with ``method``.  Every candidate threshold
    is then applied to those scores with ``classify_results`` and scored with
    ``evaluate`` against the reference labels parsed from ``input_file``.
    Progress and the final result are printed to stdout.

    Args:
        tree: First positional argument for ``method``.  Not used when
            ``method`` is one of the kNN classifiers, which read a feature
            file instead.
        method: Scoring callable.  ``knn_classifier`` and
            ``knn_classifier_xv`` are called as ``method(None, outfile=...)``;
            anything else as
            ``method(tree, output=..., n=..., idf_enabled=...)``.
        input_file: Passed to ``get_features``, ``parse_reference`` and
            ``evaluate``.
        output_file: Forwarded to ``method`` (as ``output``) and to
            ``evaluate``.
        n: Forwarded to ``method`` as ``n``.
        idf_enabled: Forwarded to ``get_features`` / ``method``.

    Returns:
        ``(best_threshold, best_accuracy)``.  On ties the highest threshold
        wins because the comparison is ``>=``.
    """
    if method in [knn_classifier, knn_classifier_xv]:
        features = get_features(input_file, idf_enabled)
        write_features("tmp.tab", features)
        # Call the classifier that was actually requested; hard-coding
        # knn_classifier here silently ignored the cross-validated variant.
        # NOTE(review): assumes knn_classifier_xv accepts the same
        # (None, outfile=...) call as knn_classifier -- confirm.
        results = method(None, outfile="tmp.tab")
    else:
        results = method(tree, output=output_file, n=n, idf_enabled=idf_enabled)

    # Parse the reference labels once instead of once per threshold.
    reference = parse_reference(input_file)

    best_threshold = 0.01
    best_accuracy = 0
    # Step with integers and divide, rather than repeatedly adding 0.01:
    # accumulated float error could otherwise skip or repeat a boundary value.
    for step in range(1, 101):
        threshold = step / 100.0
        classification = classify_results(results, threshold)
        # Predictions are handed over in memory, so nothing has to be written
        # to output_file for each candidate threshold.
        acc = evaluate(input_file, output_file,
                       pred_id2label=classification, ref_id2label=reference)
        print("th: %s acc: %s" % (threshold, acc))
        if acc >= best_accuracy:
            best_threshold = threshold
            best_accuracy = acc

    print("best threshold was %.2f with %.4f accuracy" % (best_threshold, best_accuracy))
    return best_threshold, best_accuracy
def _read_scores(path):
    """Parse ``path`` with ``read_file`` and close the handle even on error."""
    with open(path) as handle:
        return read_file(handle)


def predict():
    """Combine four match scores into YES/NO predictions in RTE output format.

    Reads ``bleuresults.txt``, ``wordmatches.txt``, ``lemma_matches.txt`` and
    ``pos-tag_matches.txt`` with ``read_file`` and prints the four lengths.
    It then writes ``finalresult.txt``: a ``ranked: no`` header followed by
    one ``<id> YES`` / ``<id> NO`` line per entry, with ids starting at 1.

    For every entry each metric votes with its module-level ``*_result``
    weight: towards YES when the score is strictly above the metric's
    ``*_threshold``, towards NO when strictly below, and not at all when
    equal.  The entry is YES only if the YES total is strictly greater.

    Finally the file is scored with
    ``eval_rte.evaluate("RTE2_dev.xml", "finalresult.txt")`` and the result is
    printed with four decimals.

    Raises:
        IndexError: if any of the other score lists is shorter than the BLEU
            list, which drives the iteration.
    """
    bleu = _read_scores("bleuresults.txt")
    word = _read_scores("wordmatches.txt")
    lemma = _read_scores("lemma_matches.txt")
    pos = _read_scores("pos-tag_matches.txt")
    print("%d %d %d %d" % (len(bleu), len(word), len(lemma), len(pos)))

    # (scores, threshold, weight) per metric, in the original voting order.
    metrics = (
        (bleu, bleu_threshold, bleu_result),
        (word, word_threshold, word_result),
        (lemma, lemma_threshold, lemma_result),
        (pos, pos_threshold, pos_result),
    )

    # open() raises on failure, so no "could not open" branch is needed; the
    # with-block guarantees the file is flushed and closed before evaluation.
    with open("finalresult.txt", "wb") as out:
        out.write("ranked: no\n")
        for index in range(len(bleu)):
            yes = 0
            no = 0
            for scores, threshold, weight in metrics:
                score = scores[index]
                if score > threshold:
                    yes += weight
                elif score < threshold:
                    no += weight
            out.write("%d %s\n" % (index + 1, "YES" if yes > no else "NO"))

    match = eval_rte.evaluate("RTE2_dev.xml", "finalresult.txt")
    print("%.4f" % match)
def main(tree, output, method, threshold, find_best, n=4, idf_enabled=False):
    """Prepare the input for ``method``, run it, and classify or tune.

    ``tree`` arrives as whatever ``load_xml.get_pairs`` /
    ``create_tree.generate_syntax_tree`` / ``get_features`` accept and is
    rebound to a ``(prepared_input, original_input)`` pair for the scoring
    methods.

    Behaviour by ``method``:
        * ``"word"``, ``"lemma"``, ``"bleu"``: input is loaded with
          ``load_xml.get_pairs``.
        * ``"print_ted"``, ``"ted"``: input is turned into syntax trees with
          ``create_tree.generate_syntax_tree``.
        * ``"features"``: features are extracted and written to ``output``,
          then the function returns.
        * ``"knn"``, ``"knn-xv"``: the input is passed through unchanged.

    When ``find_best`` is true the work is delegated to
    ``find_best_threshold``.  Otherwise the method is run, its results are
    classified at ``threshold``, written to ``output`` and the accuracy is
    printed.  ``"print_ted"`` returns right after the method has run.
    """
    uses_knn = method in ["knn", "knn-xv"]

    # --- load xml and idf -------------------------------------------------
    if method in ["word", "lemma", "bleu"]:
        print("Loading xmlfile")
        pairs = load_xml.get_pairs(tree)
        tree = (pairs, tree)
        print("done.")
        if idf_enabled:
            generate_idf_score(pairs)
    elif method in ["print_ted", "ted"]:
        print("Loading xmlfile")
        syntax_trees = create_tree.generate_syntax_tree(tree)
        tree = (syntax_trees, tree)
        print("done.")
        if idf_enabled:
            # IDF scores are built from the sentence pairs, not the trees.
            generate_idf_score(load_xml.get_pairs(tree[1]))
    elif method == "features":
        write_features(output, get_features(tree, idf_enabled))
        return
    elif uses_knn:
        tree = (tree, tree)

    # --- run methods ------------------------------------------------------
    if find_best:
        find_best_threshold(
            tree[0],
            METHODS[method],
            tree[1],
            output,
            n=n,
            idf_enabled=idf_enabled,
        )
        return

    if uses_knn:
        # The kNN classifiers read their input from a feature file.
        knn_features = get_features(tree[0], idf_enabled=idf_enabled)
        write_features("features.tab", knn_features)
        results = METHODS[method](None, outfile="features.tab")
    else:
        results = METHODS[method](tree[0], n=n, idf_enabled=idf_enabled, output=output)
        if method == "print_ted":
            return

    classification = classify_results(results, threshold)
    print("writing output")
    write(classification, output)
    print("Accuracy = %.4f" % evaluate(tree[1], output))
def main(training_data, test_data, output_file):
    """Classify with the tweaked learner and write the result to ``output_file``.

    Features are extracted with ``get_features`` and stored in ``train.tab``
    (and ``test.tab`` when test data is given).  Results are classified with a
    fixed threshold of 0.5 and written with ``write``.

    Args:
        training_data: Input for ``get_features``; also passed to
            ``evaluate`` in the cross-validation case.
        test_data: If truthy, ``tweaked_on_testdata("train.tab", "test.tab")``
            is used and no accuracy is printed.  If falsy,
            ``tweaked("train.tab")`` (cross-validation) is used and the
            accuracy against ``training_data`` is printed.
        output_file: Destination handed to ``write``.
    """
    training_features = get_features(training_data)

    if test_data:
        test_features = get_features(test_data)
        write_f("train.tab", training_features)
        write_f("test.tab", test_features)
        results = tweaked_on_testdata("train.tab", "test.tab")
    else:
        write_f("train.tab", training_features)
        results = tweaked("train.tab")  # cross-validation
        print("classifying")

    classification = classify_results(results, 0.5)
    # Same message in both modes (the test-data branch used to print the
    # misspelled "witing output").
    print("writing output")
    write(classification, output_file)

    if not test_data:
        # Accuracy is only reported for cross-validation; predictions for the
        # test set are not compared with anything here.
        print("Accuracy = %.4f" % evaluate(training_data, output_file))
def predict(step_size, name):
    """Search for the score threshold that gives the best RTE evaluation.

    Reads one float per line from ``name``.  For each candidate threshold
    ``step_size, 2 * step_size, ...`` (continuing while the previous candidate
    is below 1) it writes ``predictions.txt`` in RTE output format: a
    ``ranked: no`` header, then ``<id> YES`` when the score is strictly above
    the threshold and ``<id> NO`` otherwise, with ids starting at 1.  Each
    file is scored with
    ``eval_rte.evaluate("RTE2_dev.xml", "predictions.txt")``.

    ``predictions.txt`` is left holding the predictions of the last threshold
    tried, not of the best one.

    Args:
        step_size: Distance between candidate thresholds; anything ``float()``
            accepts.  Must be positive.
        name: Path of the score file.

    Returns:
        ``(best_match, match_threshold)``; both are also printed.  A later
        threshold replaces the best only when it is strictly better.

    Raises:
        ValueError: if ``step_size`` is not a positive number (a zero or
            negative step would never reach 1 and loop forever) or a line of
            the score file is not a float.
    """
    step_size = float(step_size)
    if not step_size > 0:  # written this way so NaN is rejected as well
        raise ValueError("step_size must be positive, got %r" % step_size)

    with open(name) as score_file:
        scores = [float(line) for line in score_file]

    predictions_path = "predictions.txt"
    best_match = 0
    match_threshold = 0
    threshold = 0
    step = 0
    while threshold < 1:
        step += 1
        # Multiply instead of accumulating so rounding error cannot build up
        # and trigger an extra iteration beyond 1.
        threshold = step * step_size

        # The evaluator reads the predictions from disk, so the file is
        # rewritten (and closed) before every evaluation.
        with open(predictions_path, "wb") as out:
            out.write("ranked: no\n")
            for pair_id, score in enumerate(scores, 1):
                out.write("%d %s\n" % (pair_id, "YES" if score > threshold else "NO"))

        match = eval_rte.evaluate("RTE2_dev.xml", predictions_path)
        if match > best_match:
            best_match = match
            match_threshold = threshold

    print("Best match : %.4f match threshold : %.4f" % (best_match, match_threshold))
    return best_match, match_threshold