import os
import pickle

# Score and OUTPUT_DIR are assumed to live in the local score module;
# EVAL_DATASETS, EVAL_ENGINES, EVAL_IN_DIR, concat_cross_validation_folds,
# and Die are assumed to be defined elsewhere in this file.
import score
from score import Score


def evaluate_engine_scores():
    """Compute per-sentence and average METEOR scores for each engine on
    each evaluation dataset, then pickle the results."""
    concat_cross_validation_folds()
    ref_scores = {}
    avg_scores = {}
    print('Average scores:')
    for dataset in EVAL_DATASETS:
        print('\t {0} dataset:'.format(dataset))
        ref_scores[dataset] = {}
        avg_scores[dataset] = {}
        path = os.path.join(EVAL_IN_DIR, dataset)
        s = Score(path)
        # One hypothesis file per engine, e.g. "<path>-<engine>.txt".
        test_list = ['{0}-{1}.txt'.format(path, engine)
                     for engine in EVAL_ENGINES]
        ref_file = path + '-ref.txt'
        # Batch all hypothesis files against the single reference file,
        # score them in one METEOR run, then split the results back out
        # per engine.
        (file1, file2, line_counts) = s.BatchFiles(test_list, [ref_file])
        output_file = s.ComputeMeteorScores(file1, file2, n_refs=4)
        results = s.UnbatchResults(output_file, test_list, [ref_file],
                                   line_counts)
        for i, r in enumerate(results):
            if len(r) != 1:
                Die('Multiple results for only 1 reference')
            engine_name = EVAL_ENGINES[i]
            ref_scores[dataset][engine_name] = r[0]
            avg_scores[dataset][engine_name] = float(sum(r[0])) / len(r[0])
            print('\t\t {0} engine: {1}'.format(
                engine_name, avg_scores[dataset][engine_name]))
    out_dir = os.path.join(score.OUTPUT_DIR, 'results')
    # Pickle requires binary mode; use context managers so the files are
    # closed promptly.
    with open(os.path.join(out_dir, 'ref_scores.pkl'), 'wb') as f:
        pickle.dump(ref_scores, f)
    with open(os.path.join(out_dir, 'avg_scores.pkl'), 'wb') as f:
        pickle.dump(avg_scores, f)
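
# Die() is called above but not defined in this snippet; it is assumed to be
# a small abort helper defined elsewhere in the project. A minimal sketch
# consistent with the call site (this implementation is hypothetical):
import sys


def Die(message):
    # Report the error to stderr and exit with a nonzero status.
    sys.stderr.write(message + '\n')
    sys.exit(1)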