#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import sys
from amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch, compute_scores
from amrlib.evaluate.smatch_enhanced import redirect_smatch_errors

# Score "nowiki" version, meaning the generated file should not have the :wiki tags added
GOLD = 'amrlib/data/tdata_xfm/test.txt.nowiki'
PRED = 'amrlib/data/model_parse_xfm_bart_large/test-pred.txt'

# Score with the original version meaning the generated files need to have been "wikified"
#GOLD='amrlib/data/tdata_xfm/test.txt'
#PRED='amrlib/data/model_parse_xfm_bart_base/test-pred.txt.wiki'

# Toggle: True runs only the basic SMATCH metric; False runs the enhanced
# scoring (was a bare `if 0:` literal — a named flag makes the switch explicit).
SMATCH_ONLY = False

# Send smatch's stderr chatter to a log file so the console output stays clean
redirect_smatch_errors('logs/score_smatch_errors.log')

if SMATCH_ONLY:
    # Run only the smatch score
    gold_entries = get_entries(GOLD)
    test_entries = get_entries(PRED)
    precision, recall, f_score = compute_smatch(test_entries, gold_entries)
    print('SMATCH -> P: %.3f, R: %.3f, F: %.3f' % (precision, recall, f_score))
else:
    # Compute enhanced scoring
    compute_scores(GOLD, PRED)
# Report how many generated graphs failed to deserialize (counts computed earlier
# in the script, above this fragment)
print('%d generated graphs do not deserialize out of %d = %.1f%%' % (len(bad_graphs), num_non_clipped, pct))
print()

# Save the reference, omitting any clipped or bad
ref_fpath = os.path.join(out_dir, ref_out_fn)
print('Saving', ref_fpath)
skipped = 0
with open(ref_fpath, 'w') as f:
    for i, graph in enumerate(ref_in_graphs):
        # Skip references whose generated counterpart was bad or clipped so the
        # reference file stays index-aligned with the generated file written below
        if i in bad_graphs or i in clip_index_set:
            skipped += 1
            continue
        f.write(graph + '\n\n')
print('Skipped writing %d as either bad or clipped' % skipped)
print('Wrote a total of %d reference AMR graphs' % (len(ref_in_graphs) - skipped))
print()

# Save the generated
gen_fpath = os.path.join(out_dir, gen_out_fn)
print('Saving', gen_fpath)
# NOTE(review): NoOpModel presumably keeps penman from re-transforming the graphs
# on serialization — confirm against the NoOpModel definition elsewhere in the project
penman.dump(gen_out_graphs, gen_fpath, indent=6, model=NoOpModel())
print('Wrote a total of %d generated AMR graphs' % len(gen_out_graphs))
print()

# Score the resultant files
print('Scoring the above files with SMATCH')
gold_entries = get_entries(ref_fpath)
test_entries = get_entries(gen_fpath)
precision, recall, f_score = compute_smatch(test_entries, gold_entries)
print('SMATCH -> P: %.3f, R: %.3f, F: %.3f' % (precision, recall, f_score))
print('Generating')
gen_graphs = inference.parse_sents(ref_sents, disable_progress=False)
assert len(gen_graphs) == len(ref_serials)

# Save the reference and generated graphs, inserting dummy graphs for any that are None.
# Originally these were omitted, but that makes it hard to test after wikification
# because the graphs will no longer line up with the original file.
print('Saving %s and %s' % (gold_fpath, pred_fpath))
dummies = 0
# Context managers replace the bare open()/close() pair so both files are
# closed even if a write raises part-way through.
with open(gold_fpath, 'w') as f_ref, open(pred_fpath, 'w') as f_gen:
    for ref_graph, gen_graph in zip(ref_graphs, gen_graphs):
        # If I didn't get a return, form a dummy graph so the file still aligns with the original
        if gen_graph is None:
            dummies += 1
            gen_graph = '# ::snt dummy graph for deserialization failure.\n()'
        f_ref.write(ref_graph + '\n\n')
        f_gen.write(gen_graph + '\n\n')
print('Out of %d graphs, %d did not deserialize properly.' % (len(ref_graphs), dummies))
print()

# Run smatch
gold_entries = get_entries(gold_fpath)
test_entries = get_entries(pred_fpath)
precision, recall, f_score = compute_smatch(test_entries, gold_entries)
print('SMATCH -> P: %.3f, R: %.3f, F: %.3f' % (precision, recall, f_score))