def check_eval(eval_treebank, ep, flag='dev'):
    # nonlocal best_eval_fscore
    # nonlocal best_eval_model_path
    # nonlocal best_eval_processed
    dev_start_time = time.time()

    eval_predicted = []
    for dev_start_index in range(0, len(eval_treebank), args.eval_batch_size):
        subbatch_trees = eval_treebank[dev_start_index:dev_start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        eval_predicted.extend([p.convert() for p in predicted])

    eval_fscore = evaluate.evalb(args.evalb_dir, eval_treebank, eval_predicted)
    logger.info(
        flag + ' eval '
        'epoch {} '
        'fscore {} '
        'elapsed {} '
        'total-elapsed {}'.format(
            ep,
            eval_fscore,
            format_elapsed(dev_start_time),
            format_elapsed(start_time),
        ))
    return eval_fscore
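# Nearly every snippet in this collection calls format_elapsed(...) without
# defining it. A minimal sketch of such a helper, assuming it renders the
# wall-clock time since start_time as a compact "XdYhMMmSSs" string (the
# original implementations may differ in detail):
import time


def format_elapsed(start_time):
    # Convert seconds since start_time into hours/minutes/seconds,
    # e.g. "1h03m27s", prefixing the number of days when relevant.
    elapsed_time = int(time.time() - start_time)
    minutes, seconds = divmod(elapsed_time, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)
    elapsed_string = "{}h{:02}m{:02}s".format(hours, minutes, seconds)
    if days > 0:
        elapsed_string = "{}d{}".format(days, elapsed_string)
    return elapsed_string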
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = treebanks.load_trees(args.test_path, args.test_path_text, args.text_processing) print("Loaded {:,} test examples.".format(len(test_treebank))) if len(args.model_path) != 1: raise NotImplementedError("Ensembling multiple parsers is not " "implemented in this version of the code.") model_path = args.model_path[0] print("Loading model from {}...".format(model_path)) parser = parse_chart.ChartParser.from_trained(model_path) if args.no_predict_tags and parser.f_tag is not None: print("Removing part-of-speech tagging head...") parser.f_tag = None if args.parallelize: parser.parallelize() elif torch.cuda.is_available(): parser.cuda() print("Parsing test sentences...") start_time = time.time() test_predicted = parser.parse( test_treebank.without_gold_annotations(), subbatch_max_tokens=args.subbatch_max_tokens, ) if args.output_path == "-": for tree in test_predicted: print(tree.pformat(margin=1e100)) elif args.output_path: with open(args.output_path, "w") as outfile: for tree in test_predicted: outfile.write("{}\n".format(tree.pformat(margin=1e100))) # The tree loader does some preprocessing to the trees (e.g. stripping TOP # symbols or SPMRL morphological features). We compare with the input file # directly to be extra careful about not corrupting the evaluation. We also # allow specifying a separate "raw" file for the gold trees: the inputs to # our parser have traces removed and may have predicted tags substituted, # and we may wish to compare against the raw gold trees to make sure we # haven't made a mistake. As far as we can tell all of these variations give # equivalent results. ref_gold_path = args.test_path if args.test_path_raw is not None: print("Comparing with raw trees from", args.test_path_raw) ref_gold_path = args.test_path_raw test_fscore = evaluate.evalb(args.evalb_dir, test_treebank.trees, test_predicted, ref_gold_path=ref_gold_path) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) # model = dy.ParameterCollection() # [parser] = dy.load(args.model_path_base, model) parser = torch.load(args.model_path_base) print("Parsing test sentences...") start_time = time.time() test_predicted = [] for tree in test_treebank: # dy.renew_cg() parser.eval() sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()] predicted, _ = parser.parse(sentence) test_predicted.append(predicted.convert()) test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def check_dev(epoch_num):
    nonlocal best_dev_fscore
    nonlocal best_model_path
    nonlocal best_dev_processed

    dev_start_time = time.time()

    parser.eval()
    dev_predicted = []
    for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size):
        subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("\n"
          "dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time)))

    if dev_fscore.fscore > best_dev_fscore:
        if best_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        best_model_path = "{}_best_dev={:.2f}".format(
            args.model_path_base, dev_fscore.fscore)
        best_dev_processed = total_processed

        print("Saving new best model to {}...".format(best_model_path))
        torch.save(
            {
                "spec": parser.spec,
                "state_dict": parser.state_dict(),
                "trainer": trainer.state_dict(),
            },
            best_model_path + ".pt",
        )
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path
    nonlocal dev_efscore

    dev_start_time = time.time()

    dev_predicted = []
    for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size):
        subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("  dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ), flush=True)

    dev_efscore = evaluate_EDITED.Evaluate(dev_treebank, dev_predicted)
    print("  dev-Efscore: {}".format(dev_efscore), flush=True)

    # MJ - keep the model with the best Efscore
    if dev_efscore.efscore > best_dev_fscore:
        best_dev_fscore = dev_efscore.efscore

        if best_dev_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_dev_model_path + ext
                if os.path.exists(path):
                    print("  Removing previous model file {}...".format(path), flush=True)
                    os.remove(path)

        best_dev_model_path = "{}_Edev={:.4}".format(
            args.model_path_base, best_dev_fscore)
        print("  Saving new best model to {}...".format(best_dev_model_path), flush=True)
        torch.save(
            {
                'spec': parser.spec,
                'state_dict': parser.state_dict(),
                'trainer': trainer.state_dict(),
            }, best_dev_model_path + ".pt")
def run_parse_extra(args):
    if args.output_path != '-' and os.path.exists(args.output_path):
        print("Error: output file already exists:", args.output_path)
        return

    print("Loading parse trees from {}...".format(args.input_path))
    treebank = trees.load_trees(args.input_path)
    if args.max_len_eval > 0:
        treebank = [tree for tree in treebank
                    if len(list(tree.leaves())) <= args.max_len_eval]
    print("Loaded {:,} parse tree examples.".format(len(treebank)))

    print("Loading model from {}...".format(args.model_path_base))
    assert args.model_path_base.endswith(".pt"), "Only pytorch savefiles supported"
    info = torch_load(args.model_path_base)
    assert 'hparams' in info['spec'], "Older savefiles not supported"
    parser = parse_nk.NKChartParser.from_spec(info['spec'], info['state_dict'])

    print("Parsing test sentences...")
    start_time = time.time()

    new_treebank = []
    for start_index in range(0, len(treebank), args.eval_batch_size):
        subbatch_trees = treebank[start_index:start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        new_treebank.extend([p.convert() for p in predicted])

    assert len(treebank) == len(new_treebank), (len(treebank), len(new_treebank))

    if args.write_parse is not None:
        print('writing to {}'.format(args.write_parse))
        with open(args.write_parse, 'w') as f:
            for x, y in zip(new_treebank, treebank):
                gold = '(ROOT {})'.format(y.linearize())
                pred = '(ROOT {})'.format(x.linearize())
                ex = dict(gold=gold, pred=pred)
                f.write(json.dumps(ex) + '\n')

    test_fscore = evaluate.evalb(args.evalb_dir, treebank, new_treebank, ref_gold_path=None)

    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))
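# Several of these scripts load checkpoints through a torch_load(...) helper
# that is never defined in this collection. A minimal sketch of what such a
# helper might look like (an assumption, not the original implementation):
# load normally when a GPU is available, otherwise map storages to CPU so the
# checkpoint can still be evaluated on a CPU-only machine.
import torch


def torch_load(load_path):
    if torch.cuda.is_available():
        return torch.load(load_path)
    # Remap CUDA tensors onto CPU storage.
    return torch.load(load_path, map_location=lambda storage, location: storage)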
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) assert args.model_path_base.endswith( ".pt"), "Only pytorch savefiles supported" info = torch_load(args.model_path_base) assert 'hparams' in info['spec'], "Older savefiles not supported" parser = SAPar_model.SAChartParser.from_spec(info['spec'], info['state_dict']) print("Parsing test sentences...") start_time = time.time() test_predicted = [] for start_index in tqdm(range(0, len(test_treebank), args.eval_batch_size)): subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _ = parser.parse_batch(subbatch_sentences) del _ test_predicted.extend([p.convert() for p in predicted]) # The tree loader does some preprocessing to the trees (e.g. stripping TOP # symbols or SPMRL morphological features). We compare with the input file # directly to be extra careful about not corrupting the evaluation. We also # allow specifying a separate "raw" file for the gold trees: the inputs to # our parser have traces removed and may have predicted tags substituted, # and we may wish to compare against the raw gold trees to make sure we # haven't made a mistake. As far as we can tell all of these variations give # equivalent results. ref_gold_path = args.test_path if args.test_path_raw is not None: print("Comparing with raw trees from", args.test_path_raw) ref_gold_path = args.test_path_raw test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted, ref_gold_path=ref_gold_path) model_name = args.model_path_base[args.model_path_base.rfind('/') + 1:args.model_path_base.rfind('.')] output_file = './results/' + model_name + '.txt' with open(output_file, "w") as outfile: for tree in test_predicted: outfile.write("{}\n".format(tree.linearize())) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path
    nonlocal best_dev_processed

    dev_start_time = time.time()

    dev_predicted = []
    for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size):
        subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(
            subbatch_sentences,
            span_index=span_index,
            k=K,
            zero_empty=parser.zero_empty,
            train_nn=args.train_through_nn,
        )
        del _
        dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))

    if dev_fscore.fscore > best_dev_fscore:
        if best_dev_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_dev_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        best_dev_model_path = "{}_dev={:.2f}".format(
            args.model_path_base, dev_fscore.fscore)
        best_dev_processed = total_processed

        print("Saving new best model to {}...".format(best_dev_model_path))
        torch.save(
            {
                'spec': parser.spec,
                'state_dict': parser.state_dict(),
                'trainer': trainer.state_dict(),
            }, best_dev_model_path + ".pt")
def run_ensemble(args):
    print("Loading test trees from {}...".format(args.test_path))
    test_treebank = trees.load_trees(args.test_path)
    print("Loaded {:,} test examples.".format(len(test_treebank)))

    parsers = []
    for model_path_base in args.model_path_base:
        print("Loading model from {}...".format(model_path_base))
        assert model_path_base.endswith(".pt"), "Only pytorch savefiles supported"
        info = torch_load(model_path_base)
        assert 'hparams' in info['spec'], "Older savefiles not supported"
        parser = parse_nk.NKChartParser.from_spec(info['spec'], info['state_dict'])
        parsers.append(parser)

    # Ensure that label score charts produced by the models can be combined
    # using simple averaging
    ref_label_vocab = parsers[0].label_vocab
    for parser in parsers:
        assert parser.label_vocab.indices == ref_label_vocab.indices

    print("Parsing test sentences...")
    start_time = time.time()

    test_predicted = []
    # Ensemble by averaging label score charts from different models.
    # We did not observe any benefits to doing weighted averaging, probably
    # because all our parsers output label scores of around the same magnitude.
    for start_index in range(0, len(test_treebank), args.eval_batch_size):
        subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]

        chart_lists = []
        for parser in parsers:
            charts = parser.parse_batch(subbatch_sentences,
                                        return_label_scores_charts=True)
            chart_lists.append(charts)

        subbatch_charts = [np.mean(list(sentence_charts), 0)
                           for sentence_charts in zip(*chart_lists)]
        predicted, _ = parsers[0].decode_from_chart_batch(subbatch_sentences,
                                                          subbatch_charts)
        del _
        test_predicted.extend([p.convert() for p in predicted])

    test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted,
                                 ref_gold_path=args.test_path)

    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))
def run_test(args):
    test_path = args.test_ptb_path
    if args.dataset == "ctb":
        test_path = args.test_ctb_path

    print("Loading model from {}...".format(args.model_path_base))
    assert args.model_path_base.endswith(".pt"), "Only pytorch savefiles supported"
    info = torch_load(args.model_path_base)
    assert "hparams" in info["spec"], "Older savefiles not supported"
    parser = Lparser.ChartParser.from_spec(info["spec"], info["state_dict"])
    parser.eval()

    print("Loading test trees from {}...".format(test_path))
    test_treebank = trees.load_trees(test_path)
    print("Loaded {:,} test examples.".format(len(test_treebank)))

    print("Parsing test sentences...")
    start_time = time.time()

    punct_set = "." "``" "''" ":" ","

    parser.eval()
    test_predicted = []
    for start_index in range(0, len(test_treebank), args.eval_batch_size):
        subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        test_predicted.extend([p.convert() for p in predicted])

    test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted)

    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path
    nonlocal best_dev_processed

    dev_start_time = time.time()

    dev_predicted = parser.parse(
        dev_treebank.without_gold_annotations(),
        subbatch_max_tokens=args.subbatch_max_tokens,
    )
    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank.trees, dev_predicted)

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))

    if dev_fscore.fscore > best_dev_fscore:
        if best_dev_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_dev_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        best_dev_model_path = "{}_dev={:.2f}".format(
            args.model_path_base, dev_fscore.fscore
        )
        best_dev_processed = total_processed

        print("Saving new best model to {}...".format(best_dev_model_path))
        torch.save(
            {
                "config": parser.config,
                "state_dict": parser.state_dict(),
                "optimizer": optimizer.state_dict(),
            },
            best_dev_model_path + ".pt",
        )
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path

    dev_start_time = time.time()

    dev_predicted = []
    for tree in dev_treebank:
        # dy.renew_cg()
        parser.eval()
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
        predicted, _ = parser.parse(sentence)
        dev_predicted.append(predicted.convert())

    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))

    if dev_fscore.fscore > best_dev_fscore:
        if best_dev_model_path is not None:
            # for ext in [".data", ".meta"]:
            #     path = best_dev_model_path + ext
            #     if os.path.exists(path):
            #         print("Removing previous model file {}...".format(path))
            #         os.remove(path)
            path = best_dev_model_path
            if os.path.exists(path):
                print("Removing previous model file {}...".format(path))
                os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        # best_dev_model_path = "{}_dev={:.2f}".format(
        best_dev_model_path = "{}_dev={:.2f}.pth".format(
            args.model_path_base, dev_fscore.fscore)

        print("Saving new best model to {}...".format(best_dev_model_path))
        # dy.save(best_dev_model_path, [parser])
        torch.save(parser, best_dev_model_path)
def check_performance(parser, treebank, sentence_embeddings, args):
    dev_start_time = time.time()

    dev_predicted = []
    for tree_index, tree in enumerate(treebank):
        if tree_index % 100 == 0:
            dy.renew_cg()
        if sentence_embeddings is not None:
            elmo_embeddings = dy.inputTensor(
                sentence_embeddings[str(tree_index)][:, :, :])
        else:
            elmo_embeddings = None
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves]
        predicted = parser.span_parser(sentence, is_train=False,
                                       elmo_embeddings=elmo_embeddings)
        dev_predicted.append(predicted.convert())

    dev_fscore = evaluate.evalb('EVALB', treebank, dev_predicted, args=args, name="dev")
    return dev_fscore, dev_start_time
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = treebanks.load_trees( args.test_path, args.test_path_text, args.text_processing ) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path)) parser = Parser(args.model_path, batch_size=args.batch_size) print("Parsing test sentences...") start_time = time.time() if args.output_path == "-": output_file = sys.stdout elif args.output_path: output_file = open(args.output_path, "w") else: output_file = None test_predicted = [] for predicted_tree in parser.parse_sents( inputs_from_treebank(test_treebank, predict_tags=args.predict_tags) ): test_predicted.append(predicted_tree) if output_file is not None: print(tree.pformat(margin=1e100), file=output_file) test_fscore = evaluate.evalb(args.evalb_dir, test_treebank.trees, test_predicted) print( "test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ) )
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path

    dev_start_time = time.time()

    dev_predicted = run_eval(parser, dev_treebank)
    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))

    if dev_fscore.fscore > best_dev_fscore:
        if best_dev_model_path is not None:
            # for ext in [".data", ".meta"]:
            #     path = best_dev_model_path + ext
            #     if os.path.exists(path):
            #         print("Removing previous model file {}...".format(path))
            #         os.remove(path)
            path = best_dev_model_path
            if os.path.exists(path):
                print("Removing previous model file {}...".format(path))
                os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        # best_dev_model_path = "{}_dev={:.2f}".format(
        best_dev_model_path = "{}_dev={:.2f}.pth".format(
            args.model_path_base, dev_fscore.fscore)

        print("Saving new best model to {}...".format(best_dev_model_path))
        # dy.save(best_dev_model_path, [parser])
        torch.save(parser, best_dev_model_path)
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) # model = dy.ParameterCollection() # [parser] = dy.load(args.model_path_base, model) parser = torch.load(args.model_path_base) if torch.cuda.is_available(): parser = parser.cuda() print("Parsing test sentences...") start_time = time.time() test_predicted = run_eval(parser, test_treebank) test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def main(args):
    parser = Parser(args.grammar, args.expand_binaries)
    print(
        'Grammar rules:',
        f'{parser.grammar.num_lexical_rules:,} lexical,',
        f'{parser.grammar.num_unary_rules:,} unary,',
        f'{parser.grammar.num_binary_rules:,} binary.'
    )

    if args.infile:
        print(f'Predicting trees for tokens in `{args.infile}`.')
        print(f'Writing trees to file `{args.outfile}`...')
        if args.parallel:
            trees = predict_from_file_parallel(
                parser, args.infile, args.num_lines, args.tokenize)
        else:
            trees = predict_from_file(
                parser, args.infile, args.num_lines, args.tokenize)
        with open(args.outfile, 'w') as fout:
            print('\n'.join(trees), file=fout)
        if args.show:
            show(args.outfile)
        print('Evaluating bracket score...')
        if args.goldfile:
            try:
                evalb(args.evalb_dir, args.outfile, args.goldfile,
                      args.result, args.ignore_empty)
                if args.show:
                    show(args.result)
            except:
                exit('Could not evaluate trees. Maybe you did not parse the entire file?')
        print(f'Finished. Results saved to `{args.result}`.')

    elif args.treefile:
        num_trees = 10 if args.num_lines is None else args.num_lines
        parses = predict_from_trees(parser, args.treefile)
        fscores = []
        for i in range(num_trees):
            gold, pred, prec, rec, fscore = next(parses)
            fscores.append(fscore)
            print(f'Tree {i}, f1={fscore:.3f}.')
            print()
            print('Gold:')
            gold.pretty_print()
            print()
            print('Pred:')
            pred.pretty_print()
            print()
        print()
        print('All F1 =', ' '.join([f'{fscore:.3f}' for fscore in fscores]))
        print('Avg F1 = ', sum(fscores) / len(fscores))

    elif args.syneval:
        syneval(parser, args.syneval, args.outfile,
                parallel=args.parallel, short=args.short)

    else:
        if args.sent:
            sentence = tokenize.word_tokenize(args.sent)
        else:
            # Demo: use a default test-sentence with gold tree.
            sentence, gold = SENT.split(), GOLD

        print('Parsing sentence...')
        start = time.time()
        tree, score = parser.parse(sentence, use_numpy=args.use_numpy)
        elapsed = time.time() - start
        tree.un_chomsky_normal_form()

        print('Predicted.')
        print()
        tree.pretty_print()
        print('Logprob:', score)
        print()

        if not args.sent:
            gold = Tree.fromstring(gold)
            prec, recall, fscore = parser.evalb(
                gold.pformat(margin=np.inf), tree.pformat(margin=np.inf))
            print('Gold.')
            gold.pretty_print()
            print(f'Precision = {prec:.3f}')
            print(f'Recall = {recall:.3f}')
            print(f'F1 = {fscore:.3f}')
            print()

        print(f'Parse-time: {elapsed:.3f}s.')

        if args.perplexity:
            perplexity = parser.perplexity(sentence)
            print('Perplexity:', round(perplexity, 2))
def check_dev():
    nonlocal best_dev_fscore
    nonlocal best_dev_model_path
    nonlocal best_dev_processed

    dev_start_time = time.time()

    dev_predicted = []
    eval_batch_size = args.eval_batch_size
    for dev_start_index in range(0, len(dev_treebank), eval_batch_size):
        subbatch_trees = dev_treebank[dev_start_index:dev_start_index + eval_batch_size]
        subbatch_sent_ids = dev_sent_ids[dev_start_index:dev_start_index + eval_batch_size]
        if hparams.seg:
            subbatch_txt = [tree[0] for tree in subbatch_trees]
            subbatch_lbl = [tree[1] for tree in subbatch_trees]
            subbatch_sentences = [[(lbl, txt) for lbl, txt in zip(sent_lbl, sent_txt)]
                                  for sent_lbl, sent_txt in zip(subbatch_lbl, subbatch_txt)]
        else:
            subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                                  for tree in subbatch_trees]
        subbatch_features = load_features(subbatch_sent_ids, dev_feat_dict)
        predicted, _ = parser.parse_batch(subbatch_sentences,
                                          subbatch_sent_ids, subbatch_features)
        del _
        if hparams.seg:
            dev_predicted.extend(predicted)
        else:
            dev_predicted.extend([p.convert() for p in predicted])

    if hparams.seg:
        dev_fscore = evaluate.seg_fscore(dev_treebank, dev_predicted)
    else:
        dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    """
    with open('tmp_preds.txt', 'w') as f:
        for pred in dev_predicted:
            f.write(pred.linearize())
            f.write('\n')
    with open('tmp_gold.txt', 'w') as f:
        for gold in dev_treebank:
            f.write(gold.linearize())
            f.write('\n')
    """

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))
    sys.stdout.flush()

    if dev_fscore.fscore > best_dev_fscore:
        if best_dev_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_dev_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_fscore = dev_fscore.fscore
        best_dev_model_path = "{}_dev={:.2f}".format(
            args.model_path_base, dev_fscore.fscore)
        best_dev_processed = total_processed

        print("Saving new best model to {}...".format(best_dev_model_path))
        torch.save(
            {
                'spec': parser.spec,
                'state_dict': parser.state_dict(),
                'trainer': trainer.state_dict(),
            }, best_dev_model_path + ".pt")
        sys.stdout.flush()
print("Parsing test sentences using tensorflow...") start_time = time.time() test_predicted = [] for start_index in range(0, len(test_treebank), args.eval_batch_size): # for start_index in range(0, 2, 2): print(start_index, format_elapsed(start_time)) subbatch_trees = test_treebank[start_index:start_index+args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _ = tf_parse_batch(subbatch_sentences) del _ test_predicted.extend([p.convert() for p in predicted]) test_fscore = evaluate.evalb(args.evalb_dir, test_treebank[:len(test_predicted)], test_predicted) print('Done', format_elapsed(start_time)) str(test_fscore) #%% input_node_names = [the_inp_tokens.name.split(':')[0], the_inp_mask.name.split(':')[0]] output_node_names = [the_out_chart.name.split(':')[0], the_out_tags.name.split(':')[0]] print("Input node names:", input_node_names) print("Output node names:", output_node_names) graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, output_node_names) #%%
def test_on_parses(args):
    if not os.path.exists(args.experiment_directory):
        os.mkdir(args.experiment_directory)

    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)

    treebank = trees.load_trees(args.input_file, strip_top=True, filter_none=True)
    output = [tree.linearize() for tree in treebank]
    with open(os.path.join(args.experiment_directory, 'parses.txt'), 'w') as f:
        f.write('\n'.join(output))

    sentence_embeddings = h5py.File(args.elmo_embeddings_file_path, 'r')

    test_predicted = []
    start_time = time.time()
    total_log_likelihood = 0
    total_confusion_matrix = {}
    total_turned_off = 0
    ranks = []
    num_correct = 0
    total = 0
    for tree_index, tree in enumerate(treebank):
        if tree_index % 100 == 0:
            print(tree_index)
        dy.renew_cg()
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves]
        elmo_embeddings_np = sentence_embeddings[str(tree_index)][:, :, :]
        assert elmo_embeddings_np.shape[1] == len(sentence), (
            elmo_embeddings_np.shape[1], len(sentence),
            [word for pos, word in sentence])
        elmo_embeddings = dy.inputTensor(elmo_embeddings_np)
        predicted, (additional_info, c, t) = parser.span_parser(
            sentence, is_train=False, elmo_embeddings=elmo_embeddings)
        num_correct += c
        total += t
        rank = additional_info[3]
        ranks.append(rank)
        total_log_likelihood += additional_info[-1]
        test_predicted.append(predicted.convert())

    print('pos accuracy', num_correct / total)
    print("total time", time.time() - start_time)
    print("total loglikelihood", total_log_likelihood)
    print("total turned off", total_turned_off)
    print(total_confusion_matrix)
    print(ranks)
    print("avg", np.mean(ranks), "median", np.median(ranks))

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=True,
                                               name="without-labels")
    print("dev-fscore without labels", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=True,
                                               flatten=True,
                                               name="without-label-flattened")
    print("dev-fscore without labels and flattened", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, test_predicted,
                                               args=args,
                                               erase_labels=False,
                                               flatten=True,
                                               name="flattened")
    print("dev-fscore with labels and flattened", dev_fscore_without_labels)

    test_fscore = evaluate.evalb('EVALB/', treebank, test_predicted,
                                 args=args, name="regular")
    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))

    with open(os.path.join(args.experiment_directory, "confusion_matrix.pickle"), "wb") as f:
        pickle.dump(total_confusion_matrix, f)
def run_test_qbank(args):
    if not os.path.exists(args.experiment_directory):
        os.mkdir(args.experiment_directory)

    print("Loading model from {}...".format(args.model_path_base))
    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)

    all_trees = trees.load_trees(args.question_bank_trees_path)
    if args.stanford_split == 'true':
        print('using stanford split')
        split_to_indices = {
            'train': list(range(0, 1000)) + list(range(2000, 3000)),
            'dev': list(range(1000, 1500)) + list(range(3000, 3500)),
            'test': list(range(1500, 2000)) + list(range(3500, 4000))
        }
    else:
        print('not using stanford split')
        split_to_indices = {
            'train': range(0, 2000),
            'dev': range(2000, 3000),
            'test': range(3000, 4000)
        }

    test_indices = split_to_indices[args.split]

    qb_embeddings_file = h5py.File('../question-bank.hdf5', 'r')
    dev_predicted = []
    for test_index in test_indices:
        if len(dev_predicted) % 100 == 0:
            dy.renew_cg()
        tree = all_trees[test_index]
        sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves]
        test_embeddings_np = qb_embeddings_file[str(test_index)][:, :, :]
        assert test_embeddings_np.shape[1] == len(sentence)
        test_embeddings = dy.inputTensor(test_embeddings_np)
        predicted, _ = parser.span_parser(sentence, is_train=False,
                                          elmo_embeddings=test_embeddings)
        dev_predicted.append(predicted.convert())

    test_treebank = [all_trees[index] for index in test_indices]

    dev_fscore_without_labels = evaluate.evalb(args.evalb_dir, test_treebank, dev_predicted,
                                               args=args,
                                               erase_labels=True,
                                               name="without-labels")
    print("dev-fscore without labels", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb(args.evalb_dir, test_treebank, dev_predicted,
                                               args=args,
                                               erase_labels=True,
                                               flatten=True,
                                               name="without-label-flattened")
    print("dev-fscore without labels and flattened", dev_fscore_without_labels)

    dev_fscore_without_labels = evaluate.evalb(args.evalb_dir, test_treebank, dev_predicted,
                                               args=args,
                                               erase_labels=False,
                                               flatten=True,
                                               name="flattened")
    print("dev-fscore with labels and flattened", dev_fscore_without_labels)

    test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, dev_predicted,
                                 args=args, name="regular")
    print("regular", test_fscore)
def run_export(args):
    if args.test_path is not None:
        print("Loading test trees from {}...".format(args.test_path))
        test_treebank = treebanks.load_trees(
            args.test_path, args.test_path_text, args.text_processing
        )
        print("Loaded {:,} test examples.".format(len(test_treebank)))
    else:
        test_treebank = None

    print("Loading model from {}...".format(args.model_path))
    parser = Parser(args.model_path, batch_size=args.batch_size)
    model = parser._parser
    if model.pretrained_model is None:
        raise ValueError(
            "Exporting is only defined when using a pre-trained transformer "
            "encoder. For CharLSTM-based model, just distribute the pytorch "
            "checkpoint directly. You may manually delete the 'optimizer' "
            "field to reduce file size by discarding the optimizer state."
        )

    if test_treebank is not None:
        print("Parsing test sentences (predicting tags)...")
        start_time = time.time()
        test_inputs = inputs_from_treebank(test_treebank, predict_tags=True)
        test_predicted = list(parser.parse_sents(test_inputs))
        test_fscore = evaluate.evalb(args.evalb_dir, test_treebank.trees, test_predicted)
        test_elapsed = format_elapsed(start_time)
        print("test-fscore {} test-elapsed {}".format(test_fscore, test_elapsed))

        print("Parsing test sentences (not predicting tags)...")
        start_time = time.time()
        test_inputs = inputs_from_treebank(test_treebank, predict_tags=False)
        notags_test_predicted = list(parser.parse_sents(test_inputs))
        notags_test_fscore = evaluate.evalb(
            args.evalb_dir, test_treebank.trees, notags_test_predicted
        )
        notags_test_elapsed = format_elapsed(start_time)
        print("test-fscore {} test-elapsed {}".format(
            notags_test_fscore, notags_test_elapsed))

    print("Exporting tokenizer...")
    model.retokenizer.tokenizer.save_pretrained(args.output_dir)

    print("Exporting config...")
    config = model.pretrained_model.config
    config.benepar = model.config
    config.save_pretrained(args.output_dir)

    if args.compress:
        print("Compressing weights...")
        state_dict = get_compressed_state_dict(model.cpu())
        print("Saving weights...")
    else:
        print("Exporting weights...")
        state_dict = model.cpu().state_dict()
    torch.save(state_dict, os.path.join(args.output_dir, "benepar_model.bin"))

    del model, parser, state_dict

    print("Loading exported model from {}...".format(args.output_dir))
    exported_parser = Parser(args.output_dir, batch_size=args.batch_size)

    if test_treebank is None:
        print()
        print("Export complete.")
        print("Did not verify model accuracy because no treebank was provided.")
        return

    print("Parsing test sentences (predicting tags)...")
    start_time = time.time()
    test_inputs = inputs_from_treebank(test_treebank, predict_tags=True)
    exported_predicted = list(exported_parser.parse_sents(test_inputs))
    exported_fscore = evaluate.evalb(
        args.evalb_dir, test_treebank.trees, exported_predicted
    )
    exported_elapsed = format_elapsed(start_time)
    print("exported-fscore {} exported-elapsed {}".format(
        exported_fscore, exported_elapsed))

    print("Parsing test sentences (not predicting tags)...")
    start_time = time.time()
    test_inputs = inputs_from_treebank(test_treebank, predict_tags=False)
    notags_exported_predicted = list(exported_parser.parse_sents(test_inputs))
    notags_exported_fscore = evaluate.evalb(
        args.evalb_dir, test_treebank.trees, notags_exported_predicted
    )
    notags_exported_elapsed = format_elapsed(start_time)
    print("exported-fscore {} exported-elapsed {}".format(
        notags_exported_fscore, notags_exported_elapsed))

    print()
    print("Export and verification complete.")
    fscore_delta = evaluate.FScore(
        recall=notags_exported_fscore.recall - notags_test_fscore.recall,
        precision=notags_exported_fscore.precision - notags_test_fscore.precision,
        fscore=notags_exported_fscore.fscore - notags_test_fscore.fscore,
        complete_match=(
            notags_exported_fscore.complete_match - notags_test_fscore.complete_match
        ),
        tagging_accuracy=(
            exported_fscore.tagging_accuracy - test_fscore.tagging_accuracy
        ),
    )
    print("delta-fscore {}".format(fscore_delta))
def check_dev(epoch_num):
    nonlocal best_dev_score
    nonlocal best_model_path

    dev_start_time = time.time()

    parser.eval()
    dev_predicted = []
    for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size):
        subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

    print("dev-fscore {} "
          "dev-elapsed {} "
          "total-elapsed {}".format(
              dev_fscore,
              format_elapsed(dev_start_time),
              format_elapsed(start_time),
          ))

    dev_pred_head = [[leaf.father for leaf in tree.leaves()] for tree in dev_predicted]
    dev_pred_type = [[leaf.type for leaf in tree.leaves()] for tree in dev_predicted]
    assert len(dev_pred_head) == len(dev_pred_type)
    assert len(dev_pred_type) == len(dep_dev_type)

    stats, stats_nopunc, stats_root, num_inst = dep_eval.eval(
        len(dev_pred_head), dep_dev_word, dep_dev_pos, dev_pred_head, dev_pred_type,
        dep_dev_headid, dep_dev_type, dep_dev_lengs,
        punct_set=punct_set, symbolic_root=False)

    dev_ucorr, dev_lcorr, dev_total, dev_ucomlpete, dev_lcomplete = stats
    dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc, \
        dev_ucomlpete_nopunc, dev_lcomplete_nopunc = stats_nopunc
    dev_root_corr, dev_total_root = stats_root
    dev_total_inst = num_inst

    print('W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        dev_ucorr, dev_lcorr, dev_total,
        dev_ucorr * 100 / dev_total, dev_lcorr * 100 / dev_total,
        dev_ucomlpete * 100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
    print('Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
        dev_ucorr_nopunc * 100 / dev_total_nopunc,
        dev_lcorr_nopunc * 100 / dev_total_nopunc,
        dev_ucomlpete_nopunc * 100 / dev_total_inst,
        dev_lcomplete_nopunc * 100 / dev_total_inst))
    print('Root: corr: %d, total: %d, acc: %.2f%%' % (
        dev_root_corr, dev_total_root, dev_root_corr * 100 / dev_total_root))

    dev_uas = dev_ucorr_nopunc * 100 / dev_total_nopunc
    dev_las = dev_lcorr_nopunc * 100 / dev_total_nopunc

    if dev_fscore.fscore + dev_las > best_dev_score:
        if best_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_model_path + ext
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

        best_dev_score = dev_fscore.fscore + dev_las
        best_model_path = "{}_best_dev={:.2f}_devuas={:.2f}_devlas={:.2f}".format(
            args.model_path_base, dev_fscore.fscore, dev_uas, dev_las)

        print("Saving new best model to {}...".format(best_model_path))
        torch.save({
            'spec': parser.spec,
            'state_dict': parser.state_dict(),
            'trainer': trainer.state_dict(),
        }, best_model_path + ".pt")
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) if args.test_lbls: test_txt = [ l.strip().split() for l in open(args.test_path, 'r').readlines() ] test_lbls = [ l.strip().split() for l in open(args.test_lbls, 'r').readlines() ] test_sent_ids = [ l.strip() for l in open(args.test_sent_id_path, 'r').readlines() ] test_treebank = [(txt, lbl) for txt, lbl in zip(test_txt, test_lbls)] else: test_treebank, test_sent_ids = trees.load_trees_with_idx(args.test_path, \ args.test_sent_id_path, strip_top=False) if not args.new_set: test_pause_path = os.path.join(args.feature_path, args.test_prefix + \ '_pause.pickle') with open(test_pause_path, 'rb') as f: test_pause_data = pickle.load(f, encoding='latin1') # to_remove = set(test_sent_ids).difference(set(test_pause_data.keys())) # to_remove = sorted([test_sent_ids.index(i) for i in to_remove]) # for x in to_remove[::-1]: # test_treebank.pop(x) # test_sent_ids.pop(x) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) assert args.model_path_base.endswith(".pt"), "Only pytorch files supported" info = torch_load(args.model_path_base) print(info.keys()) assert 'hparams' in info['spec'], "Older savefiles not supported" parser = parse_model.SpeechParser.from_spec(info['spec'], \ info['state_dict']) from prettytable import PrettyTable total_params = 0 table = PrettyTable(["Modules", "Parameters"]) for name, parameter in parser.named_parameters(): if not parameter.requires_grad: continue param = parameter.numel() table.add_row([name, param]) total_params += param parser.eval() # turn off dropout at evaluation time label_vocab = parser.label_vocab #print("{} ({:,}): {}".format("label", label_vocab.size, \ # sorted(value for value in label_vocab.values))) test_feat_dict = {} if info['spec']['speech_features'] is not None: speech_features = info['spec']['speech_features'] print("Loading speech features for test set...") for feat_type in speech_features: print("\t", feat_type) feat_path = os.path.join(args.feature_path, \ args.test_prefix + '_' + feat_type + '.pickle') with open(feat_path, 'rb') as f: feat_data = pickle.load(f, encoding='latin1') test_feat_dict[feat_type] = feat_data print("Parsing test sentences...") start_time = time.time() test_predicted = [] test_scores = [] pscores = [] gscores = [] with torch.no_grad(): for start_index in range(0, len(test_treebank), args.eval_batch_size): subbatch_treebank = test_treebank[start_index:start_index \ + args.eval_batch_size] subbatch_sent_ids = test_sent_ids[start_index:start_index \ + args.eval_batch_size] if args.test_lbls: # EKN using this instead of the seg flag bc it's an hparam subbatch_txt = [turn[0] for turn in subbatch_treebank] subbatch_lbl = [turn[1] for turn in subbatch_treebank] subbatch_sentences = [[(lbl,txt) for lbl,txt in zip(sent_lbl,sent_txt)] for \ sent_lbl,sent_txt in zip(subbatch_lbl,subbatch_txt)] else: subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in \ tree.leaves()] for tree in subbatch_treebank] subbatch_trees = [t.convert() for t in subbatch_treebank] subbatch_features = load_features(subbatch_sent_ids, test_feat_dict\ , args.sp_off) predicted, scores = parser.parse_batch(subbatch_sentences, \ subbatch_sent_ids, subbatch_features) if not args.get_scores: del scores else: charts = parser.parse_batch(subbatch_sentences, \ subbatch_sent_ids, subbatch_features, subbatch_trees, True) for i in range(len(charts)): decoder_args = dict(sentence_len=len(subbatch_sentences[i]),\ 
label_scores_chart=charts[i],\ gold=subbatch_trees[i],\ label_vocab=parser.label_vocab, \ is_train=False, \ backoff=True) p_score, _, _, _, _ = chart_helper.decode( False, **decoder_args) g_score, _, _, _, _ = chart_helper.decode( True, **decoder_args) pscores.append(p_score) gscores.append(g_score) test_scores += scores if args.test_lbls: test_predicted.extend(predicted) else: test_predicted.extend([p.convert() for p in predicted]) # DEBUG # print(test_scores) #print(test_score_offsets) with open(args.output_path, 'w') as output_file: for tree in test_predicted: if args.test_lbls: #import pdb;pdb.set_trace() lbls = ' '.join(tree) output_file.write("{}\n".format(lbls)) else: output_file.write("{}\n".format(tree.linearize())) print("Output written to:", args.output_path) if args.get_scores: with open(args.output_path + '.scores', 'w') as output_file: for score1, score2, score3 in zip(test_scores, pscores, gscores): output_file.write("{}\t{}\t{}\n".format( score1, score2, score3)) print("Output scores written to:", args.output_path + '.scores') if args.write_gold: with open(args.test_prefix + '_sent_ids.txt', 'w') as sid_file: for sent_id in test_sent_ids: sid_file.write("{}\n".format(sent_id)) print("Sent ids written to:", args.test_prefix + '_sent_ids.txt') with open(args.test_prefix + '_gold.txt', 'w') as gold_file: for tree in test_treebank: gold_file.write("{}\n".format(tree.linearize())) print("Gold trees written to:", args.test_prefix + '_gold.txt') # The tree loader does some preprocessing to the trees (e.g. stripping TOP # symbols or SPMRL morphological features). We compare with the input file # directly to be extra careful about not corrupting the evaluation. We also # allow specifying a separate "raw" file for the gold trees: the inputs to # our parser have traces removed and may have predicted tags substituted, # and we may wish to compare against the raw gold trees to make sure we # haven't made a mistake. As far as we can tell all of these variations give # equivalent results. ref_gold_path = args.test_path if args.test_path_raw is not None: print("Comparing with raw trees from", args.test_path_raw) ref_gold_path = args.test_path_raw else: # Need this since I'm evaluating on subset ref_gold_path = None if args.test_lbls: test_fscore = evaluate.seg_fscore(test_treebank, test_predicted, is_train=False) else: test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, \ test_predicted, ref_gold_path=ref_gold_path, is_train=False) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def run_test(args):
    const_test_path = args.consttest_ptb_path
    dep_test_path = args.deptest_ptb_path
    if args.dataset == 'ctb':
        const_test_path = args.consttest_ctb_path
        dep_test_path = args.deptest_ctb_path

    print("Loading model from {}...".format(args.model_path_base))
    assert args.model_path_base.endswith(".pt"), "Only pytorch savefiles supported"
    info = torch_load(args.model_path_base)
    assert 'hparams' in info['spec'], "Older savefiles not supported"
    parser = Zparser.ChartParser.from_spec(info['spec'], info['state_dict'])
    parser.eval()

    dep_test_reader = CoNLLXReader(dep_test_path, parser.type_vocab)
    print('Reading dependency parsing data from %s' % dep_test_path)

    dep_test_data = []
    test_inst = dep_test_reader.getNext()
    dep_test_headid = np.zeros([40000, 300], dtype=int)
    dep_test_type = []
    dep_test_word = []
    dep_test_pos = []
    dep_test_lengs = np.zeros(40000, dtype=int)
    cun = 0
    while test_inst is not None:
        inst_size = test_inst.length()
        dep_test_lengs[cun] = inst_size
        sent = test_inst.sentence
        dep_test_data.append((sent.words, test_inst.postags, test_inst.heads, test_inst.types))
        for i in range(inst_size):
            dep_test_headid[cun][i] = test_inst.heads[i]
        dep_test_type.append(test_inst.types)
        dep_test_word.append(sent.words)
        dep_test_pos.append(sent.postags)
        # dep_sentences.append([(tag, word) for i, (word, tag) in enumerate(zip(sent.words, sent.postags))])
        test_inst = dep_test_reader.getNext()
        cun = cun + 1
    dep_test_reader.close()

    print("Loading test trees from {}...".format(const_test_path))
    test_treebank = trees.load_trees(const_test_path, dep_test_headid,
                                     dep_test_type, dep_test_word)
    print("Loaded {:,} test examples.".format(len(test_treebank)))

    print("Parsing test sentences...")
    start_time = time.time()

    punct_set = '.' '``' "''" ':' ','

    parser.eval()
    test_predicted = []
    for start_index in range(0, len(test_treebank), args.eval_batch_size):
        subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size]
        subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                              for tree in subbatch_trees]
        predicted, _ = parser.parse_batch(subbatch_sentences)
        del _
        test_predicted.extend([p.convert() for p in predicted])

    test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted)

    print("test-fscore {} "
          "test-elapsed {}".format(
              test_fscore,
              format_elapsed(start_time),
          ))

    test_pred_head = [[leaf.father for leaf in tree.leaves()] for tree in test_predicted]
    test_pred_type = [[leaf.type for leaf in tree.leaves()] for tree in test_predicted]
    assert len(test_pred_head) == len(test_pred_type)
    assert len(test_pred_type) == len(dep_test_type)

    stats, stats_nopunc, stats_root, test_total_inst = dep_eval.eval(
        len(test_pred_head), dep_test_word, dep_test_pos, test_pred_head, test_pred_type,
        dep_test_headid, dep_test_type, dep_test_lengs,
        punct_set=punct_set, symbolic_root=False)

    test_ucorrect, test_lcorrect, test_total, test_ucomlpete_match, test_lcomplete_match = stats
    test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc, \
        test_ucomlpete_match_nopunc, test_lcomplete_match_nopunc = stats_nopunc
    test_root_correct, test_total_root = stats_root

    print('best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect, test_lcorrect, test_total,
        test_ucorrect * 100 / test_total, test_lcorrect * 100 / test_total,
        test_ucomlpete_match * 100 / test_total_inst,
        test_lcomplete_match * 100 / test_total_inst))
    print('best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
        test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc,
        test_ucorrect_nopunc * 100 / test_total_nopunc,
        test_lcorrect_nopunc * 100 / test_total_nopunc,
        test_ucomlpete_match_nopunc * 100 / test_total_inst,
        test_lcomplete_match_nopunc * 100 / test_total_inst))
    print('best test Root: corr: %d, total: %d, acc: %.2f%%' % (
        test_root_correct, test_total_root, test_root_correct * 100 / test_total_root))
    print('============================================================================================================================')
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) assert args.model_path_base.endswith( ".pt"), "Only pytorch savefiles supported" info = torch_load(args.model_path_base) assert 'hparams' in info['spec'], "Older savefiles not supported" parser = parse_jc.NKChartParser.from_spec(info['spec'], info['state_dict']) if args.redo_vocab: print( "Loading memory bank trees from {} for generating label vocab...". format(args.train_path)) train_treebank = trees.load_trees(args.train_path) parser.label_vocab = gen_label_vocab( [tree.convert() for tree in train_treebank]) print("Parsing test sentences...") start_time = time.time() if args.use_neighbours: index_const = index.FaissIndex if args.library == "faiss" else index.AnnoyIndex span_index = index_const( num_labels=len(parser.label_vocab.values), metric=parser.metric, ) prefix = index.get_index_prefix( index_base_path=args.index_path, full_model_path=args.model_path_base, nn_prefix=args.nn_prefix, ) span_index.load(prefix) # also remove relu parser.no_relu = args.no_relu if args.no_relu: parser.remove_relu() test_predicted = [] for start_index in range(0, len(test_treebank), args.eval_batch_size): subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _ = parser.parse_batch( subbatch_sentences, span_index=span_index if args.use_neighbours else None, k=args.k, zero_empty=args.zero_empty, ) del _ test_predicted.extend([p.convert() for p in predicted]) # The tree loader does some preprocessing to the trees (e.g. stripping TOP # symbols or SPMRL morphological features). We compare with the input file # directly to be extra careful about not corrupting the evaluation. We also # allow specifying a separate "raw" file for the gold trees: the inputs to # our parser have traces removed and may have predicted tags substituted, # and we may wish to compare against the raw gold trees to make sure we # haven't made a mistake. As far as we can tell all of these variations give # equivalent results. ref_gold_path = args.test_path if args.test_path_raw is not None: print("Comparing with raw trees from", args.test_path_raw) ref_gold_path = args.test_path_raw test_fscore = evaluate.evalb( args.evalb_dir, test_treebank, test_predicted, ref_gold_path=ref_gold_path, ) print("test-fscore {} " "test-elapsed {}".format( test_fscore, format_elapsed(start_time), ))
def evaluate_on_brown_corpus(args):
    if not os.path.exists(args.experiment_directory):
        os.mkdir(args.experiment_directory)

    model = dy.ParameterCollection()
    [parser] = dy.load(args.model_path_base, model)
    assert parser.use_elmo == args.use_elmo, (parser.use_elmo, args.use_elmo)

    directories = ['cf', 'cg', 'ck', 'cl', 'cm', 'cn', 'cp', 'cr']
    for directory in directories:
        print('-' * 100)
        print(directory)
        input_file = '../brown/' + directory + '/' + directory + '.all.mrg'
        expt_name = args.experiment_directory + '/' + directory
        if not os.path.exists(expt_name):
            os.mkdir(expt_name)

        cleaned_corpus_path = trees.cleanup_text(input_file)
        treebank = trees.load_trees(cleaned_corpus_path, strip_top=True, filter_none=True)
        sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves] for tree in treebank]
        tokenized_lines = [' '.join([word for pos, word in sentence])
                           for sentence in sentences]
        if args.use_elmo:
            embedding_file = compute_elmo_embeddings(tokenized_lines, expt_name)
        else:
            embedding_file = None

        dev_predicted = []
        num_correct = 0
        total = 0
        for tree_index, tree in enumerate(treebank):
            if tree_index % 100 == 0:
                print(tree_index)
            dy.renew_cg()
            sentence = sentences[tree_index]
            if args.use_elmo:
                embeddings_np = embedding_file[str(tree_index)][:, :, :]
                assert embeddings_np.shape[1] == len(sentence), (
                    embeddings_np.shape[1], len(sentence))
                embeddings = dy.inputTensor(embeddings_np)
            else:
                embeddings = None
            predicted, (additional_info, c, t) = parser.span_parser(
                sentence, is_train=False, elmo_embeddings=embeddings)
            num_correct += c
            total += t
            dev_predicted.append(predicted.convert())

        dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, dev_predicted,
                                                   args=args,
                                                   erase_labels=True,
                                                   name="without-labels",
                                                   expt_name=expt_name)
        print("dev-fscore without labels", dev_fscore_without_labels)

        dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, dev_predicted,
                                                   args=args,
                                                   erase_labels=True,
                                                   flatten=True,
                                                   name="without-label-flattened",
                                                   expt_name=expt_name)
        print("dev-fscore without labels and flattened", dev_fscore_without_labels)

        dev_fscore_without_labels = evaluate.evalb('EVALB/', treebank, dev_predicted,
                                                   args=args,
                                                   erase_labels=False,
                                                   flatten=True,
                                                   name="flattened",
                                                   expt_name=expt_name)
        print("dev-fscore with labels and flattened", dev_fscore_without_labels)

        test_fscore = evaluate.evalb('EVALB/', treebank, dev_predicted,
                                     args=args, name="regular", expt_name=expt_name)
        print("regular", test_fscore)

        pos_fraction = num_correct / total
        print('pos fraction', pos_fraction)
        with open(expt_name + '/pos_accuracy.txt', 'w') as f:
            f.write(str(pos_fraction))
def run_test(args): print("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) print("Loaded {:,} test examples.".format(len(test_treebank))) print("Loading model from {}...".format(args.model_path_base)) assert args.model_path_base.endswith( ".pt"), "Only pytorch savefiles supported" info = torch_load(args.model_path_base) assert 'hparams' in info['spec'], "Older savefiles not supported" parser = parse_nk.NKChartParser.from_spec(info['spec'], info['state_dict']) hparams = info['spec']['hparams'] if ('use_extra_info' in hparams) and hparams['use_extra_info']: loaded_test_info = h5py.File(args.test_path + '.hdf5', 'r') test_info = list() for i in range(len(test_treebank)): item_info = list() for key in sorted(loaded_test_info.keys()): item_info.append(loaded_test_info[key + '/' + str(i)]) item_info = np.array(item_info) item_info = np.concatenate([ -1e8 * np.ones( (item_info.shape[0], item_info.shape[1], 1)), item_info ], axis=2) item_info = np.concatenate([ item_info, -1e8 * np.ones( (item_info.shape[0], 1, item_info.shape[2])) ], axis=1) test_info.append(item_info) print("Loaded and processed extra info from constituency test.") else: test_info = None print("Parsing test sentences...") start_time = time.time() test_predicted = [] for start_index in range(0, len(test_treebank), args.eval_batch_size): try: subbatch_trees = test_treebank[start_index:start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] subbatch_info = test_info[ start_index:start_index + args.eval_batch_size] if test_info is not None else None predicted, _ = parser.parse_batch(subbatch_sentences, extra_info=subbatch_info) except: from IPython import embed embed(using=False) del _ test_predicted.extend([p.convert() for p in predicted]) # The tree loader does some preprocessing to the trees (e.g. stripping TOP # symbols or SPMRL morphological features). We compare with the input file # directly to be extra careful about not corrupting the evaluation. We also # allow specifying a separate "raw" file for the gold trees: the inputs to # our parser have traces removed and may have predicted tags substituted, # and we may wish to compare against the raw gold trees to make sure we # haven't made a mistake. As far as we can tell all of these variations give # equivalent results. 
ref_gold_path = args.test_path if args.test_path_raw is not None: print("Comparing with raw trees from", args.test_path_raw) ref_gold_path = args.test_path_raw try: test_fscore = evaluate.evalb(args.evalb_dir, test_treebank, test_predicted, ref_gold_path=ref_gold_path) print("labeled-fscore {} ".format(test_fscore)) except: print('Failed to predict labeled score.') for rm_punct in [True]: for compute_level in ["corpus"]: clean_test_treebank = [ tokens2list( tree.linearize().replace('(', ' ( ').replace(')', ' ) ').split(), rm_punct) for tree in test_treebank ] clean_test_predicted = [ tokens2list( tree.linearize().replace('(', ' ( ').replace(')', ' ) ').split(), rm_punct) for tree in test_predicted ] unlabeled_fscore = evaluate.evaluate_unlabeled( clean_test_treebank, clean_test_predicted, compute_level) print("unlabeled-fscore ({} {}) {}".format(rm_punct, compute_level, unlabeled_fscore)) gold_bracket_num = sum( [len(get_brackets(x)[0]) for x in clean_test_treebank]) predicted_bracket_num = sum( [len(get_brackets(x)[0]) for x in clean_test_predicted]) print("# gold brackets: {}".format(gold_bracket_num)) print("# predicted brackets: {}".format(predicted_bracket_num)) for i, tree in enumerate(clean_test_treebank): print(tree2str(tree)) print(tree2str(clean_test_predicted[i]))