def write_results(args, data, data_domain, split, model, model_domain, eval_dict):
    str_file = args.full_model_name + '_' + split + '_model_domain_' + model_domain + '_data_domain_' + data_domain
    res_filename = str_file + '_res.txt'
    pred_filename = str_file + '_pred.txt'
    gold_filename = str_file + '_gold.txt'
    if eval_dict is not None:
        # save results dictionary into a file
        with open(res_filename, 'w') as f:
            json.dump(eval_dict, f)
    # save predictions and gold labels into files
    pred_writer = Writer(args.alphabets)
    gold_writer = Writer(args.alphabets)
    pred_writer.start(pred_filename)
    gold_writer.start(gold_filename)
    for batch in prepare_data.iterate_batch(data, args.batch_size, args.device):
        word, char, pos, ner, heads, arc_tags, auto_label, masks, lengths = batch
        out_arc, out_arc_tag, masks, lengths = model.forward(word, char, pos, mask=masks, length=lengths)
        heads_pred, arc_tags_pred, _ = model.decode(out_arc, out_arc_tag, mask=masks, length=lengths,
                                                    leading_symbolic=prepare_data.NUM_SYMBOLIC_TAGS)
        # move tensors back to the CPU as numpy arrays before writing
        lengths = lengths.cpu().numpy()
        word = word.data.cpu().numpy()
        pos = pos.data.cpu().numpy()
        ner = ner.data.cpu().numpy()
        heads = heads.data.cpu().numpy()
        arc_tags = arc_tags.data.cpu().numpy()
        heads_pred = heads_pred.data.cpu().numpy()
        arc_tags_pred = arc_tags_pred.data.cpu().numpy()
        # write predictions
        pred_writer.write(word, pos, ner, heads_pred, arc_tags_pred, lengths, symbolic_root=True)
        # write gold labels
        gold_writer.write(word, pos, ner, heads, arc_tags, lengths, symbolic_root=True)
    pred_writer.close()
    gold_writer.close()
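
# Hypothetical usage sketch: `evaluate_and_dump`, `eval_fn`, and `dev_data` are
# placeholder names, not identifiers from this codebase. It chains one of the
# evaluation() functions below with write_results(). With, for example,
# args.full_model_name = 'biaffine', split = 'dev', model_domain = 'wsj' and
# data_domain = 'ewt', write_results() produces files named
# 'biaffine_dev_model_domain_wsj_data_domain_ewt_{res,pred,gold}.txt'.
def evaluate_and_dump(args, dev_data, model, eval_fn, model_domain, data_domain, epoch):
    eval_dict = eval_fn(args, dev_data, 'dev', model, data_domain, epoch)
    write_results(args, dev_data, data_domain, 'dev', model, model_domain, eval_dict)
    return eval_dict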
def evaluation(args, data, split, model, domain, epoch, str_res='results'):
    # evaluate performance on data
    model.eval()
    auto_label_idx2inst = Index2Instance(args.alphabets['auto_label_alphabet'])
    eval_dict = initialize_eval_dict()
    eval_dict['epoch'] = epoch
    pred_labels = []
    gold_labels = []
    for batch in prepare_data.iterate_batch(data, args.batch_size, args.device):
        # add debug statements here [rram]
        word, char, pos, ner, heads, arc_tags, auto_label, masks, lengths = batch
        output, masks, lengths = model.forward(word, char, pos, mask=masks, length=lengths)
        auto_label_preds = model.decode(output, mask=masks, length=lengths,
                                        leading_symbolic=prepare_data.NUM_SYMBOLIC_TAGS)
        # move tensors back to the CPU as numpy arrays
        lengths = lengths.cpu().numpy()
        word = word.data.cpu().numpy()
        pos = pos.data.cpu().numpy()
        ner = ner.data.cpu().numpy()
        heads = heads.data.cpu().numpy()
        arc_tags = arc_tags.data.cpu().numpy()
        auto_label = auto_label.data.cpu().numpy()
        auto_label_preds = auto_label_preds.data.cpu().numpy()
        # convert label indices back to label strings for metric computation
        gold_labels += auto_label_idx2inst.index2instance(auto_label, lengths, symbolic_root=True)
        pred_labels += auto_label_idx2inst.index2instance(auto_label_preds, lengths, symbolic_root=True)
    # sequence-labeling metrics over the whole split, scaled to percentages
    eval_dict['auto_label_accuracy'] = accuracy_score(gold_labels, pred_labels) * 100
    eval_dict['auto_label_precision'] = precision_score(gold_labels, pred_labels) * 100
    eval_dict['auto_label_recall'] = recall_score(gold_labels, pred_labels) * 100
    eval_dict['auto_label_f1'] = f1_score(gold_labels, pred_labels) * 100
    print_results(eval_dict, split, domain, str_res)
    return eval_dict
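
# A minimal sketch, assuming the metric functions used above behave like
# seqeval.metrics (i.e. they accept lists of per-sentence label-string
# sequences, as produced by Index2Instance.index2instance). The labels below
# are illustrative only.
def _seqeval_metrics_example():
    from seqeval.metrics import accuracy_score, f1_score
    gold = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
    pred = [['B-PER', 'O', 'O'], ['B-LOC', 'O']]
    # span-level F1 and token-level accuracy, both scaled to percentages
    return f1_score(gold, pred) * 100, accuracy_score(gold, pred) * 100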
def evaluation(args, data, split, model, domain, epoch, str_res='results'):
    # evaluate performance on data
    model.eval()
    eval_dict = initialize_eval_dict()
    eval_dict['epoch'] = epoch
    for batch in prepare_data.iterate_batch(data, args.batch_size, args.device):
        word, char, pos, ner, heads, arc_tags, auto_label, masks, lengths = batch
        out_arc, out_arc_tag, masks, lengths = model.forward(word, char, pos, mask=masks, length=lengths)
        heads_pred, arc_tags_pred, _ = model.decode(out_arc, out_arc_tag, mask=masks, length=lengths,
                                                    leading_symbolic=prepare_data.NUM_SYMBOLIC_TAGS)
        # move tensors back to the CPU as numpy arrays
        lengths = lengths.cpu().numpy()
        word = word.data.cpu().numpy()
        pos = pos.data.cpu().numpy()
        ner = ner.data.cpu().numpy()
        heads = heads.data.cpu().numpy()
        arc_tags = arc_tags.data.cpu().numpy()
        heads_pred = heads_pred.data.cpu().numpy()
        arc_tags_pred = arc_tags_pred.data.cpu().numpy()
        # per-batch attachment statistics: with punctuation, without punctuation, and root accuracy
        stats, stats_nopunc, stats_root, num_inst = parse.eval_(word, pos, heads_pred, arc_tags_pred, heads, arc_tags,
                                                                args.alphabets['word_alphabet'],
                                                                args.alphabets['pos_alphabet'], lengths,
                                                                punct_set=args.punct_set, symbolic_root=True)
        ucorr, lcorr, total, ucm, lcm = stats
        ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc
        corr_root, total_root = stats_root
        # accumulate counts over the whole split
        eval_dict['dp_ucorrect'] += ucorr
        eval_dict['dp_lcorrect'] += lcorr
        eval_dict['dp_total'] += total
        eval_dict['dp_ucomplete_match'] += ucm
        eval_dict['dp_lcomplete_match'] += lcm
        eval_dict['dp_ucorrect_nopunc'] += ucorr_nopunc
        eval_dict['dp_lcorrect_nopunc'] += lcorr_nopunc
        eval_dict['dp_total_nopunc'] += total_nopunc
        eval_dict['dp_ucomplete_match_nopunc'] += ucm_nopunc
        eval_dict['dp_lcomplete_match_nopunc'] += lcm_nopunc
        eval_dict['dp_root_correct'] += corr_root
        eval_dict['dp_total_root'] += total_root
        eval_dict['dp_total_inst'] += num_inst
    # UAS/LAS over all tokens, punctuation included
    eval_dict['dp_uas'] = eval_dict['dp_ucorrect'] * 100 / eval_dict['dp_total']
    eval_dict['dp_las'] = eval_dict['dp_lcorrect'] * 100 / eval_dict['dp_total']
    print_results(eval_dict, split, domain, str_res)
    return eval_dict
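
# Hypothetical sketch of what initialize_eval_dict() has to provide (the real
# helper is defined elsewhere in this codebase): every dependency-parsing
# counter accumulated with `+=` above must start at zero.
def _initialize_eval_dict_sketch():
    keys = ['dp_ucorrect', 'dp_lcorrect', 'dp_total',
            'dp_ucomplete_match', 'dp_lcomplete_match',
            'dp_ucorrect_nopunc', 'dp_lcorrect_nopunc', 'dp_total_nopunc',
            'dp_ucomplete_match_nopunc', 'dp_lcomplete_match_nopunc',
            'dp_root_correct', 'dp_total_root', 'dp_total_inst']
    return {key: 0.0 for key in keys}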