def evalb(self, gold, pred):
    gold = evalb_parser.create_from_bracket_string(gold)
    pred = evalb_parser.create_from_bracket_string(pred)
    result = scorer.Scorer().score_trees(gold, pred)
    prec, recall = result.prec, result.recall
    # Guard against division by zero when both precision and recall are 0.
    fscore = 2 * (prec * recall) / (prec + recall) if (prec + recall) else 0.0
    return prec, recall, fscore
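# A minimal standalone sketch (not from the original) of the PYEVALB calls the
# method above relies on; the sample trees are illustrative only.
from PYEVALB import parser as evalb_parser
from PYEVALB import scorer

gold_str = '(S (NP (DT the) (NN cat)) (VP (VBD sat)))'
pred_str = '(S (NP (DT the) (NN cat)) (VP (VBD sat)))'
gold_tree = evalb_parser.create_from_bracket_string(gold_str)
pred_tree = evalb_parser.create_from_bracket_string(pred_str)
result = scorer.Scorer().score_trees(gold_tree, pred_tree)
print(result.prec, result.recall)  # 1.0 1.0 for identical trees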
def get_results(self):
    results = []
    for i in range(len(self.true_parsed)):
        sentence_true = self.true_parsed[i]
        sentence_test = self.test_parsed[i]
        back, a, b, c = self.Cyk.cyk(sentence_test)
        sentence = sentence_test.split(' ')
        result_test = "".join(['((SENT (', get_parsed(sentence, back, a, b, c), ')))'])
        # Strip the outermost bracket pair before scoring.
        result_test = result_test[1:-1]
        print("Result sentence:")
        print(result_test)
        target = parser.create_from_bracket_string(sentence_true)
        predicted = parser.create_from_bracket_string(result_test)
        s = scorer.Scorer()
        result = s.score_trees(target, predicted)
        print('The recall is: ' + str(result.recall))
        print('The precision is: ' + str(result.prec))
        results.append(result_test)
    return results
def compute_f1(f_gold, f_test):
    try:
        # 'bracker_fmeasure' is the attribute name as exposed by PYEVALB's
        # summary object.
        f1 = summary.summary(scorer.Scorer().score_corpus(f_gold, f_test)).bracker_fmeasure
        return f1
    except ZeroDivisionError:
        return 0.0
def evalb(parse1, parse2):
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
    # cross_brackets = score.cross_brackets
    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    # 'tag_accracy' is the attribute name as spelled in PYEVALB.
    return f1 * score.tag_accracy
def evaluate(sentence, reference):
    # Strip the outermost brackets before handing the trees to PYEVALB.
    # Note that 'sentence' is scored as the gold tree here and 'reference'
    # as the test tree.
    gold_tree = evalbparser.create_from_bracket_string(sentence[1:-1])
    test_tree = evalbparser.create_from_bracket_string(reference[1:-1])
    s = scorer.Scorer()
    result = s.score_trees(gold_tree, test_tree)
    # 'tag_accracy' is the attribute name as spelled in PYEVALB.
    return result.tag_accracy
def parse_instance(idx: int):
    scorer = evalscorer.Scorer()
    sent_ = ins_sents[idx]
    target_ = ins_trees[idx][2:-1]
    print("Parsing %s set sentence #%d/%d" % (dataset_choice, idx + 1, num_sents))
    # Perform CYK prediction
    res_, pred_string = evaluate_predict(sent_, target_, cyk_module, scorer)
    print(res_, end='\n\n')
    return res_, pred_string
def evalb(parse1, parse2):
    from PYEVALB import scorer as pyscorer
    from PYEVALB import parser as pyparser
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    try:
        score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
    except Exception as e:
        print("Exception!")
        print(e)
        print(pyparse1)
        print(pyparse2)
        return 0
    # Guard against division by zero when both precision and recall are 0.
    if score.recall + score.prec == 0:
        return 0
    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    return f1 * score.tag_accracy
def check_action2treeseq(self):
    instance = next(iter(self.train_iterator))
    action_str_lst = self.id2original(self.ACTIONS, instance.actions)
    pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)
    converted_seq = utils.action2treestr(action_str_lst, instance.raws[0], pos_tags)
    measure = scorer.Scorer()
    golden_seq = instance.raw_seq[0]
    gold_tree = parser.create_from_bracket_string(golden_seq)
    converted_tree = parser.create_from_bracket_string(converted_seq)
    ret = measure.score_trees(gold_tree, converted_tree)
    match_num = ret.matched_brackets
    gold_num = ret.gold_brackets
    pred_num = ret.test_brackets
    # The action-to-tree round trip should reproduce the gold brackets exactly.
    assert match_num == gold_num
    assert match_num == pred_num
def get_eval_metrics(self, instance, pred_action_ids):
    assert type(pred_action_ids) == list
    pred_actions = self.id2original(self.ACTIONS, pred_action_ids)
    tokens = instance.raws[0]
    pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)
    measure = scorer.Scorer()
    golden_tree_seq = instance.raw_seq[0]
    gold_tree = parser.create_from_bracket_string(golden_tree_seq)
    try:
        pred_tree_seq = utils.action2treestr(pred_actions, tokens, pos_tags)
        pred_tree = parser.create_from_bracket_string(pred_tree_seq)
        ret = measure.score_trees(gold_tree, pred_tree)
    except Exception:
        # Conversion or scoring failed; signal with a sentinel value.
        return -1
    else:
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets
        return match_num, gold_num, pred_num
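# Hedged follow-up sketch (not from the original): the (matched, gold, pred)
# bracket counts returned by get_eval_metrics can be summed over a corpus to
# give micro-averaged precision/recall/F1.
def micro_f1(counts):
    # counts: iterable of (matched_brackets, gold_brackets, test_brackets)
    matched = sum(m for m, g, p in counts)
    gold = sum(g for m, g, p in counts)
    pred = sum(p for m, g, p in counts)
    prec = matched / pred if pred else 0.0
    recall = matched / gold if gold else 0.0
    return 2 * prec * recall / (prec + recall) if (prec + recall) else 0.0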
def score(true_parse, proposed_parse):
    """
    Description
    -----------------
    Evaluate parses with precision and recall over all non-terminals,
    and over POS tags only.

    Parameters
    -----------------
    true_parse, proposed_parse : Bracketed strings, the true and proposed parse trees.

    Returns
    -----------------
    parse_recall, parse_precision, pos_recall, pos_precision
    """
    # Strip the outer '( ' and ')' of the SEQUOIA-style bracketing.
    true_parse = true_parse[2:-1]
    proposed_parse = proposed_parse[2:-1]
    gold_tree = parser.create_from_bracket_string(true_parse)
    test_tree = parser.create_from_bracket_string(proposed_parse)

    # Compute recall and precision for POS tags
    y_true = np.array(gold_tree.poss)
    y_pred = np.array(test_tree.poss)
    y_pred = (y_true == y_pred).astype(int)
    y_true = np.ones(len(y_true)).astype(int)
    (POS_precision, POS_recall, POS_f_score, beta) = precision_recall_fscore_support(
        y_true, y_pred, labels=[1])

    # Compute recall and precision for the whole parse
    thescorer = scorer.Scorer()
    result = thescorer.score_trees(gold_tree, test_tree)
    return (result.recall * 100, result.prec * 100,
            POS_recall[0] * 100, POS_precision[0] * 100)
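# Illustration (not from the original) of the '[2:-1]' slicing used above:
# SEQUOIA-style trees carry an extra outer bracket, '( (SENT ...))', which
# PYEVALB's parser does not expect, so the leading '( ' and trailing ')' are
# stripped first. The sample tree is illustrative only.
raw = '( (SENT (NP (DET le) (NC chat))))'
stripped = raw[2:-1]
assert stripped == '(SENT (NP (DET le) (NC chat)))'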
def evalb(evalb_dir, gold_trees, predicted_trees, ref_gold_path=None, is_train=True):
    # Disabled setup for the external EVALB binary (kept for reference):
    """
    assert os.path.exists(evalb_dir)
    evalb_program_path = os.path.join(evalb_dir, "evalb")
    evalb_spmrl_program_path = os.path.join(evalb_dir, "evalb_spmrl")
    assert os.path.exists(evalb_program_path) or os.path.exists(evalb_spmrl_program_path)

    if os.path.exists(evalb_program_path):
        evalb_param_path = os.path.join(evalb_dir, "nk.prm")
    else:
        evalb_program_path = evalb_spmrl_program_path
        evalb_param_path = os.path.join(evalb_dir, "spmrl.prm")

    assert os.path.exists(evalb_program_path)
    assert os.path.exists(evalb_param_path)
    """
    temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
    print("Temporary dir", temp_dir)

    assert len(gold_trees) == len(predicted_trees)
    for gold_tree, predicted_tree in zip(gold_trees, predicted_trees):
        assert isinstance(gold_tree, trees.TreebankNode)
        assert isinstance(predicted_tree, trees.TreebankNode)
        gold_leaves = list(gold_tree.leaves())
        predicted_leaves = list(predicted_tree.leaves())
        assert len(gold_leaves) == len(predicted_leaves)
        assert all(gold_leaf.word == predicted_leaf.word
                   for gold_leaf, predicted_leaf in zip(gold_leaves, predicted_leaves))

    gold_path = os.path.join(temp_dir.name, "gold.txt")
    predicted_path = os.path.join(temp_dir.name, "predicted.txt")
    output_path = os.path.join(temp_dir.name, "output.txt")

    with open(gold_path, "w") as outfile:
        if ref_gold_path is None:
            for tree in gold_trees:
                outfile.write("{}\n".format(tree.linearize()))
        else:
            # For the SPMRL dataset our data loader performs some modifications
            # (like stripping morphological features), so we compare to the
            # raw gold file to be certain that we haven't spoiled the evaluation
            # in some way.
            with open(ref_gold_path) as goldfile:
                outfile.write(goldfile.read())

    with open(predicted_path, "w") as outfile:
        for tree in predicted_trees:
            try:
                outfile.write("{}\n".format(tree.linearize()))
            except RecursionError:
                # Very deep trees can exceed the default recursion limit.
                import sys
                sys.setrecursionlimit(10 ** 6)
                outfile.write("{}\n".format(tree.linearize()))

    # Disabled path that shells out to the EVALB binary (kept for reference):
    """
    data_dir = '/afs/inf.ed.ac.uk/group/project/prosody/prosody_nlp/data/input_features'
    perm_gold_path = os.path.join(data_dir, "sent_based_gold.txt")
    perm_predicted_path = os.path.join(data_dir, "sent_based_predicted.txt")
    with open(perm_gold_path, "w") as outfile:
        for tree in gold_trees:
            outfile.write("{}\n".format(tree.linearize()))
    with open(perm_predicted_path, "w") as outfile:
        for tree in predicted_trees:
            outfile.write("{}\n".format(tree.linearize()))

    command = "{} -p {} {} {} > {}".format(
        evalb_program_path,
        evalb_param_path,
        gold_path,
        predicted_path,
        output_path,
    )
    print(f'evalb shell command: {command}')
    #subprocess.run(command, shell=True)
    """
    scr = scorer.Scorer()
    scr.evalb(gold_path, predicted_path, output_path)

    # debug:
    subprocess.run("wc {}".format(predicted_path), shell=True)
    subprocess.run("wc {}".format(output_path), shell=True)

    fscore = FScore(math.nan, math.nan, math.nan, math.nan)

    # Disabled parser for the '='-style output of the EVALB binary (kept for
    # reference); the active loop below parses PYEVALB's ':'-style output.
    """
    with open(output_path) as infile:
        for line in infile:
            match = re.match(r"Number of sentence\s+=\s+(\d+\.\d+)", line)
            if match:
                print(f'Number of sentences evaled: {match.group(1)}')
            match = re.match(r"Bracketing Recall\s+=\s+(\d+\.\d+)", line)
            if match:
                print("MATCH")
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
            match = re.match(r"Complete match\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.complete_match = float(match.group(1))
            match = re.match(r"Tagging accuracy\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.tagging_accuracy = float(match.group(1))
                break
    """
    with open(output_path) as infile:
        for line in infile:
            match = re.match(r"Number of sentence:\s+(\d+\.\d+)", line)
            if match:
                print(f'Number of sentences evaled: {match.group(1)}')
            match = re.match(r"Bracketing Recall:\s+(\d+\.\d+)", line)
            if match:
                print("MATCH")
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision:\s+(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure:\s+(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
            match = re.match(r"Complete match:\s+(\d+\.\d+)", line)
            if match:
                fscore.complete_match = float(match.group(1))
            match = re.match(r"Tagging accuracy:\s+(\d+\.\d+)", line)
            if match:
                fscore.tagging_accuracy = float(match.group(1))
                break

    success = (not math.isnan(fscore.fscore) or
               fscore.recall == 0.0 or
               fscore.precision == 0.0)

    if success:
        # temp_dir.cleanup()
        print("Successfully parsed in:", predicted_path)
    else:
        print("Error reading EVALB results.")
        print("Gold path: {}".format(gold_path))
        print("Predicted path: {}".format(predicted_path))
        print("Output path: {}".format(output_path))
        import pdb
        pdb.set_trace()

    return fscore
def eval(self, insts):
    gold_path = 'tmp/gold.txt'
    pred_path = 'tmp/pred.txt'
    result_path = 'tmp/result.txt'
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    fgold = open(gold_path, 'w', encoding='utf-8')
    fpred = open(pred_path, 'w', encoding='utf-8')
    golds = []
    preds = []
    for inst in insts:
        gold = inst.get_output()
        pred = inst.get_prediction()
        golds.append(gold)
        preds.append(pred)
        fgold.write(gold.linearize() + '\n')
        fpred.write(pred.linearize() + '\n')
    fgold.close()
    fpred.close()
    # NOTE: this early return delegates to the external EVALB binary; the
    # PYEVALB-based scoring below is unreachable unless it is removed.
    return self.evalb('./EVALB', golds, preds)

    evalb = scorer.Scorer()
    fscore = FScore(0.0, 0.0, 0.0)
    try:
        evalb.evalb(gold_path, pred_path, result_path)
        with open(result_path) as infile:
            for line in infile:
                match = re.match(r"Bracketing Recall:\s+(\d+\.\d+)", line)
                if match:
                    fscore.recall = float(match.group(1)) / 100
                match = re.match(r"Bracketing Precision:\s+(\d+\.\d+)", line)
                if match:
                    fscore.precision = float(match.group(1)) / 100
                match = re.match(r"Bracketing FMeasure:\s+(\d+\.\d+)", line)
                if match:
                    fscore.fscore = float(match.group(1)) / 100
                    break
    except Exception:
        pass
    # success = (
    #     not math.isnan(fscore.fscore) or
    #     fscore.recall == 0.0 or
    #     fscore.precision == 0.0)
    #
    # if success:
    #     pass
    #     # temp_dir.cleanup()
    # else:
    #     print("Error reading EVALB results.")
    #     print("Gold path: {}".format(gold_path))
    #     print("Predicted path: {}".format(pred_path))
    #     print("Output path: {}".format(result_path))
    return fscore
with open('evaluation_data.parser_output_ter.txt') as f:
    for sent in f:
        chom_gold.append(sent.rstrip())

#%%
# Removing the 'unparsed' sentences
gold = []
test = []
f = open('test_ter.txt', 'w+')
g = open('gold_ter.txt', 'w+')
for i in range(len(chom_gold)):
    print(chom_gold[i])
    # chom_gold[3] holds the placeholder emitted for unparsed sentences,
    # so any line equal to it is skipped.
    if chom_gold[i] != chom_gold[3]:
        gold.append(chom_gold[i].rstrip())
        g.write(chom_gold[i] + '\n')
        test.append(chom_test[i].rstrip())
        f.write(chom_test[i] + '\n')
f.close()
g.close()

# Create scorer
s = scorer.Scorer()
# Perform the comparison
s.evalb('gold_ter.txt', 'test_ter.txt', 'results_ter.txt')
def evaluate_parser_multiprocess(pcfg, test_trees, filepath="parser_output.txt", write=True):
    """
    Evaluate the parser using multiprocessing.

    :param pcfg: parser pcfg to evaluate
    """
    y_true = []
    y_pred = []
    y_true_non_chomsky = []
    y_pred_non_chomsky = []
    y_true_parsable = []
    y_pred_parsable = []
    y_true_parsable_non_chomsky = []
    y_pred_parsable_non_chomsky = []
    recall_list = []
    precision_list = []
    lines = []
    # NOTE: evaluation is limited to the first five trees here.
    test_trees = test_trees[:5]
    if write:
        with open(filepath, 'w') as file:
            file.write("")
        with open("non-parsable", 'w') as file:
            file.write("")

    list_sentence = []
    for c, tree in enumerate(test_trees):
        list_sentence.append(list(tree.flatten()))

    # Parsing, multiprocess:
    n_job = multiprocessing.cpu_count()
    start = time.time()
    with Pool(n_job) as p:
        result_trees = p.map(pcfg.CYK, list_sentence)
    print(f"Parsing time is {time.time() - start}")

    # Analysis of the result
    nb_non_parsable = 0
    list_non_parsable = []
    for (c, tree) in enumerate(test_trees):
        test_sentence = list(tree.flatten())
        parsed_tree = result_trees[c]
        test_sentence_str = ' '.join(str(tree).split())
        # If the sentence is parsable
        if parsed_tree:
            y_true.extend(get_leaves(tree))
            y_pred.extend(get_leaves(parsed_tree))
            y_true_parsable.extend(get_leaves(tree))
            y_pred_parsable.extend(get_leaves(parsed_tree))
            tree.un_chomsky_normal_form(unaryChar="&")
            parsed_tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(get_leaves(parsed_tree))
            y_true_parsable_non_chomsky.extend(get_leaves(tree))
            y_pred_parsable_non_chomsky.extend(get_leaves(parsed_tree))
            lines.append('( ' + ' '.join(str(parsed_tree).split()) + ')')
            parsed_tree_str = ' '.join(str(parsed_tree).split())
            test_sentence_str = ' '.join(str(tree[0]).split())
            target_tree = parser.create_from_bracket_string(test_sentence_str)
            predicted_tree = parser.create_from_bracket_string(parsed_tree_str)
            s = scorer.Scorer()
            try:
                result = s.score_trees(target_tree, predicted_tree)
                recall_list.append(result.recall)
                precision_list.append(result.prec)
            except Exception:
                print("No recall or precision")
            if write:
                with open(filepath, 'a') as file:
                    file.write(lines[-1] + "\n")
        # If the sentence is not parsable
        else:
            aux = get_leaves(tree)
            y_true.extend(aux)
            y_pred.extend(["None" for k in range(len(aux))])
            tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(["None" for k in range(len(get_leaves(tree)))])
            nb_non_parsable += 1
            list_non_parsable.append(test_sentence)
            if write:
                with open(filepath, 'a') as file:
                    file.write("\n")
                with open("non-parsable", 'a') as file:
                    file.write('( ' + ' '.join(str(tree).split()) + ')' + "\n")

    print('Nb non-parsable: {}'.format(nb_non_parsable))
    print('Total accuracy (Chomsky) on dev set: {}'.format(accuracy(y_pred, y_true)))
    print('Total accuracy (non-Chomsky) on dev set: {}'.format(
        accuracy(y_true_non_chomsky, y_pred_non_chomsky)))
    print('Parsable accuracy (Chomsky) on dev set: {}'.format(
        accuracy(y_pred_parsable, y_true_parsable)))
    print('Parsable accuracy (non-Chomsky) on dev set: {}'.format(
        accuracy(y_true_parsable_non_chomsky, y_pred_parsable_non_chomsky)))
    print('Mean recall {} and mean precision {}'.format(
        np.mean(recall_list), np.mean(precision_list)))
from PYEVALB import scorer as PYEVALB_scorer

# Evaluation on the whole corpus
PYEVALB_scorer.Scorer().evalb(
    'results/real_parsings_test_for_eval.txt',
    'results/my_parsings_test_for_eval.txt',
    'results/results_pyevalb.txt',
)
def evalb(evalb_dir, gold_trees, predicted_trees):
    assert os.path.exists(evalb_dir)
    evalb_program_path = os.path.join(evalb_dir, "evalb")
    evalb_param_path = os.path.join(evalb_dir, "COLLINS.prm")
    assert os.path.exists(evalb_program_path)
    assert os.path.exists(evalb_param_path)

    assert len(gold_trees) == len(predicted_trees)
    for gold_tree, predicted_tree in zip(gold_trees, predicted_trees):
        assert isinstance(gold_tree, trees.TreebankNode)
        assert isinstance(predicted_tree, trees.TreebankNode)
        gold_leaves = list(gold_tree.leaves())
        predicted_leaves = list(predicted_tree.leaves())
        assert len(gold_leaves) == len(predicted_leaves)
        assert all(gold_leaf.word == predicted_leaf.word
                   for gold_leaf, predicted_leaf in zip(gold_leaves, predicted_leaves))

    # tempfile.TemporaryDirectory securely creates a temporary directory; the
    # resulting object can be used as a context manager.
    temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
    output_dir = "outputs"
    gold_path = os.path.join(temp_dir.name, "gold.txt")
    predicted_path = os.path.join(temp_dir.name, "predicted.txt")
    output_path = os.path.join(output_dir, "output_test.txt")

    num = 0
    with open(gold_path, "w+") as outfile:
        for tree in gold_trees:
            if num < 5:
                print("Gold tree #{}: {}".format(num + 1, tree.linearize()))
                num += 1
            outfile.write("{}\n".format(tree.linearize()))
    num = 0
    with open(predicted_path, "w+") as outfile:
        for tree in predicted_trees:
            if num < 5:
                print("Predicted tree #{}: {}".format(num + 1, tree.linearize()))
                num += 1
            outfile.write("{}\n".format(tree.linearize()))

    s = scorer.Scorer()
    s.evalb(gold_path, predicted_path, output_path)
    # command = "{} -p {} {} {} > {}".format(
    #     evalb_program_path,
    #     evalb_param_path,
    #     gold_path,
    #     predicted_path,
    #     output_path,
    # )
    # subprocess.run(command, shell=True)

    fscore = FScore(math.nan, math.nan, math.nan)
    with open(output_path) as infile:
        for line in infile:
            match = re.match(r"Bracketing Recall:\s*(\d+\.\d+)", line)
            if match:
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision:\s*(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure:\s*(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
                break

    success = (not math.isnan(fscore.fscore) or
               fscore.recall == 0.0 or
               fscore.precision == 0.0)
    if success:
        with open("outputs/fscore_test.txt", 'a', encoding='utf-8') as f:
            f.write(fscore.print_score() + "\n")
        temp_dir.cleanup()
    else:
        print("Error reading EVALB results.")
        print("Gold path: {}".format(gold_path))
        print("Predicted path: {}".format(predicted_path))
        print("Output path: {}".format(output_path))
    return fscore
def get_diff_prods_no_span():
    print('Getting diff between', test_seqs_file, 'and', pred_seqs_file)
    diff = set()
    id = 0
    from collections import Counter
    diff_prods_counter = Counter()
    diff_heights = defaultdict(list)
    for test_line, pred_line in zip(test_seqs, pred_seqs):
        measure = scorer.Scorer()
        gold_tree = parser.create_from_bracket_string(test_line)
        pred_tree = parser.create_from_bracket_string(pred_line)
        ret = measure.score_trees(gold_tree, pred_tree)
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets
        if match_num < gold_num or match_num < pred_num:
            # NOTE: the variable names look swapped here (kept as written):
            # 'pred_grammar' is built from gold_tree and 'true_grammar' from
            # pred_tree.
            pred_grammar, pred_heights = gold_tree.productions(skip_XX=False, skip_span=False)
            true_grammar, _ = pred_tree.productions(skip_XX=False, skip_span=False)
            # diff_prods = set(pred_grammar) - set(true_grammar)
            diff_prods = []
            diff_prods_heights = []
            for id, prod in enumerate(pred_grammar):
                if prod not in true_grammar:
                    diff_prods.append(prod)
                    diff_prods_heights.append(pred_heights[id])
            for id, prod in enumerate(diff_prods):
                diff_heights[no_span_prod(prod)].append(diff_prods_heights[id])
            diff_no_span_prods = set([no_span_prod(prod) for prod in diff_prods])
            diff.update(diff_no_span_prods)
            diff_prods_counter.update(diff_no_span_prods)
    print(diff_prods_counter.most_common(10))
    print('There are', len(diff), 'distinct differing productions')
    print('Done')
    print('')
    return diff, diff_prods_counter, diff_heights
available in the PYEVALB python package

@author: Víctor Manuel Tenorio
"""
import nltk
import stanza
from stanza.server import CoreNLPClient
from sacremoses import MosesDetokenizer
from PYEVALB import scorer, parser
import numpy as np
import re

detok = MosesDetokenizer()
evalb_scorer = scorer.Scorer()
recalls_corenlp = []
precs_corenlp = []
accs_corenlp = []
parsed_sents = nltk.corpus.treebank.parsed_sents()
skipped_sents = 0
sents_analyzed = 0
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'parse'],
                   output_format="json", timeout=3000001,
                   endpoint='http://localhost:9001') as client:
    for i, s in enumerate(nltk.corpus.treebank.sents()):
        sent = detok.detokenize(s)
        corenlp_model = client.annotate(sent)
        gold_sent = parser.create_from_bracket_string(
def save_scores(in_file, truth_file, out_file="out.txt"):
    # PYEVALB's Scorer.evalb expects (gold_path, test_path, result_path), so
    # despite the names, 'in_file' is scored as gold and 'truth_file' as test.
    scorer.Scorer().evalb(in_file, truth_file, out_file)
if turn in turn2subturn:
    turn2subturn[turn].append(subturn)
else:
    turn2subturn[turn] = [subturn]

#for turn in turn2subturn:
#    turn2subturn[turn] = sorted(turn2subturn[turn])

temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
predicted_path = os.path.join(temp_dir.name, 'pred.txt')
output_path = os.path.join(temp_dir.name, 'out.txt')
with open(predicted_path, 'w') as f:
    for turn in turn_ids:
        subturns = sort_subturns(turn2subturn[turn])
        trees = [subturn2predtree[subturn] for subturn in subturns]
        out_tree = '(TURN ' + ' '.join(trees) + ')'
        outstr = tree2str(out_tree)
        goldstr = tree2str(turn2gold[turn])
        try:
            assert outstr == goldstr
        except AssertionError:
            import pdb; pdb.set_trace()
        f.write(out_tree)
        f.write('\n')

scr = scorer.Scorer()
scr.evalb(gold_tree_file, predicted_path, output_path)
import pdb; pdb.set_trace()
    test_output = f.read().splitlines()

# Compute metrics
precisions = []
recalls = []
lengths = []
failures = 0
bugs = 0
for gold, test, sent in zip(test_output, parsed_output, test_input):
    if test == 'No parsing found':
        failures += 1
    else:
        try:
            gold_tree = parser.create_from_bracket_string(gold[2:-1])
            test_tree = parser.create_from_bracket_string(test[2:-1])
            result = scorer.Scorer().score_trees(gold_tree, test_tree)
            len_sentence = len(sent.split())
            lengths.append(len_sentence)
            print('')
            # Report the token count, not the character length of the string.
            print('Sentence length: ' + str(len_sentence))
            print('Recall = ' + str(result.recall))
            print('Precision = ' + str(result.prec))
            recalls.append(result.recall)
            precisions.append(result.prec)
        except Exception:
            bugs += 1

print('')
print('Parsing failures for ' + str(failures + bugs) + ' sentences')
parsed_sentence = cyk_parser.parse(sentence)
if parsed_sentence is not None:
    test_sentences_bis.append(sentence)
    f.write('%s\n' % parsed_sentence)
print('Done')

# Get accuracy
# Get sentences parsed by our parser
with open('data/evaluation_data.parser.txt', 'r') as f:
    file = f.read()
parsed_sentences = file.split('\n')

# Remove the first two and last brackets to use the parser from PYEVALB
initial_parsed_sentences = []
parsed_sentences_final = []
for sent in test_sentences_bis:
    initial_parsed_sentences.append(sent[2:-1])
for sent in parsed_sentences:
    parsed_sentences_final.append(sent[2:-1])  # fixed: was parsed_sentences[2:-1]

# Put in tree form and score; create_from_bracket_string expects a single
# bracketed string, so the trees are scored pair by pair.
tag_accuracies = []
for true_str, pred_str in zip(initial_parsed_sentences, parsed_sentences_final):
    initial_tree = parser.create_from_bracket_string(true_str)
    my_tree = parser.create_from_bracket_string(pred_str)
    result = scorer.Scorer().score_trees(initial_tree, my_tree)
    tag_accuracies.append(result.tag_accracy)

# Get accuracy (mean tag accuracy over the scored pairs)
print('Accuracy on Evaluation set: ' + str(sum(tag_accuracies) / len(tag_accuracies)))
def evaluation():
    #####################################################################
    #                            Load data                              #
    #####################################################################
    with codecs.open("output.txt", 'r', 'UTF-8') as file:
        result = file.read()
    result = result.split()
    result_tree = []
    i = -1
    for r in result:
        if 'None' in r:
            result_tree.append('(SENT (NC <UNKNOWN>))')
            i += 1
        elif 'SENT' in r:
            result_tree.append(r)
            i += 1
        else:
            result_tree[i] = result_tree[i] + ' ' + r

    with codecs.open("sequoia_test_tree.txt", 'r', 'UTF-8') as file:
        truth = file.read()
    truth = truth.split()
    truth_tree = []
    i = -1
    for t in truth:
        if 'SENT' in t:
            truth_tree.append(t)
            i += 1
        else:
            truth_tree[i] = truth_tree[i] + ' ' + t

    assert len(result_tree) == len(truth_tree)
    N = len(result_tree)

    #####################################################################
    #                            Evaluation                             #
    #####################################################################
    recall = []
    precision = []
    Fscore = []
    tag_accuracy = []
    S = scorer.Scorer()
    fileOut = codecs.open("evaluation_data.parser_output", 'w', 'UTF-8')
    for i in range(N):
        t = parser.create_from_bracket_string(truth_tree[i])
        r = parser.create_from_bracket_string(result_tree[i])
        fileOut.write(" ".join(str(t.non_terminal_labels)))
        fileOut.write('\n')
        # Only score when both trees cover the same sentence.
        if t.sentence == r.sentence:
            scores = S.score_trees(t, r)
            recall.append(scores.recall)
            precision.append(scores.prec)
            Fscore.append(2 * scores.recall * scores.prec / (scores.prec + scores.recall))
            tag_accuracy.append(scores.tag_accracy)
    fileOut.close()
    print('Average recall: ', np.mean(recall))
    print('Average precision: ', np.mean(precision))
    print('Average F-score: ', np.mean(Fscore))
    print('Average tag accuracy: ', np.mean(tag_accuracy))
    return
with open('results/evaluation_data.parser_output', 'a') as f:
    if my_parsing is None:
        f.write("Found no viable parsing." + "\n")
    else:
        f.write(my_parsing + "\n")

if my_parsing is not None:
    # PYEVALB works if we remove the first and last brackets of the SEQUOIA
    # format, along with the extra spaces that come with them.
    real_parsing = real_parsing[2:-1]
    my_parsing = my_parsing[2:-1]
    print("Score PYEVALB:")
    real_tree = parser.create_from_bracket_string(real_parsing)
    test_tree = parser.create_from_bracket_string(my_parsing)
    result = scorer.Scorer().score_trees(real_tree, test_tree)
    print('accuracy ' + str(result.tag_accracy))
    # For evaluation on the whole corpus, we save real_parsing and my_parsing
    # in new files, without the first and last brackets.
    with open('results/real_parsings_test_for_eval.txt', 'a') as f:
        f.write(real_parsing + "\n")
    with open('results/my_parsings_test_for_eval.txt', 'a') as f:
        f.write(my_parsing + "\n")

save_scores(
    'results/real_parsings_test_for_eval.txt',
    'results/my_parsings_test_for_eval.txt',
    'results/results_pyevalb.txt',
)
def pyevalb(pred_path, gold_path, result_path):
    """Use PYEVALB to score trees."""
    scorer.Scorer().evalb(gold_path, pred_path, result_path)
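# Hedged usage sketch (file names are illustrative): each input file holds one
# bracketed tree per line, and PYEVALB writes per-sentence plus summary
# statistics to the result file.
pyevalb('pred_trees.txt', 'gold_trees.txt', 'evalb_results.txt')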
if not s_output:
    continue
print('input --> ', s_input)
print('input labels:', s_target)
print('output -->', extract_sentence(s_output))
print('output labels:', s_output)
target_tree = evalb_parser.create_from_bracket_string(s_target[1:-1])
output_tree = evalb_parser.create_from_bracket_string(s_output[1:-1])
# print(target_tree)
# print(output_tree)
try:
    s = evalb_scorer.Scorer()
    result = s.score_trees(target_tree, output_tree)
    print(f'sentence {k}, precision={result.prec}, recall={result.recall}')
    total_precision += result.prec
    total_recall += result.recall
    print(f'average so far: precision={total_precision/(k+1)}, recall={total_recall/(k+1)}')
except Exception:
    print(f'sentence {k}, scorer failed')
# break
def get_p_value(baseline_f, gold_f, experiment_f, ids):
    print(experiment_f.split('/')[-1])
    gold_lines = [l.strip() for l in open(gold_f).readlines()]
    experiment_lines = [l.strip() for l in open(experiment_f).readlines()]
    baseline_lines = [l.strip() for l in open(baseline_f).readlines()]
    assert len(gold_lines) == len(ids)
    assert len(gold_lines) == len(experiment_lines)
    assert len(gold_lines) == len(baseline_lines)
    num_lines = len(gold_lines)
    scr = scorer.Scorer()
    print('Calculate baseline...')
    # NOTE: get_f1_from_list and get_bracket_counts_from_tree are not part of
    # stock PYEVALB; they appear to be custom extensions of the Scorer here.
    fullset_experiment_f1 = scr.get_f1_from_list(gold_lines, experiment_lines)
    fullset_baseline_f1 = scr.get_f1_from_list(gold_lines, baseline_lines)
    fullset_delta = abs(fullset_experiment_f1 - fullset_baseline_f1)
    big_diffs = 0
    id2matched = {'experiment': {}, 'baseline': {}}
    id2gold = {'experiment': {}, 'baseline': {}}
    id2test = {'experiment': {}, 'baseline': {}}
    model2output = {'experiment': experiment_lines, 'baseline': baseline_lines}
    print('Store bracket scores ...')
    for model in ('experiment', 'baseline'):
        test_lines = model2output[model]
        for i, turn in enumerate(ids):
            gold_line = gold_lines[i]
            test_line = test_lines[i]
            assert tree2str(gold_line) == tree2str(test_line)
            mat, gol, tes = scr.get_bracket_counts_from_tree(gold_line, test_line)
            id2matched[model][turn] = mat
            id2gold[model][turn] = gol
            id2test[model][turn] = tes
    idxs = [i for i in range(len(gold_lines))]
    print('Resample ...')
    for i in range(num_resamples):
        if i % 100000 == 1:
            print(i)
        resample_idx = random.choices(idxs, k=len(idxs))
        resampled_turns = []
        for idx in resample_idx:
            resampled_turns.append(ids[idx])
        gold_brackets = sum(id2gold['experiment'][turn] for turn in resampled_turns)
        experiment_matched_brackets = sum(id2matched['experiment'][turn] for turn in resampled_turns)
        experiment_test_brackets = sum(id2test['experiment'][turn] for turn in resampled_turns)
        experiment_rec = experiment_matched_brackets / gold_brackets
        experiment_prec = experiment_matched_brackets / experiment_test_brackets
        experiment_f1 = ((2 * experiment_rec * experiment_prec) /
                         (experiment_rec + experiment_prec)) * 100
        baseline_matched_brackets = sum(id2matched['baseline'][turn] for turn in resampled_turns)
        baseline_test_brackets = sum(id2test['baseline'][turn] for turn in resampled_turns)
        baseline_rec = baseline_matched_brackets / gold_brackets
        baseline_prec = baseline_matched_brackets / baseline_test_brackets
        baseline_f1 = ((2 * baseline_rec * baseline_prec) /
                       (baseline_rec + baseline_prec)) * 100
        # Two-sided test: count resamples whose |delta| exceeds twice the
        # full-set delta.
        curr_delta = abs(experiment_f1 - baseline_f1)
        if curr_delta > (2 * fullset_delta):
            big_diffs += 1
    print(f'p-value estimate: {big_diffs / num_resamples}')