def evalb(parse1, parse2):
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
    # cross_brackets = score.cross_brackets
    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    # Note: "tag_accracy" is the attribute name as (mis)spelled in PYEVALB itself.
    return f1 * score.tag_accracy
def evalb(self, gold, pred):
    gold = evalb_parser.create_from_bracket_string(gold)
    pred = evalb_parser.create_from_bracket_string(pred)
    result = scorer.Scorer().score_trees(gold, pred)
    prec, recall = result.prec, result.recall
    fscore = 2 * (prec * recall) / (prec + recall)
    return prec, recall, fscore
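# A minimal end-to-end sketch of the PYEVALB calls used by the helper
# above. The imports and the two bracketed trees are illustrative
# assumptions, not taken from the original code.
from PYEVALB import parser as evalb_parser
from PYEVALB import scorer

gold = '(S (NP (DT the) (NN cat)) (VP (VBD sat)))'
pred = '(S (NP (DT the) (JJ cat)) (VP (VBD sat)))'
gold_tree = evalb_parser.create_from_bracket_string(gold)
pred_tree = evalb_parser.create_from_bracket_string(pred)
result = scorer.Scorer().score_trees(gold_tree, pred_tree)
# Identical bracketing, one differing POS tag:
print(result.prec, result.recall, result.tag_accracy)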
def evaluate(goldtest):
    s = Scorer()
    for gold, test in goldtest:
        gold_tree = parser.create_from_bracket_string(gold)
        test_tree = parser.create_from_bracket_string(test)
        result = s.score_trees(gold_tree, test_tree)
        print(result)
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from PYEVALB import parser as evalbparser


def score(reference_parse, proposed_parse):
    """Performs evaluation on a single parse tree.

    Args:
        reference_parse (str): reference parse tree for the current sentence
        proposed_parse (str): proposed parse tree for the current sentence

    Returns:
        precision, recall, f_score, accuracy,
        sh1 (length of the predicted sentence),
        sh2 (length of the true sentence)
    """
    true_tree = evalbparser.create_from_bracket_string(reference_parse)
    test_tree = evalbparser.create_from_bracket_string(proposed_parse)
    y_true = np.array(true_tree.poss)
    y_pred = np.array(test_tree.poss)
    sh1 = y_pred.shape[0]
    sh2 = y_true.shape[0]
    # Mark each predicted POS tag as correct (1) or wrong (0)
    y_pred = (y_true == y_pred) * 1
    y_true = np.ones(len(y_true))
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[1])
    accuracy = accuracy_score(y_true, y_pred)
    return precision, recall, f_score, accuracy, sh1, sh2
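# A hypothetical call of score() above; the two bracketed strings are
# invented for illustration (same words, one differing POS tag):
ref = '(SENT (NP (DET Le) (NC chat)) (VN (V dort)))'
hyp = '(SENT (NP (DET Le) (ADJ chat)) (VN (V dort)))'
precision, recall, f_score, accuracy, sh1, sh2 = score(ref, hyp)
print(accuracy)  # 2 of the 3 POS tags agree, so this should print ~0.667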
def get_results(self):
    results = []
    for i in range(len(self.true_parsed)):
        sentence_true = self.true_parsed[i]
        sentence_test = self.test_parsed[i]
        back, a, b, c = self.Cyk.cyk(sentence_test)
        sentence = sentence_test.split(' ')
        result_test = "".join(
            ['((SENT (', get_parsed(sentence, back, a, b, c), ')))'])
        result_test = result_test[1:-1]
        print("Result sentence:")
        print(result_test)
        target = parser.create_from_bracket_string(sentence_true)
        predicted = parser.create_from_bracket_string(result_test)
        s = scorer.Scorer()
        result = s.score_trees(target, predicted)
        print('The recall is: ' + str(result.recall))
        print('The precision is: ' + str(result.prec))
        results.append(result_test)
    return results
def evaluate(sentence, reference):
    gold_tree = evalbparser.create_from_bracket_string(sentence[1:-1])
    test_tree = evalbparser.create_from_bracket_string(reference[1:-1])
    s = scorer.Scorer()
    result = s.score_trees(gold_tree, test_tree)
    return result.tag_accracy
def get_accuracy(self, target, predicted):
    gold_tree = parser.create_from_bracket_string(target)
    test_tree = parser.create_from_bracket_string(predicted)
    s1 = np.array(gold_tree.poss)
    s2 = np.array(test_tree.poss)
    acc = np.sum(s1 == s2) / s1.shape[0]
    return acc
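# For reference, the PYEVALB tree objects used above expose the POS and
# token sequences as lists; a small illustration (assuming `parser` is
# PYEVALB's parser module, as in the snippet above):
t = parser.create_from_bracket_string('(S (NP (DT the) (NN cat)))')
print(t.poss)      # expected: ['DT', 'NN']
print(t.sentence)  # expected: ['the', 'cat']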
def evaluate_predict(sentence, target_parse, cyk_module: cyk.CYKParser,
                     scorer: evalscorer.Scorer):
    """Parse `sentence` with the CYK module and score it against
    `target_parse`; returns (evalscorer.Result, predicted_string)."""
    predicted_string = cyk_module.cyk_parse(sentence)
    gold_tree = evalparser.create_from_bracket_string(target_parse)
    if "Failure" in predicted_string:
        # Parsing failed: return an empty Result flagged as an error (state 2)
        # instead of trying to build a tree from the failure message.
        result = evalscorer.Result()
        result.state = 2
    else:
        pred_tree = evalparser.create_from_bracket_string(predicted_string)
        result = scorer.score_trees(gold_tree, pred_tree)
    return result, predicted_string
def compute_precision(prediction_train, grammars_train):
    scorer = Scorer()
    pairs = []
    for i in range(len(prediction_train)):
        # Only predictions flagged as successfully parsed (flag == 1) are
        # scored; the others implicitly count as 0 in the average below.
        if prediction_train[i][1] == 1:
            pairs.append((prediction_train[i][0], grammars_train[i]))
    precision = [
        scorer.score_trees(parser.create_from_bracket_string(pred),
                           parser.create_from_bracket_string(real)).prec
        for (pred, real) in pairs
    ]
    return np.sum(precision) / len(grammars_train)
def get_grammar_from_file_new(seq_file):

    def no_quote_prod(prod):
        prod = re.sub('\'', '', str(prod))
        prod = re.sub(' +', ' ', prod)
        return prod.strip()

    print('Getting grammar from', seq_file)
    # grammar = None
    prod_counter = Counter()
    prods = []
    cnt_line = 0
    with open(seq_file, 'r') as f:
        for seq in f:
            cnt_line += 1
            tree = parser.create_from_bracket_string(seq)
            this_seq_prods, _ = tree.productions(skip_XX=True, skip_span=True)
            this_seq_prods = [no_quote_prod(prod) for prod in this_seq_prods]
            this_seq_prods = [
                prod for prod in this_seq_prods if 'XX ->' not in prod
            ]
            prods.extend(this_seq_prods)
            prod_counter.update(this_seq_prods)
    print('Done at', cnt_line, 'lines.')
    print('There are', len(set(prods)), 'productions')
    print('Top grammar:', prod_counter.most_common(10))
    print('')
    return set(prods), prod_counter
def evalb(parse1, parse2):
    from PYEVALB import scorer as pyscorer
    from PYEVALB import parser as pyparser
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    try:
        score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
    except Exception as e:
        print("Exception!")
        print(e)
        print(pyparse1)
        print(pyparse2)
        return 0
    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    return f1 * score.tag_accracy
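# Why the try/except above matters: PYEVALB refuses to score two trees
# whose token sequences differ, which is common when a CYK parser drops
# or rewrites words. A hedged illustration with invented trees (this
# should take the except branch and return 0):
print(evalb('(S (NP (DT the) (NN cat)))',
            '(S (NP (DT the) (NN cat) (NN cat)))'))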
def check_action2treeseq(self):
    instance = next(iter(self.train_iterator))
    action_str_lst = self.id2original(self.ACTIONS, instance.actions)
    pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)
    converted_seq = utils.action2treestr(action_str_lst, instance.raws[0],
                                         pos_tags)
    measure = scorer.Scorer()
    golden_seq = instance.raw_seq[0]
    gold_tree = parser.create_from_bracket_string(golden_seq)
    converted_tree = parser.create_from_bracket_string(converted_seq)
    ret = measure.score_trees(gold_tree, converted_tree)
    match_num = ret.matched_brackets
    gold_num = ret.gold_brackets
    pred_num = ret.test_brackets
    # Round-tripping the gold action sequence must reproduce the gold tree
    # exactly, i.e. all three bracket counts coincide.
    assert match_num == gold_num
    assert match_num == pred_num
def get_eval_metrics(self, instance, pred_action_ids):
    assert isinstance(pred_action_ids, list)
    pred_actions = self.id2original(self.ACTIONS, pred_action_ids)
    tokens = instance.raws[0]
    pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)
    measure = scorer.Scorer()
    golden_tree_seq = instance.raw_seq[0]
    gold_tree = parser.create_from_bracket_string(golden_tree_seq)
    try:
        pred_tree_seq = utils.action2treestr(pred_actions, tokens, pos_tags)
        pred_tree = parser.create_from_bracket_string(pred_tree_seq)
        ret = measure.score_trees(gold_tree, pred_tree)
    except Exception:
        # Conversion or scoring failed; callers must handle the scalar -1
        # instead of the usual (match, gold, pred) triple.
        return -1
    else:
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets
        return match_num, gold_num, pred_num
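# The (match, gold, pred) bracket counts returned above are the raw
# ingredients of a corpus-level EVALB F1. A small self-contained sketch of
# that aggregation; corpus_f1 and the sample counts are illustrative, not
# part of the original code:
def corpus_f1(triples):
    """triples: iterable of (match_num, gold_num, pred_num) tuples."""
    match = sum(t[0] for t in triples)
    gold = sum(t[1] for t in triples)
    pred = sum(t[2] for t in triples)
    precision = match / pred
    recall = match / gold
    return 2 * precision * recall / (precision + recall)


print(corpus_f1([(8, 10, 9), (5, 5, 6)]))  # ~0.867 on these made-up counts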
def score(true_parse, proposed_parse):
    """
    Description
    -----------------
    Evaluate parses with precision and recall over all non-terminals,
    and over POS tags only.

    Parameters
    -----------------
    true_parse, proposed_parse : Bracketed strings, the true and proposed
        parse trees.

    Returns
    -----------------
    parse_recall, parse_precision, pos_recall, pos_precision
    """
    # Strip the outer "( " and ")" of the SEQUOIA-style bracketing so that
    # PYEVALB can parse the trees.
    true_parse = true_parse[2:-1]
    proposed_parse = proposed_parse[2:-1]
    gold_tree = parser.create_from_bracket_string(true_parse)
    test_tree = parser.create_from_bracket_string(proposed_parse)

    # Compute recall and precision for POS tags
    y_true = np.array(gold_tree.poss)
    y_pred = np.array(test_tree.poss)
    y_pred = (y_true == y_pred).astype(int)
    y_true = np.ones(len(y_true)).astype(int)
    # The fourth value returned by precision_recall_fscore_support is the
    # support, not beta.
    (POS_precision, POS_recall, POS_f_score,
     _support) = precision_recall_fscore_support(y_true, y_pred, labels=[1])

    # Compute recall and precision for the whole parse
    thescorer = scorer.Scorer()
    result = thescorer.score_trees(gold_tree, test_tree)
    return (result.recall * 100, result.prec * 100,
            POS_recall[0] * 100, POS_precision[0] * 100)
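# Illustration of the [2:-1] slicing used above: SEQUOIA-style files wrap
# every tree in an extra "( ... )" that PYEVALB's parser does not accept.
# The sample line is invented:
sequoia_line = '( (SENT (NP (DET Le) (NC chat))))'
assert sequoia_line[2:-1] == '(SENT (NP (DET Le) (NC chat)))'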
def score(true_bracket, proposed_bracket):
    """Performs evaluation on a single parse tree.

    Args:
        true_bracket (str): reference parse tree for the current sentence
        proposed_bracket (str): proposed parse tree for the current sentence
    """
    gold_tree = evalbparser.create_from_bracket_string(true_bracket)
    test_tree = evalbparser.create_from_bracket_string(proposed_bracket)
    # Compute recall and precision for POS tags
    y_true = np.array(gold_tree.poss)
    y_pred = np.array(test_tree.poss)
    y_pred = (y_true == y_pred) * 1
    y_true = np.ones(len(y_true))
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[1])
    accuracy = accuracy_score(y_true, y_pred)
    return precision, recall, f_score, accuracy
evalb_scorer = scorer.Scorer()
recalls_corenlp = []
precs_corenlp = []
accs_corenlp = []
parsed_sents = nltk.corpus.treebank.parsed_sents()
skipped_sents = 0
sents_analyzed = 0
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'parse'],
                   output_format="json",
                   timeout=3000001,
                   endpoint='http://localhost:9001') as client:
    for i, s in enumerate(nltk.corpus.treebank.sents()):
        sent = detok.detokenize(s)
        corenlp_model = client.annotate(sent)
        gold_sent = parser.create_from_bracket_string(
            parsed_sents[i].pformat())
        parse_tree = parser.create_from_bracket_string(
            corenlp_model['sentences'][0]['parse'])
        try:
            scores = evalb_scorer.score_trees(gold_sent, parse_tree)
        except Exception:
            skipped_sents += 1
            continue
        recalls_corenlp.append(scores.recall)
        precs_corenlp.append(scores.prec)
        accs_corenlp.append(scores.tag_accracy)
        sents_analyzed += 1
        if sents_analyzed == 100:
            break
print("Results of the constituency parsing by CoreNLP in English")
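# The snippet above assumes a CoreNLP server is already listening on
# localhost:9001. A typical way to start one (the CoreNLP directory and
# the memory setting are assumptions):
#
#   java -mx4g -cp "<corenlp_dir>/*" \
#       edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#       -port 9001 -timeout 3000001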
test_input = f.read().splitlines()
with open(test_output_path, 'r') as f:
    test_output = f.read().splitlines()

# Compute metrics
precisions = []
recalls = []
lengths = []
failures = 0
bugs = 0
for gold, test, sent in zip(test_output, parsed_output, test_input):
    if test == 'No parsing found':
        failures += 1
    else:
        try:
            gold_tree = parser.create_from_bracket_string(gold[2:-1])
            test_tree = parser.create_from_bracket_string(test[2:-1])
            result = scorer.Scorer().score_trees(gold_tree, test_tree)
            len_sentence = len(sent.split())
            lengths.append(len_sentence)
            print('')
            print('Sentence length: ' + str(len_sentence))
            print('Recall = ' + str(result.recall))
            print('Precision = ' + str(result.prec))
            recalls.append(result.recall)
            precisions.append(result.prec)
        except Exception:
            bugs += 1
print('')
import re


def no_span_prod(rule):
    rule = re.sub(r'\(([^\)]+)\)', '', str(rule))
    rule = re.sub(r' +', ' ', rule)
    return rule.strip()


# gold = '(IP (NP (PN 这里)) (VP (ADVP (AD 便)) (VP (VV 产生) (IP (NP (QP (CD 一) (CLP (M 个))) (DNP (NP (JJ 结构性)) (DEG 的)) (NP (NN 盲点))) (PU :) (IP (VP (VV 臭味相投) (PU ,) (VV 物以类聚)))))) (PU 。))'
# test = '(IP (IP (NP (PN 这里)) (VP (ADVP (AD 便)) (VP (VV 产生) (NP (QP (CD 一) (CLP (M 个))) (DNP (ADJP (JJ 结构性)) (DEG 的)) (NP (NN 盲点)))))) (PU :) (IP (NP (NN 臭味相投)) (PU ,) (VP (VV 物以类聚))) (PU 。))'
gold = '(TOP (S (INTJ (XX No)) (XX ,) (NP (XX it)) (VP (XX was) (XX nt) (NP (XX Black) (XX Monday))) (XX .)))'
test = '(TOP (S (ADVP (XX No) ) (XX ,) (NP (XX it) ) (VP (XX was) (XX nt) (NP (XX Black) (XX Monday) ) ) (XX .) ) )'

gold_tree = parser.create_from_bracket_string(gold)
test_tree = parser.create_from_bracket_string(test)
gold_prods, gold_heights = gold_tree.productions(skip_XX=False,
                                                 skip_span=False)
test_prods, test_heights = test_tree.productions(skip_XX=False,
                                                 skip_span=False)

# pretty_print() draws the tree and returns None, so no assignment is needed
Tree.fromstring(gold).pretty_print()
Tree.fromstring(test).pretty_print()

print(gold_prods)
print(list(map(no_span_prod, gold_prods)))
print(gold_heights)
print(test_prods)
print(test_heights)
print('Difference = ', set(gold_prods) - set(test_prods))
outputs = f.readlines()
with open(TARGET, 'r') as f:
    targets = f.readlines()
assert len(outputs) == len(targets)

n_failures = 0
n_successes = 0
total_accuracy = 0
n = len(outputs)
for target, output in zip(targets, outputs):
    target = target.strip()
    output = output.strip()
    if output == '-':
        n_failures += 1
    else:
        n_successes += 1
        target_tree = evalb_parser.create_from_bracket_string(target[2:-1])
        output_tree = evalb_parser.create_from_bracket_string(output[2:-1])
        s = evalb_scorer.Scorer()
        result = s.score_trees(target_tree, output_tree)
        total_accuracy += result.tag_accracy
print('successes:', n_successes)
print('failures:', n_failures)
print('mean accuracy on successes:', total_accuracy / n_successes)
parsed_sentence = cyk_parser.parse(sentence)
if parsed_sentence is not None:
    test_sentences_bis.append(sentence)
    f.write('%s\n' % parsed_sentence)
print('Done')

# Get accuracy
# Get sentences parsed by our parser
with open('data/evaluation_data.parser.txt', 'r') as f:
    content = f.read()
parsed_sentences = content.split('\n')

# Remove the first two characters and the last bracket so that the
# PYEVALB parser accepts the strings
initial_parsed_sentences = []
parsed_sentences_final = []
for sent in test_sentences_bis:
    initial_parsed_sentences.append(sent[2:-1])
for sent in parsed_sentences:
    parsed_sentences_final.append(sent[2:-1])

# Put each pair in tree form, score it, and average the tag accuracies
# (PYEVALB scores one pair of trees at a time, not whole lists)
accuracies = []
for gold_str, pred_str in zip(initial_parsed_sentences,
                              parsed_sentences_final):
    initial_tree = parser.create_from_bracket_string(gold_str)
    my_tree = parser.create_from_bracket_string(pred_str)
    result = scorer.Scorer().score_trees(initial_tree, my_tree)
    accuracies.append(result.tag_accracy)
print('Accuracy on Evaluation set: ' + str(sum(accuracies) / len(accuracies)))
def evaluation():
    #####################################################################
    #                            Load data                              #
    #####################################################################
    with codecs.open("output.txt", 'r', 'UTF-8') as file:
        result = file.read()
    result = result.split()
    result_tree = []
    i = -1
    for r in result:
        if 'None' in r:
            result_tree.append('(SENT (NC <UNKNOWN>))')
            i += 1
        elif 'SENT' in r:
            result_tree.append(r)
            i += 1
        else:
            result_tree[i] = result_tree[i] + ' ' + r

    with codecs.open("sequoia_test_tree.txt", 'r', 'UTF-8') as file:
        truth = file.read()
    truth = truth.split()
    truth_tree = []
    i = -1
    for t in truth:
        if 'SENT' in t:
            truth_tree.append(t)
            i += 1
        else:
            truth_tree[i] = truth_tree[i] + ' ' + t

    assert len(result_tree) == len(truth_tree)
    N = len(result_tree)

    #####################################################################
    #                            Evaluation                             #
    #####################################################################
    recall = []
    precision = []
    Fscore = []
    tag_accuracy = []
    S = scorer.Scorer()
    with codecs.open("evaluation_data.parser_output", 'w', 'UTF-8') as fileOut:
        for i in range(N):
            t = parser.create_from_bracket_string(truth_tree[i])
            r = parser.create_from_bracket_string(result_tree[i])
            fileOut.write(" ".join(str(t.non_terminal_labels)))
            fileOut.write('\n')
            # Only score pairs whose token sequences match
            if t.sentence == r.sentence:
                scores = S.score_trees(t, r)
                recall.append(scores.recall)
                precision.append(scores.prec)
                Fscore.append(2 * scores.recall * scores.prec
                              / (scores.prec + scores.recall))
                tag_accuracy.append(scores.tag_accracy)
    print('Average recall: ', np.mean(recall))
    print('Average precision: ', np.mean(precision))
    print('Average F-score: ', np.mean(Fscore))
    print('Average tag accuracy: ', np.mean(tag_accuracy))
def get_diff_prods_no_span():
    print('Getting diff between', test_seqs_file, 'and', pred_seqs_file)
    diff = set()
    from collections import Counter
    diff_prods_counter = Counter()
    diff_heights = defaultdict(list)
    for test_line, pred_line in zip(test_seqs, pred_seqs):
        # print('test =', test_line)
        # print('pred =', pred_line)
        measure = scorer.Scorer()
        gold_tree = parser.create_from_bracket_string(test_line)
        pred_tree = parser.create_from_bracket_string(pred_line)
        ret = measure.score_trees(gold_tree, pred_tree)
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets
        # Only inspect sentences that are not a perfect bracketing match
        if match_num < gold_num or match_num < pred_num:
            gold_grammar, gold_heights = gold_tree.productions(
                skip_XX=False, skip_span=False)
            pred_grammar, _ = pred_tree.productions(
                skip_XX=False, skip_span=False)
            # Collect productions of the gold tree that are missing from
            # the predicted tree, together with their heights
            diff_prods = []
            diff_prods_heights = []
            for idx, prod in enumerate(gold_grammar):
                if prod not in pred_grammar:
                    diff_prods.append(prod)
                    diff_prods_heights.append(gold_heights[idx])
            for idx, prod in enumerate(diff_prods):
                diff_heights[no_span_prod(prod)].append(
                    diff_prods_heights[idx])
            diff_no_span_prods = set(
                [no_span_prod(prod) for prod in diff_prods])
            diff.update(diff_no_span_prods)
            diff_prods_counter.update(diff_no_span_prods)
    print(diff_prods_counter.most_common(10))
    print('There are', len(diff), 'distinct differing productions')
    print('Done')
    print('')
    return diff, diff_prods_counter, diff_heights
def evaluate_parser_multiprocess(pcfg, test_trees,
                                 filepath="parser_output.txt", write=True):
    """Evaluate the parser using multiprocessing.

    :param pcfg: parser pcfg to evaluate
    """
    y_true = []
    y_pred = []
    y_true_non_chomsky = []
    y_pred_non_chomsky = []
    y_true_parsable = []
    y_pred_parsable = []
    y_true_parsable_non_chomsky = []
    y_pred_parsable_non_chomsky = []
    recall_list = []
    precision_list = []
    lines = []
    # NOTE: kept from the original code -- only the first 5 trees are scored
    test_trees = test_trees[:5]
    if write:
        with open(filepath, 'w') as file:
            file.write("")
        with open("non-parsable", 'w') as file:
            file.write("")
    list_sentence = []
    for c, tree in enumerate(test_trees):
        list_sentence.append(list(tree.flatten()))

    # Parsing, multiprocess:
    n_job = multiprocessing.cpu_count()
    start = time.time()
    with Pool(n_job) as p:
        result_trees = p.map(pcfg.CYK, list_sentence)
    print(f"Parsing time is {time.time() - start}")

    # Analysis of the result
    nb_non_parsable = 0
    list_non_parsable = []
    for (c, tree) in enumerate(test_trees):
        test_sentence = list(tree.flatten())
        parsed_tree = result_trees[c]
        # If the sentence is parsable
        if parsed_tree:
            y_true.extend(get_leaves(tree))
            y_pred.extend(get_leaves(parsed_tree))
            y_true_parsable.extend(get_leaves(tree))
            y_pred_parsable.extend(get_leaves(parsed_tree))
            tree.un_chomsky_normal_form(unaryChar="&")
            parsed_tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(get_leaves(parsed_tree))
            y_true_parsable_non_chomsky.extend(get_leaves(tree))
            y_pred_parsable_non_chomsky.extend(get_leaves(parsed_tree))
            lines.append('( ' + ' '.join(str(parsed_tree).split()) + ')')
            parsed_tree_str = ' '.join(str(parsed_tree).split())
            test_sentence_str = ' '.join(str(tree[0]).split())
            target_tree = parser.create_from_bracket_string(test_sentence_str)
            predicted_tree = parser.create_from_bracket_string(parsed_tree_str)
            s = scorer.Scorer()
            try:
                result = s.score_trees(target_tree, predicted_tree)
                recall_list.append(result.recall)
                precision_list.append(result.prec)
            except Exception:
                print("No recall or precision")
            if write:
                with open(filepath, 'a') as file:
                    file.write(lines[-1] + "\n")
        # If the sentence is not parsable
        else:
            aux = get_leaves(tree)
            y_true.extend(aux)
            y_pred.extend(["None" for k in range(len(aux))])
            tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(
                ["None" for k in range(len(get_leaves(tree)))])
            nb_non_parsable += 1
            list_non_parsable.append(test_sentence)
            if write:
                with open(filepath, 'a') as file:
                    file.write("\n")
                with open("non-parsable", 'a') as file:
                    file.write('( ' + ' '.join(str(tree).split()) + ')' + "\n")
    print('Nb non-parsable: {}'.format(nb_non_parsable))
    print('Total accuracy (Chomsky) on dev set: {}'.format(
        accuracy(y_pred, y_true)))
    print("Total accuracy (non-Chomsky) on dev set: {}".format(
        accuracy(y_true_non_chomsky, y_pred_non_chomsky)))
    print('Parsable accuracy (Chomsky) on dev set: {}'.format(
        accuracy(y_pred_parsable, y_true_parsable)))
    print("Parsable accuracy (non-Chomsky) on dev set: {}".format(
        accuracy(y_true_parsable_non_chomsky, y_pred_parsable_non_chomsky)))
    print("Mean recall {} and mean precision {}".format(
        np.mean(recall_list), np.mean(precision_list)))
for k, sentence in enumerate(train_data):
    # sentence = train_data[4]
    s_input = extract_sentence(sentence)
    s_target = remove_functional_labels(sentence).strip()
    s_output = parser.parse(s_input)
    if not s_output:
        continue
    print('input --> ', s_input)
    print('input labels:', s_target)
    print('output -->', extract_sentence(s_output))
    print('output labels:', s_output)
    target_tree = evalb_parser.create_from_bracket_string(s_target[1:-1])
    output_tree = evalb_parser.create_from_bracket_string(s_output[1:-1])
    # print(target_tree)
    # print(output_tree)
    try:
        s = evalb_scorer.Scorer()
        result = s.score_trees(target_tree, output_tree)
        print(
            f'sentence {k}, precision={result.prec}, recall={result.recall}'
        )
        total_precision += result.prec
        total_recall += result.recall
        print(
tac = time.time()
print("Done in " + str(round(tac - tic, 2)) + "sec\n")
with open('results/evaluation_data.parser_output', 'a') as f:
    if my_parsing is None:
        f.write("Found no viable parsing." + "\n")
    else:
        f.write(my_parsing + "\n")

if my_parsing is not None:
    # PYEVALB works if we remove the first and last brackets of the SEQUOIA
    # format and the extra spaces that come with them
    real_parsing = real_parsing[2:-1]
    my_parsing = my_parsing[2:-1]
    print("Score PYEVALB:")
    real_tree = parser.create_from_bracket_string(real_parsing)
    test_tree = parser.create_from_bracket_string(my_parsing)
    result = scorer.Scorer().score_trees(real_tree, test_tree)
    print('accuracy ' + str(result.tag_accracy))
    # For evaluation on the whole corpus, we save real_parsing and
    # my_parsing in new files without the first and last brackets
    with open('results/real_parsings_test_for_eval.txt', 'a') as f:
        f.write(real_parsing + "\n")
    with open('results/my_parsings_test_for_eval.txt', 'a') as f:
        f.write(my_parsing + "\n")
    save_scores(
        'results/real_parsings_test_for_eval.txt',
        'results/my_parsings_test_for_eval.txt',