def conll_eval(system_file, gold_file):
    # Score a system CoNLL-U file against the gold file with the CoNLL 2017 scorer.
    gold_ud = load_conllu_file(gold_file)
    # e.g. "/work/work/taa/corpus/ud-test-v2.0-conll2017/gold/conll17-ud-test-2017-05-09/ro.conllu"
    system_ud = load_conllu_file(system_file)
    # e.g. "/work/work/sq2/sequence_labeler/ro.conllu"
    return evaluate(gold_ud, system_ud, None)
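
# A minimal usage sketch for conll_eval (file names are hypothetical;
# load_conllu_file and evaluate are assumed to be imported from the CoNLL 2017
# shared-task scorer, conll17_ud_eval):
#
#   scores = conll_eval("system_output.conllu", "gold.conllu")
#   print("LAS F1: {:.2f}".format(100 * scores["LAS"].f1))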

def benchmark_model(model_name, test_data_path, ner_test_data):
    # Read the gold CoNLL-U file and reassemble the raw text from sentence metadata.
    with open(test_data_path) as f:
        data = conllu.parse(f.read())
    text = " ".join(d.metadata["text"] for d in data)

    # Import the model package by name and load its pipeline.
    load_model = getattr(importlib.import_module(model_name), "load")
    nlp = load_model()

    # Parse the text, serialize it as CoNLL-U, and score it against the gold file.
    _parsed = StringIO(format_as_conllu(nlp(text), 1))
    parsed = conll17_ud_eval.load_conllu(_parsed)
    gold = conll17_ud_eval.load_conllu_file(test_data_path)

    results = pd.DataFrame(
        {k: v.__dict__ for k, v in conll17_ud_eval.evaluate(gold, parsed).items()}
    ).T
    print(results)

    # Score NER on the annotated test sentences with spaCy's Scorer.
    diterator = DataIterator()
    test_sents = list(itertools.islice(diterator.tagged_sentences(ner_test_data), None))

    scorer = Scorer()
    for sentence, annot in test_sents:
        doc_gold_text = nlp.make_doc(sentence)
        gold = GoldParse(doc_gold_text, entities=annot)
        predicted = nlp(sentence)
        scorer.score(predicted, gold)
    print(scorer.scores)
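
# Hypothetical call of benchmark_model above. The first argument is an importable
# package exposing a load() callable (a spaCy model package, for instance); file
# names are placeholders. conllu, pandas, spacy.scorer.Scorer and
# spacy.gold.GoldParse (spaCy v2) are assumed to be imported at module level:
#
#   benchmark_model("my_model_package", "ud-test.conllu", "ner-test.tsv")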

def score_las(filename_test, filename_gold):
    # Return the LAS F1 score of a parsed file against the gold file.
    with open(filename_test, 'r') as ftest, \
            open(filename_gold, 'r') as fgold:
        test = las.load_conllu(ftest)
        gold = las.load_conllu(fgold)
        score = las.evaluate(gold, test)['LAS'].f1
    return score
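
# Hypothetical usage of score_las, assuming `las` is the CoNLL 2017 scorer module
# (e.g. imported as `import conll17_ud_eval as las`); paths are placeholders:
#
#   las_f1 = score_las("parsed.conllu", "gold.conllu")
#   print("LAS: {:.2%}".format(las_f1))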

def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    with text_loc.open("r", encoding="utf8") as text_file:
        texts = split_text(text_file.read())
        docs = list(nlp.pipe(texts))
    with sys_loc.open("w", encoding="utf8") as out_file:
        write_conllu(docs, out_file)
    with gold_loc.open("r", encoding="utf8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
    with sys_loc.open("r", encoding="utf8") as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file)
    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
    return scores
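
# Hypothetical call of evaluate above. The *_loc arguments must be pathlib.Path
# objects (they are opened via .open()), nlp is a loaded spaCy pipeline, and
# split_text / write_conllu are helpers assumed to be defined in this module:
#
#   from pathlib import Path
#   scores = evaluate(nlp, Path("input.txt"), Path("gold.conllu"), Path("sys.conllu"))
#   print({name: s.f1 for name, s in scores.items()})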

def benchmark_model(model_name, test_data_path):
    with open(test_data_path) as f:
        data = conllu.parse(f.read())
    text = " ".join(d.metadata["text"] for d in data)

    load_model = getattr(importlib.import_module(model_name), "load")
    nlp = load_model()

    _parsed = StringIO(format_as_conllu(nlp(text), 1))
    parsed = conll17_ud_eval.load_conllu(_parsed)
    gold = conll17_ud_eval.load_conllu_file(test_data_path)

    results = pd.DataFrame(
        {k: v.__dict__ for k, v in conll17_ud_eval.evaluate(gold, parsed).items()}
    ).T
    print(results)

def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    if text_loc.parts[-1].endswith(".conllu"):
        # Input is already tokenized: build Docs from the gold words and run
        # only the pipeline components, bypassing the tokenizer.
        docs = []
        with text_loc.open() as file_:
            for conllu_doc in read_conllu(file_):
                for conllu_sent in conllu_doc:
                    words = [line[1] for line in conllu_sent]
                    docs.append(Doc(nlp.vocab, words=words))
        for name, component in nlp.pipeline:
            docs = list(component.pipe(docs))
    else:
        # Raw text input: tokenize and parse from scratch.
        with text_loc.open("r", encoding="utf8") as text_file:
            texts = split_text(text_file.read())
        docs = list(nlp.pipe(texts))
    with sys_loc.open("w", encoding="utf8") as out_file:
        write_conllu(docs, out_file)
    with gold_loc.open("r", encoding="utf8") as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
    with sys_loc.open("r", encoding="utf8") as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file)
    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
    return docs, scores

def run_single_eval(nlp, loading_time, print_name, text_path, gold_ud, tmp_output_path,
                    out_file, print_header, check_parse, print_freq_tasks):
    """Run an evaluation of a model nlp on a certain specified treebank."""
    with text_path.open(mode='r', encoding='utf-8') as f:
        flat_text = f.read()

    # STEP 1: tokenize text
    tokenization_start = time.time()
    texts = split_text(flat_text)
    docs = list(nlp.pipe(texts))
    tokenization_end = time.time()
    tokenization_time = tokenization_end - tokenization_start

    # STEP 2: record stats and timings
    tokens_per_s = int(len(gold_ud.tokens) / tokenization_time)

    print_header_1 = ['date', 'text_path', 'gold_tokens', 'model', 'loading_time',
                      'tokenization_time', 'tokens_per_s']
    print_string_1 = [str(datetime.date.today()), text_path.name, len(gold_ud.tokens),
                      print_name, "%.2f" % loading_time, "%.2f" % tokenization_time,
                      tokens_per_s]

    # STEP 3: evaluate predicted tokens and features
    with tmp_output_path.open(mode="w", encoding="utf8") as tmp_out_file:
        write_conllu(docs, tmp_out_file)
    with tmp_output_path.open(mode="r", encoding="utf8") as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file, check_parse=check_parse)
    tmp_output_path.unlink()
    scores = conll17_ud_eval.evaluate(gold_ud, sys_ud, check_parse=check_parse)

    # STEP 4: format the scoring results
    eval_headers = EVAL_PARSE
    if not check_parse:
        eval_headers = EVAL_NO_PARSE

    for score_name in eval_headers:
        score = scores[score_name]
        print_string_1.extend(["%.2f" % score.precision,
                               "%.2f" % score.recall,
                               "%.2f" % score.f1])
        print_string_1.append("-" if score.aligned_accuracy is None
                              else "%.2f" % score.aligned_accuracy)
        print_string_1.append("-" if score.undersegmented is None
                              else "%.4f" % score.under_perc)
        print_string_1.append("-" if score.oversegmented is None
                              else "%.4f" % score.over_perc)

        print_header_1.extend([score_name + '_p', score_name + '_r', score_name + '_F',
                               score_name + '_acc', score_name + '_under',
                               score_name + '_over'])

        if score_name in print_freq_tasks:
            print_header_1.extend([score_name + '_word_under_ex',
                                   score_name + '_shape_under_ex',
                                   score_name + '_word_over_ex',
                                   score_name + '_shape_over_ex'])

            d_under_words = get_freq_tuples(score.undersegmented, PRINT_TOTAL)
            d_under_shapes = get_freq_tuples([word_shape(x) for x in score.undersegmented],
                                             PRINT_TOTAL)
            d_over_words = get_freq_tuples(score.oversegmented, PRINT_TOTAL)
            d_over_shapes = get_freq_tuples([word_shape(x) for x in score.oversegmented],
                                            PRINT_TOTAL)

            # The CSV uses ';' as separator, so mask literal ';' in the example output.
            print_string_1.append(
                str({k: v for k, v in d_under_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_under_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_over_words if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))
            print_string_1.append(
                str({k: v for k, v in d_over_shapes if v > PRINT_FREQ}).replace(";", "*SEMICOLON*"))

    # STEP 5: print the formatted results to CSV
    if print_header:
        out_file.write(';'.join(map(str, print_header_1)) + '\n')
    out_file.write(';'.join(map(str, print_string_1)) + '\n')
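
# A hedged driver sketch for run_single_eval: gold_ud comes from the scorer's
# load_conllu, and EVAL_PARSE / EVAL_NO_PARSE / PRINT_FREQ / PRINT_TOTAL are
# module-level constants assumed to exist alongside this function. All paths
# and the model name are placeholders:
#
#   with Path("gold.conllu").open(encoding="utf8") as f:
#       gold_ud = conll17_ud_eval.load_conllu(f, check_parse=True)
#   with Path("results.csv").open("w", encoding="utf8") as out_file:
#       run_single_eval(nlp, loading_time, "my_model", Path("input.txt"), gold_ud,
#                       Path("tmp.conllu"), out_file, print_header=True,
#                       check_parse=True, print_freq_tasks=["Tokens"])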

def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("truth", type=str, help="Directory name of the truth dataset.")
    parser.add_argument("system", type=str, help="Directory name of system output.")
    parser.add_argument("output", type=str, help="Directory name of the output directory.")
    args = parser.parse_args()

    # Load input dataset metadata.json
    with open(args.truth + "/metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)

    # Evaluate and compute the sum over all treebanks
    metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats",
               "AllTags", "Lemmas", "UAS", "LAS", "CLAS"]
    treebanks = 0
    summation = {}
    results = []
    results_las = {}
    for entry in metadata:
        treebanks += 1
        ltcode, goldfile, outfile = entry['ltcode'], entry['goldfile'], entry['outfile']

        # Load gold data
        try:
            gold = load_conllu_file(args.truth + "/" + goldfile)
        except:
            results.append((ltcode + "-Status", "Error: Cannot load gold file"))
            continue

        # Load system data
        try:
            system = load_conllu_file(args.system + "/" + outfile)
        except UDError as e:
            if e.args[0].startswith("There is a cycle"):
                results.append((ltcode + "-Status",
                                "Error: There is a cycle in generated CoNLL-U file"))
                continue
            if e.args[0].startswith("There are multiple roots"):
                results.append((ltcode + "-Status",
                                "Error: There are multiple roots in a sentence in generated CoNLL-U file"))
                continue
            results.append((ltcode + "-Status",
                            "Error: There is a format error (tabs, ID values, etc) in generated CoNLL-U file"))
            continue
        except:
            results.append((ltcode + "-Status", "Error: Cannot open generated CoNLL-U file"))
            continue

        # Check for correctness
        if not system.characters:
            results.append((ltcode + "-Status", "Error: The system file is empty"))
            continue
        if system.characters != gold.characters:
            results.append((ltcode + "-Status",
                            "Error: The concatenation of tokens in gold file and in system file differ, "
                            "system file has {} nonspace characters, which is approximately {}% of the gold file"
                            .format(len(system.characters),
                                    int(100 * len(system.characters) / len(gold.characters)))))
            continue

        # Evaluate
        try:
            evaluation = evaluate(gold, system)
        except:
            # Should not happen
            results.append((ltcode + "-Status",
                            "Error: Cannot evaluate generated CoNLL-U file, internal error"))
            continue

        # Generate output metrics and compute sum
        results.append((ltcode + "-Status",
                        "OK: Evaluated non-zero LAS F1 score" if evaluation["LAS"].f1 > 0
                        else "Error: Evaluated zero LAS F1 score"))
        for metric in metrics:
            results.append((ltcode + "-" + metric + "-F1",
                            "{:.2f}".format(100 * evaluation[metric].f1)))
            summation[metric] = summation.get(metric, 0) + evaluation[metric].f1
        results_las[ltcode] = evaluation["LAS"].f1

    # Compute averages
    for metric in reversed(metrics):
        results.insert(0, ("total-" + metric + "-F1",
                           "{:.2f}".format(100 * summation.get(metric, 0) / treebanks)))

    # Generate evaluation.prototext
    with open(args.output + "/evaluation.prototext", "w") as evaluation:
        for key, value in results:
            print('measure{{\n key: "{}"\n value: "{}"\n}}'.format(key, value),
                  file=evaluation)

    # Generate LAS-F1 + Status on stdout, Status on stderr
    for key, value in results:
        if not key.endswith("-Status"):
            continue
        ltcode = key[:-len("-Status")]
        print("{:13} LAS:{:6.2f} ({})".format(ltcode, 100 * results_las.get(ltcode, 0.), value),
              file=sys.stdout)
        print("{:13} {}".format(ltcode, value), file=sys.stderr)
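
# Hypothetical invocation of the TIRA-style wrapper above, assuming it is saved
# as evaluate_all.py and guarded by `if __name__ == "__main__": main()`. It
# writes evaluation.prototext into the output directory and prints per-treebank
# LAS and status lines to stdout/stderr:
#
#   python evaluate_all.py truth/ system/ output/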