def evaluate_ner(self, model, eval_set, ent_types):
    """Evaluate the performance of a Named Entity model.

    Arguments:
        model (spacy model object) -- trained Named Entity model to evaluate
        eval_set (list) -- evaluation set passed in the format
            [["<doc_text>", {"entities": [[<start_pos>, <end_pos>, "<ENTITY_TYPE>"],
                                          [<start_pos>, <end_pos>, "<ENTITY_TYPE>"]]}]]
        ent_types (list) -- list of entity types to extract

    Returns:
        (spacy.scorer.Scorer.scores) -- scored metrics for the model
    """
    scorer = Scorer()
    for data, expected_result in eval_set:
        selected_entities = []
        for ent in expected_result.get('entities'):
            if ent[-1] in ent_types:
                selected_entities.append(ent)
        ground_truth_text = model.make_doc(data)
        ground_truth = GoldParse(ground_truth_text, entities=selected_entities)
        pred_value = model(data)
        scorer.score(pred_value, ground_truth)
    return scorer.scores
def evaluate(self, verbose=1):
    """Run evaluation on the test data.

    Parameters
    ----------
    verbose : int
        if 1, print out the wrongly predicted cases
    """
    scorer = Scorer()
    wrong_case = 0
    for input_, annot in self.data:
        doc_gold_text = self.nlp.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = self.nlp(input_)
        current_score = scorer.ents_f
        scorer.score(pred_value, gold)
        # If the F-score dropped after scoring this document, the
        # prediction contained at least one error.
        if current_score > scorer.ents_f:
            wrong_case += 1
            if verbose == 1:
                print_beauty_NER(prediction_to_IOB(pred_value, gold))
    return scorer.scores  # wrong_case and len(self.data) are also available
def benchmark_model(model_name, test_data_path, ner_test_data):
    with open(test_data_path) as f:
        data = conllu.parse(f.read())
    text = " ".join(d.metadata["text"] for d in data)

    load_model = getattr(importlib.import_module(model_name), "load")
    nlp = load_model()

    _parsed = StringIO(format_as_conllu(nlp(text), 1))
    parsed = conll17_ud_eval.load_conllu(_parsed)
    gold = conll17_ud_eval.load_conllu_file(test_data_path)

    results = pd.DataFrame(
        {k: v.__dict__ for k, v in conll17_ud_eval.evaluate(gold, parsed).items()}
    ).T
    print(results)

    diterator = DataIterator()
    test_sents = list(itertools.islice(diterator.tagged_sentences(ner_test_data), None))
    scorer = Scorer()
    for sentence, annot in test_sents:
        doc_gold_text = nlp.make_doc(sentence)
        gold = GoldParse(doc_gold_text, entities=annot)
        predicted = nlp(sentence)
        scorer.score(predicted, gold)
    print(scorer.scores)
def evaluate_entity(self, nlp):
    """Evaluate a given model with spaCy's GoldParse.

    Keyword arguments:
    nlp -- the trained model to evaluate (test data comes from self.test_data)

    Return:
    scorer.ents_p -- precision score
    scorer.ents_r -- recall score
    scorer.ents_f -- F1 score
    """
    logging.info("Started evaluating entities...")
    examples = self.test_data
    start_time = time.perf_counter()
    scorer = Scorer()
    for text, annotations, _ in examples:
        doc_gold = nlp.make_doc(text)
        entity = annotations.get("entities")
        gold = GoldParse(doc_gold, entities=entity)
        scorer.score(nlp(text), gold)
    logging.debug("Testing data took %d seconds to run",
                  time.perf_counter() - start_time)
    logging.debug("Score for entity is: p:%f,r:%f,f:%f",
                  scorer.ents_p, scorer.ents_r, scorer.ents_f)
    logging.info("Finished evaluating entities")
    return scorer.ents_p, scorer.ents_r, scorer.ents_f
def spacy_evaluator(ner_model, examples):
    """Evaluate the created NER model using different metrics.

    Args:
        ner_model (spacy model object): the trained NER model
        examples (list): testing examples

    Returns:
        score_object (dict): the scorer's metrics, or None on failure
    """
    try:
        score_object = Scorer()
        for base_input, annotations in examples:
            doc_for_gold = ner_model.make_doc(base_input)
            gold = GoldParse(doc_for_gold, entities=annotations)
            prediction = ner_model(base_input)
            # evaluation
            score_object.score(prediction, gold)
    except Exception as e:
        print("Unable to evaluate model with error: " + str(e))
        return None
    return score_object.scores
def evaluate(self, examples):
    scorer = Scorer()
    for input_, annot in examples:
        gold = self.make_gold(input_, annot)
        doc = self(input_)
        scorer.score(doc, gold)
    return scorer.scores
def predict(self, list_data):
    """Perform prediction on a given dataset.

    :param list_data: list of data in the format expected by spaCy,
        e.g. [("This is a nice summer", {"entities": [(15, 21, "SEASON")]})]
    :return: dict_performance - dictionary whose keys are precision, recall
        and F1, and whose values are the corresponding metric values
    """
    # load the customized NER model
    nlp_custom = spacy.load(self.output_dir)
    # instantiate the scorer
    scorer = Scorer()
    # loop over the given data
    for input_, annotation_ in list_data:
        doc_gold_text = nlp_custom.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annotation_['entities'])
        pred_value = nlp_custom(input_)
        scorer.score(pred_value, gold)
    # create the dictionary to be returned...
    dict_perf = scorer.scores
    dict_perf_out = {
        'precision': dict_perf['ents_p'],
        'recall': dict_perf['ents_r'],
        'F1': dict_perf['ents_f'],
    }
    self.dict_performance = dict_perf_out
    # ... and return it
    return self.dict_performance
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    nlp = Language(data_dir=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def evaluate(self, test_data):
    scorer = Scorer()
    for input_, annot in test_data:
        gold = self._sent_to_goldparse(input_, annot)
        predicted = self.nlp(input_)
        scorer.score(predicted, gold)
    return scorer.scores
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def ner_eval(model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        gold_text = model.make_doc(input_)
        gold = GoldParse(gold_text, entities=annot)
        pred_val = model(input_)
        scorer.score(pred_val, gold)
    return scorer.scores
def evaluate(ner_model, examples):
    scorer = Scorer()
    for x in examples:
        doc_gold_text = ner_model.make_doc(x[0])
        gold = GoldParse(doc_gold_text, entities=x[1]['entities'])
        pred_value = ner_model(x[0])
        scorer.score(pred_value, gold)
    return scorer.scores
def get_spacy_accuracy(spacy_pos_model, spacy_test_list, eval_punct=False):
    scorer = Scorer(eval_punct=eval_punct)
    for tokens, label in spacy_test_list:
        doc = Doc(spacy_pos_model.vocab, words=tokens)
        gold = GoldParse(doc, words=tokens, tags=label)
        processed = spacy_pos_model.tagger(doc)
        scorer.score(processed, gold)
    return scorer.scores['tags_acc']
def score_model(nlp, gold_docs, verbose=False):
    scorer = Scorer()
    for _, gold_doc in gold_docs:
        for annot_tuples, _ in gold_doc:
            tokens = nlp(list(annot_tuples[1]), tags=list(annot_tuples[2]))
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annotations in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annotations.get('entities'))
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
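# --- Usage sketch (not from the original sources): how the evaluate()
# pattern above is typically driven. Assumes spaCy v2.x, where GoldParse
# and Scorer.score(doc, gold) are available; the model name, sentence,
# and entity offsets below are illustrative assumptions.
import spacy
from spacy.gold import GoldParse  # spaCy v2.x location
from spacy.scorer import Scorer

nlp = spacy.load("en_core_web_sm")  # any pipeline with an NER component
examples = [
    ("Apple is opening a store in San Francisco",
     {"entities": [(0, 5, "ORG"), (28, 41, "GPE")]}),
]
scores = evaluate(nlp, examples)
print(scores["ents_p"], scores["ents_r"], scores["ents_f"])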
def evaluate(ner_model, test_data):
    scorer = Scorer()
    for input_, annot in test_data:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
def evaluate(model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
        pred_value = model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
def test_ner_per_type(en_vocab):
    # Gold and Doc are identical
    scorer = Scorer()
    examples = []
    for input_, annot in test_ner_cardinal:
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
            ents=["B-CARDINAL", "O", "B-CARDINAL"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False
        example.reference[1].is_sent_start = False
        examples.append(example)
    results = scorer.score(examples)
    assert results["ents_p"] == 1.0
    assert results["ents_r"] == 1.0
    assert results["ents_f"] == 1.0
    assert results["ents_per_type"]["CARDINAL"]["p"] == 1.0
    assert results["ents_per_type"]["CARDINAL"]["r"] == 1.0
    assert results["ents_per_type"]["CARDINAL"]["f"] == 1.0

    # Doc has one missing and one extra entity
    # Entity type MONEY is not present in Doc
    scorer = Scorer()
    examples = []
    for input_, annot in test_ner_apple:
        doc = Doc(
            en_vocab,
            words=input_.split(" "),
            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
        )
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False
        example.reference[1].is_sent_start = False
        examples.append(example)
    results = scorer.score(examples)
    assert results["ents_p"] == approx(0.6666666)
    assert results["ents_r"] == approx(0.6666666)
    assert results["ents_f"] == approx(0.6666666)
    assert "GPE" in results["ents_per_type"]
    assert "MONEY" in results["ents_per_type"]
    assert "ORG" in results["ents_per_type"]
    assert results["ents_per_type"]["GPE"]["p"] == 1.0
    assert results["ents_per_type"]["GPE"]["r"] == 1.0
    assert results["ents_per_type"]["GPE"]["f"] == 1.0
    assert results["ents_per_type"]["MONEY"]["p"] == 0
    assert results["ents_per_type"]["MONEY"]["r"] == 0
    assert results["ents_per_type"]["MONEY"]["f"] == 0
    assert results["ents_per_type"]["ORG"]["p"] == 0.5
    assert results["ents_per_type"]["ORG"]["r"] == 1.0
    assert results["ents_per_type"]["ORG"]["f"] == approx(0.6666666)
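# --- Stand-alone sketch of the spaCy v3 scoring API exercised by the test
# above. Assumes spaCy >= 3.0, where GoldParse is gone and Scorer.score()
# takes a list of Example objects; the model name and data are illustrative
# assumptions, not from the original source.
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
examples = []
for text, annots in [
    ("Apple is opening a store in San Francisco",
     {"entities": [(0, 5, "ORG"), (28, 41, "GPE")]}),
]:
    pred_doc = nlp(text)  # predicted side
    examples.append(Example.from_dict(pred_doc, annots))  # reference built from dict
results = Scorer().score(examples)
print(results["ents_p"], results["ents_r"], results["ents_f"])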
def evaluate(self, model, force=False):
    golds = self.get_golds(model, True)
    parsed = list(model.pipe(self.texts))
    scorer = Scorer()
    for p, g in zip(parsed, golds):
        scorer.score(p, g)
    return scorer.scores
def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        annot = annot["entities"]
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
def evaluate(nerModel, examples):
    scorer = Scorer()
    for input_, annot in examples:
        docGoldText = nerModel.make_doc(input_)
        gold = GoldParse(docGoldText, entities=annot['entities'])
        predValue = nerModel(input_)
        scorer.score(predValue, gold)
    return scorer.scores
def model_evaluate(model, data):
    scorer = Scorer()
    for input_, annotations in data:
        doc_gold_text = model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annotations.get("entities"))
        pred_value = model(input_)
        scorer.score(pred_value, gold)
    return scorer.ents_per_type
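# --- Hypothetical driver for model_evaluate() above (assumes spaCy v2.1+,
# where Scorer exposes ents_per_type; the model name, text, and offsets are
# illustrative). The returned dict maps each entity label to its p/r/f.
import spacy

nlp = spacy.load("en_core_web_sm")
data = [("Google was founded in California",
         {"entities": [(0, 6, "ORG"), (22, 32, "GPE")]})]
for label, prf in model_evaluate(nlp, data).items():
    print(label, prf["p"], prf["r"], prf["f"])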
def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer
def evaluate(Language, dev_loc, model_dir, gold_preproc=False, verbose=True):
    assert not gold_preproc
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_docparse_file(dev_loc)
    scorer = Scorer()
    for raw_text, segmented_text, annot_tuples in gold_tuples:
        tokens = nlp(raw_text, merge_mwes=False)
        gold = GoldParse(tokens, annot_tuples)
        scorer.score(tokens, gold, verbose=verbose)
    return scorer
def evaluate(ner_model, examples):
    """Return the score for ner_model against the test set in examples."""
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
def evaluate(model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
    gold_tuples = read_docparse_file(train_loc)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)
    print("Itn.\tUAS\tNER F.\tTag %")
    for itn in range(n_iter):
        scorer = Scorer()
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            # Eval before train
            tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)
            if gold_preproc:
                sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
            else:
                sents = [nlp.tokenizer(raw_text)]
            for tokens in sents:
                gold = GoldParse(tokens, annot_tuples)
                nlp.tagger(tokens)
                nlp.parser.train(tokens, gold)
                if gold.ents:
                    nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc))
        random.shuffle(gold_tuples)
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
def score_model(vocab, tagger, parser, gold_docs, verbose=False):
    scorer = Scorer()
    for _, gold_doc in gold_docs:
        for (ids, words, tags, heads, deps, entities), _ in gold_doc:
            doc = Doc(vocab, words=words)
            tagger(doc)
            parser(doc)
            PseudoProjectivity.deprojectivize(doc)
            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
            scorer.score(doc, gold, verbose=verbose)
    return scorer
def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot.get('entities'))
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

# Example call:
# random.shuffle(TRAIN_COMBINED)
# print(evaluate(nlp, TRAIN_COMBINED[:100]))
def evaluate(ner_model, examples): """ Evaluate your spacy NER model by passing in the model and Test Data (in spacy format) """ scorer = Scorer() for input_, annot in examples: doc_gold_text = ner_model.make_doc(input_) gold = GoldParse(doc_gold_text, entities=annot['entities']) pred_value = ner_model(input_) scorer.score(pred_value, gold) return scorer.scores
def test_partial_annotation(en_tokenizer):
    pred_doc = en_tokenizer("a b c d e")
    pred_doc[0].tag_ = "A"
    pred_doc[0].pos_ = "X"
    pred_doc[0].set_morph("Feat=Val")
    pred_doc[0].dep_ = "dep"

    # unannotated reference
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    for key in scores:
        # cats doesn't have an unset state
        if key.startswith("cats"):
            continue
        assert scores[key] is None

    # partially annotated reference, not overlapping with predicted annotation
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    ref_doc[1].tag_ = "A"
    ref_doc[1].pos_ = "X"
    ref_doc[1].set_morph("Feat=Val")
    ref_doc[1].dep_ = "dep"
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 0.0
    assert scores["pos_acc"] == 0.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None

    # partially annotated reference, overlapping with predicted annotation
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    ref_doc[0].tag_ = "A"
    ref_doc[0].pos_ = "X"
    ref_doc[1].set_morph("Feat=Val")
    ref_doc[1].dep_ = "dep"
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 1.0
    assert scores["pos_acc"] == 1.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None
def train(Language, sentences, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(sentences))
    nlp = Language(data_dir=model_dir)
    for itn in range(n_iter):
        scorer = Scorer()
        for _1, _2, (_3, words, tags, heads, labels, ner) in sentences:
            tokens = nlp.tokenizer.tokens_from_list(words)
            assert len(words) == len(tokens) == len(heads)
            string_indices = [token.idx for token in tokens]
            heads = [string_indices[head] for head in heads]
            annot_tuples = (string_indices, words, tags, heads, labels, ner)
            nlp.tagger.tag_from_strings(tokens, tags)
            # Eval before train
            nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=False)
            # Make fresh tokens, and train
            tokens = nlp.tokenizer.tokens_from_list(words)
            nlp.tagger.tag_from_strings(tokens, tags)
            try:
                nlp.parser.train(tokens, GoldParse(tokens, annot_tuples))
            except AssertionError:
                continue
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las))
        random.shuffle(sentences)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = codecs.open(out_loc, "w", "utf8")
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)
            for t in tokens:
                out_file.write("%s\t%s\t%s\t%s\n" % (t.orth_, t.tag_, t.head.orth_, t.dep_))
    return scorer