def _evaluate_sequence_tagger(model,
                              sentences: List[Sentence],
                              eval_mini_batch_size: int = 32,
                              embeddings_in_memory: bool = True,
                              out_path: Path = None) -> (dict, float):
    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0
        batches = [sentences[x:x + eval_mini_batch_size]
                   for x in range(0, len(sentences), eval_mini_batch_size)]

        metric = Metric('Evaluation')

        lines: List[str] = []
        for batch in batches:
            batch_no += 1

            tags, loss = model.forward_labels_and_loss(batch)
            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag_label('predicted', tag)

                    # append both to file for evaluation
                    eval_line = '{} {} {} {}\n'.format(token.text,
                                                       token.get_tag(model.tag_type).value,
                                                       tag.value,
                                                       tag.score)
                    lines.append(eval_line)
                lines.append('\n')

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(model.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

        eval_loss /= len(sentences)

        if out_path is not None:
            with open(out_path, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        return metric, eval_loss
def _evaluate_sentence_for_text_classification(metric: Metric,
                                                available_labels: List[str],
                                                predictions: List[str],
                                                true_values: List[str]):
    for label in available_labels:
        if label in predictions and label in true_values:
            metric.add_tp(label)
        elif label in predictions and label not in true_values:
            metric.add_fp(label)
        elif label not in predictions and label in true_values:
            metric.add_fn(label)
        elif label not in predictions and label not in true_values:
            metric.add_tn(label)
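# A minimal usage sketch (not from the original sources): how the per-sentence
# helper above could be driven over a small multi-label batch. The label names
# and prediction lists are made up purely for illustration.
def _example_text_classification_metric() -> Metric:
    metric = Metric("Example")
    available_labels = ["sports", "politics", "tech"]
    gold_per_sentence = [["sports"], ["politics", "tech"]]
    predicted_per_sentence = [["sports", "tech"], ["politics"]]
    for predictions, true_values in zip(predicted_per_sentence, gold_per_sentence):
        _evaluate_sentence_for_text_classification(metric, available_labels,
                                                   predictions, true_values)
    return metric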
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454
    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
def _evaluate_text_classifier(model: flair.nn.Model,
                              sentences: List[Sentence],
                              eval_mini_batch_size: int = 32,
                              embeddings_in_memory: bool = False) -> (dict, float):
    with torch.no_grad():
        eval_loss = 0

        batches = [sentences[x:x + eval_mini_batch_size]
                   for x in range(0, len(sentences), eval_mini_batch_size)]

        metric = Metric('Evaluation')

        for batch in batches:
            labels, loss = model.forward_labels_and_loss(batch)

            clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss += loss

            for predictions, true_values in zip(
                    [[label.value for label in sent_labels] for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                for prediction in predictions:
                    if prediction in true_values:
                        metric.add_tp(prediction)
                    else:
                        metric.add_fp(prediction)
                for true_value in true_values:
                    if true_value not in predictions:
                        metric.add_fn(true_value)
                    else:
                        metric.add_tn(true_value)

        eval_loss /= len(sentences)

        return metric, eval_loss
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999
    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.4)
    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
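# A rough sketch (an assumption, not the actual flair.training_utils.Metric) of
# the per-class counters the three test variants above exercise. With the
# "correct decisions over all decisions" accuracy convention used here it
# reproduces the unrounded values of the first variant; the other variants
# assume different accuracy and rounding conventions from later library versions.
from collections import defaultdict


class SimpleMetric:
    def __init__(self, name: str):
        self.name = name
        self._tp = defaultdict(int)
        self._fp = defaultdict(int)
        self._tn = defaultdict(int)
        self._fn = defaultdict(int)

    def add_tp(self, cls): self._tp[cls] += 1
    def add_fp(self, cls): self._fp[cls] += 1
    def add_tn(self, cls): self._tn[cls] += 1
    def add_fn(self, cls): self._fn[cls] += 1

    def get_classes(self):
        return sorted(set(self._tp) | set(self._fp) | set(self._tn) | set(self._fn))

    def _counts(self, cls=None):
        # pool counts over all classes when no class is given (micro view)
        classes = [cls] if cls is not None else self.get_classes()
        return (sum(self._tp[c] for c in classes), sum(self._fp[c] for c in classes),
                sum(self._tn[c] for c in classes), sum(self._fn[c] for c in classes))

    def precision(self, cls=None):
        tp, fp, _, _ = self._counts(cls)
        return tp / (tp + fp) if tp + fp else 0.0

    def recall(self, cls=None):
        tp, _, _, fn = self._counts(cls)
        return tp / (tp + fn) if tp + fn else 0.0

    def f_score(self, cls=None):
        p, r = self.precision(cls), self.recall(cls)
        return 2 * p * r / (p + r) if p + r else 0.0

    def accuracy(self, cls=None):
        tp, fp, tn, fn = self._counts(cls)
        total = tp + fp + tn + fn
        return (tp + tn) / total if total else 0.0

    def micro_avg_f_score(self): return self.f_score()
    def micro_avg_accuracy(self): return self.accuracy()

    def macro_avg_f_score(self):
        return sum(self.f_score(c) for c in self.get_classes()) / len(self.get_classes())

    def macro_avg_accuracy(self):
        return sum(self.accuracy(c) for c in self.get_classes()) / len(self.get_classes())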
def evaluate(gold_file: Path, pred_file: Path,
             match_func: Callable[[Tuple, List], Tuple]) -> Metric:
    gold_annotations = read_annotations(gold_file)
    pred_annotations = read_annotations(pred_file)

    metric = Metric("Evaluation", beta=1)

    copy_gold = copy_dict(gold_annotations)
    for document_id, annotations in pred_annotations.items():
        for pred_entry in annotations:
            # Documents may not contain any gold entity!
            if document_id in copy_gold:
                matched_gold = match_func(pred_entry, copy_gold[document_id])
            else:
                matched_gold = None

            if matched_gold:
                # Assert same document and same entity type!
                assert matched_gold[0] == pred_entry[0] and matched_gold[3] == pred_entry[3]
                copy_gold[document_id].remove(matched_gold)
                metric.add_tp(pred_entry[3])
            else:
                metric.add_fp(pred_entry[3])

    copy_pred = copy_dict(pred_annotations)
    for document_id, annotations in gold_annotations.items():
        for gold_entry in annotations:
            if document_id in copy_pred:
                matched_pred = match_func(gold_entry, copy_pred[document_id])
            else:
                matched_pred = None

            if not matched_pred:
                metric.add_fn(gold_entry[3])
            else:
                # Assert same document and same entity type!
                assert matched_pred[0] == gold_entry[0] and matched_pred[3] == gold_entry[3]
                copy_pred[document_id].remove(matched_pred)

    return metric
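# One possible match_func for the evaluate() call above. The entry layout is an
# assumption implied by the asserts: index 0 is the document id and index 3 the
# entity type; this simple variant only matches entries that are identical.
def exact_match(pred_entry: Tuple, candidate_entries: List) -> Tuple:
    """Return a candidate entry identical to the prediction, or None if there is no match."""
    for candidate in candidate_entries:
        if candidate == pred_entry:
            return candidate
    return None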
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embeddings_storage_mode: str = "none",
    eval_mode: EvalMode = EvalMode.Standard,
    misspell_mode: MisspellingMode = MisspellingMode.Random,
    misspelling_rate: float = 0.0,
    char_vocab: set = {},
    lut: dict = {},
    cmx: np.array = None,
    typos: dict = {},
    correction_mode: CorrectionMode = CorrectionMode.NotSpecified,
    eval_dict_name=None,
    evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
) -> (Result, float):

    if type(out_path) == str:
        out_path = Path(out_path)

    from robust_ner.spellcheck import load_correction_dict, get_lang_from_corpus_name

    if correction_mode == CorrectionMode.NotSpecified:
        eval_dict = None
    else:
        # note: use 'save_correction_dict' to re-generate a dictionary
        eval_dict = load_correction_dict(eval_dict_name, log)

    lang = get_lang_from_corpus_name(eval_dict_name)

    eval_params = {}
    eval_params["eval_mode"] = eval_mode
    eval_params["misspelling_rate"] = misspelling_rate
    eval_params["misspell_mode"] = misspell_mode
    eval_params["char_vocab"] = char_vocab
    eval_params["lut"] = lut
    eval_params["cmx"] = cmx
    eval_params["typos"] = typos
    eval_params["correction_mode"] = correction_mode
    eval_params["lang"] = lang
    eval_params["dictionary"] = eval_dict

    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0

        metric = Metric("Evaluation")

        lines: List[str] = []

        if self.use_crf:
            transitions = self.transitions.detach().cpu().numpy()
        else:
            transitions = None

        for batch in data_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch, eval_params)
                loss = self._calculate_loss(features, batch)
                tags, _ = self._obtain_labels(
                    feature=features,
                    batch_sentences=batch,
                    transitions=transitions,
                    get_all_tags=False,
                )

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag("predicted", tag.value, tag.score)

                    # append both to file for evaluation
                    eval_line = "{} {} {} {}\n".format(
                        token.text,
                        token.get_tag(self.tag_type).value,
                        tag.value,
                        tag.score,
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, tag.text) for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, tag.text) for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_no

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        if evaluation_metric == EvaluationMetric.MICRO_F1_SCORE:
            main_score = metric.micro_avg_f_score()
        elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
            main_score = metric.macro_avg_f_score()
        elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
            main_score = metric.micro_avg_accuracy()
        elif evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
            main_score = metric.macro_avg_accuracy()
        elif evaluation_metric == EvaluationMetric.MEAN_SQUARED_ERROR:
            main_score = metric.mean_squared_error()
        else:
            log.error(f"unknown evaluation metric: {evaluation_metric}")

        result = Result(
            main_score=main_score,
            log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{main_score:.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
    eval_loss = 0
    batch_no: int = 0

    metric = Metric("Evaluation", beta=self.beta)

    lines: List[str] = []

    y_true = []
    y_pred = []

    for batch in data_loader:

        # predict for batch
        loss = self.predict(batch,
                            embedding_storage_mode=embedding_storage_mode,
                            mini_batch_size=mini_batch_size,
                            label_name='predicted',
                            return_loss=True)
        eval_loss += loss
        batch_no += 1

        for sentence in batch:

            # make list of gold tags
            gold_spans = sentence.get_spans(self.tag_type)
            gold_tags = [(span.tag, repr(span)) for span in gold_spans]

            # make list of predicted tags
            predicted_spans = sentence.get_spans("predicted")
            predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)

            tags_gold = []
            tags_pred = []

            # also write to file in BIO format to use old conlleval script
            if out_path:
                for token in sentence:
                    # check if in gold spans
                    gold_tag = 'O'
                    for span in gold_spans:
                        if token in span:
                            gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_gold.append(gold_tag)

                    predicted_tag = 'O'
                    # check if in predicted spans
                    for span in predicted_spans:
                        if token in span:
                            predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_pred.append(predicted_tag)

                    lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                lines.append('\n')

            y_true.append(tags_gold)
            y_pred.append(tags_pred)

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    eval_loss /= batch_no

    detailed_result = (
        "\nResults:"
        f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
        f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
        '\n\nBy class:'
    )

    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"f1-score: "
            f"{metric.f_score(class_name):.4f}"
        )

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embedding_storage_mode: str = "none",
) -> (Result, float):
    if type(out_path) == str:
        out_path = Path(out_path)

    metric = Metric("Evaluation", beta=self.beta)
    parsing_metric = ParsingMetric()

    lines: List[str] = []

    eval_loss_arc = 0
    eval_loss_rel = 0

    for batch_idx, batch in enumerate(data_loader):
        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            loss_arc, loss_rel = self._calculate_loss(score_arc, score_rel, batch)
            arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

        parsing_metric(arc_prediction, relation_prediction, batch)

        eval_loss_arc += loss_arc
        eval_loss_rel += loss_rel

        for (sentence, arcs, sent_tags) in zip(batch, arc_prediction, relation_prediction):
            for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                token: Token = token
                token.add_tag_label("predicted", Label(tag))
                token.add_tag_label("predicted_head_id", Label(str(arc)))

                # append both to file for evaluation
                eval_line = "{} {} {} {} {}\n".format(
                    token.text,
                    token.tags['dependency'].value,
                    str(token.head_id),
                    tag,
                    str(arc),
                )
                lines.append(eval_line)
            lines.append("\n")

        for sentence in batch:
            # make list of gold tags
            gold_tags = [token.tags['dependency'].value for token in sentence.tokens]
            # make list of predicted tags
            predicted_tags = [tag.tag for tag in sentence.get_spans("predicted")]

            # count true positives, false positives and false negatives per relation label
            for tag_indx, predicted_tag in enumerate(predicted_tags):
                if predicted_tag == gold_tags[tag_indx]:
                    metric.add_tp(predicted_tag)
                else:
                    metric.add_fp(predicted_tag)

            for tag_indx, label_tag in enumerate(gold_tags):
                if label_tag != predicted_tags[tag_indx]:
                    metric.add_fn(label_tag)
                else:
                    metric.add_tn(label_tag)

        store_embeddings(batch, embedding_storage_mode)

    eval_loss_arc /= len(data_loader)
    eval_loss_rel /= len(data_loader)

    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    detailed_result = (
        f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
        f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
        f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
        f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
    )
    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss_arc + eval_loss_rel
def evaluate(
    self,
    sentences: Dataset,
    eval_mini_batch_size: int = 32,
    embeddings_in_memory: bool = True,
    out_path: Path = None,
) -> (Result, float):
    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0

        batch_loader = torch.utils.data.DataLoader(
            sentences,
            batch_size=eval_mini_batch_size,
            shuffle=False,
            num_workers=4,
            collate_fn=list,
        )

        metric = Metric("Evaluation")

        lines: List[str] = []
        for batch in batch_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                tags = self._obtain_labels(features, batch)

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag_label("predicted", tag)

                    # append both to file for evaluation
                    eval_line = "{} {} {} {}\n".format(
                        token.text,
                        token.get_tag(self.tag_type).value,
                        tag.value,
                        tag.score,
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

        eval_loss /= len(sentences)

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}"
            )

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def evaluate(
    self,
    sentences: List[Sentence],
    eval_mini_batch_size: int = 32,
    embeddings_in_memory: bool = False,
    out_path: Path = None,
) -> (Result, float):
    with torch.no_grad():
        eval_loss = 0

        batches = [sentences[x:x + eval_mini_batch_size]
                   for x in range(0, len(sentences), eval_mini_batch_size)]

        metric = Metric("Evaluation")

        lines: List[str] = []
        for batch in batches:
            labels, loss = self.forward_labels_and_loss(batch)

            clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [
                [label.score for label in sent_labels] for sent_labels in labels
            ]
            predictions_for_batch = [
                [label.value for label in sent_labels] for sent_labels in labels
            ]
            true_values_for_batch = [sentence.get_label_names() for sentence in batch]
            available_labels = self.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch,
                    confidences_for_batch,
                    predictions_for_batch,
                    true_values_for_batch,
            ):
                eval_line = "{}\t{}\t{}\t{}\n".format(sentence, true_value, prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if (label in predictions_for_sentence
                            and label in true_values_for_sentence):
                        metric.add_tp(label)
                    elif (label in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_fp(label)
                    elif (label not in predictions_for_sentence
                          and label in true_values_for_sentence):
                        metric.add_fn(label)
                    elif (label not in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_tn(label)

        eval_loss /= len(sentences)

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        return result, eval_loss
def evaluate(
    self,
    sentences: Union[List[Sentence], Dataset],
    out_path: Union[str, Path] = None,
    embedding_storage_mode: str = "none",
    mini_batch_size: int = 32,
    num_workers: int = 8,
    wsd_evaluation: bool = False,
    **kwargs,
) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers)

    eval_loss = 0
    eval_count = 0

    batch_no: int = 0

    metric = Metric("Evaluation", beta=self.beta)

    lines: List[str] = []

    y_true = []
    y_pred = []

    for batch in data_loader:

        # predict for batch
        loss_and_count = self.predict(batch,
                                      embedding_storage_mode=embedding_storage_mode,
                                      mini_batch_size=mini_batch_size,
                                      label_name='predicted',
                                      return_loss=True)
        eval_loss += loss_and_count[0]
        eval_count += loss_and_count[1]
        batch_no += 1

        for sentence in batch:

            # make list of gold tags
            gold_spans = sentence.get_spans(self.get_current_tag_type())
            gold_tags = [(span.tag, repr(span)) for span in gold_spans]

            # make list of predicted tags
            predicted_spans = sentence.get_spans("predicted")
            predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)

            tags_gold = []
            tags_pred = []

            # also write to file in BIO format to use old conlleval script
            if out_path:
                for token in sentence:
                    # check if in gold spans
                    gold_tag = 'O'
                    for span in gold_spans:
                        if token in span:
                            gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_gold.append(gold_tag)

                    predicted_tag = 'O'
                    # check if in predicted spans
                    for span in predicted_spans:
                        if token in span:
                            predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_pred.append(predicted_tag)

                    lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                lines.append('\n')

            y_true.append(tags_gold)
            y_pred.append(tags_pred)

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    detailed_result = (
        "\nResults:"
        f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
        f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
        '\n\nBy class:')

    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss / eval_count
from pathlib import Path
def evaluate(self, data_loader, out_path=None, embedding_storage_mode="none"):
    eval_loss = 0
    batch_no: int = 0

    metric = Metric("Evaluation", beta=1.0)

    # lines: List[str] = []
    for batch in data_loader:
        batch_no += 1

        loss, tags = _predict_batch(self.__models, batch)
        eval_loss += loss

        for (sentence, sent_tags) in zip(batch, tags):
            for (token, tag) in zip(sentence.tokens, sent_tags):
                token: Token = token
                token.add_tag("predicted", tag.value, tag.score)

                # append both to file for evaluation
                # eval_line = "{} {} {} {}\n".format(
                #     token.text,
                #     token.get_tag(self.tag_type).value,
                #     tag.value,
                #     tag.score,
                # )
                # lines.append(eval_line)
            # lines.append("\n")

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, tag.text) for tag in sentence.get_spans(self.tag_type)]
            # make list of predicted tags
            predicted_tags = [(tag.tag, tag.text) for tag in sentence.get_spans("predicted")]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

        store_embeddings(batch, embedding_storage_mode)

    eval_loss /= batch_no

    detailed_result = (
        f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
        f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
    )
    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    with torch.no_grad():
        eval_loss = 0

        metric = Metric('Evaluation')

        lines = []
        batch_count = 0
        for batch in data_loader:
            batch_count += 1

            (labels, loss) = self.forward_labels_and_loss(batch)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [
                [label.score for label in sent_labels] for sent_labels in labels
            ]
            predictions_for_batch = [
                [label.value for label in sent_labels] for sent_labels in labels
            ]
            true_values_for_batch = [sentence.get_label_names() for sentence in batch]
            available_labels = self.label_dictionary.get_items()

            for (sentence, confidence, prediction, true_value) in zip(
                    sentences_for_batch, confidences_for_batch,
                    predictions_for_batch, true_values_for_batch):
                eval_line = '{}\t{}\t{}\t{}\n'.format(sentence, true_value, prediction, confidence)
                lines.append(eval_line)

            for (predictions_for_sentence, true_values_for_sentence) in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if ((label in predictions_for_sentence)
                            and (label in true_values_for_sentence)):
                        metric.add_tp(label)
                    elif ((label in predictions_for_sentence)
                          and (label not in true_values_for_sentence)):
                        metric.add_fp(label)
                    elif ((label not in predictions_for_sentence)
                          and (label in true_values_for_sentence)):
                        metric.add_fn(label)
                    elif ((label not in predictions_for_sentence)
                          and (label not in true_values_for_sentence)):
                        metric.add_tn(label)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_count

        detailed_result = ''.join([
            '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
            '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
        ])
        for class_name in metric.get_classes():
            detailed_result += ''.join([
                '\n', '{:<10}'.format(class_name),
                ' tp: ', '{}'.format(metric.get_tp(class_name)),
                ' - fp: ', '{}'.format(metric.get_fp(class_name)),
                ' - fn: ', '{}'.format(metric.get_fn(class_name)),
                ' - tn: ', '{}'.format(metric.get_tn(class_name)),
                ' - precision: ', '{:.4f}'.format(metric.precision(class_name)),
                ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                ' - accuracy: ', '{:.4f}'.format(metric.accuracy(class_name)),
                ' - f1-score: ', '{:.4f}'.format(metric.f_score(class_name))
            ])

        result = Result(main_score=metric.micro_avg_f_score(),
                        log_line=''.join([
                            '{}'.format(metric.precision()), '\t',
                            '{}'.format(metric.recall()), '\t',
                            '{}'.format(metric.micro_avg_f_score())
                        ]),
                        log_header='PRECISION\tRECALL\tF1',
                        detailed_results=detailed_result)

        if (out_path is not None):
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        return (result, eval_loss)
from __future__ import absolute_import
def eval_flair_spans(data, predicted_list, batch_size, out_path=None):
    metric = Metric('Evaluation')

    mini_batch_size = batch_size
    batches = [data[x:x + mini_batch_size]
               for x in range(0, len(data), mini_batch_size)]

    lines: List[str] = []
    word_counter = 0
    for batch in batches:
        for sentence in batch:
            for token in sentence.tokens:
                tag = Label(predicted_list[word_counter])
                word_counter += 1
                token.add_tag_label('predicted', tag)

                # append both to file for evaluation
                eval_line = '{} {} {} {}\n'.format(token.text,
                                                   token.get_tag('ner').value,
                                                   tag.value,
                                                   tag.score)
                lines.append(eval_line)
            lines.append('\n')

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('ner')]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

    # add metrics scores at the beginning of the file
    lines.insert(0, str(metric) + "\n\n")

    if out_path is not None:
        # create folder for json and corresponding output
        if not os.path.exists(os.path.dirname(out_path)):
            try:
                os.makedirs(os.path.dirname(out_path))
            except OSError as exc:
                # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(out_path, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))

    # # esnWrapper.model.output_activation = output_activation_training
    return metric
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embeddings_storage_mode: str = "cpu",
) -> (Result, float):
    with torch.no_grad():
        eval_loss = 0

        metric = Metric("Evaluation")

        lines: List[str] = []
        batch_count: int = 0
        for batch in data_loader:
            batch_count += 1

            labels, loss = self.forward_labels_and_loss(batch)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [
                [label.score for label in sent_labels] for sent_labels in labels
            ]
            predictions_for_batch = [
                [label.value for label in sent_labels] for sent_labels in labels
            ]
            true_values_for_batch = [sentence.get_label_names() for sentence in batch]
            available_labels = self.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch,
                    confidences_for_batch,
                    predictions_for_batch,
                    true_values_for_batch,
            ):
                eval_line = "{}\t{}\t{}\t{}\n".format(sentence, true_value, prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if (label in predictions_for_sentence
                            and label in true_values_for_sentence):
                        metric.add_tp(label)
                    elif (label in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_fp(label)
                    elif (label not in predictions_for_sentence
                          and label in true_values_for_sentence):
                        metric.add_fn(label)
                    elif (label not in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_tn(label)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_count

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        return result, eval_loss
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embeddings_storage_mode: str = "cpu",
    prediction_mode: bool = False,
) -> (Result, float):
    eval_loss = 0
    batch_no = 0
    data_loader.assign_embeddings()
    if out_path is not None:
        outfile = open(out_path, "w", encoding="utf-8")
    if not self.binary:
        metric = Metric("Evaluation")
    with torch.no_grad():
        for batch in data_loader:
            batch_no += 1

            scores = self.forward(batch, prediction_mode=prediction_mode)
            loss = self._calculate_loss(scores, batch, self.mask)
            eval_loss += loss

            if self.binary:
                pdb.set_trace()
                result = Result(
                    main_score=LF1,
                    log_line=f"\nUF1: {UF1} - LF1 {LF1}",
                    log_header="PRECISION\tRECALL\tF1",
                    detailed_results=f"\nUF1: {UF1} - LF1 {LF1}",
                )
            else:
                # if prediction_mode:
                #     eval_loss, metric = self.dependency_evaluate(data_loader, out_path=out_path, prediction_mode=prediction_mode)
                #     return eval_loss, metric
                # else:
                tags, _ = self._obtain_labels(scores, batch)
                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        # lines.append(eval_line)
                        if out_path is not None:
                            outfile.write(eval_line)
                    # lines.append("\n")
                    if out_path is not None:
                        outfile.write("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans("predicted")]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

        eval_loss /= batch_no
        if out_path is not None:
            outfile.close()

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embedding_storage_mode: str = "none",
) -> (Result, float):
    if type(out_path) == str:
        out_path = Path(out_path)

    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        if self.use_crf:
            transitions = self.transitions.detach().cpu().numpy()
        else:
            transitions = None

        for batch in data_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                tags, _ = self._obtain_labels(
                    feature=features,
                    batch_sentences=batch,
                    transitions=transitions,
                    get_all_tags=False,
                )

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag("predicted", tag.value, tag.score)

                    # append both to file for evaluation
                    eval_line = "{} {} {} {}\n".format(
                        token.text,
                        token.get_tag(self.tag_type).value,
                        tag.value,
                        tag.score,
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, tag.text) for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, tag.text) for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= batch_no

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}"
            )

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    with torch.no_grad():
        eval_loss = 0
        batch_no = 0

        metric = Metric('Evaluation')

        lines = []
        for batch in data_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                (tags, _) = self._obtain_labels(features, batch)

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token = token
                    token.add_tag_label('predicted', tag)

                    eval_line = '{} {} {} {}\n'.format(
                        token.text, token.get_tag(self.tag_type).value, tag.value, tag.score)
                    lines.append(eval_line)
                lines.append('\n')

            for sentence in batch:
                gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)]
                predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

                for (tag, prediction) in predicted_tags:
                    if ((tag, prediction) in gold_tags):
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for (tag, gold) in gold_tags:
                    if ((tag, gold) not in predicted_tags):
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_no

        if (out_path is not None):
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        detailed_result = ''.join([
            '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
            '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
        ])
        for class_name in metric.get_classes():
            detailed_result += ''.join([
                '\n', '{:<10}'.format(class_name),
                ' tp: ', '{}'.format(metric.get_tp(class_name)),
                ' - fp: ', '{}'.format(metric.get_fp(class_name)),
                ' - fn: ', '{}'.format(metric.get_fn(class_name)),
                ' - tn: ', '{}'.format(metric.get_tn(class_name)),
                ' - precision: ', '{:.4f}'.format(metric.precision(class_name)),
                ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                ' - accuracy: ', '{:.4f}'.format(metric.accuracy(class_name)),
                ' - f1-score: ', '{:.4f}'.format(metric.f_score(class_name))
            ])

        result = Result(main_score=metric.micro_avg_f_score(),
                        log_line=''.join([
                            '{}'.format(metric.precision()), '\t',
                            '{}'.format(metric.recall()), '\t',
                            '{}'.format(metric.micro_avg_f_score())
                        ]),
                        log_header='PRECISION\tRECALL\tF1',
                        detailed_results=detailed_result)

        return (result, eval_loss)