def test_metric_with_classes():
    """Check per-class and aggregate Metric scores for a fixed confusion table."""
    metric = Metric("Test")

    # classes 1 and 2 each get the same counts: 1 TP, 2 TN, 1 FP
    for cls in ("class-1", "class-2"):
        metric.add_tp(cls)
        metric.add_tn(cls)
        metric.add_tn(cls)
        metric.add_fp(cls)

    # class 3 is heavily over-predicted: 10 TP against 90 FP
    for _ in range(10):
        metric.add_tp("class-3")
    for _ in range(90):
        metric.add_fp("class-3")

    # class 4 mirrors classes 1 and 2
    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    # with no class argument, accuracy/f-score fall back to the micro average
    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454
    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
def test_metric_with_classes():
    """Check per-class and aggregate Metric scores (rounded-score variant)."""
    metric = Metric("Test")

    # classes 1 and 2 each get the same counts: 1 TP, 2 TN, 1 FP
    for cls in ("class-1", "class-2"):
        metric.add_tp(cls)
        metric.add_tn(cls)
        metric.add_tn(cls)
        metric.add_fp(cls)

    # class 3 is heavily over-predicted: 10 TP against 90 FP
    for _ in range(10):
        metric.add_tp("class-3")
    for _ in range(90):
        metric.add_fp("class-3")

    # class 4 mirrors classes 1 and 2
    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    # with no class argument, accuracy/f-score fall back to the micro average
    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999
    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
def test_metric_with_classes():
    """Check per-class and aggregate Metric scores (parenthesized-assert variant)."""
    metric = Metric('Test')

    # classes 1 and 2 each get the same counts: 1 TP, 2 TN, 1 FP
    for cls in ('class-1', 'class-2'):
        metric.add_tp(cls)
        metric.add_tn(cls)
        metric.add_tn(cls)
        metric.add_fp(cls)

    # class 3 is heavily over-predicted: 10 TP against 90 FP
    for _ in range(10):
        metric.add_tp('class-3')
    for _ in range(90):
        metric.add_fp('class-3')

    # class 4 mirrors classes 1 and 2
    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert metric.precision('class-1') == 0.5
    assert metric.precision('class-2') == 0.5
    assert metric.precision('class-3') == 0.1
    assert metric.precision('class-4') == 0.5

    assert metric.recall('class-1') == 1
    assert metric.recall('class-2') == 1
    assert metric.recall('class-3') == 1
    assert metric.recall('class-4') == 1

    # with no class argument, accuracy/f-score fall back to the micro average
    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score('class-1') == 0.6667
    assert metric.f_score('class-2') == 0.6667
    assert metric.f_score('class-3') == 0.1818
    assert metric.f_score('class-4') == 0.6667

    assert metric.accuracy('class-1') == 0.75
    assert metric.accuracy('class-2') == 0.75
    assert metric.accuracy('class-3') == 0.1
    assert metric.accuracy('class-4') == 0.75

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.4
    assert metric.micro_avg_accuracy() == 0.1696
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
def log_metric(self, metric: Metric, dataset_name: str, log_class_metrics=False):
    """Log the overall scores of *metric*, plus one line per class if requested.

    :param metric: the Metric whose counts and scores are reported
    :param dataset_name: label used for the overall (micro) line
    :param log_class_metrics: when True, also emit one line per class
    """
    # single shared template for the overall line and the per-class lines
    template = "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"

    log.info(template.format(
        dataset_name, metric.f_score(), metric.accuracy(),
        metric.get_tp(), metric.get_fp(), metric.get_fn(), metric.get_tn()))

    if log_class_metrics:
        for cls in metric.get_classes():
            log.info(template.format(
                cls, metric.f_score(cls), metric.accuracy(cls),
                metric.get_tp(cls), metric.get_fp(cls),
                metric.get_fn(cls), metric.get_tn(cls)))
def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
             eval_batch_size: int = 32, embeddings_in_memory: bool = True):
    """Evaluate the tagger on *evaluation* sentences, span-level.

    Predicts tags batch-wise, compares predicted spans against gold spans of
    ``self.model.tag_type``, optionally writes one "<token> <gold> <predicted>"
    line per token to ``<out_path>/test.tsv``, and returns ``(score, metric)``.

    NOTE(review): if ``evaluation_method`` is neither 'accuracy' nor 'F1' this
    function silently returns None.
    """
    batch_no: int = 0
    # split the evaluation set into fixed-size mini-batches
    batches = [
        evaluation[x:x + eval_batch_size]
        for x in range(0, len(evaluation), eval_batch_size)
    ]
    metric = Metric('')
    lines: List[str] = []  # text lines for the optional test.tsv output
    for batch in batches:
        batch_no += 1
        # score the whole batch in one forward pass
        scores, tag_seq = self.model._predict_scores_batch(batch)
        predicted_ids = tag_seq
        # flatten the batch to one token stream, aligned with scores/ids
        all_tokens = []
        for sentence in batch:
            all_tokens.extend(sentence.tokens)
        for (token, score, predicted_id) in zip(all_tokens, scores, predicted_ids):
            token: Token = token
            # get the predicted tag
            predicted_value = self.model.tag_dictionary.get_item_for_index(
                predicted_id)
            token.add_tag('predicted', predicted_value, score)
        for sentence in batch:
            # add predicted tags
            for token in sentence.tokens:
                predicted_tag: Label = token.get_tag('predicted')
                # append both to file for evaluation
                eval_line = '{} {} {}\n'.format(
                    token.text,
                    token.get_tag(self.model.tag_type).value,
                    predicted_tag.value)
                lines.append(eval_line)
            lines.append('\n')  # blank line separates sentences in the output
            # make list of gold tags
            gold_tags = [
                str(tag) for tag in sentence.get_spans(self.model.tag_type)
            ]
            # make list of predicted tags
            predicted_tags = [
                str(tag) for tag in sentence.get_spans('predicted')
            ]
            # check for true positives, false positives and false negatives
            for prediction in predicted_tags:
                if prediction in gold_tags:
                    metric.tp()
                else:
                    metric.fp()
            for gold in gold_tags:
                if gold not in predicted_tags:
                    metric.fn()
        # free embedding tensors batch-by-batch when not kept in memory
        if not embeddings_in_memory:
            self.clear_embeddings_in_batch(batch)
    if out_path is not None:
        test_tsv = os.path.join(out_path, "test.tsv")
        with open(test_tsv, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))
    if evaluation_method == 'accuracy':
        score = metric.accuracy()
        return score, metric
    if evaluation_method == 'F1':
        score = metric.f_score()
        return score, metric
def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
             embeddings_in_memory: bool = True):
    """Evaluate the tagger on *evaluation* sentences, token-level.

    Predicts a tag per token, accumulates tp/fp/tn/fn in a Metric (empty
    predicted tag counts as a "negative"), optionally writes prediction lines
    to ``<out_path>/test.tsv``, and returns a ``(score, metric._fp, detail)``
    triple depending on ``evaluation_method`` ('span-F1', 'accuracy' or 'F1').

    NOTE(review): the locals ``tp``/``fp`` below are never used; the returned
    second element is the private ``metric._fp`` counter — confirm callers
    really want that rather than the Metric itself. An unknown
    ``evaluation_method`` silently returns None.
    """
    tp: int = 0
    fp: int = 0
    batch_no: int = 0
    mini_batch_size = 32
    # split the evaluation set into fixed-size mini-batches
    batches = [evaluation[x:x + mini_batch_size]
               for x in range(0, len(evaluation), mini_batch_size)]
    metric = Metric('')
    lines: List[str] = []  # text lines for the optional test.tsv output
    for batch in batches:
        batch_no += 1
        self.model.embeddings.embed(batch)
        for sentence in batch:
            sentence: Sentence = sentence
            # Step 3. Run our forward pass.
            score, tag_seq = self.model.predict_scores(sentence)
            # Step 5. Compute predictions
            predicted_id = tag_seq
            for (token, pred_id) in zip(sentence.tokens, predicted_id):
                token: Token = token
                # get the predicted tag
                predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                token.add_tag('predicted', predicted_tag)
                # get the gold tag
                gold_tag = token.get_tag(self.model.tag_type)
                # append both to file for evaluation
                # NOTE(review): assumes gold_tag is a plain str here — in other
                # versions get_tag returns a Label object; confirm.
                eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"
                # positives
                if predicted_tag != '':
                    # true positives
                    if predicted_tag == gold_tag:
                        metric.tp()
                    # false positive
                    if predicted_tag != gold_tag:
                        metric.fp()
                # negatives
                if predicted_tag == '':
                    # true negative
                    if predicted_tag == gold_tag:
                        metric.tn()
                    # false negative
                    if predicted_tag != gold_tag:
                        metric.fn()
                lines.append(eval_line)
            lines.append('\n')  # blank line separates sentences in the output
        # free embedding tensors batch-by-batch when not kept in memory
        if not embeddings_in_memory:
            self.clear_embeddings_in_batch(batch)
    if out_path is not None:
        test_tsv = os.path.join(out_path, "test.tsv")
        with open(test_tsv, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))
    if evaluation_method == 'span-F1':
        # get the eval script
        eval_script = cached_path('https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl',
                                  cache_dir='scripts')
        os.chmod(eval_script, 0o777)
        # pipe the prediction lines through the official conlleval perl script
        eval_data = ''.join(lines)
        p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
        main_result = p.stdout
        print(main_result)
        # the second line of conlleval output holds the overall scores
        main_result = main_result.split('\n')[1]
        # parse the result file
        main_result = re.sub(';', ' ', main_result)
        main_result = re.sub('precision', 'p', main_result)
        main_result = re.sub('recall', 'r', main_result)
        main_result = re.sub('accuracy', 'acc', main_result)
        # the FB1 value is the last float on the line
        f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
        return f_score, metric._fp, main_result
    if evaluation_method == 'accuracy':
        score = metric.accuracy()
        return score, metric._fp, str(score)
    if evaluation_method == 'F1':
        score = metric.f_score()
        return score, metric._fp, str(metric)
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    """Evaluate this (multi-label) classifier on the data in *data_loader*.

    Scores every batch, classifies every label of the label dictionary as
    tp/fp/fn/tn per sentence, optionally writes one TSV line per sentence to
    *out_path*, and returns ``(Result, mean loss per batch)``.
    """
    with torch.no_grad():  # inference only — no gradients needed
        eval_loss = 0
        metric = Metric('Evaluation')
        lines = []  # one TSV line per sentence for the optional output file
        batch_count = 0
        for batch in data_loader:
            batch_count += 1
            # forward pass: predicted labels plus loss for this batch
            (labels, loss) = self.forward_labels_and_loss(batch)
            eval_loss += loss
            sentences_for_batch = [
                sent.to_plain_string() for sent in batch
            ]
            confidences_for_batch = [[
                label.score for label in sent_labels
            ] for sent_labels in labels]
            predictions_for_batch = [[
                label.value for label in sent_labels
            ] for sent_labels in labels]
            true_values_for_batch = [
                sentence.get_label_names() for sentence in batch
            ]
            available_labels = self.label_dictionary.get_items()
            for (sentence, confidence, prediction, true_value) in zip(
                    sentences_for_batch, confidences_for_batch,
                    predictions_for_batch, true_values_for_batch):
                eval_line = '{}\t{}\t{}\t{}\n'.format(
                    sentence, true_value, prediction, confidence)
                lines.append(eval_line)
            # update the confusion counts: every known label is classified as
            # tp/fp/fn/tn for each sentence
            for (predictions_for_sentence, true_values_for_sentence) in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if ((label in predictions_for_sentence)
                            and (label in true_values_for_sentence)):
                        metric.add_tp(label)
                    elif ((label in predictions_for_sentence)
                            and (label not in true_values_for_sentence)):
                        metric.add_fp(label)
                    elif ((label not in predictions_for_sentence)
                            and (label in true_values_for_sentence)):
                        metric.add_fn(label)
                    elif ((label not in predictions_for_sentence)
                            and (label not in true_values_for_sentence)):
                        metric.add_tn(label)
            # move/clear embeddings according to the storage mode
            store_embeddings(batch, embeddings_storage_mode)
        eval_loss /= batch_count  # NOTE(review): assumes at least one batch
        detailed_result = ''.join([
            '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
            '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
        ])
        for class_name in metric.get_classes():
            detailed_result += ''.join([
                '\n', '{:<10}'.format(class_name), ' tp: ',
                '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                '{:.4f}'.format(metric.precision(class_name)), ' - recall: ',
                '{:.4f}'.format(metric.recall(class_name)), ' - accuracy: ',
                '{:.4f}'.format(metric.accuracy(class_name)), ' - f1-score: ',
                '{:.4f}'.format(metric.f_score(class_name))
            ])
        result = Result(main_score=metric.micro_avg_f_score(),
                        log_line=''.join([
                            '{}'.format(metric.precision()), '\t',
                            '{}'.format(metric.recall()), '\t',
                            '{}'.format(metric.micro_avg_f_score())
                        ]),
                        log_header='PRECISION\tRECALL\tF1',
                        detailed_results=detailed_result)
        if (out_path is not None):
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))
        return (result, eval_loss)
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    """Evaluate this sequence tagger on the data in *data_loader*.

    Tags every sentence, scores predicted spans against gold spans per tag
    type, optionally writes one line per token to *out_path*, and returns
    ``(Result, mean loss per batch)``.
    """
    with torch.no_grad():  # inference only — no gradients needed
        eval_loss = 0
        batch_no = 0
        metric = Metric('Evaluation')
        lines = []  # text lines for the optional output file
        for batch in data_loader:
            batch_no += 1
            with torch.no_grad():  # NOTE(review): redundant — already inside no_grad
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                (tags, _) = self._obtain_labels(features, batch)
            eval_loss += loss
            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token = token  # no-op, kept from the original (was a type hint)
                    token.add_tag_label('predicted', tag)
                    # one output line per token: text, gold, predicted, score
                    eval_line = '{} {} {} {}\n'.format(
                        token.text, token.get_tag(self.tag_type).value,
                        tag.value, tag.score)
                    lines.append(eval_line)
                lines.append('\n')  # blank line separates sentences in the output
            for sentence in batch:
                # spans are compared as (tag type, span string) pairs
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(self.tag_type)]
                predicted_tags = [
                    (tag.tag, str(tag)) for tag in sentence.get_spans('predicted')
                ]
                for (tag, prediction) in predicted_tags:
                    if ((tag, prediction) in gold_tags):
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)
                for (tag, gold) in gold_tags:
                    if ((tag, gold) not in predicted_tags):
                        metric.add_fn(tag)
                    else:
                        # NOTE(review): counts a matched gold span as a TN even
                        # though it was already counted as a TP above — confirm
                        # this is intended by Metric's accounting
                        metric.add_tn(tag)
            # move/clear embeddings according to the storage mode
            store_embeddings(batch, embeddings_storage_mode)
        eval_loss /= batch_no  # NOTE(review): assumes at least one batch
        if (out_path is not None):
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))
        detailed_result = ''.join([
            '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
            '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
            ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
        ])
        for class_name in metric.get_classes():
            detailed_result += ''.join([
                '\n', '{:<10}'.format(class_name), ' tp: ',
                '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                '{:.4f}'.format(metric.precision(class_name)), ' - recall: ',
                '{:.4f}'.format(metric.recall(class_name)), ' - accuracy: ',
                '{:.4f}'.format(metric.accuracy(class_name)), ' - f1-score: ',
                '{:.4f}'.format(metric.f_score(class_name))
            ])
        result = Result(main_score=metric.micro_avg_f_score(),
                        log_line=''.join([
                            '{}'.format(metric.precision()), '\t',
                            '{}'.format(metric.recall()), '\t',
                            '{}'.format(metric.micro_avg_f_score())
                        ]),
                        log_header='PRECISION\tRECALL\tF1',
                        detailed_results=detailed_result)
        return (result, eval_loss)