Example #1
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454

    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
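
The asserted numbers in Example #1 follow directly from the raw counts: class-3 accumulates 10 TP and 90 FP, so its precision is 10/100 = 0.1 while its recall (no FN) stays 1; summed over all four classes there are 13 TP, 93 FP, 6 TN and 0 FN. A quick sanity check of the micro-averaged values, in plain Python and independent of the Metric class:

# Totals over class-1 ... class-4 as built up in the test above.
tp, fp, tn, fn = 13, 93, 6, 0
precision = tp / (tp + fp)                          # 13/106 = 0.12264150943396226
recall = tp / (tp + fn)                             # 13/13  = 1.0
f1 = 2 * precision * recall / (precision + recall)  # 26/119 = 0.21848739495798317
accuracy = (tp + tn) / (tp + tn + fp + fn)          # 19/112 = 0.16964285714285715
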
Example #2
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999

    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
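
Note that Example #2 asserts different accuracy values than Example #1: they are consistent with a Metric version that rounds scores to four decimals and defines accuracy as tp / (tp + fp + fn) rather than (tp + tn) / (tp + tn + fp + fn). The definition is inferred from the asserted values, not taken from the library source; a quick check with the class-1 and overall counts:

# Class-1 counts from the test: 1 TP, 1 FP, 0 FN; totals: 13 TP, 93 FP, 0 FN.
acc_class_1 = 1 / (1 + 1 + 0)                # 0.5    -> matches accuracy("class-1")
micro_acc   = 13 / (13 + 93 + 0)             # ~0.1226 -> matches micro_avg_accuracy()
macro_acc   = (0.5 + 0.5 + 0.1 + 0.5) / 4    # 0.4    -> matches macro_avg_accuracy()
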
Example #3
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.4)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
Example #4
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         metric = Metric('Evaluation')
         lines = []
         batch_count = 0
         for batch in data_loader:
             batch_count += 1
             (labels, loss) = self.forward_labels_and_loss(batch)
             eval_loss += loss
             sentences_for_batch = [
                 sent.to_plain_string() for sent in batch
             ]
             confidences_for_batch = [[
                 label.score for label in sent_labels
             ] for sent_labels in labels]
             predictions_for_batch = [[
                 label.value for label in sent_labels
             ] for sent_labels in labels]
             true_values_for_batch = [
                 sentence.get_label_names() for sentence in batch
             ]
             available_labels = self.label_dictionary.get_items()
             for (sentence, confidence, prediction, true_value) in zip(
                     sentences_for_batch, confidences_for_batch,
                     predictions_for_batch, true_values_for_batch):
                 eval_line = '{}\t{}\t{}\t{}\n'.format(
                     sentence, true_value, prediction, confidence)
                 lines.append(eval_line)
             for (predictions_for_sentence,
                  true_values_for_sentence) in zip(predictions_for_batch,
                                                   true_values_for_batch):
                 for label in available_labels:
                     if ((label in predictions_for_sentence)
                             and (label in true_values_for_sentence)):
                         metric.add_tp(label)
                     elif ((label in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_fp(label)
                     elif ((label not in predictions_for_sentence)
                           and (label in true_values_for_sentence)):
                         metric.add_fn(label)
                     elif ((label not in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_tn(label)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_count
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         return (result, eval_loss)
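
For orientation, here is a minimal, self-contained counter class in the spirit of the Metric object exercised above. It is an illustrative sketch, not flair's actual implementation; the name SimpleMetric, the cls=None micro-average convention, and the accuracy definition ((tp + tn) / total, as asserted in Example #1) are assumptions.

from collections import defaultdict

class SimpleMetric:
    """Illustrative per-class TP/FP/TN/FN counter (not flair's Metric)."""

    def __init__(self, name):
        self.name = name
        self._tp, self._fp = defaultdict(int), defaultdict(int)
        self._tn, self._fn = defaultdict(int), defaultdict(int)

    def add_tp(self, cls): self._tp[cls] += 1
    def add_fp(self, cls): self._fp[cls] += 1
    def add_tn(self, cls): self._tn[cls] += 1
    def add_fn(self, cls): self._fn[cls] += 1

    def _get(self, counter, cls):
        # cls=None aggregates over all classes (micro view).
        return counter[cls] if cls is not None else sum(counter.values())

    def precision(self, cls=None):
        tp, fp = self._get(self._tp, cls), self._get(self._fp, cls)
        return tp / (tp + fp) if tp + fp else 0.0

    def recall(self, cls=None):
        tp, fn = self._get(self._tp, cls), self._get(self._fn, cls)
        return tp / (tp + fn) if tp + fn else 0.0

    def f_score(self, cls=None):
        p, r = self.precision(cls), self.recall(cls)
        return 2 * p * r / (p + r) if p + r else 0.0

    def accuracy(self, cls=None):
        tp, tn = self._get(self._tp, cls), self._get(self._tn, cls)
        fp, fn = self._get(self._fp, cls), self._get(self._fn, cls)
        total = tp + tn + fp + fn
        return (tp + tn) / total if total else 0.0

Replaying the counts from Example #1 against this sketch reproduces, for instance, precision("class-3") == 0.1 and precision() == 13/106.
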
Example #5
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         batch_no = 0
         metric = Metric('Evaluation')
         lines = []
         for batch in data_loader:
             batch_no += 1
             with torch.no_grad():
                 features = self.forward(batch)
                 loss = self._calculate_loss(features, batch)
                 (tags, _) = self._obtain_labels(features, batch)
             eval_loss += loss
             for (sentence, sent_tags) in zip(batch, tags):
                 for (token, tag) in zip(sentence.tokens, sent_tags):
                     token.add_tag_label('predicted', tag)
                     eval_line = '{} {} {} {}\n'.format(
                         token.text,
                         token.get_tag(self.tag_type).value, tag.value,
                         tag.score)
                     lines.append(eval_line)
                 lines.append('\n')
             for sentence in batch:
                 gold_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans(self.tag_type)]
                 predicted_tags = [
                     (tag.tag, str(tag))
                     for tag in sentence.get_spans('predicted')
                 ]
                 for (tag, prediction) in predicted_tags:
                     if ((tag, prediction) in gold_tags):
                         metric.add_tp(tag)
                     else:
                         metric.add_fp(tag)
                 for (tag, gold) in gold_tags:
                     if ((tag, gold) not in predicted_tags):
                         metric.add_fn(tag)
                     else:
                         metric.add_tn(tag)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_no
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         return (result, eval_loss)
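
The two evaluate variants differ in what they compare: Example #4 evaluates a text classifier and matches predicted label names against each sentence's gold label names, while Example #5 evaluates a sequence tagger and matches (tag, span) pairs obtained from sentence.get_spans(). The span bookkeeping reduces to the pattern below (toy data, plain Counter objects standing in for the Metric class):

from collections import Counter

# Toy gold and predicted (tag, span-text) pairs for one sentence.
gold = [("PER", "Alice"), ("LOC", "Berlin")]
predicted = [("PER", "Alice"), ("LOC", "Munich")]

tp, fp, fn = Counter(), Counter(), Counter()
for tag, span in predicted:
    if (tag, span) in gold:
        tp[tag] += 1          # predicted span exactly matches a gold span
    else:
        fp[tag] += 1          # spurious prediction
for tag, span in gold:
    if (tag, span) not in predicted:
        fn[tag] += 1          # missed gold span
# tp == Counter({'PER': 1}), fp == Counter({'LOC': 1}), fn == Counter({'LOC': 1})
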