Example #1
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.5714)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
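
The per-class assertions above follow directly from the usual confusion-matrix definitions. A minimal sketch of that arithmetic (plain Python, independent of the Metric implementation; the helper name prf is made up), assuming per-class accuracy is (tp + tn) / (tp + tn + fp + fn):

def prf(tp, fp, fn, tn):
    # standard precision / recall / accuracy / F1, rounded like the test values
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    return round(precision, 4), round(recall, 4), round(accuracy, 4), round(f1, 4)

# class-1, class-2 and class-4 each accumulate tp=1, fp=1, fn=0, tn=2
print(prf(1, 1, 0, 2))    # (0.5, 1.0, 0.75, 0.6667)
# class-3 accumulates tp=10, fp=90, fn=0, tn=0
print(prf(10, 90, 0, 0))  # (0.1, 1.0, 0.1, 0.1818)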
Example #2
 def log_metric(self,
                metric: Metric,
                dataset_name: str,
                log_class_metrics=False):
     log.info(
         "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
         .format(dataset_name, metric.f_score(), metric.accuracy(),
                 metric.get_tp(), metric.get_fp(), metric.get_fn(),
                 metric.get_tn()))
     if log_class_metrics:
         for cls in metric.get_classes():
             log.info(
                 "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                 .format(cls, metric.f_score(cls), metric.accuracy(cls),
                         metric.get_tp(cls), metric.get_fp(cls),
                         metric.get_fn(cls), metric.get_tn(cls)))
Example #3
    def evaluate(self,
                 evaluation: List[Sentence],
                 out_path=None,
                 evaluation_method: str = 'F1',
                 eval_batch_size: int = 32,
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        batches = [
            evaluation[x:x + eval_batch_size]
            for x in range(0, len(evaluation), eval_batch_size)
        ]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            scores, tag_seq = self.model._predict_scores_batch(batch)
            predicted_ids = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, score, predicted_id) in zip(all_tokens, scores,
                                                    predicted_ids):
                token: Token = token
                # get the predicted tag
                predicted_value = self.model.tag_dictionary.get_item_for_index(
                    predicted_id)
                token.add_tag('predicted', predicted_value, score)

            for sentence in batch:

                # add predicted tags
                for token in sentence.tokens:
                    predicted_tag: Label = token.get_tag('predicted')

                    # append both to file for evaluation
                    eval_line = '{} {} {}\n'.format(
                        token.text,
                        token.get_tag(self.model.tag_type).value,
                        predicted_tag.value)

                    lines.append(eval_line)
                lines.append('\n')

                # make list of gold tags
                gold_tags = [
                    str(tag) for tag in sentence.get_spans(self.model.tag_type)
                ]

                # make list of predicted tags
                predicted_tags = [
                    str(tag) for tag in sentence.get_spans('predicted')
                ]

                # check for true positives, false positives and false negatives
                for prediction in predicted_tags:
                    if prediction in gold_tags:
                        metric.tp()
                    else:
                        metric.fp()

                for gold in gold_tags:
                    if gold not in predicted_tags:
                        metric.fn()

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric
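
In this example a predicted span counts as a true positive only when its string representation matches a gold span exactly. A small self-contained sketch of that matching and the resulting F1 (the span strings below are illustrative, not flair's exact str(span) format):

# hypothetical gold and predicted spans, already stringified
gold_tags = ['PER "John Smith" [1,2]', 'LOC "Berlin" [5]']
predicted_tags = ['PER "John Smith" [1,2]', 'LOC "Monday" [7]']

tp = sum(1 for p in predicted_tags if p in gold_tags)      # 1 exact match
fp = sum(1 for p in predicted_tags if p not in gold_tags)  # 1 spurious span
fn = sum(1 for g in gold_tags if g not in predicted_tags)  # 1 missed span

precision = tp / (tp + fp)                          # 0.5
recall = tp / (tp + fn)                             # 0.5
f1 = 2 * precision * recall / (precision + recall)  # 0.5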
Example #4
    def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
                 embeddings_in_memory: bool = True):

        tp: int = 0
        fp: int = 0

        batch_no: int = 0
        mini_batch_size = 32
        batches = [evaluation[x:x + mini_batch_size] for x in
                   range(0, len(evaluation), mini_batch_size)]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            self.model.embeddings.embed(batch)

            for sentence in batch:

                sentence: Sentence = sentence

                # Step 3. Run our forward pass.
                score, tag_seq = self.model.predict_scores(sentence)

                # Step 5. Compute predictions
                predicted_id = tag_seq
                for (token, pred_id) in zip(sentence.tokens, predicted_id):
                    token: Token = token
                    # get the predicted tag
                    predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                    token.add_tag('predicted', predicted_tag)

                    # get the gold tag
                    gold_tag = token.get_tag(self.model.tag_type)

                    # append both to file for evaluation
                    eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"

                    # positives
                    if predicted_tag != '':
                        # true positives
                        if predicted_tag == gold_tag:
                            metric.tp()
                        # false positive
                        if predicted_tag != gold_tag:
                            metric.fp()

                    # negatives
                    if predicted_tag == '':
                        # true negative
                        if predicted_tag == gold_tag:
                            metric.tn()
                        # false negative
                        if predicted_tag != gold_tag:
                            metric.fn()

                    lines.append(eval_line)

                lines.append('\n')

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'span-F1':

            # get the eval script
            eval_script = cached_path('https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl', cache_dir='scripts')
            os.chmod(eval_script, 0o777)

            eval_data = ''.join(lines)

            p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
            main_result = p.stdout
            print(main_result)

            main_result = main_result.split('\n')[1]

            # parse the result file
            main_result = re.sub(';', ' ', main_result)
            main_result = re.sub('precision', 'p', main_result)
            main_result = re.sub('recall', 'r', main_result)
            main_result = re.sub('accuracy', 'acc', main_result)

            f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
            return f_score, metric._fp, main_result

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric._fp, str(score)

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric._fp, str(metric)
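
The 'span-F1' branch above shells out to the CoNLL-2003 evaluation script and scrapes the F1 value from its summary line with a regular expression. A standalone illustration of just that parsing step (the summary line is only an approximation of the script's real output):

import re

# illustrative summary line in the shape the code above expects
main_result = 'accuracy:  97.93%; precision:  89.10%; recall:  90.16%; FB1:  89.63'
main_result = re.sub(';', ' ', main_result)
main_result = re.sub('precision', 'p', main_result)
main_result = re.sub('recall', 'r', main_result)
main_result = re.sub('accuracy', 'acc', main_result)

# the last floating-point number on the line is taken as the F1 score
f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
print(f_score)  # 89.63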
Example #5
    def evaluate(
            self,
            data_loader: DataLoader,
            out_path: Path = None,
            embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation", beta=self.beta)

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
                f"\nBINARY: f1-score {metric.f_score(self.class_name)}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.f_score(self.class_name),
                log_line=f"{metric.precision(self.class_name)}\t{metric.recall(self.class_name)}\t{metric.f_score(self.class_name)}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Example #6
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         metric = Metric('Evaluation')
         lines = []
         batch_count = 0
         for batch in data_loader:
             batch_count += 1
             (labels, loss) = self.forward_labels_and_loss(batch)
             eval_loss += loss
             sentences_for_batch = [
                 sent.to_plain_string() for sent in batch
             ]
             confidences_for_batch = [[
                 label.score for label in sent_labels
             ] for sent_labels in labels]
             predictions_for_batch = [[
                 label.value for label in sent_labels
             ] for sent_labels in labels]
             true_values_for_batch = [
                 sentence.get_label_names() for sentence in batch
             ]
             available_labels = self.label_dictionary.get_items()
             for (sentence, confidence, prediction, true_value) in zip(
                     sentences_for_batch, confidences_for_batch,
                     predictions_for_batch, true_values_for_batch):
                 eval_line = '{}\t{}\t{}\t{}\n'.format(
                     sentence, true_value, prediction, confidence)
                 lines.append(eval_line)
             for (predictions_for_sentence,
                  true_values_for_sentence) in zip(predictions_for_batch,
                                                   true_values_for_batch):
                 for label in available_labels:
                     if ((label in predictions_for_sentence)
                             and (label in true_values_for_sentence)):
                         metric.add_tp(label)
                     elif ((label in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_fp(label)
                     elif ((label not in predictions_for_sentence)
                           and (label in true_values_for_sentence)):
                         metric.add_fn(label)
                     elif ((label not in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_tn(label)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_count
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         return (result, eval_loss)
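
The elif chain in this example assigns every label of the label dictionary to exactly one confusion-matrix cell per sentence, so per-label tn counts are meaningful for multi-label classification. A toy illustration with made-up labels:

# hypothetical label inventory, plus one sentence's predicted and gold labels
available_labels = ['POSITIVE', 'NEGATIVE', 'NEUTRAL']
predicted = ['POSITIVE']
gold = ['NEGATIVE']

for label in available_labels:
    if label in predicted and label in gold:
        cell = 'tp'
    elif label in predicted:
        cell = 'fp'  # predicted but not in gold
    elif label in gold:
        cell = 'fn'  # in gold but not predicted
    else:
        cell = 'tn'  # in neither
    print(label, cell)  # POSITIVE fp, NEGATIVE fn, NEUTRAL tn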
Example #7
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         batch_no = 0
         metric = Metric('Evaluation')
         lines = []
         for batch in data_loader:
             batch_no += 1
             with torch.no_grad():
                 features = self.forward(batch)
                 loss = self._calculate_loss(features, batch)
                 (tags, _) = self._obtain_labels(features, batch)
             eval_loss += loss
             for (sentence, sent_tags) in zip(batch, tags):
                 for (token, tag) in zip(sentence.tokens, sent_tags):
                     token: Token = token
                     token.add_tag_label('predicted', tag)
                     eval_line = '{} {} {} {}\n'.format(
                         token.text,
                         token.get_tag(self.tag_type).value, tag.value,
                         tag.score)
                     lines.append(eval_line)
                 lines.append('\n')
             for sentence in batch:
                 gold_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans(self.tag_type)]
                 predicted_tags = [
                     (tag.tag, str(tag))
                     for tag in sentence.get_spans('predicted')
                 ]
                 for (tag, prediction) in predicted_tags:
                     if ((tag, prediction) in gold_tags):
                         metric.add_tp(tag)
                     else:
                         metric.add_fp(tag)
                 for (tag, gold) in gold_tags:
                     if ((tag, gold) not in predicted_tags):
                         metric.add_fn(tag)
                     else:
                         metric.add_tn(tag)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_no
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         return (result, eval_loss)