Code Example #1
File: trainer.py Project: alirezadir/flair
    def _evaluate_text_classifier(model: flair.nn.Model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = False,
                                  out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            lines: List[str] = []
            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = model.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch, confidences_for_batch,
                        predictions_for_batch, true_values_for_batch):
                    eval_line = '{}\t{}\t{}\t{}\n'.format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):
                    ModelTrainer._evaluate_sentence_for_text_classification(
                        metric, available_labels, predictions_for_sentence,
                        true_values_for_sentence)

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
Code Example #2
    def _evaluate_text_classifier(model: flair.nn.Model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = False) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [sentences[x:x + eval_mini_batch_size] for x in
                       range(0, len(sentences), eval_mini_batch_size)]

            metric = Metric('Evaluation')

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                predictions_for_batch = [[label.value for label in sent_labels] for sent_labels in labels]
                true_values_for_batch = [sentence.get_label_names() for sentence in batch]
                available_labels = model.label_dictionary.get_items()

                for predictions_for_sentence, true_values_for_sentence in zip(predictions_for_batch, true_values_for_batch):
                    ModelTrainer._evaluate_sentence_for_text_classification(metric,
                                                                            available_labels,
                                                                            predictions_for_sentence,
                                                                            true_values_for_sentence)

            eval_loss /= len(sentences)

            return metric, eval_loss
Code Example #3
    def _evaluate_sequence_tagger(model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = True,
                                  out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0
            batches = [sentences[x:x + eval_mini_batch_size] for x in range(0, len(sentences), eval_mini_batch_size)]

            metric = Metric('Evaluation')

            lines: List[str] = []
            for batch in batches:
                batch_no += 1

                tags, loss = model.forward_labels_and_loss(batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label('predicted', tag)

                        # append both to file for evaluation
                        eval_line = '{} {} {} {}\n'.format(token.text,
                                                           token.get_tag(model.tag_type).value, tag.value, tag.score)
                        lines.append(eval_line)
                    lines.append('\n')
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(model.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
Code Example #4
    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = False,
                 metric_name: str = 'MICRO_AVG') -> (dict, float):
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param eval_class_metrics: boolean indicating whether to print class metrics or not
        :param mini_batch_size: the mini batch size to use
        :param embeddings_in_memory: boolean indicating whether embeddings should be kept in memory or not
        :param metric_name: the name of the metric to compute
        :return: the metric and the loss
        """
        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + mini_batch_size]
                for x in range(0, len(sentences), mini_batch_size)
            ]

            metric = Metric(metric_name)

            for batch in batches:
                scores = self.model.forward(batch)
                labels = self.model.obtain_labels(scores)
                loss = self.model.calculate_loss(scores, batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.tp()
                            if eval_class_metrics: metric.tp(prediction)
                        else:
                            metric.fp()
                            if eval_class_metrics: metric.fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.fn()
                            if eval_class_metrics: metric.fn(true_value)
                        else:
                            metric.tn()
                            if eval_class_metrics: metric.tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
Code Example #5
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self.calculate_loss(features, batch)
                    tags, _ = self.obtain_labels(features, batch)

                eval_loss += loss

                metric = self.obtain_performance_metric(
                    batch, tags, lines, metric)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Code Example #6
File: test_utils.py Project: zzg-971030/flair
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454

    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
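The values asserted above follow from the usual per-class definitions of precision, recall, and F1 over the tp/fp/fn counters. A minimal standalone sketch of that arithmetic (illustrative only, not the flair Metric implementation):

# Illustrative sketch of per-class precision/recall/F1 from raw counts.
# This is NOT the flair Metric class, just the arithmetic its tests rely on.
def precision(tp: int, fp: int) -> float:
    return tp / (tp + fp) if (tp + fp) > 0 else 0.0

def recall(tp: int, fn: int) -> float:
    return tp / (tp + fn) if (tp + fn) > 0 else 0.0

def f_score(tp: int, fp: int, fn: int) -> float:
    p, r = precision(tp, fp), recall(tp, fn)
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

# class-3 above has tp=10, fp=90, fn=0:
#   precision = 10 / 100 = 0.1
#   recall    = 10 / 10  = 1.0
#   f1        = 2 * 0.1 * 1.0 / 1.1 ≈ 0.1818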
Code Example #7
File: test_utils.py Project: rkwojdan/flair35
def test_metric_get_classes():
    metric = Metric(u'Test')
    metric.add_fn(u'class-1')
    metric.add_fn(u'class-3')
    metric.add_tn(u'class-1')
    metric.add_tp(u'class-2')
    assert (3 == len(metric.get_classes()))
    assert (u'class-1' in metric.get_classes())
    assert (u'class-2' in metric.get_classes())
    assert (u'class-3' in metric.get_classes())
Code Example #8
File: test_utils.py Project: bluesea0/ditk
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999

    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
Code Example #9
File: test_utils.py Project: zllrunning/flair
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.4)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
Code Example #10
File: test_utils.py Project: bluesea0/ditk
def test_metric_get_classes():
    metric = Metric("Test")

    metric.add_fn("class-1")
    metric.add_fn("class-3")
    metric.add_tn("class-1")
    metric.add_tp("class-2")

    assert 3 == len(metric.get_classes())
    assert "class-1" in metric.get_classes()
    assert "class-2" in metric.get_classes()
    assert "class-3" in metric.get_classes()
Code Example #11
    def obtain_performance_metric(
            self,
            batch,
            tags,
            lines=None,
            metric=Metric("Perf_Metric"),
    ) -> Metric:
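        # NOTE: the Metric("Perf_Metric") default above is a mutable default argument.
        # It is created once when the function is defined, so repeated calls that omit
        # `metric` keep accumulating counts in the same shared Metric object.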

        for (sentence, sent_tags) in zip(batch, tags):
            for (token, tag) in zip(sentence.tokens, sent_tags):
                token: Token = token
                token.add_tag_label("predicted", tag)

                # append both to file for evaluation
                eval_line = "{} {} {} {}\n".format(
                    token.text,
                    token.get_tag(self.tag_type).value,
                    tag.value,
                    tag.score,
                )
                # if lines was passed as None, don't append to it
                if isinstance(lines, List):
                    lines.append(eval_line)
            if isinstance(lines, List):
                lines.append("\n")

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag))
                         for tag in sentence.get_spans(self.tag_type)]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans("predicted")]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

        return metric
Code Example #12
def test_multiclass_metrics():

    metric = Metric('Test')
    available_labels = ['A', 'B', 'C']

    predictions = ['A', 'B']
    true_values = ['A']
    ModelTrainer._evaluate_sentence_for_text_classification(
        metric, available_labels, predictions, true_values)

    predictions = ['C', 'B']
    true_values = ['A', 'B']
    ModelTrainer._evaluate_sentence_for_text_classification(
        metric, available_labels, predictions, true_values)

    print(metric)
Code Example #13
File: test_utils.py Project: bluesea0/ditk
def test_multiclass_metrics():

    metric = Metric("Test")
    available_labels = ["A", "B", "C"]

    predictions = ["A", "B"]
    true_values = ["A"]
    ModelTrainer._evaluate_sentence_for_text_classification(
        metric, available_labels, predictions, true_values
    )

    predictions = ["C", "B"]
    true_values = ["A", "B"]
    ModelTrainer._evaluate_sentence_for_text_classification(
        metric, available_labels, predictions, true_values
    )

    print(metric)
Code Example #14
    def _evaluate_text_classifier(
            model: flair.nn.Model,
            sentences: List[Sentence],
            eval_mini_batch_size: int = 32,
            embeddings_in_memory: bool = False) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.add_tp(prediction)
                        else:
                            metric.add_fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.add_fn(true_value)
                        else:
                            metric.add_tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
Code Example #15
    def obtain_performance_metric(
            self,
            batch,
            labels,
            lines,
            metric=Metric("Perf_Metric"),
    ) -> Metric:
        sentences_for_batch = [sent.to_plain_string() for sent in batch]
        confidences_for_batch = [[label.score for label in sent_labels]
                                 for sent_labels in labels]
        predictions_for_batch = [[label.value for label in sent_labels]
                                 for sent_labels in labels]
        true_values_for_batch = [
            sentence.get_label_names() for sentence in batch
        ]
        available_labels = self.label_dictionary.get_items()

        for sentence, confidence, prediction, true_value in zip(
                sentences_for_batch,
                confidences_for_batch,
                predictions_for_batch,
                true_values_for_batch,
        ):
            eval_line = "{}\t{}\t{}\t{}\n".format(sentence, true_value,
                                                  prediction, confidence)
            lines.append(eval_line)

        for predictions_for_sentence, true_values_for_sentence in zip(
                predictions_for_batch, true_values_for_batch):

            for label in available_labels:
                if (label in predictions_for_sentence
                        and label in true_values_for_sentence):
                    metric.add_tp(label)
                elif (label in predictions_for_sentence
                      and label not in true_values_for_sentence):
                    metric.add_fp(label)
                elif (label not in predictions_for_sentence
                      and label in true_values_for_sentence):
                    metric.add_fn(label)
                elif (label not in predictions_for_sentence
                      and label not in true_values_for_sentence):
                    metric.add_tn(label)
        return metric
Code Example #16
def evaluate(gold_file: Path, pred_file: Path, match_func: Callable[[Tuple, List], Tuple]) -> Metric:
    gold_annotations = read_annotations(gold_file)
    pred_annotations = read_annotations(pred_file)

    metric = Metric("Evaluation", beta=1)

    copy_gold = copy_dict(gold_annotations)
    for document_id, annotations in pred_annotations.items():
        for pred_entry in annotations:
            # Documents may not contain any gold entity!
            if document_id in copy_gold:
                matched_gold = match_func(pred_entry, copy_gold[document_id])
            else:
                matched_gold = None

            if matched_gold:
                # Assert same document and same entity type!
                assert matched_gold[0] == pred_entry[0] and matched_gold[3] == pred_entry[3]

                copy_gold[document_id].remove(matched_gold)
                metric.add_tp(pred_entry[3])
            else:
                metric.add_fp(pred_entry[3])

    copy_pred = copy_dict(pred_annotations)

    for document_id, annotations in gold_annotations.items():
        for gold_entry in annotations:
            if document_id in copy_pred:
                matched_pred = match_func(gold_entry, copy_pred[document_id])
            else:
                matched_pred = None

            if not matched_pred:
                metric.add_fn(gold_entry[3])
            else:
                # Assert same document and same entity type!
                assert matched_pred[0] == gold_entry[0] and matched_pred[3] == gold_entry[3]

                copy_pred[document_id].remove(matched_pred)

    return metric
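A hedged usage sketch for the helper above. The exact-match strategy and the entry layout (a tuple whose index 3 holds the entity type) are assumptions read off the code; read_annotations and copy_dict come from the same project and are not shown here.

# Hypothetical match function: returns the gold entry that is identical to the
# prediction, or None if no gold entry matches.
def exact_match(pred_entry, gold_entries):
    for gold_entry in gold_entries:
        if gold_entry == pred_entry:
            return gold_entry
    return None

# metric = evaluate(Path("gold.tsv"), Path("pred.tsv"), exact_match)
# print(metric)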
Code Example #17
    def evaluate(self,
                 evaluation: List[Sentence],
                 out_path=None,
                 evaluation_method: str = 'F1',
                 eval_batch_size: int = 32,
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        batches = [
            evaluation[x:x + eval_batch_size]
            for x in range(0, len(evaluation), eval_batch_size)
        ]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            scores, tag_seq = self.model._predict_scores_batch(batch)
            predicted_ids = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, score, predicted_id) in zip(all_tokens, scores,
                                                    predicted_ids):
                token: Token = token
                # get the predicted tag
                predicted_value = self.model.tag_dictionary.get_item_for_index(
                    predicted_id)
                token.add_tag('predicted', predicted_value, score)

            for sentence in batch:

                # add predicted tags
                for token in sentence.tokens:
                    predicted_tag: Label = token.get_tag('predicted')

                    # append both to file for evaluation
                    eval_line = '{} {} {}\n'.format(
                        token.text,
                        token.get_tag(self.model.tag_type).value,
                        predicted_tag.value)

                    lines.append(eval_line)
                lines.append('\n')

                # make list of gold tags
                gold_tags = [
                    str(tag) for tag in sentence.get_spans(self.model.tag_type)
                ]

                # make list of predicted tags
                predicted_tags = [
                    str(tag) for tag in sentence.get_spans('predicted')
                ]

                # check for true positives, false positives and false negatives
                for prediction in predicted_tags:
                    if prediction in gold_tags:
                        metric.tp()
                    else:
                        metric.fp()

                for gold in gold_tags:
                    if gold not in predicted_tags:
                        metric.fn()

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric
Code Example #18
    def evaluate(
        self,
        sentences: Dataset,
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = True,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            batch_loader = torch.utils.data.DataLoader(
                sentences,
                batch_size=eval_mini_batch_size,
                shuffle=False,
                num_workers=4,
                collate_fn=list,
            )

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batch_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags = self._obtain_labels(features, batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory
                )

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Code Example #19
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation", beta=self.beta)

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")

                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, tag.text) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, tag.text) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
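The evaluate method above constructs Metric with beta=self.beta, which presumably selects an F-beta score rather than plain F1. For reference, a minimal sketch of the standard F-beta formula (an assumption about what beta controls, not a copy of the flair code):

# Standard F-beta from precision p and recall r. beta > 1 weights recall more,
# beta < 1 weights precision more, and beta = 1 reduces to the usual F1.
def f_beta(p: float, r: float, beta: float = 1.0) -> float:
    denom = (beta ** 2) * p + r
    return (1 + beta ** 2) * p * r / denom if denom > 0 else 0.0

# f_beta(0.5, 1.0, beta=1.0)  -> 0.666...
# f_beta(0.5, 1.0, beta=2.0)  -> 0.833...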
Code Example #20
File: trainer.py Project: myaldiz/flair
    def train_epoch(
        self,
        epoch,
        batch_loader,
        optimizer,
        weight_extractor,
        embeddings_storage_mode: str = "cpu",
        param_selection_mode: bool = False,
    ):
        metric = Metric("Training")
        train_loss: float = 0

        seen_batches = 0
        total_number_of_batches = len(batch_loader)

        modulo = max(1, int(total_number_of_batches / 10))

        # process mini-batches
        for batch_no, batch in enumerate(batch_loader):

            optimizer.zero_grad()
            features = self.model.forward(batch)
            loss = self.model.calculate_loss(features, batch)
            tags, _ = self.model.obtain_labels(features, batch)

            # TODO: fix this for text regression model
            metric = self.model.obtain_performance_metric(batch,
                                                          tags,
                                                          metric=metric)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
            optimizer.step()

            seen_batches += 1
            train_loss += loss.item()

            # depending on memory mode, embeddings are moved to CPU, GPU or deleted
            store_embeddings(batch, embeddings_storage_mode)

            if batch_no % modulo == 0:
                log.info(
                    f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss "
                    f"{train_loss / seen_batches:.6f} - running_score "
                    f"{metric.micro_avg_f_score():.6f}")
                iteration = epoch * total_number_of_batches + batch_no
                if not param_selection_mode:
                    weight_extractor.extract_weights(self.model.state_dict(),
                                                     iteration)

        train_loss /= seen_batches

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, train_loss
Code Example #21
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            metric = Metric("Evaluation")

            lines: List[str] = []
            batch_count: int = 0
            for batch in data_loader:

                batch_count += 1

                labels, loss = self.forward_labels_and_loss(batch)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_count

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
Code Example #22
    def evaluate(self, evaluation: List[Sentence], out_path=None, evaluation_method: str = 'F1',
                 embeddings_in_memory: bool = True):

        tp: int = 0
        fp: int = 0

        batch_no: int = 0
        mini_batch_size = 32
        batches = [evaluation[x:x + mini_batch_size] for x in
                   range(0, len(evaluation), mini_batch_size)]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            self.model.embeddings.embed(batch)

            for sentence in batch:

                sentence: Sentence = sentence

                # Step 3. Run our forward pass.
                score, tag_seq = self.model.predict_scores(sentence)

                # Step 5. Compute predictions
                predicted_id = tag_seq
                for (token, pred_id) in zip(sentence.tokens, predicted_id):
                    token: Token = token
                    # get the predicted tag
                    predicted_tag = self.model.tag_dictionary.get_item_for_index(pred_id)
                    token.add_tag('predicted', predicted_tag)

                    # get the gold tag
                    gold_tag = token.get_tag(self.model.tag_type)

                    # append both to file for evaluation
                    eval_line = token.text + ' ' + gold_tag + ' ' + predicted_tag + "\n"

                    # positives
                    if predicted_tag != '':
                        # true positives
                        if predicted_tag == gold_tag:
                            metric.tp()
                        # false positive
                        if predicted_tag != gold_tag:
                            metric.fp()

                    # negatives
                    if predicted_tag == '':
                        # true negative
                        if predicted_tag == gold_tag:
                            metric.tn()
                        # false negative
                        if predicted_tag != gold_tag:
                            metric.fn()

                    lines.append(eval_line)

                lines.append('\n')

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'span-F1':

            # get the eval script
            eval_script = cached_path('https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/scripts/conll03_eval_script.pl', cache_dir='scripts')
            os.chmod(eval_script, 0o777)

            eval_data = ''.join(lines)

            p = run(eval_script, stdout=PIPE, input=eval_data, encoding='utf-8')
            main_result = p.stdout
            print(main_result)

            main_result = main_result.split('\n')[1]

            # parse the result file
            main_result = re.sub(';', ' ', main_result)
            main_result = re.sub('precision', 'p', main_result)
            main_result = re.sub('recall', 'r', main_result)
            main_result = re.sub('accuracy', 'acc', main_result)

            f_score = float(re.findall(r'\d+\.\d+$', main_result)[0])
            return f_score, metric._fp, main_result

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric._fp, str(score)

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric._fp, str(metric)
Code Example #23
    def evaluate(
        self,
        sentences: List[Sentence],
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = False,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batches:

                labels, loss = self.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

            eval_loss /= len(sentences)

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
Code Example #24
    def evaluate(
        self,
        sentences: Union[List[Sentence], Dataset],
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
        wsd_evaluation: bool = False,
        **kwargs,
    ) -> (Result, float):

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        eval_loss = 0
        eval_count = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss_and_count = self.predict(
                batch,
                embedding_storage_mode=embedding_storage_mode,
                mini_batch_size=mini_batch_size,
                label_name='predicted',
                return_loss=True)

            eval_loss += loss_and_count[0]
            eval_count += loss_and_count[1]
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.get_current_tag_type())
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span))
                                  for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(
                            f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:')

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss / eval_count
Code Example #25
File: trainer.py Project: rkwojdan/flair35
from __future__ import absolute_import
Code Example #26
def eval_flair_spans(data, predicted_list, batch_size, out_path=None):
    metric = Metric('Evaluation')

    mini_batch_size = batch_size
    batches = [
        data[x:x + mini_batch_size]
        for x in range(0, len(data), mini_batch_size)
    ]

    lines: List[str] = []
    word_counter = 0
    for batch in batches:
        for sentence in batch:
            for token in sentence.tokens:
                tag = Label(predicted_list[word_counter])
                word_counter += 1
                token.add_tag_label('predicted', tag)

                # append both to file for evaluation
                eval_line = '{} {} {} {}\n'.format(token.text,
                                                   token.get_tag('ner').value,
                                                   tag.value, tag.score)

                lines.append(eval_line)
            lines.append('\n')

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag))
                         for tag in sentence.get_spans('ner')]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans('predicted')]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

    # add metrics scores at the beginning of the file
    lines.insert(0, str(metric) + "\n\n")

    if out_path is not None:

        # create folder for json and corresponding output
        if not os.path.exists(os.path.dirname(out_path)):
            try:
                os.makedirs(os.path.dirname(out_path))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        with open(out_path, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))
    return metric
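
A possible way to call the function above; the sentence list, predicted tag list, and output path are hypothetical placeholders, not part of the original snippet:

# hypothetical usage of eval_flair_spans (all names below are placeholders)
# `sentences` is assumed to be a list of flair Sentence objects with gold 'ner' spans,
# `predicted_bio_tags` a flat list of predicted tag strings, one per token, in corpus order
metric = eval_flair_spans(
    data=sentences,
    predicted_list=predicted_bio_tags,
    batch_size=32,
    out_path='eval/ner_predictions.txt',  # the parent folder is created if missing
)
print(metric)                             # per-class tp/fp/fn summary (also prepended to the file)
print('micro F1:', metric.micro_avg_f_score())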
Code example #27
0
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
        prediction_mode: bool = False,
    ) -> (Result, float):
        eval_loss = 0
        batch_no = 0
        data_loader.assign_embeddings()
        if out_path is not None:
            outfile = open(out_path, "w", encoding="utf-8")
        if not self.binary:
            metric = Metric("Evaluation")
        with torch.no_grad():
            for batch in data_loader:
                batch_no += 1
                scores = self.forward(batch, prediction_mode=prediction_mode)
                loss = self._calculate_loss(scores, batch, self.mask)
                eval_loss += loss
                if self.binary:
                    # UF1/LF1 (unlabeled/labeled F1) are assumed to be computed
                    # earlier in binary mode; they are not defined in this snippet
                    result = Result(
                        main_score=LF1,
                        log_line=f"\nUF1: {UF1} - LF1 {LF1}",
                        log_header="PRECISION\tRECALL\tF1",
                        detailed_results=f"\nUF1: {UF1} - LF1 {LF1}",
                    )
                else:

                    tags, _ = self._obtain_labels(scores, batch)
                    for (sentence, sent_tags) in zip(batch, tags):
                        for (token, tag) in zip(sentence.tokens, sent_tags):
                            token: Token = token
                            token.add_tag_label("predicted", tag)

                            # append both to file for evaluation
                            eval_line = "{} {} {} {}\n".format(
                                token.text,
                                token.get_tag(self.tag_type).value,
                                tag.value,
                                tag.score,
                            )
                            if out_path is not None:
                                outfile.write(eval_line)
                        if out_path is not None:
                            outfile.write("\n")
                    for sentence in batch:
                        # make list of gold tags
                        gold_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans(self.tag_type)
                        ]
                        # make list of predicted tags
                        predicted_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans("predicted")
                        ]

                        # check for true positives, false positives and false negatives
                        for tag, prediction in predicted_tags:
                            if (tag, prediction) in gold_tags:
                                metric.add_tp(tag)
                            else:
                                metric.add_fp(tag)

                        for tag, gold in gold_tags:
                            if (tag, gold) not in predicted_tags:
                                metric.add_fn(tag)
                            else:
                                metric.add_tn(tag)
        eval_loss /= batch_no
        if out_path is not None:
            outfile.close()
        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )
        return result, eval_loss
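
A hedged usage sketch for the evaluate method above; the model and data loader variables are placeholders and are not defined in the snippet:

# hypothetical call (model, dev_loader and the output path are placeholders)
from pathlib import Path

result, eval_loss = model.evaluate(
    data_loader=dev_loader,
    out_path=Path('dev_predictions.txt'),
    embeddings_storage_mode='cpu',
)
print(result.log_header)        # PRECISION\tRECALL\tF1
print(result.log_line)
print(result.detailed_results)
print('loss:', eval_loss)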
Code example #28
0
    def dependency_evaluate(self,
                            loader,
                            out_path=None,
                            prediction_mode=False):
        loss, metric = 0, Metric()
        lines = []
        for batch in loader:
            arc_scores, rel_scores = self.forward(batch)
            mask = self.mask
            if not prediction_mode:
                loss += self._calculate_loss(arc_scores, rel_scores, batch, mask)
            mask = mask.bool()
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # ignore all punctuation if not specified
            if not self.punct:
                for sent_id, sentence in enumerate(batch):
                    for token_id, token in enumerate(sentence):
                        upos = token.get_tag('upos').value
                        word = token.text
                        # mask out punctuation tokens so the metric ignores them
                        if is_punctuation(word, upos, self.punct_list):
                            mask[sent_id][token_id] = 0
            if out_path is not None:
                for (sent_idx, sentence) in enumerate(batch):
                    for token_idx, token in enumerate(sentence):
                        if token_idx == 0:
                            continue

                        # append both to file for evaluation
                        eval_line = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            token_idx,
                            token.text,
                            'X',
                            'X',
                            'X',
                            'X',
                            arc_preds[sent_idx, token_idx],
                            self.tag_dictionary.get_item_for_index(
                                rel_preds[sent_idx, token_idx]),
                            'X',
                            'X',
                        )
                        lines.append(eval_line)
                    lines.append("\n")

            if not prediction_mode:
                metric(arc_preds, rel_preds, self.arcs, self.rels, mask)

        # write the collected CoNLL-style predictions to file
        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))
        if prediction_mode:
            return None, None

        loss /= len(loader)

        return loss, metric
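
The ten tab-separated fields written by dependency_evaluate mirror the CoNLL-U column order (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), with unused columns filled with 'X'; only HEAD and DEPREL carry the predicted arc and relation label. A small sketch of reading those two columns back from such a file (the path is a placeholder and values are kept as raw strings):

# sketch: recover (id, form, head, deprel) per token from the CoNLL-U-style output
def read_predicted_arcs(path):
    sentences, current = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:                      # blank line ends a sentence
                if current:
                    sentences.append(current)
                    current = []
                continue
            cols = line.split('\t')
            # cols[6] is HEAD, cols[7] is DEPREL in the layout written above
            current.append((cols[0], cols[1], cols[6], cols[7]))
    if current:
        sentences.append(current)
    return sentences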
Code example #29
0
    def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
        eval_loss = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.tag_type)
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        eval_loss /= batch_no

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:'
        )

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}"
            )

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
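
Since _evaluate_with_span_F1 also accumulates y_true and y_pred as per-sentence BIO tag lists, the same predictions can be cross-checked with a span-aware scorer such as seqeval. A minimal sketch, assuming the seqeval package is installed (it is not required by the snippet itself) and using illustrative tag sequences:

# optional cross-check of BIO sequences with seqeval (illustrative data)
from seqeval.metrics import classification_report, f1_score

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]   # gold BIO tags, one list per sentence
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]       # predicted BIO tags

print(f1_score(y_true, y_pred))                      # span-level micro F1
print(classification_report(y_true, y_pred))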
Code example #30
0
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "none",
        eval_mode: EvalMode = EvalMode.Standard,
        misspell_mode: MisspellingMode = MisspellingMode.Random,
        misspelling_rate: float = 0.0,
        char_vocab: set = {},
        lut: dict = {},
        cmx: np.array = None,
        typos: dict = {},
        correction_mode: CorrectionMode = CorrectionMode.NotSpecified,
        eval_dict_name=None,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        from robust_ner.spellcheck import load_correction_dict, get_lang_from_corpus_name

        if correction_mode == CorrectionMode.NotSpecified:
            eval_dict = None
        else:
            eval_dict = load_correction_dict(eval_dict_name, log)
            # note: use 'save_correction_dict' to re-generate a dictionary

        lang = get_lang_from_corpus_name(eval_dict_name)

        eval_params = {}
        eval_params["eval_mode"] = eval_mode
        eval_params["misspelling_rate"] = misspelling_rate
        eval_params["misspell_mode"] = misspell_mode
        eval_params["char_vocab"] = char_vocab
        eval_params["lut"] = lut
        eval_params["cmx"] = cmx
        eval_params["typos"] = typos
        eval_params["correction_mode"] = correction_mode
        eval_params["lang"] = lang
        eval_params["dictionary"] = eval_dict

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation")

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch, eval_params)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")

                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, tag.text)
                                 for tag in sentence.get_spans(self.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, tag.text)
                        for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            if evaluation_metric == EvaluationMetric.MICRO_F1_SCORE:
                main_score = metric.micro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                main_score = metric.macro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                main_score = metric.micro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                main_score = metric.macro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MEAN_SQUARED_ERROR:
                main_score = metric.mean_squared_error()
            else:
                log.error(f"unknown evaluation metric: {evaluation_metric}")
                # fall back to micro F1 so that a Result can still be returned
                main_score = metric.micro_avg_f_score()

            result = Result(
                main_score=main_score,
                log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{main_score:.4f}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
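
Finally, a hedged usage sketch for the robustness-aware evaluate method above; the tagger, data loader, output path and corpus name are placeholders, and the default eval/misspelling/correction modes from the signature are kept:

# hypothetical call (tagger, dev_loader, paths and the corpus name are placeholders)
from pathlib import Path

result, eval_loss = tagger.evaluate(
    data_loader=dev_loader,
    out_path=Path('eval/dev_predictions.txt'),
    embeddings_storage_mode='none',
    eval_dict_name='conll_03_english',   # assumed corpus name, used only to infer the language
)
print(result.log_line)                   # precision, recall and the selected main score
print('loss:', eval_loss)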