Code Example #1
    def _evaluate_sequence_tagger(model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = True,
                                  out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0
            batches = [sentences[x:x + eval_mini_batch_size] for x in range(0, len(sentences), eval_mini_batch_size)]

            metric = Metric('Evaluation')

            lines: List[str] = []
            for batch in batches:
                batch_no += 1

                tags, loss = model.forward_labels_and_loss(batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label('predicted', tag)

                        # append both to file for evaluation
                        eval_line = '{} {} {} {}\n'.format(token.text,
                                                           token.get_tag(model.tag_type).value, tag.value, tag.score)
                        lines.append(eval_line)
                    lines.append('\n')
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(model.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
Code Example #2
File: trainer.py Project: alirezadir/flair
    def _evaluate_sentence_for_text_classification(metric: Metric,
                                                   available_labels: List[str],
                                                   predictions: List[str],
                                                   true_values: List[str]):

        for label in available_labels:
            if label in predictions and label in true_values:
                metric.add_tp(label)
            elif label in predictions and label not in true_values:
                metric.add_fp(label)
            elif label not in predictions and label in true_values:
                metric.add_fn(label)
            elif label not in predictions and label not in true_values:
                metric.add_tn(label)
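
For context, here is a small usage sketch of the same four-way branching on a toy multi-label example. It assumes flair's Metric can be imported from flair.training_utils (true for the flair versions these snippets come from, possibly not for newer releases); the label names are invented.

from flair.training_utils import Metric  # import path assumed from the flair versions shown here

metric = Metric("toy-evaluation")
available_labels = ["SPORTS", "POLITICS", "TECH"]

# one document: the model predicted SPORTS and TECH, the gold label is SPORTS only
predictions = ["SPORTS", "TECH"]
true_values = ["SPORTS"]

for label in available_labels:
    if label in predictions and label in true_values:
        metric.add_tp(label)   # SPORTS: predicted and gold
    elif label in predictions and label not in true_values:
        metric.add_fp(label)   # TECH: predicted but not gold
    elif label not in predictions and label in true_values:
        metric.add_fn(label)   # (none in this toy case)
    else:
        metric.add_tn(label)   # POLITICS: neither predicted nor gold

print(metric)  # per-class counts plus precision / recall / F1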
Code Example #3
File: test_utils.py Project: zzg-971030/flair
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    print(metric)

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6666666666666666
    assert metric.f_score("class-2") == 0.6666666666666666
    assert metric.f_score("class-3") == 0.18181818181818182
    assert metric.f_score("class-4") == 0.6666666666666666

    assert metric.accuracy("class-1") == 0.75
    assert metric.accuracy("class-2") == 0.75
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.75

    assert metric.micro_avg_f_score() == 0.21848739495798317
    assert metric.macro_avg_f_score() == 0.5454545454545454

    assert metric.micro_avg_accuracy() == 0.16964285714285715
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.12264150943396226
    assert metric.recall() == 1
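
The class-3 assertions above follow directly from the raw counts. A minimal sketch in plain Python (no flair dependency) reproduces them; the accuracy formula (tp + tn) / (tp + tn + fp + fn) is an assumption that happens to match this fork's class-3 value, and the other forks shown below round or define accuracy slightly differently.

# class-3 counts added in the test above: 10 true positives, 90 false positives
tp, fp, fn, tn = 10, 90, 0, 0

precision = tp / (tp + fp)                          # 10 / 100 = 0.1
recall = tp / (tp + fn)                             # 10 / 10  = 1.0
f1 = 2 * precision * recall / (precision + recall)  # about 0.1818
accuracy = (tp + tn) / (tp + tn + fp + fn)          # 10 / 100 = 0.1

assert precision == 0.1
assert recall == 1.0
assert abs(f1 - 0.18181818181818182) < 1e-12
assert accuracy == 0.1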
Code Example #4
    def _evaluate_text_classifier(
            model: flair.nn.Model,
            sentences: List[Sentence],
            eval_mini_batch_size: int = 32,
            embeddings_in_memory: bool = False) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.add_tp(prediction)
                        else:
                            metric.add_fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.add_fn(true_value)
                        else:
                            metric.add_tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
Code Example #5
File: test_utils.py Project: bluesea0/ditk
def test_metric_with_classes():
    metric = Metric("Test")

    metric.add_tp("class-1")
    metric.add_tn("class-1")
    metric.add_tn("class-1")
    metric.add_fp("class-1")

    metric.add_tp("class-2")
    metric.add_tn("class-2")
    metric.add_tn("class-2")
    metric.add_fp("class-2")

    for i in range(0, 10):
        metric.add_tp("class-3")
    for i in range(0, 90):
        metric.add_fp("class-3")

    metric.add_tp("class-4")
    metric.add_tn("class-4")
    metric.add_tn("class-4")
    metric.add_fp("class-4")

    assert metric.precision("class-1") == 0.5
    assert metric.precision("class-2") == 0.5
    assert metric.precision("class-3") == 0.1
    assert metric.precision("class-4") == 0.5

    assert metric.recall("class-1") == 1
    assert metric.recall("class-2") == 1
    assert metric.recall("class-3") == 1
    assert metric.recall("class-4") == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score("class-1") == 0.6667
    assert metric.f_score("class-2") == 0.6667
    assert metric.f_score("class-3") == 0.1818
    assert metric.f_score("class-4") == 0.6667

    assert metric.accuracy("class-1") == 0.5
    assert metric.accuracy("class-2") == 0.5
    assert metric.accuracy("class-3") == 0.1
    assert metric.accuracy("class-4") == 0.5

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5454749999999999

    assert metric.micro_avg_accuracy() == 0.1226
    assert metric.macro_avg_accuracy() == 0.4

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
Code Example #6
File: test_utils.py Project: zllrunning/flair
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.4)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
Code Example #7
def evaluate(gold_file: Path, pred_file: Path, match_func: Callable[[Tuple, List], Tuple]) -> Metric:
    gold_annotations = read_annotations(gold_file)
    pred_annotations = read_annotations(pred_file)

    metric = Metric("Evaluation", beta=1)

    copy_gold = copy_dict(gold_annotations)
    for document_id, annotations in pred_annotations.items():
        for pred_entry in annotations:
            # Documents may not contain any gold entity!
            if document_id in copy_gold:
                matched_gold = match_func(pred_entry, copy_gold[document_id])
            else:
                matched_gold = None

            if matched_gold:
                # Assert same document and same entity type!
                assert matched_gold[0] == pred_entry[0] and matched_gold[3] == pred_entry[3]

                copy_gold[document_id].remove(matched_gold)
                metric.add_tp(pred_entry[3])
            else:
                metric.add_fp(pred_entry[3])

    copy_pred = copy_dict(pred_annotations)

    for document_id, annotations in gold_annotations.items():
        for gold_entry in annotations:
            if document_id in copy_pred:
                matched_pred = match_func(gold_entry, copy_pred[document_id])
            else:
                matched_pred = None

            if not matched_pred:
                metric.add_fn(gold_entry[3])
            else:
                # Assert same document and same entity type!
                assert matched_pred[0] == gold_entry[0] and matched_pred[3] == gold_entry[3]

                copy_pred[document_id].remove(matched_pred)

    return metric
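
The function above leaves match_func entirely to the caller. As one hypothetical strategy, an exact matcher could look like the sketch below; the tuple layout (document_id, start_offset, end_offset, entity_type) is an assumption inferred from the indices 0 and 3 asserted above, not something the snippet guarantees.

from typing import List, Optional, Tuple

def exact_match(pred_entry: Tuple, gold_entries: List[Tuple]) -> Optional[Tuple]:
    # Hypothetical matcher: entries are assumed to be tuples such as
    # (document_id, start_offset, end_offset, entity_type).
    # Returns the first gold entry identical to the predicted one, else None.
    for gold_entry in gold_entries:
        if gold_entry == pred_entry:
            return gold_entry
    return None

# metric = evaluate(Path("gold_annotations.tsv"), Path("pred_annotations.tsv"), exact_match)
# (the file names above are placeholders)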
Code Example #8
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "none",
        eval_mode: EvalMode = EvalMode.Standard,
        misspell_mode: MisspellingMode = MisspellingMode.Random,
        misspelling_rate: float = 0.0,
        char_vocab: set = {},
        lut: dict = {},
        cmx: np.array = None,
        typos: dict = {},
        correction_mode: CorrectionMode = CorrectionMode.NotSpecified,
        eval_dict_name=None,
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        from robust_ner.spellcheck import load_correction_dict, get_lang_from_corpus_name

        if correction_mode == CorrectionMode.NotSpecified:
            eval_dict = None
        else:
            eval_dict = load_correction_dict(eval_dict_name, log)
            # note: use 'save_correction_dict' to re-generate a dictionary

        lang = get_lang_from_corpus_name(eval_dict_name)

        eval_params = {}
        eval_params["eval_mode"] = eval_mode
        eval_params["misspelling_rate"] = misspelling_rate
        eval_params["misspell_mode"] = misspell_mode
        eval_params["char_vocab"] = char_vocab
        eval_params["lut"] = lut
        eval_params["cmx"] = cmx
        eval_params["typos"] = typos
        eval_params["correction_mode"] = correction_mode
        eval_params["lang"] = lang
        eval_params["dictionary"] = eval_dict

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation")

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch, eval_params)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")

                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, tag.text)
                                 for tag in sentence.get_spans(self.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, tag.text)
                        for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            if evaluation_metric == EvaluationMetric.MICRO_F1_SCORE:
                main_score = metric.micro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                main_score = metric.macro_avg_f_score()
            elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                main_score = metric.micro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                main_score = metric.macro_avg_accuracy()
            elif evaluation_metric == EvaluationMetric.MEAN_SQUARED_ERROR:
                main_score = metric.mean_squared_error()
            else:
                log.error(f"unknown evaluation metric: {evaluation_metric}")

            result = Result(
                main_score=main_score,
                log_line=
                f"{metric.precision():.4f}\t{metric.recall():.4f}\t{main_score:.4f}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Code Example #9
    def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
        eval_loss = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.tag_type)
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        eval_loss /= batch_no

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:'
        )

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}"
            )

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
Code Example #10
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)
        metric = Metric("Evaluation", beta=self.beta)
        parsing_metric = ParsingMetric()

        lines: List[str] = []

        eval_loss_arc = 0
        eval_loss_rel = 0

        for batch_idx, batch in enumerate(data_loader):

            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                loss_arc, loss_rel = self._calculate_loss(
                    score_arc, score_rel, batch)
                arc_prediction, relation_prediction = self._obtain_labels_(
                    score_arc, score_rel)

            parsing_metric(arc_prediction, relation_prediction, batch)

            eval_loss_arc += loss_arc
            eval_loss_rel += loss_rel

            for (sentence, arcs, sent_tags) in zip(batch, arc_prediction,
                                                   relation_prediction):
                for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                    token: Token = token
                    token.add_tag_label("predicted", Label(tag))
                    token.add_tag_label("predicted_head_id", Label(str(arc)))

                    # append both to file for evaluation
                    eval_line = "{} {} {} {} {}\n".format(
                        token.text,
                        token.tags['dependency'].value,
                        str(token.head_id),
                        tag,
                        str(arc),
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:

                # make list of gold tags
                gold_tags = [
                    token.tags['dependency'].value for token in sentence.tokens
                ]

                # make list of predicted tags
                predicted_tags = [
                    tag.tag for tag in sentence.get_spans("predicted")
                ]

                # check for true positives, false positives and false negatives
                for tag_indx, predicted_tag in enumerate(predicted_tags):
                    if predicted_tag == gold_tags[tag_indx]:
                        metric.add_tp(predicted_tag)
                    else:
                        metric.add_fp(predicted_tag)

                for tag_indx, label_tag in enumerate(gold_tags):
                    if label_tag != predicted_tags[tag_indx]:
                        metric.add_fn(label_tag)
                    else:
                        metric.add_tn(label_tag)
            store_embeddings(batch, embedding_storage_mode)

        eval_loss_arc /= len(data_loader)
        eval_loss_rel /= len(data_loader)

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
            f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss_arc + eval_loss_rel
Code Example #11
    def evaluate(
        self,
        sentences: Dataset,
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = True,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            batch_loader = torch.utils.data.DataLoader(
                sentences,
                batch_size=eval_mini_batch_size,
                shuffle=False,
                num_workers=4,
                collate_fn=list,
            )

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batch_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags = self._obtain_labels(features, batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory
                )

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Code Example #12
    def evaluate(
        self,
        sentences: List[Sentence],
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = False,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batches:

                labels, loss = self.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

            eval_loss /= len(sentences)

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
Code Example #13
    def evaluate(
        self,
        sentences: Union[List[Sentence], Dataset],
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
        wsd_evaluation: bool = False,
        **kwargs,
    ) -> (Result, float):

        # read Dataset into data loader (if list of sentences passed, make Dataset first)
        if not isinstance(sentences, Dataset):
            sentences = SentenceDataset(sentences)
        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        eval_loss = 0
        eval_count = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss_and_count = self.predict(
                batch,
                embedding_storage_mode=embedding_storage_mode,
                mini_batch_size=mini_batch_size,
                label_name='predicted',
                return_loss=True)

            eval_loss += loss_and_count[0]
            eval_count += loss_and_count[1]
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.get_current_tag_type())
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span))
                                  for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[
                                    0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(
                            f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:')

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss / eval_count
Code Example #14
File: strip_hint.py Project: rkwojdan/flair35
from pathlib import Path
Code Example #15
    def evaluate(self,
                 data_loader,
                 out_path=None,
                 embedding_storage_mode="none"):
        eval_loss = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=1.0)

        # lines: List[str] = []

        for batch in data_loader:
            batch_no += 1

            loss, tags = _predict_batch(self.__models, batch)

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag("predicted", tag.value, tag.score)

                    # append both to file for evaluation
                #     eval_line = "{} {} {} {}\n".format(
                #         token.text,
                #         token.get_tag(self.tag_type).value,
                #         tag.value,
                #         tag.score,
                #     )
                #     lines.append(eval_line)
                # lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, tag.text)
                             for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, tag.text)
                                  for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= batch_no

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
Code Example #16
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         metric = Metric('Evaluation')
         lines = []
         batch_count = 0
         for batch in data_loader:
             batch_count += 1
             (labels, loss) = self.forward_labels_and_loss(batch)
             eval_loss += loss
             sentences_for_batch = [
                 sent.to_plain_string() for sent in batch
             ]
             confidences_for_batch = [[
                 label.score for label in sent_labels
             ] for sent_labels in labels]
             predictions_for_batch = [[
                 label.value for label in sent_labels
             ] for sent_labels in labels]
             true_values_for_batch = [
                 sentence.get_label_names() for sentence in batch
             ]
             available_labels = self.label_dictionary.get_items()
             for (sentence, confidence, prediction, true_value) in zip(
                     sentences_for_batch, confidences_for_batch,
                     predictions_for_batch, true_values_for_batch):
                 eval_line = '{}\t{}\t{}\t{}\n'.format(
                     sentence, true_value, prediction, confidence)
                 lines.append(eval_line)
             for (predictions_for_sentence,
                  true_values_for_sentence) in zip(predictions_for_batch,
                                                   true_values_for_batch):
                 for label in available_labels:
                     if ((label in predictions_for_sentence)
                             and (label in true_values_for_sentence)):
                         metric.add_tp(label)
                     elif ((label in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_fp(label)
                     elif ((label not in predictions_for_sentence)
                           and (label in true_values_for_sentence)):
                         metric.add_fn(label)
                     elif ((label not in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_tn(label)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_count
         detailed_result = ''.join([
             '\nMICRO_AVG: acc ', '{}'.format(metric.micro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.micro_avg_f_score()),
             '\nMACRO_AVG: acc ', '{}'.format(metric.macro_avg_accuracy()),
             ' - f1-score ', '{}'.format(metric.macro_avg_f_score())
         ])
         for class_name in metric.get_classes():
             detailed_result += ''.join([
                 '\n', '{:<10}'.format(class_name), ' tp: ',
                 '{}'.format(metric.get_tp(class_name)), ' - fp: ',
                 '{}'.format(metric.get_fp(class_name)), ' - fn: ',
                 '{}'.format(metric.get_fn(class_name)), ' - tn: ',
                 '{}'.format(metric.get_tn(class_name)), ' - precision: ',
                 '{:.4f}'.format(metric.precision(class_name)),
                 ' - recall: ', '{:.4f}'.format(metric.recall(class_name)),
                 ' - accuracy: ', '{:.4f}'.format(
                     metric.accuracy(class_name)), ' - f1-score: ',
                 '{:.4f}'.format(metric.f_score(class_name))
             ])
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=''.join([
                             '{}'.format(metric.precision()), '\t',
                             '{}'.format(metric.recall()), '\t',
                             '{}'.format(metric.micro_avg_f_score())
                         ]),
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         return (result, eval_loss)
Code Example #17
File: trainer.py Project: rkwojdan/flair35
from __future__ import absolute_import
Code Example #18
def eval_flair_spans(data, predicted_list, batch_size, out_path=None):
    metric = Metric('Evaluation')

    mini_batch_size = batch_size
    batches = [
        data[x:x + mini_batch_size]
        for x in range(0, len(data), mini_batch_size)
    ]

    lines: List[str] = []
    word_counter = 0
    for batch in batches:
        for sentence in batch:
            for token in sentence.tokens:
                tag = Label(predicted_list[word_counter])
                word_counter += 1
                token.add_tag_label('predicted', tag)

                # append both to file for evaluation
                eval_line = '{} {} {} {}\n'.format(token.text,
                                                   token.get_tag('ner').value,
                                                   tag.value, tag.score)

                lines.append(eval_line)
            lines.append('\n')

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag))
                         for tag in sentence.get_spans('ner')]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans('predicted')]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

    # add metrics scores at the beginning of the file
    lines.insert(0, str(metric) + "\n\n")

    if out_path is not None:

        # create folder for json and corresponding output
        if not os.path.exists(os.path.dirname(out_path)):
            try:
                os.makedirs(os.path.dirname(out_path))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        with open(out_path, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))
        #
    # esnWrapper.model.output_activation = output_activation_training
    return metric
Code Example #19
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            metric = Metric("Evaluation")

            lines: List[str] = []
            batch_count: int = 0
            for batch in data_loader:

                batch_count += 1

                labels, loss = self.forward_labels_and_loss(batch)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

                store_embeddings(batch, embeddings_storage_mode)

            eval_loss /= batch_count

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
Code Example #20
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embeddings_storage_mode: str = "cpu",
        prediction_mode: bool = False,
    ) -> (Result, float):
        eval_loss = 0
        batch_no = 0
        data_loader.assign_embeddings()
        if out_path is not None:
            outfile = open(out_path, "w", encoding="utf-8")
        if not self.binary:
            metric = Metric("Evaluation")
        with torch.no_grad():
            for batch in data_loader:
                batch_no += 1
                scores = self.forward(batch, prediction_mode=prediction_mode)
                loss = self._calculate_loss(scores, batch, self.mask)
                eval_loss += loss
                if self.binary:
                    # debugger breakpoint left in the code; UF1 and LF1 used below are
                    # not defined in this excerpt (presumably produced by a dependency
                    # evaluation step such as the commented-out call further down)
                    pdb.set_trace()
                    result = Result(
                        main_score=LF1,
                        log_line=f"\nUF1: {UF1} - LF1 {LF1}",
                        log_header="PRECISION\tRECALL\tF1",
                        detailed_results=f"\nUF1: {UF1} - LF1 {LF1}",
                    )
                else:
                    # if prediction_mode:
                    #   eval_loss, metric=self.dependency_evaluate(data_loader,out_path=out_path,prediction_mode=prediction_mode)
                    #   return eval_loss, metric
                    # else:

                    tags, _ = self._obtain_labels(scores, batch)
                    for (sentence, sent_tags) in zip(batch, tags):
                        for (token, tag) in zip(sentence.tokens, sent_tags):
                            token: Token = token
                            token.add_tag_label("predicted", tag)

                            # append both to file for evaluation
                            eval_line = "{} {} {} {}\n".format(
                                token.text,
                                token.get_tag(self.tag_type).value,
                                tag.value,
                                tag.score,
                            )
                            # lines.append(eval_line)
                            if out_path is not None:
                                outfile.write(eval_line)
                        # lines.append("\n")
                        if out_path is not None:
                            outfile.write("\n")
                    for sentence in batch:
                        # make list of gold tags
                        gold_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans(self.tag_type)
                        ]
                        # make list of predicted tags
                        predicted_tags = [
                            (tag.tag, str(tag))
                            for tag in sentence.get_spans("predicted")
                        ]

                        # check for true positives, false positives and false negatives
                        for tag, prediction in predicted_tags:
                            if (tag, prediction) in gold_tags:
                                metric.add_tp(tag)
                            else:
                                metric.add_fp(tag)

                        for tag, gold in gold_tags:
                            if (tag, gold) not in predicted_tags:
                                metric.add_fn(tag)
                            else:
                                metric.add_tn(tag)
        eval_loss /= batch_no
        if out_path is not None:
            outfile.close()
        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )
        return result, eval_loss
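The gold-versus-predicted comparison performed in the loop above (and in the examples below) is an exact match on (tag type, span text) pairs. The following is a minimal standalone sketch of that bookkeeping, with made-up spans and plain dictionaries instead of flair's Metric class, just to make the counting explicit; it is not part of any of the listed examples.

from collections import defaultdict

# hypothetical gold and predicted spans as (tag, span-text) pairs
gold_spans = [("PER", "George Washington"), ("LOC", "Washington D.C.")]
pred_spans = [("PER", "George Washington"), ("LOC", "Seattle")]

counts = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
for tag, text in pred_spans:
    if (tag, text) in gold_spans:
        counts[tag]["tp"] += 1   # predicted span exactly matches a gold span
    else:
        counts[tag]["fp"] += 1   # spurious prediction
for tag, text in gold_spans:
    if (tag, text) not in pred_spans:
        counts[tag]["fn"] += 1   # gold span that was missed

for tag, c in counts.items():
    precision = c["tp"] / (c["tp"] + c["fp"]) if c["tp"] + c["fp"] else 0.0
    recall = c["tp"] / (c["tp"] + c["fn"]) if c["tp"] + c["fn"] else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print(f"{tag}: precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")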
Code example #21
0
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation", beta=self.beta)

            lines: List[str] = []

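            # if a CRF layer is used, detach its transition matrix once up front so the
            # per-batch decoding in _obtain_labels() can reuse it without touching the graph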
            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")

                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, tag.text) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, tag.text) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

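                # keep or free this batch's embeddings according to embedding_storage_mode
                # ("none", "cpu" or "gpu") so memory use stays bounded across batches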
                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy():.4f} - f1-score {metric.micro_avg_f_score():.4f}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy():.4f} - f1-score {metric.macro_avg_f_score():.4f}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
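For context, evaluate() above is usually driven from a trained tagger and a corpus split rather than called in isolation. Below is a hedged usage sketch assuming a flair 0.4.x-style setup; `tagger` (a trained SequenceTagger) and `corpus` (a flair Corpus whose gold labels match tagger.tag_type) are assumed to exist and are not part of the example above.

from pathlib import Path

from flair.datasets import DataLoader

# `tagger` and `corpus` are assumed to be set up beforehand (see note above)
result, eval_loss = tagger.evaluate(
    DataLoader(corpus.test, batch_size=32),
    out_path=Path("test_predictions.txt"),
    embedding_storage_mode="none",
)

print(result.log_header)        # PRECISION  RECALL  F1
print(result.log_line)          # micro-averaged scores, tab-separated
print(result.detailed_results)  # per-class tp/fp/fn/tn plus precision/recall/F1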
Code example #22
0
    def evaluate(self,
                 data_loader: DataLoader,
                 out_path: Path = None,
                 embeddings_storage_mode: str = 'cpu') -> (Result, float):
        with torch.no_grad():
            eval_loss = 0
            batch_no = 0
            metric = Metric('Evaluation')
            lines = []
            for batch in data_loader:
                batch_no += 1
                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(features, batch)
                eval_loss += loss
                for sentence, sent_tags in zip(batch, tags):
                    for token, tag in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label('predicted', tag)
                        eval_line = '{} {} {} {}\n'.format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score)
                        lines.append(eval_line)
                    lines.append('\n')
                for sentence in batch:
                    gold_tags = [(tag.tag, str(tag))
                                 for tag in sentence.get_spans(self.tag_type)]
                    predicted_tags = [(tag.tag, str(tag))
                                      for tag in sentence.get_spans('predicted')]
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)
                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)
                store_embeddings(batch, embeddings_storage_mode)
            eval_loss /= batch_no
            if out_path is not None:
                with open(out_path, 'w', encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))
            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}")
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")
            result = Result(main_score=metric.micro_avg_f_score(),
                            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                            log_header='PRECISION\tRECALL\tF1',
                            detailed_results=detailed_result)
            return result, eval_loss