def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.")
    dev_sentence = Sentence("I'm a dev sentence.")
    test_sentence = Sentence("I will be only used for testing.")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([train_sentence]),
        FlairDatapointDataset([dev_sentence]),
        FlairDatapointDataset([test_sentence]),
    )

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.",
                        use_tokenizer=True).add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ]),
        sample_missing_splits=False,
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3,
                      downsample_dev=False,
                      downsample_test=False)

    assert 3 == len(corpus.train)
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label("label", "class_1")

    sentence_2 = Sentence("sentence 2").add_label("label", "class_2")

    sentence_3 = Sentence("sentence 3").add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([sentence_1, sentence_2, sentence_3]),
        FlairDatapointDataset([]),
        FlairDatapointDataset([]),
    )

    label_dict = corpus.make_label_dictionary("label")

    assert 3 == len(label_dict)
    assert "<unk>" in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
    def predict(
        self,
        sentences: Union[Sentence, List[Sentence]],
        mini_batch_size: int = 32,
        verbose: bool = False,
        label_name: Optional[str] = None,
        embedding_storage_mode="none",
    ) -> List[Sentence]:

        if label_name is None:
            label_name = self.label_name if self.label_name is not None else "label"

        with torch.no_grad():
            if not isinstance(sentences, list):
                sentences = [sentences]

            if not sentences:
                return sentences

            reordered_sentences = sorted(sentences,
                                         key=lambda s: len(s),
                                         reverse=True)

            if len(reordered_sentences) == 0:
                return sentences

            dataloader = DataLoader(
                dataset=FlairDatapointDataset(reordered_sentences),
                batch_size=mini_batch_size,
            )
            # progress bar for verbosity
            if verbose:
                progress_bar = tqdm(dataloader)
                progress_bar.set_description("Batch inference")
                dataloader = progress_bar

            for batch in dataloader:
                # stop if all sentences are empty
                if not batch:
                    continue
                scores = self.forward(batch)

                for (sentence, score) in zip(batch, scores.tolist()):
                    sentence.set_label(label_name, value=str(score[0]))

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

            return sentences
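# --- Hedged usage sketch (added illustration, not part of the original source) ---
# Shows how the regression-style predict() above might be called once a model with this
# signature has been trained. The `model` argument and the label name "prediction" are
# assumptions for illustration only.
def example_regression_predict(model, texts):
    from flair.data import Sentence

    sentences = [Sentence(text) for text in texts]
    # predict() annotates each sentence in place with a label holding the predicted value
    model.predict(sentences, mini_batch_size=16, label_name="prediction")
    return [sentence.get_labels("prediction") for sentence in sentences]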
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 32,
        num_workers: int = 8,
        print_tree: bool = False,
        embedding_storage_mode="none",
    ) -> None:
        """
        Predict arcs and tags for Dependency Parser task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: mini batch size to use
        :param num_workers: number of DataLoader worker processes to use
        :param print_tree: set to True to print each sentence's dependency parse as a tree
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if not isinstance(sentences, list):
            sentences = [sentences]
        sentence_dataset = FlairDatapointDataset(sentences)
        data_loader = DataLoader(sentence_dataset,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        for batch in data_loader:
            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                arc_prediction, relation_prediction = self._obtain_labels_(
                    score_arc, score_rel)

            for sentence_index, (sentence, sent_tags, sent_arcs) in enumerate(
                    zip(batch, relation_prediction, arc_prediction)):

                for token_index, (token, tag, head_id) in enumerate(
                        zip(sentence.tokens, sent_tags, sent_arcs)):
                    token.add_tag(self.tag_type, tag,
                                  score_rel[sentence_index][token_index])

                    token.head_id = int(head_id)

                if print_tree:
                    tree_printer(sentence, self.tag_type)
                    print("-" * 50)
            store_embeddings(batch, storage_mode=embedding_storage_mode)
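# --- Hedged usage sketch (added illustration, not part of the original source) ---
# Calling the dependency-parser predict() above on a single sentence. `parser` is assumed
# to be a trained model exposing this signature; print_tree=True prints the parse as a tree.
def example_parser_predict(parser, text):
    from flair.data import Sentence

    sentence = Sentence(text)
    parser.predict([sentence], mini_batch_size=8, print_tree=True)
    # after prediction every token carries a relation tag and a predicted head_id
    return [(token.text, token.head_id) for token in sentence]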
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.")

    corpus: Corpus = Corpus(FlairDatapointDataset([train_sentence]),
                            sample_missing_splits=False)

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
# Example 7
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 32,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predicts labels for current batch with CRF or Softmax.
        :param sentences: List of sentences in batch
        :param mini_batch_size: batch size for test data
        :param return_probabilities_for_all_classes: Whether to return probabilities for all classes
        :param verbose: whether to use progress bar
        :param label_name: which label to predict
        :param return_loss: whether to return loss value
        :param embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or "none".
        """
        if label_name is None:
            label_name = self.tag_type

        with torch.no_grad():
            if not sentences:
                return sentences

            # make sure its a list
            if not isinstance(sentences, list) and not isinstance(
                    sentences, flair.data.Dataset):
                sentences = [sentences]

            # filter empty sentences
            sentences = [
                sentence for sentence in sentences if len(sentence) > 0
            ]

            # reverse sort all sequences by their length
            reordered_sentences = sorted(sentences,
                                         key=lambda s: len(s),
                                         reverse=True)

            if len(reordered_sentences) == 0:
                return sentences

            dataloader = DataLoader(
                dataset=FlairDatapointDataset(reordered_sentences),
                batch_size=mini_batch_size,
            )
            # progress bar for verbosity
            if verbose:
                dataloader = tqdm(dataloader, desc="Batch inference")

            overall_loss = torch.zeros(1, device=flair.device)
            batch_no = 0
            label_count = 0
            for batch in dataloader:

                batch_no += 1

                # stop if all sentences are empty
                if not batch:
                    continue

                # get features from forward propagation
                features, gold_labels = self.forward(batch)

                # remove previously predicted labels of this type
                for sentence in batch:
                    sentence.remove_labels(label_name)

                # if return_loss, get loss value
                if return_loss:
                    loss = self._calculate_loss(features, gold_labels)
                    overall_loss += loss[0]
                    label_count += loss[1]

                # Sort batch in same way as forward propagation
                lengths = torch.LongTensor(
                    [len(sentence) for sentence in batch])
                _, sort_indices = lengths.sort(dim=0, descending=True)
                batch = [batch[i] for i in sort_indices]

                # make predictions
                if self.use_crf:
                    predictions, all_tags = self.viterbi_decoder.decode(
                        features, return_probabilities_for_all_classes)
                else:
                    predictions, all_tags = self._standard_inference(
                        features, batch, return_probabilities_for_all_classes)

                # add predictions to Sentence
                for sentence, sentence_predictions in zip(batch, predictions):

                    # BIOES-labels need to be converted to spans
                    if self.predict_spans:
                        sentence_tags = [
                            label[0] for label in sentence_predictions
                        ]
                        sentence_scores = [
                            label[1] for label in sentence_predictions
                        ]
                        predicted_spans = get_spans_from_bio(
                            sentence_tags, sentence_scores)
                        for predicted_span in predicted_spans:
                            span: Span = sentence[
                                predicted_span[0][0]:predicted_span[0][-1] + 1]
                            span.add_label(label_name,
                                           value=predicted_span[2],
                                           score=predicted_span[1])

                    # token-labels can be added directly
                    else:
                        for token, label in zip(sentence.tokens,
                                                sentence_predictions):
                            token.add_label(typename=label_name,
                                            value=label[0],
                                            score=label[1])

                # all_tags will be empty if return_probabilities_for_all_classes is False, so the loop below is skipped
                for (sentence, sent_all_tags) in zip(batch, all_tags):
                    for (token, token_all_tags) in zip(sentence.tokens,
                                                       sent_all_tags):
                        token.add_tags_proba_dist(label_name, token_all_tags)

                store_embeddings(sentences,
                                 storage_mode=embedding_storage_mode)

            if return_loss:
                return overall_loss, label_count
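# --- Hedged usage sketch (added illustration, not part of the original source) ---
# Running the sequence-tagger predict() above on raw texts. `tagger` is assumed to be a
# trained model with this signature; predictions are stored under its tag_type by default.
def example_tagger_predict(tagger, texts):
    from flair.data import Sentence

    sentences = [Sentence(text) for text in texts]
    tagger.predict(sentences, mini_batch_size=32, verbose=True)
    # read back the predicted labels (spans for BIOES tagging, token labels otherwise)
    return [sentence.get_labels(tagger.tag_type) for sentence in sentences]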
# Example 8
    def predict(
        self,
        sentences: Union[List[DT], DT],
        mini_batch_size: int = 32,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.  # noqa: E501
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted  # noqa: E501
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted  # noqa: E501
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.  # noqa: E501
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.label_type if self.label_type is not None else "label"

        with torch.no_grad():
            if not sentences:
                return sentences

            if not isinstance(sentences, list):
                sentences = [sentences]

            reordered_sentences = self._sort_data(sentences)

            if len(reordered_sentences) == 0:
                return sentences

            dataloader = DataLoader(
                dataset=FlairDatapointDataset(reordered_sentences),
                batch_size=mini_batch_size,
            )
            # progress bar for verbosity
            if verbose:
                progress_bar = tqdm(dataloader)
                progress_bar.set_description("Batch inference")
                dataloader = progress_bar

            overall_loss = 0
            label_count = 0
            for batch in dataloader:
                # stop if all sentences are empty
                if not batch:
                    continue

                scores, gold_labels, data_points, label_candidates = self.forward_pass(
                    batch, return_label_candidates=True  # type: ignore
                )
                # remove previously predicted labels of this type
                for sentence in data_points:
                    sentence.remove_labels(label_name)

                if return_loss:
                    overall_loss += self._calculate_loss(scores, gold_labels)[0]
                    label_count += len(label_candidates)

                # if anything could possibly be predicted
                if len(label_candidates) > 0:
                    if self.multi_label:
                        sigmoided = torch.sigmoid(scores)  # size: (n_sentences, n_classes)
                        n_labels = sigmoided.size(1)
                        for s_idx, (data_point, label_candidate) in enumerate(zip(data_points, label_candidates)):
                            for l_idx in range(n_labels):
                                label_value = self.label_dictionary.get_item_for_index(l_idx)
                                if label_value == "O":
                                    continue
                                label_threshold = self._get_label_threshold(label_value)
                                label_score = sigmoided[s_idx, l_idx].item()
                                if label_score > label_threshold or return_probabilities_for_all_classes:
                                    label = label_candidate.spawn(value=label_value, score=label_score)
                                    data_point.add_complex_label(label_name, label)
                    else:
                        softmax = torch.nn.functional.softmax(scores, dim=-1)

                        if return_probabilities_for_all_classes:
                            n_labels = softmax.size(1)
                            for s_idx, (data_point, label_candidate) in enumerate(zip(data_points, label_candidates)):
                                for l_idx in range(n_labels):
                                    label_value = self.label_dictionary.get_item_for_index(l_idx)
                                    if label_value == "O":
                                        continue
                                    label_score = softmax[s_idx, l_idx].item()
                                    label = label_candidate.spawn(value=label_value, score=label_score)
                                    data_point.add_complex_label(label_name, label)
                        else:
                            conf, idx = torch.max(softmax, dim=-1)
                            for data_point, label_candidate, c, i in zip(data_points, label_candidates, conf, idx):
                                label_value = self.label_dictionary.get_item_for_index(i.item())
                                if label_value == "O":
                                    continue
                                label = label_candidate.spawn(value=label_value, score=c.item())
                                data_point.add_complex_label(label_name, label)

                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if return_loss:
                return overall_loss, label_count
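# --- Hedged usage sketch (added illustration, not part of the original source) ---
# Using the classification predict() above with probabilities for all classes. `classifier`
# is assumed to be a trained model exposing this signature and a label_type attribute.
def example_classifier_predict(classifier, text):
    from flair.data import Sentence

    sentence = Sentence(text)
    classifier.predict(sentence, return_probabilities_for_all_classes=True)
    # with all probabilities requested, one label per class is attached to the sentence
    return {label.value: label.score for label in sentence.get_labels(classifier.label_type)}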
# Example 9
    def evaluate(
        self,
        data_points: Union[List[DT], Dataset],
        gold_label_type: str,
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: Optional[int] = 8,
        main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
        exclude_labels: List[str] = [],
        gold_label_dictionary: Optional[Dictionary] = None,
        **kwargs,
    ) -> Result:
        import numpy as np
        import sklearn

        # read Dataset into data loader, if list of sentences passed, make Dataset first
        if not isinstance(data_points, Dataset):
            data_points = FlairDatapointDataset(data_points)
        data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

        with torch.no_grad():

            # loss calculation
            eval_loss = torch.zeros(1, device=flair.device)
            average_over = 0

            # variables for printing
            lines: List[str] = []

            # variables for computing scores
            all_spans: List[str] = []
            all_true_values = {}
            all_predicted_values = {}

            sentence_id = 0
            for batch in data_loader:

                # remove any previously predicted labels
                for datapoint in batch:
                    datapoint.remove_labels("predicted")
                # predict for batch
                loss_and_count = self.predict(
                    batch,
                    embedding_storage_mode=embedding_storage_mode,
                    mini_batch_size=mini_batch_size,
                    label_name="predicted",
                    return_loss=True,
                )

                if isinstance(loss_and_count, tuple):
                    average_over += loss_and_count[1]
                    eval_loss += loss_and_count[0]
                else:
                    eval_loss += loss_and_count

                # get the gold labels
                for datapoint in batch:

                    for gold_label in datapoint.get_labels(gold_label_type):
                        representation = str(sentence_id) + ": " + gold_label.identifier

                        value = gold_label.value
                        if gold_label_dictionary and gold_label_dictionary.get_idx_for_item(value) == 0:
                            value = "<unk>"

                        if representation not in all_true_values:
                            all_true_values[representation] = [value]
                        else:
                            all_true_values[representation].append(value)

                        if representation not in all_spans:
                            all_spans.append(representation)

                    for predicted_span in datapoint.get_labels("predicted"):
                        representation = str(sentence_id) + ": " + predicted_span.identifier

                        # add to all_predicted_values
                        if representation not in all_predicted_values:
                            all_predicted_values[representation] = [predicted_span.value]
                        else:
                            all_predicted_values[representation].append(predicted_span.value)

                        if representation not in all_spans:
                            all_spans.append(representation)

                    sentence_id += 1

                store_embeddings(batch, embedding_storage_mode)

                # make printout lines
                if out_path:
                    lines.extend(self._print_predictions(batch, gold_label_type))

            # write all_predicted_values to out_file if set
            if out_path:
                with open(Path(out_path), "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            # make the evaluation dictionary
            evaluation_label_dictionary = Dictionary(add_unk=False)
            evaluation_label_dictionary.add_item("O")
            for true_values in all_true_values.values():
                for label in true_values:
                    evaluation_label_dictionary.add_item(label)
            for predicted_values in all_predicted_values.values():
                for label in predicted_values:
                    evaluation_label_dictionary.add_item(label)

            # finally, compute numbers
            y_true = []
            y_pred = []

            for span in all_spans:

                true_values = all_true_values[span] if span in all_true_values else ["O"]
                predicted_values = all_predicted_values[span] if span in all_predicted_values else ["O"]

                y_true_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
                for true_value in true_values:
                    y_true_instance[evaluation_label_dictionary.get_idx_for_item(true_value)] = 1
                y_true.append(y_true_instance.tolist())

                y_pred_instance = np.zeros(len(evaluation_label_dictionary), dtype=int)
                for predicted_value in predicted_values:
                    y_pred_instance[evaluation_label_dictionary.get_idx_for_item(predicted_value)] = 1
                y_pred.append(y_pred_instance.tolist())

        # now, calculate evaluation numbers
        target_names = []
        labels = []

        counter = Counter(itertools.chain.from_iterable(all_true_values.values()))
        counter.update(list(itertools.chain.from_iterable(all_predicted_values.values())))

        for label_name, count in counter.most_common():
            if label_name == "O":
                continue
            if label_name in exclude_labels:
                continue
            target_names.append(label_name)
            labels.append(evaluation_label_dictionary.get_idx_for_item(label_name))

        # there is at least one gold label or one prediction (default)
        if len(all_true_values) + len(all_predicted_values) > 1:
            classification_report = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                digits=4,
                target_names=target_names,
                zero_division=0,
                labels=labels,
            )

            classification_report_dict = sklearn.metrics.classification_report(
                y_true,
                y_pred,
                target_names=target_names,
                zero_division=0,
                output_dict=True,
                labels=labels,
            )

            accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred), 4)

            precision_score = round(classification_report_dict["micro avg"]["precision"], 4)
            recall_score = round(classification_report_dict["micro avg"]["recall"], 4)
            micro_f_score = round(classification_report_dict["micro avg"]["f1-score"], 4)
            macro_f_score = round(classification_report_dict["macro avg"]["f1-score"], 4)

            main_score = classification_report_dict[main_evaluation_metric[0]][main_evaluation_metric[1]]

        else:
            # issue error and default all evaluation numbers to 0.
            log.error(
                "ACHTUNG! No gold labels and no all_predicted_values found! "
                "Could be an error in your corpus or how you "
                "initialize the trainer!"
            )
            accuracy_score = precision_score = recall_score = micro_f_score = macro_f_score = main_score = 0.0
            classification_report = ""
            classification_report_dict = {}

        detailed_result = (
            "\nResults:"
            f"\n- F-score (micro) {micro_f_score}"
            f"\n- F-score (macro) {macro_f_score}"
            f"\n- Accuracy {accuracy_score}"
            "\n\nBy class:\n" + classification_report
        )

        # line for log file
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t" f"{recall_score}\t" f"{micro_f_score}\t" f"{accuracy_score}"

        if average_over > 0:
            eval_loss /= average_over

        result = Result(
            main_score=main_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
            classification_report=classification_report_dict,
            loss=eval_loss.item(),
        )

        return result
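# --- Hedged usage sketch (added illustration, not part of the original source) ---
# How the evaluate() above might be called on a test split. `model`, `corpus` and the
# output path are assumptions; gold_label_type must match the label type the model predicts.
def example_evaluate(model, corpus, gold_label_type):
    result = model.evaluate(
        corpus.test,
        gold_label_type=gold_label_type,
        mini_batch_size=32,
        out_path="predictions.txt",
    )
    # the returned Result bundles the main score, log line and full classification report
    print(result.detailed_results)
    return result.main_score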
    def evaluate(
        self,
        data_points: Union[List[Sentence], Dataset],
        gold_label_type: str,
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: Optional[int] = 8,
        **kwargs,
    ) -> Result:

        # read Dataset into data loader, if list of sentences passed, make Dataset first
        if not isinstance(data_points, Dataset):
            data_points = FlairDatapointDataset(data_points)
        data_loader = DataLoader(data_points,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        with torch.no_grad():
            eval_loss = torch.zeros(1, device=flair.device)

            metric = MetricRegression("Evaluation")

            lines: List[str] = []
            total_count = 0
            for batch_nr, batch in enumerate(data_loader):

                if isinstance(batch, Sentence):
                    batch = [batch]

                scores, loss = self.forward_labels_and_loss(batch)

                true_values = []
                for sentence in batch:
                    total_count += 1
                    for label in sentence.get_labels(gold_label_type):
                        true_values.append(float(label.value))

                results = []
                for score in scores:
                    results.append(score[0])

                eval_loss += loss

                metric.true.extend(true_values)
                metric.pred.extend(results)

                for sentence, prediction, true_value in zip(
                        batch, results, true_values):
                    eval_line = "{}\t{}\t{}\n".format(
                        sentence.to_original_text(), true_value, prediction)
                    lines.append(eval_line)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= total_count

            # TODO: not saving lines yet
            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}" f"\t{metric.pearsonr()}"
            log_header = "MSE\tSPEARMAN\tPEARSON"

            detailed_result = (
                f"AVG: mse: {metric.mean_squared_error():.4f} - "
                f"mae: {metric.mean_absolute_error():.4f} - "
                f"pearson: {metric.pearsonr():.4f} - "
                f"spearman: {metric.spearmanr():.4f}")

            result: Result = Result(
                main_score=metric.pearsonr(),
                loss=eval_loss.item(),
                log_header=log_header,
                log_line=log_line,
                detailed_results=detailed_result,
            )

            return result
    def evaluate(
        self,
        data_points: Union[List[DataPoint], Dataset],
        gold_label_type: str,
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: Optional[int] = 8,
        main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"),
        exclude_labels: List[str] = [],
        gold_label_dictionary: Optional[Dictionary] = None,
        **kwargs,
    ) -> Result:

        if not isinstance(data_points, Dataset):
            data_points = FlairDatapointDataset(data_points)
        data_loader = DataLoader(data_points,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        lines: List[str] = [
            "token gold_tag gold_arc predicted_tag predicted_arc\n"
        ]

        average_over = 0
        eval_loss_arc = 0.0
        eval_loss_rel = 0.0

        y_true = []
        y_pred = []

        parsing_metric = ParsingMetric()

        for batch in data_loader:
            average_over += 1
            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                loss_arc, loss_rel = self._calculate_loss(
                    score_arc, score_rel, batch)
                arc_prediction, relation_prediction = self._obtain_labels_(
                    score_arc, score_rel)

            parsing_metric(arc_prediction, relation_prediction, batch,
                           gold_label_type)

            eval_loss_arc += loss_arc.item()
            eval_loss_rel += loss_rel.item()

            for (sentence, arcs, sent_tags) in zip(batch, arc_prediction,
                                                   relation_prediction):
                for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                    token.add_tag_label("predicted", Label(tag))
                    token.add_tag_label("predicted_head_id",
                                        Label(str(int(arc))))

                    # append both to file for evaluation
                    eval_line = "{} {} {} {} {}\n".format(
                        token.text,
                        token.get_tag(gold_label_type).value,
                        str(token.head_id),
                        tag,
                        str(int(arc)),
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                gold_tags = [
                    token.get_tag(gold_label_type).value
                    for token in sentence.tokens
                ]
                predicted_tags = [
                    tag.tag for tag in sentence.get_spans("predicted")
                ]

                y_pred += [
                    self.relations_dictionary.get_idx_for_item(tag)
                    for tag in predicted_tags
                ]
                y_true += [
                    self.relations_dictionary.get_idx_for_item(tag)
                    for tag in gold_tags
                ]

            store_embeddings(batch, embedding_storage_mode)

        eval_loss_arc /= average_over
        eval_loss_rel /= average_over

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        classification_report_dict = sklearn.metrics.classification_report(
            y_true,
            y_pred,
            target_names=self.relations_dictionary.idx2item,
            zero_division=0,
            output_dict=True,
            labels=range(len(self.relations_dictionary)),
        )

        accuracy_score = round(sklearn.metrics.accuracy_score(y_true, y_pred),
                               4)

        precision_score = round(
            classification_report_dict["micro avg"]["precision"], 4)
        recall_score = round(classification_report_dict["micro avg"]["recall"],
                             4)
        micro_f_score = round(
            classification_report_dict["micro avg"]["f1-score"], 4)
        macro_f_score = round(
            classification_report_dict["macro avg"]["f1-score"], 4)

        main_score = classification_report_dict[main_evaluation_metric[0]][
            main_evaluation_metric[1]]

        detailed_result = (
            f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
            f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
            f"\nF-Score: micro : {micro_f_score} - macro : {macro_f_score}"
            f"\n Accuracy: {accuracy_score} - Precision {precision_score} - Recall {recall_score}"
        )
        log_header = "PRECISION\tRECALL\tF1\tACCURACY"
        log_line = f"{precision_score}\t" f"{recall_score}\t" f"{micro_f_score}\t" f"{accuracy_score}"

        result = Result(
            main_score=main_score,
            log_line=log_line,
            log_header=log_header,
            detailed_results=detailed_result,
            classification_report=classification_report_dict,
            loss=(eval_loss_rel + eval_loss_arc),
        )
        return result
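# --- Hedged illustration (added, not part of the original source) ---
# What the UAS/LAS numbers reported in detailed_result above mean, computed from toy
# predictions: UAS counts tokens whose predicted head is correct, LAS additionally
# requires the predicted relation label to match. This is not the library's ParsingMetric.
def example_uas_las(gold_heads, gold_rels, pred_heads, pred_rels):
    total = len(gold_heads)
    uas_hits = sum(gh == ph for gh, ph in zip(gold_heads, pred_heads))
    las_hits = sum(
        gh == ph and gr == pr
        for gh, gr, ph, pr in zip(gold_heads, gold_rels, pred_heads, pred_rels)
    )
    return uas_hits / total, las_hits / total

# e.g. gold heads [2, 0, 2] vs. predicted [2, 0, 1] gives UAS = 2/3; LAS can only be lower or equal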
# Example 12
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 16,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name="predicted",
        return_loss=False,
        embedding_storage_mode="none",
    ):
        """
        Predict lemmas of words for a given (list of) sentence(s).
        :param sentences: sentences to predict
        :param label_name: label name used for predicted lemmas
        :param mini_batch_size: number of tokens that are sent through the RNN simultaneously, assuming batching_in_rnn
            is set to True
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
            you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        :param return_loss: whether or not to compute and return loss. Setting it to True only makes sense if labels
            are provided
        :param verbose: If True, lemmatized sentences will be printed in the console.
        """
        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # filter empty sentences
        sentences = [sentence for sentence in sentences if len(sentence) > 0]
        if len(sentences) == 0:
            return sentences

        # max length of the predicted sequences
        if not self.dependent_on_input:
            max_length = self.max_sequence_length
        else:
            max_length = max([
                len(token.text) + 1 for sentence in sentences
                for token in sentence
            ])

        # for printing
        line_to_print = ""

        overall_loss = 0.0
        number_tokens_in_total = 0

        with torch.no_grad():

            dataloader = DataLoader(dataset=FlairDatapointDataset(sentences),
                                    batch_size=mini_batch_size)

            for batch in dataloader:

                # stop if all sentences are empty
                if not batch:
                    continue

                # remove previously predicted labels of this type
                for sentence in batch:
                    for token in sentence:
                        token.remove_labels(label_name)

                # create list of tokens in batch
                tokens_in_batch = [
                    token for sentence in batch for token in sentence
                ]
                number_tokens = len(tokens_in_batch)
                number_tokens_in_total += number_tokens

                # encode inputs
                hidden, all_encoder_outputs = self.encode(batch)

                # create input for first pass (batch_size, 1, input_size), first letter is special character <S>
                # sequence length is always set to one in prediction
                input_indices = self.start_index * torch.ones(
                    number_tokens, dtype=torch.long,
                    device=flair.device).unsqueeze(1)

                # option 1: greedy decoding
                if self.beam_size == 1:

                    # predictions
                    predicted: List[List[int]] = [[]
                                                  for _ in range(number_tokens)
                                                  ]

                    for decode_step in range(max_length):

                        # decode next character
                        output_vectors, hidden = self.decode(
                            input_indices, hidden, all_encoder_outputs)

                        log_softmax_probs = torch.nn.functional.log_softmax(
                            output_vectors, dim=2)
                        # pick top beam size many outputs with highest probabilities
                        input_indices = log_softmax_probs.argmax(dim=2)

                        for i in range(number_tokens):
                            if len(predicted[i]) > 0 and predicted[i][
                                    -1] == self.end_index:
                                continue
                            predicted[i].append(input_indices[i].item())

                    for t_id, token in enumerate(tokens_in_batch):
                        predicted_lemma = "".join(
                            self.char_dictionary.get_item_for_index(idx)
                            if idx != self.end_index else ""
                            for idx in predicted[t_id])
                        token.set_label(typename=label_name,
                                        value=predicted_lemma)

                # option 2: beam search
                else:
                    output_vectors, hidden = self.decode(
                        input_indices, hidden, all_encoder_outputs)

                    # out_probs = self.softmax(output_vectors).squeeze(1)
                    log_softmax_probs = torch.nn.functional.log_softmax(
                        output_vectors, dim=2).squeeze(1)
                    # make sure no dummy symbol <> or start symbol <S> is predicted
                    log_softmax_probs[:, self.dummy_index] = -inf
                    log_softmax_probs[:, self.start_index] = -inf

                    # pick top beam size many outputs with highest probabilities
                    # probabilities, leading_indices = out_probs.topk(self.beam_size, 1)  # max prob along dimension 1
                    log_probabilities, leading_indices = log_softmax_probs.topk(
                        self.beam_size, 1)
                    # leading_indices and probabilities have size (batch_size, beam_size)

                    # keep scores of beam_size many hypothesis for each token in the batch
                    scores = log_probabilities.view(-1, 1)
                    # stack all leading indices of all hypothesis and corresponding hidden states in two tensors
                    leading_indices = leading_indices.view(
                        -1,
                        1)  # this vector goes through RNN in each iteration

                    hidden_states_beam = torch.stack(self.beam_size * [hidden],
                                                     dim=2).view(
                                                         self.rnn_layers, -1,
                                                         self.rnn_hidden_size)

                    # save sequences so far
                    sequences = torch.tensor([[i.item()]
                                              for i in leading_indices],
                                             device=flair.device)

                    # keep track of how many hypothesis were completed for each token
                    n_completed = [0 for _ in range(number_tokens)]  # cpu
                    final_candidates: List[List[Tuple[
                        torch.Tensor,
                        float]]] = [[] for _ in range(number_tokens)]  # cpu

                    # if all_encoder_outputs returned, expand them to beam size (otherwise keep this as None)
                    batched_encoding_output = (torch.stack(
                        self.beam_size * [all_encoder_outputs], dim=1).view(
                            self.beam_size *
                            number_tokens, -1, self.rnn_hidden_size)
                                               if self.use_attention else None)

                    for j in range(1, max_length):

                        output_vectors, hidden_states_beam = self.decode(
                            leading_indices, hidden_states_beam,
                            batched_encoding_output)

                        # decode with log softmax
                        out_log_probs = torch.nn.functional.log_softmax(
                            output_vectors, dim=2)
                        # make sure no dummy symbol <> or start symbol <S> is predicted
                        out_log_probs[:, 0, self.dummy_index] = -inf
                        out_log_probs[:, 0, self.start_index] = -inf
                        log_probabilities, index_candidates = out_log_probs.topk(
                            self.beam_size, 2)
                        log_probabilities.squeeze_(1)
                        index_candidates.squeeze_(1)

                        # check if an end symbol <E> has been predicted and, in that case, set hypothesis aside
                        end_symbols = (
                            index_candidates == self.end_index).nonzero(
                                as_tuple=False)
                        for tuple in end_symbols:

                            # if the sequence is already ended, do not record as candidate
                            if sequences[tuple[0],
                                         -1].item() == self.end_index:
                                continue

                            # index of the token in the list tokens_in_batch
                            token_number = torch.div(tuple[0],
                                                     self.beam_size,
                                                     rounding_mode="trunc")
                            # print(token_number)
                            seq = sequences[tuple[0], :]  # hypothesis sequence
                            # hypothesis score
                            score = (scores[tuple[0]] +
                                     log_probabilities[tuple[0], tuple[1]]) / (
                                         len(seq) + 1)

                            final_candidates[token_number].append(
                                (seq, score.item()))
                            # TODO: remove token if number of completed hypothesis exceeds given value
                            n_completed[token_number] += 1

                            # set score of corresponding entry to -inf so it will not be expanded
                            log_probabilities[tuple[0], tuple[1]] = -inf

                        # get leading_indices for next expansion
                        # find highest scoring hypothesis among beam_size*beam_size possible ones for each token

                        # take beam_size many copies of scores vector and add scores of possible new extensions
                        # size (beam_size*batch_size, beam_size)
                        hypothesis_scores = torch.cat(
                            self.beam_size * [scores],
                            dim=1) + log_probabilities
                        # print(hypothesis_scores)

                        # reshape to vector of size (batch_size, beam_size*beam_size),
                        # each row contains beam_size*beam_size scores of the new possible hypothesis
                        hypothesis_scores_per_token = hypothesis_scores.view(
                            number_tokens, self.beam_size**2)
                        # print(hypothesis_scores_per_token)

                        # choose beam_size best for each token - size (batch_size, beam_size)
                        (
                            best_scores,
                            indices_per_token,
                        ) = hypothesis_scores_per_token.topk(
                            self.beam_size, 1)

                        # out of indices_per_token we now need to recompute the original indices of the hypothesis in
                        # a list of length beam_size*batch_size
                        # where the first three indices belong to the first token, the next three to the second token,
                        # and so on
                        beam_numbers: List[int] = []
                        seq_numbers: List[int] = []

                        for i, row in enumerate(indices_per_token):
                            beam_numbers.extend(i * self.beam_size +
                                                index.item() // self.beam_size
                                                for index in row)

                            seq_numbers.extend(index.item() % self.beam_size
                                               for index in row)

                        # with these indices we can compute the tensors for the next iteration
                        # expand sequences with corresponding index
                        sequences = torch.cat(
                            (
                                sequences[beam_numbers],
                                index_candidates[beam_numbers,
                                                 seq_numbers].unsqueeze(1),
                            ),
                            dim=1,
                        )

                        # add log-probabilities to the scores
                        scores = scores[beam_numbers] + log_probabilities[
                            beam_numbers, seq_numbers].unsqueeze(1)

                        # save new leading indices
                        leading_indices = index_candidates[
                            beam_numbers, seq_numbers].unsqueeze(1)

                        # save corresponding hidden states
                        hidden_states_beam = hidden_states_beam[:,
                                                                beam_numbers, :]

                    # it may happen that no end symbol <E> is predicted for a token in all of the max_length iterations
                    # in that case we append one of the final sequences without end symbol to the final_candidates
                    best_scores, indices = scores.view(number_tokens,
                                                       -1).topk(1, 1)

                    for j, (score, index) in enumerate(
                            zip(best_scores.squeeze(1), indices.squeeze(1))):
                        if len(final_candidates[j]) == 0:
                            beam = j * self.beam_size + index.item()
                            final_candidates[j].append(
                                (sequences[beam, :],
                                 score.item() / max_length))

                    # get best final hypothesis for each token
                    output_sequences = []
                    for candidate in final_candidates:
                        l_ordered = sorted(candidate,
                                           key=lambda tup: tup[1],
                                           reverse=True)
                        output_sequences.append(l_ordered[0])

                    # get characters from index sequences and add predicted label to token
                    for i, out_seq in enumerate(output_sequences):
                        predicted_lemma = ""
                        for idx in out_seq[0]:
                            predicted_lemma += self.char_dictionary.get_item_for_index(
                                idx)
                        line_to_print += predicted_lemma
                        line_to_print += " "
                        tokens_in_batch[i].add_tag(tag_type=label_name,
                                                   tag_value=predicted_lemma)

                if return_loss:
                    overall_loss += self.forward_loss(batch)[0].item()

                store_embeddings(batch, storage_mode=embedding_storage_mode)

            if verbose:
                log.info(line_to_print)

            if return_loss:
                return overall_loss, number_tokens_in_total
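# --- Hedged usage sketch (added illustration, not part of the original source) ---
# Calling the lemmatizer predict() above. `lemmatizer` is assumed to be a trained
# character-level model with this signature; lemmas are stored on each token under the
# default label name "predicted".
def example_lemmatizer_predict(lemmatizer, text):
    from flair.data import Sentence

    sentence = Sentence(text)
    lemmatizer.predict([sentence], mini_batch_size=16, verbose=True)
    return [(token.text, [label.value for label in token.get_labels("predicted")]) for token in sentence]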
# Example 13
    def evaluate(
        self,
        data_points: Union[List[DataPair[DT, DT2]], Dataset],
        gold_label_type: str,
        out_path: Union[str, Path] = None,
        embedding_storage_mode="none",
        mini_batch_size=32,
        num_workers: Optional[int] = 8,
        **kwargs,
    ) -> Result:
        # assumes that for each data pair there's at least one embedding per modality
        if not isinstance(data_points, Dataset):
            data_points = FlairDatapointDataset(data_points)

        data_loader = DataLoader(data_points, batch_size=mini_batch_size, num_workers=num_workers)

        with torch.no_grad():
            # pre-compute embeddings for all targets in evaluation dataset
            target_index: Dict[str, int] = {}
            all_target_embeddings_list = []
            for batch in data_loader:
                target_inputs = []
                for data_point in batch:
                    if str(data_point.second) not in target_index:
                        target_index[str(data_point.second)] = len(target_index)
                        target_inputs.append(data_point)
                if target_inputs:
                    all_target_embeddings_list.append(self._embed_target(target_inputs).to(self.eval_device))
                store_embeddings(data_points, embedding_storage_mode)
            all_target_embeddings = torch.cat(all_target_embeddings_list, dim=0)  # [n0, d0]
            assert len(target_index) == all_target_embeddings.shape[0]

            ranks = []
            for batch in data_loader:
                batch_embeddings = self._embed_source(batch)

                batch_source_embeddings = batch_embeddings.to(self.eval_device)
                # compute the similarity
                batch_similarity_matrix = self.similarity_measure.forward(
                    [batch_source_embeddings, all_target_embeddings]
                )

                # sort the similarity matrix across modality 1
                batch_modality_1_argsort = torch.argsort(batch_similarity_matrix, descending=True, dim=1)

                # get the ranks, so +1 to start counting ranks from 1
                batch_modality_1_ranks = torch.argsort(batch_modality_1_argsort, dim=1) + 1

                batch_target_indices = [target_index[str(data_point.second)] for data_point in batch]

                batch_gt_ranks = batch_modality_1_ranks[
                    torch.arange(batch_similarity_matrix.shape[0]),
                    torch.tensor(batch_target_indices),
                ]
                ranks.extend(batch_gt_ranks.tolist())

                store_embeddings(data_points, embedding_storage_mode)

        ranks_arr = np.array(ranks)
        median_rank = np.median(ranks_arr)
        recall_at = {k: np.mean(ranks_arr <= k) for k in self.recall_at_points}

        results_header = ["Median rank"] + ["Recall@top" + str(r) for r in self.recall_at_points]
        results_header_str = "\t".join(results_header)
        epoch_results = [str(median_rank)] + [str(recall_at[k]) for k in self.recall_at_points]
        epoch_results_str = "\t".join(epoch_results)
        detailed_results = ", ".join([f"{h}={v}" for h, v in zip(results_header, epoch_results)])

        validated_measure = sum(
            [recall_at[r] * w for r, w in zip(self.recall_at_points, self.recall_at_points_weights)]
        )

        return Result(
            validated_measure,
            results_header_str,
            epoch_results_str,
            detailed_results,
            loss=0.0,
        )
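# --- Hedged illustration (added, not part of the original source) ---
# The ranking metrics used in evaluate() above, reduced to plain numpy: given the rank of
# the gold target among all candidates for each query, compute the median rank and Recall@k.
def example_rank_metrics(ranks, recall_at_points=(1, 5, 10)):
    import numpy as np

    ranks_arr = np.array(ranks)
    median_rank = np.median(ranks_arr)
    recall_at = {k: float(np.mean(ranks_arr <= k)) for k in recall_at_points}
    return median_rank, recall_at

# e.g. ranks [1, 3, 12, 2] -> median 2.5, Recall@top1 = 0.25, Recall@top5 = 0.75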
# Example 14
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
        label_threshold: float = 0.5,
        multi_label: Optional[bool] = None,
    ):
        """
        Predicts labels for the given sentence(s) and adds them directly to the sentences.
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the mini batch; larger values are usually faster but consume more memory,
        up to a point where increasing it has no further effect.
        :param return_probabilities_for_all_classes: set to True to compute the score for every label,
        otherwise only the score of the best label is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.get_current_label_type()

        if multi_label is None:
            multi_label = self.is_current_task_multi_label()

        # with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set():
                continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence:
                previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        reordered_sentences = sorted(sentences,
                                     key=lambda s: len(s),
                                     reverse=True)

        dataloader = DataLoader(
            dataset=FlairDatapointDataset(reordered_sentences),
            batch_size=mini_batch_size,
        )

        # progress bar for verbosity
        if verbose:
            progressbar = tqdm(dataloader)
            progressbar.set_description("Batch inference")
            dataloader = progressbar

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    sentence.remove_labels(label_name)

                    all_labels = [
                        label.decode("utf-8") for label in
                        self.get_current_label_dictionary().idx2item
                    ]

                    best_label = None
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(
                            label, sentence)

                        loss_and_count = self.tars_model.predict(
                            tars_sentence,
                            label_name=label_name,
                            return_loss=True,
                            return_probabilities_for_all_classes=True
                            if label_threshold < 0.5 else False,
                        )

                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        # add all labels that according to TARS match the text and are above threshold
                        for predicted_tars_label in tars_sentence.get_labels(
                                label_name):
                            if (predicted_tars_label.value == self.LABEL_MATCH
                                    and predicted_tars_label.score >
                                    label_threshold):
                                # do not add labels below confidence threshold
                                sentence.add_label(label_name, label,
                                                   predicted_tars_label.score)

                    # only use label with highest confidence if enforcing single-label predictions
                    if not multi_label:
                        if len(sentence.get_labels(label_name)) > 0:
                            # get all label scores and do an argmax to get the best label
                            label_scores = torch.tensor(
                                [
                                    label.score for label in
                                    sentence.get_labels(label_name)
                                ],
                                dtype=torch.float,
                            )
                            best_label = sentence.get_labels(label_name)[
                                torch.argmax(label_scores)]

                            # remove previously added labels and only add the best label
                            sentence.remove_labels(label_name)
                            sentence.add_label(
                                typename=label_name,
                                value=best_label.value,
                                score=best_label.score,
                            )

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
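
# A minimal usage sketch for the zero-shot classification predict() above.
# Hedged: the model name, task name and candidate labels below are illustrative
# assumptions, not taken from this code; the method requires an active TARS
# task so that get_current_label_dictionary() returns the candidate labels.

from flair.data import Sentence
from flair.models import TARSClassifier

tars = TARSClassifier.load("tars-base")  # assumed pretrained TARS model
tars.add_and_switch_to_new_task(
    "topic",                                          # hypothetical task name
    label_dictionary=["sports", "politics", "business"],
    label_type="topic",
)

sentence = Sentence("The market rallied after the earnings report.")
tars.predict(sentence, label_threshold=0.5, multi_label=False)
print(sentence.get_labels("topic"))
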
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        return_probabilities_for_all_classes: bool = False,
        verbose: bool = False,
        label_name: Optional[str] = None,
        return_loss=False,
        embedding_storage_mode="none",
        most_probable_first: bool = True,
    ):
        """
        Predict sequence tags for the Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the mini-batch; a larger batch is usually faster but consumes more memory,
        up to a point where it has no further effect.
        :param return_probabilities_for_all_classes: set to True to return a score for every class instead of
        only the best one
        :param most_probable_first: set to True to assign detected spans greedily, starting with the
        highest-scoring span
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to also return the loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none', which is usually best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        """
        if label_name is None:
            label_name = self.get_current_label_type()

        if not sentences:
            return sentences

        if not isinstance(sentences, list):
            sentences = [sentences]

        reordered_sentences = sorted(sentences,
                                     key=lambda s: len(s),
                                     reverse=True)

        dataloader = DataLoader(
            dataset=FlairDatapointDataset(reordered_sentences),
            batch_size=mini_batch_size,
        )

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        with torch.no_grad():
            for batch in dataloader:

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    for token in sentence:
                        token.remove_labels(label_name)

                    all_labels = [
                        label.decode("utf-8") for label in
                        self.get_current_label_dictionary().idx2item
                    ]

                    all_detected = {}
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(
                            label, sentence)

                        label_length = 0 if not self.prefix else len(
                            label.split(" ")) + len(self.separator.split(" "))

                        loss_and_count = self.tars_model.predict(
                            tars_sentence,
                            label_name=label_name,
                            return_loss=True,
                        )
                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        for span in tars_sentence.get_spans(label_name):
                            span.set_label("tars_temp_label", label)
                            all_detected[span] = span.score

                        if not most_probable_first:
                            for span in tars_sentence.get_spans(label_name):
                                for token in span:
                                    corresponding_token = sentence.get_token(
                                        token.idx - label_length)
                                    if corresponding_token is None:
                                        continue
                                    if (corresponding_token.get_tag(
                                            label_name).value != ""
                                            and corresponding_token.get_tag(
                                                label_name).score >
                                            token.get_tag(label_name).score):
                                        continue
                                    corresponding_token.add_tag(
                                        label_name,
                                        token.get_tag(label_name).value +
                                        label,
                                        token.get_tag(label_name).score,
                                    )

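                    # greedy assignment: process detected spans from highest to lowest
                    # score and skip a span if any of its tokens already carries a
                    # higher-scoring tag or falls outside the original sentence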
                    if most_probable_first:
                        import operator

                        sorted_spans = sorted(all_detected.items(),
                                              key=operator.itemgetter(1))
                        sorted_spans.reverse()
                        for span, _score in sorted_spans:
                            # get the span's candidate label
                            label = span.get_labels("tars_temp_label")[0].value
                            label_length = (0 if not self.prefix else
                                            len(label.split(" ")) +
                                            len(self.separator.split(" ")))

                            # determine whether tokens in this span already have a label
                            tag_this = True
                            for token in span:
                                corresponding_token = sentence.get_token(
                                    token.idx - label_length)
                                if corresponding_token is None:
                                    tag_this = False
                                    continue
                                if (corresponding_token.get_tag(
                                        label_name).value != ""
                                        and corresponding_token.get_tag(
                                            label_name).score >
                                        token.get_tag(label_name).score):
                                    tag_this = False
                                    continue

                            # only add if all tokens have no label
                            if tag_this:
                                for token in span:
                                    corresponding_token = sentence.get_token(
                                        token.idx - label_length)
                                    corresponding_token.add_tag(
                                        label_name,
                                        token.get_tag(label_name).value +
                                        label,
                                        token.get_tag(label_name).score,
                                    )

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
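
# A minimal usage sketch for the zero-shot span prediction predict() above.
# Hedged: the model name, task name and entity labels below are illustrative
# assumptions, not taken from this code; a TARS task defining the candidate
# entity labels is assumed to be active.

from flair.data import Sentence
from flair.models import TARSTagger

tagger = TARSTagger.load("tars-ner")  # assumed pretrained zero-shot NER model
tagger.add_and_switch_to_new_task(
    "award-entities",                  # hypothetical task name
    ["award", "ceremony", "host"],     # hypothetical candidate labels
    label_type="ner",
)

sentence = Sentence("The Oscars were hosted by Jimmy Kimmel.")
tagger.predict(sentence, most_probable_first=True)
for span in sentence.get_spans("ner"):
    print(span)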