Example no. 1
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                mini_batch_size=32) -> List[Sentence]:

        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences,
                             also_clear_word_embeddings=True)

            # make mini-batches
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            for batch in batches:
                scores, predicted_ids = self._predict_scores_batch(batch)
                all_tokens = []
                for sentence in batch:
                    all_tokens.extend(sentence.tokens)

                for (token, score,
                     predicted_id) in zip(all_tokens, scores, predicted_ids):
                    token: Token = token
                    # get the predicted tag
                    predicted_tag = self.tag_dictionary.get_item_for_index(
                        predicted_id)
                    token.add_tag(self.tag_type, predicted_tag, score)

            return sentences
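
The predict call above tags the Sentence objects in place and returns the same list. A minimal usage sketch, assuming a pretrained flair SequenceTagger from this era of the API (the model name 'ner' is illustrative):

from flair.data import Sentence
from flair.models import SequenceTagger

# assumption: 'ner' resolves to a downloadable pretrained tagger
tagger = SequenceTagger.load('ner')

sentence = Sentence('George Washington went to Washington .')
tagger.predict(sentence)                # tags are attached to the tokens in place
print(sentence.to_tagged_string())      # e.g. 'George <B-PER> Washington <E-PER> ...'
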
Example no. 2
    def _evaluate_sequence_tagger(model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = True,
                                  out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0
            batches = [sentences[x:x + eval_mini_batch_size] for x in range(0, len(sentences), eval_mini_batch_size)]

            metric = Metric('Evaluation')

            lines: List[str] = []
            for batch in batches:
                batch_no += 1

                tags, loss = model.forward_labels_and_loss(batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label('predicted', tag)

                        # append both to file for evaluation
                        eval_line = '{} {} {} {}\n'.format(token.text,
                                                           token.get_tag(model.tag_type).value, tag.value, tag.score)
                        lines.append(eval_line)
                    lines.append('\n')
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans(model.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [(tag.tag, str(tag)) for tag in sentence.get_spans('predicted')]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
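
The span-level bookkeeping above counts a predicted span as a true positive only when the same (tag, span) pair occurs among the gold spans; unmatched predictions become false positives and unmatched gold spans false negatives. A plain-Python illustration of that logic, using tuples and a Counter instead of flair's Span and Metric objects:

from collections import Counter

gold_spans = [('PER', 'George Washington'), ('LOC', 'Washington')]
pred_spans = [('PER', 'George Washington'), ('ORG', 'Washington')]

counts = Counter()
for tag, span in pred_spans:
    counts['tp' if (tag, span) in gold_spans else 'fp'] += 1
for tag, span in gold_spans:
    if (tag, span) not in pred_spans:
        counts['fn'] += 1

print(counts)  # Counter({'tp': 1, 'fp': 1, 'fn': 1})
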
Example no. 3
    def _evaluate_text_classifier(model: flair.nn.Model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = False) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [sentences[x:x + eval_mini_batch_size] for x in
                       range(0, len(sentences), eval_mini_batch_size)]

            metric = Metric('Evaluation')

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                predictions_for_batch = [[label.value for label in sent_labels] for sent_labels in labels]
                true_values_for_batch = [sentence.get_label_names() for sentence in batch]
                available_labels = model.label_dictionary.get_items()

                for predictions_for_sentence, true_values_for_sentence in zip(predictions_for_batch, true_values_for_batch):
                    ModelTrainer._evaluate_sentence_for_text_classification(metric,
                                                                            available_labels,
                                                                            predictions_for_sentence,
                                                                            true_values_for_sentence)

            eval_loss /= len(sentences)

            return metric, eval_loss
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                mini_batch_size=32,
                verbose=False) -> List[Sentence]:
        with torch.no_grad():
            if isinstance(sentences, Sentence):
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences,
                             also_clear_word_embeddings=True)

            # reverse-sort all sequences by their length
            filtered_sentences.sort(key=lambda x: len(x), reverse=True)

            # make mini-batches
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            # progress bar for verbosity
            if verbose:
                batches = tqdm(batches)

            for i, batch in enumerate(batches):

                if verbose:
                    batches.set_description(f'Inferencing on batch {i}')

                tags, _ = self.forward_labels_and_loss(batch, sort=False)

                # for (sentence, sent_tags) in zip(batch, tags):
                #     for (token, tag) in zip(sentence.tokens, sent_tags):
                #         token: Token = token
                #         for tag_type in self.tag_types:
                #             token.add_tag_label(tag_type, tag)

                # for b in range(len(batch)):
                #     sentence = batch[b]

                for (sentence, sent_tags) in zip(batch, tags):
                    for s in range(len(sentence.tokens)):

                        token: Token = sentence.tokens[s]

                        for t in range(len(self.tag_types)):

                            token.add_tag_label(self.tag_types[t],
                                                sent_tags[t][s])

                    # for (token, tag) in zip(sentence.tokens, sent_tags):
                    #     for tag_type in self.tag_types:

                # clearing token embeddings to save memory
                clear_embeddings(batch, also_clear_word_embeddings=True)

            return sentences
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                mini_batch_size=32) -> List[Sentence]:
        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences,
                             also_clear_word_embeddings=True)

            # make mini-batches
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            for batch in batches:
                tags, _ = self.forward_labels_and_loss(batch)

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label(self.tag_type, tag)

            return sentences
Example no. 6
    def predict(self, sentences: Union[List[Sentence], Sentence], mini_batch_size=32) -> List[Sentence]:

        if type(sentences) is Sentence:
            sentences = [sentences]

        # remove previous embeddings
        clear_embeddings(sentences)

        # make mini-batches
        batches = [sentences[x:x + mini_batch_size] for x in range(0, len(sentences), mini_batch_size)]

        for batch in batches:
            score, tag_seq = self._predict_scores_batch(batch)
            predicted_id = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, pred_id) in zip(all_tokens, predicted_id):
                token: Token = token
                # get the predicted tag
                predicted_tag = self.tag_dictionary.get_item_for_index(pred_id)
                token.add_tag(self.tag_type, predicted_tag)

        return sentences
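
All of these methods chunk the input with the same list-slicing idiom. A standalone sketch of just that step:

# split a list into consecutive chunks of at most mini_batch_size elements
sentences = list(range(10))
mini_batch_size = 4
batches = [sentences[x:x + mini_batch_size]
           for x in range(0, len(sentences), mini_batch_size)]
print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
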
Example no. 7
    def _evaluate_text_classifier(model: flair.nn.Model,
                                  sentences: List[Sentence],
                                  eval_mini_batch_size: int = 32,
                                  embeddings_in_memory: bool = False,
                                  out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            lines: List[str] = []
            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = model.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch, confidences_for_batch,
                        predictions_for_batch, true_values_for_batch):
                    eval_line = '{}\t{}\t{}\t{}\n'.format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):
                    ModelTrainer._evaluate_sentence_for_text_classification(
                        metric, available_labels, predictions_for_sentence,
                        true_values_for_sentence)

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
    def predict(
        self,
        sentences: Union[Sentence, List[Sentence]],
        mini_batch_size: int = 32,
        multi_class_prob: bool = False,
    ) -> List[Sentence]:
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param multi_class_prob: if True, return the probability for every class (multi-class probabilities)
        :return: the list of sentences containing the labels
        """
        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            for batch in batches:
                scores = self.forward(batch)
                predicted_labels = self._obtain_labels(
                    scores, predict_prob=multi_class_prob)

                for (sentence, labels) in zip(batch, predicted_labels):
                    sentence.labels = labels

                clear_embeddings(batch)

            return sentences
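
A short usage sketch for the classifier predict above, assuming a trained flair TextClassifier (the 'en-sentiment' identifier is illustrative):

from flair.data import Sentence
from flair.models import TextClassifier

# assumption: 'en-sentiment' resolves to a downloadable pretrained classifier
classifier = TextClassifier.load('en-sentiment')

sentence = Sentence('The movie was surprisingly good .')
classifier.predict(sentence)
for label in sentence.labels:           # labels were attached in place
    print(label.value, label.score)
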
Example no. 9
 def predict(self, sentences, mini_batch_size=32, verbose=False):
     with torch.no_grad():
         if isinstance(sentences, Sentence):
             sentences = [sentences]
         filtered_sentences = self._filter_empty_sentences(sentences)
         clear_embeddings(filtered_sentences,
                          also_clear_word_embeddings=True)
         filtered_sentences.sort(key=(lambda x: len(x)), reverse=True)
         batches = [
             filtered_sentences[x:(x + mini_batch_size)]
             for x in range(0, len(filtered_sentences), mini_batch_size)
         ]
         if verbose:
             batches = tqdm(batches)
         for (i, batch) in enumerate(batches):
             if verbose:
                 batches.set_description('Inferencing on batch {}'.format(i))
             (tags, _) = self.forward_labels_and_loss(batch, sort=False)
             for (sentence, sent_tags) in zip(batch, tags):
                 for (token, tag) in zip(sentence.tokens, sent_tags):
                     token: Token = token
                     token.add_tag_label(self.tag_type, tag)
             clear_embeddings(batch, also_clear_word_embeddings=True)
         return sentences
Example no. 10
    def predict(self,
                sentences: Union[Sentence, List[Sentence]],
                mini_batch_size: int = 32) -> List[Sentence]:
        """
        Predicts the class labels for the given sentences. The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :return: the list of sentences containing the labels
        """
        if type(sentences) is Sentence:
            sentences = [sentences]

        filtered_sentences = self._filter_empty_sentences(sentences)

        batches = [
            filtered_sentences[x:x + mini_batch_size]
            for x in range(0, len(filtered_sentences), mini_batch_size)
        ]

        for batch in batches:
            scores = self.forward(batch)
            predicted_labels = self.obtain_labels(scores)

            for (sentence, labels) in zip(batch, predicted_labels):
                sentence.labels = labels

            clear_embeddings(batch)

        return sentences
    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = False,
                 metric_name: str = 'MICRO_AVG') -> (dict, float):
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param eval_class_metrics: boolean indicating whether to print class metrics or not
        :param mini_batch_size: the mini batch size to use
        :param embeddings_in_memory: boolean indicating whether embeddings should be kept in memory
        :param metric_name: the name of the metrics to compute
        :return: list of metrics, and the loss
        """
        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + mini_batch_size]
                for x in range(0, len(sentences), mini_batch_size)
            ]

            metric = Metric(metric_name)

            for batch in batches:
                scores = self.model.forward(batch)
                labels = self.model.obtain_labels(scores)
                loss = self.model.calculate_loss(scores, batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.tp()
                            if eval_class_metrics: metric.tp(prediction)
                        else:
                            metric.fp()
                            if eval_class_metrics: metric.fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.fn()
                            if eval_class_metrics: metric.fn(true_value)
                        else:
                            metric.tn()
                            if eval_class_metrics: metric.tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
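
The Metric object accumulates tp/fp/fn (and tn) counts; micro-averaged precision, recall and F1 then follow from the usual formulas, which flair's Metric computes internally. A quick standalone reference:

# micro-averaged precision/recall/F1 from accumulated counts
def prf(tp, fp, fn):
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(prf(tp=8, fp=2, fn=4))  # roughly (0.8, 0.667, 0.727)
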
Example no. 12
    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = False) -> (dict, float):
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param eval_class_metrics: boolean indicating whether to print class metrics or not
        :param mini_batch_size: the mini batch size to use
        :param embeddings_in_memory: boolean indicating whether embeddings should be kept in memory
        :return: list of metrics, and the loss
        """
        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + mini_batch_size]
                for x in range(0, len(sentences), mini_batch_size)
            ]

            y_pred = []
            y_true = []

            for batch in batches:
                scores = self.model.forward(batch)
                labels = self.model.obtain_labels(scores)
                loss = self.model.calculate_loss(scores, batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                y_pred.extend(
                    convert_labels_to_one_hot(
                        [[label.value for label in sent_labels]
                         for sent_labels in labels], self.label_dict))
                y_true.extend(
                    convert_labels_to_one_hot(
                        [sentence.get_label_names() for sentence in batch],
                        self.label_dict))

            metrics = [
                calculate_micro_avg_metric(y_true, y_pred, self.label_dict)
            ]
            if eval_class_metrics:
                metrics.extend(
                    calculate_class_metrics(y_true, y_pred, self.label_dict))

            eval_loss /= len(sentences)

            metrics_dict = {metric.name: metric for metric in metrics}

            return metrics_dict, eval_loss
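
This variant scores predictions via one-hot label vectors rather than a Metric object. The conversion helper is not shown here; the idea is simply an indicator vector per sentence over the label dictionary. An illustrative re-implementation (not flair's own convert_labels_to_one_hot):

def to_one_hot(label_lists, label_order):
    # one indicator vector per sentence, ordered by label_order
    return [[1 if label in labels else 0 for label in label_order]
            for labels in label_lists]

label_order = ['POSITIVE', 'NEGATIVE', 'NEUTRAL']
print(to_one_hot([['POSITIVE'], ['NEGATIVE', 'NEUTRAL']], label_order))
# [[1, 0, 0], [0, 1, 1]]
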
Example no. 13
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size=32,
        verbose=False,
        clear_word_embeddings=True,
    ) -> List[Sentence]:
        with torch.no_grad():
            if isinstance(sentences, Sentence):
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences,
                             also_clear_word_embeddings=True)

            # reverse-sort all sequences by their length
            filtered_sentences.sort(key=lambda x: len(x), reverse=True)

            # make mini-batches
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            # progress bar for verbosity
            if verbose:
                batches = tqdm(batches)

            for i, batch in enumerate(batches):

                if verbose:
                    batches.set_description(f"Inferencing on batch {i}")

                with torch.no_grad():
                    feature = self.forward(batch)
                    tags, all_tags = self._obtain_labels(feature, batch)

                for (sentence, sent_tags,
                     sent_all_tags) in zip(batch, tags, all_tags):
                    for (token, tag,
                         token_all_tags) in zip(sentence.tokens, sent_tags,
                                                sent_all_tags):
                        token.add_tag_label(self.tag_type, tag)
                        token.add_tags_proba_dist(self.tag_type,
                                                  token_all_tags)

                # clearing token embeddings to save memory
                clear_embeddings(
                    batch, also_clear_word_embeddings=clear_word_embeddings)

            return sentences
Example no. 14
    def predict(self,
                sentences: Union[List[Sentence], Sentence],
                mini_batch_size=32,
                verbose=False) -> List[Sentence]:
        """
        Apply the model to a collection of Sentence objects.
        :param sentences: Sentence objects for which the model predicts the tags
        :param mini_batch_size: number of Sentences in the mini-batch used for prediction
        :param verbose: if True, display a progress bar over the mini-batches
        :return: list of Sentence objects tagged according to the model
        """
        with torch.no_grad():  # forbid the use of back-prop
            if isinstance(sentences, Sentence):
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences,
                             also_clear_word_embeddings=True)

            # reverse-sort all sequences by their length
            filtered_sentences.sort(key=lambda x: len(x), reverse=True)

            # make mini-batches
            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            # progress bar for verbosity
            if verbose:
                batches = tqdm(batches)

            for i, batch in enumerate(batches):

                if verbose:
                    batches.set_description(f'Inferencing on batch {i}')

                tags, _ = self.forward_labels_and_loss(batch, sort=False)

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label(self.tag_type, tag)

                # clearing token embeddings to save memory
                clear_embeddings(batch, also_clear_word_embeddings=True)

            return sentences
    def _forward_loss_stability(self,
                                sentences: Union[List[Sentence], Sentence],
                                alpha: float,
                                misspelling_rate: float,
                                char_vocab: dict,
                                lut: dict = {},
                                cmx: np.array = None,
                                embeddings_in_memory: bool = True,
                                verbose: bool = False) -> (torch.tensor, dict):
        """
        stability objective for classification -> KL divergence (see Zheng 2016 Eq.10)
        L_stab(x,x') = -sum_j(P(yj|x)*log(P(yj|x')))
        The output loss is the sum of the standard loss and the stability objective.
        """

        misspelled_sentences, _ = noise_sentences(sentences,
                                                  self.misspell_mode,
                                                  misspelling_rate, char_vocab,
                                                  cmx, lut, {}, verbose)
        clear_embeddings(misspelled_sentences, also_clear_word_embeddings=True)

        embeddings, lengths = self._embed_sentences(sentences)
        embeddings_misspell, lengths_misspell = self._embed_sentences(
            misspelled_sentences)

        if not check_embeddings(sentences, misspelled_sentences, embeddings,
                                embeddings_misspell):
            log.warning(
                "WARNING: embedding of the misspelled text may be invalid!")

        outputs_base, features_base = self._forward(embeddings, lengths)
        outputs_misspell, features_misspell = self._forward(
            embeddings_misspell, lengths_misspell)

        loss_base = self._calculate_loss(outputs_base, sentences)

        target_distrib = F.softmax(outputs_base, dim=2).transpose(1,
                                                                  2).detach()
        input_log_distrib = F.log_softmax(outputs_misspell,
                                          dim=2).transpose(1, 2)
        loss_stability = alpha * F.kl_div(input_log_distrib,
                                          target_distrib,
                                          reduction='none').transpose(2, 1)
        loss_sum = get_masked_sum(loss_stability, lengths)
        loss_mean = get_per_batch_mean(loss_sum, lengths)
        # log.info(f"loss_base: {loss_base.item():.4f} loss_stability: {loss_mean.item():.4f}")

        auxilary_losses = {'loss_base': loss_base, 'loss_kldiv': loss_mean}
        return (loss_base + loss_mean), auxilary_losses
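
The stability term is a KL divergence between the tag distributions produced on the clean and the perturbed input, with the clean distribution detached so only the perturbed branch receives gradients. A minimal standalone sketch of that computation on toy logits (no length masking; the get_masked_sum / get_per_batch_mean helpers are not reproduced here):

import torch
import torch.nn.functional as F

# toy logits: batch of 2 sequences, 3 time steps, 4 tags
logits_clean = torch.randn(2, 3, 4)
logits_noisy = torch.randn(2, 3, 4)

target_distrib = F.softmax(logits_clean, dim=-1).detach()   # P(y|x), no gradient
input_log_distrib = F.log_softmax(logits_noisy, dim=-1)     # log P(y|x')

# KL(P(y|x) || P(y|x')) per token, summed over tags, then averaged
loss_stability = F.kl_div(input_log_distrib, target_distrib,
                          reduction='none').sum(dim=-1).mean()
print(loss_stability.item())
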
Example no. 16
    def predict(self, sentences: Union[List[Sentence], Sentence],
                mini_batch_size=32, verbose=False, return_featmat=False) -> List[Sentence]:
        with torch.no_grad():
            if isinstance(sentences, Sentence):
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            # remove previous embeddings
            clear_embeddings(filtered_sentences, also_clear_word_embeddings=True)

            # reverse-sort all sequences by their length
            if not return_featmat:
                filtered_sentences.sort(key=lambda x: len(x), reverse=True)

            # make mini-batches
            batches = [filtered_sentences[x:x + mini_batch_size] for x in
                       range(0, len(filtered_sentences), mini_batch_size)]

            # progress bar for verbosity
            if verbose:
                batches = tqdm(batches)

            all_features = []  # AZ

            for i, batch in enumerate(batches):

                if verbose:
                    batches.set_description(f'Inferencing on batch {i}')

                if not return_featmat:  # [AZ]
                    tags, _ = self.forward_labels_and_loss(batch, sort=False)
                else:
                    tags, _, feature_matrix = self.forward_labels_and_loss_feature(batch, sort=False)  #  [AZ]
                    all_features.append(feature_matrix)

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label(self.tag_type, tag)

                # clearing token embeddings to save memory
                clear_embeddings(batch, also_clear_word_embeddings=True)


            if return_featmat:  #[AZ]
                return sentences, all_features
            else:
                return sentences
Example no. 17
    def _evaluate_text_regressor(model: flair.nn.Model,
                                 sentences: List[Sentence],
                                 eval_mini_batch_size: int = 32,
                                 embeddings_in_memory: bool = False,
                                 out_path: Path = None) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = MetricRegression('Evaluation')

            lines: List[str] = []
            for batch in batches:

                scores, loss = model.forward_labels_and_loss(batch)

                true_values = []
                for sentence in batch:
                    for label in sentence.labels:
                        true_values.append(float(label.value))

                results = []
                for score in scores:
                    if type(score[0]) is Label:
                        results.append(float(score[0].score))
                    else:
                        results.append(float(score[0]))

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                metric.true.extend(true_values)
                metric.pred.extend(results)

            eval_loss /= len(sentences)

            # TODO: per-sentence lines are not collected yet, so the output file will be empty
            if out_path is not None:
                with open(out_path, "w", encoding='utf-8') as outfile:
                    outfile.write(''.join(lines))

            return metric, eval_loss
    def evaluate(self,
                 sentences: List[Sentence],
                 eval_class_metrics: bool = False,
                 mini_batch_size: int = 32,
                 embeddings_in_memory: bool = True) -> (dict, float):
        """
        Evaluates the model with the given list of sentences.
        :param sentences: the list of sentences
        :param mini_batch_size: the mini batch size to use
        :return: list of metrics, and the loss
        """
        eval_loss = 0

        batches = [
            sentences[x:x + mini_batch_size]
            for x in range(0, len(sentences), mini_batch_size)
        ]

        y_pred = []
        y_true = []

        for batch in batches:
            scores = self.model.forward(batch)
            labels = self.model.obtain_labels(scores)
            loss = self.model.calculate_loss(scores, batch)

            eval_loss += loss

            y_true.extend([sentence.get_label_names() for sentence in batch])
            y_pred.extend([[label.name for label in sent_labels]
                           for sent_labels in labels])

            if not embeddings_in_memory:
                clear_embeddings(batch)

        y_pred = convert_labels_to_one_hot(y_pred, self.label_dict)
        y_true = convert_labels_to_one_hot(y_true, self.label_dict)

        metrics = [calculate_micro_avg_metric(y_true, y_pred, self.label_dict)]
        if eval_class_metrics:
            metrics.extend(
                calculate_class_metrics(y_true, y_pred, self.label_dict))

        eval_loss /= len(sentences)

        metrics_dict = {metric.name: metric for metric in metrics}

        return metrics_dict, eval_loss
    def _forward_standard(self, sentences: List[Sentence], spell_check=None):

        # self.zero_grad()
        if spell_check is not None:
            from robust_ner.spellcheck import correct_sentences
            corrected_sentences = correct_sentences(spell_check, sentences)
            clear_embeddings(corrected_sentences,
                             also_clear_word_embeddings=True)

            embeddings, lengths = self._embed_sentences(corrected_sentences)
        else:
            embeddings, lengths = self._embed_sentences(sentences)

        outputs, _ = self._forward(embeddings, lengths)

        return outputs
 def predict(self, sentences, mini_batch_size=32):
     """
     Predicts the class labels for the given sentences. The labels are directly added to the sentences.
     :param sentences: list of sentences
     :param mini_batch_size: mini batch size to use
     :return: the list of sentences containing the labels
     """
     with torch.no_grad():
         if (type(sentences) is Sentence):
             sentences = [sentences]
         filtered_sentences = self._filter_empty_sentences(sentences)
         batches = [
             filtered_sentences[x:(x + mini_batch_size)]
             for x in range(0, len(filtered_sentences), mini_batch_size)
         ]
         for batch in batches:
             scores = self.forward(batch)
             predicted_labels = self._obtain_labels(scores)
             for (sentence, labels) in zip(batch, predicted_labels):
                 sentence.labels = labels
             clear_embeddings(batch)
         return sentences
    def _forward_loss_data_augmentation(
            self,
            sentences: Union[List[Sentence], Sentence],
            alpha: float,
            misspelling_rate: float,
            char_vocab: dict,
            lut: dict = {},
            cmx: np.array = None,
            embeddings_in_memory: bool = True,
            verbose: bool = False) -> (torch.tensor, dict):
        """
        Data augmentation objective. Returns the auxiliary loss as the sum of standard objectives calculated on the
        original and the perturbed samples.
        """

        misspelled_sentences, _ = noise_sentences(sentences,
                                                  self.misspell_mode,
                                                  misspelling_rate, char_vocab,
                                                  cmx, lut, {}, verbose)
        clear_embeddings(misspelled_sentences, also_clear_word_embeddings=True)

        embeddings, lengths = self._embed_sentences(sentences)
        embeddings_misspell, lengths_misspell = self._embed_sentences(
            misspelled_sentences)

        if not check_embeddings(sentences, misspelled_sentences, embeddings,
                                embeddings_misspell):
            log.warning(
                "WARNING: embedding of the misspelled text may be invalid!")

        outputs_base, _ = self._forward(embeddings, lengths)
        outputs_misspell, _ = self._forward(embeddings_misspell,
                                            lengths_misspell)

        loss_base = self._calculate_loss(outputs_base, sentences)
        loss_misspell = alpha * self._calculate_loss(outputs_misspell,
                                                     misspelled_sentences)

        auxilary_losses = {
            'loss_base': loss_base,
            'loss_misspell': loss_misspell
        }

        return (loss_base + loss_misspell), auxilary_losses
Example no. 22
    def _evaluate_text_classifier(
            model: flair.nn.Model,
            sentences: List[Sentence],
            eval_mini_batch_size: int = 32,
            embeddings_in_memory: bool = False) -> (dict, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                for predictions, true_values in zip(
                    [[label.value for label in sent_labels]
                     for sent_labels in labels],
                    [sentence.get_label_names() for sentence in batch]):
                    for prediction in predictions:
                        if prediction in true_values:
                            metric.add_tp(prediction)
                        else:
                            metric.add_fp(prediction)

                    for true_value in true_values:
                        if true_value not in predictions:
                            metric.add_fn(true_value)
                        else:
                            metric.add_tn(true_value)

            eval_loss /= len(sentences)

            return metric, eval_loss
    def _forward_misspelled(self,
                            sentences: Union[List[Sentence], Sentence],
                            misspelling_rate: float,
                            misspell_mode: MisspellingMode,
                            char_vocab: set,
                            cmx: np.array,
                            lut: dict,
                            typos: dict,
                            spell_check=None,
                            verbose: bool = False) -> (torch.tensor, dict):

        misspelled_sentences, _ = noise_sentences(sentences, misspell_mode,
                                                  misspelling_rate, char_vocab,
                                                  cmx, lut, typos, verbose)
        clear_embeddings(misspelled_sentences, also_clear_word_embeddings=True)

        outputs_misspell = self._forward_standard(misspelled_sentences,
                                                  spell_check)

        return outputs_misspell
Example no. 24
def evaluate_sequence_tagger(
    model: SequenceTagger,
    sentences: List[Sentence],
    eval_mini_batch_size: int = 32,
    embeddings_in_memory: bool = True,
) -> (dict, float):
    with torch.no_grad():
        eval_loss = 0

        batch_no: int = 0
        batches = [
            sentences[x:x + eval_mini_batch_size]
            for x in range(0, len(sentences), eval_mini_batch_size)
        ]

        gold_seqs = []
        pred_seqs = []

        for batch in batches:
            batch_no += 1

            features = model.forward(batch)
            loss = model._calculate_loss(features, batch)
            pred_tags = model._obtain_labels(features, batch)

            eval_loss += loss

            for (sentence, pred_sent_tags) in zip(batch, pred_tags):
                gold_tags = [tok.tags['ner'].value for tok in sentence]
                predicted_tags = [l.value for l in pred_sent_tags]
                gold_seqs.append(gold_tags)
                pred_seqs.append(predicted_tags)

            clear_embeddings(
                batch, also_clear_word_embeddings=not embeddings_in_memory)

        eval_loss /= len(sentences)

        scores = calc_seqtag_eval_scores(gold_seqs, pred_seqs)
        scores['eval-loss'] = eval_loss
        return scores
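
calc_seqtag_eval_scores is not shown in this example; one common way to score such gold/predicted BIO sequences is the seqeval package (an assumption about the implementation, not taken from the source):

from seqeval.metrics import classification_report, f1_score

gold_seqs = [['B-PER', 'I-PER', 'O', 'B-LOC']]
pred_seqs = [['B-PER', 'I-PER', 'O', 'O']]

print(f1_score(gold_seqs, pred_seqs))             # entity-level F1
print(classification_report(gold_seqs, pred_seqs))
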
Example no. 25
    def predict(self,
                sentences: Union[Sentence, List[Sentence]],
                mini_batch_size: int = 32) -> List[Sentence]:

        with torch.no_grad():
            if type(sentences) is Sentence:
                sentences = [sentences]

            filtered_sentences = self._filter_empty_sentences(sentences)

            batches = [
                filtered_sentences[x:x + mini_batch_size]
                for x in range(0, len(filtered_sentences), mini_batch_size)
            ]

            for batch in batches:
                scores = self.forward(batch)

                for (sentence, score) in zip(batch, scores.tolist()):
                    sentence.labels = [Label(value=str(score[0]))]

                clear_embeddings(batch)

            return sentences
Example no. 26
    def evaluate(
        self,
        sentences: List[Sentence],
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = False,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batches:

                labels, loss = self.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

            eval_loss /= len(sentences)

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
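
The per-label loop above implements a multi-label confusion count: a label counts as tp when it is both predicted and true, fp when only predicted, fn when only true, and tn otherwise. A plain-Python illustration:

available_labels = ['SPORTS', 'POLITICS', 'TECH']
predicted = {'SPORTS', 'TECH'}
true = {'SPORTS', 'POLITICS'}

for label in available_labels:
    outcome = ('tp' if label in predicted and label in true else
               'fp' if label in predicted else
               'fn' if label in true else 'tn')
    print(label, outcome)  # SPORTS tp, POLITICS fn, TECH fp
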
Example no. 27
from __future__ import absolute_import
    def train(self,
              base_path: str,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              max_epochs: int = 50,
              anneal_factor: float = 0.5,
              patience: int = 5,
              train_with_dev: bool = False,
              embeddings_in_memory: bool = False,
              checkpoint: bool = False,
              save_final_model: bool = True,
              anneal_with_restarts: bool = False,
              eval_on_train: bool = True):
        """
        Trains a text classification model using the training data of the corpus.
        :param base_path: the directory to which any results should be written to
        :param learning_rate: the learning rate
        :param mini_batch_size: the mini batch size
        :param max_epochs: the maximum number of epochs to train
        :param anneal_factor: learning rate will be decreased by this factor
        :param patience: number of 'bad' epochs before learning rate gets decreased
        :param train_with_dev: boolean indicating whether the dev data set should also be used for training
        :param embeddings_in_memory: boolean indicating whether embeddings should be kept in memory
        :param checkpoint: boolean indicating whether the model should be saved after every epoch
        :param save_final_model: boolean indicating whether the final model should be saved
        :param anneal_with_restarts: boolean indicating whether the best model should be reloaded once the learning
        rate changes
        :param eval_on_train: boolean indicating whether evaluation metrics should be calculated on the training data
        set
        """

        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
                .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                        Metric.tsv_header('TEST')))

        weight_extractor = WeightExtractor(base_path)

        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        anneal_mode = 'min' if train_with_dev else 'max'
        scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer,
                                                         factor=anneal_factor,
                                                         patience=patience,
                                                         mode=anneal_mode)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(max_epochs):
                log.info('-' * 100)

                bad_epochs = scheduler.num_bad_epochs
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts and \
                        os.path.exists(base_path + "/best-model.pt"):
                    log.info('Resetting to best model ...')
                    self.model.load_from_file(base_path + "/best-model.pt")

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.001:
                    log.info('Learning rate too small - quitting training!')
                    break

                if not self.test_mode:
                    random.shuffle(train_data)

                self.model.train()

                batches = [
                    self.corpus.train[x:x + mini_batch_size]
                    for x in range(0, len(self.corpus.train), mini_batch_size)
                ]

                current_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    scores = self.model.forward(batch)
                    loss = self.model.calculate_loss(scores, batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_sentences += len(batch)
                    current_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                                epoch + 1, batch_no, len(batches),
                                current_loss / seen_sentences))
                        iteration = epoch * len(batches) + batch_no
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

                current_loss /= len(train_data)

                self.model.eval()

                # if checkpoint is enable, save model at each epoch
                if checkpoint:
                    self.model.save(base_path + "/checkpoint.pt")

                log.info('-' * 100)
                log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                    epoch + 1, learning_rate, bad_epochs))

                dev_metric = train_metric = None
                dev_loss = '_'
                train_loss = current_loss

                if eval_on_train:
                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, embeddings_in_memory,
                        mini_batch_size)

                if not train_with_dev:
                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, embeddings_in_memory,
                        mini_batch_size)

                with open(loss_txt, 'a') as f:
                    train_metric_str = train_metric.to_tsv(
                    ) if train_metric is not None else Metric.to_empty_tsv()
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        epoch, datetime.datetime.now(), train_loss,
                        train_metric_str, dev_loss, dev_metric_str, '_',
                        Metric.to_empty_tsv()))

                # anneal against train loss if training with dev, otherwise anneal against dev score
                scheduler.step(
                    current_loss) if train_with_dev else scheduler.step(
                        dev_metric.f_score())

                current_score = dev_metric.f_score(
                ) if not train_with_dev else train_metric.f_score()

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev and current_score == scheduler.best:
                    self.model.save(base_path + "/best-model.pt")

            if save_final_model:
                self.model.save(base_path + "/final-model.pt")

            log.info('-' * 100)
            log.info('Testing using best model ...')

            self.model.eval()

            if os.path.exists(base_path + "/best-model.pt"):
                self.model = TextClassifier.load_from_file(base_path +
                                                           "/best-model.pt")

            test_metric, test_loss = self.evaluate(
                self.corpus.test,
                mini_batch_size=mini_batch_size,
                eval_class_metrics=True,
                embeddings_in_memory=embeddings_in_memory,
                metric_name='TEST')

            test_metric.print()
            self.model.train()

            log.info('-' * 100)

        except KeyboardInterrupt:
            log.info('-' * 100)
            log.info('Exiting from training early.')
            log.info('Saving model ...')
            with open(base_path + "/final-model.pt", 'wb') as model_save_file:
                torch.save(self.model, model_save_file, pickle_protocol=4)
                model_save_file.close()
            log.info('Done.')
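
The annealing scheme in this trainer relies on torch's ReduceLROnPlateau: after `patience` epochs without improvement of the monitored value, the learning rate is multiplied by `anneal_factor`. A minimal standalone illustration with a toy model and loss values:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2)

for epoch, loss in enumerate([1.0, 0.9, 0.9, 0.9, 0.9, 0.9]):
    scheduler.step(loss)  # the lr is halved once the loss stops improving
    print(epoch, optimizer.param_groups[0]['lr'])
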
Example no. 29
    def evaluate(
        self,
        sentences: Dataset,
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = True,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            batch_loader = torch.utils.data.DataLoader(
                sentences,
                batch_size=eval_mini_batch_size,
                shuffle=False,
                num_workers=4,
                collate_fn=list,
            )

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batch_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags = self._obtain_labels(features, batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory
                )

            eval_loss /= len(sentences)

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
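A minimal usage sketch for the evaluate() method above; the tagger and corpus objects and the output path are hypothetical, while the keyword arguments and the (Result, loss) return value follow the code itself.

    from pathlib import Path

    # `tagger` is assumed to be an instance of the class that defines evaluate(),
    # and `corpus.test` a dataset of Sentence objects
    result, eval_loss = tagger.evaluate(
        corpus.test,
        eval_mini_batch_size=32,
        embeddings_in_memory=False,
        out_path=Path("predictions.tsv"),
    )

    print(result.log_header)        # PRECISION / RECALL / F1 column names
    print(result.log_line)
    print(result.detailed_results)  # per-class tp/fp/fn/tn, precision, recall, accuracy, f1
    print(f"evaluation loss: {eval_loss}")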
Example no. 30
    def train(self,
              base_path: Union[Path, str],
              evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              eval_mini_batch_size: int = None,
              max_epochs: int = 100,
              anneal_factor: float = 0.5,
              patience: int = 3,
              anneal_against_train_loss: bool = True,
              train_with_dev: bool = False,
              monitor_train: bool = False,
              embeddings_in_memory: bool = True,
              checkpoint: bool = False,
              save_final_model: bool = True,
              anneal_with_restarts: bool = False,
              test_mode: bool = False,
              param_selection_mode: bool = False,
              **kwargs) -> dict:

        if eval_mini_batch_size is None:
            eval_mini_batch_size = mini_batch_size

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        add_file_handler(log, base_path / 'training.log')

        log_line(log)
        log.info(f'Evaluation method: {evaluation_metric.name}')

        if not param_selection_mode:
            loss_txt = init_output_file(base_path, 'loss.tsv')
            with open(loss_txt, 'a') as f:
                f.write(
                    f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}\tDEV_LOSS\t{Metric.tsv_header("DEV")}'
                    f'\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n')

            weight_extractor = WeightExtractor(base_path)

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=learning_rate,
                                   **kwargs)
        if self.optimizer_state is not None:
            optimizer.load_state_dict(self.optimizer_state)

        # annealing scheduler
        anneal_mode = 'min' if anneal_against_train_loss else 'max'
        if isinstance(optimizer, (AdamW, SGDW)):
            scheduler = ReduceLRWDOnPlateau(optimizer,
                                            factor=anneal_factor,
                                            patience=patience,
                                            mode=anneal_mode,
                                            verbose=True)
        else:
            scheduler = ReduceLROnPlateau(optimizer,
                                          factor=anneal_factor,
                                          patience=patience,
                                          mode=anneal_mode,
                                          verbose=True)
        if self.scheduler_state is not None:
            scheduler.load_state_dict(self.scheduler_state)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(0 + self.epoch, max_epochs + self.epoch):
                log_line(log)

                try:
                    bad_epochs = scheduler.num_bad_epochs
                except AttributeError:
                    # not every scheduler exposes num_bad_epochs
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts and \
                        (base_path / 'best-model.pt').exists():
                    log.info('resetting to best model')
                    self.model.load_from_file(base_path / 'best-model.pt')

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.0001:
                    log_line(log)
                    log.info('learning rate too small - quitting training!')
                    log_line(log)
                    break

                if not test_mode:
                    random.shuffle(train_data)

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                self.model.train()

                train_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    loss = self.model.forward_loss(batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_sentences += len(batch)
                    train_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss '
                            f'{train_loss / seen_sentences:.8f}')
                        iteration = epoch * len(batches) + batch_no
                        if not param_selection_mode:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= len(train_data)

                self.model.eval()

                log_line(log)
                log.info(
                    f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}'
                )

                dev_metric = None
                dev_loss = '_'

                train_metric = None
                if monitor_train:
                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not train_with_dev:
                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not param_selection_mode:
                    test_metric, test_loss = self._calculate_evaluation_results_for(
                        'TEST', self.corpus.test, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size,
                        base_path / 'test.tsv')

                if not param_selection_mode:
                    with open(loss_txt, 'a') as f:
                        train_metric_str = train_metric.to_tsv() if train_metric is not None else Metric.to_empty_tsv()
                        dev_metric_str = dev_metric.to_tsv() if dev_metric is not None else Metric.to_empty_tsv()
                        test_metric_str = test_metric.to_tsv() if test_metric is not None else Metric.to_empty_tsv()
                        f.write(
                            f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t'
                            f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n'
                        )

                # calculate scores using dev data if available
                dev_score = 0.
                if not train_with_dev:
                    if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                        dev_score = dev_metric.macro_avg_accuracy()
                    elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                        dev_score = dev_metric.micro_avg_accuracy()
                    elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                        dev_score = dev_metric.macro_avg_f_score()
                    else:
                        dev_score = dev_metric.micro_avg_f_score()

                    # append dev score to score history
                    dev_score_history.append(dev_score)
                    dev_loss_history.append(dev_loss.item())

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss if anneal_against_train_loss else dev_score

                scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # if checkpointing is enabled, save the model after each epoch
                if checkpoint and not param_selection_mode:
                    self.model.save_checkpoint(base_path / 'checkpoint.pt',
                                               optimizer.state_dict(),
                                               scheduler.state_dict(),
                                               epoch + 1, train_loss)

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev and not param_selection_mode and current_score == scheduler.best:
                    self.model.save(base_path / 'best-model.pt')

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / 'final-model.pt')

        except KeyboardInterrupt:
            log_line(log)
            log.info('Exiting from training early.')
            if not param_selection_mode:
                log.info('Saving model ...')
                self.model.save(base_path / 'final-model.pt')
                log.info('Done.')

        # test best model on test data
        final_score = self.final_test(base_path, embeddings_in_memory,
                                      evaluation_metric, eval_mini_batch_size)

        return {
            'test_score': final_score,
            'dev_score_history': dev_score_history,
            'train_loss_history': train_loss_history,
            'dev_loss_history': dev_loss_history
        }
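A minimal sketch of how the train() method above might be called; the trainer object and the base path are assumptions, while the keyword arguments and the returned dictionary keys mirror the code itself.

    # `trainer` is assumed to be an object exposing the train() method above
    scores = trainer.train(
        "resources/taggers/example-ner",
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=100,
        anneal_factor=0.5,
        patience=3,
        train_with_dev=False,
        checkpoint=True,
        embeddings_in_memory=False,
    )

    # the returned dict holds the final test score and the per-epoch histories
    print(scores['test_score'])
    print(scores['dev_score_history'])
    print(scores['train_loss_history'])
    print(scores['dev_loss_history'])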