def final_test(self,
                   base_path: Path,
                   eval_mini_batch_size: int,
                   num_workers: int = 8):

        log_line(log)
        log.info("Testing using best model ...")

        final_score = 0.0
        for id, self.model in self.models.items():
            self.model.eval()

            if (base_path / f"{id}-best-model.pt").exists():
                self.model = self.model.load(base_path / f"{id}-best-model.pt")

            test_results, test_loss = self.model.evaluate(
                DataLoader(
                    self.corpus.get_test(id),
                    batch_size=eval_mini_batch_size,
                    num_workers=num_workers,
                ),
                out_path=base_path / f"{id}-test.tsv",
                embedding_storage_mode="none",
            )

            test_results: Result = test_results
            log.info(test_results.log_line)
            log.info(test_results.detailed_results)
            log_line(log)

            # get and return the final test score of best model
            final_score += test_results.main_score

        return final_score / len(self.models)
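
For orientation, a minimal call sketch for this multi-model variant, assuming a trainer that keeps one model per task in a `models` dict and a corpus exposing `get_test(task_id)` (both taken from the snippet above); the wrapper class, task ids and model objects are hypothetical:

from pathlib import Path

# Hypothetical setup matching the attributes used by final_test() above:
# self.models is a dict of task id -> model, self.corpus.get_test(id) returns
# that task's test split.
trainer = MultitaskTrainer(models={"ner": ner_model, "upos": upos_model},
                           corpus=multitask_corpus)

# final_test() reloads each task's best checkpoint, evaluates it on that task's
# test split and returns the average main_score over all tasks.
avg_score = trainer.final_test(Path("resources/multitask"), eval_mini_batch_size=32)
print(f"average test score: {avg_score:.4f}")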
Example #2
    def final_test(self, base_path: Path, embeddings_in_memory: bool,
                   evaluation_metric: EvaluationMetric,
                   eval_mini_batch_size: int):

        log_line(log)
        log.info('Testing using best model ...')

        self.model.eval()

        if (base_path / 'best-model.pt').exists():
            self.model = TextRegressor.load_from_file(base_path /
                                                      'best-model.pt')

        test_metric, test_loss = self._evaluate_text_regressor(
            self.model,
            self.corpus.test,
            eval_mini_batch_size=eval_mini_batch_size,
            embeddings_in_memory=embeddings_in_memory)

        log.info(f'AVG: mse: {test_metric.mean_squared_error():.4f} - '
                 f'mae: {test_metric.mean_absolute_error():.4f} - '
                 f'pearson: {test_metric.pearsonr():.4f} - '
                 f'spearman: {test_metric.spearmanr():.4f}')

        log_line(log)

        return test_metric.mean_squared_error()
Example #3
    def train(self, max_epochs: int = 100, **kwargs):
        log.info(f"Prepare data loader")
        for trainer, base_path, lr in zip(self.trainer_list,
                                          self.base_path_list,
                                          self.learning_rate_list):
            trainer.max_epochs = max_epochs
            trainer.prepare_data(base_path, lr, **kwargs)

        if self.pretrain_list is not None:
            for i, epochs_pretrain in enumerate(self.pretrain_list):
                log_line(log)
                log.info(f"Pretrain Trainer {i} for {epochs_pretrain} epochs")

                for epoch in range(0, epochs_pretrain):
                    trainer = self.trainer_list[i]
                    trainer.cur_epoch = epoch
                    trainer.prepare_epoch()

                    for batch_no, batch in enumerate(trainer.batch_loader):
                        trainer.train_batch(batch_no, batch)
                    trainer.eval_after_epoch()

        log_line(log)
        log.info(f"Start multi-task training")
        for epoch in range(0, max_epochs):
            quit = False
            for trainer in self.trainer_list:
                trainer.cur_epoch = epoch
                trainer.prepare_epoch()

                if trainer.learning_rate < trainer.min_learning_rate:
                    quit = True
            if quit:
                log.info('Quitting')
                break

            num_batches = min([len(t.batch_loader) for t in self.trainer_list])
            iters = [iter(t.batch_loader) for t in self.trainer_list]
            batch_no = 0

            while batch_no < num_batches:
                for i, trainer in enumerate(self.trainer_list):
                    batch = next(iters[i])
                    trainer.train_batch(batch_no, batch)
                batch_no += 1

            for trainer in self.trainer_list:
                trainer.eval_after_epoch()

        for trainer in self.trainer_list:
            trainer.make_final_predictions()
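
As a rough sketch, the parallel attributes this multi-task train() loop relies on (names taken directly from the code above) could be wired up as follows; the wrapper object and the individual sub-trainers are hypothetical:

# Hypothetical wiring for the loop above: one sub-trainer, base path and
# learning rate per task, plus optional per-task pretraining epochs.
multi_trainer.trainer_list = [ner_trainer, upos_trainer]
multi_trainer.base_path_list = ["resources/ner", "resources/upos"]
multi_trainer.learning_rate_list = [0.1, 0.1]
multi_trainer.pretrain_list = [2, 0]   # pretrain the NER trainer for 2 epochs

# interleaves one batch per trainer until the shortest batch loader is exhausted
multi_trainer.train(max_epochs=50, mini_batch_size=32)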
Example #4
    def optimize(self, space: SearchSpace, max_evals=100):
        search_space = space.search_space
        best = fmin(
            self._objective, search_space, algo=tpe.suggest, max_evals=max_evals
        )

        log_line(log)
        log.info("Optimizing parameter configuration done.")
        log.info("Best parameter configuration found:")
        for k, v in best.items():
            log.info(f"\t{k}: {v}")
        log_line(log)

        with open(self.param_selection_file, "a") as f:
            f.write("best parameter combination\n")
            for k, v in best.items():
                if isinstance(v, Tuple):
                    v = ",".join([str(x) for x in v])
                f.write(f"\t{k}: {str(v)}\n")
Example #5
    def train(self, max_epochs: int = 100, **kwargs):
        self.model.set_output(self.model_mode)
        self.max_epochs = max_epochs
        self.prepare_data(**kwargs)
        for epoch in range(0 + self.epoch, max_epochs + self.epoch):
            self.cur_epoch = epoch
            self.prepare_epoch()

            if self.learning_rate < self.min_learning_rate:
                log_line(log)
                log.info("Quitting Training as one Model finished")
                log_line(log)
                break

            for batch_no, batch in enumerate(self.batch_loader):
                self.train_batch(batch_no, batch)

            self.eval_after_epoch()
        self.make_final_predictions()
Example #6
 def optimize(self, space: SearchSpace, max_evals=100):
     search_space = space.search_space
     best = fmin(self._objective,
                 search_space,
                 algo=tpe.suggest,
                 max_evals=max_evals)
     log_line(log)
     log.info('Optimizing parameter configuration done.')
     log.info('Best parameter configuration found:')
     for (k, v) in best.items():
         log.info(f'\t{k}: {v}')
     log_line(log)
     with open(self.param_selection_file, 'a') as f:
         f.write('best parameter combination\n')
         for (k, v) in best.items():
             if isinstance(v, Tuple):
                 v = ','.join([str(x) for x in v])
             f.write(f'\t{k}: {v}\n')
Example #7
 def final_test(self,
                base_path: Path,
                eval_mini_batch_size: int,
                num_workers: int = 8):
     log_line(log)
     log.info('Testing using best model ...')
     self.model.eval()
     if (base_path / 'best-model.pt').exists():
         self.model = self.model.load((base_path / 'best-model.pt'))
     (test_results, test_loss) = self.model.evaluate(
         DataLoader(self.corpus.test,
                    batch_size=eval_mini_batch_size,
                    num_workers=num_workers),
         out_path=(base_path / 'test.tsv'),
         embeddings_storage_mode='none')
     test_results: Result = test_results
     log.info(test_results.log_line)
     log.info(test_results.detailed_results)
     log_line(log)
     if (type(self.corpus) is MultiCorpus):
         for subcorpus in self.corpus.corpora:
             log_line(log)
             self.model.evaluate(
                 DataLoader(subcorpus.test,
                            batch_size=eval_mini_batch_size,
                            num_workers=num_workers),
                 out_path=base_path / f'{subcorpus.name}-test.tsv',
                 embeddings_storage_mode='none')
     final_score = test_results.main_score
     return final_score
Example #8
    def predict(
        self,
        sentences: Union[List[Sentence], Sentence],
        mini_batch_size: int = 32,
        num_workers: int = 8,
        print_tree: bool = False,
        embedding_storage_mode="none",
    ) -> None:
        """
        Predict arcs and relation tags for the dependency parsing task.
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: mini batch size to use
        :param num_workers: number of workers for the DataLoader
        :param print_tree: set to True to print the predicted dependency tree of each sentence
        :param embedding_storage_mode: default is 'none', which is usually best. Set to 'cpu' or 'gpu' only if
        you want to keep the generated embeddings in CPU or GPU memory after prediction.
        """
        if not isinstance(sentences, list):
            sentences = [sentences]
        sentence_dataset = FlairDatapointDataset(sentences)
        data_loader = DataLoader(sentence_dataset, batch_size=mini_batch_size, num_workers=num_workers)

        for batch in data_loader:
            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                arc_prediction, relation_prediction = self._obtain_labels_(score_arc, score_rel)

            for sentence_index, (sentence, sent_tags, sent_arcs) in enumerate(
                zip(batch, relation_prediction, arc_prediction)
            ):

                for token_index, (token, tag, head_id) in enumerate(zip(sentence.tokens, sent_tags, sent_arcs)):
                    token.add_tag(self.tag_type, tag, score_rel[sentence_index][token_index])

                    token.head_id = int(head_id)

                if print_tree:
                    tree_printer(sentence, self.tag_type)
                    log_line(log)
            store_embeddings(batch, storage_mode=embedding_storage_mode)
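
A short usage sketch for this predict() method, assuming a trained dependency parser exposing it and the tag-based token API the method itself uses (add_tag/get_tag); the `parser` variable is hypothetical:

from flair.data import Sentence

sentence = Sentence("I love Berlin .")

# `parser` is assumed to be a trained dependency parser with the predict() above
parser.predict(sentence, print_tree=True)

for token in sentence:
    # head_id and the relation tag were written onto each token by predict()
    print(token.text, token.head_id, token.get_tag(parser.tag_type).value)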
Example #9
    def final_test(self, base_path: Path, embeddings_in_memory: bool,
                   evaluation_metric: EvaluationMetric,
                   eval_mini_batch_size: int):

        log_line(log)
        log.info('Testing using best model ...')

        self.model.eval()

        if (base_path / 'best-model.pt').exists():
            if isinstance(self.model, TextClassifier):
                self.model = TextClassifier.load_from_file(base_path /
                                                           'best-model.pt')
            if isinstance(self.model, SequenceTagger):
                self.model = SequenceTagger.load_from_file(base_path /
                                                           'best-model.pt')

        test_metric, test_loss = self.evaluate(
            self.model,
            self.corpus.test,
            eval_mini_batch_size=eval_mini_batch_size,
            embeddings_in_memory=embeddings_in_memory)

        log.info(
            f'MICRO_AVG: acc {test_metric.micro_avg_accuracy()} - f1-score {test_metric.micro_avg_f_score()}'
        )
        log.info(
            f'MACRO_AVG: acc {test_metric.macro_avg_accuracy()} - f1-score {test_metric.macro_avg_f_score()}'
        )
        for class_name in test_metric.get_classes():
            log.info(
                f'{class_name:<10} tp: {test_metric.get_tp(class_name)} - fp: {test_metric.get_fp(class_name)} - '
                f'fn: {test_metric.get_fn(class_name)} - tn: {test_metric.get_tn(class_name)} - precision: '
                f'{test_metric.precision(class_name):.4f} - recall: {test_metric.recall(class_name):.4f} - '
                f'accuracy: {test_metric.accuracy(class_name):.4f} - f1-score: '
                f'{test_metric.f_score(class_name):.4f}')
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                self._calculate_evaluation_results_for(subcorpus.name,
                                                       subcorpus.test,
                                                       evaluation_metric,
                                                       embeddings_in_memory,
                                                       eval_mini_batch_size,
                                                       base_path / 'test.tsv')

        # get and return the final test score of best model
        if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
            final_score = test_metric.macro_avg_accuracy()
        elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
            final_score = test_metric.micro_avg_accuracy()
        elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
            final_score = test_metric.macro_avg_f_score()
        else:
            final_score = test_metric.micro_avg_f_score()

        return final_score
Example #10
    def final_test(
        self,
        base_path: Path,
        embeddings_in_memory: bool,
        evaluation_metric: EvaluationMetric,
        eval_mini_batch_size: int,
        num_workers: int = 8,
        eval_mode: EvalMode = EvalMode.Standard,
        misspell_mode: MisspellingMode = MisspellingMode.Random,
        misspelling_rate: float = 0.0,
        char_vocab: set = set(),
        cmx=None,
        lut={},
    ):

        log_line(log)
        log.info("Testing using best model ...")

        self.model.eval()

        if (base_path / "best-model.pt").exists():
            self.model = self.model.load(base_path / "best-model.pt")

        test_results, test_loss = self.model.evaluate(
            self.corpus.test,
            eval_mini_batch_size=eval_mini_batch_size,
            embeddings_in_memory=embeddings_in_memory,
            out_path=base_path / "test.tsv",
            num_workers=num_workers,
            eval_mode=eval_mode,
            misspell_mode=misspell_mode,
            misspelling_rate=misspelling_rate,
            char_vocab=char_vocab,
            cmx=cmx,
            lut=lut,
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                self.model.evaluate(
                    subcorpus.test,
                    eval_mini_batch_size,
                    embeddings_in_memory,
                    base_path / f"{subcorpus.name}-test.tsv",
                    eval_mode=eval_mode,
                    misspelling_rate=misspelling_rate,
                    char_vocab=char_vocab,
                )

        # get and return the final test score of best model
        final_score = test_results.main_score

        return final_score
Example #11
    def final_test(
            self,
            base_path: Union[Path, str],
            eval_mini_batch_size: int,
            main_evaluation_metric: Tuple[str, str],
            num_workers: int = 8,
            gold_label_dictionary_for_eval: Optional[Dictionary] = None
    ):
        if type(base_path) is str:
            base_path = Path(base_path)
        base_path.mkdir(exist_ok=True, parents=True)

        log_line(log)

        self.model.eval()

        if (base_path / "best-model.pt").exists():
            self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict())
        else:
            log.info("Testing using last state of model ...")

        test_results = self.model.evaluate(
            self.corpus.test,
            gold_label_type=self.model.label_type,
            mini_batch_size=eval_mini_batch_size,
            num_workers=num_workers,
            out_path=base_path / "test.tsv",
            embedding_storage_mode="none",
            main_evaluation_metric=main_evaluation_metric,
            gold_label_dictionary=gold_label_dictionary_for_eval,
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                if subcorpus.test:
                    subcorpus_results = self.model.evaluate(
                        subcorpus.test,
                        gold_label_type=self.model.label_type,
                        mini_batch_size=eval_mini_batch_size,
                        num_workers=num_workers,
                        out_path=base_path / f"{subcorpus.name}-test.tsv",
                        embedding_storage_mode="none",
                        main_evaluation_metric=main_evaluation_metric
                    )
                    log.info(subcorpus.name)
                    log.info(subcorpus_results.log_line)

        # get and return the final test score of best model
        final_score = test_results.main_score

        return final_score
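
A hedged call sketch for this variant; the (average-mode, metric-name) tuple follows Flair's usual convention, e.g. ("micro avg", "f1-score"), and `trainer` stands in for the ModelTrainer instance this method belongs to:

from pathlib import Path

score = trainer.final_test(
    base_path=Path("resources/taggers/example"),
    eval_mini_batch_size=32,
    main_evaluation_metric=("micro avg", "f1-score"),  # (average mode, metric name)
)
print(f"final test score: {score:.4f}")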
Example #12
    def final_test(
            self,
            base_path: Union[Path, str],
            eval_mini_batch_size: int,
            num_workers: int = 8,
            main_score_type: str = None,
    ):
        if type(base_path) is str:
            base_path = Path(base_path)

        log_line(log)

        self.model.eval()

        if (os.path.exists(self.get_best_model_path(base_path, check_model_existance=True))):
            log.info("Testing using best model ...")
            self.model = self.model.load(self.get_best_model_path(base_path, check_model_existance=True))
        else:
            log.info("Testing using last state of model ...")

        test_results, test_loss = self.model.evaluate(
            self.corpus.test,
            mini_batch_size=eval_mini_batch_size,
            num_workers=num_workers,
            out_path=base_path / "test.tsv",
            embedding_storage_mode="none",
            main_score_type=main_score_type
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                if subcorpus.test:
                    subcorpus_results, subcorpus_loss = self.model.evaluate(
                        subcorpus.test,
                        mini_batch_size=eval_mini_batch_size,
                        num_workers=num_workers,
                        out_path=base_path / f"{subcorpus.name}-test.tsv",
                        embedding_storage_mode="none",
                        main_score_type=main_score_type
                    )
                    log.info(subcorpus.name)
                    log.info(subcorpus_results.log_line)

        # get and return the final test score of best model
        final_score = test_results.main_score

        return final_score
Example #13
    def final_test(self,
                   base_path: Path,
                   eval_mini_batch_size: int,
                   num_workers: int = 8):

        log_line(log)
        log.info("Testing using best model ...")

        if (base_path / "best-model.pt").exists():
            cls = type(self.model)
            del self.model
            self.model = cls.load(base_path / "best-model.pt")

        self.model.eval()

        test_results, test_loss = self.model.evaluate(
            DataLoader(
                self.corpus.test,
                batch_size=eval_mini_batch_size,
                num_workers=num_workers,
            ),
            out_path=base_path / "test.tsv",
            embedding_storage_mode="none",
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                self.model.evaluate(
                    DataLoader(
                        subcorpus.test,
                        batch_size=eval_mini_batch_size,
                        num_workers=num_workers,
                    ),
                    out_path=base_path / f"{subcorpus.name}-test.tsv",
                    embedding_storage_mode="none",
                )

        # get and return the final test score of best model
        final_score = test_results.main_score

        return final_score
Example #14
    def final_test(self,
                   base_path: Union[Path, str],
                   eval_mini_batch_size: int,
                   num_workers: int = 8):
        if type(base_path) is str:
            base_path = Path(base_path)

        log_line(log)
        log.info("Testing using best model ...")

        self.model.eval()

        torch.jit.script(self.model)
        #if (base_path / "best-model.pt").exists():
        #    self.model = self.model.load(base_path / "best-model.pt")

        test_results, test_loss = self.model.evaluate(
            self.corpus.test,
            mini_batch_size=eval_mini_batch_size,
            num_workers=num_workers,
            out_path=base_path / "test.tsv",
            embedding_storage_mode="none",
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                self.model.evaluate(
                    subcorpus.test,
                    mini_batch_size=eval_mini_batch_size,
                    num_workers=num_workers,
                    out_path=base_path / f"{subcorpus.name}-test.tsv",
                    embedding_storage_mode="none",
                )

        # get and return the final test score of best model
        final_score = test_results.main_score

        print("FINAL RESULT", test_loss, final_score)
        return final_score
Example #15
    def final_test(
        self,
        base_path: Path,
        embeddings_in_memory: bool,
        evaluation_metric: EvaluationMetric,
        eval_mini_batch_size: int,
        num_workers: int = 8,
    ):

        log_line(log)
        log.info("Testing using best model ...")

        self.model.eval()

        if (base_path / "best-model.pt").exists():
            self.model = self.model.load(base_path / "best-model.pt")

        test_results, test_loss = self.model.evaluate(
            self.corpus.test,
            eval_mini_batch_size=eval_mini_batch_size,
            embeddings_in_memory=embeddings_in_memory,
            out_path=base_path / "test.tsv",
            num_workers=num_workers,
        )

        test_results: Result = test_results
        log.info(test_results.log_line)
        log.info(test_results.detailed_results)
        log_line(log)

        # if we are training over multiple datasets, do evaluation for each
        if type(self.corpus) is MultiCorpus:
            for subcorpus in self.corpus.corpora:
                log_line(log)
                self.model.evaluate(
                    subcorpus.test,
                    eval_mini_batch_size,
                    embeddings_in_memory,
                    base_path / f"{subcorpus.name}-test.tsv",
                )

        # get and return the final test score of best model
        final_score = test_results.main_score

        return final_score
Example #16
    def _setup(self, config):
        args = config["args"]
        config.pop("args", None)
        self.params = config
        self.corpus = args["corpus"]

        torch.manual_seed(args["seed"])
        if args["cuda"]:
            torch.cuda.manual_seed(args["seed"])

        model = args["_set_up_model"](self.params)

        log_line(log)
        log.info(f'Model: "{model}"')
        log_line(log)
        log.info(f'Corpus: "{self.corpus}"')
        log_line(log)
        log.info("Parameters:")
        log.info(f' - learning_rate: "{config["learning_rate"]}"')
        # log.info(f' - mini_batch_size: "{config["mini_batch_size"]}"')
        # log.info(f' - patience: "{config["patience"]}"')
        # log.info(f' - anneal_factor: "{config["anneal_factor"]}"')

        # Check this once more
        weight_extractor = None  # WeightExtractor()

        training_params = {
            key: config[key]
            for key in config if key in TRAINING_PARAMETERS
        }

        model_trainer_parameters = {
            key: config[key]
            for key in config if key in MODEL_TRAINER_PARAMETERS
        }

        # This should be enough for initializing all the parameters for the trainer
        self.trainer = ModelTrainer(model, self.corpus,
                                    **model_trainer_parameters)
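
For orientation, the config dict this _setup() expects could look roughly like this, based only on the keys read above; the trainable instance and the model factory callable are hypothetical:

config = {
    "args": {
        "corpus": corpus,              # a flair Corpus
        "seed": 42,
        "cuda": True,
        "_set_up_model": build_model,  # callable: params dict -> flair model
    },
    # everything else is treated as a tunable parameter
    "learning_rate": 0.1,
    "mini_batch_size": 32,
}
trainable._setup(config)   # hypothetical tuning trainable using the method above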
Example #17
    def prepare_epoch(self):
        log_line(log)

        for group in self.optimizer.param_groups:
            self.learning_rate = group["lr"]

        # reload last best model if annealing with restarts is enabled
        if (self.learning_rate != self.previous_learning_rate
                and self.anneal_with_restarts
                and (base_path / "best-model.pt").exists()):
            log.info("resetting to best model")
            self.model.load_state_dict(self.model.load(self.base_path / "best-model.pt").state_dict())

        self.previous_learning_rate = self.learning_rate

        # stop training if learning rate becomes too small
        if self.learning_rate < self.min_learning_rate:
            log_line(log)
            log.info("learning rate too small - quitting training!")
            log_line(log)
            return

        self.batch_loader = DataLoader(
            self.train_data,
            batch_size=self.mini_batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
        )

        self.model.train()
        self.model.set_output(self.model_mode)

        self.train_loss: float = 0
        self.seen_batches = 0
        self.total_number_of_batches = len(self.batch_loader)

        self.modulo = max(1, int(self.total_number_of_batches / 10))
        self.batch_time = 0
Example #18
from __future__ import absolute_import
Example #19
    def train(
        self,
        base_path: Union[Path, str],
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        eval_mini_batch_size: int = None,
        max_epochs: int = 100,
        anneal_factor: float = 0.5,
        patience: int = 3,
        min_learning_rate: float = 0.0001,
        train_with_dev: bool = False,
        monitor_train: bool = False,
        monitor_test: bool = False,
        embeddings_storage_mode: str = "cpu",
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
        shuffle: bool = True,
        param_selection_mode: bool = False,
        num_workers: int = 6,
        sampler=None,
        **kwargs,
    ) -> dict:
        """
        Trains any class that implements the flair.nn.Model interface.
        :param base_path: Main path to which all output during training is logged and models are saved
        :param learning_rate: Initial learning rate
        :param mini_batch_size: Size of mini-batches during training
        :param eval_mini_batch_size: Size of mini-batches during evaluation
        :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
        :param anneal_factor: The factor by which the learning rate is annealed
        :param patience: Patience is the number of epochs with no improvement the Trainer waits
         until annealing the learning rate
        :param min_learning_rate: If the learning rate falls below this threshold, training terminates
        :param train_with_dev: If True, training is performed using both train+dev data
        :param monitor_train: If True, training data is evaluated at end of each epoch
        :param monitor_test: If True, test data is evaluated at end of each epoch
        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
        :param checkpoint: If True, a full checkpoint is saved at end of each epoch
        :param save_final_model: If True, final model is saved
        :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate
        :param shuffle: If True, data is shuffled during training
        :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing
        parameter selection.
        :param num_workers: Number of workers in your data loader.
        :param sampler: You can pass a data sampler here for special sampling of data.
        :param kwargs: Other arguments for the Optimizer
        :return: a dict with the final test score and the per-epoch dev score, dev loss and train loss histories
        """

        if self.use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

                writer = SummaryWriter()
            except:
                log_line(log)
                log.warning(
                    "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!"
                )
                log_line(log)
                self.use_tensorboard = False
                pass

        if eval_mini_batch_size is None:
            eval_mini_batch_size = mini_batch_size

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        log_handler = add_file_handler(log, base_path / "training.log")

        log_line(log)
        log.info(f'Model: "{self.model}"')
        log_line(log)
        log.info(f'Corpus: "{self.corpus}"')
        log_line(log)
        log.info("Parameters:")
        log.info(f' - learning_rate: "{learning_rate}"')
        log.info(f' - mini_batch_size: "{mini_batch_size}"')
        log.info(f' - patience: "{patience}"')
        log.info(f' - anneal_factor: "{anneal_factor}"')
        log.info(f' - max_epochs: "{max_epochs}"')
        log.info(f' - shuffle: "{shuffle}"')
        log.info(f' - train_with_dev: "{train_with_dev}"')
        log_line(log)
        log.info(f'Model training base path: "{base_path}"')
        log_line(log)
        log.info(f"Device: {flair.device}")
        log_line(log)
        log.info(f"Embeddings storage mode: {embeddings_storage_mode}")

        # determine what splits (train, dev, test) to evaluate and log
        log_train = True if monitor_train else False
        log_test = (True if (not param_selection_mode and self.corpus.test
                             and monitor_test) else False)
        log_dev = True if not train_with_dev else False

        # prepare loss logging file and set up header
        loss_txt = init_output_file(base_path, "loss.tsv")

        weight_extractor = WeightExtractor(base_path)

        optimizer: torch.optim.Optimizer = self.optimizer(
            self.model.parameters(), lr=learning_rate, **kwargs)
        if self.optimizer_state is not None:
            optimizer.load_state_dict(self.optimizer_state)

        # minimize training loss if training with dev data, else maximize dev score
        anneal_mode = "min" if train_with_dev else "max"

        scheduler: ReduceLROnPlateau = ReduceLROnPlateau(
            optimizer,
            factor=anneal_factor,
            patience=patience,
            mode=anneal_mode,
            verbose=True,
        )

        if self.scheduler_state is not None:
            scheduler.load_state_dict(self.scheduler_state)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data = ConcatDataset([self.corpus.train, self.corpus.dev])

        if sampler is not None:
            sampler = sampler(train_data)
            shuffle = False

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(0 + self.epoch, max_epochs + self.epoch):
                log_line(log)

                # get new learning rate
                for group in optimizer.param_groups:
                    learning_rate = group["lr"]

                # reload last best model if annealing with restarts is enabled
                if (learning_rate != previous_learning_rate
                        and anneal_with_restarts
                        and (base_path / "best-model.pt").exists()):
                    log.info("resetting to best model")
                    self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict())

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < min_learning_rate:
                    log_line(log)
                    log.info("learning rate too small - quitting training!")
                    log_line(log)
                    break

                batch_loader = DataLoader(
                    train_data,
                    batch_size=mini_batch_size,
                    shuffle=shuffle,
                    num_workers=num_workers,
                    sampler=sampler,
                )

                self.model.train()

                train_loss: float = 0

                seen_batches = 0
                total_number_of_batches = len(batch_loader)

                modulo = max(1, int(total_number_of_batches / 10))

                # process mini-batches
                for batch_no, batch in enumerate(batch_loader):

                    loss = self.model.forward_loss(batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_batches += 1
                    train_loss += loss.item()

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(batch, embeddings_storage_mode)

                    if batch_no % modulo == 0:
                        log.info(
                            f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss "
                            f"{train_loss / seen_batches:.8f}")
                        iteration = epoch * total_number_of_batches + batch_no
                        if not param_selection_mode:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= seen_batches

                self.model.eval()

                log_line(log)
                log.info(
                    f"EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f}"
                )

                if self.use_tensorboard:
                    writer.add_scalar("train_loss", train_loss, epoch + 1)

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss

                # evaluate on train / dev / test split depending on training settings
                result_line: str = ""

                if log_train:
                    train_eval_result, train_loss = self.model.evaluate(
                        DataLoader(
                            self.corpus.train,
                            batch_size=eval_mini_batch_size,
                            num_workers=num_workers,
                        ),
                        embeddings_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{train_eval_result.log_line}"

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.train,
                                     embeddings_storage_mode)

                if log_dev:
                    dev_eval_result, dev_loss = self.model.evaluate(
                        DataLoader(
                            self.corpus.dev,
                            batch_size=eval_mini_batch_size,
                            num_workers=num_workers,
                        ),
                        embeddings_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}"
                    log.info(
                        f"DEV : loss {dev_loss} - score {dev_eval_result.main_score}"
                    )
                    # calculate scores using dev data if available
                    # append dev score to score history
                    dev_score_history.append(dev_eval_result.main_score)
                    dev_loss_history.append(dev_loss)

                    current_score = dev_eval_result.main_score

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.dev, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("dev_loss", dev_loss, epoch + 1)
                        writer.add_scalar("dev_score",
                                          dev_eval_result.main_score,
                                          epoch + 1)

                if log_test:
                    test_eval_result, test_loss = self.model.evaluate(
                        DataLoader(
                            self.corpus.test,
                            batch_size=eval_mini_batch_size,
                            num_workers=num_workers,
                        ),
                        base_path / "test.tsv",
                        embeddings_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{test_loss}\t{test_eval_result.log_line}"
                    log.info(
                        f"TEST : loss {test_loss} - score {test_eval_result.main_score}"
                    )

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.test, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("test_loss", test_loss, epoch + 1)
                        writer.add_scalar("test_score",
                                          test_eval_result.main_score,
                                          epoch + 1)

                # determine learning rate annealing through scheduler
                scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # determine bad epoch number
                try:
                    bad_epochs = scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    new_learning_rate = group["lr"]
                if new_learning_rate != previous_learning_rate:
                    bad_epochs = patience + 1

                # log bad epochs
                log.info(f"BAD EPOCHS (no improvement): {bad_epochs}")

                # output log file
                with open(loss_txt, "a") as f:

                    # make headers on first epoch
                    if epoch == 0:
                        f.write(
                            f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS"
                        )

                        if log_train:
                            f.write("\tTRAIN_" + "\tTRAIN_".join(
                                train_eval_result.log_header.split("\t")))
                        if log_dev:
                            f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(
                                dev_eval_result.log_header.split("\t")))
                        if log_test:
                            f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(
                                test_eval_result.log_header.split("\t")))

                    f.write(
                        f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}"
                    )
                    f.write(result_line)

                # if checkpointing is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.model.save_checkpoint(
                        base_path / "checkpoint.pt",
                        optimizer.state_dict(),
                        scheduler.state_dict(),
                        epoch + 1,
                        train_loss,
                    )

                # if we use dev data, remember best model based on dev evaluation score
                if (not train_with_dev and not param_selection_mode
                        and current_score == scheduler.best):
                    self.model.save(base_path / "best-model.pt")

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / "final-model.pt")

        except KeyboardInterrupt:
            log_line(log)
            log.info("Exiting from training early.")

            if self.use_tensorboard:
                writer.close()

            if not param_selection_mode:
                log.info("Saving model ...")
                self.model.save(base_path / "final-model.pt")
                log.info("Done.")

        # test best model if test data is present
        if self.corpus.test:
            final_score = self.final_test(base_path, eval_mini_batch_size,
                                          num_workers)
        else:
            final_score = 0
            log.info("Test data not provided setting final score to 0")

        log.removeHandler(log_handler)

        if self.use_tensorboard:
            writer.close()

        return {
            "test_score": final_score,
            "dev_score_history": dev_score_history,
            "train_loss_history": train_loss_history,
            "dev_loss_history": dev_loss_history,
        }
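
An end-to-end sketch of driving this train() method with standard Flair components; dataset and embedding choices are placeholders, and the parameter names follow the signature above (they can differ across Flair versions):

from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = UD_ENGLISH()
tag_type = "upos"
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

tagger = SequenceTagger(hidden_size=256,
                        embeddings=WordEmbeddings("glove"),
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type)

trainer = ModelTrainer(tagger, corpus)
result = trainer.train("resources/taggers/upos",
                       learning_rate=0.1,
                       mini_batch_size=32,
                       max_epochs=10,
                       embeddings_storage_mode="cpu")

print(result["test_score"])   # train() returns the score/history dict shown above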
Example #20
    def find_learning_rate(self,
                           base_path: Union[Path, str],
                           file_name: str = 'learning_rate.tsv',
                           start_learning_rate: float = 1e-7,
                           end_learning_rate: float = 10,
                           iterations: int = 100,
                           mini_batch_size: int = 32,
                           stop_early: bool = True,
                           smoothing_factor: float = 0.98,
                           **kwargs) -> Path:
        best_loss = None
        moving_avg_loss = 0

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)
        learning_rate_tsv = init_output_file(base_path, file_name)

        with open(learning_rate_tsv, 'a') as f:
            f.write('ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n')

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=start_learning_rate,
                                   **kwargs)

        train_data = self.corpus.train
        random.shuffle(train_data)
        batches = [
            train_data[x:x + mini_batch_size]
            for x in range(0, len(train_data), mini_batch_size)
        ][:iterations]

        scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

        model_state = self.model.state_dict()
        model_device = next(self.model.parameters()).device
        self.model.train()

        for itr, batch in enumerate(batches):
            loss = self.model.forward_loss(batch)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            learning_rate = scheduler.get_lr()[0]

            loss_item = loss.item()
            if itr == 0:
                best_loss = loss_item
            else:
                if smoothing_factor > 0:
                    moving_avg_loss = smoothing_factor * moving_avg_loss + (
                        1 - smoothing_factor) * loss_item
                    loss_item = moving_avg_loss / (1 -
                                                   smoothing_factor**(itr + 1))
                if loss_item < best_loss:
                    best_loss = loss_item

            if stop_early and (loss_item > 4 * best_loss or torch.isnan(loss)):
                log_line(log)
                log.info('loss diverged - stopping early!')
                break

            with open(learning_rate_tsv, 'a') as f:
                f.write(
                    f'{itr}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate}\t{loss_item}\n'
                )

        self.model.load_state_dict(model_state)
        self.model.to(model_device)

        log_line(log)
        log.info(f'learning rate finder finished - plot {learning_rate_tsv}')
        log_line(log)

        return Path(learning_rate_tsv)
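
A usage sketch for the learning-rate finder above, assuming the ModelTrainer instance it belongs to and Flair's Plotter for the resulting TSV (as used in the older Flair tutorials):

from pathlib import Path
from flair.visual.training_curves import Plotter

# `trainer` is assumed to be the ModelTrainer that owns find_learning_rate()
lr_tsv = trainer.find_learning_rate(Path("resources/lr_finder"),
                                    start_learning_rate=1e-7,
                                    end_learning_rate=10,
                                    iterations=200)

Plotter().plot_learning_rate(lr_tsv)   # saves a loss-vs-learning-rate plot next to the TSV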
Example #21
    def train(
        self,
        base_path: Union[Path, str],
        evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        eval_mini_batch_size: int = None,
        max_epochs: int = 100,
        anneal_factor: float = 0.5,
        patience: int = 3,
        train_with_dev: bool = False,
        monitor_train: bool = False,
        embeddings_in_memory: bool = True,
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
        shuffle: bool = True,
        param_selection_mode: bool = False,
        num_workers: int = 8,
        **kwargs,
    ) -> dict:

        if eval_mini_batch_size is None:
            eval_mini_batch_size = mini_batch_size

        log.info(f'Model training base path: "{base_path}"')

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        add_file_handler(log, base_path / "training.log")

        log_line(log)
        log.info(f"Evaluation method: {evaluation_metric.name}")

        # determine what splits (train, dev, test) to evaluate and log
        log_train = True if monitor_train else False
        log_test = True if (not param_selection_mode
                            and self.corpus.test) else False
        log_dev = True if not train_with_dev else False

        loss_txt = init_output_file(base_path, "loss.tsv")
        with open(loss_txt, "a") as f:
            f.write(f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS")

            dummy_result, _ = self.model.evaluate(
                [Sentence("d", labels=["0.1"])],
                eval_mini_batch_size,
                embeddings_in_memory,
            )
            if log_train:
                f.write("\tTRAIN_" +
                        "\tTRAIN_".join(dummy_result.log_header.split("\t")))
            if log_dev:
                f.write("\tDEV_LOSS\tDEV_" +
                        "\tDEV_".join(dummy_result.log_header.split("\t")))
            if log_test:
                f.write("\tTEST_LOSS\tTEST_" +
                        "\tTEST_".join(dummy_result.log_header.split("\t")))

            weight_extractor = WeightExtractor(base_path)

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=learning_rate,
                                   **kwargs)
        if self.optimizer_state is not None:
            optimizer.load_state_dict(self.optimizer_state)

        # minimize training loss if training with dev data, else maximize dev score
        anneal_mode = "min" if train_with_dev else "max"

        if isinstance(optimizer, (AdamW, SGDW)):
            scheduler = ReduceLRWDOnPlateau(
                optimizer,
                factor=anneal_factor,
                patience=patience,
                mode=anneal_mode,
                verbose=True,
            )
        else:
            scheduler = ReduceLROnPlateau(
                optimizer,
                factor=anneal_factor,
                patience=patience,
                mode=anneal_mode,
                verbose=True,
            )
        if self.scheduler_state is not None:
            scheduler.load_state_dict(self.scheduler_state)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data = ConcatDataset([self.corpus.train, self.corpus.dev])

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(0 + self.epoch, max_epochs + self.epoch):
                log_line(log)
                try:
                    bad_epochs = scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    learning_rate = group["lr"]

                # reload last best model if annealing with restarts is enabled
                if (learning_rate != previous_learning_rate
                        and anneal_with_restarts
                        and (base_path / "best-model.pt").exists()):
                    log.info("resetting to best model")
                    self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict())

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.0001:
                    log_line(log)
                    log.info("learning rate too small - quitting training!")
                    log_line(log)
                    break

                batch_loader = DataLoader(
                    train_data,
                    batch_size=mini_batch_size,
                    shuffle=shuffle,
                    num_workers=num_workers,
                )

                self.model.train()

                train_loss: float = 0
                seen_batches = 0
                total_number_of_batches = len(batch_loader)

                modulo = max(1, int(total_number_of_batches / 10))

                for batch_no, batch in enumerate(batch_loader):

                    loss = self.model.forward_loss(batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_batches += 1
                    train_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            f"epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches} - loss "
                            f"{train_loss / seen_batches:.8f}")
                        iteration = epoch * total_number_of_batches + batch_no
                        if not param_selection_mode:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= seen_batches

                self.model.eval()

                log_line(log)
                log.info(
                    f"EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}"
                )

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss

                with open(loss_txt, "a") as f:

                    f.write(
                        f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}"
                    )

                    if log_train:
                        train_eval_result, train_loss = self.model.evaluate(
                            self.corpus.train,
                            eval_mini_batch_size,
                            embeddings_in_memory,
                            num_workers=num_workers,
                        )
                        f.write(f"\t{train_eval_result.log_line}")

                    if log_dev:
                        dev_eval_result, dev_loss = self.model.evaluate(
                            self.corpus.dev,
                            eval_mini_batch_size,
                            embeddings_in_memory,
                            num_workers=num_workers,
                        )
                        f.write(f"\t{dev_loss}\t{dev_eval_result.log_line}")
                        log.info(
                            f"DEV : loss {dev_loss} - score {dev_eval_result.main_score}"
                        )
                        # calculate scores using dev data if available
                        # append dev score to score history
                        dev_score_history.append(dev_eval_result.main_score)
                        dev_loss_history.append(dev_loss)

                        current_score = dev_eval_result.main_score

                    if log_test:
                        test_eval_result, test_loss = self.model.evaluate(
                            self.corpus.test,
                            eval_mini_batch_size,
                            embeddings_in_memory,
                            base_path / "test.tsv",
                            num_workers=num_workers,
                        )
                        f.write(f"\t{test_loss}\t{test_eval_result.log_line}")
                        log.info(
                            f"TEST : loss {test_loss} - score {test_eval_result.main_score}"
                        )

                scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # if checkpointing is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.model.save_checkpoint(
                        base_path / "checkpoint.pt",
                        optimizer.state_dict(),
                        scheduler.state_dict(),
                        epoch + 1,
                        train_loss,
                    )

                # if we use dev data, remember best model based on dev evaluation score
                if (not train_with_dev and not param_selection_mode
                        and current_score == scheduler.best):
                    self.model.save(base_path / "best-model.pt")

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / "final-model.pt")

        except KeyboardInterrupt:
            log_line(log)
            log.info("Exiting from training early.")
            if not param_selection_mode:
                log.info("Saving model ...")
                self.model.save(base_path / "final-model.pt")
                log.info("Done.")

        # test best model if test data is present
        if self.corpus.test:
            final_score = self.final_test(
                base_path,
                embeddings_in_memory,
                evaluation_metric,
                eval_mini_batch_size,
                num_workers,
            )
        else:
            final_score = 0
            log.info("Test data not provided setting final score to 0")

        return {
            "test_score": final_score,
            "dev_score_history": dev_score_history,
            "train_loss_history": train_loss_history,
            "dev_loss_history": dev_loss_history,
        }
Example #22
    def train(self,
              base_path: Union[Path, str],
              evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              eval_mini_batch_size: int = None,
              max_epochs: int = 100,
              anneal_factor: float = 0.5,
              patience: int = 3,
              anneal_against_train_loss: bool = True,
              train_with_dev: bool = False,
              monitor_train: bool = False,
              embeddings_in_memory: bool = True,
              checkpoint: bool = False,
              save_final_model: bool = True,
              anneal_with_restarts: bool = False,
              test_mode: bool = False,
              param_selection_mode: bool = False,
              **kwargs) -> dict:

        if eval_mini_batch_size is None:
            eval_mini_batch_size = mini_batch_size

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        add_file_handler(log, base_path / 'training.log')

        log_line(log)
        log.info(f'Evaluation method: {evaluation_metric.name}')

        if not param_selection_mode:
            loss_txt = init_output_file(base_path, 'loss.tsv')
            with open(loss_txt, 'a') as f:
                f.write(
                    f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}\tDEV_LOSS\t{Metric.tsv_header("DEV")}'
                    f'\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n')

            weight_extractor = WeightExtractor(base_path)

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=learning_rate,
                                   **kwargs)
        if self.optimizer_state is not None:
            optimizer.load_state_dict(self.optimizer_state)

        # annealing scheduler
        anneal_mode = 'min' if anneal_against_train_loss else 'max'
        if isinstance(optimizer, (AdamW, SGDW)):
            scheduler = ReduceLRWDOnPlateau(optimizer,
                                            factor=anneal_factor,
                                            patience=patience,
                                            mode=anneal_mode,
                                            verbose=True)
        else:
            scheduler = ReduceLROnPlateau(optimizer,
                                          factor=anneal_factor,
                                          patience=patience,
                                          mode=anneal_mode,
                                          verbose=True)
        if self.scheduler_state is not None:
            scheduler.load_state_dict(self.scheduler_state)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(0 + self.epoch, max_epochs + self.epoch):
                log_line(log)

                try:
                    bad_epochs = scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts and \
                        (base_path / 'best-model.pt').exists():
                    log.info('resetting to best model')
                    self.model.load_from_file(base_path / 'best-model.pt')

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.0001:
                    log_line(log)
                    log.info('learning rate too small - quitting training!')
                    log_line(log)
                    break

                if not test_mode:
                    random.shuffle(train_data)

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                self.model.train()

                train_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    loss = self.model.forward_loss(batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_sentences += len(batch)
                    train_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss '
                            f'{train_loss / seen_sentences:.8f}')
                        iteration = epoch * len(batches) + batch_no
                        if not param_selection_mode:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= len(train_data)

                self.model.eval()

                log_line(log)
                log.info(
                    f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}'
                )

                dev_metric = None
                dev_loss = '_'

                train_metric = None
                if monitor_train:
                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not train_with_dev:
                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not param_selection_mode:
                    test_metric, test_loss = self._calculate_evaluation_results_for(
                        'TEST', self.corpus.test, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size,
                        base_path / 'test.tsv')

                if not param_selection_mode:
                    with open(loss_txt, 'a') as f:
                        train_metric_str = (train_metric.to_tsv() if train_metric is not None
                                            else Metric.to_empty_tsv())
                        dev_metric_str = (dev_metric.to_tsv() if dev_metric is not None
                                          else Metric.to_empty_tsv())
                        test_metric_str = (test_metric.to_tsv() if test_metric is not None
                                           else Metric.to_empty_tsv())
                        f.write(
                            f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t'
                            f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n'
                        )

                # calculate scores using dev data if available
                dev_score = 0.
                if not train_with_dev:
                    if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                        dev_score = dev_metric.macro_avg_accuracy()
                    elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                        dev_score = dev_metric.micro_avg_accuracy()
                    elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                        dev_score = dev_metric.macro_avg_f_score()
                    else:
                        dev_score = dev_metric.micro_avg_f_score()

                    # append dev score to score history
                    dev_score_history.append(dev_score)
                    dev_loss_history.append(dev_loss.item())

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss if anneal_against_train_loss else dev_score

                scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # if checkpoint is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.model.save_checkpoint(base_path / 'checkpoint.pt',
                                               optimizer.state_dict(),
                                               scheduler.state_dict(),
                                               epoch + 1, train_loss)

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev and not param_selection_mode and current_score == scheduler.best:
                    self.model.save(base_path / 'best-model.pt')

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / 'final-model.pt')

        except KeyboardInterrupt:
            log_line(log)
            log.info('Exiting from training early.')
            if not param_selection_mode:
                log.info('Saving model ...')
                self.model.save(base_path / 'final-model.pt')
                log.info('Done.')

        # test best model on test data
        final_score = self.final_test(base_path, embeddings_in_memory,
                                      evaluation_metric, eval_mini_batch_size)

        return {
            'test_score': final_score,
            'dev_score_history': dev_score_history,
            'train_loss_history': train_loss_history,
            'dev_loss_history': dev_loss_history
        }
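
A hedged sketch of calling this older-style train() variant, which anneals against either the training loss or the dev score. The model/corpus pair is a placeholder, and the import paths (ModelTrainer, EvaluationMetric) are assumptions that may differ between library versions.

# Hypothetical usage sketch - classifier and corpus are placeholders for any
# flair model/corpus pair; import locations are assumptions.
from pathlib import Path

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

trainer = ModelTrainer(classifier, corpus)

trainer.train(
    base_path=Path("resources/classifiers/example"),
    evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=50,
    anneal_against_train_loss=False,   # anneal against the dev score instead of the train loss
    embeddings_in_memory=True,
    checkpoint=True,                   # write checkpoint.pt at the end of every epoch
)
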
Example No. 23
 def find_learning_rate(self,
                        base_path: Union[Path, str],
                        file_name: str = 'learning_rate.tsv',
                        start_learning_rate: float = 1e-07,
                        end_learning_rate: float = 10,
                        iterations: int = 100,
                        mini_batch_size: int = 32,
                        stop_early: bool = True,
                        smoothing_factor: float = 0.98,
                        **kwargs) -> Path:
     best_loss = None
     moving_avg_loss = 0
     if (type(base_path) is str):
         base_path = Path(base_path)
     learning_rate_tsv = init_output_file(base_path, file_name)
     with open(learning_rate_tsv, 'a') as f:
         f.write('ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n')
     optimizer = self.optimizer(self.model.parameters(),
                                lr=start_learning_rate,
                                **kwargs)
     train_data = self.corpus.train
     batch_loader = DataLoader(train_data,
                               batch_size=mini_batch_size,
                               shuffle=True)
     scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)
     model_state = self.model.state_dict()
     model_device = next(self.model.parameters()).device
     self.model.train()
     for (itr, batch) in enumerate(batch_loader):
         loss = self.model.forward_loss(batch)
         optimizer.zero_grad()
         loss.backward()
         torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
         optimizer.step()
         scheduler.step(1)
         learning_rate = scheduler.get_lr()[0]
         loss_item = loss.item()
         if (itr == 0):
             best_loss = loss_item
         else:
             if (smoothing_factor > 0):
                 moving_avg_loss = ((smoothing_factor * moving_avg_loss) +
                                    ((1 - smoothing_factor) * loss_item))
                 loss_item = (moving_avg_loss /
                              (1 - (smoothing_factor**(itr + 1))))
             if (loss_item < best_loss):
                 best_loss = loss_item
         if (stop_early and ((loss_item >
                              (4 * best_loss)) or torch.isnan(loss))):
             log_line(log)
             log.info('loss diverged - stopping early!')
             break
         if (itr > iterations):
             break
         with open(str(learning_rate_tsv), 'a') as f:
             f.write(f'{itr}\t{datetime.datetime.now():%H:%M:%S}\t'
                     f'{learning_rate}\t{loss_item}\n')
     self.model.load_state_dict(model_state)
     self.model.to(model_device)
     log_line(log)
     log.info(f'learning rate finder finished - plot {learning_rate_tsv}')
     log_line(log)
     return Path(learning_rate_tsv)
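
A hedged sketch of a typical run against this find_learning_rate(); the trainer setup and the Plotter import path are assumptions rather than something confirmed by the snippet.

# Hypothetical usage sketch - trainer is assumed to be a ModelTrainer built as in
# the earlier sketches; the Plotter location may differ between versions.
from pathlib import Path

from flair.visual.training_curves import Plotter

lr_tsv = trainer.find_learning_rate(
    base_path=Path("resources/lr-finder"),
    start_learning_rate=1e-7,
    end_learning_rate=10,
    iterations=100,
    mini_batch_size=32,
)

# plot loss vs. learning rate and pick a value just before the loss diverges
Plotter().plot_learning_rate(lr_tsv)
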
Example No. 24
    def find_learning_rate(
            self,
            base_path: Union[Path, str],
            optimizer,
            mini_batch_size: int = 32,
            start_learning_rate: float = 1e-7,
            end_learning_rate: float = 10,
            iterations: int = 1000,
            stop_early: bool = True,
            file_name: str = "learning_rate.tsv",
            **kwargs,
    ) -> Path:
        best_loss = None

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)
        base_path.mkdir(exist_ok=True, parents=True)
        learning_rate_tsv = init_output_file(base_path, file_name)

        with open(learning_rate_tsv, "a") as f:
            f.write("ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n")

        optimizer = optimizer(self.model.parameters(), lr=start_learning_rate, **kwargs)

        train_data = self.corpus.train

        scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

        model_state = self.model.state_dict()
        self.model.train()

        step = 0

        loss_list = []
        average_loss_list = []

        while step < iterations:

            batch_loader = DataLoader(train_data, batch_size=mini_batch_size, shuffle=True)

            for batch in batch_loader:
                step += 1

                # forward pass
                loss = self.model.forward_loss(batch)
                if isinstance(loss, tuple):
                    loss = loss[0]

                # update optimizer and scheduler
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()
                scheduler.step()

                learning_rate = scheduler.get_lr()[0]

                # append current loss to list of losses for all iterations
                loss_list.append(loss.item())

                # compute averaged loss
                import statistics
                moving_avg_loss = statistics.mean(loss_list)
                average_loss_list.append(moving_avg_loss)

                if len(average_loss_list) > 10:
                    drop = average_loss_list[-10] - moving_avg_loss
                else:
                    drop = 0.

                if not best_loss or moving_avg_loss < best_loss:
                    best_loss = moving_avg_loss

                if step > iterations:
                    break

                if stop_early and (moving_avg_loss > 4 * best_loss or torch.isnan(loss)):
                    log_line(log)
                    log.info("loss diverged - stopping early!")
                    step = iterations
                    break

                with open(str(learning_rate_tsv), "a") as f:
                    f.write(f"{step}\t{learning_rate}\t{loss.item()}\t{moving_avg_loss}\t{drop}\n")

            self.model.load_state_dict(model_state)
            self.model.to(flair.device)

        log_line(log)
        log.info(f"learning rate finder finished - plot {learning_rate_tsv}")
        log_line(log)

        return Path(learning_rate_tsv)
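
One plausible way to consume the TSV this variant writes (iteration, learning rate, raw loss, smoothed loss and ten-step loss drop); the pandas post-processing and the SGD optimizer choice are assumptions, not part of the snippet.

# Hypothetical post-processing sketch - trainer is assumed to be a ModelTrainer;
# column names follow the TSV header written above.
import pandas as pd
from torch.optim import SGD

lr_tsv = trainer.find_learning_rate("resources/lr-finder", optimizer=SGD, iterations=200)

curve = pd.read_csv(lr_tsv, sep="\t")

# a common heuristic: take the learning rate where the smoothed loss drops fastest,
# then back off by an order of magnitude as a safety margin
best_row = curve.loc[curve["LOSS_DROP"].idxmax()]
suggested_lr = float(best_row["LEARNING_RATE"]) / 10
print(f"suggested learning rate: {suggested_lr:.2e}")
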
Example No. 25
    def train(
        self,
        base_path: Union[Path, str],
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        mini_batch_chunk_size: int = None,
        max_epochs: int = 100,
        scheduler=AnnealOnPlateau,
        cycle_momentum: bool = False,
        anneal_factor: float = 0.5,
        patience: int = 3,
        initial_extra_patience=0,
        min_learning_rate: float = 0.0001,
        train_with_dev: bool = False,
        train_with_test: bool = False,
        monitor_train: bool = False,
        monitor_test: bool = False,
        embeddings_storage_mode: str = "cpu",
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
        anneal_with_prestarts: bool = False,
        batch_growth_annealing: bool = False,
        shuffle: bool = True,
        param_selection_mode: bool = False,
        write_weights: bool = False,
        num_workers: int = 6,
        sampler=None,
        use_amp: bool = False,
        amp_opt_level: str = "O1",
        eval_on_train_fraction=0.0,
        eval_on_train_shuffle=False,
        save_model_at_each_epoch=False,
        **kwargs,
    ) -> dict:
        """
        Trains any class that implements the flair.nn.Model interface.
        :param base_path: Main path to which all output during training is logged and models are saved
        :param learning_rate: Initial learning rate (or max, if scheduler is OneCycleLR)
        :param mini_batch_size: Size of mini-batches during training
        :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes
        :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
        :param scheduler: The learning rate scheduler to use
        :param cycle_momentum: If scheduler is OneCycleLR, whether the scheduler should cycle also the momentum
        :param anneal_factor: The factor by which the learning rate is annealed
        :param patience: Patience is the number of epochs with no improvement the Trainer waits
         until annealing the learning rate
        :param min_learning_rate: If the learning rate falls below this threshold, training terminates
        :param train_with_dev: If True, training is performed using both train+dev data
        :param monitor_train: If True, training data is evaluated at end of each epoch
        :param monitor_test: If True, test data is evaluated at end of each epoch
        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
        :param checkpoint: If True, a full checkpoint is saved at end of each epoch
        :param save_final_model: If True, final model is saved
        :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate
        :param shuffle: If True, data is shuffled during training
        :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing
        parameter selection.
        :param num_workers: Number of workers in your data loader.
        :param sampler: You can pass a data sampler here for special sampling of data.
        :param eval_on_train_fraction: the fraction of train data to do the evaluation on,
        if 0. the evaluation is not performed on fraction of training data,
        if 'dev' the size is determined from dev set size
        :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training
        and kept fixed during training, otherwise it's sampled at beginning of each epoch
        :param save_model_at_each_epoch: If True, at each epoch the thus far trained model will be saved
        :param kwargs: Other arguments for the Optimizer
        :return:
        """

        if self.use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

                writer = SummaryWriter()
            except:
                log_line(log)
                log.warning(
                    "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!"
                )
                log_line(log)
                self.use_tensorboard = False
                pass

        if use_amp:
            if sys.version_info < (3, 0):
                raise RuntimeError(
                    "Apex currently only supports Python 3. Aborting.")
            if amp is None:
                raise RuntimeError(
                    "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
                    "to enable mixed-precision training.")

        if mini_batch_chunk_size is None:
            mini_batch_chunk_size = mini_batch_size
        if learning_rate < min_learning_rate:
            min_learning_rate = learning_rate / 10

        initial_learning_rate = learning_rate

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        log_handler = add_file_handler(log, base_path / "training.log")

        log_line(log)
        log.info(f'Model: "{self.model}"')
        log_line(log)
        log.info(f'Corpus: "{self.corpus}"')
        log_line(log)
        log.info("Parameters:")
        log.info(f' - learning_rate: "{learning_rate}"')
        log.info(f' - mini_batch_size: "{mini_batch_size}"')
        log.info(f' - patience: "{patience}"')
        log.info(f' - anneal_factor: "{anneal_factor}"')
        log.info(f' - max_epochs: "{max_epochs}"')
        log.info(f' - shuffle: "{shuffle}"')
        log.info(f' - train_with_dev: "{train_with_dev}"')
        log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"')
        log_line(log)
        log.info(f'Model training base path: "{base_path}"')
        log_line(log)
        log.info(f"Device: {flair.device}")
        log_line(log)
        log.info(f"Embeddings storage mode: {embeddings_storage_mode}")
        if isinstance(self.model, SequenceTagger
                      ) and self.model.weight_dict and self.model.use_crf:
            log_line(log)
            log.warning(
                f'WARNING: Specified class weights will not take effect when using CRF'
            )

        # determine what splits (train, dev, test) to evaluate and log
        log_train = True if monitor_train else False
        log_test = (True if (not param_selection_mode and self.corpus.test
                             and monitor_test) else False)
        log_dev = False if train_with_dev or not self.corpus.dev else True
        log_train_part = (True if (eval_on_train_fraction == "dev"
                                   or eval_on_train_fraction > 0.0) else False)

        if log_train_part:
            train_part_size = (len(
                self.corpus.dev) if eval_on_train_fraction == "dev" else int(
                    len(self.corpus.train) * eval_on_train_fraction))
            assert train_part_size > 0
            if not eval_on_train_shuffle:
                train_part_indices = list(range(train_part_size))
                train_part = torch.utils.data.dataset.Subset(
                    self.corpus.train, train_part_indices)

        # prepare loss logging file and set up header
        loss_txt = init_output_file(base_path, "loss.tsv")

        weight_extractor = WeightExtractor(base_path)

        optimizer: torch.optim.Optimizer = self.optimizer(
            self.model.parameters(), lr=learning_rate, **kwargs)

        if use_amp:
            self.model, optimizer = amp.initialize(self.model,
                                                   optimizer,
                                                   opt_level=amp_opt_level)

        # minimize training loss if training with dev data, else maximize dev score
        anneal_mode = "min" if train_with_dev else "max"

        if scheduler == OneCycleLR:
            dataset_size = len(self.corpus.train)
            if train_with_dev:
                dataset_size += len(self.corpus.dev)
            lr_scheduler = OneCycleLR(
                optimizer,
                max_lr=learning_rate,
                steps_per_epoch=dataset_size // mini_batch_size + 1,
                epochs=max_epochs - self.epoch,  # if we load a checkpoint, we have already trained for self.epoch
                pct_start=0.0,
                cycle_momentum=cycle_momentum)
        else:
            lr_scheduler = scheduler(
                optimizer,
                factor=anneal_factor,
                patience=patience,
                initial_extra_patience=initial_extra_patience,
                mode=anneal_mode,
                verbose=True,
            )

        if (isinstance(lr_scheduler, OneCycleLR) and batch_growth_annealing):
            raise ValueError(
                "Batch growth with OneCycle policy is not implemented.")

        train_data = self.corpus.train

        # if training also uses dev/train data, include in training set
        if train_with_dev or train_with_test:

            parts = [self.corpus.train]
            if train_with_dev: parts.append(self.corpus.dev)
            if train_with_test: parts.append(self.corpus.test)

            train_data = ConcatDataset(parts)

        # initialize sampler if provided
        if sampler is not None:
            # init with default values if only class is provided
            if inspect.isclass(sampler):
                sampler = sampler()
            # set dataset to sample from
            sampler.set_dataset(train_data)
            shuffle = False

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        micro_batch_size = mini_batch_chunk_size

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate
            momentum = 0
            for group in optimizer.param_groups:
                if "momentum" in group:
                    momentum = group["momentum"]

            for self.epoch in range(self.epoch + 1, max_epochs + 1):
                log_line(log)

                if anneal_with_prestarts:
                    last_epoch_model_state_dict = copy.deepcopy(
                        self.model.state_dict())

                if eval_on_train_shuffle:
                    train_part_indices = list(range(len(self.corpus.train)))
                    random.shuffle(train_part_indices)
                    train_part_indices = train_part_indices[:train_part_size]
                    train_part = torch.utils.data.dataset.Subset(
                        self.corpus.train, train_part_indices)

                # get new learning rate
                for group in optimizer.param_groups:
                    learning_rate = group["lr"]

                if learning_rate != previous_learning_rate and batch_growth_annealing:
                    mini_batch_size *= 2

                # reload last best model if annealing with restarts is enabled
                if ((anneal_with_restarts or anneal_with_prestarts)
                        and learning_rate != previous_learning_rate
                        and (base_path / "best-model.pt").exists()):
                    if anneal_with_restarts:
                        log.info("resetting to best model")
                        self.model.load_state_dict(
                            self.model.load(base_path /
                                            "best-model.pt").state_dict())
                    if anneal_with_prestarts:
                        log.info("resetting to pre-best model")
                        self.model.load_state_dict(
                            self.model.load(base_path /
                                            "pre-best-model.pt").state_dict())

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if (not isinstance(lr_scheduler, OneCycleLR)
                    ) and learning_rate < min_learning_rate:
                    log_line(log)
                    log.info("learning rate too small - quitting training!")
                    log_line(log)
                    break

                batch_loader = DataLoader(
                    train_data,
                    batch_size=mini_batch_size,
                    shuffle=shuffle,
                    num_workers=num_workers,
                    sampler=sampler,
                )

                self.model.train()

                train_loss: float = 0

                seen_batches = 0
                total_number_of_batches = len(batch_loader)

                modulo = max(1, int(total_number_of_batches / 10))

                # process mini-batches
                batch_time = 0
                for batch_no, batch in enumerate(batch_loader):
                    start_time = time.time()

                    # zero the gradients on the model and optimizer
                    self.model.zero_grad()
                    optimizer.zero_grad()

                    # if necessary, make batch_steps
                    batch_steps = [batch]
                    if len(batch) > micro_batch_size:
                        batch_steps = [
                            batch[x:x + micro_batch_size]
                            for x in range(0, len(batch), micro_batch_size)
                        ]

                    # forward and backward for batch
                    for batch_step in batch_steps:

                        # forward pass
                        loss = self.model.forward_loss(batch_step)

                        # Backward
                        if use_amp:
                            with amp.scale_loss(loss,
                                                optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()

                    # do the optimizer step
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    # do the scheduler step if one-cycle
                    if isinstance(lr_scheduler, OneCycleLR):
                        lr_scheduler.step()
                        # get new learning rate
                        for group in optimizer.param_groups:
                            learning_rate = group["lr"]
                            if "momentum" in group:
                                momentum = group["momentum"]

                    seen_batches += 1
                    train_loss += loss.item()

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(batch, embeddings_storage_mode)

                    batch_time += time.time() - start_time
                    if seen_batches % modulo == 0:
                        momentum_info = f' - momentum: {momentum:.4f}' if cycle_momentum else ''
                        log.info(
                            f"epoch {self.epoch} - iter {seen_batches}/{total_number_of_batches} - loss "
                            f"{train_loss / seen_batches:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}"
                            f" - lr: {learning_rate:.6f}{momentum_info}")
                        batch_time = 0
                        iteration = self.epoch * total_number_of_batches + batch_no
                        if not param_selection_mode and write_weights:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= seen_batches

                self.model.eval()

                log_line(log)
                log.info(
                    f"EPOCH {self.epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}"
                )

                if self.use_tensorboard:
                    writer.add_scalar("train_loss", train_loss, self.epoch)

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss

                # evaluate on train / dev / test split depending on training settings
                result_line: str = ""

                if log_train:
                    train_eval_result, train_loss = self.model.evaluate(
                        self.corpus.train,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{train_eval_result.log_line}"

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.train,
                                     embeddings_storage_mode)

                if log_train_part:
                    train_part_eval_result, train_part_loss = self.model.evaluate(
                        train_part,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += (
                        f"\t{train_part_loss}\t{train_part_eval_result.log_line}"
                    )
                    log.info(
                        f"TRAIN_SPLIT : loss {train_part_loss} - score {round(train_part_eval_result.main_score, 4)}"
                    )

                if log_dev:
                    dev_eval_result, dev_loss = self.model.evaluate(
                        self.corpus.dev,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        out_path=base_path / "dev.tsv",
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}"
                    log.info(
                        f"DEV : loss {dev_loss} - score {round(dev_eval_result.main_score, 4)}"
                    )
                    # calculate scores using dev data if available
                    # append dev score to score history
                    dev_score_history.append(dev_eval_result.main_score)
                    dev_loss_history.append(dev_loss.item())

                    current_score = dev_eval_result.main_score

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.dev, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("dev_loss", dev_loss, self.epoch)
                        writer.add_scalar("dev_score",
                                          dev_eval_result.main_score,
                                          self.epoch)

                if log_test:
                    test_eval_result, test_loss = self.model.evaluate(
                        self.corpus.test,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        out_path=base_path / "test.tsv",
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{test_loss}\t{test_eval_result.log_line}"
                    log.info(
                        f"TEST : loss {test_loss} - score {round(test_eval_result.main_score, 4)}"
                    )

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.test, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("test_loss", test_loss, self.epoch)
                        writer.add_scalar("test_score",
                                          test_eval_result.main_score,
                                          self.epoch)

                # determine learning rate annealing through scheduler. Use auxiliary metric for AnnealOnPlateau
                if log_dev and isinstance(lr_scheduler, AnnealOnPlateau):
                    lr_scheduler.step(current_score, dev_loss)
                elif not isinstance(lr_scheduler, OneCycleLR):
                    lr_scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # determine bad epoch number
                try:
                    bad_epochs = lr_scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    new_learning_rate = group["lr"]
                if new_learning_rate != previous_learning_rate:
                    bad_epochs = patience + 1
                    if previous_learning_rate == initial_learning_rate:
                        bad_epochs += initial_extra_patience

                # log bad epochs
                log.info(f"BAD EPOCHS (no improvement): {bad_epochs}")

                # output log file
                with open(loss_txt, "a") as f:

                    # make headers on first epoch
                    if self.epoch == 1:
                        f.write(
                            f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS"
                        )

                        if log_train:
                            f.write("\tTRAIN_" + "\tTRAIN_".join(
                                train_eval_result.log_header.split("\t")))
                        if log_train_part:
                            f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" +
                                    "\tTRAIN_PART_".join(
                                        train_part_eval_result.log_header.
                                        split("\t")))
                        if log_dev:
                            f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(
                                dev_eval_result.log_header.split("\t")))
                        if log_test:
                            f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(
                                test_eval_result.log_header.split("\t")))

                    f.write(
                        f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}"
                    )
                    f.write(result_line)

                # if checkpoint is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.save_checkpoint(base_path / "checkpoint.pt")

                # if we use dev data, remember best model based on dev evaluation score
                if ((not train_with_dev or anneal_with_restarts
                     or anneal_with_prestarts) and not param_selection_mode
                        and not isinstance(lr_scheduler, OneCycleLR)
                        and current_score == lr_scheduler.best
                        and bad_epochs == 0):
                    print("saving best model")
                    self.model.save(base_path / "best-model.pt")

                    if anneal_with_prestarts:
                        current_state_dict = self.model.state_dict()
                        self.model.load_state_dict(last_epoch_model_state_dict)
                        self.model.save(base_path / "pre-best-model.pt")
                        self.model.load_state_dict(current_state_dict)

                if save_model_at_each_epoch:
                    print("saving model of current epoch")
                    model_name = "model_epoch_" + str(self.epoch) + ".pt"
                    self.model.save(base_path / model_name)

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / "final-model.pt")

        except KeyboardInterrupt:
            log_line(log)
            log.info("Exiting from training early.")

            if self.use_tensorboard:
                writer.close()

            if not param_selection_mode:
                log.info("Saving model ...")
                self.model.save(base_path / "final-model.pt")
                log.info("Done.")

        # test best model if test data is present
        if self.corpus.test and not train_with_test:
            final_score = self.final_test(base_path, mini_batch_chunk_size,
                                          num_workers)
        else:
            final_score = 0
            log.info("Test data not provided setting final score to 0")

        log.removeHandler(log_handler)

        if self.use_tensorboard:
            writer.close()

        return {
            "test_score": final_score,
            "dev_score_history": dev_score_history,
            "train_loss_history": train_loss_history,
            "dev_loss_history": dev_loss_history,
        }
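
A hedged sketch of calling this train() variant with a OneCycleLR schedule and a monitored fraction of the training data. The tagger/corpus pair is a placeholder; OneCycleLR comes from PyTorch, and the keyword names mirror the signature above.

# Hypothetical usage sketch - tagger and corpus are placeholders for any
# flair model/corpus pair.
from pathlib import Path

from torch.optim.lr_scheduler import OneCycleLR
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)

trainer.train(
    base_path=Path("resources/taggers/one-cycle"),
    learning_rate=0.01,              # acts as max_lr when the scheduler is OneCycleLR
    mini_batch_size=32,
    mini_batch_chunk_size=8,         # large batches are processed in chunks of 8
    max_epochs=20,
    scheduler=OneCycleLR,
    cycle_momentum=True,
    eval_on_train_fraction=0.1,      # also evaluate on 10% of the training data
    embeddings_storage_mode="none",
    num_workers=4,
)
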
Example No. 26
    def train(
            self,
            base_path: Union[Path, str],
            learning_rate: float = 0.1,
            mini_batch_size: int = 32,
            mini_batch_chunk_size: Optional[int] = None,
            max_epochs: int = 100,
            train_with_dev: bool = False,
            train_with_test: bool = False,
            monitor_train: bool = False,
            monitor_test: bool = False,
            main_evaluation_metric: Tuple[str, str] = ("micro avg", 'f1-score'),
            scheduler=AnnealOnPlateau,
            anneal_factor: float = 0.5,
            patience: int = 3,
            min_learning_rate: float = 0.0001,
            initial_extra_patience: int = 0,
            optimizer: torch.optim.Optimizer = SGD,
            cycle_momentum: bool = False,
            warmup_fraction: float = 0.1,
            embeddings_storage_mode: str = "cpu",
            checkpoint: bool = False,
            save_final_model: bool = True,
            anneal_with_restarts: bool = False,
            anneal_with_prestarts: bool = False,
            anneal_against_dev_loss: bool = False,
            batch_growth_annealing: bool = False,
            shuffle: bool = True,
            param_selection_mode: bool = False,
            write_weights: bool = False,
            num_workers: int = 6,
            sampler=None,
            use_amp: bool = False,
            amp_opt_level: str = "O1",
            eval_on_train_fraction: float = 0.0,
            eval_on_train_shuffle: bool = False,
            save_model_each_k_epochs: int = 0,
            tensorboard_comment: str = '',
            use_swa: bool = False,
            use_final_model_for_eval: bool = False,
            gold_label_dictionary_for_eval: Optional[Dictionary] = None,
            create_file_logs: bool = True,
            create_loss_file: bool = True,
            epoch: int = 0,
            use_tensorboard: bool = False,
            tensorboard_log_dir=None,
            metrics_for_tensorboard=[],
            optimizer_state_dict: Optional = None,
            scheduler_state_dict: Optional = None,
            save_optimizer_state: bool = False,
            **kwargs,
    ) -> dict:
        """
        Trains any class that implements the flair.nn.Model interface.
        :param base_path: Main path to which all output during training is logged and models are saved
        :param learning_rate: Initial learning rate (or max, if scheduler is OneCycleLR)
        :param mini_batch_size: Size of mini-batches during training
        :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes
        :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
        :param scheduler: The learning rate scheduler to use
        :param checkpoint: If True, a full checkpoint is saved at end of each epoch
        :param cycle_momentum: If scheduler is OneCycleLR, whether the scheduler should cycle also the momentum
        :param anneal_factor: The factor by which the learning rate is annealed
        :param patience: Patience is the number of epochs with no improvement the Trainer waits
         until annealing the learning rate
        :param min_learning_rate: If the learning rate falls below this threshold, training terminates
        :param warmup_fraction: Fraction of warmup steps if the scheduler is LinearSchedulerWithWarmup
        :param train_with_dev:  If True, the data from dev split is added to the training data
        :param train_with_test: If True, the data from test split is added to the training data
        :param monitor_train: If True, training data is evaluated at end of each epoch
        :param monitor_test: If True, test data is evaluated at end of each epoch
        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
        :param save_final_model: If True, final model is saved
        :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate
        :param shuffle: If True, data is shuffled during training
        :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing
        parameter selection.
        :param num_workers: Number of workers in your data loader.
        :param sampler: You can pass a data sampler here for special sampling of data.
        :param eval_on_train_fraction: the fraction of train data to do the evaluation on,
        if 0. the evaluation is not performed on fraction of training data,
        if 'dev' the size is determined from dev set size
        :param eval_on_train_shuffle: if True the train data fraction is determined on the start of training
        and kept fixed during training, otherwise it's sampled at beginning of each epoch
        :param save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will
        be saved each 5 epochs. Default is 0 which means no model saving.
        :param main_evaluation_metric: Type of metric to use for best model tracking and learning rate scheduling (if dev data is available, otherwise loss will be used), currently only applicable for text_classification_model
        :param tensorboard_comment: Comment to use for tensorboard logging
        :param create_file_logs: If True, the logs will also be stored in a file 'training.log' in the model folder
        :param create_loss_file: If True, the loss will be written to a file 'loss.tsv' in the model folder
        :param optimizer: The optimizer to use (typically SGD or Adam)
        :param epoch: The starting epoch (normally 0 but could be higher if you continue training model)
        :param use_tensorboard: If True, writes out tensorboard information
        :param tensorboard_log_dir: Directory into which tensorboard log files will be written
        :param metrics_for_tensorboard: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example
        :param kwargs: Other arguments for the Optimizer
        :return:
        """

        # create a model card for this model with Flair and PyTorch version
        model_card = {'flair_version': flair.__version__, 'pytorch_version': torch.__version__}

        # also record Transformers version if library is loaded
        try:
            import transformers
            model_card['transformers_version'] = transformers.__version__
        except:
            pass

        # remember all parameters used in train() call
        local_variables = locals()
        training_parameters = {}
        for parameter in signature(self.train).parameters:
            training_parameters[parameter] = local_variables[parameter]
        model_card['training_parameters'] = training_parameters

        # add model card to model
        self.model.model_card = model_card

        if use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

                if tensorboard_log_dir is not None and not os.path.exists(tensorboard_log_dir):
                    os.mkdir(tensorboard_log_dir)
                writer = SummaryWriter(log_dir=tensorboard_log_dir, comment=tensorboard_comment)
                log.info(f"tensorboard logging path is {tensorboard_log_dir}")

            except:
                log_line(log)
                log.warning("ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!")
                log_line(log)
                use_tensorboard = False
                pass

        if use_amp:
            if sys.version_info < (3, 0):
                raise RuntimeError("Apex currently only supports Python 3. Aborting.")
            if amp is None:
                raise RuntimeError(
                    "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
                    "to enable mixed-precision training."
                )

        if mini_batch_chunk_size is None:
            mini_batch_chunk_size = mini_batch_size
        if learning_rate < min_learning_rate:
            min_learning_rate = learning_rate / 10

        initial_learning_rate = learning_rate

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)
        base_path.mkdir(exist_ok=True, parents=True)

        if create_file_logs:
            log_handler = add_file_handler(log, base_path / "training.log")
        else:
            log_handler = None

        log_line(log)
        log.info(f'Model: "{self.model}"')
        log_line(log)
        log.info(f'Corpus: "{self.corpus}"')
        log_line(log)
        log.info("Parameters:")
        log.info(f' - learning_rate: "{learning_rate}"')
        log.info(f' - mini_batch_size: "{mini_batch_size}"')
        log.info(f' - patience: "{patience}"')
        log.info(f' - anneal_factor: "{anneal_factor}"')
        log.info(f' - max_epochs: "{max_epochs}"')
        log.info(f' - shuffle: "{shuffle}"')
        log.info(f' - train_with_dev: "{train_with_dev}"')
        log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"')
        log_line(log)
        log.info(f'Model training base path: "{base_path}"')
        log_line(log)
        log.info(f"Device: {flair.device}")
        log_line(log)
        log.info(f"Embeddings storage mode: {embeddings_storage_mode}")
        if isinstance(self.model, SequenceTagger) and self.model.weight_dict and self.model.use_crf:
            log_line(log)
            log.warning(f'WARNING: Specified class weights will not take effect when using CRF')

        # check for previously saved best models in the current training folder and delete them
        self.check_for_and_delete_previous_best_models(base_path)

        # determine what splits (train, dev, test) to evaluate and log
        log_train = True if monitor_train else False
        log_test = True if (not param_selection_mode and self.corpus.test and monitor_test) else False
        log_dev = False if train_with_dev or not self.corpus.dev else True
        log_train_part = True if (eval_on_train_fraction == "dev" or eval_on_train_fraction > 0.0) else False

        if log_train_part:
            train_part_size = len(self.corpus.dev) if eval_on_train_fraction == "dev" \
                else int(len(self.corpus.train) * eval_on_train_fraction)

            assert train_part_size > 0
            if not eval_on_train_shuffle:
                train_part_indices = list(range(train_part_size))
                train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices)

        # prepare loss logging file and set up header
        loss_txt = init_output_file(base_path, "loss.tsv") if create_loss_file else None

        weight_extractor = WeightExtractor(base_path)

        # if optimizer class is passed, instantiate:
        if inspect.isclass(optimizer):
            optimizer: torch.optim.Optimizer = optimizer(self.model.parameters(), lr=learning_rate, **kwargs)

        if use_swa:
            import torchcontrib
            optimizer = torchcontrib.optim.SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=learning_rate)

        if use_amp:
            self.model, optimizer = amp.initialize(
                self.model, optimizer, opt_level=amp_opt_level
            )

        # load existing optimizer state dictionary if it exists
        if optimizer_state_dict:
            optimizer.load_state_dict(optimizer_state_dict)

        # minimize training loss if training with dev data, else maximize dev score
        anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max"
        best_validation_score = 100000000000 if train_with_dev or anneal_against_dev_loss else 0.

        dataset_size = len(self.corpus.train)
        if train_with_dev:
            dataset_size += len(self.corpus.dev)

        # if scheduler is passed as a class, instantiate
        if inspect.isclass(scheduler):
            if scheduler == OneCycleLR:
                scheduler = OneCycleLR(optimizer,
                                       max_lr=learning_rate,
                                       steps_per_epoch=dataset_size // mini_batch_size + 1,
                                       epochs=max_epochs - epoch,
                                       # if we load a checkpoint, we have already trained for epoch
                                       pct_start=0.0,
                                       cycle_momentum=cycle_momentum)
            elif scheduler == LinearSchedulerWithWarmup:
                steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size
                num_train_steps = int(steps_per_epoch * max_epochs)
                num_warmup_steps = int(num_train_steps * warmup_fraction)

                scheduler = LinearSchedulerWithWarmup(optimizer,
                                                      num_train_steps=num_train_steps,
                                                      num_warmup_steps=num_warmup_steps)
            else:
                scheduler = scheduler(
                    optimizer,
                    factor=anneal_factor,
                    patience=patience,
                    initial_extra_patience=initial_extra_patience,
                    mode=anneal_mode,
                    verbose=True,
                )

        # load existing scheduler state dictionary if it exists
        if scheduler_state_dict:
            scheduler.load_state_dict(scheduler_state_dict)

        # update optimizer and scheduler in model card
        model_card['training_parameters']['optimizer'] = optimizer
        model_card['training_parameters']['scheduler'] = scheduler

        if isinstance(scheduler, OneCycleLR) and batch_growth_annealing:
            raise ValueError("Batch growth with OneCycle policy is not implemented.")

        train_data = self.corpus.train

        # if training also uses dev/train data, include in training set
        if train_with_dev or train_with_test:

            parts = [self.corpus.train]
            if train_with_dev: parts.append(self.corpus.dev)
            if train_with_test: parts.append(self.corpus.test)

            train_data = ConcatDataset(parts)

        # initialize sampler if provided
        if sampler is not None:
            # init with default values if only class is provided
            if inspect.isclass(sampler):
                sampler = sampler()
            # set dataset to sample from
            sampler.set_dataset(train_data)
            shuffle = False

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        micro_batch_size = mini_batch_chunk_size

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate
            momentum = 0
            for group in optimizer.param_groups:
                if "momentum" in group:
                    momentum = group["momentum"]

            for epoch in range(epoch + 1, max_epochs + 1):
                log_line(log)

                # update epoch in model card
                self.model.model_card['training_parameters']['epoch'] = epoch

                if anneal_with_prestarts:
                    last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict())

                if eval_on_train_shuffle:
                    train_part_indices = list(range(len(self.corpus.train)))
                    random.shuffle(train_part_indices)
                    train_part_indices = train_part_indices[:train_part_size]
                    train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices)

                # get new learning rate
                for group in optimizer.param_groups:
                    learning_rate = group["lr"]

                if learning_rate != previous_learning_rate and batch_growth_annealing:
                    mini_batch_size *= 2

                # reload last best model if annealing with restarts is enabled
                if (
                        (anneal_with_restarts or anneal_with_prestarts)
                        and learning_rate != previous_learning_rate
                        and os.path.exists(base_path / "best-model.pt")
                ):
                    if anneal_with_restarts:
                        log.info("resetting to best model")
                        self.model.load_state_dict(
                            self.model.load(base_path / "best-model.pt").state_dict()
                        )
                    if anneal_with_prestarts:
                        log.info("resetting to pre-best model")
                        self.model.load_state_dict(
                            self.model.load(base_path / "pre-best-model.pt").state_dict()
                        )

                previous_learning_rate = learning_rate
                if use_tensorboard:
                    writer.add_scalar("learning_rate", learning_rate, epoch)

                # stop training if learning rate becomes too small
                if ((not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and
                     learning_rate < min_learning_rate)):
                    log_line(log)
                    log.info("learning rate too small - quitting training!")
                    log_line(log)
                    break

                batch_loader = DataLoader(
                    train_data,
                    batch_size=mini_batch_size,
                    shuffle=shuffle if epoch > 1 else False,  # never shuffle the first epoch
                    num_workers=num_workers,
                    sampler=sampler,
                )

                self.model.train()

                train_loss: float = 0

                seen_batches = 0
                total_number_of_batches = len(batch_loader)

                modulo = max(1, int(total_number_of_batches / 10))

                # process mini-batches
                batch_time = 0
                average_over = 0
                for batch_no, batch in enumerate(batch_loader):

                    start_time = time.time()

                    # zero the gradients on the model and optimizer
                    self.model.zero_grad()
                    optimizer.zero_grad()

                    # if necessary, make batch_steps
                    batch_steps = [batch]
                    if len(batch) > micro_batch_size:
                        batch_steps = [batch[x: x + micro_batch_size] for x in range(0, len(batch), micro_batch_size)]

                    # forward and backward for batch
                    for batch_step in batch_steps:

                        # forward pass
                        loss = self.model.forward_loss(batch_step)

                        if isinstance(loss, Tuple):
                            average_over += loss[1]
                            loss = loss[0]

                        # Backward
                        if use_amp:
                            with amp.scale_loss(loss, optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()
                        train_loss += loss.item()

                    # do the optimizer step
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                    optimizer.step()

                    # do the scheduler step if one-cycle or linear decay
                    if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)):
                        scheduler.step()
                        # get new learning rate
                        for group in optimizer.param_groups:
                            learning_rate = group["lr"]
                            if "momentum" in group:
                                momentum = group["momentum"]
                            if "betas" in group:
                                momentum, _ = group["betas"]

                    seen_batches += 1

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(batch, embeddings_storage_mode)

                    batch_time += time.time() - start_time
                    if seen_batches % modulo == 0:
                        momentum_info = f' - momentum: {momentum:.4f}' if cycle_momentum else ''
                        intermittent_loss = train_loss / average_over if average_over > 0 else train_loss / seen_batches
                        log.info(
                            f"epoch {epoch} - iter {seen_batches}/{total_number_of_batches} - loss "
                            f"{intermittent_loss:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}"
                            f" - lr: {learning_rate:.6f}{momentum_info}"
                        )
                        batch_time = 0
                        iteration = epoch * total_number_of_batches + batch_no
                        if not param_selection_mode and write_weights:
                            weight_extractor.extract_weights(self.model.state_dict(), iteration)

                if average_over != 0:
                    train_loss /= average_over

                self.model.eval()

                log_line(log)
                log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}")

                if use_tensorboard:
                    writer.add_scalar("train_loss", train_loss, epoch)

                # evaluate on train / dev / test split depending on training settings
                result_line: str = ""

                if log_train:
                    train_eval_result = self.model.evaluate(
                        self.corpus.train,
                        gold_label_type=self.model.label_type,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                        main_evaluation_metric=main_evaluation_metric,
                        gold_label_dictionary=gold_label_dictionary_for_eval,
                    )
                    result_line += f"\t{train_eval_result.log_line}"

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.train, embeddings_storage_mode)

                if log_train_part:
                    train_part_eval_result = self.model.evaluate(
                        train_part,
                        gold_label_type=self.model.label_type,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                        main_evaluation_metric=main_evaluation_metric,
                        gold_label_dictionary=gold_label_dictionary_for_eval,
                    )
                    result_line += f"\t{train_part_eval_result.loss}\t{train_part_eval_result.log_line}"

                    log.info(
                        f"TRAIN_SPLIT : loss {train_part_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]}) {round(train_part_eval_result.main_score, 4)}"
                    )
                    if use_tensorboard:
                        for (metric_class_avg_type, metric_type) in metrics_for_tensorboard:
                            writer.add_scalar(
                                f"train_{metric_class_avg_type}_{metric_type}",
                                train_part_eval_result.classification_report[metric_class_avg_type][metric_type], epoch
                            )

                if log_dev:
                    dev_eval_result = self.model.evaluate(
                        self.corpus.dev,
                        gold_label_type=self.model.label_type,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        out_path=base_path / "dev.tsv",
                        embedding_storage_mode=embeddings_storage_mode,
                        main_evaluation_metric=main_evaluation_metric,
                        gold_label_dictionary=gold_label_dictionary_for_eval,
                    )
                    result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}"
                    log.info(
                        f"DEV : loss {dev_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]})  {round(dev_eval_result.main_score, 4)}"
                    )
                    # calculate scores using dev data if available
                    # append dev score to score history
                    dev_score_history.append(dev_eval_result.main_score)
                    dev_loss_history.append(dev_eval_result.loss)

                    dev_score = dev_eval_result.main_score

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.dev, embeddings_storage_mode)

                    if use_tensorboard:
                        writer.add_scalar("dev_loss", dev_eval_result.loss, epoch)
                        writer.add_scalar("dev_score", dev_eval_result.main_score, epoch)
                        for (metric_class_avg_type, metric_type) in metrics_for_tensorboard:
                            writer.add_scalar(
                                f"dev_{metric_class_avg_type}_{metric_type}",
                                dev_eval_result.classification_report[metric_class_avg_type][metric_type], epoch
                            )

                if log_test:
                    test_eval_result = self.model.evaluate(
                        self.corpus.test,
                        gold_label_type=self.model.label_type,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        out_path=base_path / "test.tsv",
                        embedding_storage_mode=embeddings_storage_mode,
                        main_evaluation_metric=main_evaluation_metric,
                        gold_label_dictionary=gold_label_dictionary_for_eval,
                    )
                    result_line += f"\t{test_eval_result.loss}\t{test_eval_result.log_line}"
                    log.info(
                        f"TEST : loss {test_eval_result.loss} - {main_evaluation_metric[1]} ({main_evaluation_metric[0]})  {round(test_eval_result.main_score, 4)}"
                    )

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.test, embeddings_storage_mode)

                    if use_tensorboard:
                        writer.add_scalar("test_loss", test_eval_result.loss, epoch)
                        writer.add_scalar("test_score", test_eval_result.main_score, epoch)
                        for (metric_class_avg_type, metric_type) in metrics_for_tensorboard:
                            writer.add_scalar(
                                f"test_{metric_class_avg_type}_{metric_type}",
                                test_eval_result.classification_report[metric_class_avg_type][metric_type], epoch
                            )

                # determine if this is the best model or if we need to anneal
                current_epoch_has_best_model_so_far = False
                # default mode: anneal against dev score
                if not train_with_dev and not anneal_against_dev_loss:
                    if dev_score > best_validation_score:
                        current_epoch_has_best_model_so_far = True
                        best_validation_score = dev_score

                    if isinstance(scheduler, AnnealOnPlateau):
                        scheduler.step(dev_score, dev_eval_result.loss)

                # alternative: anneal against dev loss
                if not train_with_dev and anneal_against_dev_loss:
                    if dev_eval_result.loss < best_validation_score:
                        current_epoch_has_best_model_so_far = True
                        best_validation_score = dev_eval_result.loss

                    if isinstance(scheduler, AnnealOnPlateau):
                        scheduler.step(dev_eval_result.loss)

                # alternative: anneal against train loss
                if train_with_dev:
                    if train_loss < best_validation_score:
                        current_epoch_has_best_model_so_far = True
                        best_validation_score = train_loss

                    if isinstance(scheduler, AnnealOnPlateau):
                        scheduler.step(train_loss)

                train_loss_history.append(train_loss)

                # determine bad epoch number
                try:
                    bad_epochs = scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    new_learning_rate = group["lr"]
                if new_learning_rate != previous_learning_rate:
                    bad_epochs = patience + 1
                    if previous_learning_rate == initial_learning_rate: bad_epochs += initial_extra_patience

                # log bad epochs
                log.info(f"BAD EPOCHS (no improvement): {bad_epochs}")

                if create_loss_file:
                    # output log file
                    with open(loss_txt, "a") as f:

                        # make headers on first epoch
                        if epoch == 1:
                            f.write(f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS")

                            if log_train:
                                f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t")))

                            if log_train_part:
                                f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" + "\tTRAIN_PART_".join(
                                    train_part_eval_result.log_header.split("\t")))

                            if log_dev:
                                f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t")))

                            if log_test:
                                f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t")))

                        f.write(
                            f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}"
                        )
                        f.write(result_line)

                # if checkpoint is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.model.save(base_path / "checkpoint.pt", checkpoint=True)

                # Check whether to save best model
                if (
                        (not train_with_dev or anneal_with_restarts or anneal_with_prestarts)
                        and not param_selection_mode
                        and current_epoch_has_best_model_so_far
                        and not use_final_model_for_eval
                ):
                    log.info("saving best model")
                    self.model.save(base_path / "best-model.pt", checkpoint=save_optimizer_state)

                    if anneal_with_prestarts:
                        current_state_dict = self.model.state_dict()
                        self.model.load_state_dict(last_epoch_model_state_dict)
                        self.model.save(base_path / "pre-best-model.pt")
                        self.model.load_state_dict(current_state_dict)

                if save_model_each_k_epochs > 0 and not epoch % save_model_each_k_epochs:
                    print("saving model of current epoch")
                    model_name = "model_epoch_" + str(epoch) + ".pt"
                    self.model.save(base_path / model_name, checkpoint=save_optimizer_state)

            if use_swa:
                optimizer.swap_swa_sgd()

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state)

        except KeyboardInterrupt:
            log_line(log)
            log.info("Exiting from training early.")

            if use_tensorboard:
                writer.close()

            if not param_selection_mode:
                log.info("Saving model ...")
                self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state)
                log.info("Done.")

        # test best model if test data is present
        if self.corpus.test and not train_with_test:
            final_score = self.final_test(
                base_path=base_path,
                eval_mini_batch_size=mini_batch_chunk_size,
                num_workers=num_workers,
                main_evaluation_metric=main_evaluation_metric,
                gold_label_dictionary_for_eval=gold_label_dictionary_for_eval,
            )
        else:
            final_score = 0
            log.info("Test data not provided setting final score to 0")

        if create_file_logs:
            log_handler.close()
            log.removeHandler(log_handler)

        if use_tensorboard:
            writer.close()

        return {
            "test_score": final_score,
            "dev_score_history": dev_score_history,
            "train_loss_history": train_loss_history,
            "dev_loss_history": dev_loss_history,
        }
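
The sketch below is not part of the example above; it only illustrates how a trainer with this `train()` signature is typically invoked. It assumes a standard Flair setup (the downloadable `UD_ENGLISH` corpus, GloVe word embeddings, a `SequenceTagger`); the output path and hyperparameter values are placeholders.

from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# assumed setup: a small part-of-speech tagging task
corpus = UD_ENGLISH()
label_dictionary = corpus.make_label_dictionary(label_type="upos")

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=WordEmbeddings("glove"),
    tag_dictionary=label_dictionary,
    tag_type="upos",
)

trainer = ModelTrainer(tagger, corpus)

# the keyword arguments mirror parameters handled in the method body above:
# plateau annealing, micro-batching and per-epoch checkpoints
trainer.train(
    "resources/taggers/example-upos",  # placeholder base_path
    learning_rate=0.1,
    mini_batch_size=32,
    mini_batch_chunk_size=16,          # forward/backward in micro-batches
    max_epochs=10,
    checkpoint=True,
)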
Example No. 27
 def train(self,
           base_path: Union[Path, str],
           learning_rate: float = 0.1,
           mini_batch_size: int = 32,
           eval_mini_batch_size: int = None,
           max_epochs: int = 100,
           anneal_factor: float = 0.5,
           patience: int = 3,
           min_learning_rate: float = 0.0001,
           train_with_dev: bool = False,
           monitor_train: bool = False,
           monitor_test: bool = False,
           embeddings_storage_mode: str = 'cpu',
           checkpoint: bool = False,
           save_final_model: bool = True,
           anneal_with_restarts: bool = False,
           shuffle: bool = True,
           param_selection_mode: bool = False,
           num_workers: int = 6,
           sampler=None,
           use_amp: bool = False,
           amp_opt_level: str = 'O1',
           **kwargs) -> dict:
     "\n        Trains any class that implements the flair.nn.Model interface.\n        :param base_path: Main path to which all output during training is logged and models are saved\n        :param learning_rate: Initial learning rate\n        :param mini_batch_size: Size of mini-batches during training\n        :param eval_mini_batch_size: Size of mini-batches during evaluation\n        :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.\n        :param anneal_factor: The factor by which the learning rate is annealed\n        :param patience: Patience is the number of epochs with no improvement the Trainer waits\n         until annealing the learning rate\n        :param min_learning_rate: If the learning rate falls below this threshold, training terminates\n        :param train_with_dev: If True, training is performed using both train+dev data\n        :param monitor_train: If True, training data is evaluated at end of each epoch\n        :param monitor_test: If True, test data is evaluated at end of each epoch\n        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),\n        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)\n        :param checkpoint: If True, a full checkpoint is saved at end of each epoch\n        :param save_final_model: If True, final model is saved\n        :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate\n        :param shuffle: If True, data is shuffled during training\n        :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing\n        parameter selection.\n        :param num_workers: Number of workers in your data loader.\n        :param sampler: You can pass a data sampler here for special sampling of data.\n        :param kwargs: Other arguments for the Optimizer\n        :return:\n        "
     if self.use_tensorboard:
         try:
             from torch.utils.tensorboard import SummaryWriter
             writer = SummaryWriter()
         except:
             log_line(log)
             log.warning(
                 'ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!'
             )
             log_line(log)
             self.use_tensorboard = False
             pass
     if use_amp:
         if (sys.version_info < (3, 0)):
             raise RuntimeError(
                 'Apex currently only supports Python 3. Aborting.')
         if (amp is None):
             raise RuntimeError(
                 'Failed to import apex. Please install apex from https://www.github.com/nvidia/apex to enable mixed-precision training.'
             )
     if (eval_mini_batch_size is None):
         eval_mini_batch_size = mini_batch_size
     if (type(base_path) is str):
         base_path = Path(base_path)
     log_handler = add_file_handler(log, (base_path / 'training.log'))
     log_line(log)
      log.info(f'Model: "{self.model}"')
      log_line(log)
      log.info(f'Corpus: "{self.corpus}"')
      log_line(log)
      log.info('Parameters:')
      log.info(f' - learning_rate: "{learning_rate}"')
      log.info(f' - mini_batch_size: "{mini_batch_size}"')
      log.info(f' - patience: "{patience}"')
      log.info(f' - anneal_factor: "{anneal_factor}"')
      log.info(f' - max_epochs: "{max_epochs}"')
      log.info(f' - shuffle: "{shuffle}"')
      log.info(f' - train_with_dev: "{train_with_dev}"')
      log_line(log)
      log.info(f'Model training base path: "{base_path}"')
      log_line(log)
      log.info(f'Device: {flair.device}')
      log_line(log)
      log.info(f'Embeddings storage mode: {embeddings_storage_mode}')
      log_train = monitor_train
      log_test = bool((not param_selection_mode) and self.corpus.test and monitor_test)
      log_dev = not train_with_dev
     loss_txt = init_output_file(base_path, 'loss.tsv')
     weight_extractor = WeightExtractor(base_path)
     optimizer = self.optimizer(self.model.parameters(),
                                lr=learning_rate,
                                **kwargs)
     if (self.optimizer_state is not None):
         optimizer.load_state_dict(self.optimizer_state)
     if use_amp:
         (self.model, optimizer) = amp.initialize(self.model,
                                                  optimizer,
                                                  opt_level=amp_opt_level)
     anneal_mode = ('min' if train_with_dev else 'max')
     scheduler = ReduceLROnPlateau(optimizer,
                                   factor=anneal_factor,
                                   patience=patience,
                                   mode=anneal_mode,
                                   verbose=True)
     if (self.scheduler_state is not None):
         scheduler.load_state_dict(self.scheduler_state)
     train_data = self.corpus.train
     if train_with_dev:
         train_data = ConcatDataset([self.corpus.train, self.corpus.dev])
     if (sampler is not None):
         sampler = sampler(train_data)
         shuffle = False
     dev_score_history = []
     dev_loss_history = []
     train_loss_history = []
     try:
         previous_learning_rate = learning_rate
         for epoch in range((0 + self.epoch), (max_epochs + self.epoch)):
             log_line(log)
             for group in optimizer.param_groups:
                 learning_rate = group['lr']
             if ((learning_rate != previous_learning_rate)
                     and anneal_with_restarts
                     and (base_path / 'best-model.pt').exists()):
                 log.info('resetting to best model')
                 self.model.load((base_path / 'best-model.pt'))
             previous_learning_rate = learning_rate
             if (learning_rate < min_learning_rate):
                 log_line(log)
                 log.info('learning rate too small - quitting training!')
                 log_line(log)
                 break
             batch_loader = DataLoader(train_data,
                                       batch_size=mini_batch_size,
                                       shuffle=shuffle,
                                       num_workers=num_workers,
                                       sampler=sampler)
             self.model.train()
             train_loss = 0
             seen_batches = 0
             total_number_of_batches = len(batch_loader)
             modulo = max(1, int((total_number_of_batches / 10)))
             batch_time = 0
             for (batch_no, batch) in enumerate(batch_loader):
                 start_time = time.time()
                 loss = self.model.forward_loss(batch)
                 optimizer.zero_grad()
                 if use_amp:
                     with amp.scale_loss(loss, optimizer) as scaled_loss:
                         scaled_loss.backward()
                 else:
                     loss.backward()
                 torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                5.0)
                 optimizer.step()
                 seen_batches += 1
                 train_loss += loss.item()
                 store_embeddings(batch, embeddings_storage_mode)
                 batch_time += (time.time() - start_time)
                 if ((batch_no % modulo) == 0):
                      log.info(
                          f'epoch {epoch + 1} - iter {batch_no}/{total_number_of_batches}'
                          f' - loss {train_loss / seen_batches:.8f}'
                          f' - samples/sec: {(mini_batch_size * modulo) / batch_time:.2f}')
                     batch_time = 0
                     iteration = ((epoch * total_number_of_batches) +
                                  batch_no)
                     if (not param_selection_mode):
                         weight_extractor.extract_weights(
                             self.model.state_dict(), iteration)
             train_loss /= seen_batches
             self.model.eval()
             log_line(log)
              log.info(f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f}')
             if self.use_tensorboard:
                 writer.add_scalar('train_loss', train_loss, (epoch + 1))
             current_score = train_loss
             result_line = ''
             if log_train:
                 (train_eval_result, train_loss) = self.model.evaluate(
                     DataLoader(self.corpus.train,
                                batch_size=eval_mini_batch_size,
                                num_workers=num_workers),
                     embeddings_storage_mode=embeddings_storage_mode)
                  result_line += f'\t{train_eval_result.log_line}'
                 store_embeddings(self.corpus.train,
                                  embeddings_storage_mode)
             if log_dev:
                 (dev_eval_result, dev_loss) = self.model.evaluate(
                     DataLoader(self.corpus.dev,
                                batch_size=eval_mini_batch_size,
                                num_workers=num_workers),
                     embeddings_storage_mode=embeddings_storage_mode)
                  result_line += f'\t{dev_loss}\t{dev_eval_result.log_line}'
                  log.info(f'DEV : loss {dev_loss} - score {dev_eval_result.main_score}')
                 dev_score_history.append(dev_eval_result.main_score)
                 dev_loss_history.append(dev_loss)
                 current_score = dev_eval_result.main_score
                 store_embeddings(self.corpus.dev, embeddings_storage_mode)
                 if self.use_tensorboard:
                     writer.add_scalar('dev_loss', dev_loss, (epoch + 1))
                     writer.add_scalar('dev_score',
                                       dev_eval_result.main_score,
                                       (epoch + 1))
             if log_test:
                 (test_eval_result, test_loss) = self.model.evaluate(
                     DataLoader(self.corpus.test,
                                batch_size=eval_mini_batch_size,
                                num_workers=num_workers),
                     (base_path / 'test.tsv'),
                     embeddings_storage_mode=embeddings_storage_mode)
                  result_line += f'\t{test_loss}\t{test_eval_result.log_line}'
                  log.info(f'TEST : loss {test_loss} - score {test_eval_result.main_score}')
                 store_embeddings(self.corpus.test, embeddings_storage_mode)
                 if self.use_tensorboard:
                     writer.add_scalar('test_loss', test_loss, (epoch + 1))
                     writer.add_scalar('test_score',
                                       test_eval_result.main_score,
                                       (epoch + 1))
             scheduler.step(current_score)
             train_loss_history.append(train_loss)
             try:
                 bad_epochs = scheduler.num_bad_epochs
             except:
                 bad_epochs = 0
             for group in optimizer.param_groups:
                 new_learning_rate = group['lr']
             if (new_learning_rate != previous_learning_rate):
                 bad_epochs = (patience + 1)
              log.info(f'BAD EPOCHS (no improvement): {bad_epochs}')
             with open(loss_txt, 'a') as f:
                 if (epoch == 0):
                     f.write(
                         'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS'
                     )
                     if log_train:
                         f.write(('\tTRAIN_' + '\tTRAIN_'.join(
                             train_eval_result.log_header.split('\t'))))
                     if log_dev:
                         f.write(('\tDEV_LOSS\tDEV_' + '\tDEV_'.join(
                             dev_eval_result.log_header.split('\t'))))
                     if log_test:
                         f.write(('\tTEST_LOSS\tTEST_' + '\tTEST_'.join(
                             test_eval_result.log_header.split('\t'))))
                  f.write(
                      f'\n{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}'
                      f'\t{learning_rate:.4f}\t{train_loss}'
                  )
                 f.write(result_line)
             if (checkpoint and (not param_selection_mode)):
                 self.model.save_checkpoint((base_path / 'checkpoint.pt'),
                                            optimizer.state_dict(),
                                            scheduler.state_dict(),
                                            (epoch + 1), train_loss)
             if ((not train_with_dev) and (not param_selection_mode)
                     and (current_score == scheduler.best)):
                 self.model.save((base_path / 'best-model.pt'))
         if (save_final_model and (not param_selection_mode)):
             self.model.save((base_path / 'final-model.pt'))
     except KeyboardInterrupt:
         log_line(log)
         log.info('Exiting from training early.')
         if self.use_tensorboard:
             writer.close()
         if (not param_selection_mode):
             log.info('Saving model ...')
             self.model.save((base_path / 'final-model.pt'))
             log.info('Done.')
     if self.corpus.test:
         final_score = self.final_test(base_path, eval_mini_batch_size,
                                       num_workers)
     else:
         final_score = 0
          log.info('Test data not provided, setting final score to 0')
     log.removeHandler(log_handler)
     if self.use_tensorboard:
         writer.close()
     return {
         'test_score': final_score,
         'dev_score_history': dev_score_history,
         'train_loss_history': train_loss_history,
         'dev_loss_history': dev_loss_history,
     }
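
As a rough usage sketch (reusing the tagger and corpus assumed in the earlier sketch), the older signature above would be called with its explicit monitoring and embedding-storage flags; the path is again a placeholder.

trainer = ModelTrainer(tagger, corpus)
trainer.train(
    "resources/taggers/example-upos-old-api",  # placeholder base_path
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=50,
    embeddings_storage_mode="cpu",  # keep computed embeddings on CPU between epochs
    monitor_test=True,              # evaluate on the test split after every epoch
    checkpoint=True,
)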
Example No. 28
    def _objective(self, params: dict):
        log_line(log)
        log.info(f"Evaluation run: {self.run}")
        log.info(f"Evaluating parameter combination:")
        for k, v in params.items():
            if isinstance(v, Tuple):
                v = ",".join([str(x) for x in v])
            log.info(f"\t{k}: {str(v)}")
        log_line(log)

        for sent in self.corpus.get_all_sentences():
            sent.clear_embeddings()

        scores = []
        vars = []

        for i in range(0, self.training_runs):
            log_line(log)
            log.info(f"Training run: {i + 1}")

            model = self._set_up_model(params)

            training_params = {
                key: params[key] for key in params if key in TRAINING_PARAMETERS
            }
            model_trainer_parameters = {
                key: params[key] for key in params if key in MODEL_TRAINER_PARAMETERS
            }

            trainer: ModelTrainer = ModelTrainer(
                model, self.corpus, **model_trainer_parameters
            )

            result = trainer.train(
                self.base_path,
                evaluation_metric=self.evaluation_metric,
                max_epochs=self.max_epochs,
                param_selection_mode=True,
                **training_params,
            )

            # take the average over the last three scores of training
            if self.optimization_value == OptimizationValue.DEV_LOSS:
                curr_scores = result["dev_loss_history"][-3:]
            else:
                curr_scores = list(
                    map(lambda s: 1 - s, result["dev_score_history"][-3:])
                )

            score = sum(curr_scores) / float(len(curr_scores))
            var = np.var(curr_scores)
            scores.append(score)
            vars.append(var)

        # take average over the scores from the different training runs
        final_score = sum(scores) / float(len(scores))
        final_var = sum(vars) / float(len(vars))

        test_score = result["test_score"]
        log_line(log)
        log.info(f"Done evaluating parameter combination:")
        for k, v in params.items():
            if isinstance(v, Tuple):
                v = ",".join([str(x) for x in v])
            log.info(f"\t{k}: {v}")
        log.info(f"{self.optimization_value.value}: {final_score}")
        log.info(f"variance: {final_var}")
        log.info(f"test_score: {test_score}\n")
        log_line(log)

        with open(self.param_selection_file, "a") as f:
            f.write(f"evaluation run {self.run}\n")
            for k, v in params.items():
                if isinstance(v, Tuple):
                    v = ",".join([str(x) for x in v])
                f.write(f"\t{k}: {str(v)}\n")
            f.write(f"{self.optimization_value.value}: {final_score}\n")
            f.write(f"variance: {final_var}\n")
            f.write(f"test_score: {test_score}\n")
            f.write("-" * 100 + "\n")

        self.run += 1

        return {"status": "ok", "loss": final_score, "loss_variance": final_var}
Example No. 29
    def find_learning_rate(
        self,
        base_path: Union[Path, str],
        file_name: str = "learning_rate.tsv",
        start_learning_rate: float = 1e-7,
        end_learning_rate: float = 10,
        iterations: int = 100,
        mini_batch_size: int = 32,
        stop_early: bool = True,
        smoothing_factor: float = 0.98,
        **kwargs,
    ) -> Path:
        best_loss = None
        moving_avg_loss = 0

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)
        learning_rate_tsv = init_output_file(base_path, file_name)

        with open(learning_rate_tsv, "a") as f:
            f.write("ITERATION\tTIMESTAMP\tLEARNING_RATE\tTRAIN_LOSS\n")

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=start_learning_rate,
                                   **kwargs)

        train_data = self.corpus.train

        scheduler = ExpAnnealLR(optimizer, end_learning_rate, iterations)

        model_state = self.model.state_dict()
        self.model.train()

        step = 0
        while step < iterations:
            batch_loader = DataLoader(train_data,
                                      batch_size=mini_batch_size,
                                      shuffle=True)
            for batch in batch_loader:
                step += 1

                # forward pass
                loss = self.model.forward_loss(batch)

                # update optimizer and scheduler
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()
                scheduler.step(step)

                print(scheduler.get_lr())
                learning_rate = scheduler.get_lr()[0]

                loss_item = loss.item()
                if step == 1:
                    best_loss = loss_item
                else:
                    if smoothing_factor > 0:
                        moving_avg_loss = (smoothing_factor * moving_avg_loss +
                                           (1 - smoothing_factor) * loss_item)
                        loss_item = moving_avg_loss / (1 - smoothing_factor**
                                                       (step + 1))
                    if loss_item < best_loss:
                        best_loss = loss_item

                if step > iterations:
                    break

                if stop_early and (loss_item > 4 * best_loss
                                   or torch.isnan(loss)):
                    log_line(log)
                    log.info("loss diverged - stopping early!")
                    step = iterations
                    break

                with open(str(learning_rate_tsv), "a") as f:
                    f.write(
                        f"{step}\t{datetime.datetime.now():%H:%M:%S}\t{learning_rate}\t{loss_item}\n"
                    )

            self.model.load_state_dict(model_state)
            self.model.to(flair.device)

        log_line(log)
        log.info(f"learning rate finder finished - plot {learning_rate_tsv}")
        log_line(log)

        return Path(learning_rate_tsv)
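
`find_learning_rate` writes one row per step to the TSV (ITERATION, TIMESTAMP, LEARNING_RATE, TRAIN_LOSS). A common way to use its output, sketched below with an assumed output path, is to plot the smoothed loss against the learning rate on a log axis and pick a rate just before the loss starts to diverge.

import matplotlib.pyplot as plt
import pandas as pd

# placeholder path: wherever base_path / file_name ended up
curve = pd.read_csv("resources/taggers/example-upos/learning_rate.tsv", sep="\t")

plt.plot(curve["LEARNING_RATE"], curve["TRAIN_LOSS"])
plt.xscale("log")  # learning rates span several orders of magnitude
plt.xlabel("learning rate (log scale)")
plt.ylabel("smoothed training loss")
plt.savefig("learning_rate_curve.png")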
Example No. 30
 def _objective(self, params):
     log_line(log)
     log.info(u''.join([u'Evaluation run: ', u'{}'.format(self.run)]))
     log.info(u''.join([u'Evaluating parameter combination:']))
     for (k, v) in params.items():
         if isinstance(v, Tuple):
          v = u','.join([str(x) for x in v])
         log.info(u''.join(
              [u'\t', u'{}'.format(k), u': ', u'{}'.format(str(v))]))
     log_line(log)
     for sent in self.corpus.get_all_sentences():
         sent.clear_embeddings()
     scores = []
     vars = []
     for i in range(0, self.training_runs):
         log_line(log)
         log.info(u''.join([u'Training run: ', u'{}'.format((i + 1))]))
         model = self._set_up_model(params)
         training_params = {
             key: params[key]
             for key in params if (key in TRAINING_PARAMETERS)
         }
         model_trainer_parameters = {
             key: params[key]
             for key in params if (key in MODEL_TRAINER_PARAMETERS)
         }
         trainer = ModelTrainer(model, self.corpus,
                                **model_trainer_parameters)
         result = trainer.train(self.base_path,
                                evaluation_metric=self.evaluation_metric,
                                max_epochs=self.max_epochs,
                                param_selection_mode=True,
                                **training_params)
         if (self.optimization_value == OptimizationValue.DEV_LOSS):
             curr_scores = result[u'dev_loss_history'][(-3):]
         else:
             curr_scores = list(
                 map((lambda s: (1 - s)),
                     result[u'dev_score_history'][(-3):]))
         score = (sum(curr_scores) / float(len(curr_scores)))
         var = np.var(curr_scores)
         scores.append(score)
         vars.append(var)
     final_score = (sum(scores) / float(len(scores)))
     final_var = (sum(vars) / float(len(vars)))
     test_score = result[u'test_score']
     log_line(log)
     log.info(u''.join([u'Done evaluating parameter combination:']))
     for (k, v) in params.items():
         if isinstance(v, Tuple):
              v = u','.join([str(x) for x in v])
         log.info(u''.join([u'\t', u'{}'.format(k), u': ',
                            u'{}'.format(v)]))
     log.info(u''.join([
         u'{}'.format(self.optimization_value.value), u': ',
         u'{}'.format(final_score)
     ]))
     log.info(u''.join([u'variance: ', u'{}'.format(final_var)]))
     log.info(u''.join([u'test_score: ', u'{}'.format(test_score), u'\n']))
     log_line(log)
     with open(self.param_selection_file, u'a') as f:
         f.write(u''.join(
             [u'evaluation run ', u'{}'.format(self.run), u'\n']))
         for (k, v) in params.items():
             if isinstance(v, Tuple):
                  v = u','.join([str(x) for x in v])
             f.write(u''.join([
                  u'\t', u'{}'.format(k), u': ', u'{}'.format(str(v)),
                 u'\n'
             ]))
         f.write(u''.join([
             u'{}'.format(self.optimization_value.value), u': ',
             u'{}'.format(final_score), u'\n'
         ]))
         f.write(u''.join([u'variance: ', u'{}'.format(final_var), u'\n']))
         f.write(u''.join(
             [u'test_score: ', u'{}'.format(test_score), u'\n']))
         f.write(((u'-' * 100) + u'\n'))
     self.run += 1
     return {
         u'status': u'ok',
         u'loss': final_score,
         u'loss_variance': final_var,
     }