예제 #1
0
    def __init__(
        self,
        pipeline: "Pipeline",
        train_dataset: Optional[Union[Dataset, InstanceDataset]] = None,
        valid_dataset: Optional[Union[Dataset, InstanceDataset]] = None,
        trainer_config: Optional[TrainerConfiguration] = None,
        vocab_config: Optional[Union[str, VocabularyConfiguration]] = "default",
        lazy: bool = False,
    ):
        """Set up the trainer: copy the configuration, resolve the vocabulary
        configuration, turn the datasets into instances and build the
        underlying `pl.Trainer`.
        """
        self._pipeline = pipeline
        self._lazy = lazy

        # Work on a deep copy so the caller's config object is never mutated
        # (`asdict` returns a deep copy of the dataclass fields)
        if trainer_config is None:
            self._trainer_config = TrainerConfiguration()
        else:
            self._trainer_config = TrainerConfiguration(**asdict(trainer_config))

        # Default to a single GPU when one is available and none was requested
        if torch.cuda.is_available() and self._trainer_config.gpus is None:
            self._trainer_config.gpus = 1

        if vocab_config == "default":
            self._vocab_config: Optional[VocabularyConfiguration] = (
                VocabularyConfiguration()
            )
        else:
            self._vocab_config = vocab_config

        def to_instances(dataset, description):
            # Anything that is not a `Dataset` (already instances, or None)
            # is passed through untouched
            if not isinstance(dataset, Dataset):
                return dataset
            return dataset.to_instances(
                self._pipeline, lazy=self._lazy, tqdm_desc=description
            )

        self._train_instances = to_instances(
            train_dataset, "Loading training instances"
        )
        self._valid_instances = to_instances(
            valid_dataset, "Loading validation instances"
        )

        # Without training data we may only want to call `self.test`
        if self._train_instances is not None:
            self._setup_for_training()

        self.trainer = pl.Trainer(**self._trainer_config.lightning_params)
예제 #2
0
def test_add_default_loggers(input_kwargs, expected_loggers, pipeline_dict,
                             dataset, tmp_path):
    """Check that the Trainer adds the expected default loggers.

    When the user does not explicitly disable logging (``logger=False``),
    the trainer should expose a ``LoggerCollection`` containing one logger
    per expected entry, and each expected logger type must be present in
    the trainer configuration.
    """
    trainer_config = TrainerConfiguration(**input_kwargs,
                                          default_root_dir=str(tmp_path))
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=trainer_config,
    )
    if input_kwargs.get("logger") is not False:
        assert isinstance(trainer.trainer.logger, LoggerCollection)
        assert len(trainer.trainer.logger.experiment) == len(expected_loggers)
    else:
        assert trainer._trainer_config.logger is False

    def loggers_include(logger_type) -> bool:
        # Generator expression lets `any` short-circuit without building a list
        return any(
            isinstance(logger, logger_type)
            for logger in trainer._trainer_config.logger
        )

    # Map the string identifiers to the concrete logger classes they stand for;
    # an unknown identifier in `expected_loggers` now fails loudly (KeyError)
    # instead of being silently skipped.
    logger_classes = {
        "csv": CSVLogger,
        "tensorboard": TensorBoardLogger,
        "wandb": WandbLogger,
        "mlflow": MLFlowLogger,
    }
    for logger in expected_loggers:
        assert loggers_include(logger_classes[logger])
        if logger == "wandb":
            # The wandb logger must also create its run directory
            assert (tmp_path / "wandb").is_dir()
예제 #3
0
def trainer_config() -> TrainerConfiguration:
    """Return a small training configuration shared by the tests."""
    config_kwargs = {
        "batch_size": 64,
        "num_epochs": 5,
        "optimizer": {"type": "adam", "lr": 0.01},
    }
    return TrainerConfiguration(**config_kwargs)
예제 #4
0
def test_create_output_dir(pipeline_dict, dataset, tmp_path):
    """`fit` must create a non-existing output directory, parents included."""
    config = TrainerConfiguration(
        logger=False,
        fast_dev_run=True,
        batch_size=1,
        max_epochs=1,
        gpus=0,
    )
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=config,
    )

    output_dir = tmp_path / "test_this_non_existing_parent_dir" / "output"
    trainer.fit(output_dir=output_dir)

    assert output_dir.is_dir()
예제 #5
0
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pipeline = Pipeline.from_config(pipeline_dict)
    train_ds, valid_ds = train_valid_dataset[0], train_valid_dataset[1]

    vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50})
    trainer_config = TrainerConfiguration(
        batch_size=64,
        optimizer={"type": "adam", "lr": 0.01},
        max_epochs=5,
        default_root_dir=str(tmp_path),
        gpus=0,  # turn off gpus even if available
    )

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )

    trainer.fit(tmp_path / "output")

    # Vocabulary sizes after fitting with the capped word namespace
    assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 83

    assert pipeline.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)

    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456,
                                                    abs=0.003)

    Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz"))

    # Loading the pretrained model must not alter the vocabulary
    assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 83
예제 #6
0
def test_deep_copy_of_trainer_config(pipeline_dict, dataset):
    """The Trainer must keep its own copy of the configuration it is given."""
    original_config = TrainerConfiguration()
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=original_config,
    )
    # The passed-in object and the internally stored one are distinct instances
    assert original_config is not trainer._trainer_config
예제 #7
0
    def train(
        self,
        output: str,
        training: Union[DataSource, InstancesDataset],
        trainer: Optional[TrainerConfiguration] = None,
        validation: Optional[Union[DataSource, InstancesDataset]] = None,
        test: Optional[Union[DataSource, InstancesDataset]] = None,
        extend_vocab: Optional[VocabularyConfiguration] = None,
        loggers: Optional[List[BaseTrainLogger]] = None,
        restore: bool = False,
        quiet: bool = False,
    ) -> TrainingResults:
        """Launches a training run with the specified configurations and data sources

        Parameters
        ----------
        output:
            The experiment output path
        training:
            The training DataSource
        trainer:
            The trainer file path
        validation:
            The validation DataSource (optional)
        test:
            The test DataSource (optional)
        extend_vocab:
            Extends the vocabulary tokens with the provided VocabularyConfiguration
        loggers:
            A list of loggers that execute a callback before the training, after each epoch,
            and at the end of the training (see `biome.text.logger.MlflowLogger`, for example)
        restore:
            If enabled, tries to read previous training status from the `output` folder and
            continues the training process
        quiet:
            If enabled, disables most logging messages keeping only warning and error messages.
            In any case, all logging info will be stored into a file at ${output}/train.log

        Returns
        -------
        training_results
            Training results including the generated model path and the related metrics
        """
        if extend_vocab is not None and isinstance(self, _BlankPipeline):
            raise ActionNotSupportedError(
                "If you want to customize pipeline vocab, please use the `create_vocabulary()` method instead"
            )

        trainer = trainer or TrainerConfiguration()
        try:
            # A fresh (non-restored) run starts from a clean output folder
            if not restore and os.path.isdir(output):
                shutil.rmtree(output)

            self.__configure_training_logging(output, quiet)

            # The original pipeline keeps unchanged
            train_pipeline = self._make_copy()
            vocab = None

            if restore:
                vocab = vocabulary.load_vocabulary(
                    os.path.join(output, "vocabulary"))
            # Only extend the vocab if no restored vocabulary took precedence
            if extend_vocab is not None and not vocab:
                vocab = train_pipeline._extend_vocabulary(
                    train_pipeline.backbone.vocab, vocab_config=extend_vocab)
            if vocab:
                train_pipeline._set_vocab(vocab)

            if vocabulary.is_empty(train_pipeline.backbone.vocab,
                                   self.config.features.keys):
                raise EmptyVocabError(
                    "Found an empty vocabulary. "
                    "You probably forgot to create a vocabulary with '.create_vocabulary()'."
                )

            from ._helpers import PipelineTrainer

            # Convert any raw DataSource into a dataset the trainer can consume
            datasets = {
                "training": training,
                "validation": validation,
                "test": test
            }
            for name, dataset in datasets.items():
                if isinstance(dataset, DataSource):
                    datasets[name] = train_pipeline.create_dataset(dataset)

            loggers = loggers or []
            # NOTE(review): the return value is discarded here — confirm the
            # helper mutates `loggers` in place rather than returning a new list
            add_default_wandb_logger_if_needed(loggers)

            pipeline_trainer = PipelineTrainer(
                train_pipeline,
                trainer_config=trainer,
                output_dir=output,
                epoch_callbacks=loggers,
                **datasets,
            )

            # Logger callbacks are best-effort: a failing logger must not
            # abort the training run
            for logger in loggers:
                try:
                    logger.init_train(
                        pipeline=train_pipeline,
                        trainer_configuration=trainer,
                        **datasets,
                    )
                except Exception as e:
                    self.__LOGGER.warning("Logger %s failed on init_train: %s",
                                          logger, e)

            model_path, metrics = pipeline_trainer.train()
            train_results = TrainingResults(model_path, metrics)

            for logger in loggers:
                try:
                    logger.end_train(train_results)
                except Exception as e:
                    # Fixed typo in the log message ("end_traing" -> "end_train")
                    self.__LOGGER.warning("Logger %s failed on end_train: %s",
                                          logger, e)

            return train_results

        finally:
            # Always restore the global logging configuration, even on failure
            self.__restore_training_logging()
예제 #8
0
    def train(
        self,
        output: str,
        training: Union[Dataset, InstancesDataset],
        trainer: Optional[TrainerConfiguration] = None,
        validation: Optional[Union[Dataset, InstancesDataset]] = None,
        test: Optional[Union[Dataset, InstancesDataset]] = None,
        vocab_config: Optional[Union[VocabularyConfiguration,
                                     str]] = "default",
        loggers: Optional[List[BaseTrainLogger]] = None,
        lazy: bool = False,
        restore: bool = False,
        quiet: bool = False,
    ) -> TrainingResults:
        """Launches a training run with the specified configurations and data sources

        Parameters
        ----------
        output
            The experiment output path
        training
            The training Dataset
        trainer
            The trainer file path
        validation
            The validation Dataset (optional)
        test
            The test Dataset (optional)
        vocab_config
            A `VocabularyConfiguration` to create/extend the pipeline's vocabulary if necessary.
            If 'default' (str), we will use the default configuration `VocabularyConfiguration(datasets=[training])`.
            If None, we will leave the pipeline's vocabulary untouched.
        loggers
            A list of loggers that execute a callback before the training, after each epoch,
            and at the end of the training (see `biome.text.logger.MlflowLogger`, for example)
        lazy
            If true, dataset instances are lazily loaded from disk, otherwise they are loaded and kept in memory.
        restore
            If enabled, tries to read previous training status from the `output` folder and
            continues the training process
        quiet
            If enabled, disables most logging messages keeping only warning and error messages.
            In any case, all logging info will be stored into a file at ${output}/train.log

        Returns
        -------
        training_results
            Training results including the generated model path and the related metrics
        """
        trainer = trainer or TrainerConfiguration()
        try:
            # A fresh (non-restored) run starts from a clean output folder
            if not restore and os.path.isdir(output):
                shutil.rmtree(output)

            self.__configure_training_logging(output, quiet)

            # On restore, reload the vocabulary saved in the output folder
            self._prepare_vocab(
                vocabulary_folder=os.path.join(output, "vocabulary")
                if restore else None,
                vocab_config=vocab_config,
                training_data=training,
                lazy=lazy,
            )

            from ._helpers import PipelineTrainer

            # Convert any raw Dataset into instances the trainer can consume
            datasets = {
                "training": training,
                "validation": validation,
                "test": test
            }
            for name, dataset in datasets.items():
                if isinstance(dataset, Dataset):
                    datasets[name] = dataset.to_instances(pipeline=self,
                                                          lazy=lazy)

            loggers = loggers or []
            loggers = add_default_wandb_logger_if_needed(loggers)

            pipeline_trainer = PipelineTrainer(
                self,
                trainer_config=trainer,
                output_dir=output,
                epoch_callbacks=loggers,
                **datasets,
            )

            # Logger callbacks are best-effort: a failing logger must not
            # abort the training run
            for logger in loggers:
                try:
                    logger.init_train(
                        pipeline=self,
                        trainer_configuration=trainer,
                        **datasets,
                    )
                except Exception as e:
                    self.__LOGGER.warning("Logger %s failed on init_train: %s",
                                          logger, e)

            self._model.file_path, metrics = pipeline_trainer.train()
            train_results = TrainingResults(self.model_path, metrics)

            for logger in loggers:
                try:
                    logger.end_train(train_results)
                except Exception as e:
                    # Fixed typo in the log message ("end_traing" -> "end_train")
                    self.__LOGGER.warning("Logger %s failed on end_train: %s",
                                          logger, e)

            return train_results

        finally:
            # Always restore the global logging configuration, even on failure
            self.__restore_training_logging()