def __init__(
    self,
    pipeline: "Pipeline",
    train_dataset: Optional[Union[Dataset, InstanceDataset]] = None,
    valid_dataset: Optional[Union[Dataset, InstanceDataset]] = None,
    trainer_config: Optional[TrainerConfiguration] = None,
    vocab_config: Optional[Union[str, VocabularyConfiguration]] = "default",
    lazy: bool = False,
):
    """Prepare a trainer for the given pipeline and (optional) datasets.

    Copies the trainer configuration, resolves the vocabulary configuration,
    converts `Dataset`s to instances, and builds the underlying
    `pl.Trainer` from the resulting lightning parameters.
    """
    self._pipeline = pipeline

    # We will modify the config, so keep a private copy
    # (`asdict` returns a deep copy of the dataclass contents).
    if trainer_config is None:
        self._trainer_config = TrainerConfiguration()
    else:
        self._trainer_config = TrainerConfiguration(**asdict(trainer_config))

    # Default to one GPU when available and none was explicitly requested.
    if torch.cuda.is_available() and self._trainer_config.gpus is None:
        self._trainer_config.gpus = 1

    if vocab_config == "default":
        self._vocab_config: Optional[VocabularyConfiguration] = VocabularyConfiguration()
    else:
        self._vocab_config = vocab_config

    self._lazy = lazy

    def _to_instances(dataset, tqdm_desc: str):
        # `Dataset`s get converted; instances (or None) pass through untouched.
        if isinstance(dataset, Dataset):
            return dataset.to_instances(self._pipeline, lazy=self._lazy, tqdm_desc=tqdm_desc)
        return dataset

    self._train_instances = _to_instances(train_dataset, "Loading training instances")
    self._valid_instances = _to_instances(valid_dataset, "Loading validation instances")

    # No training data is fine — maybe we just want to call `self.test`
    if self._train_instances is not None:
        self._setup_for_training()

    self.trainer = pl.Trainer(**self._trainer_config.lightning_params)
def test_add_default_loggers(input_kwargs, expected_loggers, pipeline_dict, dataset, tmp_path):
    """The trainer should wire up the expected default loggers (unless disabled)."""
    config = TrainerConfiguration(**input_kwargs, default_root_dir=str(tmp_path))
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=config,
    )

    if input_kwargs.get("logger") is False:
        # Logging explicitly turned off: the config must keep it off.
        assert trainer._trainer_config.logger is False
    else:
        assert isinstance(trainer.trainer.logger, LoggerCollection)
        assert len(trainer.trainer.logger.experiment) == len(expected_loggers)

    def has_logger_of_type(logger_cls) -> bool:
        return any(isinstance(lg, logger_cls) for lg in trainer._trainer_config.logger)

    type_by_name = {
        "csv": CSVLogger,
        "tensorboard": TensorBoardLogger,
        "wandb": WandbLogger,
        "mlflow": MLFlowLogger,
    }
    for name in expected_loggers:
        expected_cls = type_by_name.get(name)
        if expected_cls is not None:
            assert has_logger_of_type(expected_cls)
        if name == "wandb":
            # wandb additionally creates its run directory under the root dir
            assert (tmp_path / "wandb").is_dir()
def trainer_config() -> TrainerConfiguration:
    """Provide a small Adam-based training configuration for the tests."""
    adam_optimizer = {"type": "adam", "lr": 0.01}
    return TrainerConfiguration(batch_size=64, num_epochs=5, optimizer=adam_optimizer)
def test_create_output_dir(pipeline_dict, dataset, tmp_path):
    """Fitting must create the output dir, including non-existing parent dirs."""
    quick_config = TrainerConfiguration(
        logger=False, fast_dev_run=True, batch_size=1, max_epochs=1, gpus=0
    )
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=quick_config,
    )

    target_dir = tmp_path / "test_this_non_existing_parent_dir" / "output"
    trainer.fit(output_dir=target_dir)

    assert target_dir.is_dir()
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pipeline = Pipeline.from_config(pipeline_dict)
    train_ds, valid_ds = train_valid_dataset[0], train_valid_dataset[1]

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=TrainerConfiguration(
            batch_size=64,
            optimizer={
                "type": "adam",
                "lr": 0.01
            },
            max_epochs=5,
            default_root_dir=str(tmp_path),
            gpus=0,  # turn off gpus even if available
        ),
        vocab_config=VocabularyConfiguration(max_vocab_size={"word": 50}),
    )
    trainer.fit(tmp_path / "output")

    # Vocab was capped to 50 words (+2 special tokens); chars are unrestricted.
    assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pipeline.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)
    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456, abs=0.003)

    Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz"))
    # Loading the pretrained model must leave the original pipeline's vocab intact
    assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 83
def test_deep_copy_of_trainer_config(pipeline_dict, dataset):
    """The Trainer must keep its own copy of the passed-in configuration."""
    original_config = TrainerConfiguration()
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=original_config,
    )
    # A copy, not the same object: later mutations must not leak either way.
    assert trainer._trainer_config is not original_config
def train(
    self,
    output: str,
    training: Union[DataSource, InstancesDataset],
    trainer: Optional[TrainerConfiguration] = None,
    validation: Optional[Union[DataSource, InstancesDataset]] = None,
    test: Optional[Union[DataSource, InstancesDataset]] = None,
    extend_vocab: Optional[VocabularyConfiguration] = None,
    loggers: Optional[List[BaseTrainLogger]] = None,
    restore: bool = False,
    quiet: bool = False,
) -> TrainingResults:
    """Launches a training run with the specified configurations and data sources

    Parameters
    ----------
    output:
        The experiment output path
    training:
        The training DataSource
    trainer:
        The trainer file path
    validation:
        The validation DataSource (optional)
    test:
        The test DataSource (optional)
    extend_vocab:
        Extends the vocabulary tokens with the provided VocabularyConfiguration
    loggers:
        A list of loggers that execute a callback before the training,
        after each epoch, and at the end of the training
        (see `biome.text.logger.MlflowLogger`, for example)
    restore:
        If enabled, tries to read previous training status from the `output` folder
        and continues the training process
    quiet:
        If enabled, disables most logging messages keeping only warning and error messages.
        In any case, all logging info will be stored into a file at ${output}/train.log

    Returns
    -------
    training_results
        Training results including the generated model path and the related metrics

    Raises
    ------
    ActionNotSupportedError
        If `extend_vocab` is given on a blank pipeline (use `create_vocabulary()` instead)
    EmptyVocabError
        If the pipeline ends up with an empty vocabulary
    """
    if extend_vocab is not None and isinstance(self, _BlankPipeline):
        raise ActionNotSupportedError(
            "If you want to customize pipeline vocab, please use the `create_vocabulary()` method instead"
        )

    trainer = trainer or TrainerConfiguration()
    try:
        # A fresh (non-restored) run starts from a clean output folder
        if not restore and os.path.isdir(output):
            shutil.rmtree(output)

        self.__configure_training_logging(output, quiet)

        # The original pipeline keeps unchanged
        train_pipeline = self._make_copy()

        # Vocabulary resolution: restored vocab wins over an extension request
        vocab = None
        if restore:
            vocab = vocabulary.load_vocabulary(os.path.join(output, "vocabulary"))
        if extend_vocab is not None and not vocab:
            vocab = train_pipeline._extend_vocabulary(
                train_pipeline.backbone.vocab, vocab_config=extend_vocab
            )
        if vocab:
            train_pipeline._set_vocab(vocab)

        if vocabulary.is_empty(train_pipeline.backbone.vocab, self.config.features.keys):
            raise EmptyVocabError(
                "Found an empty vocabulary. "
                "You probably forgot to create a vocabulary with '.create_vocabulary()'."
            )

        # local import to avoid a circular dependency at module load time
        from ._helpers import PipelineTrainer

        datasets = {"training": training, "validation": validation, "test": test}
        for name, dataset in datasets.items():
            if isinstance(dataset, DataSource):
                datasets[name] = train_pipeline.create_dataset(dataset)

        loggers = loggers or []
        # FIX: the return value was previously discarded, silently dropping the
        # default wandb logger; assign it as the newer `train` implementation does.
        loggers = add_default_wandb_logger_if_needed(loggers)

        pipeline_trainer = PipelineTrainer(
            train_pipeline,
            trainer_config=trainer,
            output_dir=output,
            epoch_callbacks=loggers,
            **datasets,
        )

        # Logger callbacks are best-effort: a failing logger must not abort training
        for logger in loggers:
            try:
                logger.init_train(
                    pipeline=train_pipeline,
                    trainer_configuration=trainer,
                    **datasets,
                )
            except Exception as e:
                self.__LOGGER.warning("Logger %s failed on init_train: %s", logger, e)

        model_path, metrics = pipeline_trainer.train()
        train_results = TrainingResults(model_path, metrics)

        for logger in loggers:
            try:
                logger.end_train(train_results)
            except Exception as e:
                # FIX: corrected the typo "end_traing" in the log message
                self.__LOGGER.warning("Logger %s failed on end_train: %s", logger, e)

        return train_results
    finally:
        # Always restore the process-wide logging setup, even on failure
        self.__restore_training_logging()
def train(
    self,
    output: str,
    training: Union[Dataset, InstancesDataset],
    trainer: Optional[TrainerConfiguration] = None,
    validation: Optional[Union[Dataset, InstancesDataset]] = None,
    test: Optional[Union[Dataset, InstancesDataset]] = None,
    vocab_config: Optional[Union[VocabularyConfiguration, str]] = "default",
    loggers: Optional[List[BaseTrainLogger]] = None,
    lazy: bool = False,
    restore: bool = False,
    quiet: bool = False,
) -> TrainingResults:
    """Launches a training run with the specified configurations and data sources

    Parameters
    ----------
    output
        The experiment output path
    training
        The training Dataset
    trainer
        The trainer file path
    validation
        The validation Dataset (optional)
    test
        The test Dataset (optional)
    vocab_config
        A `VocabularyConfiguration` to create/extend the pipeline's vocabulary if necessary.
        If 'default' (str), we will use the default configuration
        `VocabularyConfiguration(datasets=[training])`.
        If None, we will leave the pipeline's vocabulary untouched.
    loggers
        A list of loggers that execute a callback before the training,
        after each epoch, and at the end of the training
        (see `biome.text.logger.MlflowLogger`, for example)
    lazy
        If true, dataset instances are lazily loaded from disk, otherwise they are
        loaded and kept in memory.
    restore
        If enabled, tries to read previous training status from the `output` folder
        and continues the training process
    quiet
        If enabled, disables most logging messages keeping only warning and error messages.
        In any case, all logging info will be stored into a file at ${output}/train.log

    Returns
    -------
    training_results
        Training results including the generated model path and the related metrics
    """
    trainer = trainer or TrainerConfiguration()
    try:
        # A fresh (non-restored) run starts from a clean output folder
        if not restore and os.path.isdir(output):
            shutil.rmtree(output)

        self.__configure_training_logging(output, quiet)

        self._prepare_vocab(
            vocabulary_folder=os.path.join(output, "vocabulary") if restore else None,
            vocab_config=vocab_config,
            training_data=training,
            lazy=lazy,
        )

        # local import to avoid a circular dependency at module load time
        from ._helpers import PipelineTrainer

        datasets = {"training": training, "validation": validation, "test": test}
        for name, dataset in datasets.items():
            if isinstance(dataset, Dataset):
                datasets[name] = dataset.to_instances(pipeline=self, lazy=lazy)

        loggers = loggers or []
        loggers = add_default_wandb_logger_if_needed(loggers)

        pipeline_trainer = PipelineTrainer(
            self,
            trainer_config=trainer,
            output_dir=output,
            epoch_callbacks=loggers,
            **datasets,
        )

        # Logger callbacks are best-effort: a failing logger must not abort training
        for logger in loggers:
            try:
                logger.init_train(
                    pipeline=self,
                    trainer_configuration=trainer,
                    **datasets,
                )
            except Exception as e:
                self.__LOGGER.warning("Logger %s failed on init_train: %s", logger, e)

        self._model.file_path, metrics = pipeline_trainer.train()
        train_results = TrainingResults(self.model_path, metrics)

        for logger in loggers:
            try:
                logger.end_train(train_results)
            except Exception as e:
                # FIX: corrected the typo "end_traing" in the log message
                self.__LOGGER.warning("Logger %s failed on end_train: %s", logger, e)

        return train_results
    finally:
        # Always restore the process-wide logging setup, even on failure
        self.__restore_training_logging()