Example #1
from allennlp.common import Params
from allennlp.common.util import prepare_environment
from allennlp.training import GradientDescentTrainer, Trainer

# `Pipeline`, `TrainerConfiguration`, `InstancesDataset`, `helpers`, and
# `create_dataloader` are provided by the surrounding biome.text package.


def create_trainer_for_finding_lr(
    pipeline: Pipeline,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    pipeline
        The pipeline with the model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    # Index the instances with the model's vocabulary if the dataset supports it
    if hasattr(training_data, "index_with"):
        training_data.index_with(pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return Trainer.from_params(
        model=pipeline._model,
        data_loader=training_data_loader,
        params=trainer_params,
        serialization_dir=None,
    )
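
A minimal usage sketch for this helper, assuming a biome.text-style API; `Pipeline.from_yaml`, the configuration fields, and the dataset variable are illustrative assumptions, not part of the snippet above:

pipeline = Pipeline.from_yaml("text_classifier.yaml")
trainer_config = TrainerConfiguration(batch_size=8, data_bucketing=False)

# train_dataset is an InstancesDataset prepared beforehand
trainer = create_trainer_for_finding_lr(
    pipeline=pipeline,
    trainer_config=trainer_config,
    training_data=train_dataset,
)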
Example #2
from typing import cast

from allennlp.common import Params
from allennlp.common.util import prepare_environment
from allennlp.training import GradientDescentTrainer, Trainer

# `PipelineModel`, `TrainerConfiguration`, `InstancesDataset`, `helpers`, and
# `create_dataloader` are provided by the surrounding biome.text package.


def create_trainer_for_finding_lr(
    model: PipelineModel,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    model
        The underlying model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return cast(
        "GradientDescentTrainer",
        Trainer.from_params(
            model=model,
            data_loader=training_data_loader,
            params=trainer_params,
            serialization_dir=None,
        ),
    )
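
Compared to Example #1, this variant takes the model directly and wraps the result in `typing.cast`. The cast only narrows the static type for checkers such as mypy and has no runtime effect, since `Trainer.from_params` is annotated as returning the base `Trainer` class. A minimal sketch of the pattern with standard-library types only:

from typing import cast

class Base:
    pass

class Derived(Base):
    pass

def build() -> Base:  # factory annotated with the base type
    return Derived()

obj = cast("Derived", build())  # narrows the type; no runtime check is performed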
Example #3
    def _setup(self):
        """Setup the trainer components and local resources"""
        prepare_environment(
            Params({} if self._trainer_config.random_seed is None else {
                "random_seed": self._trainer_config.random_seed,
                "numpy_seed": self._trainer_config.random_seed,
                "pytorch_seed": self._trainer_config.random_seed,
            }))
        os.makedirs(self._output_dir, exist_ok=True)

        # We don't need to load pretrained weights from saved models
        if self._pipeline.config.features.word:
            self._pipeline.config.features.word.weights_file = None

        serialization_params = sanitize(self._allennlp_configuration())
        with open(os.path.join(self._output_dir, CONFIG_NAME),
                  "w") as param_file:
            json.dump(serialization_params, param_file, indent=4)

        self._pipeline.save_vocabulary(
            os.path.join(self._output_dir, "vocabulary"))

        for dataset in [self._training, self._validation, self._test]:
            if dataset and hasattr(dataset, "index_with"):
                dataset.index_with(self._pipeline.backbone.vocab)

        trainer_params = Params(
            helpers.sanitize_for_params(
                self._trainer_config.to_allennlp_trainer()))

        pipeline_model = self._pipeline._model

        training_data_loader = create_dataloader(
            self._training,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
            self._trainer_config.batches_per_epoch,
        )

        validation_data_loader = (create_dataloader(
            self._validation,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
        ) if self._validation else None)

        self._trainer = Trainer.from_params(
            model=pipeline_model,
            serialization_dir=self._output_dir,
            data_loader=training_data_loader,
            validation_data_loader=validation_data_loader,
            params=trainer_params,
            epoch_callbacks=self._epoch_callbacks,
        )
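
The `prepare_environment` call at the top of `_setup` is what makes training runs reproducible: given seeds in its `Params`, AllenNLP seeds Python's `random`, NumPy, and PyTorch. A short sketch of the same pattern in isolation, with an illustrative seed value:

from allennlp.common import Params
from allennlp.common.util import prepare_environment

seed = 42  # stands in for trainer_config.random_seed above
prepare_environment(Params({
    "random_seed": seed,
    "numpy_seed": seed,
    "pytorch_seed": seed,
}))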
Example #4
    @classmethod
    def from_dict(cls, config_dict: dict) -> "PipelineConfiguration":
        """Creates a pipeline configuration from a config dictionary

        Parameters
        ----------
        config_dict
            A configuration dictionary

        Returns
        -------
        pipeline_configuration
        """
        config_dict = sanitize_for_params(copy.deepcopy(config_dict))
        return PipelineConfiguration.from_params(Params(config_dict))
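
A usage sketch, assuming a configuration dictionary shaped like the one `as_dict` in Example #5 produces; all keys and values here are illustrative:

config_dict = {
    "name": "my-pipeline",
    "tokenizer": {},
    "features": {"word": {"embedding_dim": 50}},
    "head": {"type": "TextClassification", "labels": ["positive", "negative"]},
}
pipeline_config = PipelineConfiguration.from_dict(config_dict)

Note that `from_dict` deep-copies its input before sanitizing, so the caller's dictionary is left untouched.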
Example #5
    def as_dict(self) -> Dict[str, Any]:
        """Returns the configuration as dictionary

        Returns
        -------
        config
        """
        config = {
            "name": self.name,
            "tokenizer": vars(self.tokenizer_config),
            "features": vars(self.features),
            "head": self.head.config,
        }

        if self.encoder:
            config["encoder"] = self.encoder.config

        return sanitize_for_params(config)
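
The sanitized dictionary is built from plain values, so it can typically be serialized directly; a short sketch, reusing `from_dict` from Example #4 for the round trip:

import json

config = pipeline_config.as_dict()
print(json.dumps(config, indent=2))

restored = PipelineConfiguration.from_dict(config)  # round trip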
Example #6
    @staticmethod
    def _default_trainable(config, reporter):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up MLflow and WandB loggers
        - Set up a TuneMetricsLogger that reports all metrics back to Ray Tune after each epoch
        - Create the vocab if necessary
        - Execute the training
        """
        pipeline = Pipeline.from_config(config["pipeline_config"],
                                        vocab_path=config["vocab_path"])

        trainer_config = TrainerConfiguration(
            **helpers.sanitize_for_params(config["trainer_config"]))

        mlflow_tracking_uri = config["mlflow_tracking_uri"]
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

        train_loggers = [
            MlflowLogger(
                experiment_name=config["name"],
                run_name=reporter.trial_name,
                ray_trial_id=reporter.trial_id,
                ray_logdir=reporter.logdir,
            ),
            TuneMetricsLogger(),
        ]
        if is_wandb_installed_and_logged_in():
            train_loggers = [WandBLogger(project_name=config["name"])
                             ] + train_loggers

        pipeline.train(
            output="training",
            training=train_ds,
            validation=valid_ds,
            trainer=trainer_config,
            loggers=train_loggers,
            vocab_config=None if config["vocab_path"] else "default",
        )
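
A sketch of how such a trainable is typically handed to Ray Tune; it assumes an older Ray release that still supports the `(config, reporter)` function signature, and every config value below is a placeholder:

from ray import tune

analysis = tune.run(
    _default_trainable,
    config={
        "name": "my-hpo-experiment",
        "pipeline_config": pipeline_config_dict,  # e.g. from Example #5
        "trainer_config": {"batch_size": tune.choice([8, 16])},
        "vocab_path": None,
        "mlflow_tracking_uri": "sqlite:///mlruns.db",
        "train_dataset_path": "path/to/train.dataset",
        "valid_dataset_path": "path/to/valid.dataset",
    },
    num_samples=4,
)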