Example #1
    def _save_dataset_to_disk(self, dataset: Dataset) -> str:
        """Saves the dataset to disk if not saved already

        Parameters
        ----------
        dataset
            Dataset to save to disk

        Returns
        -------
        dataset_path
            Path to the directory in which the dataset is saved
        """
        try:
            filename = Path(dataset.dataset.cache_files[0]["filename"])
        except (IndexError, KeyError):
            # No cache file on disk yet: fall through to saving in a tmp dir below
            filename = Path()

        if filename.name != "dataset.arrow":
            tmp_dir = tempfile.TemporaryDirectory()
            self._created_tmp_dirs.append(tmp_dir)
            dataset_path = tmp_dir.name
            dataset.save_to_disk(dataset_path)
        else:
            dataset_path = str(filename.absolute())

        # Make sure that we can load the dataset successfully
        try:
            Dataset.load_from_disk(dataset_path)
        except Exception as exception:
            raise ValidationError(
                f"Could not load dataset saved in '{dataset_path}'"
            ) from exception

        return dataset_path
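
A note on the TemporaryDirectory bookkeeping above: keeping every tmp_dir alive in self._created_tmp_dirs is presumably deliberate, since a TemporaryDirectory removes its directory as soon as the object is garbage collected, which would delete the saved dataset before anyone could load it. A minimal standalone sketch of that stdlib behavior (in CPython the cleanup is immediate thanks to reference counting):

    import os
    import tempfile

    tmp_dir = tempfile.TemporaryDirectory()
    path = tmp_dir.name
    print(os.path.isdir(path))  # True: the directory exists while the object is alive
    del tmp_dir                 # dropping the last reference runs the cleanup finalizer
    print(os.path.isdir(path))  # False: the directory and its contents are gone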
Example #2
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
    )

    config = my_exp.config

    # The experiment's config should point at saved copies of both datasets;
    # slicing with [:] returns all rows, so these compare the full contents
    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]
Example #3
    def _default_trainable(config, checkpoint_dir=None):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up a TuneReportCallback that reports all metrics back to ray tune after each epoch
        - Execute the training
        """
        if config["silence"]:
            logging.getLogger("biome.text").setLevel(logging.ERROR)

        pipeline = Pipeline.from_config(config["pipeline_config"])

        trainer_config = TrainerConfiguration(**config["trainer_config"])

        vocab_config = config["vocab_config"]
        if vocab_config:
            vocab_config = VocabularyConfiguration(**vocab_config)

        callbacks = trainer_config.callbacks
        if not isinstance(callbacks, list):
            callbacks = [callbacks]
        if not any(
            isinstance(callback, TuneReportCallback) for callback in callbacks
        ):
            tune_callback = TuneReportCallback(metrics=config["metrics"])
            if trainer_config.callbacks is None:
                trainer_config.callbacks = tune_callback
            else:
                trainer_config.callbacks = callbacks + [tune_callback]

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
        train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
        valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

        trainer = Trainer(
            pipeline=pipeline,
            train_dataset=train_instances,
            valid_dataset=valid_instances,
            trainer_config=trainer_config,
            vocab_config=vocab_config,
        )
        trainer.fit()
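
For orientation, a hedged sketch of how a trainable with this (config, checkpoint_dir) signature is handed to Ray Tune. The config keys are exactly the ones the function reads above; the concrete values, the search space, and the direct reference to TuneExperiment._default_trainable are placeholders and assumptions, since TuneExperiment presumably assembles this config itself:

    from ray import tune

    analysis = tune.run(
        TuneExperiment._default_trainable,  # hypothetical direct use of the trainable
        config={
            "silence": True,                         # quiet the biome.text logger
            "pipeline_config": pipeline_config,      # pipeline dict, defined elsewhere
            "trainer_config": {"max_epochs": tune.choice([3, 5])},  # hypothetical search space
            "vocab_config": None,                    # falsy: skip VocabularyConfiguration
            "metrics": ["validation_loss"],          # forwarded to TuneReportCallback
            "train_dataset_path": train_path,        # as returned by _save_dataset_to_disk
            "valid_dataset_path": valid_path,
        },
        num_samples=4,
    )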
Example #4
    def _default_trainable(config, reporter):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up an MLflow and a WandB logger
        - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
        - Create the vocab if necessary
        - Execute the training
        """
        pipeline = Pipeline.from_config(
            config["pipeline_config"], vocab_path=config["vocab_path"]
        )

        trainer_config = TrainerConfiguration(
            **helpers.sanitize_for_params(config["trainer_config"])
        )

        mlflow_tracking_uri = config["mlflow_tracking_uri"]
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

        train_loggers = [
            MlflowLogger(
                experiment_name=config["name"],
                run_name=reporter.trial_name,
                ray_trial_id=reporter.trial_id,
                ray_logdir=reporter.logdir,
            ),
            TuneMetricsLogger(),
        ]
        if is_wandb_installed_and_logged_in():
            train_loggers = [WandBLogger(project_name=config["name"])] + train_loggers

        pipeline.train(
            output="training",
            training=train_ds,
            validation=valid_ds,
            trainer=trainer_config,
            loggers=train_loggers,
            vocab_config=None if config["vocab_path"] else "default",
        )
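
The MLflow wiring above only sets the tracking URI and hands trial metadata to MlflowLogger. For reference, a minimal standalone sketch of the core MLflow calls such a logger presumably issues per trial (the URI, names, and values are placeholders):

    import mlflow

    mlflow.set_tracking_uri("file:///tmp/mlruns")  # placeholder for config["mlflow_tracking_uri"]
    mlflow.set_experiment("my-experiment")         # corresponds to config["name"] above
    with mlflow.start_run(run_name="trial-0"):     # corresponds to reporter.trial_name above
        mlflow.log_param("max_epochs", 3)
        mlflow.log_metric("validation_loss", 0.42)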
Example #5
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(pipeline=pl)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )

    config = my_exp.config
    pl2 = Pipeline.from_config(config["pipeline_config"], config["vocab_path"])

    # Extending the original pipeline's vocab in place should make it identical
    # to the pipeline rebuilt from the saved vocab_path
    pl._model.extend_vocabulary(vocab)
    assert pl.backbone.vocab._index_to_token == pl2.backbone.vocab._index_to_token
    assert pl.backbone.vocab._token_to_index == pl2.backbone.vocab._token_to_index

    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]