Example #1
    def _save_dataset_to_disk(self, dataset: Dataset) -> str:
        """Saves the dataset to disk if not saved already

        Parameters
        ----------
        dataset
            Dataset to save to disk

        Returns
        -------
        dataset_path
            Path to the saved dataset, which is a directory
        """
        try:
            filename = Path(dataset.dataset.cache_files[0]["filename"])
        except (IndexError, KeyError):
            filename = Path()

        if filename.name != "dataset.arrow":
            tmp_dir = tempfile.TemporaryDirectory()
            self._created_tmp_dirs.append(tmp_dir)
            dataset_path = tmp_dir.name
            dataset.save_to_disk(dataset_path)
        else:
            dataset_path = str(filename.absolute())

        # Make sure that we can load the dataset successfully
        try:
            Dataset.load_from_disk(dataset_path)
        except Exception as exception:
            raise ValidationError(
                f"Could not load dataset saved in '{dataset_path}'"
            ) from exception

        return dataset_path
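
A minimal usage sketch for this helper, assuming it is a method of `TuneExperiment` (as in Example #2) and reusing the toy dataset from Example #6; `my_exp` stands for an already constructed experiment and is hypothetical:

# Hypothetical round trip: save the dataset, reload it, and compare the rows
ds = Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]})
dataset_path = my_exp._save_dataset_to_disk(ds)
reloaded = Dataset.load_from_disk(dataset_path)
assert ds[:] == reloaded[:]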
Example #2
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
    )

    config = my_exp.config

    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]
Example #3
    def _default_trainable(config, checkpoint_dir=None):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up a `TuneReportCallback` that reports all metrics back to Ray Tune after each epoch
        - Execute the training
        """
        if config["silence"]:
            logging.getLogger("biome.text").setLevel(logging.ERROR)

        pipeline = Pipeline.from_config(config["pipeline_config"])

        trainer_config = TrainerConfiguration(**config["trainer_config"])

        vocab_config = config["vocab_config"]
        if vocab_config:
            vocab_config = VocabularyConfiguration(**vocab_config)

        callbacks = trainer_config.callbacks
        if not isinstance(callbacks, list):
            callbacks = [callbacks]
        if not any(
            isinstance(callback, TuneReportCallback) for callback in callbacks
        ):
            tune_callback = TuneReportCallback(metrics=config["metrics"])
            if trainer_config.callbacks is None:
                trainer_config.callbacks = tune_callback
            else:
                trainer_config.callbacks = callbacks + [tune_callback]

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
        train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
        valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

        trainer = Trainer(
            pipeline=pipeline,
            train_dataset=train_instances,
            valid_dataset=valid_instances,
            trainer_config=trainer_config,
            vocab_config=vocab_config,
        )
        trainer.fit()
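
For context, a hedged sketch of how such a trainable is typically handed to Ray Tune. The config keys simply mirror the ones read above; assembling them by hand like this (instead of through `TuneExperiment`) is an assumption for illustration:

from ray import tune

# Hypothetical: build the config the trainable expects and launch a single trial
analysis = tune.run(
    _default_trainable,
    config={
        "silence": True,
        "pipeline_config": my_pipeline_config,     # dict, as consumed by Pipeline.from_config
        "trainer_config": my_trainer_config,        # dict, unpacked into TrainerConfiguration
        "vocab_config": None,
        "metrics": ["validation_loss"],
        "train_dataset_path": train_dataset_path,   # paths produced by _save_dataset_to_disk
        "valid_dataset_path": valid_dataset_path,
    },
    num_samples=1,
)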
Example #4
    def _default_trainable(config, reporter):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up MLflow and WandB loggers
        - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
        - Create the vocab if necessary
        - Execute the training
        """
        pipeline = Pipeline.from_config(config["pipeline_config"],
                                        vocab_path=config["vocab_path"])

        trainer_config = TrainerConfiguration(
            **helpers.sanitize_for_params(config["trainer_config"]))

        mlflow_tracking_uri = config["mlflow_tracking_uri"]
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

        train_loggers = [
            MlflowLogger(
                experiment_name=config["name"],
                run_name=reporter.trial_name,
                ray_trial_id=reporter.trial_id,
                ray_logdir=reporter.logdir,
            ),
            TuneMetricsLogger(),
        ]
        if is_wandb_installed_and_logged_in():
            train_loggers = [WandBLogger(project_name=config["name"])] + train_loggers

        pipeline.train(
            output="training",
            training=train_ds,
            validation=valid_ds,
            trainer=trainer_config,
            loggers=train_loggers,
            vocab_config=None if config["vocab_path"] else "default",
        )
Example #5
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(pipeline=pl)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )

    config = my_exp.config
    pl2 = Pipeline.from_config(config["pipeline_config"], config["vocab_path"])

    pl._model.extend_vocabulary(vocab)
    assert pl.backbone.vocab._index_to_token == pl2.backbone.vocab._index_to_token
    assert pl.backbone.vocab._token_to_index == pl2.backbone.vocab._token_to_index

    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]
Example #6
@pytest.fixture
def dataset():
    return Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]})
Example #7
    def evaluate(
        self,
        dataset: Dataset,
        batch_size: int = 16,
        lazy: bool = False,
        cuda_device: Optional[int] = None,
        predictions_output_file: Optional[str] = None,
        metrics_output_file: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Evaluates the pipeline on a given dataset

        Parameters
        ----------
        dataset
            The dataset to use for the evaluation
        batch_size
            Batch size used during the evaluation
        lazy
            If true, instances from the dataset are lazily loaded from disk, otherwise they are loaded into memory.
        cuda_device
            If you want to use a specific CUDA device for the evaluation, specify it here. Pass -1 to use the CPU.
            By default we will use a CUDA device if one is available.
        predictions_output_file
            Optional path to write the predictions to.
        metrics_output_file
            Optional path to write the final metrics to.

        Returns
        -------
        metrics
            Metrics defined in the TaskHead
        """
        from biome.text._helpers import create_dataloader

        # move model to cuda device
        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1
        prior_device = next(self._model.parameters()).get_device()
        self._model.to(cuda_device if cuda_device >= 0 else "cpu")

        if not any(
            label_column in dataset.column_names for label_column in self.output
        ):
            raise ValueError(
                f"Your dataset needs one of the label columns for an evaluation: {self.output}"
            )

        instances = dataset.to_instances(self, lazy=lazy)
        instances.index_with(self.backbone.vocab)
        data_loader = create_dataloader(instances, batch_size=batch_size)

        try:
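            # note: this unqualified `evaluate` resolves to the module-level
            # evaluation helper, not to this method of the same name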
            return evaluate(
                self._model,
                data_loader,
                cuda_device=cuda_device,
                predictions_output_file=predictions_output_file,
                output_file=metrics_output_file,
            )
        finally:
            self._model.to(prior_device if prior_device >= 0 else "cpu")
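
A minimal usage sketch for this method, assuming it belongs to a trained `Pipeline` (as the `self._model` and `self.backbone` attributes suggest) and reusing the toy dataset from Example #6:

# Hypothetical: evaluate on the CPU and write the final metrics to a file
ds = Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]})
metrics = pipeline.evaluate(
    ds,
    batch_size=8,
    cuda_device=-1,  # -1 forces the CPU
    metrics_output_file="metrics.json",
)
print(metrics)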