def _save_dataset_to_disk(self, dataset: Dataset) -> str:
    """Saves the dataset to disk if not saved already

    Parameters
    ----------
    dataset
        Dataset to save to disk

    Returns
    -------
    dataset_path
        Path to the saved dataset (a directory)
    """
    try:
        filename = Path(dataset.dataset.cache_files[0]["filename"])
    except (IndexError, KeyError):
        filename = Path()

    if filename.name != "dataset.arrow":
        tmp_dir = tempfile.TemporaryDirectory()
        self._created_tmp_dirs.append(tmp_dir)
        dataset_path = tmp_dir.name
        dataset.save_to_disk(dataset_path)
    else:
        # the dataset is already on disk; return its directory, not the arrow file itself
        dataset_path = str(filename.parent.absolute())

    # Make sure that we can load the dataset successfully
    try:
        Dataset.load_from_disk(dataset_path)
    except Exception as exception:
        raise ValidationError(
            f"Could not load dataset saved in '{dataset_path}'"
        ) from exception

    return dataset_path

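
# Illustrative sketch (not part of the original source): the path returned by
# `_save_dataset_to_disk` is a directory that `Dataset.load_from_disk` can read back,
# which is exactly what the validation step above relies on. The names `my_exp` and
# `train_dataset` are hypothetical.
def _example_save_and_reload(my_exp: "TuneExperiment", train_dataset: Dataset) -> Dataset:
    dataset_path = my_exp._save_dataset_to_disk(train_dataset)
    return Dataset.load_from_disk(dataset_path)
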
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
    )
    config = my_exp.config

    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]

def _default_trainable(config, checkpoint_dir=None):
    """A default trainable function used by `tune.run`

    It performs the most straightforward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Execute the training
    """
    if config["silence"]:
        logging.getLogger("biome.text").setLevel(logging.ERROR)

    pipeline = Pipeline.from_config(config["pipeline_config"])

    trainer_config = TrainerConfiguration(**config["trainer_config"])

    vocab_config = config["vocab_config"]
    if vocab_config:
        vocab_config = VocabularyConfiguration(**vocab_config)

    callbacks = trainer_config.callbacks
    if not isinstance(callbacks, list):
        callbacks = [callbacks]
    if not any(isinstance(callback, TuneReportCallback) for callback in callbacks):
        tune_callback = TuneReportCallback(metrics=config["metrics"])
        if trainer_config.callbacks is None:
            trainer_config.callbacks = tune_callback
        else:
            trainer_config.callbacks = callbacks + [tune_callback]

    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
    train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
    valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_instances,
        valid_dataset=valid_instances,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit()

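
# Hedged usage sketch (an assumption, not part of the original source): a trainable with the
# `(config, checkpoint_dir=None)` signature matches Ray Tune's function API, so it can be
# handed to `tune.run` together with a config dict shaped like the one built by
# `TuneExperiment`. The metric name and sample count below are illustrative.
def _example_tune_run(experiment_config: dict):
    from ray import tune

    analysis = tune.run(
        _default_trainable,
        config=experiment_config,  # e.g. `TuneExperiment(...).config`
        num_samples=1,
        metric="validation_loss",  # illustrative metric key
        mode="min",
    )
    return analysis
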
def _default_trainable(config, reporter):
    """A default trainable function used by `tune.run`

    It performs the most straightforward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up an MLFlow and WandB logger
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Create the vocab if necessary
    - Execute the training
    """
    pipeline = Pipeline.from_config(
        config["pipeline_config"], vocab_path=config["vocab_path"]
    )

    trainer_config = TrainerConfiguration(
        **helpers.sanitize_for_params(config["trainer_config"])
    )

    mlflow_tracking_uri = config["mlflow_tracking_uri"]
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

    train_loggers = [
        MlflowLogger(
            experiment_name=config["name"],
            run_name=reporter.trial_name,
            ray_trial_id=reporter.trial_id,
            ray_logdir=reporter.logdir,
        ),
        TuneMetricsLogger(),
    ]
    if is_wandb_installed_and_logged_in():
        train_loggers = [WandBLogger(project_name=config["name"])] + train_loggers

    pipeline.train(
        output="training",
        training=train_ds,
        validation=valid_ds,
        trainer=trainer_config,
        loggers=train_loggers,
        vocab_config=None if config["vocab_path"] else "default",
    )

def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(pipeline=pl)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )
    config = my_exp.config

    pl2 = Pipeline.from_config(config["pipeline_config"], config["vocab_path"])
    pl._model.extend_vocabulary(vocab)
    assert pl.backbone.vocab._index_to_token == pl2.backbone.vocab._index_to_token
    assert pl.backbone.vocab._token_to_index == pl2.backbone.vocab._token_to_index

    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]

def dataset():
    return Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]})

def evaluate(
    self,
    dataset: Dataset,
    batch_size: int = 16,
    lazy: bool = False,
    cuda_device: Optional[int] = None,
    predictions_output_file: Optional[str] = None,
    metrics_output_file: Optional[str] = None,
) -> Dict[str, Any]:
    """Evaluates the pipeline on a given dataset

    Parameters
    ----------
    dataset
        The dataset to use for the evaluation
    batch_size
        Batch size used during the evaluation
    lazy
        If true, instances from the dataset are lazily loaded from disk,
        otherwise they are loaded into memory.
    cuda_device
        If you want to use a specific CUDA device for the evaluation, specify it here.
        Pass -1 for the CPU. By default we will use a CUDA device if one is available.
    predictions_output_file
        Optional path to write the predictions to.
    metrics_output_file
        Optional path to write the final metrics to.

    Returns
    -------
    metrics
        Metrics defined in the TaskHead
    """
    from biome.text._helpers import create_dataloader

    # move the model to the cuda device, remembering its prior device to restore afterwards
    if cuda_device is None:
        from torch import cuda

        if cuda.device_count() > 0:
            cuda_device = 0
        else:
            cuda_device = -1
    prior_device = next(self._model.parameters()).get_device()
    self._model.to(cuda_device if cuda_device >= 0 else "cpu")

    if not any(label_column in dataset.column_names for label_column in self.output):
        raise ValueError(
            f"Your dataset needs one of the label columns for an evaluation: {self.output}"
        )

    instances = dataset.to_instances(self, lazy=lazy)
    instances.index_with(self.backbone.vocab)
    data_loader = create_dataloader(instances, batch_size=batch_size)

    try:
        return evaluate(
            self._model,
            data_loader,
            cuda_device=cuda_device,
            predictions_output_file=predictions_output_file,
            output_file=metrics_output_file,
        )
    finally:
        self._model.to(prior_device if prior_device >= 0 else "cpu")

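
# Hedged usage sketch (an assumption, not part of the original source): evaluating a trained
# pipeline on a labelled dataset. The batch size and output file paths are illustrative; the
# call mirrors the `evaluate` signature defined above.
def _example_evaluate(pipeline: "Pipeline", valid_ds: Dataset) -> dict:
    return pipeline.evaluate(
        valid_ds,
        batch_size=8,
        cuda_device=-1,  # force evaluation on the CPU
        predictions_output_file="predictions.json",
        metrics_output_file="metrics.json",
    )
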