def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Train a pipeline, reload it, replace its head and copy it.

    The copy must keep the new head and the tokenizer tweak, have identical
    parameter counts, and share the exact backbone weights with the original.
    """
    train_ds = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[train_ds]))
    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    results = pipeline_test.train(
        output=output_dir, trainer=trainer_config, training=train_ds, quiet=True
    )

    trained = Pipeline.from_pretrained(results.model_path)
    trained.set_head(TestHead)
    trained.config.tokenizer.max_nr_of_sentences = 3
    copied = trained._make_copy()

    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == trained.num_parameters
    assert copied.num_trainable_parameters == trained.num_trainable_parameters

    copied_state = copied._model.state_dict()
    original_state = trained._model.state_dict()
    for param_name, tensor in copied_state.items():
        # Only the backbone weights are expected to be shared/copied verbatim.
        if "backbone" in param_name:
            assert torch.all(torch.eq(tensor, original_state[param_name]))

    assert copied.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test end-to-end training of the NER pipeline."""
    ner_pipeline = Pipeline.from_config(pipeline_dict)
    # A prediction before training exercises the untrained forward pass.
    ner_pipeline.predict(text="Test this NER machine")
    ner_pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    ner_pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test end-to-end training of the language-model pipeline."""
    lm_pipeline = Pipeline.from_config(pipeline_dict)
    # A prediction before training exercises the untrained forward pass.
    lm_pipeline.predict(text="my name is juan")
    lm_pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    lm_pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def test_specific_vocab_config(pipeline, train_dataset, valid_dataset):
    """Building the Trainer with `include_valid_data=True` must yield the
    expected vocabulary sizes per feature namespace."""
    Trainer(
        pipeline,
        train_dataset=train_dataset,
        valid_dataset=valid_dataset,
        vocab_config=VocabularyConfiguration(include_valid_data=True),
    )

    expected_sizes = {
        WordFeatures.namespace: 16,
        CharFeatures.namespace: 19,
        TransformersFeatures.namespace: 28996,
    }
    for namespace, expected in expected_sizes.items():
        assert pipeline.vocab.get_vocab_size(namespace) == expected
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke-test end-to-end training of the record-pair (BiMPM) pipeline."""
    record_pipeline = Pipeline.from_config(pipeline_dict)
    # A prediction before training exercises the untrained forward pass.
    record_pipeline.predict(
        record1={"first_name": "Hans"}, record2={"first_name": "Hansel"}
    )
    record_pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    record_pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def __init__(
    self,
    pipeline_config: dict,
    trainer_config: TrainerConfiguration,
    train_dataset: Dataset,
    valid_dataset: Dataset,
    vocab_config: Optional[Union[str, VocabularyConfiguration]] = "default",
    metrics: Union[None, str, List[str], Dict[str, str]] = None,
    name: Optional[str] = None,
    trainable: Optional[Callable] = None,
    silence: bool = False,
    **kwargs,
):
    """Set up a tune experiment: persist the datasets and build the trainable config.

    Parameters
    ----------
    pipeline_config
        Configuration dict of the pipeline to tune.
    trainer_config
        Trainer configuration; serialized with `asdict` so ray workers can rebuild it.
    train_dataset
        Training dataset, saved to disk for the workers.
    valid_dataset
        Validation dataset, saved to disk for the workers.
    vocab_config
        "default" resolves to a plain `VocabularyConfiguration()`; pass None to skip
        vocabulary creation, or provide your own configuration.
    metrics
        Metrics reported back to ray tune after each epoch.
    name
        Experiment name; defaults to a timestamped "HPO on ..." string.
    trainable
        Custom trainable function; defaults to `self._default_trainable`.
    silence
        If True, the trainable silences the biome.text logger.
    **kwargs
        Forwarded to the parent experiment class.

    Raises
    ------
    ValueError
        If `kwargs` contains 'name', 'run' or 'config' — those are set here.
    """
    # These keys are provided by this class itself; user values would be clobbered.
    if {"name", "run", "config"} & kwargs.keys():
        raise ValueError(
            # Fix: the two sentences were previously concatenated without a space,
            # and used f-string literals with no placeholders.
            "Your `kwargs` must not contain the 'name', 'run' or 'config' key. "
            "These are provided automatically by `TuneExperiment`."
        )

    # save created tmp dirs in this list to clean them up when object gets destroyed
    self._created_tmp_dirs: List[tempfile.TemporaryDirectory] = []

    self._train_dataset_path = self._save_dataset_to_disk(train_dataset)
    self._valid_dataset_path = self._save_dataset_to_disk(valid_dataset)

    self._pipeline_config = pipeline_config
    self._trainer_config = asdict(trainer_config)

    # Resolve the "default" sentinel into a fresh config instead of re-binding
    # the parameter with a narrower type annotation.
    resolved_vocab_config: Optional[VocabularyConfiguration] = (
        VocabularyConfiguration() if vocab_config == "default" else vocab_config
    )
    self._vocab_config: Optional[Dict] = (
        asdict(resolved_vocab_config) if resolved_vocab_config else resolved_vocab_config
    )

    self.trainable = trainable or self._default_trainable
    self._silence = silence

    self._name = name or f"HPO on {datetime.now().strftime('%Y-%m-%d (%I-%M)')}"
    # Only set a default W&B project if the user has not configured one.
    if not os.environ.get("WANDB_PROJECT"):
        os.environ["WANDB_PROJECT"] = self._name

    self._metrics = metrics

    super().__init__(
        name=self._name, run=self.trainable, config=self.config, **kwargs
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Train the relation classifier and predict again with the reloaded pipeline."""

    def example_kwargs():
        # Build a fresh payload per call so the two predictions are independent.
        return {
            "text": "The most common audits were about waste and recycling",
            "entities": [
                {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
                {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
            ],
        }

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(**example_kwargs())
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    pl_trained = Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
    pl_trained.predict(**example_kwargs())
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds, valid_ds = train_valid_dataset[0], train_valid_dataset[1]

    trainer = Trainer(
        pipeline=pl,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=TrainerConfiguration(
            batch_size=64,
            optimizer={"type": "adam", "lr": 0.01},
            max_epochs=5,
            default_root_dir=str(tmp_path),
            gpus=0,  # turn off gpus even if available
        ),
        vocab_config=VocabularyConfiguration(max_vocab_size={"word": 50}),
    )
    trainer.fit(tmp_path / "output")

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pl.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)
    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456, abs=0.003)

    # Reloading must not disturb the vocabulary sizes.
    Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz"))
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well specified training, this also tests the vocab creation!"""
    # Seed every RNG involved so the loss assertion below is reproducible.
    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds, valid_ds = train_valid_dataset[0], train_valid_dataset[1]
    trainer = TrainerConfiguration(**trainer_dict)
    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )

    output = tmp_path / "output"
    pl.train(
        output=str(output),
        trainer=trainer,
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pl.num_trainable_parameters == 22070

    with (output / "metrics.json").open() as file:
        metrics = json.load(file)
    # It may fail in some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
def _default_trainable(config, checkpoint_dir=None):
    """A default trainable function used by `tune.run`

    It performs the most straight forward training loop with the provided `config`:
    - Create the pipeline (optionally with a provided vocab)
    - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
    - Execute the training
    """
    if config["silence"]:
        logging.getLogger("biome.text").setLevel(logging.ERROR)

    pipeline = Pipeline.from_config(config["pipeline_config"])
    trainer_config = TrainerConfiguration(**config["trainer_config"])

    vocab_config = config["vocab_config"]
    if vocab_config:
        vocab_config = VocabularyConfiguration(**vocab_config)

    # Ensure exactly one TuneReportCallback reports metrics back to ray tune.
    current_callbacks = trainer_config.callbacks
    if not isinstance(current_callbacks, list):
        current_callbacks = [current_callbacks]
    if not any(isinstance(cb, TuneReportCallback) for cb in current_callbacks):
        tune_callback = TuneReportCallback(metrics=config["metrics"])
        if trainer_config.callbacks is None:
            trainer_config.callbacks = tune_callback
        else:
            trainer_config.callbacks = current_callbacks + [tune_callback]

    train_ds = Dataset.load_from_disk(config["train_dataset_path"])
    valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
    train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
    valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_instances,
        valid_dataset=valid_instances,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit()
def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset):
    """A custom `VocabularyConfiguration` must cap the word vocab of the tuned model."""
    experiment = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab_config=VocabularyConfiguration(max_vocab_size=1),
        name="test_vocab_config",
        local_dir=str(tmp_path),
    )
    analysis = tune.run(experiment)

    best_logdir = Path(analysis.get_best_logdir("validation_loss", "min"))
    pl = Pipeline.from_pretrained(best_logdir / "output" / "model.tar.gz")
    assert pl.vocab.get_vocab_size("word") == 3
def test_training_with_logging(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Training must write a train.log of allennlp records and adjust logger levels."""
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))
    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline_test.train(
        output=output_dir, trainer=trainer_config, training=training, quiet=True
    )

    log_path = os.path.join(output_dir, "train.log")
    assert os.path.exists(log_path)
    with open(log_path) as train_log:
        # Every log line is expected to come from allennlp.
        for line in train_log.readlines():
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).

    Parameters
    ----------
    pipeline_path
        Path to a pretrained pipeline or a YAML pipeline configuration.
    output
        Output directory for the training run.
    trainer
        Path to a YAML file with the trainer configuration.
    training
        Path to the training data source.
    validation
        Optional path to the validation data source.
    test
        Optional path to the test data source.
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    # Build the vocabulary from every dataset that was actually provided.
    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        # BUG FIX: the dict key is "train"; `datasets["training"]` raised a KeyError.
        training=datasets["train"],
        validation=datasets["validation"],
        test=datasets["test"],
    )
def test_train(tmp_path, pipeline_dict, trainer_dict, train_data_source):
    """Train a transformers-based pipeline and verify the vocab survives a reload."""
    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    pl.create_vocabulary(VocabularyConfiguration(sources=[train_data_source]))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265

    pl.predict(text="test")

    output = tmp_path / "output"
    # Fix: the return value was bound to an unused local (`training_results`).
    pl.train(
        output=str(output),
        trainer=trainer,
        training=train_data_source,
    )

    # test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    """The experiment must persist datasets and vocab so workers can reload them."""
    pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(pipeline=pl)

    experiment = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )
    config = experiment.config

    # A pipeline rebuilt from the saved config + vocab must match the original
    # pipeline once its vocabulary is extended.
    reloaded = Pipeline.from_config(config["pipeline_config"], config["vocab_path"])
    pl._model.extend_vocabulary(vocab)
    assert pl.backbone.vocab._index_to_token == reloaded.backbone.vocab._index_to_token
    assert pl.backbone.vocab._token_to_index == reloaded.backbone.vocab._token_to_index

    # Both persisted datasets must round-trip unchanged.
    for path_key in ("train_dataset_path", "valid_dataset_path"):
        assert dataset[:] == Dataset.load_from_disk(config[path_key])[:]
def test_training_with_data_bucketing(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Data bucketing must work with lazy and non-lazy datasets in either role."""
    lazy_ds = pipeline_test.create_dataset(datasource_test, lazy=True)
    non_lazy_ds = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[lazy_ds]))
    trainer_config = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )

    # Run twice, swapping which dataset plays training vs. validation.
    for train_ds, valid_ds in ((lazy_ds, non_lazy_ds), (non_lazy_ds, lazy_ds)):
        pipeline_test.train(
            output=os.path.join(tmp_path, "output"),
            trainer=trainer_config,
            training=train_ds,
            validation=valid_ds,
        )