def test_mlflow_logger():
    """End-to-end check of MlflowLogger: run tags, params, artifacts and metrics."""
    logger = MlflowLogger(
        experiment_name="test-experiment", run_name="test_run", tag1="my-tag"
    )
    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification, labels=["A", "B"]),
        )
    )
    trainer = TrainerConfiguration()

    # Simulate a full training run against the logger
    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})
    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run

    # Tags
    assert run.data.tags[mlflow_tags.MLFLOW_RUN_NAME] == "test_run"
    assert run.data.tags["tag1"] == "my-tag"

    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type": "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert run.data.params == expected_params

    # Artifacts: the saved model directory must be logged under the run's artifact URI
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path
    )

    # Metrics
    for name, value in metrics.items():
        assert name in run.data.metrics
        assert run.data.metrics[name] == value
def test_save(pipeline, tmp_path):
    """Saving and reloading a pipeline must preserve its predictions."""
    pipeline.save(tmp_path)
    assert (tmp_path / "model.tar.gz").is_file()

    expected = pipeline.predict("test")
    reloaded = Pipeline.from_pretrained(tmp_path / "model.tar.gz").predict("test")

    assert reloaded["labels"] == expected["labels"]
    assert_allclose(reloaded["probabilities"], expected["probabilities"])
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict, build the vocabulary and train the LM pipeline."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict, build the vocabulary and train the NER pipeline."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="Test this NER machine")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def test_pure_transformers(tmp_path, pipeline_dict, trainer_config, train_dataset):
    """Testing a Transformer training process and a model load"""
    pl = Pipeline.from_config(pipeline_dict)
    # The transformers namespace must keep the pretrained model's fixed vocab size
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = Trainer(
        pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config
    )
    trainer.fit(output_dir=output)

    # Reload from the archived model and verify the vocab size is unchanged
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Check the NER pipeline's outputs/labels and run a short training."""
    pipeline = Pipeline.from_config(pipeline_dict)

    assert pipeline.output == ["entities", "tags"]
    assert pipeline.head.span_labels == ["NER"]
    assert pipeline.head.labels == ["B-NER", "I-NER", "U-NER", "L-NER", "O"]

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "ner_experiment")
def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path):
    """Train with a pretrained word-weights file and verify the embeddings;
    loading the archived model must work even after the weights file is gone."""
    pipeline = Pipeline.from_config(pipeline_config)
    output = tmp_path / "pretrained_word_vector_output"
    pipeline.train(
        output=str(output),
        training=dataset,
        trainer=TrainerConfiguration(num_epochs=1, cuda_device=-1),
    )

    instance = pipeline.head.featurize("test")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0),
        torch.tensor([[0.66, 0.33]]),
    )

    # Loading a pretrained model without the weights file should work
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    loaded = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert isinstance(loaded, Pipeline)
def test_training_with_data_bucketing(pipeline: Pipeline, dataset: Dataset, tmp_path: str):
    """Training with data bucketing must work for both lazy and non-lazy datasets."""
    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    # Run the same training twice, once eager and once lazy
    for lazy in (False, True):
        pipeline.copy().train(
            output=os.path.join(tmp_path, "output"),
            trainer=configuration,
            training=dataset,
            validation=dataset,
            lazy=lazy,
        )
def test_dataset_creation_with_partial_mapping(
    datasource_with_partial_mapping: DataSource, pipeline_test: Pipeline
):
    """A partially mapped data source must still yield fully featurized instances."""
    df = datasource_with_partial_mapping.to_mapped_dataframe()
    dataset = pipeline_test.create_dataset(datasource_with_partial_mapping)

    assert isinstance(dataset, AllennlpDataset)
    assert len(dataset) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
def test_training_with_logging(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    """Quiet training must still write a train.log and set the expected log levels."""
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))

    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    log_path = os.path.join(output_dir, "train.log")
    assert os.path.exists(log_path)
    with open(log_path) as train_log:
        # Every logged line should come from allennlp
        for line in train_log:
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    # YAML files describe a fresh configuration; anything else is a model archive
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    # Build the vocabulary from every dataset that was actually provided
    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        # BUG FIX: the dict key is "train", not "training" — the previous
        # lookup raised a KeyError before training could start.
        training=datasets["train"],
        validation=datasets["validation"],
        test=datasets["test"],
    )
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    """A reloaded trained pipeline must reproduce the original's probabilities."""
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)
    trainer = Trainer(
        pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config
    )
    trainer.fit(output_path)

    original = pipeline.predict("a test")
    reloaded = Pipeline.from_pretrained(output_path / "model.tar.gz").predict("a test")

    assert_allclose(original["probabilities"], reloaded["probabilities"])
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    """Smoke test: predict, build the vocabulary and train the record-BiMPM pipeline."""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "record_bimpm_experiment")
def pipeline() -> Pipeline:
    """Build a document-classification pipeline with six labels."""
    labels = ["a", "b", "c", "d", "e", "f"]
    config = {
        "name": "test_document_classification",
        "tokenizer": {"segment_sentences": False},
        "head": {
            "type": "DocumentClassification",
            "labels": labels,
            "dropout": 0.1,
        },
    }
    return Pipeline.from_config(config)
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Check the NER pipeline's outputs/labels and run a short training."""
    pipeline = Pipeline.from_config(pipeline_dict)

    assert pipeline.output == ["entities", "tags"]
    assert pipeline.head.span_labels == ["NER"]
    assert pipeline.head.labels == ["B-NER", "I-NER", "U-NER", "L-NER", "O"]

    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
    )
def test_pretokenized_decode(self, pipeline_dict):
    """Decoding pretokenized input must yield entities, tags and scores for top-k=2."""
    pipeline = Pipeline.from_config(pipeline_dict)
    output = self._input_top_k2(pipeline)

    assert output.keys() == dict(entities=None, tags=None, scores=None).keys()
    assert output["entities"] == [
        [
            [dict(start_token=3, end_token=4, label="NER")],
            [dict(start_token=1, end_token=4, label="NER")],
        ]
    ]
    assert output["tags"] == [
        [["O", "O", "O", "U-NER"], ["O", "B-NER", "I-NER", "L-NER"]]
    ]
    assert output["scores"] == [[2, 1]]
def test_untokenized_input(self, pipeline_dict):
    """Untokenized input must additionally carry character offsets in the entities."""
    pipeline = Pipeline.from_config(pipeline_dict)
    output = self._input_top_k2(pipeline, pretokenized=False)

    expected_output = TokenClassificationPrediction(
        tags=[["O", "O", "O", "U-NER"], ["O", "B-NER", "I-NER", "L-NER"]],
        entities=[
            [Entity(start_token=3, end_token=4, label="NER", start=10, end=14)],
            [Entity(start_token=1, end_token=4, label="NER", start=5, end=14)],
        ],
        scores=[2, 1],
    )
    assert output == expected_output
def test_explain_without_steps():
    """explain() must raise for heads without step support, unless n_steps=0."""
    pipeline_config = PipelineConfiguration(
        name="test-classifier",
        head=TaskHeadConfiguration(type=TestHeadWithRaise),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(pipeline_config)

    # Default explain path goes through the head and must fail
    with pytest.raises(NotImplementedError):
        pipeline.explain("This is a simple test with only tokens in explain")

    # With zero steps the head is bypassed and an explanation is still returned
    prediction = pipeline.explain(
        "This is a simple test with only tokens in explain", n_steps=0
    )
    assert "explain" in prediction
def test_create_output_dir(pipeline_dict, dataset, tmp_path):
    """fit() must create the output directory, including missing parents."""
    config = TrainerConfiguration(
        logger=False, fast_dev_run=True, batch_size=1, max_epochs=1, gpus=0
    )
    pipeline = Pipeline.from_config(pipeline_dict)
    trainer = Trainer(pipeline, train_dataset=dataset, trainer_config=config)

    output_dir = tmp_path / "test_this_non_existing_parent_dir" / "output"
    trainer.fit(output_dir=output_dir)

    assert output_dir.is_dir()
def test_pipeline_default_tokenizer(pipeline_dict):
    """Without a tokenizer config, the default tokenizer and a mismatched
    transformers indexer must be used."""
    pipeline_dict["features"].update({"word": {"embedding_dim": 2}})
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config == TokenizerConfiguration()
    assert pl.config.features.transformers.mismatched is True
    indexer = pl.backbone.featurizer.indexer["transformers"]
    assert type(indexer) is PretrainedTransformerMismatchedIndexer
    assert type(pl.backbone.tokenizer) is Tokenizer

    prediction = pl.predict("Test this!")
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_data_source
):
    """Apart from a well specified training, this also tests the vocab creation!"""
    # Fix all random seeds for a reproducible training loss
    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = pl.create_dataset(train_valid_data_source[0])
    valid_ds = pl.create_dataset(train_valid_data_source[1])
    trainer = TrainerConfiguration(**trainer_dict)

    # Vocabulary creation with a capped word vocab
    vocab = VocabularyConfiguration(sources=[train_ds], max_vocab_size={"word": 50})
    pl.create_vocabulary(vocab)
    assert pl._model.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl._model.vocab.get_vocab_size(CharFeatures.namespace) == 83

    output = tmp_path / "output"
    pl.train(
        output=str(output), trainer=trainer, training=train_ds, validation=valid_ds
    )

    assert pl.num_trainable_parameters == 22070
    with (output / "metrics.json").open() as file:
        metrics = json.load(file)
    assert metrics["training_loss"] == pytest.approx(0.670, abs=0.003)

    # test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl._model.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl._model.vocab.get_vocab_size(CharFeatures.namespace) == 83
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "lm")
def test_train(tmp_path, pipeline_dict, trainer_dict, train_data_source):
    """Transformer training must keep the fixed pretrained vocab, also after reload."""
    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    pl.create_vocabulary(VocabularyConfiguration(sources=[train_data_source]))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265
    pl.predict(text="test")

    output = tmp_path / "output"
    training_results = pl.train(
        output=str(output),
        trainer=trainer,
        training=train_data_source,
    )

    # test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265
def test_pipeline_transformers_tokenizer(pipeline_dict):
    """A transformers tokenizer config must select the non-mismatched indexer
    and the TransformersTokenizer."""
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config.transformers_kwargs == {
        "model_name": "sshleifer/tiny-distilroberta-base"
    }
    assert pl.config.features.transformers.mismatched is False
    indexer = pl.backbone.featurizer.indexer["transformers"]
    assert type(indexer) is PretrainedTransformerIndexer
    assert type(pl.backbone.tokenizer) is TransformersTokenizer

    prediction = pl.predict("Test this!")
def test_tune_exp_save_dataset_and_vocab(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    """A TuneExperiment must persist datasets and vocab so they can be restored."""
    pl = Pipeline.from_config(pipeline_config)
    vocab = VocabularyConfiguration(datasets=[dataset]).build_vocab(pipeline=pl)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab=vocab,
    )
    config = my_exp.config

    # Rebuild a pipeline from the persisted config/vocab and compare vocabularies
    pl2 = Pipeline.from_config(config["pipeline_config"], config["vocab_path"])
    pl._model.extend_vocabulary(vocab)
    assert pl.backbone.vocab._index_to_token == pl2.backbone.vocab._index_to_token
    assert pl.backbone.vocab._token_to_index == pl2.backbone.vocab._token_to_index

    # The persisted datasets must round-trip unchanged
    assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:]
    assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:]
def test_pipeline_test(pipeline_dict, dataset, tmp_path):
    """Trainer.test must report a loss, write metrics.json, and agree with
    Pipeline.evaluate."""
    import json

    pl = Pipeline.from_config(pipeline_dict)
    trainer = Trainer(pl)
    first_metrics = trainer.test(dataset, output_dir=tmp_path, batch_size=16)
    assert "test_loss" in first_metrics

    metrics_file = tmp_path / "metrics.json"
    assert metrics_file.is_file()
    with metrics_file.open() as file:
        assert "test_loss" in json.load(file)

    assert pl.evaluate(dataset)["test_loss"] == pytest.approx(
        first_metrics["test_loss"]
    )
def test_find_lr(train_data_source, pipeline_dict, trainer_config, find_lr_config):
    """find_lr must return learning rates and losses of equal, expected length."""
    pl = Pipeline.from_config(pipeline_dict)
    pl.create_vocabulary(VocabularyConfiguration(sources=[train_data_source]))

    learning_rates, losses = pl.find_lr(
        trainer_config=trainer_config,
        find_lr_config=find_lr_config,
        training_data=train_data_source,
    )

    assert len(learning_rates) == len(losses) == 12
def test_map_args_kwargs_to_input():
    """Positional and keyword args must be mapped onto the pipeline's input names."""

    class MockPipeline:
        """Minimal stand-in exposing only the `inputs` property."""

        def __init__(self, inputs):
            self._inputs = inputs

        @property
        def inputs(self):
            return self._inputs

    single = MockPipeline(["text"])
    # Positional argument maps to the first input name
    assert Pipeline._map_args_kwargs_to_input(single, "test") == {"text": "test"}
    # Keyword argument passes through unchanged
    assert Pipeline._map_args_kwargs_to_input(single, text="test") == {"text": "test"}
    # Mixed positional + keyword over two inputs
    double = MockPipeline(["text", "text2"])
    assert Pipeline._map_args_kwargs_to_input(double, "test", text2="test2") == {
        "text": "test",
        "text2": "test2",
    }
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    """A pipeline reloaded from its output dir must reproduce the original's
    probabilities."""
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(num_epochs=1, batch_size=2, cuda_device=-1)
    pipeline.train(output=str(output_path), training=dataset, trainer=trainer_config)

    original = pipeline.predict("a test")
    reloaded = Pipeline.from_pretrained(str(output_path)).predict("a test")

    assert_allclose(original["probabilities"], reloaded["probabilities"])