def test_not_implemented_transformers_with_tokenclassification(
    transformers_pipeline_config,
):
    transformers_pipeline_config["tokenizer"] = {"use_transformers": True}
    transformers_pipeline_config["head"] = {
        "type": "TokenClassification",
        "labels": ["NER"],
    }
    with pytest.raises(NotImplementedError):
        Pipeline.from_config(transformers_pipeline_config)

def test_invalid_tokenizer_features_combination(transformers_pipeline_config):
    transformers_pipeline_config["features"].update({"word": {"embedding_dim": 2}})
    transformers_pipeline_config["tokenizer"] = {"use_transformers": True}
    with pytest.raises(ConfigurationError):
        Pipeline.from_config(transformers_pipeline_config)

def test_invalid_transformers_tokenizer_indexer_embedder_combination(
    transformers_pipeline_config,
):
    transformers_pipeline_config["tokenizer"] = {
        "transformers_kwargs": {"model_name": "distilroberta-base"}
    }
    with pytest.raises(ConfigurationError):
        Pipeline.from_config(transformers_pipeline_config)

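# The three tests above rely on a `transformers_pipeline_config` pytest fixture that is
# defined elsewhere in the suite. A minimal sketch of what such a fixture might look
# like, assuming the config shape used by the other pipelines in this file (the model
# name, dimensions, and labels below are illustrative assumptions, and the usual
# `pytest` import is assumed as in the rest of the suite):
@pytest.fixture
def transformers_pipeline_config():
    return {
        "name": "transformers_test_pipeline",
        "features": {
            "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}
        },
        "head": {"type": "TextClassification", "labels": ["good", "bad"]},
    }
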
def test_max_length_not_affecting_shorter_sequences(pipeline_dict):
    """A change of max_length should not affect previously created shorter-length models at all"""
    pl = Pipeline.from_config(pipeline_dict)
    state_dict = pl._model.state_dict()  # dict with the whole state of the module
    probs = pl.predict("Test this")["probabilities"]  # probabilities of the test input

    pipeline_dict["features"]["transformers"]["max_length"] = 100  # changing max length
    pl = Pipeline.from_config(pipeline_dict)
    pl._model.load_state_dict(state_dict)  # loading previous state from dict
    probs_max_length = pl.predict("Test this")["probabilities"]

    assert_allclose(probs, probs_max_length)

def test_load_pipeline_with_custom_head(training_dataset):
    """Testing model training with a class plugged in as a custom head"""
    # Pipeline configuration dict with a custom head
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    # Asserting that pipeline.head is an instance of MyCustomHead
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    # Training the model and saving it to output
    output = mkdtemp()
    pipeline.train(output=output, training=training_dataset)

    # Loading the model from output
    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")

    # Asserting that the pipeline head is still recognized as a `MyCustomHead` instance
    # after loading from a model.tar.gz
    assert isinstance(trained_pl.head, MyCustomHead)

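# `MyCustomHead`, used above and in a second variant of this test further below, is a
# user-defined class from the test module itself. A minimal sketch, assuming it simply
# subclasses the built-in TextClassification head without adding behavior (the real
# test class may contain more logic):
class MyCustomHead(TextClassification):
    """A user-provided task head used to check that custom heads survive saving and loading"""
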
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing a classifier made from scratch"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "relation_classifier")

    # test loading
    Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz")

def pipeline():
    return Pipeline.from_config(
        {
            "name": "test_pipeline_copy",
            "head": {"type": "TextClassification", "labels": ["a", "b"]},
        }
    )

def pipeline():
    config = {
        "name": "vocab_test",
        "features": {
            "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"},
            "word": {"embedding_dim": 2},
            "char": {
                "embedding_dim": 2,
                "dropout": 0.1,
                "encoder": {
                    "type": "gru",
                    "hidden_size": 2,
                    "num_layers": 1,
                    "bidirectional": False,
                },
            },
        },
        "head": {
            "type": "TextClassification",
            "labels": ["good", "bad"],
        },
    }
    return Pipeline.from_config(config)

def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )
    pl = Pipeline.from_config(pipeline_config)

    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer

def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset):
    """Testing a transformers pipeline with an added word feature layer"""
    # Change the pipeline: delete the BERT pooler and add a word feature
    del pipeline_dict["head"]["pooler"]
    pipeline_dict["features"].update(
        {"word": {"embedding_dim": 16, "lowercase_tokens": True}}
    )

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = Trainer(
        pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config
    )
    trainer.fit(output_dir=output)

    # Check a fixed vocabulary size for the transformer and the word feature
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

    # Test the vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the transformer and the word feature after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

def test_add_default_loggers(
    input_kwargs, expected_loggers, pipeline_dict, dataset, tmp_path
):
    trainer_config = TrainerConfiguration(
        **input_kwargs, default_root_dir=str(tmp_path)
    )
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=trainer_config,
    )

    if input_kwargs.get("logger") is not False:
        assert isinstance(trainer.trainer.logger, LoggerCollection)
        assert len(trainer.trainer.logger.experiment) == len(expected_loggers)
    else:
        assert trainer._trainer_config.logger is False

    def loggers_include(logger_type) -> bool:
        return any(
            isinstance(logger, logger_type)
            for logger in trainer._trainer_config.logger
        )

    for logger in expected_loggers:
        if logger == "csv":
            assert loggers_include(CSVLogger)
        if logger == "tensorboard":
            assert loggers_include(TensorBoardLogger)
        if logger == "wandb":
            assert loggers_include(WandbLogger)
            assert (tmp_path / "wandb").is_dir()
        if logger == "mlflow":
            assert loggers_include(MLFlowLogger)

def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, deactivate_pipeline_trainer, caplog
):
    pipeline = Pipeline.from_config(pipeline_config)

    # create vocab
    pipeline.train(
        output="dummy",
        training=dataset,
    )

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.train(
        output="dummy",
        training=dataset2,
    )
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline.train(
        output="dummy",
        training=Dataset.from_dict({"text": ["that"], "label": ["good"]}),
    )
    assert caplog.records[0].module == "embedding"
    assert "cannot locate the pretrained_file" in caplog.records[0].message

def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, capsys, caplog
):
    pipeline = Pipeline.from_config(pipeline_config)

    # create vocab
    pipeline.create_vocab([dataset.to_instances(pipeline)])

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.create_vocab([dataset2.to_instances(pipeline)])
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    ds = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([ds.to_instances(pipeline)])
    assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding"
    assert caplog.record_tuples[-1][1] == 30  # 30 == logging.WARNING
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in caplog.record_tuples[-1][2]
    )

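# Both variants of this test assume a `pipeline_config` fixture whose word feature
# points to a small pretrained-embeddings file. A minimal sketch of such a fixture,
# assuming the plain "token dim1 dim2 ..." text format and the [0.25, 0.75] vector
# checked above (the fixture name collision with `tmp_path`, the labels, and the file
# name are illustrative assumptions, not taken from the suite):
@pytest.fixture
def pipeline_config(tmp_path):
    weights_file = tmp_path / "weights.txt"
    weights_file.write_text("this 0.25 0.75\n")
    return {
        "name": "vocab_extension_test",
        "features": {"word": {"embedding_dim": 2, "weights_file": str(weights_file)}},
        "head": {"type": "TextClassification", "labels": ["good", "bad"]},
    }
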
def test_load_pipeline_with_custom_head():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES, "resources/data/dataset_source.csv"),
        mapping={"label": "job", "text": ["education", "marital"]},
    )
    output = mkdtemp()

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)

def test_attributions(pipeline_dict, training_dataset):
    pipeline = Pipeline.from_config(pipeline_dict)
    instance = pipeline.head.featurize(
        training_dataset["record1"][0], training_dataset["record2"][0]
    )

    pipeline.model.eval()
    forward_output = pipeline.model.forward_on_instances([instance])
    attributions = pipeline.head._compute_attributions(forward_output[0], instance)

    assert all(isinstance(attribution, Attribution) for attribution in attributions)
    assert len(attributions) == 4
    assert all(isinstance(attr.attribution, float) for attr in attributions)
    assert all(attributions[i].field == "record1" for i in [0, 1])
    assert all(attributions[i].field == "record2" for i in [2, 3])
    assert attributions[1].start == 0 and attributions[1].end == 16
    assert attributions[0].text == "@first_name Hans"
    assert attributions[3].text == "@last_name Petre"

    # Raise an error when the records have a different number of record fields
    instance = pipeline.head.featurize(
        record1={"first_name": "Hans", "last_name": "Zimmermann"},
        record2={"first_name": "Hansel"},
    )
    forward_output = pipeline._model.forward_on_instances([instance])
    with pytest.raises(RuntimeError):
        pipeline.head._compute_attributions(forward_output[0], instance)

def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))

def test_metrics(pipeline_dict):
    pipeline = Pipeline.from_config(pipeline_dict)

    instance = pipeline.head.featurize(text="test this".split(), tags=["U-NER", "O"])
    batch = Batch([instance])
    batch.index_instances(pipeline.vocab)

    pipeline.head.forward(**batch.as_tensor_dict())
    # the validation metric should never have been called
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 2
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 0

    train_metrics = pipeline.head.get_metrics(reset=True)
    expected_metric_names = ["accuracy"] + [
        f"{metric}-{label}"
        for metric in ["precision", "recall", "f1-measure"]
        for label in ["NER", "overall"]
    ]
    print(train_metrics)
    assert all(name in train_metrics for name in expected_metric_names)

    pipeline.head.training = False
    pipeline.head.forward(**batch.as_tensor_dict())
    # the training metric should never have been called after its reset
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 0
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 2

    valid_metrics = pipeline.head.get_metrics()
    assert all(name in valid_metrics for name in expected_metric_names)

def pipeline() -> Pipeline:
    return Pipeline.from_config(
        {
            "name": "test_predict",
            "head": {"type": "TextClassification", "labels": ["a"]},
        }
    )

def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True
    )
    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features, char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
        pl.backbone.featurizer(sample_text),
        pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(map(lambda t: isinstance(t, Token), text.tokens))
                assert sample_text == " ".join([t.text for t in text.tokens])

def pipeline(dataset):
    labels = dataset.unique("label")
    return Pipeline.from_config(
        {
            "name": "test_pipeline_evaluate",
            "head": {
                "type": "TextClassification",
                "labels": labels,
            },
        }
    )

def model() -> PipelineModel:
    pipeline = Pipeline.from_config(
        {
            "name": "test_predict",
            "head": {"type": "TextClassification", "labels": ["a"]},
        }
    )
    return pipeline._model

def pipeline() -> Pipeline:
    labels = ["a", "b", "c", "d", "e", "f"]
    return Pipeline.from_config(
        {
            "name": "test_text_classification",
            "head": {"type": "TextClassification", "labels": labels, "dropout": 0.1},
        }
    )

def test_raise_filenotfound_error(pipeline_config, deactivate_pipeline_trainer):
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline = Pipeline.from_config(pipeline_config)
    with pytest.raises(FileNotFoundError):
        pipeline.train(
            output="dummy",
            training=cast(Dataset, None),
        )

def uses_cached_instances(self, pipeline_config) -> bool:
    """Checks if the `to_instances` method of the provided pipeline_config uses the cached instances"""
    cache_path = Path(self.dataset.dataset.cache_files[0]["filename"]).parent
    number_of_files_before = len(list(cache_path.iterdir()))

    pipeline = Pipeline.from_config(pipeline_config)
    self.dataset.to_instances(pipeline)
    number_of_files_after = len(list(cache_path.iterdir()))

    # If no new cache files were written, the instances came from the cache
    return number_of_files_before == number_of_files_after

def test_serve():
    """This test still needs to be automated!"""
    pipeline = Pipeline.from_config(
        {
            "name": "serve_test",
            "head": {"type": "TextClassification", "labels": ["a", "b"]},
        }
    )
    _serve(pipeline)

def test_pipeline_default_tokenizer(pipeline_dict):
    pipeline_dict["features"].update({"word": {"embedding_dim": 2}})
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config == TokenizerConfiguration()
    assert pl.config.features.transformers.mismatched is True
    assert (
        type(pl.backbone.featurizer.indexer["transformers"])
        is PretrainedTransformerMismatchedIndexer
    )
    assert type(pl.backbone.tokenizer) is Tokenizer

    prediction = pl.predict("Test this!")

def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing that prediction, vocab creation and training work correctly"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})
    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

def test_mlflow_logger():
    logger = MlflowLogger(
        experiment_name="test-experiment", run_name="test_run", tag1="my-tag"
    )

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification, labels=["A", "B"]),
        )
    )
    trainer = TrainerConfiguration()

    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})
    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run

    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]

    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type": "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params

    # Artifacts
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path
    )

    # Metrics
    for metric in metrics:
        assert metric in run.data.metrics and run.data.metrics[metric] == metrics[metric]

def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing that prediction, vocab creation and training work correctly"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")
    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="Test this NER machine")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source])
    )
    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )