def test_load_pipeline_with_custom_head(training_dataset):
    """Tests training, saving and re-loading a model whose task head is a custom class"""
    # Pipeline configuration with a custom head
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    # Asserting that pipeline.head is an instance of MyCustomHead
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    # Training the model and saving it to output
    output = mkdtemp()
    pipeline.train(output=output, training=training_dataset)

    # Loading the trained model from output
    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")

    # Asserting that the pipeline head is still recognized as a `MyCustomHead`
    # instance after loading from model.tar.gz
    assert isinstance(trained_pl.head, MyCustomHead)
def test_load_pipeline_with_custom_head_from_data_source():
    """Same as the test above, but building the vocabulary and training data from a `DataSource`"""
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES, "resources/data/dataset_source.csv"),
        mapping={"label": "job", "text": ["education", "marital"]},
    )
    output = mkdtemp()

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)
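# Note: MyCustomHead is a test helper defined elsewhere in the suite; it is not part
# of this file. A minimal, hypothetical sketch (an assumption, not the actual helper)
# is given below: essentially the built-in TextClassification head registered under a
# custom class. The inputs/output test further down additionally expects the real
# helper to accept a `second_text` input on top of this.
class MyCustomHead(TextClassification):
    """Hypothetical stand-in for the custom task head exercised by these tests."""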
def test_mlflow_logger():
    logger = MlflowLogger(
        experiment_name="test-experiment", run_name="test_run", tag1="my-tag"
    )

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification, labels=["A", "B"]),
        )
    )
    trainer = TrainerConfiguration()

    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})

    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run

    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]

    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type": "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params

    # Artifacts
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path
    )

    # Metrics
    for metric in metrics:
        assert (
            metric in run.data.metrics and run.data.metrics[metric] == metrics[metric]
        )
def test_explain_without_steps():
    pipeline_config = PipelineConfiguration(
        name="test-classifier",
        head=TaskHeadConfiguration(type=TestHeadWithRaise),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(pipeline_config)

    # The head raises when a real explanation is requested ...
    with pytest.raises(NotImplementedError):
        pipeline.explain("This is a simple test with only tokens in explain")

    # ... but with n_steps=0 no attributions are computed, so no error is raised
    prediction = pipeline.explain(
        "This is a simple test with only tokens in explain", n_steps=0
    )
    assert "explain" in prediction
def test_predict_batch():
    pipeline_config = PipelineConfiguration(
        name="test-classifier",
        head=TaskHeadConfiguration(type=TestHead),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(pipeline_config)

    predictions = pipeline.predict_batch([{"text": "test1"}, {"text": "test2"}])

    assert len(predictions) == 2
    assert all(isinstance(prediction, dict) for prediction in predictions)
def test_explain_tokenized_as_default():
    pipeline_config = PipelineConfiguration(
        name="test-classifier",
        head=TaskHeadConfiguration(type=TestHead),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(pipeline_config)

    prediction = pipeline.explain("This is a simple test with only tokens in explain")
    explain = prediction["explain"]

    assert explain
    assert explain.get("text")
    for token_info in explain["text"]:
        assert isinstance(token_info.get("token"), str)
        assert token_info.get("attribution") == 0.0
def test_check_pipeline_inputs_and_output():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(config)

    assert pipeline.inputs == ["text", "second_text"]
    assert pipeline.output == "label"
def test_explain_batch():
    pipeline_config = PipelineConfiguration(
        name="test-classifier",
        head=TaskHeadConfiguration(type=TestHead),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(pipeline_config)

    predictions = pipeline.explain_batch([{"text": "test1"}, {"text": "test2"}])

    assert len(predictions) == 2
    for prediction in predictions:
        explain: Dict[str, Any] = prediction["explain"]
        assert explain
        assert explain.get("text")
        for token_info in explain["text"]:
            assert isinstance(token_info.get("token"), str)
            assert token_info.get("attribution") == 0.0
@pytest.fixture
def pipeline() -> Pipeline:
    config = PipelineConfiguration(
        name="test-classifier",
        head=TextClassificationConfiguration(labels=["one", "zero"]),
    )
    return Pipeline.from_config(config)
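# Hypothetical usage of the fixture above (not part of the original suite): pytest
# injects `pipeline` by argument name, and predictions are plain dicts, as the
# batch tests above assume. The test name and sample text are illustrative only.
def test_pipeline_fixture_predicts(pipeline):
    prediction = pipeline.predict("a simple test sentence")
    assert isinstance(prediction, dict)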