Example #1
def test_not_implemented_transformers_with_tokenclassification(
    transformers_pipeline_config,
):
    transformers_pipeline_config["tokenizer"] = {"use_transformers": True}
    transformers_pipeline_config["head"] = {
        "type": "TokenClassification",
        "labels": ["NER"],
    }
    with pytest.raises(NotImplementedError):
        Pipeline.from_config(transformers_pipeline_config)
Example #2
def test_invalid_tokenizer_features_combination(transformers_pipeline_config):
    transformers_pipeline_config["features"].update(
        {"word": {
            "embedding_dim": 2
        }})
    transformers_pipeline_config["tokenizer"] = {"use_transformers": True}

    with pytest.raises(ConfigurationError):
        Pipeline.from_config(transformers_pipeline_config)
Example #3
def test_invalid_transformers_tokenizer_indexer_embedder_combination(
    transformers_pipeline_config,
):
    transformers_pipeline_config["tokenizer"] = {
        "transformers_kwargs": {
            "model_name": "distilroberta-base"
        }
    }

    with pytest.raises(ConfigurationError):
        Pipeline.from_config(transformers_pipeline_config)
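Examples #1-#3 all mutate a shared `transformers_pipeline_config` fixture that is not shown on this page. A minimal sketch of what such a fixture could look like; the fixture body, model name, and default head are assumptions inferred from the keys these tests override:

import pytest

@pytest.fixture
def transformers_pipeline_config():
    # Hypothetical fixture: the keys mirror the ones the tests above mutate
    # ("features", "tokenizer", "head"); the model name is only an example.
    return {
        "name": "transformers_test_pipeline",
        "features": {
            "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}
        },
        "head": {"type": "TextClassification", "labels": ["good", "bad"]},
    }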
Example #4
def test_max_length_not_affecting_shorter_sequences(pipeline_dict):
    """Max length change should not affect at all previous shorter-length models"""

    pl = Pipeline.from_config(pipeline_dict)
    state_dict = pl._model.state_dict()  # dict with the whole state of the module
    probs = pl.predict("Test this")["probabilities"]  # probabilities of the test input

    pipeline_dict["features"]["transformers"]["max_length"] = 100  # changing max length
    pl = Pipeline.from_config(pipeline_dict)
    pl._model.load_state_dict(state_dict)  # loading previous state from dict
    probs_max_length = pl.predict("Test this")["probabilities"]

    assert_allclose(probs, probs_max_length)
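The test above relies on a `pipeline_dict` fixture whose transformers feature accepts a `max_length` key, and on `assert_allclose` (presumably `numpy.testing.assert_allclose`). A minimal sketch of such a fixture, with all names and values being assumptions:

import pytest

@pytest.fixture
def pipeline_dict():
    # Hypothetical config: the test above only requires that
    # "features.transformers" exists so it can override "max_length".
    return {
        "name": "max_length_test",
        "features": {
            "transformers": {
                "model_name": "sshleifer/tiny-distilbert-base-cased",
                "max_length": 20,
            }
        },
        "head": {"type": "TextClassification", "labels": ["a", "b"]},
    }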
Example #5
def test_load_pipeline_with_custom_head(training_dataset):
    """Testing a model training inserting a class as custom heard"""

    # Pipeline configuration dict with custom head
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    # Asserting that pipeline.head is an instance of MyCustomHead
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    # Training the model and saving it to output
    output = mkdtemp()
    pipeline.train(output=output, training=training_dataset)

    # Loading model from output
    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")

    # Asserting that the pipeline head is recognized as a `MyCustomHead` instance after loading from a model.tar.gz
    assert isinstance(trained_pl.head, MyCustomHead)
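`MyCustomHead` is defined elsewhere in the test module. Since `TaskHeadConfiguration` accepts the class directly via `type=MyCustomHead`, a minimal sketch could simply subclass an existing head; the base class and import path are assumptions, inferred from the head type logged in Example #28:

from biome.text.modules.heads import TextClassification

class MyCustomHead(TextClassification):
    """A custom head that simply reuses the TextClassification behavior."""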
Example #6
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing a classifier made from scratch"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "relation_classifier")

    # test loading
    Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz")
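The `trainer_config` fixture used here is not shown. A sketch of what it might look like, reusing the `default_root_dir` pattern from Example #11; the import path, remaining parameter names, and values are assumptions chosen for a quick test run:

import pytest
from biome.text import TrainerConfiguration

@pytest.fixture
def trainer_config(tmp_path) -> TrainerConfiguration:
    # Hypothetical values for a fast test run; parameter names are assumptions.
    return TrainerConfiguration(
        max_epochs=1,
        batch_size=2,
        default_root_dir=str(tmp_path),
    )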
Example #7
def pipeline():
    return Pipeline.from_config(
        {
            "name": "test_pipeline_copy",
            "head": {"type": "TextClassification", "labels": ["a", "b"]},
        }
    )
Example #8
def pipeline():
    config = {
        "name": "vocab_test",
        "features": {
            "transformers": {
                "model_name": "sshleifer/tiny-distilbert-base-cased"
            },
            "word": {
                "embedding_dim": 2
            },
            "char": {
                "embedding_dim": 2,
                "dropout": 0.1,
                "encoder": {
                    "type": "gru",
                    "hidden_size": 2,
                    "num_layers": 1,
                    "bidirectional": False,
                },
            },
        },
        "head": {
            "type": "TextClassification",
            "labels": ["good", "bad"],
        },
    }

    return Pipeline.from_config(config)
Example #9
def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer
Example #10
def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset):
    """Testing Transformer pipeline with an added word feature layer"""
    # Changing the pipeline to delete the BERT pooler and add a word feature
    del pipeline_dict["head"]["pooler"]
    pipeline_dict["features"].update(
        {"word": {"embedding_dim": 16, "lowercase_tokens": True}}
    )

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = Trainer(
        pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config
    )
    trainer.fit(output_dir=output)

    # Check a fixed vocabulary size for the transformer and the word feature
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the transformer and the word feature after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273
Example #11
def test_add_default_loggers(input_kwargs, expected_loggers, pipeline_dict,
                             dataset, tmp_path):
    trainer_config = TrainerConfiguration(**input_kwargs,
                                          default_root_dir=str(tmp_path))
    trainer = Trainer(
        Pipeline.from_config(pipeline_dict),
        train_dataset=dataset,
        trainer_config=trainer_config,
    )
    if input_kwargs.get("logger") is not False:
        assert isinstance(trainer.trainer.logger, LoggerCollection)
        assert len(trainer.trainer.logger.experiment) == len(expected_loggers)
    else:
        assert trainer._trainer_config.logger is False

    def loggers_include(logger_type) -> bool:
        return any([
            isinstance(logger, logger_type)
            for logger in trainer._trainer_config.logger
        ])

    for logger in expected_loggers:
        if logger == "csv":
            assert loggers_include(CSVLogger)
        if logger == "tensorboard":
            assert loggers_include(TensorBoardLogger)
        if logger == "wandb":
            assert loggers_include(WandbLogger)
            assert (tmp_path / "wandb").is_dir()
        if logger == "mlflow":
            assert loggers_include(MLFlowLogger)
Example #12
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           deactivate_pipeline_trainer,
                                           caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.train(
        output="dummy",
        training=dataset,
    )

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.train(
        output="dummy",
        training=dataset2,
    )
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline.train(
        output="dummy",
        training=Dataset.from_dict({
            "text": ["that"],
            "label": ["good"]
        }),
    )
    assert caplog.records[0].module == "embedding"
    assert "cannot locate the pretrained_file" in caplog.records[0].message
Example #13
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           capsys, caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.create_vocab([dataset.to_instances(pipeline)])

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.create_vocab([dataset2.to_instances(pipeline)])

    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    ds = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([ds.to_instances(pipeline)])

    assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding"
    assert caplog.record_tuples[-1][1] == 30  # logging.WARNING
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in caplog.record_tuples[-1][2]
    )
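Both variants of this test depend on a `pipeline_config` fixture that wires the word feature to a pretrained weights file. A sketch of such a fixture, assuming the simple "token dim1 dim2" text format for the weights file so that the [[0.25, 0.75]] assertion for "this" holds; the fixture body and file format are assumptions:

import pytest

@pytest.fixture
def pipeline_config(tmp_path):
    # Hypothetical 2-dim pretrained weights file; "this" maps to [0.25, 0.75]
    # to match the assert_allclose checks above.
    weights_file = tmp_path / "weights.txt"
    weights_file.write_text("this 0.25 0.75\n")
    return {
        "name": "vocab_extension_test",
        "features": {
            "word": {"embedding_dim": 2, "weights_file": str(weights_file)}
        },
        "head": {"type": "TextClassification", "labels": ["good", "bad"]},
    }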
Example #14
def test_load_pipeline_with_custom_head():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES,
                            "resources/data/dataset_source.csv"),
        mapping={
            "label": "job",
            "text": ["education", "marital"]
        },
    )
    output = mkdtemp()
    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)
Example #15
def test_attributions(pipeline_dict, training_dataset):
    pipeline = Pipeline.from_config(pipeline_dict)
    instance = pipeline.head.featurize(training_dataset["record1"][0],
                                       training_dataset["record2"][0])
    pipeline.model.eval()
    forward_output = pipeline.model.forward_on_instances([instance])

    attributions = pipeline.head._compute_attributions(forward_output[0],
                                                       instance)

    assert all(
        [isinstance(attribution, Attribution) for attribution in attributions])
    assert len(attributions) == 4
    assert all([isinstance(attr.attribution, float) for attr in attributions])
    assert all([attributions[i].field == "record1" for i in [0, 1]])
    assert all([attributions[i].field == "record2" for i in [2, 3]])
    assert attributions[1].start == 0 and attributions[1].end == 16

    assert attributions[0].text == "@first_name Hans"
    assert attributions[3].text == "@last_name Petre"

    # Raise error when records with different number of record fields
    instance = pipeline.head.featurize(
        record1={
            "first_name": "Hans",
            "last_name": "Zimmermann"
        },
        record2={"first_name": "Hansel"},
    )
    forward_output = pipeline._model.forward_on_instances([instance])

    with pytest.raises(RuntimeError):
        pipeline.head._compute_attributions(forward_output[0], instance)
Example #16
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
Example #17
def test_metrics(pipeline_dict):
    pipeline = Pipeline.from_config(pipeline_dict)
    instance = pipeline.head.featurize(text="test this".split(), tags=["U-NER", "O"])
    batch = Batch([instance])
    batch.index_instances(pipeline.vocab)

    pipeline.head.forward(**batch.as_tensor_dict())
    # the validation metric should never have been called
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 2
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 0

    train_metrics = pipeline.head.get_metrics(reset=True)
    expected_metric_names = ["accuracy"] + [
        f"{metric}-{label}"
        for metric in ["precision", "recall", "f1-measure"]
        for label in ["NER", "overall"]
    ]
    print(train_metrics)
    assert all(name in train_metrics for name in expected_metric_names)

    pipeline.head.training = False
    pipeline.head.forward(**batch.as_tensor_dict())
    # the training metric should never have been called after its reset
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 0
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 2

    valid_metrics = pipeline.head.get_metrics()
    assert all(name in valid_metrics for name in expected_metric_names)
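The `pipeline_dict` behind this test must configure a `TokenClassification` head with a single `NER` label, since the featurized tags use the `U-NER`/`O` scheme and the expected metric names include `precision-NER`. A plausible sketch; the fixture body is an assumption:

import pytest

@pytest.fixture
def pipeline_dict():
    # Hypothetical config: a TokenClassification head with the "NER" label
    # matches the "U-NER"/"O" tags featurized in the test above.
    return {
        "name": "metrics_test",
        "features": {"word": {"embedding_dim": 2}},
        "head": {"type": "TokenClassification", "labels": ["NER"]},
    }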
Example #18
def pipeline() -> Pipeline:
    return Pipeline.from_config({
        "name": "test_predict",
        "head": {
            "type": "TextClassification",
            "labels": ["a"]
        },
    })
Example #19
def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True)

    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features,
                                            char=char_features)

    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)

    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
            pl.backbone.featurizer(sample_text),
            pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(map(lambda t: isinstance(t, Token), text.tokens))
                assert sample_text == " ".join([t.text for t in text.tokens])
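The `pipeline_yaml` fixture should point to a YAML file equivalent to the programmatic configuration built above, otherwise the parameter-count assertions would fail. A sketch of such a fixture, writing the assumed YAML counterpart to a temporary file:

import pytest

@pytest.fixture
def pipeline_yaml(tmp_path):
    # Hypothetical YAML mirroring the PipelineConfiguration in the test above.
    yaml_config = """
name: test_pipeline_config
tokenizer:
  text_cleaning:
    rules: [strip_spaces]
  use_spacy_tokens: true
features:
  word:
    embedding_dim: 2
    lowercase_tokens: true
  char:
    embedding_dim: 2
    encoder:
      type: gru
      hidden_size: 2
      num_layers: 1
      bidirectional: true
    dropout: 0.1
encoder:
  type: gru
  hidden_size: 2
  num_layers: 1
  bidirectional: true
head:
  type: TextClassification
  labels: [duplicate, not_duplicate]
  pooler:
    type: boe
"""
    path = tmp_path / "pipeline.yaml"
    path.write_text(yaml_config)
    return str(path)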
Example #20
def pipeline(dataset):
    labels = dataset.unique("label")
    return Pipeline.from_config({
        "name": "test_pipeline_evaluate",
        "head": {
            "type": "TextClassification",
            "labels": labels,
        },
    })
Example #21
def model() -> PipelineModel:
    pipeline = Pipeline.from_config({
        "name": "test_predict",
        "head": {
            "type": "TextClassification",
            "labels": ["a"]
        },
    })
    return pipeline._model
Example #22
def pipeline() -> Pipeline:
    labels = ["a", "b", "c", "d", "e", "f"]
    return Pipeline.from_config({
        "name": "test_text_classification",
        "head": {
            "type": "TextClassification",
            "labels": labels,
            "dropout": 0.1
        },
    })
Example #23
def test_raise_filenotfound_error(pipeline_config,
                                  deactivate_pipeline_trainer):
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline = Pipeline.from_config(pipeline_config)

    with pytest.raises(FileNotFoundError):
        pipeline.train(
            output="dummy",
            training=cast(Dataset, None),
        )
Example #24
    def uses_cached_instances(self, pipeline_config) -> bool:
        """Checks if the `to_instances` method of the provided pipeline_config uses the cached instances"""
        cache_path = Path(self.dataset.dataset.cache_files[0]["filename"]).parent

        number_of_files_before = len(list(cache_path.iterdir()))
        pipeline = Pipeline.from_config(pipeline_config)
        self.dataset.to_instances(pipeline)
        number_of_files_after = len(list(cache_path.iterdir()))

        return number_of_files_before == number_of_files_after
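A helper like this would typically be called twice with the same config: the first `to_instances` call may populate the cache, while a repeated call should create no new cache files. A hypothetical test method using it:

    def test_reuses_cached_instances(self, pipeline_config):
        # Hypothetical usage: warm the cache once, then expect a cache hit.
        self.uses_cached_instances(pipeline_config)
        assert self.uses_cached_instances(pipeline_config)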
Example #25
def test_serve():
    """Needs to be automatized this test!"""
    pipeline = Pipeline.from_config(
        {
            "name": "serve_test",
            "head": {"type": "TextClassification", "labels": ["a", "b"]},
        }
    )

    _serve(pipeline)
Example #26
def test_pipeline_default_tokenizer(pipeline_dict):
    pipeline_dict["features"].update({"word": {"embedding_dim": 2}})
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config == TokenizerConfiguration()
    assert pl.config.features.transformers.mismatched is True
    assert (type(pl.backbone.featurizer.indexer["transformers"]) is
            PretrainedTransformerMismatchedIndexer)
    assert type(pl.backbone.tokenizer) is Tokenizer

    prediction = pl.predict("Test this!")
Example #27
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})

    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )
Example #28
def test_mlflow_logger():

    logger = MlflowLogger(experiment_name="test-experiment",
                          run_name="test_run",
                          tag1="my-tag")

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification,
                                       labels=["A", "B"]),
        ))
    trainer = TrainerConfiguration()

    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})

    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run
    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]
    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type":
        "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params
    # Artifacts
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path)
    # Metrics
    for metric in metrics:
        assert (metric in run.data.metrics
                and run.data.metrics[metric] == metrics[metric])
Example #29
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")

    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )
Example #30
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="Test this NER machine")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )