Example #1
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
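The trainer_dict fixture used above is not shown on this page. A minimal sketch of what it plausibly returns, reusing only the TrainerConfiguration parameters that appear in Example #13 below:

import pytest

@pytest.fixture
def trainer_dict():
    # Hypothetical fixture: a plain dict that the test unpacks into
    # TrainerConfiguration(**trainer_dict); values are illustrative.
    return {"num_epochs": 1, "batch_size": 2, "cuda_device": -1}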
Example #2
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing a classifier made from scratch"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "relation_classifier")

    # test loading
    Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz")
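This variant uses the Trainer API instead of pipeline.train, so its trainer_config fixture presumably returns a ready-made TrainerConfiguration. A minimal sketch, assuming the top-level biome.text import and the max_epochs/batch_size/gpus parameters seen in Example #12:

import pytest
from biome.text import TrainerConfiguration  # assumed import path

@pytest.fixture
def trainer_config():
    # Hypothetical fixture: built directly, as Trainer(trainer_config=...) expects.
    return TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)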
Example #3
def test_load_pipeline_with_custom_head():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES,
                            "resources/data/dataset_source.csv"),
        mapping={
            "label": "job",
            "text": ["education", "marital"]
        },
    )
    output = mkdtemp()
    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)
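MyCustomHead is defined elsewhere in the test module. A minimal sketch of such a head, assuming it simply subclasses the library's TextClassification head without changing its behaviour (the import path is an assumption):

from biome.text.modules.heads import TextClassification  # assumed import path

class MyCustomHead(TextClassification):
    """Hypothetical custom head that inherits everything from TextClassification."""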
Example #4
def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(train_data),
        "validation": dataset_from_path(valid_data) if valid_data else None,
    }

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=datasets["train"],
        valid_dataset=datasets["validation"],
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
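This is a CLI entry point; called directly from Python it would look like the following sketch (all paths are hypothetical):

train(
    pipeline_path="configs/my_pipeline.yml",  # or a model.tar.gz to continue training
    output="output",
    trainer_config="configs/trainer.yml",
    train_data="data/train.json",
    valid_data="data/validation.json",
)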
Example #5
def serve(pipeline_path: str, port: int, predictions_dir: str) -> None:
    pipeline = Pipeline.from_pretrained(pipeline_path)

    if predictions_dir:
        pipeline.init_prediction_logger(predictions_dir)

    pipeline.serve(port)
Example #6
def test_load_pipeline_with_custom_head(training_dataset):
    """Testing a model training inserting a class as custom heard"""

    # Pipeline configuration dict with custom head
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    # Asserting that pipeline.head is an instance of MyCustomHead
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    # Training the model and saving it to output
    output = mkdtemp()
    pipeline.train(output=output, training=training_dataset)

    # Loading model from output
    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")

    # Asserting that the pipeline head is recognized as `MyCustomHead` instance after loading from a model.tar.gz
    assert isinstance(trained_pl.head, MyCustomHead)
Example #7
def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))
    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    results = pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    trained = Pipeline.from_pretrained(results.model_path)
    trained.set_head(TestHead)
    trained.config.tokenizer.max_nr_of_sentences = 3
    copied = trained._make_copy()
    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == trained.num_parameters
    assert copied.num_trainable_parameters == trained.num_trainable_parameters
    copied_model_state = copied._model.state_dict()
    original_model_state = trained._model.state_dict()
    for key, value in copied_model_state.items():
        if "backbone" in key:
            assert torch.all(torch.eq(value, original_model_state[key]))
    assert copied.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
Example #8
def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset):
    """Testing Transformer pipeline with an added word feature layer"""
    # Changing the pipeline to delete the BERT pooler and add a word feature
    del pipeline_dict["head"]["pooler"]
    pipeline_dict["features"].update(
        {"word": {"embedding_dim": 16, "lowercase_tokens": True}}
    )

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = Trainer(
        pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config
    )
    trainer.fit(output_dir=output)

    # Check a fixed vocabulary size for the transformer and the word feature
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the transformer and the word feature after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273
Example #9
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]

    vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50})
    trainer_config = TrainerConfiguration(
        batch_size=64,
        optimizer={
            "type": "adam",
            "lr": 0.01
        },
        max_epochs=5,
        default_root_dir=str(tmp_path),
        gpus=0,  # turn off gpus even if available
    )

    trainer = Trainer(
        pipeline=pl,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )

    trainer.fit(tmp_path / "output")

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83

    assert pl.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)

    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456,
                                                    abs=0.003)

    Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz"))

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
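seed_everything is not defined in the snippet; it presumably comes from PyTorch Lightning, which seeds Python, NumPy and PyTorch in a single call:

from pytorch_lightning import seed_everything  # assumed origin of seed_everything(43)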
Example #10
def serve(pipeline_path: str, port: int, predictions_dir: str) -> None:
    """Serves the pipeline predictions as a REST API

    PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
    """
    pipeline = Pipeline.from_pretrained(pipeline_path)

    if predictions_dir:
        pipeline.init_prediction_logger(predictions_dir)

    pipeline.serve(port)
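Called directly, this would start a REST endpoint for a trained model (arguments are illustrative):

serve(
    pipeline_path="output/model.tar.gz",
    port=8000,
    predictions_dir="predictions",  # a falsy value skips prediction logging
)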
Example #11
def test_save(pipeline, tmp_path):
    pipeline.save(tmp_path)

    assert (tmp_path / "model.tar.gz").is_file()

    expected_prediction = pipeline.predict("test")
    prediction = Pipeline.from_pretrained(tmp_path /
                                          "model.tar.gz").predict("test")

    assert prediction["labels"] == expected_prediction["labels"]
    assert_allclose(prediction["probabilities"],
                    expected_prediction["probabilities"])
Example #12
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)
    trainer = Trainer(
        pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config
    )
    trainer.fit(output_path)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(output_path / "model.tar.gz")
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
Example #13
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(num_epochs=1,
                                          batch_size=2,
                                          cuda_device=-1)
    pipeline.train(output=str(output_path),
                   training=dataset,
                   trainer=trainer_config)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(str(output_path))
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"],
                    prediction_loaded["probabilities"])
Example #14
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    pl_trained = Pipeline.from_pretrained(str(tmp_path /
                                              "relation_classifier"))
    pl_trained.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )
Example #15
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well specified training, this also tests the vocab creation!"""

    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]
    trainer = TrainerConfiguration(**trainer_dict)
    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )

    output = tmp_path / "output"

    pl.train(
        output=str(output),
        trainer=trainer,
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83

    assert pl.num_trainable_parameters == 22070

    with (output / "metrics.json").open() as file:
        metrics = json.load(file)

    # It may fail on some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
Example #16
def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset):
    vocab_config = VocabularyConfiguration(max_vocab_size=1)

    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab_config=vocab_config,
        name="test_vocab_config",
        local_dir=str(tmp_path),
    )

    analysis = tune.run(my_exp)
    pl = Pipeline.from_pretrained(
        Path(analysis.get_best_logdir("validation_loss", "min")) / "output" /
        "model.tar.gz")

    assert pl.vocab.get_vocab_size("word") == 3
Example #17
def test_pure_transformers(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Testing a Transformer training process and a model load"""

    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    # Check a fixed vocabulary size for the model
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996

    pl.predict(text="test")

    output = tmp_path / "output"
    pl.train(output=str(output), trainer=trainer, training=train_dataset)

    # Test vocabulary from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the model after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
Example #18
def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path):
    pipeline = Pipeline.from_config(pipeline_config)

    output = tmp_path / "pretrained_word_vector_output"
    pipeline.train(
        output=str(output),
        training=dataset,
        trainer=TrainerConfiguration(num_epochs=1, cuda_device=-1),
    )
    instance = pipeline.head.featurize("test")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0),
        torch.tensor([[0.66, 0.33]]),
    )

    # Loading a pretrained model without the weights file should work
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")),
                      Pipeline)
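The pipeline_config fixture is assumed to declare a word feature backed by a tiny weights file. An illustrative fragment consistent with the 2-dimensional embedding asserted above:

# Hypothetical "features" section of pipeline_config (only the relevant part):
features = {
    "word": {
        "embedding_dim": 2,             # matches the 2-d vector checked above
        "weights_file": "weights.txt",  # the file unlinked at the end of the test
    }
}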
Example #19
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        training=datasets["training"],
        validation=datasets["validation"],
        test=datasets["test"],
    )
Example #20
def evaluate(
    pipeline_path: str,
    output: str,
    dataset: str,
    batch_size: int = 16,
    lazy: bool = False,
    prediction_output: Optional[str] = None,
) -> None:
    """Evaluate a pipeline on a given dataset.

    PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
    """
    pipeline = Pipeline.from_pretrained(pipeline_path)
    dataset = dataset_from_path(dataset)

    pipeline.evaluate(
        dataset,
        batch_size=batch_size,
        lazy=lazy,
        predictions_output_file=prediction_output,
        metrics_output_file=output,
    )
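A direct invocation might look like this sketch (paths are hypothetical):

evaluate(
    pipeline_path="output/model.tar.gz",
    output="metrics.json",
    dataset="data/validation.json",
    batch_size=16,
)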
Example #21
def test_train(tmp_path, pipeline_dict, trainer_dict, train_data_source):
    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)
    vocab = VocabularyConfiguration(sources=[train_data_source])
    pl.create_vocabulary(vocab)

    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265

    pl.predict(text="test")

    output = tmp_path / "output"

    training_results = pl.train(
        output=str(output),
        trainer=trainer,
        training=train_data_source,
    )

    # test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265
Example #22
def explore(pipeline_path: str, data_source: str, explain: bool,
            es_host: str) -> None:
    Pipeline.from_pretrained(pipeline_path).explore(
        data_source=DataSource.from_yaml(data_source),
        es_host=es_host,
        explain=explain)
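Called directly, assuming a local Elasticsearch instance backs the exploration UI (all values illustrative):

explore(
    pipeline_path="output/model.tar.gz",
    data_source="data/to_explore.yml",
    explain=True,                     # include prediction explanations
    es_host="http://localhost:9200",  # local Elasticsearch instance
)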