Example #1
def test_transformers_and_word(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Testing Transformer pipeline with an added word feature layer"""
    # Changing the pipeline to delete the BERT pooler and add a word feature
    del pipeline_dict["head"]["pooler"]
    pipeline_dict["features"].update(
        {"word": {"embedding_dim": 16, "lowercase_tokens": True}}
    )

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = TrainerConfiguration(**trainer_dict)
    pl.train(output=str(output), trainer=trainer, training=train_dataset)

    # Check a fixed vocabulary size for the transformer and the word feature
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the transformer and the word feature after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273
Example #2
def create_trainer_for_finding_lr(
    pipeline: Pipeline,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    pipeline
        The pipeline with the model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    if hasattr(training_data, "index_with"):
        training_data.index_with(pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return Trainer.from_params(
        model=pipeline._model,
        data_loader=training_data_loader,
        params=trainer_params,
        serialization_dir=None,
    )
Example #3
def create_trainer_for_finding_lr(
    model: PipelineModel,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    model
        The underlying model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return cast(
        "GradientDescentTrainer",
        Trainer.from_params(
            model=model,
            data_loader=training_data_loader,
            params=trainer_params,
            serialization_dir=None,
        ),
    )
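
Both variants above only construct the trainer; here is a minimal usage sketch of feeding it into AllenNLP's learning-rate scan. The fixture names (pipeline, trainer_config, train_instances) are assumptions, and search_learning_rate is AllenNLP's helper for this kind of scan.

from allennlp.commands.find_learning_rate import search_learning_rate

# Build the trainer with the first variant's signature (pipeline-based)
trainer = create_trainer_for_finding_lr(
    pipeline=pipeline,
    trainer_config=trainer_config,
    training_data=train_instances,
)

# Scan a range of learning rates and collect the loss observed for each one
learning_rates, losses = search_learning_rate(
    trainer, start_lr=1e-5, end_lr=10, num_batches=100
)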
Example #4
def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(train_data),
        "validation": dataset_from_path(valid_data) if valid_data else None,
    }

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=datasets["train"],
        valid_dataset=datasets["validation"],
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
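
The trainer_config argument above is a path to a YAML file whose keys are expanded into TrainerConfiguration kwargs via yaml_to_dict. Below is a minimal sketch of that wiring in isolation; the file name and its contents are illustrative assumptions, not values taken from the examples.

# Equivalent of the CLI wiring above, assuming the YAML keys map 1:1 onto
# TrainerConfiguration kwargs (e.g. batch_size, num_epochs, optimizer)
trainer_dict = yaml_to_dict("trainer.yml")
# e.g. {"batch_size": 2, "num_epochs": 5, "optimizer": {"type": "adam", "lr": 0.001}}
trainer_config = TrainerConfiguration(**trainer_dict)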
Example #5
def test_training_from_pretrained_with_head_replace(pipeline: Pipeline,
                                                    dataset: Dataset,
                                                    tmp_path: str):
    configuration = TrainerConfiguration(
        data_bucketing=True,
        batch_size=2,
        num_epochs=5,
        cuda_device=-1,
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline.train(output=output_dir,
                   trainer=configuration,
                   training=dataset,
                   quiet=True)

    pipeline.set_head(TestHead)
    pipeline.config.tokenizer_config.max_nr_of_sentences = 3
    copied = pipeline.copy()
    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == pipeline.num_parameters
    assert copied.num_trainable_parameters == pipeline.num_trainable_parameters
    copied_model_state = copied._model.state_dict()
    original_model_state = pipeline._model.state_dict()
    for key, value in copied_model_state.items():
        if "backbone" in key:
            assert torch.all(torch.eq(value, original_model_state[key]))
    assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3
Example #6
def test_training_from_pretrained_with_head_replace(pipeline, dataset,
                                                    tmp_path):
    trainer_config = TrainerConfiguration(
        batch_size=2,
        max_epochs=5,
        gpus=0,
    )

    trainer = Trainer(pipeline,
                      train_dataset=dataset,
                      trainer_config=trainer_config)
    trainer.fit(tmp_path / "output")

    pipeline.set_head(TestHead)
    pipeline.config.tokenizer_config.max_nr_of_sentences = 3
    copied = pipeline.copy()
    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == pipeline.num_parameters
    assert copied.num_trainable_parameters == pipeline.num_trainable_parameters
    copied_model_state = copied._model.state_dict()
    original_model_state = pipeline._model.state_dict()
    for key, value in copied_model_state.items():
        if "backbone" in key:
            assert torch.all(torch.eq(value, original_model_state[key]))
    assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3
Example #7
def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))
    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    results = pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    trained = Pipeline.from_pretrained(results.model_path)
    trained.set_head(TestHead)
    trained.config.tokenizer.max_nr_of_sentences = 3
    copied = trained._make_copy()
    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == trained.num_parameters
    assert copied.num_trainable_parameters == trained.num_trainable_parameters
    copied_model_state = copied._model.state_dict()
    original_model_state = trained._model.state_dict()
    for key, value in copied_model_state.items():
        if "backbone" in key:
            assert torch.all(torch.eq(value, original_model_state[key]))
    assert copied.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
Example #8
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
Example #9
def trainer_config() -> TrainerConfiguration:
    return TrainerConfiguration(
        max_epochs=1,
        optimizer={
            "type": "adamw",
            "lr": 0.002
        },
        gpus=0,
    )
Example #10
    def _default_trainable(config, checkpoint_dir=None):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
        - Execute the training
        """
        if config["silence"]:
            logging.getLogger("biome.text").setLevel(logging.ERROR)

        pipeline = Pipeline.from_config(config["pipeline_config"])

        trainer_config = TrainerConfiguration(**config["trainer_config"])

        vocab_config = config["vocab_config"]
        if vocab_config:
            vocab_config = VocabularyConfiguration(**vocab_config)

        callbacks = trainer_config.callbacks
        if not isinstance(callbacks, list):
            callbacks = [callbacks]
        if not any(
            [isinstance(callback, TuneReportCallback) for callback in callbacks]
        ):
            tune_callback = TuneReportCallback(metrics=config["metrics"])
            if trainer_config.callbacks is None:
                trainer_config.callbacks = tune_callback
            else:
                trainer_config.callbacks = callbacks + [tune_callback]

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])
        train_instances = train_ds.to_instances(pipeline=pipeline, disable_tqdm=True)
        valid_instances = valid_ds.to_instances(pipeline=pipeline, disable_tqdm=True)

        trainer = Trainer(
            pipeline=pipeline,
            train_dataset=train_instances,
            valid_dataset=valid_instances,
            trainer_config=trainer_config,
            vocab_config=vocab_config,
        )
        trainer.fit()
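
A minimal sketch of handing the trainable above to Ray Tune follows. The config keys mirror what _default_trainable reads; the concrete values, the search space, and the metric name are illustrative assumptions.

from ray import tune

analysis = tune.run(
    _default_trainable,
    config={
        "silence": True,
        "pipeline_config": my_pipeline_config,  # assumed pipeline config dict
        "trainer_config": {
            "max_epochs": 3,
            # Ray Tune resolves the search space before calling the trainable,
            # so TrainerConfiguration receives a concrete float for "lr"
            "optimizer": {"type": "adamw", "lr": tune.loguniform(1e-5, 1e-2)},
        },
        "vocab_config": None,
        "metrics": ["validation_loss"],  # assumed metric name
        "train_dataset_path": "path/to/train_ds",  # assumed Dataset.save_to_disk outputs
        "valid_dataset_path": "path/to/valid_ds",
    },
    num_samples=4,
    metric="validation_loss",
    mode="min",
)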
Example #11
def trainer_config() -> TrainerConfiguration:
    return TrainerConfiguration(
        max_epochs=2,
        optimizer={
            "type": "adam",
            "amsgrad": True,
            "lr": 0.002
        },
        gpus=0,
    )
Example #12
def trainer_config(tmp_path) -> TrainerConfiguration:
    return TrainerConfiguration(
        batch_size=16,
        max_epochs=1,
        optimizer={
            "type": "adam",
            "lr": 0.0001,
        },
        gpus=0,
        default_root_dir=str(tmp_path),
    )
Example #13
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"}, record2={"first_name": "Hansel"})

    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )
Example #14
def test_mlflow_logger():

    logger = MlflowLogger(experiment_name="test-experiment",
                          run_name="test_run",
                          tag1="my-tag")

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification,
                                       labels=["A", "B"]),
        ))
    trainer = TrainerConfiguration()

    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})

    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run
    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]
    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type":
        "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params
    # Artifacts
    assert os.path.basename(model_path) in os.listdir(
        urlparse(run.info.artifact_uri).path)
    # Metrics
    for metric in metrics:
        assert (metric in run.data.metrics
                and run.data.metrics[metric] == metrics[metric])
Example #15
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
Example #16
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing the correct working of prediction, vocab creating and training"""

    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="my name is juan")

    pipeline.train(
        output=str(tmp_path / "lm"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )
Example #17
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(text="Test this NER machine")
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
Example #18
def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)

    assert pipeline.output == ["entities", "tags"]

    assert pipeline.head.span_labels == ["NER"]
    assert pipeline.head.labels == ["B-NER", "I-NER", "U-NER", "L-NER", "O"]

    pipeline.train(
        output=str(tmp_path / "ner_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
    )
Example #19
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(record1={"first_name": "Hans"},
                     record2={"first_name": "Hansel"})
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "record_bimpm_experiment"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )
Example #20
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)
    trainer = Trainer(
        pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config
    )
    trainer.fit(output_path)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(output_path / "model.tar.gz")
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
Example #21
def test_use_amp(dataset, pipeline, tmp_path, capsys):
    trainer_config = TrainerConfiguration(
        num_epochs=1,
        batch_size=2,
        use_amp=True,
    )

    pipeline.train(
        output=str(tmp_path / "test_use_amp_output"),
        training=dataset,
        trainer=trainer_config,
    )

    captured = capsys.readouterr()
    assert "use_amp = True" in captured.err
Example #22
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(num_epochs=1,
                                          batch_size=2,
                                          cuda_device=-1)
    pipeline.train(output=str(output_path),
                   training=dataset,
                   trainer=trainer_config)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(str(output_path))
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"],
                    prediction_loaded["probabilities"])
Example #23
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )
    pipeline.create_vocabulary(
        VocabularyConfiguration(sources=[training_data_source]))

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    pl_trained = Pipeline.from_pretrained(str(tmp_path /
                                              "relation_classifier"))
    pl_trained.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {
                "start": 34,
                "end": 39,
                "label": "OBJECT",
                "text": "waste"
            },
            {
                "start": 16,
                "end": 22,
                "label": "SUBJECT",
                "text": "audits"
            },
        ],
    )
Example #24
def test_training_with_logging(pipeline: Pipeline, dataset: Dataset,
                               tmp_path: str):
    configuration = TrainerConfiguration(data_bucketing=True,
                                         batch_size=2,
                                         num_epochs=5)
    output_dir = os.path.join(tmp_path, "output")
    pipeline.train(output=output_dir,
                   trainer=configuration,
                   training=dataset,
                   quiet=True)

    assert os.path.exists(os.path.join(output_dir, "train.log"))
    with open(os.path.join(output_dir, "train.log")) as train_log:
        for line in train_log.readlines()[3:]:
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
Example #25
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well specified training, this also tests the vocab creation!"""

    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]
    trainer = TrainerConfiguration(**trainer_dict)
    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )

    output = tmp_path / "output"

    pl.train(
        output=str(output),
        trainer=trainer,
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83

    assert pl.num_trainable_parameters == 22070

    with (output / "metrics.json").open() as file:
        metrics = json.load(file)

    # This may fail on some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
Example #26
    def _default_trainable(config, reporter):
        """A default trainable function used by `tune.run`

        It performs the most straightforward training loop with the provided `config`:
        - Create the pipeline (optionally with a provided vocab)
        - Set up a MLFlow and WandB logger
        - Set up a TuneMetrics logger that reports all metrics back to ray tune after each epoch
        - Create the vocab if necessary
        - Execute the training
        """
        pipeline = Pipeline.from_config(config["pipeline_config"],
                                        vocab_path=config["vocab_path"])

        trainer_config = TrainerConfiguration(
            **helpers.sanitize_for_params(config["trainer_config"]))

        mlflow_tracking_uri = config["mlflow_tracking_uri"]
        mlflow.set_tracking_uri(mlflow_tracking_uri)

        train_ds = Dataset.load_from_disk(config["train_dataset_path"])
        valid_ds = Dataset.load_from_disk(config["valid_dataset_path"])

        train_loggers = [
            MlflowLogger(
                experiment_name=config["name"],
                run_name=reporter.trial_name,
                ray_trial_id=reporter.trial_id,
                ray_logdir=reporter.logdir,
            ),
            TuneMetricsLogger(),
        ]
        if is_wandb_installed_and_logged_in():
            train_loggers = [WandBLogger(project_name=config["name"])
                             ] + train_loggers

        pipeline.train(
            output="training",
            training=train_ds,
            validation=valid_ds,
            trainer=trainer_config,
            loggers=train_loggers,
            vocab_config=None if config["vocab_path"] else "default",
        )
Example #27
def test_pure_transformers(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Testing a Transformer training process and a model load"""

    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    # Check a fixed vocabulary size for the model
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996

    pl.predict(text="test")

    output = tmp_path / "output"
    pl.train(output=str(output), trainer=trainer, training=train_dataset)

    # Test vocabulary from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the model after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
Example #28
def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path):
    pipeline = Pipeline.from_config(pipeline_config)

    output = tmp_path / "pretrained_word_vector_output"
    pipeline.train(
        output=str(output),
        training=dataset,
        trainer=TrainerConfiguration(num_epochs=1, cuda_device=-1),
    )
    instance = pipeline.head.featurize("test")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0),
        torch.tensor([[0.66, 0.33]]),
    )

    # Loading a pretrained model without the weights file should work
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")),
                      Pipeline)
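
For context, the pipeline_config fixture above is assumed to point features.word.weights_file at a tiny pretrained-embedding file whose vector for "test" matches the assertion. A minimal sketch of creating such a file follows; the file name, the GloVe-style text format, and the fixture wiring are assumptions inferred from the test.

weights_file = tmp_path / "weights.txt"
# One line per token: "<token> <dim_1> <dim_2>"
weights_file.write_text("test 0.66 0.33\n")

pipeline_config["features"]["word"] = {
    "embedding_dim": 2,
    "weights_file": str(weights_file),
}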
Example #29
def test_training_with_logging(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))

    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    assert os.path.exists(os.path.join(output_dir, "train.log"))
    with open(os.path.join(output_dir, "train.log")) as train_log:
        for line in train_log.readlines():
            assert "allennlp" in line

    assert logging.getLogger("allennlp").level == logging.ERROR
    assert logging.getLogger("biome").level == logging.INFO
Example #30
def test_training_with_data_bucketing(pipeline: Pipeline, dataset: Dataset,
                                      tmp_path: str):
    configuration = TrainerConfiguration(data_bucketing=True,
                                         batch_size=2,
                                         num_epochs=5)

    pipeline.copy().train(
        output=os.path.join(tmp_path, "output"),
        trainer=configuration,
        training=dataset,
        validation=dataset,
        lazy=False,
    )

    pipeline.copy().train(
        output=os.path.join(tmp_path, "output"),
        trainer=configuration,
        training=dataset,
        validation=dataset,
        lazy=True,
    )