Example #1
import os
from typing import Optional

# Assumed imports: module paths follow biome-text's public API but may differ
# between versions. `dataset_from_path` is a helper (defined alongside this
# CLI command) that builds a Dataset from a data file path.
from biome.text import Pipeline, TrainerConfiguration
from biome.text.helpers import yaml_to_dict


def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    # Dispatch on the file extension: YAML files are read as pipeline
    # configurations, anything else is loaded as a pretrained model archive.
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(train_data),
        "validation": dataset_from_path(valid_data) if valid_data else None,
    }

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=datasets["train"],
        valid_dataset=datasets["validation"],
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
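A hypothetical invocation of the example above; every path here is illustrative, and the extension of pipeline_path decides whether a YAML config or a pretrained archive gets loaded:

# All file paths below are made up for illustration.
train(
    pipeline_path="configs/pipeline.yml",  # or a pretrained "model.tar.gz"
    output="runs/experiment_1",
    trainer_config="configs/trainer.yml",
    train_data="data/train.csv",
    valid_data="data/valid.csv",
)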
Example #2
# Assumed imports: module paths follow biome-text's layout but may differ
# between versions.
from allennlp.data.fields import ListField, TextField
from allennlp.data.tokenizers import Token

from biome.text import Pipeline
from biome.text.configuration import (
    FeaturesConfiguration,
    PipelineConfiguration,
    TokenizerConfiguration,
)
from biome.text.features import CharFeatures, WordFeatures
from biome.text.modules.configuration import Seq2SeqEncoderConfiguration
from biome.text.modules.heads import TaskHeadConfiguration, TextClassification


def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]},
        use_spacy_tokens=True,
    )

    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(
        word=word_features,
        char=char_features,
    )

    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    # Build one pipeline from the in-code configuration and another from the
    # equivalent YAML file; the two should be indistinguishable.
    pl = Pipeline.from_config(pipeline_config)
    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
            pl.backbone.featurizer(sample_text),
            pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(isinstance(t, Token) for t in text.tokens)
                assert sample_text == " ".join([t.text for t in text.tokens])
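The test assumes a pipeline_yaml fixture pointing at a YAML file equivalent to the configuration built in code. A minimal sketch of such a fixture, assuming biome-text's YAML schema mirrors the PipelineConfiguration keys used above (key names may differ between versions):

import pytest


@pytest.fixture
def pipeline_yaml(tmp_path):
    # Illustrative YAML mirroring the in-code configuration; the key layout
    # is an assumption about biome-text's schema, not a verified reference.
    yaml_config = """
name: test_pipeline_config
tokenizer:
  text_cleaning:
    rules:
      - strip_spaces
  use_spacy_tokens: true
features:
  word:
    embedding_dim: 2
    lowercase_tokens: true
  char:
    embedding_dim: 2
    encoder:
      type: gru
      hidden_size: 2
      num_layers: 1
      bidirectional: true
    dropout: 0.1
encoder:
  type: gru
  hidden_size: 2
  num_layers: 1
  bidirectional: true
head:
  type: TextClassification
  labels:
    - duplicate
    - not_duplicate
  pooler:
    type: boe
"""
    path = tmp_path / "pipeline.yml"
    path.write_text(yaml_config)
    return str(path)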
Example #3
import os
from typing import Optional

# Assumed imports, as in Example #1; `dataset_from_path` is the same CLI
# helper for loading a Dataset from a data file path.
from biome.text import Pipeline, TrainerConfiguration, VocabularyConfiguration
from biome.text.helpers import yaml_to_dict


def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    # Create the pipeline vocabulary from every dataset that was provided,
    # before training starts.
    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        training=datasets["training"],
        validation=datasets["validation"],
        test=datasets["test"],
    )
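For completeness, a sketch of a trainer YAML that yaml_to_dict(trainer) could turn into TrainerConfiguration kwargs, followed by an invocation. The YAML keys (optimizer, batch_size, num_epochs) are assumed TrainerConfiguration parameters that may differ between biome-text versions, and all file paths are illustrative:

from pathlib import Path

# Hypothetical trainer config; keys are assumed TrainerConfiguration kwargs.
Path("trainer.yml").write_text(
    """
optimizer:
  type: adam
  lr: 0.001
batch_size: 16
num_epochs: 5
"""
)

train(
    pipeline_path="configs/pipeline.yml",
    output="runs/experiment_2",
    trainer="trainer.yml",
    training="data/train.csv",
    validation="data/valid.csv",
    test="data/test.csv",
)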