def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    # Decide how to load the pipeline from the file extension:
    # a YAML config is built from scratch, anything else is treated as
    # a pretrained model archive.
    file_extension = os.path.splitext(pipeline_path)[1][1:].lower()
    if file_extension in ("yaml", "yml"):
        pipeline = Pipeline.from_yaml(pipeline_path)
    else:
        pipeline = Pipeline.from_pretrained(pipeline_path)

    # The validation dataset is optional; leave it as None when not given.
    train_dataset = dataset_from_path(train_data)
    valid_dataset = dataset_from_path(valid_data) if valid_data else None

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=train_dataset,
        valid_dataset=valid_dataset,
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
def test_pipeline_config(pipeline_yaml):
    """A pipeline built programmatically from a PipelineConfiguration must be
    equivalent to one loaded from the corresponding YAML file."""
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True
    )
    features_config = FeaturesConfiguration(
        word=WordFeatures(embedding_dim=2, lowercase_tokens=True),
        char=CharFeatures(
            embedding_dim=2,
            encoder={
                "type": "gru",
                "hidden_size": 2,
                "num_layers": 1,
                "bidirectional": True,
            },
            dropout=0.1,
        ),
    )
    encoder_config = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_config = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_from_config = Pipeline.from_config(
        PipelineConfiguration(
            name="test_pipeline_config",
            head=head_config,
            features=features_config,
            tokenizer=tokenizer_config,
            encoder=encoder_config,
        )
    )
    pipeline_from_yaml = Pipeline.from_yaml(pipeline_yaml)

    # Both construction paths must yield the same trainable parameters.
    assert (
        pipeline_from_config.named_trainable_parameters
        == pipeline_from_yaml.named_trainable_parameters
    )
    assert (
        pipeline_from_config.num_trainable_parameters
        == pipeline_from_yaml.num_trainable_parameters
    )
    assert pipeline_from_config.num_parameters == pipeline_from_yaml.num_parameters

    # Both featurizers must tokenize a sample text into the same structure:
    # a single "record" ListField holding one TextField of Tokens.
    sample_text = "My simple text"
    instances = [
        pipeline_from_config.backbone.featurizer(sample_text),
        pipeline_from_yaml.backbone.featurizer(sample_text),
    ]
    for instance in instances:
        for field_name, field in instance.items():
            assert field_name == "record"
            assert isinstance(field, ListField)
            assert len(field) == 1
            for text_field in field:
                assert isinstance(text_field, TextField)
                assert all(isinstance(token, Token) for token in text_field.tokens)
                assert sample_text == " ".join(
                    token.text for token in text_field.tokens
                )
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    # A YAML/YML extension means "build from config"; anything else is
    # assumed to be a pretrained model archive.
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    # Build the vocabulary from every dataset that was actually provided.
    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        # BUG FIX: the dict key is "train", not "training" — the original
        # `datasets["training"]` raised KeyError on every invocation.
        training=datasets["train"],
        validation=datasets["validation"],
        test=datasets["test"],
    )