def test_train(pipeline_dict, training_dataset, trainer_dict, tmp_path):
    """Testing a classifier made from scratch"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )

    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_dataset,
        validation=training_dataset,
    )

    # test loading
    Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path):
    """Testing a classifier made from scratch"""
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=training_dataset,
        valid_dataset=training_dataset,
        trainer_config=trainer_config,
    )
    trainer.fit(tmp_path / "relation_classifier")

    # test loading
    Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz")
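import pytest  # the fixture sketches in this section assume pytest plus the same imports as the tests


# A minimal sketch of the `trainer_config` fixture assumed by the test above.
# The argument values are illustrative assumptions (they mirror the explicit
# TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0) call in a later
# test), not the original fixture:
@pytest.fixture
def trainer_config() -> TrainerConfiguration:
    return TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)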
def test_load_pipeline_with_custom_head():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES, "resources/data/dataset_source.csv"),
        mapping={"label": "job", "text": ["education", "marital"]},
    )
    output = mkdtemp()

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)
def train(
    pipeline_path: str,
    output: str,
    trainer_config: str,
    train_data: str,
    valid_data: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(train_data),
        "validation": dataset_from_path(valid_data) if valid_data else None,
    }

    trainer = Trainer(
        pipeline=pipeline,
        train_dataset=datasets["train"],
        valid_dataset=datasets["validation"],
        trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)),
    )
    trainer.fit(output_dir=output)
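# Hypothetical invocation, assuming this function is registered as a CLI
# subcommand (the binary name and option flags below are illustrative
# assumptions, not confirmed by the source):
#
#   <cli> train path/to/pipeline.yaml \
#       --output runs/experiment \
#       --trainer-config trainer.yaml \
#       --train-data train.json \
#       --valid-data valid.json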
def serve(pipeline_path: str, port: int, predictions_dir: str) -> None:
    pipeline = Pipeline.from_pretrained(pipeline_path)

    if predictions_dir:
        pipeline.init_prediction_logger(predictions_dir)

    pipeline.serve(port)
def test_load_pipeline_with_custom_head(training_dataset):
    """Testing model training with a class passed in as a custom head"""
    # Pipeline configuration dict with custom head
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    # Assert that pipeline.head is an instance of MyCustomHead
    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    # Train the model and save it to output
    output = mkdtemp()
    pipeline.train(output=output, training=training_dataset)

    # Load the model from output
    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")

    # Assert that the pipeline head is still recognized as a `MyCustomHead`
    # instance after loading from a model.tar.gz
    assert isinstance(trained_pl.head, MyCustomHead)
def test_training_from_pretrained_with_head_replace(
    pipeline_test: Pipeline, datasource_test: DataSource, tmp_path: str
):
    training = pipeline_test.create_dataset(datasource_test)
    pipeline_test.create_vocabulary(VocabularyConfiguration(sources=[training]))
    configuration = TrainerConfiguration(
        data_bucketing=True, batch_size=2, num_epochs=5
    )
    output_dir = os.path.join(tmp_path, "output")
    results = pipeline_test.train(
        output=output_dir, trainer=configuration, training=training, quiet=True
    )

    trained = Pipeline.from_pretrained(results.model_path)
    trained.set_head(TestHead)
    trained.config.tokenizer.max_nr_of_sentences = 3
    copied = trained._make_copy()

    assert isinstance(copied.head, TestHead)
    assert copied.num_parameters == trained.num_parameters
    assert copied.num_trainable_parameters == trained.num_trainable_parameters

    copied_model_state = copied._model.state_dict()
    original_model_state = trained._model.state_dict()
    for key, value in copied_model_state.items():
        if "backbone" in key:
            assert torch.all(torch.eq(value, original_model_state[key]))

    assert copied.backbone.featurizer.tokenizer.max_nr_of_sentences == 3
def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset):
    """Testing a Transformer pipeline with an added word feature layer"""
    # Change the pipeline to delete the BERT pooler and add a word feature
    del pipeline_dict["head"]["pooler"]
    pipeline_dict["features"].update(
        {"word": {"embedding_dim": 16, "lowercase_tokens": True}}
    )

    pl = Pipeline.from_config(pipeline_dict)
    pl.predict(text="test")

    output = tmp_path / "output"
    trainer = Trainer(
        pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config
    )
    trainer.fit(output_dir=output)

    # Check a fixed vocabulary size for the transformer and the word feature
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273

    # Test the vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the transformer and the word feature after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
    assert pl.backbone.vocab.get_vocab_size("word") == 273
def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset):
    """Apart from a well-specified training, this also tests the vocab creation!"""
    seed_everything(43)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]
    vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50})
    trainer_config = TrainerConfiguration(
        batch_size=64,
        optimizer={"type": "adam", "lr": 0.01},
        max_epochs=5,
        default_root_dir=str(tmp_path),
        gpus=0,  # turn off gpus even if available
    )
    trainer = Trainer(
        pipeline=pl,
        train_dataset=train_ds,
        valid_dataset=valid_ds,
        trainer_config=trainer_config,
        vocab_config=vocab_config,
    )
    trainer.fit(tmp_path / "output")

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pl.num_trainable_parameters == 22070

    evaluation = trainer.test(valid_ds, batch_size=16)
    # Reminder: the value depends on the batch_size!
    assert evaluation["test_loss"] == pytest.approx(0.7404146790504456, abs=0.003)

    Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz"))
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
def serve(pipeline_path: str, port: int, predictions_dir: str) -> None:
    """Serves the pipeline predictions as a REST API

    PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
    """
    pipeline = Pipeline.from_pretrained(pipeline_path)

    if predictions_dir:
        pipeline.init_prediction_logger(predictions_dir)

    pipeline.serve(port)
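# Hypothetical invocation and client call; the binary name, port, and the
# REST route are assumptions, not confirmed by the source:
#
#   <cli> serve path/to/model.tar.gz --port 9999
#   curl -X POST http://localhost:9999/predict \
#       -H "Content-Type: application/json" \
#       -d '{"text": "The most common audits were about waste and recycling"}'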
def test_save(pipeline, tmp_path):
    pipeline.save(tmp_path)
    assert (tmp_path / "model.tar.gz").is_file()

    expected_prediction = pipeline.predict("test")
    prediction = Pipeline.from_pretrained(tmp_path / "model.tar.gz").predict("test")

    assert prediction["labels"] == expected_prediction["labels"]
    assert_allclose(prediction["probabilities"], expected_prediction["probabilities"])
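# A minimal sketch of the `pipeline` fixture assumed by test_save above.
# The configuration values are illustrative assumptions patterned on the
# other pipelines in this section, not the original fixture:
@pytest.fixture
def pipeline() -> Pipeline:
    return Pipeline.from_config(
        {
            "name": "test_pipeline",
            "features": {"word": {"embedding_dim": 2}},
            "head": {"type": "TextClassification", "labels": ["a", "b"]},
        }
    )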
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0)
    trainer = Trainer(
        pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config
    )
    trainer.fit(output_path)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(output_path / "model.tar.gz")
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
def test_train_from_pretrained(pipeline, dataset, tmp_path):
    output_path = tmp_path / "test_train_from_pretrained_output"
    trainer_config = TrainerConfiguration(num_epochs=1, batch_size=2, cuda_device=-1)
    pipeline.train(output=str(output_path), training=dataset, trainer=trainer_config)

    prediction = pipeline.predict("a test")
    pipeline_loaded = Pipeline.from_pretrained(str(output_path))
    prediction_loaded = pipeline_loaded.predict("a test")

    assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"])
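# A minimal sketch of the `dataset` fixture used by the two tests above.
# It assumes the library's Dataset class exposes a from_dict constructor;
# both the constructor and the column values are assumptions:
@pytest.fixture
def dataset() -> Dataset:
    return Dataset.from_dict(
        {"text": ["a test", "another test"], "label": ["pos", "neg"]}
    )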
def test_train(pipeline_dict, training_data_source, trainer_dict, tmp_path):
    pipeline = Pipeline.from_config(pipeline_dict)
    pipeline.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )

    pipeline.create_vocabulary(VocabularyConfiguration(sources=[training_data_source]))
    pipeline.train(
        output=str(tmp_path / "relation_classifier"),
        trainer=TrainerConfiguration(**trainer_dict),
        training=training_data_source,
        validation=training_data_source,
    )

    pl_trained = Pipeline.from_pretrained(str(tmp_path / "relation_classifier"))
    pl_trained.predict(
        text="The most common audits were about waste and recycling",
        entities=[
            {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"},
            {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"},
        ],
    )
def test_text_classification(
    tmp_path, pipeline_dict, trainer_dict, train_valid_dataset
):
    """Apart from a well-specified training, this also tests the vocab creation!"""
    random.seed(42)
    np.random.seed(422)
    torch.manual_seed(4222)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(4222)

    pl = Pipeline.from_config(pipeline_dict)
    train_ds = train_valid_dataset[0]
    valid_ds = train_valid_dataset[1]
    trainer = TrainerConfiguration(**trainer_dict)
    vocab_config = VocabularyConfiguration(
        datasets=[train_ds], max_vocab_size={"word": 50}
    )

    output = tmp_path / "output"
    pl.train(
        output=str(output),
        trainer=trainer,
        training=train_ds,
        validation=valid_ds,
        vocab_config=vocab_config,
    )

    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
    assert pl.num_trainable_parameters == 22070

    with (output / "metrics.json").open() as file:
        metrics = json.load(file)

    # This check may fail on some systems
    assert metrics["training_loss"] == pytest.approx(0.684, abs=0.003)

    # Test the vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52
    assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83
def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset):
    vocab_config = VocabularyConfiguration(max_vocab_size=1)
    my_exp = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        vocab_config=vocab_config,
        name="test_vocab_config",
        local_dir=str(tmp_path),
    )

    analysis = tune.run(my_exp)

    pl = Pipeline.from_pretrained(
        Path(analysis.get_best_logdir("validation_loss", "min"))
        / "output"
        / "model.tar.gz"
    )
    assert pl.vocab.get_vocab_size("word") == 3
def test_pure_transformers(tmp_path, pipeline_dict, trainer_dict, train_dataset):
    """Testing a Transformer training process and a model load"""
    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)

    # Check a fixed vocabulary size for the model
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996

    pl.predict(text="test")

    output = tmp_path / "output"
    pl.train(output=str(output), trainer=trainer, training=train_dataset)

    # Test the vocabulary from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))

    # Check a fixed vocabulary size for the model after loading
    assert pl.backbone.vocab.get_vocab_size("transformers") == 28996
def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path):
    pipeline = Pipeline.from_config(pipeline_config)
    output = tmp_path / "pretrained_word_vector_output"
    pipeline.train(
        output=str(output),
        training=dataset,
        trainer=TrainerConfiguration(num_epochs=1, cuda_device=-1),
    )

    instance = pipeline.head.featurize("test")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0),
        torch.tensor([[0.66, 0.33]]),
    )

    # Loading a pretrained model without the weights file should work
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")), Pipeline)
def train(
    pipeline_path: str,
    output: str,
    trainer: str,
    training: str,
    validation: Optional[str] = None,
    test: Optional[str] = None,
) -> None:
    """Train a pipeline.

    PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file),
    or the path to a pipeline configuration (YAML file).
    """
    _, extension = os.path.splitext(pipeline_path)
    extension = extension[1:].lower()
    pipeline = (
        Pipeline.from_yaml(pipeline_path)
        if extension in ["yaml", "yml"]
        else Pipeline.from_pretrained(pipeline_path)
    )

    datasets = {
        "train": dataset_from_path(training),
        "validation": dataset_from_path(validation) if validation else None,
        "test": dataset_from_path(test) if test else None,
    }

    pipeline.create_vocabulary(
        VocabularyConfiguration(
            sources=[dataset for dataset in datasets.values() if dataset]
        ),
    )

    pipeline.train(
        output=output,
        trainer=TrainerConfiguration(**yaml_to_dict(trainer)),
        training=datasets["train"],  # key must match the datasets dict above
        validation=datasets["validation"],
        test=datasets["test"],
    )
def evaluate(
    pipeline_path: str,
    output: str,
    dataset: str,
    batch_size: int = 16,
    lazy: bool = False,
    prediction_output: Optional[str] = None,
) -> None:
    """Evaluate a pipeline on a given dataset.

    PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
    """
    pipeline = Pipeline.from_pretrained(pipeline_path)
    dataset = dataset_from_path(dataset)

    pipeline.evaluate(
        dataset,
        batch_size=batch_size,
        lazy=lazy,
        predictions_output_file=prediction_output,
        metrics_output_file=output,
    )
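# Hypothetical invocation (binary name and flags are illustrative assumptions):
#
#   <cli> evaluate path/to/model.tar.gz \
#       --output metrics.json \
#       --dataset test.json \
#       --batch-size 16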
def test_train(tmp_path, pipeline_dict, trainer_dict, train_data_source):
    pl = Pipeline.from_config(pipeline_dict)
    trainer = TrainerConfiguration(**trainer_dict)
    vocab = VocabularyConfiguration(sources=[train_data_source])
    pl.create_vocabulary(vocab)

    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265

    pl.predict(text="test")

    output = tmp_path / "output"
    training_results = pl.train(
        output=str(output),
        trainer=trainer,
        training=train_data_source,
    )

    # test vocab from a pretrained file
    pl = Pipeline.from_pretrained(str(output / "model.tar.gz"))
    assert pl.backbone.vocab.get_vocab_size("transformers") == 50265
def explore(pipeline_path: str, data_source: str, explain: bool, es_host: str) -> None:
    Pipeline.from_pretrained(pipeline_path).explore(
        data_source=DataSource.from_yaml(data_source),
        es_host=es_host,
        explain=explain,
    )