def test_pipeline_runner_scale_datasets():
    runner = PipelineRunner()
    x_tr = pd.DataFrame(np.random.random((20, 20)))
    x_val = pd.DataFrame(np.random.random((10, 20)))

    x_tr_scaled, x_val_scaled = runner.scale_datasets(x_tr, x_val)

    assert x_tr_scaled.any()
    assert x_val_scaled.any()
def test_save_results():
    runner = PipelineRunner()
    path = Path("test.pkl")
    results = TrainingResults(model_path=path)

    runner.save_results(results)

    assert path.exists()
    os.remove(path)
def test_get_model_path():
    runner = PipelineRunner()
    model = BenchmarkModel()
    dataset = BenchmarkDataset()

    path = runner.get_model_path(model=model, dataset=dataset)

    assert path
    assert model.version in path.stem
    assert dataset.version in path.stem
def test_pipeline_runner_build_datasets(sample_ids, tiny_files_structure):
    runner = PipelineRunner(file_structure=tiny_files_structure)

    X_train, y_train, X_val, y_val = runner.build_datasets(
        BenchmarkDataset(), sample_ids[:-2], sample_ids[-2:]
    )

    assert X_train.any()
    assert X_val.any()
    assert y_train.any()
    assert y_val.any()
def test_add_info_to_results(sample_ids):
    runner = PipelineRunner()
    ds, model = BenchmarkDataset(), BenchmarkModel()
    results = TrainingResults()

    runner.add_information_to_results(results, ds, model, sample_ids, sample_ids)

    assert results.model_path
    assert results.train_ids.any()
    assert results.val_ids.any()
    assert results.dataset_version
def test_evaluate_validation_set():
    runner = PipelineRunner()
    x_val = np.random.random((10, 20))
    y_val = np.random.random((10, 5))
    results = TrainingResults(model=BenchmarkModel())
    # Stub out predict so the test does not depend on a trained model.
    results.model.predict = lambda *args: np.random.random((10, 5))

    runner.evaluate_validation_set(results, x_val, y_val)

    assert results.validation_weighted_mae
    assert results.validation_mae
def test_multiple_model_runner_run_pipelines(sample_ids, tiny_files_structure):
    training_list = [(BenchmarkDataset(), BenchmarkModel())] * 2
    runner = PipelineRunner(tiny_files_structure)
    # Stub the splitter so both pipelines train and validate on the same ids.
    runner.splitter.split = lambda *args: (sample_ids, sample_ids)
    multi_runner = MultipleModelRunner(training_list)

    results = multi_runner.run_multiple_pipelines(sample_ids, runner, 0.5)

    assert results
def test_pipeline_runner_run_calls(tiny_files_structure, sample_ids):
    runner = PipelineRunner(file_structure=tiny_files_structure)
    # Replace collaborators with mocks so only the orchestration calls are verified.
    runner.ds_builder = Mock(spec=DatasetBuilder)
    runner.model_trainer = Mock(spec=ModelTrainer)
    runner.get_model_path = Mock(spec=runner.get_model_path)
    runner.build_datasets = Mock(spec=runner.build_datasets, return_value=(0, 0, 0, 0))
    runner.splitter.split = Mock(spec=runner.splitter.split, return_value=(0, 0))
    runner.evaluate_validation_set = Mock(spec=runner.evaluate_validation_set)

    runner.run_pipeline(sample_ids, dataset=BenchmarkDataset(), model=BenchmarkModel())

    runner.splitter.split.assert_called_once_with(sample_ids, 0.2)
    runner.build_datasets.assert_called_once()
    runner.get_model_path.assert_called_once()
    runner.model_trainer.train_model.assert_called_once()
    runner.evaluate_validation_set.assert_called_once()