def test_learner_datasets(self):
    """Train one preprocessor+estimator pipeline per dataset and require each
    to reach its target training score; the held-out score is only printed."""
    opts = dict(random_seed=0)
    cases = [
        (MLPRegressor, load_boston(**opts), 0.85),  # Boston housing dataset
        (MLPClassifier, load_titanic(**opts), 0.75),  # Titanic dataset
    ]
    for learner, (dataset, test_ds), target_score in cases:
        steps = [
            PipelineStep(
                name="preprocessor",
                learner=StandardPreprocessor,
                kwargs={
                    "continuous": dataset.continuous,
                    "categorical": dataset.categorical,
                },
            ),
            PipelineStep(name="estimator", learner=learner, kwargs=opts),
        ]
        pipeline = Pipeline(steps)
        # Training stops once target_score is reached (or the learner gives up).
        history = pipeline.train(dataset.input_fn, max_score=target_score, progress=True)
        test_score = pipeline.score(*test_ds[:])
        best_score = max(history.scores)
        self.assertGreaterEqual(best_score, target_score, dataset.name)
        print("%s\t%.3f\t%.3f" % (dataset.name, best_score, test_score))
def test_chaining_feature_drop_encoder_onehot_scaler_variance_threshold(self):
    """Smoke-test every length-5 multiset of transformer steps chained in a Pipeline.

    The input mixes a constant column, three continuous columns, and one
    integer-categorical column. Each candidate pipeline is fit and applied;
    on failure the offending pipeline's steps are dumped before re-raising.
    """
    n = 100
    nums = list(range(10))
    data_0 = [numpy.ones(n) * -1]
    data_1 = [numpy.random.random(n) * 10 for _ in range(3)]
    data_2 = [generate_array_uints(n=n, max_int=10, random_seed=0)]
    data_concat = data_0 + data_1 + data_2
    all_steps = (
        PipelineStep(learner=DummyTransformer, kwargs={}),
        PipelineStep(learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(learner=MinMaxScaler, kwargs={"columns": [1, 2, 3]}),
        PipelineStep(learner=LabelEncoder, kwargs={"columns": {0: [], 4: nums}}),
        PipelineStep(learner=VarianceThreshold, kwargs={"columns": [1, 2, 3]}),
    )
    # Every multiset of 5 steps drawn (with repetition) from the 5 candidates.
    transformers = itertools.combinations_with_replacement(all_steps, len(all_steps))
    for steps in transformers:
        pipeline = Pipeline(steps)
        try:
            pipeline.fit(data_concat).transform(data_concat)
        except Exception:
            print("Pipeline failed")
            for step in pipeline.steps:
                print(step, getattr(step[1], "classes_", None))
            # Bare ``raise`` re-raises the in-flight exception with its original
            # traceback; the previous ``raise (ex)`` appended this frame too.
            raise
def test_pipeline_builtin(self):
    """Generic learner check for transformer-only, estimator-only, and combined pipelines."""
    cases = (
        [PipelineStep(name="transformer", learner=DummyTransformer)],
        [PipelineStep(name="estimator", learner=DummyRegressor)],
        [
            PipelineStep(name="transformer", learner=DummyTransformer),
            PipelineStep(name="estimator", learner=DummyRegressor),
        ],
    )
    for steps in cases:
        self.assertTrue(test_learner(Pipeline, steps=steps))
def test_fit_pipeline(self):
    """fit() must return the pipeline instance itself, whatever the steps."""
    pipelines = [
        Pipeline([PipelineStep(name="transformer", learner=DummyTransformer)]),
        Pipeline([PipelineStep(name="estimator", learner=DummyRegressor)]),
        Pipeline([
            PipelineStep(name="transformer", learner=DummyTransformer),
            PipelineStep(name="estimator", learner=DummyRegressor),
        ]),
    ]
    data, target = numpy.random.random(10), numpy.ones(10)
    for pipeline in pipelines:
        fitted = pipeline.fit(data, y=target)
        self.assertEqual(pipeline, fitted)
def test_feature_drop_thrice_different(self):
    """Dropping columns 0, 1 and 2 in sequence leaves only the last column (value 3)."""
    n = 100
    data = [numpy.ones(n) * i for i in range(4)]
    # Step k (named k+1) drops column index k.
    steps = [
        PipelineStep(name=idx + 1, learner=FeatureDrop, kwargs={"columns": [idx]})
        for idx in range(3)
    ]
    pipeline = Pipeline(steps)
    output = pipeline.fit(data).transform(data)
    self.assertEqual(1, len(output))
    for i in range(1):
        self.assertListEqual((numpy.ones(n) * i + 3).tolist(), output[i].tolist())
def test_feature_drop_twice_same(self):
    """Dropping column 0 twice: the second drop targets a gone column and is a no-op."""
    n = 100
    data = [numpy.ones(n) * i for i in range(4)]
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(name=2, learner=FeatureDrop, kwargs={"columns": [0]}),
    ])
    # The expectation is that dropping a column that no longer exists is a no-op
    output = pipeline.fit(data).transform(data)
    self.assertEqual(3, len(output))
    for i, column in enumerate(output):
        self.assertListEqual((numpy.ones(n) * i + 1).tolist(), column.tolist())
def test_fit_predict_pipeline(self):
    """A DummyRegressor pipeline predicts exactly the targets it was fit on."""
    pipeline = Pipeline([PipelineStep(name="estimator", learner=DummyRegressor)])
    features = numpy.random.random(10)
    targets = numpy.ones(10)
    predictions = pipeline.fit(features, targets).predict(features)
    self.assertTrue(numpy.array_equal(targets, predictions))
def test_fit_transform_pipeline(self):
    """A DummyTransformer pipeline transforms data to itself (identity)."""
    pipeline = Pipeline([PipelineStep(name="transformer", learner=DummyTransformer)])
    original = numpy.random.random(10)
    transformed = pipeline.fit(original).transform(original)
    self.assertTrue(numpy.array_equal(original, transformed))
def test_feature_drop_following_encoder(self):
    """A LabelEncoder downstream of a FeatureDrop still encodes the right column."""
    n = 100
    nums = list(range(10))
    continuous = [numpy.random.random(n) * 10 for _ in range(4)]
    categorical = [generate_array_uints(n=n, max_int=10, random_seed=0)]
    data_concat = continuous + categorical
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(name=2, learner=LabelEncoder, kwargs={"columns": {0: nums, 4: nums}}),
    ])
    output = pipeline.fit(data_concat).transform(data_concat)
    # One column dropped from five; the last column holds encoded labels >= 1.
    self.assertEqual(4, len(output))
    self.assertGreaterEqual(output[-1].max(), 1)
def test_feature_drop_following_scaler(self):
    """MinMaxScaler columns are re-indexed after an upstream FeatureDrop."""
    n = 100
    constant = [numpy.ones(n) * -1]
    continuous = [numpy.random.random(n) * 10 for _ in range(4)]
    data_concat = constant + continuous
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(name=2, learner=MinMaxScaler, kwargs={"columns": [1, 2, 3]}),
    ])
    output = pipeline.fit(data_concat).transform(data_concat)
    self.assertEqual(4, len(output))
    # The last column was not scaled; every other column was min-max scaled to [0, 1].
    self.assertGreaterEqual(output[-1].max(), 1)
    for i, col in enumerate(output[:-1]):
        self.assertLessEqual(col.max(), 1, "Column %d" % i)
def test_feature_drop_twice_different(self):
    """Two successive drops of different columns leave the last two columns."""
    n = 100
    data = [numpy.ones(n) * i for i in range(4)]
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(name=2, learner=FeatureDrop, kwargs={"columns": [1]}),
    ])
    output = pipeline.fit(data).transform(data)
    self.assertEqual(2, len(output))
    for i, column in enumerate(output):
        self.assertListEqual((numpy.ones(n) * i + 2).tolist(), column.tolist())
    # Send input change event
    change_map = ChangeMap(len(output), idx_add=[0], idx_del=[0])
    pipeline.on_input_shape_changed(change_map)
    # Refitting after the shape-change notification must not raise.
    output = pipeline.fit(data).transform(data)
def test_encoder_onehot_following_feature_drop(self):
    """OneHotEncoder expands the surviving categorical column after a FeatureDrop."""
    n = 100
    nums = list(range(10))
    categorical = [generate_array_uints(n=n, max_int=10, random_seed=0)]
    continuous = [numpy.random.random(n) for _ in range(4)]
    data_concat = categorical + continuous
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [1, 2, 3]}),
        PipelineStep(name=2, learner=OneHotEncoder, kwargs={"columns": {0: nums}}),
    ])
    output = pipeline.fit(data_concat).transform(data_concat)
    # One indicator column per category plus the untouched continuous column.
    self.assertEqual(len(nums) + 1, len(output))
    for i, col in enumerate(output[:-1]):
        self.assertGreaterEqual(col.max(), 1, i)
    self.assertLessEqual(output[-1].max(), 1)
def test_feature_drop_following_scaler_then_encoder(self):
    """Drop, scale, then label-encode: all downstream column indices stay aligned."""
    n = 100
    nums = list(range(10))
    constant = [numpy.ones(n) * -1]
    continuous = [numpy.random.random(n) * 10 for _ in range(3)]
    categorical = [generate_array_uints(n=n, max_int=10, random_seed=0)]
    data_concat = constant + continuous + categorical
    pipeline = Pipeline([
        PipelineStep(name=1, learner=FeatureDrop, kwargs={"columns": [0]}),
        PipelineStep(name=2, learner=MinMaxScaler, kwargs={"columns": [1, 2, 3]}),
        PipelineStep(name=3, learner=LabelEncoder, kwargs={"columns": {4: nums}}),
    ])
    output = pipeline.fit(data_concat).transform(data_concat)
    self.assertEqual(4, len(output))
    # First surviving column is unscaled; the middle columns were min-max scaled.
    self.assertGreaterEqual(output[0].max(), 1)
    for col in output[1:-1]:
        self.assertLessEqual(col.max(), 1)
def test_inverse_transform_pipeline(self):
    """inverse_transform requires a prior fit; for DummyTransformer it is identity."""
    pipeline = Pipeline([PipelineStep(name="transformer", learner=DummyTransformer)])
    original = numpy.random.random(10)
    # Calling inverse transform without fitting first should fail
    self.assertRaises(AssertionError, lambda: pipeline.inverse_transform(original))
    transformed = pipeline.fit(original).transform(original)
    restored = pipeline.inverse_transform(original)
    self.assertTrue(numpy.array_equal(original, transformed))
    self.assertTrue(numpy.array_equal(original, restored))
def test_create_pipeline(self):
    """Constructing a one-step pipeline yields a truthy object."""
    steps = [PipelineStep(name="transformer", learner=DummyTransformer)]
    self.assertTrue(Pipeline(steps))
def test_iter_parameters_pipeline(self):
    """_iter_parameters expands the hyperparameter grid into one steps-list per combination."""
    step = PipelineStep(name="1", learner=CustomEstimator,
                        kwargs=CustomEstimator.hyperparameters(None))
    params = Pipeline.hyperparameters(None, [step])
    # Cartesian product of a in {1, 2, 3} (slow axis) and b in {"x", "y", "z"}
    # (fast axis) — nine combinations in grid order.
    expected = [
        {
            "steps": [
                PipelineStep(name="1", learner=CustomEstimator,
                             kwargs={"a": a, "b": b})
            ]
        }
        for a in (1, 2, 3)
        for b in ("x", "y", "z")
    ]
    result = list(_iter_parameters(Pipeline, params))
    self.assertListEqual(expected, result)