def test_max_depth(self):
    """Formula-feature depth must stay within the configured budget.

    Runs a full pipeline with a depth-limited FormulaFeatureGenerator,
    then checks (a) the dataset can be reproduced from raw data via
    Preprocessing.reproduce and (b) no generated feature exceeds
    ``max_depth + addition * epochs``.
    """
    model_list = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]
    max_depth = 2
    addition = 0.25
    epochs = 20
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    context, pipeline_data = LocalExecutor(data, epochs) << (
        Pipeline()
        >> ModelSpace(model_list)
        >> FormulaFeatureGenerator(['+', '-', '*', '/'],
                                   max_depth=max_depth,
                                   addition=addition)
        >> Validate(test_size=0.33, metrics=mean_squared_error)
        >> ChooseBest(1)
        >> FeatureSelector(30))
    # Each meta entry records the formula depth of its feature.
    feature_depths = [feature["depth"] for feature in pipeline_data.dataset.meta]
    final_data = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
    self.assertLessEqual(max(feature_depths), max_depth + addition * epochs)
def test_RFS(self):
    """RecursiveFeatureSelector output must be reproducible on fresh data."""
    x, y = make_classification(n_samples=100,
                               n_features=40,
                               n_informative=2,
                               n_redundant=10,
                               flip_y=0.05)
    model_list = [(RandomForestClassifier, {}),
                  (GradientBoostingClassifier, {}),
                  (SVC, {}),
                  (KNeighborsClassifier, {}),
                  (XGBClassifier, {})]
    n_features_to_select = random.randint(5, 30)
    pipeline = (Pipeline()
                >> PipelineStep('model space', ModelSpace(model_list),
                                initializer=True)
                >> FormulaFeatureGenerator(['+', '-', '*', '/'])
                >> Validate(test_size=0.1, metrics=roc_auc_score)
                >> ChooseBest(1)
                >> RecursiveFeatureSelector(
                    n_features_to_select=n_features_to_select))
    context, pipeline_data = LocalExecutor(Dataset(x, y), 2) << pipeline
    # Re-applying the recorded transformations to the raw data must yield
    # exactly the dataset the pipeline produced.
    final_data = Preprocessing().reproduce(pipeline_data.dataset,
                                           Dataset(x, y))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
def test_correlated_feature_generator(self):
    """CorrelatedFeatureSelector after RFS must keep the dataset reproducible."""
    model_list = [(RandomForestRegressor, {}),
                  (GradientBoostingRegressor, {}),
                  (SVR, {}),
                  (XGBRegressor, {})]
    n_features_to_select = random.randint(5, 30)
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> PipelineStep('model space', ModelSpace(model_list),
                                initializer=True)
                >> FormulaFeatureGenerator(['+', '-', '*', '/'], limit=10)
                # Lower error is better, hence by_largest_score=False.
                >> Validate(test_size=0.1, metrics=mean_absolute_error)
                >> ChooseBest(1, by_largest_score=False)
                >> RecursiveFeatureSelector(
                    n_features_to_select=n_features_to_select)
                >> CorrelatedFeatureSelector(max_correlation=0.9))
    context, pipeline_data = LocalExecutor(data, 5) << pipeline
    final_data = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
def test_voting_feature_selector(self):
    """VotingFeatureSelector over the 4 best models keeps the dataset
    reproducible via Preprocessing.reproduce.

    Fix: removed the unused locals ``result_mult`` and ``result_div`` —
    they were initialized to empty lists and never read or written again.
    """
    x, y = make_regression(n_samples=100, n_features=40, n_informative=2)
    model_list = [(RandomForestRegressor, {}),
                  (GradientBoostingRegressor, {}),
                  (SVR, {}),
                  (XGBRegressor, {})]
    data = Dataset(x, y)
    context, pipeline_data = LocalExecutor(data, 10) << (
        Pipeline()
        >> PipelineStep('model space', ModelSpace(model_list),
                        initializer=True)
        >> FormulaFeatureGenerator(['+', '-', '*', '/'])
        # MAE: smaller is better, so invert score ordering.
        >> Validate(test_size=0.1, metrics=mean_absolute_error)
        >> ChooseBest(4, by_largest_score=False)
        >> VotingFeatureSelector(feature_to_select=10, reverse_score=True))
    preprocessing = Preprocessing()
    final_data = preprocessing.reproduce(pipeline_data.dataset,
                                         Dataset(x, y))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
    print('0' * 30)
    for result in pipeline_data.return_val:
        print(result.model, result.score)
    print(pipeline_data.dataset.data.shape)
    print('0' * 30)
def test_pipeline_hyperopt(self):
    """Hyperopt-wrapped Validate runs end-to-end inside a pipeline.

    Fix: the original was a pure smoke test with debug prints and no
    assertion at all; it now asserts that ChooseBest produced at least
    one result in ``return_val``.
    """
    x, y = make_classification(n_samples=100,
                               n_features=40,
                               n_informative=2,
                               n_redundant=10,
                               flip_y=0.05)
    model_list = [(RandomForestClassifier, random_forest_hp_space()),
                  (GradientBoostingClassifier, grad_boosting_hp_space()),
                  (SVC, svc_kernel_hp_space('rbf')),
                  (KNeighborsClassifier, knn_hp_space()),
                  (XGBClassifier, xgboost_hp_space())]
    data = Dataset(x, y)
    context, pipeline_data = LocalExecutor(data, 2) << (
        Pipeline()
        >> PipelineStep('model space', ModelSpace(model_list),
                        initializer=True)
        >> FormulaFeatureGenerator(['+', '-', '*'])
        >> Hyperopt(Validate(test_size=0.1, metrics=roc_auc_score),
                    max_evals=2)
        >> ChooseBest(1)
        >> FeatureSelector(10))
    # ChooseBest(1) should leave at least one scored model in return_val.
    self.assertGreaterEqual(len(pipeline_data.return_val), 1)
    print('0' * 30)
    for result in pipeline_data.return_val:
        print(result.model, result.score)
    print(pipeline_data.dataset.data.shape)
    print('0' * 30)
def test_pipeline(self):
    """A single PolynomialGenerator step expands 2 input columns to 4."""
    frame = pd.DataFrame([[1, 2], [3, 4]])
    generator = PolynomialGenerator(interaction_only=True, degree=4)
    steps = Pipeline() >> PipelineStep('generate_features', generator)
    context, pipe_output = LocalExecutor(Dataset(frame, None)) << steps
    self.assertEqual(pipe_output.dataset.data.shape, (2, 4))
def test_recovering_dataset_FFG(self):
    """A FormulaFeatureGenerator pipeline must be replayable on raw data."""
    model_list = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> ModelSpace(model_list)
                >> FormulaFeatureGenerator(['+', '-', '*'])
                >> Validate(test_size=0.33, metrics=mean_squared_error)
                >> ChooseBest(1)
                >> FeatureSelector(30))
    context, pipeline_data = LocalExecutor(data, 10) << pipeline
    # Replaying the recorded feature transformations on a fresh copy of
    # the raw data must reproduce the pipeline's final dataset exactly.
    final_data = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
def test_generate_formula_feature(self):
    """The generator may add at most ``limit`` new formula features."""
    raw_features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    pipeline_data = PipelineData(Dataset(pd.DataFrame(raw_features), None))
    limit = random.randint(0, 100)
    generator = FormulaFeatureGenerator(['+', '*', '/', '-'], limit)
    context = PipelineContext()
    n_columns = generator(pipeline_data, context).dataset.data.shape[1]
    # Original column count plus at most `limit` generated columns.
    self.assertLessEqual(n_columns, np.array(raw_features).shape[1] + limit)
def test_xgboost(self):
    """Hyperopt over an XGBoost space records exactly max_evals trials."""
    max_evals = 2
    x, y = make_classification()
    dataset = Dataset(x, y)
    pipeline = (Pipeline()
                >> ModelSpace([(XGBClassifier, xgboost_hp_space())])
                >> Hyperopt(CV('roc_auc'), max_evals=max_evals))
    executed = LocalExecutor(dataset) << pipeline
    search_result = executed[1].return_val[0]
    self.assertIsInstance(search_result, HyperparameterSearchResult)
    self.assertEqual(len(search_result.history), max_evals)
def test_hyperopt(self):
    """Hyperopt stores its trial history as hyperopt.base.Trials."""
    max_evals = 2
    x, y = make_classification()
    dataset = Dataset(x, y)
    pipeline = (Pipeline()
                >> ModelSpace([(RandomForestClassifier,
                                random_forest_hp_space())])
                >> Hyperopt(CV('roc_auc'), max_evals=max_evals))
    executed = LocalExecutor(dataset) << pipeline
    search_result = executed[1].return_val[0]
    self.assertIsInstance(search_result.history, hyperopt.base.Trials)
    self.assertEqual(len(search_result.history), max_evals)
def test_random_choice_combinator(self):
    """RandomChoice must execute exactly one of its candidate steps.

    Fix: removed the ``data`` Dataset that was rebuilt on every iteration
    but never passed to the executor, and the stray debug ``print``.
    """
    for _ in range(10):
        result = LocalExecutor() << (Pipeline() >> RandomChoice([
            PipelineStep('a', lambda x, context: 'a'),
            PipelineStep('b', lambda x, context: 'b'),
        ]))
        # result[1] is the pipeline's return value: whichever step ran.
        self.assertIn(result[1], ['a', 'b'])
def test_poly_gen(self):
    """PolynomialFeatureGenerator output must be reproducible on raw data."""
    model_list = [(Lasso, {}), (RandomForestRegressor, {})]
    X, y = datasets.make_regression(n_features=5)
    pipeline = (Pipeline()
                >> ModelSpace(model_list)
                >> PolynomialFeatureGenerator(max_degree=3)
                >> Validate(test_size=0.33, metrics=mean_squared_error)
                >> ChooseBest(1)
                >> FeatureSelector(10))
    context, pipeline_data = LocalExecutor(Dataset(X, y), 10) << pipeline
    final_data = Preprocessing().reproduce(pipeline_data.dataset,
                                           Dataset(X, y))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
def test_step_space_regression_model(self):
    """Smoke test: linear/kernel regressors run through Validate + ChooseBest."""
    model_list = [
        (Lasso, {}),
        (Ridge, {}),
        (KernelRidge, {}),
    ]
    boston = datasets.load_boston()
    data = Dataset(boston.data, boston.target)
    pipeline = (Pipeline()
                >> ModelSpace(model_list)
                >> Validate(test_size=0.33, metrics=mean_absolute_error)
                >> ChooseBest(3))
    LocalExecutor(data) << pipeline
def test_forest(self):
    """Smoke test: Hyperopt tunes a random forest on the iris dataset."""
    model_list = [(RandomForestClassifier, random_forest_hp_space())]
    iris = datasets.load_iris()
    data = Dataset(iris.data, iris.target)
    search_step = PipelineStep(
        'H',
        Hyperopt(Validate(test_size=0.33, metrics=mean_absolute_error),
                 max_evals=2))
    context, pipeline_data = LocalExecutor(data, 1) << (
        Pipeline()
        >> PipelineStep('model space', ModelSpace(model_list))
        >> search_step)
def test_step_cv(self):
    """Smoke test: five classifier families run through CV + ChooseBest."""
    model_list = [
        (LogisticRegression, {}),
        (RandomForestClassifier, {'n_estimators': 100}),
        (GradientBoostingClassifier, {}),
        (SVC, {}),
        (KNeighborsClassifier, {}),
    ]
    iris = datasets.load_iris()
    data = Dataset(iris.data, iris.target)
    pipeline = (Pipeline()
                >> ModelSpace(model_list)
                >> CV('accuracy')
                >> ChooseBest(3))
    LocalExecutor(data) << pipeline
def test_cv(self):
    """The CV step's score must equal sklearn's cross_val_score mean."""
    iris = datasets.load_iris()
    pipeline_data = PipelineData(Dataset(iris.data, iris.target))
    cv = CV('accuracy', n_folds=5)
    # Fixed random_state so both sides train identical forests.
    step_score = cv(pipeline_data,
                    (RandomForestClassifier, {'random_state': 1})).score
    expected = cross_val_score(RandomForestClassifier(random_state=1),
                               pipeline_data.dataset.data,
                               pipeline_data.dataset.target,
                               cv=5).mean()
    self.assertAlmostEqual(step_score, expected)
def test_call_generator(self):
    """SklearnFeatureGenerator must pass the raw value matrix to the
    wrapped transformer's ``fit_transform``.

    Fix: ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and
    removed in pandas 1.0; ``DataFrame.to_numpy()`` is the supported
    replacement and yields the same ndarray of values.
    """
    transformer_mock = Mock()
    transformer_mock.fit_transform.return_value = []
    df = pd.DataFrame([[1, 2], [3, 4]])
    X = PipelineData(Dataset(df, None))
    context = PipelineContext()
    # The generator expects a transformer factory, so hand it a callable
    # that always returns the mock.
    gen = SklearnFeatureGenerator(lambda *args, **kwargs: transformer_mock)
    gen(X, context)
    transformer_mock.fit_transform.assert_called()
    self.assertTrue(
        (transformer_mock.fit_transform.call_args[0][0] == df.to_numpy())
        .all())
def test_all_step(self):
    """End-to-end pipeline over regressors with formula features.

    Fix: the original only printed debug output and asserted nothing;
    it now checks ChooseBest left at least one scored model. Also
    dropped the block of commented-out model entries.
    """
    model_list = [
        (RandomForestRegressor, {}),
        (XGBRegressor, {}),
    ]
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    context, pipeline_data = LocalExecutor(data, 10) << (
        Pipeline()
        >> ModelSpace(model_list)
        >> FormulaFeatureGenerator(['+', '-', '*'])
        # MSE: smaller is better, so invert score ordering.
        >> Validate(test_size=0.33, metrics=mean_squared_error)
        >> ChooseBest(1, by_largest_score=False)
        >> FeatureSelector(20))
    # ChooseBest(1) should leave at least one scored model in return_val.
    self.assertGreaterEqual(len(pipeline_data.return_val), 1)
    print('0' * 30)
    for result in pipeline_data.return_val:
        print(result.model, result.score)
    print(pipeline_data.dataset.data.shape)
    print('0' * 30)