def test_pipeline_step(self):
    """Two chained steps applied to input 0 add 1 then 2, yielding 3."""
    add_one = PipelineStep('a', lambda x, context: PipelineData(x.dataset + 1))
    add_two = PipelineStep('b', lambda x, context: PipelineData(x.dataset + 2))
    pipeline = Pipeline() >> add_one >> add_two
    context, data = LocalExecutor().run(pipeline, 0)
    self.assertEqual(data.dataset, 3)
def test_max_depth(self):
    """Generated-feature depth stays within max_depth + addition * epochs,
    and the final matrix is reproducible from raw data via Preprocessing."""
    regressors = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]
    max_depth = 2
    addition = 0.25
    epochs = 20
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this test
    # assumes an older pinned scikit-learn; confirm against requirements.
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> ModelSpace(regressors)
                >> FormulaFeatureGenerator(['+', '-', '*', '/'],
                                           max_depth=max_depth,
                                           addition=addition)
                >> Validate(test_size=0.33, metrics=mean_squared_error)
                >> ChooseBest(1)
                >> FeatureSelector(30))
    context, pipeline_data = LocalExecutor(data, epochs) << pipeline
    depths = [feature["depth"] for feature in pipeline_data.dataset.meta]
    # Replay the recorded transformations on a fresh copy of the raw data
    # and check they rebuild the pipeline's final feature matrix exactly.
    reproduced = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, reproduced.shape)
    self.assertTrue((reproduced == pipeline_data.dataset.data).all())
    self.assertLessEqual(max(depths), max_depth + addition * epochs)
def test_RFS(self):
    """RecursiveFeatureSelector output must be reproducible by Preprocessing."""
    x, y = make_classification(n_samples=100,
                               n_features=40,
                               n_informative=2,
                               n_redundant=10,
                               flip_y=0.05)
    classifiers = [(RandomForestClassifier, {}),
                   (GradientBoostingClassifier, {}),
                   (SVC, {}),
                   (KNeighborsClassifier, {}),
                   (XGBClassifier, {})]
    n_features_to_select = random.randint(5, 30)
    pipeline = (Pipeline()
                >> PipelineStep('model space', ModelSpace(classifiers),
                                initializer=True)
                >> FormulaFeatureGenerator(['+', '-', '*', '/'])
                >> Validate(test_size=0.1, metrics=roc_auc_score)
                >> ChooseBest(1)
                >> RecursiveFeatureSelector(
                    n_features_to_select=n_features_to_select))
    context, pipeline_data = LocalExecutor(Dataset(x, y), 2) << pipeline
    # Replaying the recorded transforms on fresh raw data must rebuild the
    # selected feature matrix exactly.
    reproduced = Preprocessing().reproduce(pipeline_data.dataset,
                                           Dataset(x, y))
    self.assertEqual(pipeline_data.dataset.data.shape, reproduced.shape)
    self.assertTrue((reproduced == pipeline_data.dataset.data).all())
def test_pipeline_hyperopt(self):
    """Hyperopt-wrapped validation in a pipeline should yield scored results.

    Fix: the test previously only printed the results and asserted nothing,
    so it could never fail on a regression. It now asserts that ChooseBest(1)
    leaves at least one result carrying a model and a score.
    """
    x, y = make_classification(n_samples=100,
                               n_features=40,
                               n_informative=2,
                               n_redundant=10,
                               flip_y=0.05)
    model_list = [(RandomForestClassifier, random_forest_hp_space()),
                  (GradientBoostingClassifier, grad_boosting_hp_space()),
                  (SVC, svc_kernel_hp_space('rbf')),
                  (KNeighborsClassifier, knn_hp_space()),
                  (XGBClassifier, xgboost_hp_space())]
    data = Dataset(x, y)
    context, pipeline_data = LocalExecutor(data, 2) << (
        Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list), initializer=True) >>
        FormulaFeatureGenerator(['+', '-', '*']) >> Hyperopt(
            Validate(test_size=0.1, metrics=roc_auc_score),
            max_evals=2) >> ChooseBest(1) >> FeatureSelector(10))
    # Assert instead of print: the run must produce scored results and a
    # non-empty feature matrix.
    self.assertGreaterEqual(len(pipeline_data.return_val), 1)
    for result in pipeline_data.return_val:
        self.assertIsNotNone(result.model)
        self.assertIsNotNone(result.score)
    self.assertGreater(pipeline_data.dataset.data.shape[1], 0)
def test_correlated_feature_generator(self):
    """CorrelatedFeatureSelector output must be reproducible by Preprocessing."""
    regressors = [(RandomForestRegressor, {}),
                  (GradientBoostingRegressor, {}),
                  (SVR, {}),
                  (XGBRegressor, {})]
    n_features_to_select = random.randint(5, 30)
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this test
    # assumes an older pinned scikit-learn; confirm against requirements.
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> PipelineStep('model space', ModelSpace(regressors),
                                initializer=True)
                >> FormulaFeatureGenerator(['+', '-', '*', '/'], limit=10)
                >> Validate(test_size=0.1, metrics=mean_absolute_error)
                >> ChooseBest(1, by_largest_score=False)
                >> RecursiveFeatureSelector(
                    n_features_to_select=n_features_to_select)
                >> CorrelatedFeatureSelector(max_correlation=0.9))
    context, pipeline_data = LocalExecutor(data, 5) << pipeline
    reproduced = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, reproduced.shape)
    self.assertTrue((reproduced == pipeline_data.dataset.data).all())
def test_voting_feature_selector(self):
    """VotingFeatureSelector output must be reproducible by Preprocessing.

    Fix: dropped the unused ``result_mult``/``result_div`` locals that were
    created but never read.
    """
    x, y = make_regression(
        n_samples=100,
        n_features=40,
        n_informative=2,
    )
    model_list = [(RandomForestRegressor, {}),
                  (GradientBoostingRegressor, {}),
                  (SVR, {}),
                  (XGBRegressor, {})]
    data = Dataset(x, y)
    context, pipeline_data = LocalExecutor(data, 10) << (
        Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list), initializer=True) >>
        FormulaFeatureGenerator(['+', '-', '*', '/']) >> Validate(
            test_size=0.1, metrics=mean_absolute_error) >> ChooseBest(
                4, by_largest_score=False) >> VotingFeatureSelector(
                    feature_to_select=10, reverse_score=True))
    # Replaying the recorded transforms on fresh raw data must rebuild the
    # selected feature matrix exactly.
    preprocessing = Preprocessing()
    final_data = preprocessing.reproduce(pipeline_data.dataset,
                                         Dataset(x, y))
    self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
    self.assertTrue((final_data == pipeline_data.dataset.data).all())
    print('0' * 30)
    for result in pipeline_data.return_val:
        print(result.model, result.score)
    print(pipeline_data.dataset.data.shape)
    print('0' * 30)
def test_pipeline(self):
    """PolynomialGenerator with interaction_only=True on two columns yields
    a (2, 4) feature matrix."""
    frame = pd.DataFrame([[1, 2], [3, 4]])
    generator = PolynomialGenerator(interaction_only=True, degree=4)
    pipeline = Pipeline() >> PipelineStep('generate_features', generator)
    context, pipe_output = LocalExecutor(Dataset(frame, None)) << pipeline
    self.assertEqual(pipe_output.dataset.data.shape, (2, 4))
def test_random_choice_combinator(self):
    """RandomChoice must always pick exactly one of its candidate steps.

    Fix: removed the ``data`` Dataset that was constructed on every
    iteration but never passed to the executor, and used idiomatic
    ``range(10)``.
    """
    for _ in range(10):
        result = LocalExecutor() << (Pipeline() >> RandomChoice([
            PipelineStep('a', lambda x, context: 'a'),
            PipelineStep('b', lambda x, context: 'b')
        ]))
        print(result)
        # Whichever branch was chosen, the payload must be one of the two.
        self.assertIn(result[1], ['a', 'b'])
def test_xgboost(self):
    """Hyperopt over an XGBoost space records exactly max_evals trials."""
    max_evals = 2
    x, y = make_classification()
    pipeline = (Pipeline()
                >> ModelSpace([(XGBClassifier, xgboost_hp_space())])
                >> Hyperopt(CV('roc_auc'), max_evals=max_evals))
    outcome = LocalExecutor(Dataset(x, y)) << pipeline
    search_result = outcome[1].return_val[0]
    self.assertIsInstance(search_result, HyperparameterSearchResult)
    self.assertEqual(len(search_result.history), max_evals)
def test_hyperopt(self):
    """Hyperopt keeps a hyperopt.base.Trials history of length max_evals."""
    max_evals = 2
    x, y = make_classification()
    pipeline = (Pipeline()
                >> ModelSpace([(RandomForestClassifier,
                                random_forest_hp_space())])
                >> Hyperopt(CV('roc_auc'), max_evals=max_evals))
    outcome = LocalExecutor(Dataset(x, y)) << pipeline
    search_result = outcome[1].return_val[0]
    self.assertIsInstance(search_result.history, hyperopt.base.Trials)
    self.assertEqual(len(search_result.history), max_evals)
def test_step_space_regression_model(self):
    """Smoke test: validate linear/kernel regressors end to end and keep
    the top three."""
    regressors = [
        (Lasso, {}),
        (Ridge, {}),
        (KernelRidge, {}),
    ]
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> ModelSpace(regressors)
                >> Validate(test_size=0.33, metrics=mean_absolute_error)
                >> ChooseBest(3))
    LocalExecutor(data) << pipeline
def test_forest(self):
    """Smoke test: run Hyperopt over a random-forest space for one epoch."""
    model_list = [
        #(RandomForestRegressor, random_forest_hp_space('mae')),
        (RandomForestClassifier, random_forest_hp_space())
    ]
    data = Dataset(datasets.load_iris().data, datasets.load_iris().target)
    search_step = PipelineStep(
        'H',
        Hyperopt(Validate(test_size=0.33, metrics=mean_absolute_error),
                 max_evals=2))
    context, pipeline_data = LocalExecutor(data, 1) << (
        Pipeline()
        >> PipelineStep('model space', ModelSpace(model_list))
        >> search_step)
def test_step_cv(self):
    """Smoke test: cross-validate several classifiers on iris and keep the
    top three by accuracy."""
    classifiers = [
        (LogisticRegression, {}),
        (RandomForestClassifier, {'n_estimators': 100}),
        (GradientBoostingClassifier, {}),
        (SVC, {}),
        (KNeighborsClassifier, {}),
    ]
    data = Dataset(datasets.load_iris().data, datasets.load_iris().target)
    pipeline = (Pipeline()
                >> ModelSpace(classifiers)
                >> CV('accuracy')
                >> ChooseBest(3))
    LocalExecutor(data) << pipeline
def test_recovering_dataset_FFG(self):
    """FeatureSelector output after formula feature generation must be
    reproducible by Preprocessing."""
    regressors = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    pipeline = (Pipeline()
                >> ModelSpace(regressors)
                >> FormulaFeatureGenerator(['+', '-', '*'])
                >> Validate(test_size=0.33, metrics=mean_squared_error)
                >> ChooseBest(1)
                >> FeatureSelector(30))
    context, pipeline_data = LocalExecutor(data, 10) << pipeline
    # Replay the recorded transformations on a fresh copy of the raw data.
    reproduced = Preprocessing().reproduce(
        pipeline_data.dataset,
        Dataset(datasets.load_boston().data, datasets.load_boston().target))
    self.assertEqual(pipeline_data.dataset.data.shape, reproduced.shape)
    self.assertTrue((reproduced == pipeline_data.dataset.data).all())
def test_poly_gen(self):
    """PolynomialFeatureGenerator output must be reproducible by
    Preprocessing."""
    regressors = [
        (Lasso, {}),
        #(Ridge, {}),
        (RandomForestRegressor, {})
    ]
    X, y = datasets.make_regression(n_features=5)
    data = Dataset(X, y)
    pipeline = (Pipeline()
                >> ModelSpace(regressors)
                >> PolynomialFeatureGenerator(max_degree=3)
                >> Validate(test_size=0.33, metrics=mean_squared_error)
                >> ChooseBest(1)
                >> FeatureSelector(10))
    context, pipeline_data = LocalExecutor(data, 10) << pipeline
    reproduced = Preprocessing().reproduce(pipeline_data.dataset,
                                           Dataset(X, y))
    self.assertEqual(pipeline_data.dataset.data.shape, reproduced.shape)
    self.assertTrue((reproduced == pipeline_data.dataset.data).all())
def test_all_step(self):
    """End-to-end pipeline over regressors with feature generation/selection.

    Fix: the test previously only printed results and asserted nothing, so
    it could never fail on a regression. It now asserts the run produced at
    least one scored result and a non-empty feature matrix.
    """
    model_list = [
        #(Lasso, {}),
        #(Ridge, {}),
        #(KernelRidge, {}),
        (RandomForestRegressor, {}),
        (XGBRegressor, {})
    ]
    data = Dataset(datasets.load_boston().data, datasets.load_boston().target)
    context, pipeline_data = LocalExecutor(data, 10) << (
        Pipeline() >> ModelSpace(model_list) >> FormulaFeatureGenerator([
            '+', '-', '*'
        ]) >> Validate(test_size=0.33, metrics=mean_squared_error) >>
        ChooseBest(1, by_largest_score=False) >> FeatureSelector(20))
    # Assert instead of print.
    self.assertGreaterEqual(len(pipeline_data.return_val), 1)
    for result in pipeline_data.return_val:
        self.assertIsNotNone(result.model)
        self.assertIsNotNone(result.score)
    self.assertGreater(pipeline_data.dataset.data.shape[1], 0)
def test_auto_step_wrapper_error(self):
    """Pushing a non-callable into a pipeline must raise ValueError."""
    broken_pipeline = Pipeline() >> "err"
    with self.assertRaises(ValueError):
        LocalExecutor() << broken_pipeline
def test_auto_step_wrapper(self):
    """A bare callable is auto-wrapped into a pipeline step and its return
    value propagates to the executor result."""
    def constant_one(x, context):
        return 1

    outcome = LocalExecutor() << (Pipeline() >> constant_one)
    self.assertEqual(outcome[1], 1)
def test_initializer(self):
    """An initializer step runs only at epoch 0, regardless of epochs."""
    def report_epoch(x, context):
        return context.epoch

    outcome = LocalExecutor(epochs=10) << (
        Pipeline() >> PipelineStep('a', report_epoch, initializer=True))
    self.assertEqual(outcome[1], 0)