Ejemplo n.º 1
0
    def test_pipeline_step(self):
        """Chain two named steps and check their combined effect.

        Step 'a' adds 1 and step 'b' adds 2 to ``x.dataset``; the run is
        started with 0 and the resulting ``dataset`` is asserted to be 3.
        """
        pipeline = Pipeline()
        pipeline = pipeline >> PipelineStep(
            'a', lambda x, context: PipelineData(x.dataset + 1))
        pipeline = pipeline >> PipelineStep(
            'b', lambda x, context: PipelineData(x.dataset + 2))

        # run() returns a (context, data) pair; only the data is checked.
        _, result = LocalExecutor().run(pipeline, 0)
        self.assertEqual(result.dataset, 3)
Ejemplo n.º 2
0
    def test_max_depth(self):
        """Check the depth bound of generated features and dataset reproduction.

        Runs a model-space / formula-generator / validate / choose-best /
        feature-selector pipeline, then asserts that
        ``Preprocessing.reproduce`` applied to a freshly loaded dataset gives
        the same array as the pipeline output, and that the largest ``depth``
        in the dataset meta does not exceed ``max_depth + addition * epochs``.
        """
        max_depth = 2
        addition = 0.25
        epochs = 20
        model_list = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), epochs)

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], max_depth=max_depth, addition=addition)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)

        _, pipeline_data = executor << pipeline
        produced = pipeline_data.dataset

        depths = [feature["depth"] for feature in produced.meta]

        # Reproduce on an independently loaded copy of the raw data.
        fresh = datasets.load_boston()
        final_data = Preprocessing().reproduce(
            produced, Dataset(fresh.data, fresh.target))

        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())
        self.assertLessEqual(max(depths), max_depth + addition * epochs)
Ejemplo n.º 3
0
    def test_RFS(self):
        """Run a pipeline ending in RecursiveFeatureSelector and reproduce it.

        The number of features to select is drawn at random from 5..30.
        After the run, ``Preprocessing.reproduce`` on a new ``Dataset(x, y)``
        must give an array equal in shape and content to the pipeline output.
        """
        x, y = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )
        model_list = [
            (RandomForestClassifier, {}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
            (XGBClassifier, {}),
        ]
        n_features_to_select = random.randint(5, 30)

        executor = LocalExecutor(Dataset(x, y), 2)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*', '/'])
        pipeline = pipeline >> Validate(test_size=0.1, metrics=roc_auc_score)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=n_features_to_select)

        _, pipeline_data = executor << pipeline

        final_data = Preprocessing().reproduce(pipeline_data.dataset,
                                               Dataset(x, y))
        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())
Ejemplo n.º 4
0
    def test_pipeline_hyperopt(self):
        """Smoke-test a pipeline whose validation step is wrapped in Hyperopt.

        Makes no assertions: it prints each returned result's model and score
        and the shape of the resulting dataset, between two separator lines.
        """
        x, y = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )
        model_list = [
            (RandomForestClassifier, random_forest_hp_space()),
            (GradientBoostingClassifier, grad_boosting_hp_space()),
            (SVC, svc_kernel_hp_space('rbf')),
            (KNeighborsClassifier, knn_hp_space()),
            (XGBClassifier, xgboost_hp_space()),
        ]

        executor = LocalExecutor(Dataset(x, y), 2)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Hyperopt(
            Validate(test_size=0.1, metrics=roc_auc_score), max_evals=2)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)

        _, pipeline_data = executor << pipeline

        separator = '0' * 30
        print(separator)
        for result in pipeline_data.return_val:
            print(result.model, result.score)
        print(pipeline_data.dataset.data.shape)
        print(separator)
Ejemplo n.º 5
0
    def test_correlated_feature_generator(self):
        """Run a pipeline ending in CorrelatedFeatureSelector and reproduce it.

        The pipeline chains model space, formula generator (limit=10),
        validation by mean absolute error, ChooseBest with
        ``by_largest_score=False``, a recursive selector with a random target
        count (5..30) and a correlation-based selector.  The reproduced array
        must equal the pipeline output in shape and content.
        """
        model_list = [
            (RandomForestRegressor, {}),
            (GradientBoostingRegressor, {}),
            (SVR, {}),
            (XGBRegressor, {}),
        ]
        n_features_to_select = random.randint(5, 30)

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 5)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], limit=10)
        pipeline = pipeline >> Validate(
            test_size=0.1, metrics=mean_absolute_error)
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=n_features_to_select)
        pipeline = pipeline >> CorrelatedFeatureSelector(max_correlation=0.9)

        _, pipeline_data = executor << pipeline

        # Reproduce on an independently loaded copy of the raw data.
        fresh = datasets.load_boston()
        final_data = Preprocessing().reproduce(
            pipeline_data.dataset, Dataset(fresh.data, fresh.target))

        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())
Ejemplo n.º 6
0
    def test_voting_feature_selector(self):
        """Run a pipeline ending in VotingFeatureSelector and reproduce it.

        Four models are kept by ``ChooseBest(4, by_largest_score=False)`` and
        passed to ``VotingFeatureSelector(feature_to_select=10,
        reverse_score=True)``.  The test asserts that
        ``Preprocessing.reproduce`` on a new ``Dataset(x, y)`` gives an array
        equal in shape and content to the pipeline output, then prints each
        result's model and score and the dataset shape for inspection.
        """
        x, y = make_regression(
            n_samples=100,
            n_features=40,
            n_informative=2,
        )

        model_list = [(RandomForestRegressor, {}),
                      (GradientBoostingRegressor, {}), (SVR, {}),
                      (XGBRegressor, {})]

        data = Dataset(x, y)

        # The previously declared `result_mult` / `result_div` lists were
        # never read or written and have been removed.
        context, pipeline_data = LocalExecutor(data, 10) << (
            Pipeline() >> PipelineStep(
                'model space', ModelSpace(model_list), initializer=True) >>
            FormulaFeatureGenerator(['+', '-', '*', '/']) >> Validate(
                test_size=0.1, metrics=mean_absolute_error) >> ChooseBest(
                    4, by_largest_score=False) >> VotingFeatureSelector(
                        feature_to_select=10, reverse_score=True))

        preprocessing = Preprocessing()
        final_data = preprocessing.reproduce(pipeline_data.dataset,
                                             Dataset(x, y))
        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())

        print('0' * 30)
        for result in pipeline_data.return_val:
            print(result.model, result.score)
        print(pipeline_data.dataset.data.shape)
        print('0' * 30)
Ejemplo n.º 7
0
    def test_pipeline(self):
        """Run PolynomialGenerator as a single pipeline step on a 2x2 frame.

        With ``interaction_only=True, degree=4`` the output data is asserted
        to have shape (2, 4).
        """
        frame = pd.DataFrame([[1, 2], [3, 4]])
        dataset = Dataset(frame, None)
        generator = PolynomialGenerator(interaction_only=True, degree=4)

        executor = LocalExecutor(dataset)
        pipeline = Pipeline() >> PipelineStep('generate_features', generator)
        _, pipe_output = executor << pipeline

        self.assertEqual(pipe_output.dataset.data.shape, (2, 4))
Ejemplo n.º 8
0
    def test_random_choice_combinator(self):
        """Check that RandomChoice runs exactly one of its candidate steps.

        Repeats ten times: a pipeline holding a ``RandomChoice`` over two
        steps returning 'a' and 'b' is executed, and the second element of
        the executor's result must be one of those two values.
        """
        # The Dataset previously built on every iteration was never passed to
        # the executor, so it has been dropped.
        for _ in range(10):
            result = LocalExecutor() << (Pipeline() >> RandomChoice([
                PipelineStep('a', lambda x, context: 'a'),
                PipelineStep('b', lambda x, context: 'b')
            ]))

            print(result)
            self.assertIn(result[1], ['a', 'b'])
Ejemplo n.º 9
0
 def test_xgboost(self):
     """Hyperopt over an XGBClassifier space returns a search result.

     The first entry of ``return_val`` must be a
     ``HyperparameterSearchResult`` whose history length equals
     ``max_evals``.
     """
     max_evals = 2
     x, y = make_classification()
     dataset = Dataset(x, y)

     executor = LocalExecutor(dataset)
     pipeline = Pipeline() >> ModelSpace(
         [(XGBClassifier, xgboost_hp_space())])
     pipeline = pipeline >> Hyperopt(CV('roc_auc'), max_evals=max_evals)

     outcome = executor << pipeline
     search_result = outcome[1].return_val[0]

     self.assertIsInstance(search_result, HyperparameterSearchResult)
     self.assertEqual(len(search_result.history), max_evals)
Ejemplo n.º 10
0
 def test_hyperopt(self):
     """Hyperopt over a random-forest space records one trial per evaluation.

     The first result's ``history`` must be a ``hyperopt.base.Trials``
     instance containing ``max_evals`` entries.
     """
     max_evals = 2
     x, y = make_classification()
     dataset = Dataset(x, y)

     executor = LocalExecutor(dataset)
     pipeline = Pipeline() >> ModelSpace(
         [(RandomForestClassifier, random_forest_hp_space())])
     pipeline = pipeline >> Hyperopt(CV('roc_auc'), max_evals=max_evals)

     outcome = executor << pipeline
     search_result = outcome[1].return_val[0]

     self.assertIsInstance(search_result.history, hyperopt.base.Trials)
     self.assertEqual(len(search_result.history), max_evals)
Ejemplo n.º 11
0
    def test_step_space_regression_model(self):
        """Smoke-test ModelSpace >> Validate >> ChooseBest on regression models.

        Makes no assertions; the test passes if the pipeline runs without
        raising.
        """
        model_list = [(Lasso, {}), (Ridge, {}), (KernelRidge, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target))

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_absolute_error)
        pipeline = pipeline >> ChooseBest(3)

        executor << pipeline
Ejemplo n.º 12
0
    def test_forest(self):
        """Smoke-test Hyperopt wrapped in an explicit PipelineStep.

        Makes no assertions; the test passes if the pipeline runs and the
        executor result unpacks into two values.
        """
        model_list = [(RandomForestClassifier, random_forest_hp_space())]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target), 1)

        # NOTE(review): mean_absolute_error is used to score a classifier
        # here -- confirm this is intentional.
        search = Hyperopt(
            Validate(test_size=0.33, metrics=mean_absolute_error),
            max_evals=2)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(model_list))
        pipeline = pipeline >> PipelineStep('H', search)

        _context, _pipeline_data = executor << pipeline
Ejemplo n.º 13
0
    def test_step_cv(self):
        """Smoke-test ModelSpace >> CV('accuracy') >> ChooseBest(3) on iris.

        Makes no assertions; the test passes if the pipeline runs without
        raising.
        """
        model_list = [
            (LogisticRegression, {}),
            (RandomForestClassifier, {'n_estimators': 100}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
        ]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target))

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> CV('accuracy')
        pipeline = pipeline >> ChooseBest(3)

        executor << pipeline
Ejemplo n.º 14
0
    def test_recovering_dataset_FFG(self):
        """Reproduce the output of a FormulaFeatureGenerator pipeline.

        After running the pipeline, ``Preprocessing.reproduce`` applied to a
        freshly loaded dataset must give an array equal in shape and content
        to the pipeline's resulting data.
        """
        model_list = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)

        _, pipeline_data = executor << pipeline

        # Reproduce on an independently loaded copy of the raw data.
        fresh = datasets.load_boston()
        final_data = Preprocessing().reproduce(
            pipeline_data.dataset, Dataset(fresh.data, fresh.target))

        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())
Ejemplo n.º 15
0
    def test_poly_gen(self):
        """Reproduce the output of a PolynomialFeatureGenerator pipeline.

        Uses a generated 5-feature regression problem.  After the run,
        ``Preprocessing.reproduce`` on a new ``Dataset(X, y)`` must give an
        array equal in shape and content to the pipeline's resulting data.
        """
        model_list = [(Lasso, {}), (RandomForestRegressor, {})]

        X, y = datasets.make_regression(n_features=5)

        executor = LocalExecutor(Dataset(X, y), 10)

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> PolynomialFeatureGenerator(max_degree=3)
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)

        _, pipeline_data = executor << pipeline

        final_data = Preprocessing().reproduce(pipeline_data.dataset,
                                               Dataset(X, y))
        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())
Ejemplo n.º 16
0
    def test_all_step(self):
        """Smoke-test a full generate / validate / choose / select pipeline.

        Makes no assertions: it prints each returned result's model and score
        and the shape of the resulting dataset, between two separator lines.
        """
        model_list = [(RandomForestRegressor, {}), (XGBRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(model_list)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(
            test_size=0.33, metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> FeatureSelector(20)

        _, pipeline_data = executor << pipeline

        separator = '0' * 30
        print(separator)
        for result in pipeline_data.return_val:
            print(result.model, result.score)
        print(pipeline_data.dataset.data.shape)
        print(separator)
Ejemplo n.º 17
0
 def test_auto_step_wrapper_error(self):
     """Expect ValueError when a plain string is chained in as a step."""
     with self.assertRaises(ValueError):
         # Everything stays inside the context manager so the error is
         # caught whether it comes from construction, `>>` or `<<`.
         executor = LocalExecutor()
         executor << (Pipeline() >> "err")
Ejemplo n.º 18
0
 def test_auto_step_wrapper(self):
     """A bare callable chained into a Pipeline is run as a step.

     The callable returns 1, which must be the second element of the
     executor's result.
     """
     constant_step = lambda x, context: 1
     executor = LocalExecutor()
     outcome = executor << (Pipeline() >> constant_step)
     self.assertEqual(outcome[1], 1)
Ejemplo n.º 19
0
 def test_initializer(self):
     """A step flagged ``initializer=True`` reports epoch 0 after 10 epochs.

     The step returns ``context.epoch``; the second element of the
     executor's result is asserted to be 0.
     """
     report_epoch = lambda x, context: context.epoch
     executor = LocalExecutor(epochs=10)
     pipeline = Pipeline() >> PipelineStep(
         'a', report_epoch, initializer=True)
     outcome = executor << pipeline
     self.assertEqual(outcome[1], 0)