예제 #1
0
    def test_max_depth(self):
        """Check that generated feature depth is bounded and reproducible.

        Runs a pipeline containing a ``FormulaFeatureGenerator`` configured
        with ``max_depth`` and ``addition`` on the Boston data, with the
        executor given ``epochs`` as its second argument, then asserts that:

        * ``Preprocessing.reproduce`` applied to a freshly loaded Boston
          dataset yields exactly the data the pipeline produced, and
        * the largest ``"depth"`` entry in the resulting dataset meta does
          not exceed ``max_depth + addition * epochs``.
        """
        candidates = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]
        max_depth, addition, epochs = 2, 0.25, 20

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), epochs)

        # Steps are chained one at a time, in the order they run.
        pipeline = Pipeline() >> ModelSpace(candidates)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], max_depth=max_depth, addition=addition)
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)
        _context, pipeline_data = executor << pipeline

        depths = [entry["depth"] for entry in pipeline_data.dataset.meta]

        # Replay the recorded preprocessing on an untouched copy of the data.
        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        final_data = reproducer.reproduce(pipeline_data.dataset,
                                          Dataset(fresh.data, fresh.target))

        expected = pipeline_data.dataset.data
        self.assertEqual(expected.shape, final_data.shape)
        self.assertTrue((final_data == expected).all())
        self.assertLessEqual(max(depths), max_depth + addition * epochs)
예제 #2
0
    def test_correlated_feature_generator(self):
        """Check reproducibility with recursive and correlation selectors.

        The pipeline on the Boston data chains a model space, a
        ``FormulaFeatureGenerator`` (``limit=10``), ``Validate`` with mean
        absolute error, ``ChooseBest(1, by_largest_score=False)``, a
        ``RecursiveFeatureSelector`` with a randomly drawn feature count and
        a ``CorrelatedFeatureSelector(max_correlation=0.9)``.  The test
        asserts that ``Preprocessing.reproduce`` applied to a freshly loaded
        Boston dataset yields exactly the pipeline's final data.
        """
        regressors = [
            (RandomForestRegressor, {}),
            (GradientBoostingRegressor, {}),
            (SVR, {}),
            (XGBRegressor, {}),
        ]

        # Drawn anew on every run, so the requested feature count varies.
        n_features_to_select = random.randint(5, 30)

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 5)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(regressors), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(
            ['+', '-', '*', '/'], limit=10)
        pipeline = pipeline >> Validate(test_size=0.1,
                                        metrics=mean_absolute_error)
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=n_features_to_select)
        pipeline = pipeline >> CorrelatedFeatureSelector(max_correlation=0.9)
        _context, pipeline_data = executor << pipeline

        # Replay the recorded preprocessing on an untouched copy of the data.
        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        final_data = reproducer.reproduce(pipeline_data.dataset,
                                          Dataset(fresh.data, fresh.target))

        expected = pipeline_data.dataset.data
        self.assertEqual(expected.shape, final_data.shape)
        self.assertTrue((final_data == expected).all())
예제 #3
0
    def test_pipeline_hyperopt(self):
        """Smoke-test a classification pipeline whose validation is tuned.

        Generates a synthetic classification problem, wraps ``Validate``
        (ROC AUC metric) in ``Hyperopt(max_evals=2)`` and runs the pipeline.
        The test makes no assertions; it only prints each returned result's
        model and score plus the final data shape.
        """
        x, y = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )
        # Each classifier is paired with its hyper-parameter search space.
        classifiers = [
            (RandomForestClassifier, random_forest_hp_space()),
            (GradientBoostingClassifier, grad_boosting_hp_space()),
            (SVC, svc_kernel_hp_space('rbf')),
            (KNeighborsClassifier, knn_hp_space()),
            (XGBClassifier, xgboost_hp_space()),
        ]

        executor = LocalExecutor(Dataset(x, y), 2)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(classifiers), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Hyperopt(
            Validate(test_size=0.1, metrics=roc_auc_score), max_evals=2)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)
        _context, pipeline_data = executor << pipeline

        separator = '0' * 30
        print(separator)
        for outcome in pipeline_data.return_val:
            print(outcome.model, outcome.score)
        print(pipeline_data.dataset.data.shape)
        print(separator)
예제 #4
0
    def test_RFS(self):
        """Check reproducibility of a ``RecursiveFeatureSelector`` pipeline.

        Runs a classification pipeline on synthetic data that ends with a
        ``RecursiveFeatureSelector`` using a randomly drawn feature count,
        then asserts that ``Preprocessing.reproduce`` applied to a new
        ``Dataset`` built from the same ``x``/``y`` arrays yields exactly the
        pipeline's final data.
        """
        x, y = make_classification(
            n_samples=100,
            n_features=40,
            n_informative=2,
            n_redundant=10,
            flip_y=0.05,
        )

        classifiers = [
            (RandomForestClassifier, {}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
            (XGBClassifier, {}),
        ]

        # Drawn anew on every run, so the requested feature count varies.
        n_features_to_select = random.randint(5, 30)

        executor = LocalExecutor(Dataset(x, y), 2)

        pipeline = Pipeline() >> PipelineStep(
            'model space', ModelSpace(classifiers), initializer=True)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*', '/'])
        pipeline = pipeline >> Validate(test_size=0.1, metrics=roc_auc_score)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> RecursiveFeatureSelector(
            n_features_to_select=n_features_to_select)
        _context, pipeline_data = executor << pipeline

        # Replay the recorded preprocessing on the original input arrays.
        reproducer = Preprocessing()
        final_data = reproducer.reproduce(pipeline_data.dataset,
                                          Dataset(x, y))

        expected = pipeline_data.dataset.data
        self.assertEqual(expected.shape, final_data.shape)
        self.assertTrue((final_data == expected).all())
예제 #5
0
    def test_voting_feature_selector(self):
        """Check reproducibility of a ``VotingFeatureSelector`` pipeline.

        Runs a regression pipeline on synthetic data that validates with mean
        absolute error, applies ``ChooseBest(4, by_largest_score=False)`` and
        ends with ``VotingFeatureSelector(feature_to_select=10,
        reverse_score=True)``.  The test asserts that
        ``Preprocessing.reproduce`` applied to a new ``Dataset`` built from
        the same ``x``/``y`` arrays yields exactly the pipeline's final data,
        then prints each returned result's model and score plus the final
        data shape.
        """
        x, y = make_regression(
            n_samples=100,
            n_features=40,
            n_informative=2,
        )

        model_list = [
            (RandomForestRegressor, {}),
            (GradientBoostingRegressor, {}),
            (SVR, {}),
            (XGBRegressor, {}),
        ]

        data = Dataset(x, y)

        # The executor's first return value is not needed by this test.
        _, pipeline_data = LocalExecutor(data, 10) << (
            Pipeline()
            >> PipelineStep(
                'model space', ModelSpace(model_list), initializer=True)
            >> FormulaFeatureGenerator(['+', '-', '*', '/'])
            >> Validate(test_size=0.1, metrics=mean_absolute_error)
            >> ChooseBest(4, by_largest_score=False)
            >> VotingFeatureSelector(feature_to_select=10,
                                     reverse_score=True))

        # Replay the recorded preprocessing on the original input arrays.
        preprocessing = Preprocessing()
        final_data = preprocessing.reproduce(pipeline_data.dataset,
                                             Dataset(x, y))
        self.assertEqual(pipeline_data.dataset.data.shape, final_data.shape)
        self.assertTrue((final_data == pipeline_data.dataset.data).all())

        print('0' * 30)
        for result in pipeline_data.return_val:
            print(result.model, result.score)
        print(pipeline_data.dataset.data.shape)
        print('0' * 30)
예제 #6
0
    def test_step_space_regression_model(self):
        """Smoke-test model space, validation and ``ChooseBest(3)`` together.

        Runs three regressors through ``Validate`` (mean absolute error) and
        ``ChooseBest(3)`` on the Boston data.  The result is discarded and
        nothing is asserted, so the test only checks that the run completes
        without raising.
        """
        regressors = [(Lasso, {}), (Ridge, {}), (KernelRidge, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target))

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=mean_absolute_error)
        pipeline = pipeline >> ChooseBest(3)
        executor << pipeline
예제 #7
0
    def test_forest(self):
        """Smoke-test ``Hyperopt`` around ``Validate`` for a random forest.

        Runs a two-step pipeline (model space, then ``Hyperopt`` with
        ``max_evals=2``) on the iris data.  Nothing is asserted, so the test
        only checks that the run completes without raising.
        """
        # A RandomForestRegressor entry using random_forest_hp_space('mae')
        # was disabled here in the original; only the classifier is exercised.
        forests = [(RandomForestClassifier, random_forest_hp_space())]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target), 1)

        # NOTE(review): mean_absolute_error is a regression metric applied to
        # a classifier here - confirm this pairing is intended.
        tuned_validation = Hyperopt(
            Validate(test_size=0.33, metrics=mean_absolute_error),
            max_evals=2)

        pipeline = Pipeline() >> PipelineStep('model space',
                                              ModelSpace(forests))
        pipeline = pipeline >> PipelineStep('H', tuned_validation)
        _context, _pipeline_data = executor << pipeline
예제 #8
0
    def test_step_validate(self):
        """Smoke-test ``Validate`` with accuracy on several classifiers.

        Runs five classifiers through ``Validate`` (accuracy score) and
        ``ChooseBest(3)`` on the iris data.  The result is discarded and
        nothing is asserted, so the test only checks that the run completes
        without raising.
        """
        classifiers = [
            (LogisticRegression, {}),
            (RandomForestClassifier, {'n_estimators': 100}),
            (GradientBoostingClassifier, {}),
            (SVC, {}),
            (KNeighborsClassifier, {}),
        ]

        iris = datasets.load_iris()
        executor = LocalExecutor(Dataset(iris.data, iris.target))

        pipeline = Pipeline() >> ModelSpace(classifiers)
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=accuracy_score)
        pipeline = pipeline >> ChooseBest(3)
        executor << pipeline
예제 #9
0
    def test_recovering_dataset_FFG(self):
        """Check that formula-generated features can be reproduced.

        Runs a pipeline with ``FormulaFeatureGenerator(['+', '-', '*'])`` and
        ``FeatureSelector(30)`` on the Boston data, then asserts that
        ``Preprocessing.reproduce`` applied to a freshly loaded Boston
        dataset yields exactly the pipeline's final data.
        """
        regressors = [(Lasso, {}), (Ridge, {}), (RandomForestRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(30)
        _context, pipeline_data = executor << pipeline

        # Replay the recorded preprocessing on an untouched copy of the data.
        reproducer = Preprocessing()
        fresh = datasets.load_boston()
        final_data = reproducer.reproduce(pipeline_data.dataset,
                                          Dataset(fresh.data, fresh.target))

        expected = pipeline_data.dataset.data
        self.assertEqual(expected.shape, final_data.shape)
        self.assertTrue((final_data == expected).all())
예제 #10
0
    def test_poly_gen(self):
        """Check that polynomial-generated features can be reproduced.

        Runs a pipeline with ``PolynomialFeatureGenerator(max_degree=3)`` and
        ``FeatureSelector(10)`` on a synthetic five-feature regression
        problem, then asserts that ``Preprocessing.reproduce`` applied to a
        new ``Dataset`` built from the same ``X``/``y`` arrays yields exactly
        the pipeline's final data.
        """
        # A Ridge entry was disabled here in the original.
        regressors = [(Lasso, {}), (RandomForestRegressor, {})]

        X, y = datasets.make_regression(n_features=5)

        executor = LocalExecutor(Dataset(X, y), 10)

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> PolynomialFeatureGenerator(max_degree=3)
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1)
        pipeline = pipeline >> FeatureSelector(10)
        _context, pipeline_data = executor << pipeline

        # Replay the recorded preprocessing on the original input arrays.
        reproducer = Preprocessing()
        final_data = reproducer.reproduce(pipeline_data.dataset,
                                          Dataset(X, y))

        expected = pipeline_data.dataset.data
        self.assertEqual(expected.shape, final_data.shape)
        self.assertTrue((final_data == expected).all())
예제 #11
0
    def test_all_step(self):
        """Smoke-test a full regression pipeline on the Boston data.

        Chains model space, formula feature generation, validation with mean
        squared error, ``ChooseBest(1, by_largest_score=False)`` and
        ``FeatureSelector(20)``.  The test makes no assertions; it only
        prints each returned result's model and score plus the final data
        shape.
        """
        # Lasso, Ridge and KernelRidge entries were disabled here in the
        # original; only the two tree-based regressors are exercised.
        regressors = [(RandomForestRegressor, {}), (XGBRegressor, {})]

        boston = datasets.load_boston()
        executor = LocalExecutor(Dataset(boston.data, boston.target), 10)

        pipeline = Pipeline() >> ModelSpace(regressors)
        pipeline = pipeline >> FormulaFeatureGenerator(['+', '-', '*'])
        pipeline = pipeline >> Validate(test_size=0.33,
                                        metrics=mean_squared_error)
        pipeline = pipeline >> ChooseBest(1, by_largest_score=False)
        pipeline = pipeline >> FeatureSelector(20)
        _context, pipeline_data = executor << pipeline

        separator = '0' * 30
        print(separator)
        for outcome in pipeline_data.return_val:
            print(outcome.model, outcome.score)
        print(pipeline_data.dataset.data.shape)
        print(separator)