def test_feature_tools_transformer(self):
     """Smoke-test numeric feature generation on a small bank-data sample.

     Fits a ``FeatureGenerationTransformer`` configured with the
     ``add_numeric`` and ``divide_numeric`` primitives on the training split
     of the first 100 rows and checks that ``transform`` returns something.
     """
     bank = dsutils.load_bank()
     bank.drop(columns=['id'], inplace=True)
     # The target column is removed from the frame; its values are not used here.
     bank.pop('y')
     train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)
     generator = FeatureGenerationTransformer(
         task='classification',
         trans_primitives=['add_numeric', 'divide_numeric'],
     )
     generator.fit(train_part)
     generated = generator.transform(train_part)
     assert generated is not None
 def test_pipeline(self):
     """Run feature generation followed by the general preprocessor in a Pipeline.

     The training split of the first 100 bank rows (target column ``y`` left
     in the frame) is pushed through both steps; the output is expected to
     have shape ``(80, 62)``.
     """
     bank = dsutils.load_bank()
     bank.drop(columns=['id'], inplace=True)
     train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

     generator = FeatureGenerationTransformer(task='classification',
                                              trans_primitives=[CrossCategorical()])
     steps = [
         ('feature_gen', generator),
         ('processor', general_preprocessor()),
     ]
     transformed = Pipeline(steps=steps).fit_transform(train_part)
     assert transformed.shape == (80, 62)
 def test_in_dataframe_mapper(self):
     """Use the feature generator as a transformer inside a DataFrameMapper.

     All columns of the training split are mapped to a single
     ``FeatureGenerationTransformer`` with a ``CrossCategorical`` primitive;
     the mapper is asked for DataFrame input and output, and the result is
     expected to have shape ``(80, 62)``.
     """
     bank = dsutils.load_bank()
     bank.drop(columns=['id'], inplace=True)
     train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

     generator = FeatureGenerationTransformer(task='classification',
                                              trans_primitives=[CrossCategorical()])
     all_columns = train_part.columns.to_list()
     mapper = DataFrameMapper(features=[(all_columns, generator)],
                              input_df=True,
                              df_out=True)
     transformed = mapper.fit_transform(train_part)
     assert transformed.shape == (80, 62)
Example #4
0
    def test_model(self):
        """Train on the first 1000 bank rows via ``train_bankdata``.

        The data is split 80/20 with a fixed seed, the target ``y`` is popped
        from each part, and the four pieces are handed to ``train_bankdata``
        through a zero-argument callable.
        """
        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
 def test_feature_generation_with_selection(self):
     """Check feature generation combined with built-in feature selection.

     With ``feature_selection_args`` set, fitting without ``y`` must raise an
     ``AssertionError`` mentioning that ``y`` is required. Fitting with ``y``
     must succeed, and the transformed frame is expected to have 35 columns.
     """
     df = dsutils.load_bank().head(1000)
     df.drop(['id'], axis=1, inplace=True)
     y = df.pop('y')
     cross_cat = CrossCategorical()
     ftt = FeatureGenerationTransformer(task='classification',
                                        trans_primitives=['add_numeric', 'divide_numeric', cross_cat],
                                        feature_selection_args={'ratio_select_cols': 0.2})
     # Only the raising call belongs inside the context manager: any statement
     # placed after it in the block would be skipped once the exception fires.
     with pytest.raises(AssertionError) as err:
         ftt.fit(df)
     # ``err.value`` is the exception instance, so compare against its text.
     assert '`y` must be provided for feature selection.' in str(err.value)
     ftt.fit(df, y)
     x_t = ftt.transform(df)
     assert x_t.shape[1] == 35
Example #6
0
    def test_no_categorical(self):
        """Train via ``train_bankdata`` on a reduced column set.

        Only the columns ``age``, ``duration``, ``previous`` and the target
        ``y`` are kept before the first 1000 rows are split 80/20.
        """
        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        kept_columns = ['age', 'duration', 'previous', 'y']
        bank = bank[kept_columns]

        X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
Example #7
0
    def test_no_continuous(self):
        """Train via ``train_bankdata`` on a reduced column set.

        Only the columns ``job``, ``education``, ``loan`` and the target ``y``
        are kept before the first 1000 rows are split 80/20.
        """
        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        kept_columns = ['job', 'education', 'loan', 'y']
        bank = bank[kept_columns]

        X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
    def test_feature_selection(self):
        """Apply ``FeatureSelectionTransformer`` to generated features.

        Features are generated from the first 1000 bank rows (target removed),
        then a selector with ``ratio_select_cols=0.2`` that reserves the
        generator's original columns is fitted. The test expects 99 scored
        features and 35 retained columns, both on the fitted selector and in
        the transformed frame.
        """
        bank = dsutils.load_bank().head(1000)
        bank.drop(columns=['id'], inplace=True)
        target = bank.pop('y')

        primitives = ['add_numeric', 'divide_numeric', CrossCategorical()]
        generator = FeatureGenerationTransformer(task='classification',
                                                 trans_primitives=primitives)
        generator.fit(bank)
        generated = generator.transform(bank)

        selector = FeatureSelectionTransformer('classification',
                                               ratio_select_cols=0.2,
                                               reserved_cols=generator.original_cols)
        selector.fit(generated, target)
        assert len(selector.scores_.items()) == 99
        assert len(selector.columns_) == 35

        selected = selector.transform(generated)
        assert selected.shape[1] == 35
Example #9
0
    def test_blend(self):
        """Blend the top three trials of a search and evaluate the blend.

        After ``train_bankdata`` runs on an 80/20 split of the first 1000 bank
        rows, the space samples of the top three trials are blended on the
        training data and scored with ``AUC`` on the test data; the score must
        be truthy.
        """
        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        _, hyper_model = self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))

        top_samples = []
        for trial in hyper_model.get_top_trails(3):
            top_samples.append(trial.space_sample)

        blended = hyper_model.blend_models(top_samples, X_train, y_train)
        score = blended.evaluate(X_test, y_test, ['AUC'])
        assert score
Example #10
0
    def test_bankdata_catboost(self):
        """Fit a single ``HyperGBMEstimator`` from a fixed search-space sample.

        The general search space is pinned with ``assign_by_vectors`` to one
        concrete configuration, the estimator is fitted on an 80/20 split of
        the first 10000 bank rows, and its accuracy scores must be truthy.
        """
        space = search_space_general(lightgbm_fit_kwargs=lightgbm_fit_kwargs)
        space.assign_by_vectors([2, 2, 1, 1, 0.031, 1, 3, 1])
        cache_dir = f'{test_output_dir}/hypergbm_cache'
        estimator = HyperGBMEstimator('binary', space, cache_dir=cache_dir)

        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        X_train, X_test = train_test_split(bank.head(10000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        estimator.fit(X_train, y_train)
        scores = estimator.evaluate(X_test, y_test, metrics=['accuracy'])
        assert scores
        print(scores)
Example #11
0
    def test_save_load(self):
        """Round-trip a trained estimator through ``save`` / ``load_estimator``.

        The estimator returned by ``train_bankdata`` is saved under the test
        output directory, the file must exist afterwards, and the estimator
        reloaded through the HyperGBM instance must yield a truthy ``AUC``
        evaluation on the test split.
        """
        bank = dsutils.load_bank()
        bank.drop(columns=['id'], inplace=True)
        X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
        y_train, y_test = X_train.pop('y'), X_test.pop('y')

        est, hypergbm = self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))

        filepath = test_output_dir + '/hypergbm_model.pkl'
        est.save(filepath)
        assert os.path.isfile(filepath)

        reloaded = hypergbm.load_estimator(filepath)
        score = reloaded.evaluate(X_test, y_test, ['AUC'])
        assert score
Example #12
0
def main():
    """Run a 500-trial random search on the full bank dataset and report results.

    The best trial is printed, a final estimator is trained from its space
    sample, and the ``auc`` / ``accuracy`` evaluation on the held-out part is
    printed.
    """
    searcher = RandomSearcher(search_space_general, optimize_direction=OptimizeDirection.Maximize)
    callbacks = [
        SummaryCallback(),
        FileLoggingCallback(searcher, output_dir=f'{test_output_dir}/hyn_logs'),
    ]
    hk = HyperGBM(searcher, task='classification', reward_metric='auc',
                  cache_dir=f'{test_output_dir}/hypergbm_cache',
                  callbacks=callbacks)

    bank = dsutils.load_bank()
    bank.drop(columns=['id'], inplace=True)
    # test_size=0.8: the larger share of the rows goes to the held-out part.
    X_train, X_test = train_test_split(bank, test_size=0.8, random_state=42)
    y_train, y_test = X_train.pop('y'), X_test.pop('y')

    hk.search(X_train, y_train, X_test, y_test, max_trails=500)
    best_trial = hk.get_best_trail()
    print(f'best_train:{best_trial}')

    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    # The predictions themselves are not inspected; the call is kept as-is.
    estimator.predict(X_test)
    result = estimator.evaluate(X_test, y_test, metrics=['auc', 'accuracy'])
    print(f'final result:{result}')
 def test_feature_tools_categorical_cross(self):
     """Check the column names produced by the ``CrossCategorical`` primitive.

     After fitting and transforming the training split of the first 100 bank
     rows, every output column must belong to the expected set of original
     and ``a__b`` crossed column names (extra expected names are allowed,
     unexpected output columns are not).
     """
     bank = dsutils.load_bank()
     bank.drop(columns=['id'], inplace=True)
     train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

     generator = FeatureGenerationTransformer(task='classification',
                                              trans_primitives=[CrossCategorical()])
     generator.fit(train_part)
     generated = generator.transform(train_part)

     expected_columns = {
         'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y', 'age',
         'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'contact__marital', 'job__poutcome',
         'contact__default', 'housing__month', 'housing__marital', 'loan__y', 'housing__job', 'loan__poutcome',
         'month__poutcome', 'default__month', 'default__education', 'education__loan', 'education__housing',
         'housing__loan', 'housing__poutcome', 'contact__housing', 'contact__loan', 'marital__y', 'contact__job',
         'education__poutcome', 'default__marital', 'job__month', 'job__y', 'default__loan', 'education__marital',
         'default__poutcome', 'default__y', 'contact__month', 'education__month', 'contact__education',
         'contact__poutcome', 'job__marital', 'education__job', 'job__loan', 'contact__y', 'month__y',
         'default__housing', 'default__job', 'poutcome__y', 'loan__marital', 'education__y', 'loan__month',
         'marital__month', 'housing__y', 'marital__poutcome',
     }
     unexpected_columns = set(generated.columns.to_list()) - expected_columns
     assert len(unexpected_columns) == 0
Example #14
0
    def train_bankdata(self, data_partition):
        """Run a short HyperGBM random search and train a final estimator.

        Args:
            data_partition: Zero-argument callable returning the tuple
                ``(X_train, X_test, y_train, y_test)`` to search and train on.

        Returns:
            A ``(estimator, hk)`` tuple: the estimator trained from the best
            trial's space sample and the ``HyperGBM`` instance that ran the
            search.

        Raises:
            AssertionError: If the number of predictions differs from the
                number of rows in ``X_test``.
        """
        rs = RandomSearcher(search_space_general, optimize_direction=OptimizeDirection.Maximize)
        hk = HyperGBM(rs, task='classification', reward_metric='accuracy',
                      cache_dir=f'{test_output_dir}/hypergbm_cache',
                      callbacks=[SummaryCallback(), FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs')])

        # All data comes from the caller-supplied partition; nothing is loaded here.
        X_train, X_test, y_train, y_test = data_partition()

        hk.search(X_train, y_train, X_test, y_test, max_trails=3)
        best_trial = hk.get_best_trail()

        estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
        score = estimator.predict(X_test)
        # Called without a metrics argument purely to exercise evaluate();
        # the returned value is not inspected.
        estimator.evaluate(X_test, y_test)
        # One prediction per test row, whatever split size the caller chose.
        assert len(score) == len(X_test)
        return estimator, hk