def test_feature_tools_transformer(self):
    """Smoke-test ``FeatureGenerationTransformer`` with numeric primitives.

    Fits the transformer (``add_numeric`` / ``divide_numeric``) on the
    training part of a split over the first 100 bank rows, with the ``id``
    and ``y`` columns removed, and checks that ``transform`` returns a
    result.
    """
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    # The target column is removed from the frame; its values are not needed here.
    bank.pop('y')
    train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

    transformer = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric'],
    )
    transformer.fit(train_part)
    generated = transformer.transform(train_part)

    assert generated is not None
def test_pipeline(self):
    """Check feature generation chained with the general preprocessor.

    Builds a ``Pipeline`` of a ``FeatureGenerationTransformer`` (using a
    ``CrossCategorical`` primitive) followed by ``general_preprocessor()``
    and asserts the shape of the fitted/transformed training split.
    """
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    cross_cat = CrossCategorical()
    train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

    feature_gen = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=[cross_cat],
    )
    steps = [
        ('feature_gen', feature_gen),
        ('processor', general_preprocessor()),
    ]
    transformed = Pipeline(steps=steps).fit_transform(train_part)

    assert transformed.shape == (80, 62)
def test_in_dataframe_mapper(self):
    """Check ``FeatureGenerationTransformer`` when wrapped in a ``DataFrameMapper``.

    The mapper applies the transformer to every column of the training
    split (``input_df=True``, ``df_out=True``) and the resulting shape is
    asserted.
    """
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    cross_cat = CrossCategorical()
    train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

    feature_gen = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=[cross_cat],
    )
    mapper = DataFrameMapper(
        features=[(train_part.columns.to_list(), feature_gen)],
        input_df=True,
        df_out=True,
    )
    transformed = mapper.fit_transform(train_part)

    assert transformed.shape == (80, 62)
def test_model(self):
    """Run ``train_bankdata`` on a split of the first 1000 bank rows."""
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    # train_bankdata expects a zero-argument callable that yields the partition.
    self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
def test_feature_generation_with_selection(self):
    """Check feature generation combined with built-in feature selection.

    With ``feature_selection_args`` set, fitting without ``y`` is expected
    to raise ``AssertionError``; fitting with ``y`` must then succeed and
    ``transform`` must return 35 columns.
    """
    df = dsutils.load_bank().head(1000)
    df.drop(['id'], axis=1, inplace=True)
    y = df.pop('y')
    cross_cat = CrossCategorical()
    ftt = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric', cross_cat],
        feature_selection_args={'ratio_select_cols': 0.2},
    )

    with pytest.raises(AssertionError) as err:
        ftt.fit(df)
    # `err.value` is the exception instance, not a string, so the message has
    # to be checked on its text and after the `with` block has exited
    # (statements following the raising call inside the block never run).
    assert '`y` must be provided for feature selection.' in str(err.value)

    ftt.fit(df, y)
    x_t = ftt.transform(df)
    assert x_t.shape[1] == 35
def test_no_categorical(self):
    """Run ``train_bankdata`` on the ``age``/``duration``/``previous`` columns plus ``y``."""
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    selected_columns = ['age', 'duration', 'previous', 'y']
    bank = bank[selected_columns]
    X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    # train_bankdata expects a zero-argument callable that yields the partition.
    self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
def test_no_continuous(self):
    """Run ``train_bankdata`` on the ``job``/``education``/``loan`` columns plus ``y``."""
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    selected_columns = ['job', 'education', 'loan', 'y']
    bank = bank[selected_columns]
    X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    # train_bankdata expects a zero-argument callable that yields the partition.
    self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))
def test_feature_selection(self):
    """Check ``FeatureSelectionTransformer`` on generated features.

    Features are generated from the first 1000 bank rows, then a selector
    with ``ratio_select_cols=0.2`` (reserving the generator's
    ``original_cols``) is fitted; the number of scores, the number of kept
    columns and the transformed width are asserted.
    """
    bank = dsutils.load_bank().head(1000)
    bank.drop(['id'], axis=1, inplace=True)
    target = bank.pop('y')

    cross_cat = CrossCategorical()
    generator = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric', cross_cat],
    )
    generator.fit(bank)
    generated = generator.transform(bank)

    selector = FeatureSelectionTransformer(
        'classification',
        ratio_select_cols=0.2,
        reserved_cols=generator.original_cols,
    )
    selector.fit(generated, target)
    assert len(selector.scores_.items()) == 99
    assert len(selector.columns_) == 35

    selected = selector.transform(generated)
    assert selected.shape[1] == 35
def test_blend(self):
    """Blend the top three trials' models and check that evaluation yields a score."""
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    # Only the search object (second item of the returned pair) is needed here.
    _, hyper_model = self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))

    top_trials = hyper_model.get_top_trails(3)
    samples = [trial.space_sample for trial in top_trials]
    blended = hyper_model.blend_models(samples, X_train, y_train)
    score = blended.evaluate(X_test, y_test, ['AUC'])
    assert score
def test_bankdata_catboost(self):
    """Fit a ``HyperGBMEstimator`` on a fixed search-space sample and evaluate accuracy.

    The search space is pinned with ``assign_by_vectors`` and the estimator
    is trained on a split of the first 10000 bank rows.
    """
    space = search_space_general(lightgbm_fit_kwargs=lightgbm_fit_kwargs)
    space.assign_by_vectors([2, 2, 1, 1, 0.031, 1, 3, 1])
    estimator = HyperGBMEstimator(
        'binary',
        space,
        cache_dir=f'{test_output_dir}/hypergbm_cache',
    )

    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(bank.head(10000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    estimator.fit(X_train, y_train)
    scores = estimator.evaluate(X_test, y_test, metrics=['accuracy'])
    assert scores
    print(scores)
def test_save_load(self):
    """Save a trained estimator to disk, load it back and evaluate it."""
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(bank.head(1000), test_size=0.2, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    est, hypergbm = self.train_bankdata(lambda: (X_train, X_test, y_train, y_test))

    filepath = test_output_dir + '/hypergbm_model.pkl'
    est.save(filepath)
    assert os.path.isfile(filepath)

    # Reload through the search object and make sure the model still scores.
    model = hypergbm.load_estimator(filepath)
    score = model.evaluate(X_test, y_test, ['AUC'])
    assert score
def main():
    """Run a 500-trial random search on the bank data and print the final result.

    The data is split with ``test_size=0.8``; the search is scored with
    ``auc`` and the best trial is retrained and evaluated on the held-out
    part with ``auc`` and ``accuracy``.
    """
    searcher = RandomSearcher(search_space_general, optimize_direction=OptimizeDirection.Maximize)
    callbacks = [
        SummaryCallback(),
        FileLoggingCallback(searcher, output_dir=f'{test_output_dir}/hyn_logs'),
    ]
    hk = HyperGBM(
        searcher,
        task='classification',
        reward_metric='auc',
        cache_dir=f'{test_output_dir}/hypergbm_cache',
        callbacks=callbacks,
    )

    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    X_train, X_test = train_test_split(bank, test_size=0.8, random_state=42)
    # Separate the target from each part of the split (train first, then test).
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    hk.search(X_train, y_train, X_test, y_test, max_trails=500)
    best_trial = hk.get_best_trail()
    print(f'best_train:{best_trial}')

    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    # The predictions themselves are not used; the call is kept as in the original flow.
    estimator.predict(X_test)
    result = estimator.evaluate(X_test, y_test, metrics=['auc', 'accuracy'])
    print(f'final result:{result}')
def test_feature_tools_categorical_cross(self):
    """Check the column names produced by the ``CrossCategorical`` primitive.

    Every column of the transformed training split must belong to the
    expected set of original columns and ``a__b`` cross columns.
    """
    bank = dsutils.load_bank()
    bank.drop(['id'], axis=1, inplace=True)
    cross_cat = CrossCategorical()
    train_part, _ = train_test_split(bank.head(100), test_size=0.2, random_state=42)

    transformer = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=[cross_cat],
    )
    transformer.fit(train_part)
    generated = transformer.transform(train_part)

    expected_columns = {
        # original columns
        'job', 'marital', 'education', 'default', 'housing', 'loan',
        'contact', 'month', 'poutcome', 'y', 'age', 'balance', 'day',
        'duration', 'campaign', 'pdays', 'previous',
        # crossed categorical columns
        'contact__marital', 'job__poutcome', 'contact__default',
        'housing__month', 'housing__marital', 'loan__y', 'housing__job',
        'loan__poutcome', 'month__poutcome', 'default__month',
        'default__education', 'education__loan', 'education__housing',
        'housing__loan', 'housing__poutcome', 'contact__housing',
        'contact__loan', 'marital__y', 'contact__job',
        'education__poutcome', 'default__marital', 'job__month', 'job__y',
        'default__loan', 'education__marital', 'default__poutcome',
        'default__y', 'contact__month', 'education__month',
        'contact__education', 'contact__poutcome', 'job__marital',
        'education__job', 'job__loan', 'contact__y', 'month__y',
        'default__housing', 'default__job', 'poutcome__y', 'loan__marital',
        'education__y', 'loan__month', 'marital__month', 'housing__y',
        'marital__poutcome',
    }
    unexpected = set(generated.columns.to_list()) - expected_columns
    assert not unexpected
def train_bankdata(self, data_partition):
    """Run a short HyperGBM random search on the supplied data partition.

    Args:
        data_partition: zero-argument callable returning the tuple
            ``(X_train, X_test, y_train, y_test)``.

    Returns:
        A ``(estimator, hk)`` pair: the estimator retrained on the best
        trial's space sample, and the ``HyperGBM`` search object.

    Raises:
        AssertionError: if the number of predictions differs from the
            number of rows in ``X_test``.
    """
    rs = RandomSearcher(search_space_general, optimize_direction=OptimizeDirection.Maximize)
    hk = HyperGBM(
        rs,
        task='classification',
        reward_metric='accuracy',
        cache_dir=f'{test_output_dir}/hypergbm_cache',
        callbacks=[
            SummaryCallback(),
            FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs'),
        ],
    )

    # All data comes from the caller's partition; the bank dataset is not
    # loaded here because nothing in this helper would use it.
    X_train, X_test, y_train, y_test = data_partition()

    hk.search(X_train, y_train, X_test, y_test, max_trails=3)
    best_trial = hk.get_best_trail()
    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)

    score = estimator.predict(X_test)
    # Evaluation is exercised so that failures surface; its result is not checked.
    estimator.evaluate(X_test, y_test)
    # One prediction per test row, whatever partition size the caller supplies.
    assert len(score) == len(X_test)
    return estimator, hk