def test_cross_val_stack(self):
    """Compare ``CrossValStack.cross_val_stack`` output with single models.

    Tunes XGBoost, random-forest and extra-trees parameters on the training
    split of the digits data, runs the stacking routine, and prints the ROC
    AUC of each individually fitted model, of each of the first three
    stacked prediction columns and of their plain average.  The only
    assertion is that the stacking result holds five elements.
    """
    x, y = DataGenerator.get_digits_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    xgb_initparam = ParamsGenerator.get_xgb_init_param()
    rf_initparam = ParamsGenerator.get_rf_init_param()
    ext_initparam = ParamsGenerator.get_ext_init_param()

    xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
    rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
    ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

    res = CrossValStack.cross_val_stack(x_train, y_train, x_test,
                                        xgb_bestparam, rf_bestparam, ext_bestparam)

    # Column 1 of each of the first three result arrays becomes p1..p3.
    # NOTE(review): presumably the positive-class probability - confirm.
    dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
    dfres.columns = ['p1', 'p2', 'p3']

    y_test_xgb = CrossValStack.predict_opt_clf(
        XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
    y_test_skl = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
    y_test_ext = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

    stacked_mean = (dfres.p1 + dfres.p2 + dfres.p3).values / 3

    # Each score is printed once; the original repeated the p1 and averaged
    # scores a second time.  print(...) with a single argument behaves the
    # same on Python 2 and Python 3.
    for scores in (y_test_xgb, y_test_skl, y_test_ext,
                   dfres.p1.values, dfres.p2.values, dfres.p3.values,
                   stacked_mean):
        print(metrics.roc_auc_score(y_test, scores))

    self.assertEqual(len(res), 5)
def test_cross_val_stack(self):
    """Run chaos feature importance on digits data plus a shadow copy.

    Two pixel columns are cast to strings to obtain categorical features.
    The training frame is duplicated into a "shadow" frame, both are tagged
    with a ``source`` column (0 for the original rows, 1 for the shadow) and
    concatenated before calling ``ChaosGeneration.chaos_feature_importance``.
    The test passes when the filled dictionary has more entries than the
    frames have columns.

    NOTE(review): despite its name this test does not call
    ``cross_val_stack``; another test with the same name appears in the
    visible source - if both live in one class only the later one runs.
    """
    x, y = DataGenerator.get_digits_data()
    # In order to obtain some categorical columns
    x['i63'] = x['i63'].map(str)
    x['i62'] = x['i62'].map(str)
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.33, random_state=42)

    dic = {}
    # The copy is taken before 'source' is added, so each frame gets its own flag.
    x_shadow = x_train.copy()
    x_train.loc[:, 'source'] = 0
    x_shadow.loc[:, 'source'] = 1
    x_all = pd.concat([x_train, x_shadow])
    shadow_selector = x_all['source'] == 0

    ChaosGeneration.chaos_feature_importance(x_all, y_train, shadow_selector,
                                             feat_dic=dic, feat_iter=10,
                                             nb_features=20, chaos_gen_iter=30)

    # The original also built a sorted copy of dic.items() that was never
    # read; it has been dropped.
    self.assertGreater(len(dic), len(x_train.columns))
    self.assertGreater(len(dic), len(x_shadow.columns))
def test_cross_val_meta_stack(self):
    """Compare ``CrossValStack.cross_val_meta_stack`` with single models.

    Tunes XGBoost, random-forest and extra-trees parameters on the training
    split of the digits data, runs the meta-stacking routine with
    ``csvstack_cv=3`` and prints the ROC AUC of each individually fitted
    model and of the plain average of the three stacked columns.  The only
    assertion is that the result holds three elements.
    """
    x, y = DataGenerator.get_digits_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    xgb_initparam = ParamsGenerator.get_xgb_init_param()
    rf_initparam = ParamsGenerator.get_rf_init_param()
    ext_initparam = ParamsGenerator.get_ext_init_param()

    xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
    rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
    ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

    res = CrossValStack.cross_val_meta_stack(x_train, y_train, x_test,
                                             xgb_bestparam, rf_bestparam, ext_bestparam,
                                             csvstack_cv=3)

    # Column 1 of each result array becomes p1..p3.
    # NOTE(review): presumably the positive-class probability - confirm.
    dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
    dfres.columns = ['p1', 'p2', 'p3']

    y_test_xgb = CrossValStack.predict_opt_clf(
        XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
    y_test_skl = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
    y_test_ext = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(metrics.roc_auc_score(y_test, y_test_xgb))
    print(metrics.roc_auc_score(y_test, y_test_skl))
    print(metrics.roc_auc_score(y_test, y_test_ext))
    print(metrics.roc_auc_score(y_test, (dfres.p1 + dfres.p2 + dfres.p3).values / 3))

    self.assertEqual(len(res), 3)
def test_kerasopt_auc(self):
    """Run a ``KerasOpt`` hyper-parameter search on digits with the AUC metric.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``kerasopt.score`` ends up below -0.85.
    """
    x, y = DataGenerator.get_digits_data()
    kerasopt = KerasOpt.KerasOpt(x, y)

    # The parameter space is a class-level attribute; copy it so the key set
    # below does not persist in the shared object after this test.
    param = dict(HyperoptParam.HyperoptParam.param_space_reg_keras_dnn)
    param['eval_metric'] = 'auc'

    best = kerasopt.run_hp(param)

    self.assertIsNotNone(best)
    # NOTE(review): the negative bound suggests the score is a negated AUC
    # (lower is better) - confirm against KerasOpt.
    self.assertLess(kerasopt.score, -0.85)
def test_handle_columns_entropy_inf(self):
    """Check ``Numerisation.handle_columns`` with an infinite entropy cap.

    Two string columns are derived from the first iris column.  With
    ``numerize_entropy_max=np.inf`` the test expects both of them in the
    "otherisation" list and none in the "to be defined" list.
    """
    x, _ = DataGenerator.get_iris_data()
    first_col = x.columns[0]
    x['feat_tobedef'] = x[first_col].map(str)
    x['feat_toother'] = x[first_col].map(lambda value: str(int(3 * value)))

    column_groups = Numerisation.handle_columns(x, [], [], numerize_entropy_max=np.inf)
    _, _, tootherisation_cols, tobedefined_cols = column_groups

    self.assertEqual(len(tootherisation_cols), 2)
    self.assertEqual(len(tobedefined_cols), 0)
def test_xgbopt_tree_auc(self):
    """Run an ``XGBOpt`` tree search on digits with the AUC metric.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``xgbopt.score`` ends up below -0.99.
    """
    x, y = DataGenerator.get_digits_data()
    xgbopt = XGBOpt.XGBOpt(x, y)

    # Copy the class-level parameter space: the logloss variant of this test
    # writes 'max_evals' and 'eval_metric' into the same attribute, so
    # mutating it in place makes the outcome depend on test order.
    param = dict(HyperoptParam.HyperoptParam.param_space_reg_xgb_tree)
    param['eval_metric'] = 'auc'

    best = xgbopt.run_hp(param)

    self.assertIsNotNone(best)
    # NOTE(review): the negative bound suggests the score is a negated AUC
    # (lower is better) - confirm against XGBOpt.
    self.assertLess(xgbopt.score, -0.99)
def test_lropt_logloss(self):
    """Run a ``SklearnOpt`` logistic-regression search on digits with logloss.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``skopt.score`` ends up below 0.011.
    """
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)

    # The parameter space is a class-level attribute; copy it so the keys set
    # below do not persist in the shared object after this test.
    param = dict(HyperoptParam.HyperoptParam.param_space_clf_skl_lr)
    param['eval_metric'] = 'logloss'
    param['type'] = 'logistic_regression'

    best = skopt.run_hp(param)

    self.assertIsNotNone(best)
    self.assertLess(skopt.score, 0.011)
def test_etopt_logloss(self):
    """Run a ``SklearnOpt`` extra-trees search on digits with logloss.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``skopt.score`` ends up below 0.03.
    """
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)

    # Copy the class-level parameter space: the random-forest test writes
    # its own 'type' and 'eval_metric' into the same attribute, so mutating
    # it in place couples the two tests.
    param = dict(HyperoptParam.HyperoptParam.param_space_reg_skl_rf)
    param['eval_metric'] = 'logloss'
    param['type'] = 'extra_trees'

    best = skopt.run_hp(param)

    self.assertIsNotNone(best)
    self.assertLess(skopt.score, 0.03)
def test_rfopt_auc(self):
    """Run a ``SklearnOpt`` random-forest search on digits with the AUC metric.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``skopt.score`` ends up below -0.99.
    """
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)

    # Copy the class-level parameter space: the extra-trees test writes its
    # own 'type' and 'eval_metric' into the same attribute, so mutating it
    # in place couples the two tests.
    param = dict(HyperoptParam.HyperoptParam.param_space_reg_skl_rf)
    param['eval_metric'] = 'auc'
    param['type'] = 'random_forest'

    best = skopt.run_hp(param)

    self.assertIsNotNone(best)
    # NOTE(review): the negative bound suggests the score is a negated AUC
    # (lower is better) - confirm against SklearnOpt.
    self.assertLess(skopt.score, -0.99)
def test_xgbopt_tree_logloss(self):
    """Run an ``XGBOpt`` tree search on digits with logloss and 10 evaluations.

    Passes when ``run_hp`` returns something other than ``None`` and
    ``xgbopt.score`` ends up below 0.04.
    """
    x, y = DataGenerator.get_digits_data()
    xgbopt = XGBOpt.XGBOpt(x, y)

    # Copy the class-level parameter space: written in place, the
    # 'max_evals' override below would stay in the shared attribute and
    # silently apply to the AUC variant of this test as well.
    param = dict(HyperoptParam.HyperoptParam.param_space_reg_xgb_tree)
    param['max_evals'] = 10
    param['eval_metric'] = 'logloss'

    best = xgbopt.run_hp(param)

    self.assertIsNotNone(best)
    self.assertLess(xgbopt.score, 0.04)
def test_treat_dataframe(self):
    """Check that ``Numerisation.treat_dataframe`` yields only real values.

    Three string columns are added to the iris frame (one constant, two
    derived from the first column).  The column groups returned by
    ``handle_columns`` are passed to ``treat_dataframe``, and every cell of
    the result must satisfy ``np.isreal``.
    """
    x, _ = DataGenerator.get_iris_data()
    first_col = x.columns[0]
    x['feat_todummy'] = 'test'
    x['feat_tobedef'] = x[first_col].map(str)
    x['feat_toother'] = x[first_col].map(lambda value: str(int(3 * value)))

    column_groups = Numerisation.handle_columns(x, [], [])
    num_cols, todummy_cols, tootherisation_cols, _ = column_groups

    treated = Numerisation.treat_dataframe(x, num_cols, [], todummy_cols, tootherisation_cols)

    # Row-wise all(), then all() over the rows: every single cell must be real.
    every_cell_real = treated.applymap(np.isreal).all(1).all()
    self.assertTrue(every_cell_real)
def test_handle_columns(self):
    """Check how ``Numerisation.handle_columns`` groups the iris columns.

    Three string columns are added to the iris frame (one constant, two
    derived from the first column).  The test expects four numeric columns
    and exactly one column in each of the dummy, "otherisation" and
    "to be defined" groups.
    """
    x, _ = DataGenerator.get_iris_data()
    first_col = x.columns[0]
    x['feat_todummy'] = 'test'
    x['feat_tobedef'] = x[first_col].map(str)
    x['feat_toother'] = x[first_col].map(lambda value: str(int(3 * value)))

    column_groups = Numerisation.handle_columns(x, [], [])
    num_cols, todummy_cols, tootherisation_cols, tobedefined_cols = column_groups

    expected_sizes = (
        (num_cols, 4),
        (todummy_cols, 1),
        (tootherisation_cols, 1),
        (tobedefined_cols, 1),
    )
    for group, size in expected_sizes:
        self.assertEqual(len(group), size)
def test_cross_val_stack_none(self):
    """Run chaos feature importance on digits data without a shadow frame.

    Two pixel columns are cast to strings to obtain categorical features.
    Every training row is tagged ``source == 0``, so the selector passed to
    ``ChaosGeneration.chaos_feature_importance`` is true for all rows.  The
    test passes when the filled dictionary has more entries than the
    training frame has columns.
    """
    x, y = DataGenerator.get_digits_data()
    # In order to obtain some categorical columns
    x["i63"] = x["i63"].map(str)
    x["i62"] = x["i62"].map(str)
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.33, random_state=42)

    dic = {}
    x_train.loc[:, "source"] = 0
    shadow_selector = x_train["source"] == 0

    ChaosGeneration.chaos_feature_importance(
        x_train, y_train, shadow_selector, feat_dic=dic, feat_iter=10, nb_features=20, chaos_gen_iter=30
    )

    # The original also built a sorted copy of dic.items() that was never
    # read; it has been dropped.
    self.assertGreater(len(dic), len(x_train.columns))
def test_random_forest(self):
    """Tune and evaluate a random forest on the adult data set.

    Splits the data 80/20, numerises both parts, searches parameters with
    ``Cvs.get_best_sklopt`` and prints the ROC AUC and log loss of the
    validation predictions.

    NOTE(review): this test makes no assertion; it only fails if one of the
    calls raises.
    """
    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    # Return value is not used; presumably x is modified in place - confirm.
    MissingValues.add_miss_val_indicator(x)

    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train_1, x_valid_1 = Automaton.numerize(x_train, x_valid)

    sklparam = Cvs.get_best_sklopt(x_train_1, y_train, ParamsGenerator.get_rf_init_param())
    skopt = SklearnOpt.SklearnOpt(x_train_1, y_train)
    y_pred_valid, _ = Cvs.predict_opt_clf(skopt, sklparam, x_valid_1, x_valid_1)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Random Forest')
    print(metrics.roc_auc_score(y_valid, y_pred_valid))
    print(metrics.log_loss(y_valid, y_pred_valid))
def test_full_robot(self):
    """Run ``Automaton.robot`` end to end on the adult data set.

    Splits the data 80/20, feeds the initial parameters from
    ``self.get_params()`` to the robot, collapses the stacking result into
    one prediction and prints its ROC AUC and log loss on the validation
    part.

    NOTE(review): this test makes no assertion; it only fails if one of the
    calls raises.
    """
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    # Return value is not used; presumably x is modified in place - confirm.
    MissingValues.add_miss_val_indicator(x)

    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

    ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()
    res = Automaton.robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_args)
    y_pred_valid = Misc.stacking_res_to_one_pred(res)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Full Robot')
    print(metrics.roc_auc_score(y_valid, y_pred_valid))
    print(metrics.log_loss(y_valid, y_pred_valid))
def test_handle_nocategoric_nonreg(self):
    """Run chaos feature importance on the unmodified digits data.

    No column is cast to a string here.  Every training row is tagged
    ``source == 0``, so the selector handed to
    ``ChaosGeneration.chaos_feature_importance`` is true for all rows.  The
    test passes when the filled dictionary has more entries than the
    training frame has columns.
    """
    x, y = DataGenerator.get_digits_data()
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.33, random_state=42)

    x_train.loc[:, 'source'] = 0
    shadow_selector = x_train['source'] == 0

    dic = {}
    chaos_options = dict(feat_dic=dic, feat_iter=10, nb_features=20, chaos_gen_iter=30)
    ChaosGeneration.chaos_feature_importance(x_train, y_train, shadow_selector, **chaos_options)

    column_count = len(x_train.columns)
    self.assertGreater(len(dic), column_count)
def test_tiny_robot(self):
    """Run ``Automaton.tiny_robot`` on the adult data set with tuned parameters.

    Splits the data 80/20, numerises the training part, searches
    random-forest, extra-trees and XGBoost parameters on it, passes each
    result through ``Misc.enhance_param`` and hands them to the tiny robot
    together with the raw (non-numerised) frames.  The stacking result is
    collapsed into one prediction whose ROC AUC and log loss are printed.

    NOTE(review): this test makes no assertion; it only fails if one of the
    calls raises.
    """
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    # Return value is not used; presumably x is modified in place - confirm.
    MissingValues.add_miss_val_indicator(x)

    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

    ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()

    # Only the numerised training frame is needed for the parameter search.
    x_train_num, _ = Automaton.numerize(x_train, x_valid, **robot_args)

    rf_rp = Misc.enhance_param(Cvs.get_best_sklopt(x_train_num, y_train, rf_ip), **robot_args)
    ext_rp = Misc.enhance_param(Cvs.get_best_etopt(x_train_num, y_train, ext_ip), **robot_args)
    xgb_rp = Misc.enhance_param(Cvs.get_best_xgbopt(x_train_num, y_train, xgb_ip), **robot_args)

    res = Automaton.tiny_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_args)
    y_pred_valid = Misc.stacking_res_to_one_pred(res)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Tiny Robot')
    print(metrics.roc_auc_score(y_valid, y_pred_valid))
    print(metrics.log_loss(y_valid, y_pred_valid))