def test_cross_val_meta_stack(self):
    x, y = DataGenerator.get_digits_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    # Tune each base learner with its own hyperopt search before stacking.
    xgb_initparam = ParamsGenerator.get_xgb_init_param()
    rf_initparam = ParamsGenerator.get_rf_init_param()
    ext_initparam = ParamsGenerator.get_ext_init_param()
    xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
    rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
    ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

    # Meta-stack the three tuned learners with 3-fold cross-validation; res holds
    # three probability matrices whose positive-class columns are averaged below.
    res = CrossValStack.cross_val_meta_stack(x_train, y_train, x_test,
                                             xgb_bestparam, rf_bestparam, ext_bestparam,
                                             csvstack_cv=3)
    dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
    dfres.columns = ['p1', 'p2', 'p3']

    # Compare the averaged stacked prediction against each individual learner.
    y_test_xgb = CrossValStack.predict_opt_clf(XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
    y_test_skl = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
    y_test_ext = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]
    print(metrics.roc_auc_score(y_test, y_test_xgb))
    print(metrics.roc_auc_score(y_test, y_test_skl))
    print(metrics.roc_auc_score(y_test, y_test_ext))
    print(metrics.roc_auc_score(y_test, (dfres.p1 + dfres.p2 + dfres.p3).values / 3))
    self.assertEqual(len(res), 3)
def stack_that(x_train, y_train, x_test, train_idx, stack_idx, rfparams, extparams, xgbparams):
    # Split the training data into a fitting part and a stacking part.
    x_train_train = x_train.iloc[train_idx]
    y_train_train = y_train.iloc[train_idx]
    x_train_stack = x_train.iloc[stack_idx]
    y_train_stack = y_train.iloc[stack_idx]

    logging.info(" >>> DGH >>> prediction")
    # Level-0 learners (XGBoost, random forest, extra trees), each fitted on the
    # fitting part and predicting both the stacking part and the test set.
    xgbopt = XGBOpt.XGBOpt(x_train_train, y_train_train)
    y_pred_stack_1, y_pred_test_1 = predict_opt_clf(xgbopt, xgbparams, x_train_stack, x_test)
    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_2, y_pred_test_2 = predict_opt_clf(skopt, rfparams, x_train_stack, x_test)
    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_3, y_pred_test_3 = predict_opt_clf(skopt, extparams, x_train_stack, x_test)

    logging.info(" >>> DGH >>> prediction => stacking")
    # Level-1 learner: logistic regression fitted on the out-of-fold predictions.
    x_pred_stack = pd.DataFrame(np.transpose(np.array([y_pred_stack_1, y_pred_stack_2, y_pred_stack_3])))
    x_pred_test = pd.DataFrame(np.transpose(np.array([y_pred_test_1, y_pred_test_2, y_pred_test_3])))
    lr = LogisticRegression()
    lr.fit(x_pred_stack, y_train_stack)
    return lr.predict_proba(x_pred_test)
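# A minimal usage sketch (an assumption, not the library's own cross_val_stack driver):
# stack_that can be fed positional fold indices from StratifiedKFold, assuming pandas
# inputs and parameter dicts from get_best_xgbopt / get_best_sklopt / get_best_etopt.
# The helper name and the fold averaging below are illustrative only.
def _stack_with_kfold_sketch(x_train, y_train, x_test, rfparams, extparams, xgbparams, n_splits=3):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_probas = []
    for train_idx, stack_idx in skf.split(x_train, y_train):
        fold_probas.append(stack_that(x_train, y_train, x_test,
                                      train_idx, stack_idx,
                                      rfparams, extparams, xgbparams))
    # Average the positive-class probabilities across folds.
    return np.mean([p[:, 1] for p in fold_probas], axis=0)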
def meta_stack_that(x_train, y_train, x_test, train_idx, stack_idx, rfparams, extparams, xgbparams):
    pca = PCA(n_components=10)
    kmeans = KMeans(n_clusters=3)

    # Split the training data into a fitting part and a stacking part.
    x_train_train = x_train.iloc[train_idx]
    y_train_train = y_train.iloc[train_idx]
    x_train_stack = x_train.iloc[stack_idx]
    y_train_stack = y_train.iloc[stack_idx]

    logging.info(" >>> DGH >>> kmean-pca")
    # Cluster the PCA-reduced rows and one-hot encode the cluster ids. This assumes
    # every cluster appears in both the stack and test splits so the dummy columns match.
    x_train_stack_cls = kmeans.fit_predict(pca.fit_transform(x_train_stack))
    x_test_stack_cls = kmeans.predict(pca.transform(x_test))
    x_cls_stack = pd.get_dummies(x_train_stack_cls, prefix='cls').reset_index(drop=True)
    x_cls_test = pd.get_dummies(x_test_stack_cls, prefix='cls').reset_index(drop=True)

    logging.info(" >>> DGH >>> kmean-pca => prediction")
    # Level-0 learners, fitted on the fitting part.
    xgbopt = XGBOpt.XGBOpt(x_train_train, y_train_train)
    y_pred_stack_1, y_pred_test_1 = predict_opt_clf(xgbopt, xgbparams, x_train_stack, x_test)
    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_2, y_pred_test_2 = predict_opt_clf(skopt, rfparams, x_train_stack, x_test)
    skopt = SklearnOpt.SklearnOpt(x_train_train, y_train_train)
    y_pred_stack_3, y_pred_test_3 = predict_opt_clf(skopt, extparams, x_train_stack, x_test)

    logging.info(" >>> DGH >>> kmean-pca => prediction => stacking")
    x_pred_stack = pd.DataFrame(np.transpose(np.array([y_pred_stack_1, y_pred_stack_2, y_pred_stack_3])))
    x_pred_test = pd.DataFrame(np.transpose(np.array([y_pred_test_1, y_pred_test_2, y_pred_test_3])))

    # Cross the cluster indicators with the level-0 predictions ('ms_*' columns),
    # then fit the level-1 logistic regression on those interaction features only.
    for col1 in x_cls_stack.columns:
        for col2 in x_pred_stack.columns:
            x_cls_stack['ms_' + str(col1) + '_' + str(col2)] = \
                x_cls_stack[col1] * x_pred_stack[col2].reset_index(drop=True)
            x_cls_test['ms_' + str(col1) + '_' + str(col2)] = \
                x_cls_test[col1] * x_pred_test[col2].reset_index(drop=True)
    lr = LogisticRegression()
    lr.fit(x_cls_stack[[c for c in x_cls_stack.columns if c.startswith('ms')]], y_train_stack)
    return lr.predict_proba(x_cls_test[[c for c in x_cls_test.columns if c.startswith('ms')]])
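# A standalone sketch of the 'ms_*' interaction features built in meta_stack_that
# (toy values, not project data): each interaction column is a cluster dummy times
# one base-learner probability column, so with 3 clusters and 3 base learners the
# level-1 logistic regression sees 3 x 3 = 9 features and can effectively weight
# each base learner differently per cluster.
def _ms_interaction_sketch():
    cls = pd.get_dummies([0, 1, 2, 1], prefix='cls')        # cluster ids of 4 rows
    preds = pd.DataFrame({0: [0.9, 0.2, 0.4, 0.7],
                          1: [0.8, 0.3, 0.5, 0.6],
                          2: [0.7, 0.1, 0.6, 0.8]})         # base-learner probabilities
    return pd.DataFrame({'ms_%s_%s' % (c1, c2): cls[c1] * preds[c2]
                         for c1 in cls.columns for c2 in preds.columns})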
def test_lropt_logloss(self):
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)
    param = HyperoptParam.HyperoptParam.param_space_clf_skl_lr
    param['eval_metric'] = 'logloss'
    param['type'] = 'logistic_regression'
    best = skopt.run_hp(param)
    self.assertIsNotNone(best)
    self.assertLess(skopt.score, 0.011)
def test_etopt_logloss(self):
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)
    param = HyperoptParam.HyperoptParam.param_space_reg_skl_rf
    param['eval_metric'] = 'logloss'
    param['type'] = 'extra_trees'
    best = skopt.run_hp(param)
    self.assertIsNotNone(best)
    self.assertLess(skopt.score, 0.03)
def test_rfopt_auc(self):
    x, y = DataGenerator.get_digits_data()
    skopt = SklearnOpt.SklearnOpt(x, y)
    param = HyperoptParam.HyperoptParam.param_space_reg_skl_rf
    param['eval_metric'] = 'auc'
    param['type'] = 'random_forest'
    best = skopt.run_hp(param)
    self.assertIsNotNone(best)
    # AUC is maximised by minimising its negative, hence the negative threshold.
    self.assertLess(skopt.score, -0.99)
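# A minimal sketch of the kind of search the run_hp calls above presumably wrap
# (an assumption about the implementation, not the project's code): hyperopt's TPE
# minimises cross-validated log loss over a small random-forest space. The space
# and estimator here are illustrative, not param_space_reg_skl_rf.
def _run_hp_sketch(x, y, max_evals=20):
    from hyperopt import Trials, fmin, hp, tpe
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    space = {'n_estimators': hp.quniform('n_estimators', 50, 500, 50),
             'max_depth': hp.quniform('max_depth', 2, 10, 1)}

    def objective(params):
        clf = RandomForestClassifier(n_estimators=int(params['n_estimators']),
                                     max_depth=int(params['max_depth']),
                                     random_state=42)
        # hyperopt minimises, so return the positive log loss.
        return -cross_val_score(clf, x, y, scoring='neg_log_loss', cv=3).mean()

    return fmin(objective, space, algo=tpe.suggest, max_evals=max_evals, trials=Trials())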
def test_random_forest(self):
    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    MissingValues.add_miss_val_indicator(x)
    x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train_1, x_valid_1 = Automaton.numerize(x_train, x_valid)

    # Tune a random forest on the numerised training set, then score it on the validation split.
    sklparam = Cvs.get_best_sklopt(x_train_1, y_train, ParamsGenerator.get_rf_init_param())
    skopt = SklearnOpt.SklearnOpt(x_train_1, y_train)
    y_pred_valid, _ = Cvs.predict_opt_clf(skopt, sklparam, x_valid_1, x_valid_1)
    print('Random Forest')
    print(metrics.roc_auc_score(y_valid, y_pred_valid))
    print(metrics.log_loss(y_valid, y_pred_valid))
def get_best_etopt(x, y, params):
    return SklearnOpt.SklearnOpt(x, y).run_hp(params), params