def robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_kwargs):
    """Fully automated pipeline over three nested KFold levels.

    Level 1 holds out a fold for chaos feature generation, level 2 holds out
    a fold for hyper-parameter search, level 3 performs cross-val stacking.
    Returns a list with one ``stack_res`` (the per-fold outputs of
    ``Cvs.stack_that``) per completed level-2 fold; stops early once
    ``robot_nb_auto_max`` rounds are done (-1 disables the early stop).

    NOTE(review): x_valid is rebound to its chaosized version on every outer
    fold, while x_train1 is rebuilt from the raw x_train each time — confirm
    this cumulative transformation of x_valid is intended.
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_hopt = robot_kwargs.get('robot_cv_hopt', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    max_rounds = robot_kwargs.get('robot_nb_auto_max', -1)
    seed = robot_kwargs.get('robot_rand_state', 42)

    res = []
    rounds_done = 0

    feat_folds = KFold(len(x_train), n_folds=cv_feat, shuffle=True,
                       random_state=seed)
    for train1_idx, feat_idx in feat_folds:
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info(" >>> DGH >>> Chaos feature generation")
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat,
                                     **robot_kwargs)
        logging.info(" >>> DGH >>> Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        hopt_folds = KFold(len(x_train_num), n_folds=cv_hopt, shuffle=True,
                           random_state=seed)
        for train2_idx, hopt_idx in hopt_folds:
            x_train2 = x_train_num.iloc[train2_idx]
            y_train2 = y_train1.iloc[train2_idx]
            x_hopt = x_train_num.iloc[hopt_idx]
            y_hopt = y_train1.iloc[hopt_idx]

            logging.info(" >>> DGH >>> Looking for hopt parameters")
            # Tune each model family on the held-out hopt fold.
            rf_rp = Misc.enhance_param(
                Cvs.get_best_sklopt(x_hopt, y_hopt, rf_ip), **robot_kwargs)
            ext_rp = Misc.enhance_param(
                Cvs.get_best_etopt(x_hopt, y_hopt, ext_ip), **robot_kwargs)
            xgb_rp = Misc.enhance_param(
                Cvs.get_best_xgbopt(x_hopt, y_hopt, xgb_ip), **robot_kwargs)

            logging.info(" >>> DGH >>> Cross-val-stacking")
            stack_res = []
            # Innermost KFold is deliberately left unseeded, as in the original.
            for train3_idx, stack_idx in KFold(len(x_train2),
                                               n_folds=cv_stack, shuffle=True):
                stack_res.append(
                    Cvs.stack_that(x_train2, y_train2, x_valid_num,
                                   train3_idx, stack_idx,
                                   rf_rp, ext_rp, xgb_rp))
            res.append(stack_res)

            rounds_done += 1
            if rounds_done == max_rounds:
                return res
    return res
def robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_kwargs):
    """Automated modelling loop: chaos features, cleaning, hyper-opt, stacking.

    Three nested KFold levels: level 1 reserves a fold for feature generation,
    level 2 reserves a fold for hyper-parameter search, level 3 does the
    cross-val stacking. Returns a list with one ``stack_res`` (list of
    ``Cvs.stack_that`` outputs) per completed level-2 fold; stops early after
    ``robot_nb_auto_max`` rounds (-1 means never).

    NOTE(review): x_valid is rebound each outer fold with its chaosized
    version while x_train1 is rebuilt from raw x_train — confirm the
    cumulative transformation of x_valid is intended.
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_hopt = robot_kwargs.get('robot_cv_hopt', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    max_rounds = robot_kwargs.get('robot_nb_auto_max', -1)
    seed = robot_kwargs.get('robot_rand_state', 42)

    def tune(finder, x_hopt, y_hopt, init_param):
        # Run one model family's hyper-opt on the held-out fold.
        return Misc.enhance_param(finder(x_hopt, y_hopt, init_param),
                                  **robot_kwargs)

    res, rounds = [], 0
    for train1_idx, feat_idx in KFold(len(x_train), n_folds=cv_feat,
                                      shuffle=True, random_state=seed):
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info("Chaos feature generation")
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat,
                                     **robot_kwargs)
        logging.info("Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        for train2_idx, hopt_idx in KFold(len(x_train_num), n_folds=cv_hopt,
                                          shuffle=True, random_state=seed):
            x_train2 = x_train_num.iloc[train2_idx]
            y_train2 = y_train1.iloc[train2_idx]
            x_hopt = x_train_num.iloc[hopt_idx]
            y_hopt = y_train1.iloc[hopt_idx]

            logging.info("Looking for hopt parameters")
            rf_rp = tune(Cvs.get_best_sklopt, x_hopt, y_hopt, rf_ip)
            ext_rp = tune(Cvs.get_best_etopt, x_hopt, y_hopt, ext_ip)
            xgb_rp = tune(Cvs.get_best_xgbopt, x_hopt, y_hopt, xgb_ip)

            logging.info("Starting cross-val-stacking")
            # Innermost KFold is unseeded, as in the original.
            stack_res = [Cvs.stack_that(x_train2, y_train2, x_valid_num,
                                        train3_idx, stack_idx,
                                        rf_rp, ext_rp, xgb_rp)
                         for train3_idx, stack_idx
                         in KFold(len(x_train2), n_folds=cv_stack,
                                  shuffle=True)]
            res.append(stack_res)

            rounds += 1
            if rounds == max_rounds:
                return res
    return res
def tiny_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_kwargs):
    """Minimal pipeline: numerize the features, then cross-val-stack with
    already-tuned model parameters (no chaos features, no hyper-opt).

    Unlike ``robot``/``small_robot`` this takes ready-to-use parameters
    (``rf_rp``/``ext_rp``/``xgb_rp``) rather than initial search spaces.
    Returns ``[stack_res]`` where ``stack_res`` holds one ``Cvs.stack_that``
    output per stacking fold; ``robot_nb_auto_max`` caps the number of folds
    actually run (-1 disables the cap).
    """
    robot_cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    robot_nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)
    res = []
    nb_auto = 0
    logging.info(" >>> DGH >>> Feature cleaning")
    x_train_num, x_valid_num = numerize(x_train, x_valid, **robot_kwargs)
    stack_res = []
    logging.info(" >>> DGH >>> Cross-val-stacking")
    for train1_idx, stack_idx in tqdm(KFold(len(x_train_num),
                                            n_folds=robot_cv_stack,
                                            shuffle=True),
                                      nested=True, desc='cv2'):
        y_probas = Cvs.stack_that(x_train_num, y_train, x_valid_num,
                                  train1_idx, stack_idx,
                                  rf_rp, ext_rp, xgb_rp)
        stack_res.append(y_probas)
        nb_auto += 1
        if nb_auto == robot_nb_auto_max:
            # BUG FIX: the original did `return res` here while `res` was
            # still empty (the only append happened after the loop), so an
            # early stop silently discarded every computed prediction.
            # Stop iterating instead and fall through to return the partial
            # stack results.
            break
    res.append(stack_res)
    return res
def test_cross_val_meta_stack(self):
    """cross_val_meta_stack should return one probability set per base model.

    Also prints the AUC of each tuned model and of the blended meta-stack
    as a manual sanity check.
    """
    x, y = DataGenerator.get_digits_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    # Initial hyper-parameter search spaces for each model family.
    xgb_space = ParamsGenerator.get_xgb_init_param()
    rf_space = ParamsGenerator.get_rf_init_param()
    ext_space = ParamsGenerator.get_ext_init_param()
    # Tuned parameters found on the training split.
    xgb_best = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_space)
    rf_best = CrossValStack.get_best_sklopt(x_train, y_train, rf_space)
    ext_best = CrossValStack.get_best_etopt(x_train, y_train, ext_space)
    res = CrossValStack.cross_val_meta_stack(x_train, y_train, x_test,
                                             xgb_best, rf_best, ext_best,
                                             csvstack_cv=3)
    stacked = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
    stacked.columns = ['p1', 'p2', 'p3']
    y_test_xgb = CrossValStack.predict_opt_clf(
        XGBOpt.XGBOpt(x_train, y_train), xgb_best, x_test, x_test)[0]
    y_test_skl = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), rf_best, x_test, x_test)[0]
    y_test_ext = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), ext_best, x_test, x_test)[0]
    print(metrics.roc_auc_score(y_test, y_test_xgb))
    print(metrics.roc_auc_score(y_test, y_test_skl))
    print(metrics.roc_auc_score(y_test, y_test_ext))
    print(metrics.roc_auc_score(
        y_test, (stacked.p1 + stacked.p2 + stacked.p3).values / 3))
    self.assertEqual(len(res), 3)
def test_cross_val_stack(self):
    """cross_val_stack should return 5 result sets (one per stacking fold).

    Also prints the AUC of each tuned model, of each stacked probability
    column, and of their average as a manual sanity check.
    """
    x, y = DataGenerator.get_digits_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    xgb_initparam = ParamsGenerator.get_xgb_init_param()
    rf_initparam = ParamsGenerator.get_rf_init_param()
    ext_initparam = ParamsGenerator.get_ext_init_param()
    xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
    rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
    ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)
    res = CrossValStack.cross_val_stack(x_train, y_train, x_test,
                                        xgb_bestparam, rf_bestparam, ext_bestparam)
    dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
    dfres.columns = ['p1', 'p2', 'p3']
    y_test_xgb = CrossValStack.predict_opt_clf(
        XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
    y_test_skl = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
    y_test_ext = CrossValStack.predict_opt_clf(
        SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]
    print(metrics.roc_auc_score(y_test, y_test_xgb))
    print(metrics.roc_auc_score(y_test, y_test_skl))
    print(metrics.roc_auc_score(y_test, y_test_ext))
    print(metrics.roc_auc_score(y_test, dfres.p1.values))
    print(metrics.roc_auc_score(y_test, dfres.p2.values))
    print(metrics.roc_auc_score(y_test, dfres.p3.values))
    # FIX: the original printed the p1 AUC and the averaged AUC twice
    # (copy-paste duplication); each diagnostic is now printed once.
    print(metrics.roc_auc_score(y_test, (dfres.p1 + dfres.p2 + dfres.p3).values / 3))
    self.assertEqual(len(res), 5)
def test_random_forest(self):
    """Tune and evaluate a random forest on the adult data, printing metrics."""
    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    MissingValues.add_miss_val_indicator(x)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=0.2, random_state=42)
    x_train_num, x_valid_num = Automaton.numerize(x_train, x_valid)
    # Hyper-opt the RF on the numerized training data, then score validation.
    best_param = Cvs.get_best_sklopt(x_train_num, y_train,
                                     ParamsGenerator.get_rf_init_param())
    opt = SklearnOpt.SklearnOpt(x_train_num, y_train)
    y_pred_valid, _ = Cvs.predict_opt_clf(opt, best_param,
                                          x_valid_num, x_valid_num)
    print('Random Forest')
    print(metrics.roc_auc_score(y_valid, y_pred_valid))
    print(metrics.log_loss(y_valid, y_pred_valid))
def test_tiny_robot(self):
    """End-to-end check of Automaton.tiny_robot on the adult dataset."""
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
    # loading
    x, y = DataGenerator.get_adult_data()
    # cleaning
    MissingValues.add_miss_val_indicator(x)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=0.2, random_state=42)
    ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()
    x_train_num, _ = Automaton.numerize(x_train, x_valid, **robot_args)
    # Tune each model family on the numerized training data.
    rf_best = Misc.enhance_param(
        Cvs.get_best_sklopt(x_train_num, y_train, rf_ip), **robot_args)
    ext_best = Misc.enhance_param(
        Cvs.get_best_etopt(x_train_num, y_train, ext_ip), **robot_args)
    xgb_best = Misc.enhance_param(
        Cvs.get_best_xgbopt(x_train_num, y_train, xgb_ip), **robot_args)
    stacking = Automaton.tiny_robot(x_train, y_train, x_valid,
                                    rf_best, ext_best, xgb_best, **robot_args)
    pred = Misc.stacking_res_to_one_pred(stacking)
    print('Tiny Robot')
    print(metrics.roc_auc_score(y_valid, pred))
    print(metrics.log_loss(y_valid, pred))
def small_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_kwargs):
    """Two-level pipeline with pre-tuned parameters and tqdm progress bars.

    Level 1 reserves a fold for chaos feature generation; level 2 does
    cross-val stacking with the given ``rf_rp``/``ext_rp``/``xgb_rp``.
    Returns a list with one ``stack_res`` (list of ``Cvs.stack_that``
    outputs) per completed outer fold; stops early after
    ``robot_nb_auto_max`` folds (-1 disables the early stop).

    NOTE(review): x_valid is rebound each outer fold with its chaosized
    version while x_train1 is rebuilt from raw x_train — confirm the
    cumulative transformation of x_valid is intended.
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    max_rounds = robot_kwargs.get('robot_nb_auto_max', -1)
    seed = robot_kwargs.get('robot_rand_state', 42)

    res = []
    outer_folds = tqdm(KFold(len(x_train), n_folds=cv_feat, shuffle=True,
                             random_state=seed), desc='cv1')
    for nb_done, (train1_idx, feat_idx) in enumerate(outer_folds, start=1):
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info(" >>> DGH >>> Chaos feature generation")
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat,
                                     **robot_kwargs)
        logging.info(" >>> DGH >>> Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        logging.info(" >>> DGH >>> Cross-val-stacking")
        stack_res = []
        # Inner KFold is unseeded, as in the original.
        inner_folds = tqdm(KFold(len(x_train_num), n_folds=cv_stack,
                                 shuffle=True),
                           nested=True, desc='cv2')
        for train2_idx, stack_idx in inner_folds:
            stack_res.append(Cvs.stack_that(x_train_num, y_train1, x_valid_num,
                                            train2_idx, stack_idx,
                                            rf_rp, ext_rp, xgb_rp))
        res.append(stack_res)

        if nb_done == max_rounds:
            return res
    return res
def small_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_kwargs):
    """Two-level pipeline with pre-tuned model parameters.

    Level 1 reserves a fold for chaos feature generation; level 2 does the
    cross-val stacking with the supplied ``rf_rp``/``ext_rp``/``xgb_rp``.
    Returns a list with one ``stack_res`` (list of ``Cvs.stack_that``
    outputs) per completed outer fold; stops after ``robot_nb_auto_max``
    folds (-1 means run them all).

    NOTE(review): x_valid is rebound each outer fold with its chaosized
    version while x_train1 is rebuilt from raw x_train — confirm the
    cumulative transformation of x_valid is intended.
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    max_rounds = robot_kwargs.get('robot_nb_auto_max', -1)
    seed = robot_kwargs.get('robot_rand_state', 42)

    res = []
    completed = 0
    for train1_idx, feat_idx in KFold(len(x_train), n_folds=cv_feat,
                                      shuffle=True, random_state=seed):
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info("Chaos feature generation")
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat,
                                     **robot_kwargs)
        logging.info("Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        logging.info("Starting cross-val-stacking")
        # Inner KFold is unseeded, as in the original.
        stack_res = [Cvs.stack_that(x_train_num, y_train1, x_valid_num,
                                    train2_idx, stack_idx,
                                    rf_rp, ext_rp, xgb_rp)
                     for train2_idx, stack_idx
                     in KFold(len(x_train_num), n_folds=cv_stack,
                              shuffle=True)]
        res.append(stack_res)

        completed += 1
        if completed == max_rounds:
            return res
    return res