Example #1
0
def robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_kwargs):
    """Fully automated pipeline with three nested CV levels.

    Outer folds drive chaos feature generation, middle folds drive the
    hyper-parameter search, inner folds do the cross-val stacking.

    :param x_train: training features (pandas-style, indexed with .iloc)
    :param y_train: training target aligned with x_train
    :param x_valid: validation features to score
    :param rf_ip: initial random-forest parameter space
    :param ext_ip: initial extra-trees parameter space
    :param xgb_ip: initial xgboost parameter space
    :param robot_kwargs: pipeline knobs (robot_cv_feat, robot_cv_hopt,
        robot_cv_stack, robot_nb_auto_max, robot_rand_state) plus options
        forwarded to chaosize/numerize/enhance_param
    :return: list of per-iteration stacking results (lists of y_probas)
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_hopt = robot_kwargs.get('robot_cv_hopt', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)
    rand_state = robot_kwargs.get('robot_rand_state', 42)

    res = []
    nb_auto = 0

    outer_folds = KFold(len(x_train), n_folds=cv_feat, shuffle=True,
                        random_state=rand_state)
    for idx_a, idx_feat in outer_folds:
        x_train1, y_train1 = x_train.iloc[idx_a], y_train.iloc[idx_a]
        x_feat, y_feat = x_train.iloc[idx_feat], y_train.iloc[idx_feat]

        logging.info(" >>> DGH >>> Chaos feature generation")
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat,
                                     **robot_kwargs)

        logging.info(" >>> DGH >>> Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        mid_folds = KFold(len(x_train_num), n_folds=cv_hopt, shuffle=True,
                          random_state=rand_state)
        for idx_b, idx_hopt in mid_folds:
            x_train2, y_train2 = x_train_num.iloc[idx_b], y_train1.iloc[idx_b]
            x_hopt, y_hopt = (x_train_num.iloc[idx_hopt],
                              y_train1.iloc[idx_hopt])

            logging.info(" >>> DGH >>> Looking for hopt parameters")
            rf_rp = Misc.enhance_param(
                Cvs.get_best_sklopt(x_hopt, y_hopt, rf_ip), **robot_kwargs)
            ext_rp = Misc.enhance_param(
                Cvs.get_best_etopt(x_hopt, y_hopt, ext_ip), **robot_kwargs)
            xgb_rp = Misc.enhance_param(
                Cvs.get_best_xgbopt(x_hopt, y_hopt, xgb_ip), **robot_kwargs)

            stack_res = []
            logging.info(" >>> DGH >>> Cross-val-stacking")
            # NOTE: the stacking folds carry no random_state, as in the
            # original — each pass reshuffles.
            for idx_c, idx_stack in KFold(len(x_train2), n_folds=cv_stack,
                                          shuffle=True):
                stack_res.append(Cvs.stack_that(x_train2, y_train2,
                                                x_valid_num, idx_c, idx_stack,
                                                rf_rp, ext_rp, xgb_rp))
            res.append(stack_res)

            nb_auto += 1
            if nb_auto == nb_auto_max:
                return res
    return res
Example #2
0
def robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_kwargs):
    """Fully automated pipeline with three nested CV levels.

    Outer folds feed chaos feature generation, middle folds feed the
    hyper-parameter search, inner folds do the cross-val stacking.

    :param x_train: training features (pandas-style, indexed with .iloc)
    :param y_train: training target aligned with x_train
    :param x_valid: validation features to score
    :param rf_ip: initial random-forest parameter space
    :param ext_ip: initial extra-trees parameter space
    :param xgb_ip: initial xgboost parameter space
    :param robot_kwargs: pipeline knobs (robot_cv_feat, robot_cv_hopt,
        robot_cv_stack, robot_nb_auto_max, robot_rand_state) plus options
        forwarded to chaosize/numerize/enhance_param
    :return: list of per-iteration stacking results (lists of y_probas)
    """

    # Pipeline knobs, all overridable through **robot_kwargs.
    robot_cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    robot_cv_hopt = robot_kwargs.get('robot_cv_hopt', 6)
    robot_cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    robot_nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)  # -1: run all
    robot_rand_state = robot_kwargs.get('robot_rand_state', 42)

    res = []
    nb_auto = 0

    nb_samples = len(x_train)

    # Outer CV: hold out one fold for feature generation per pass.
    for train1_idx, feat_idx in KFold(nb_samples, n_folds=robot_cv_feat, shuffle=True,
                                      random_state=robot_rand_state):
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info("Chaos feature generation")
        # NOTE(review): x_valid is rebound each outer fold, so generated
        # features accumulate across folds — presumably intentional; confirm.
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat, **robot_kwargs)

        logging.info("Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        # Middle CV: hold out one fold for the hyper-parameter search.
        for train2_idx, hopt_idx in KFold(len(x_train_num), n_folds=robot_cv_hopt, shuffle=True,
                                          random_state=robot_rand_state):
            x_train2 = x_train_num.iloc[train2_idx]
            y_train2 = y_train1.iloc[train2_idx]
            x_hopt = x_train_num.iloc[hopt_idx]
            y_hopt = y_train1.iloc[hopt_idx]

            logging.info("Looking for hopt parameters")
            rf_rp = Misc.enhance_param(Cvs.get_best_sklopt(x_hopt, y_hopt, rf_ip), **robot_kwargs)
            ext_rp = Misc.enhance_param(Cvs.get_best_etopt(x_hopt, y_hopt, ext_ip), **robot_kwargs)
            xgb_rp = Misc.enhance_param(Cvs.get_best_xgbopt(x_hopt, y_hopt, xgb_ip), **robot_kwargs)

            stack_res = []
            logging.info("Starting cross-val-stacking")
            # Inner CV: no random_state here, so stacking folds reshuffle
            # on every pass.
            for train3_idx, stack_idx in KFold(len(x_train2), n_folds=robot_cv_stack, shuffle=True):
                y_probas = Cvs.stack_that(x_train2, y_train2, x_valid_num, train3_idx, stack_idx,
                                          rf_rp, ext_rp, xgb_rp)
                stack_res.append(y_probas)
            res.append(stack_res)

            # Early stop after robot_nb_auto_max middle-CV iterations.
            nb_auto += 1
            if nb_auto == robot_nb_auto_max:
                return res
    return res
Example #3
0
def tiny_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp,
               **robot_kwargs):
    """Minimal pipeline: numerize the features once, then cross-val-stack
    the three pre-tuned models over ``robot_cv_stack`` folds.

    :param x_train: training features (pandas-style, indexed with .iloc)
    :param y_train: training target aligned with x_train
    :param x_valid: validation features to score
    :param rf_rp: pre-tuned random-forest parameters
    :param ext_rp: pre-tuned extra-trees parameters
    :param xgb_rp: pre-tuned xgboost parameters
    :param robot_kwargs: robot_cv_stack (folds, default 5),
        robot_nb_auto_max (stop after this many folds, -1 = never), plus
        options forwarded to numerize
    :return: list holding one list of per-fold y_probas
    """
    robot_cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    robot_nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)

    res = []
    nb_auto = 0

    logging.info(" >>> DGH >>> Feature cleaning")
    x_train_num, x_valid_num = numerize(x_train, x_valid, **robot_kwargs)

    stack_res = []
    logging.info(" >>> DGH >>> Cross-val-stacking")
    for train1_idx, stack_idx in tqdm(KFold(len(x_train_num),
                                            n_folds=robot_cv_stack,
                                            shuffle=True),
                                      nested=True,
                                      desc='cv2'):
        y_probas = Cvs.stack_that(x_train_num, y_train, x_valid_num,
                                  train1_idx, stack_idx, rf_rp, ext_rp, xgb_rp)
        stack_res.append(y_probas)
        nb_auto += 1
        if nb_auto == robot_nb_auto_max:
            break

    # BUG FIX: the original appended the same stack_res list to res on every
    # fold (N duplicate references to one growing list), and the
    # robot_nb_auto_max early return fired *before* any append, returning an
    # empty res and discarding all collected probas.  Append exactly once
    # after the loop, matching the robot/small_robot structure.
    res.append(stack_res)
    return res
Example #4
0
    def test_cross_val_meta_stack(self):
        """cross_val_meta_stack with csvstack_cv=3 returns three proba sets
        whose average blend is scored against the held-out labels."""
        x, y = DataGenerator.get_digits_data()
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=42)

        # Tune each base model from its initial parameter space.
        xgb_best = CrossValStack.get_best_xgbopt(
            x_train, y_train, ParamsGenerator.get_xgb_init_param())
        rf_best = CrossValStack.get_best_sklopt(
            x_train, y_train, ParamsGenerator.get_rf_init_param())
        ext_best = CrossValStack.get_best_etopt(
            x_train, y_train, ParamsGenerator.get_ext_init_param())

        res = CrossValStack.cross_val_meta_stack(
            x_train, y_train, x_test, xgb_best, rf_best, ext_best,
            csvstack_cv=3)
        dfres = pd.DataFrame(
            [res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
        dfres.columns = ['p1', 'p2', 'p3']

        # Individual models refit on the whole training split, for reference.
        y_test_xgb = CrossValStack.predict_opt_clf(
            XGBOpt.XGBOpt(x_train, y_train), xgb_best, x_test, x_test)[0]
        y_test_skl = CrossValStack.predict_opt_clf(
            SklearnOpt.SklearnOpt(x_train, y_train), rf_best, x_test, x_test)[0]
        y_test_ext = CrossValStack.predict_opt_clf(
            SklearnOpt.SklearnOpt(x_train, y_train), ext_best, x_test, x_test)[0]

        print(metrics.roc_auc_score(y_test, y_test_xgb))
        print(metrics.roc_auc_score(y_test, y_test_skl))
        print(metrics.roc_auc_score(y_test, y_test_ext))

        print(metrics.roc_auc_score(
            y_test, (dfres.p1 + dfres.p2 + dfres.p3).values / 3))

        self.assertEqual(len(res), 3)
    def test_cross_val_stack(self):
        """cross_val_stack should return one stacked proba set per CV fold
        (5 by default); their blend is scored against the held-out labels."""
        x, y = DataGenerator.get_digits_data()

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

        xgb_initparam = ParamsGenerator.get_xgb_init_param()
        rf_initparam = ParamsGenerator.get_rf_init_param()
        ext_initparam = ParamsGenerator.get_ext_init_param()

        xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
        rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
        ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

        res = CrossValStack.cross_val_stack(x_train, y_train, x_test, xgb_bestparam, rf_bestparam, ext_bestparam)
        dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
        dfres.columns = ['p1', 'p2', 'p3']

        # Individual models refit on the whole training split, for reference.
        y_test_xgb = CrossValStack.predict_opt_clf(XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
        y_test_skl = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
        y_test_ext = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

        print(metrics.roc_auc_score(y_test, y_test_xgb))
        print(metrics.roc_auc_score(y_test, y_test_skl))
        print(metrics.roc_auc_score(y_test, y_test_ext))

        # BUG FIX: the original printed the p1 score and the blended score a
        # second time (copy-pasted duplicate lines) — duplicates removed.
        print(metrics.roc_auc_score(y_test, dfres.p1.values))
        print(metrics.roc_auc_score(y_test, dfres.p2.values))
        print(metrics.roc_auc_score(y_test, dfres.p3.values))

        print(metrics.roc_auc_score(y_test, (dfres.p1 + dfres.p2 + dfres.p3).values / 3))

        self.assertEqual(len(res), 5)
Example #6
0
    def test_random_forest(self):
        """Tune and score a random forest on the adult data (smoke test:
        only prints AUC/log-loss, no assertions)."""
        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

        # Numerize both splits together so they share the same columns.
        x_train_1, x_valid_1 = Automaton.numerize(x_train, x_valid)

        # Hyper-parameter search on the training split, then predict on the
        # held-out split.
        sklparam = Cvs.get_best_sklopt(x_train_1, y_train, ParamsGenerator.get_rf_init_param())
        skopt = SklearnOpt.SklearnOpt(x_train_1, y_train)
        y_pred_valid, _ = Cvs.predict_opt_clf(skopt, sklparam, x_valid_1, x_valid_1)

        print 'Random Forest'
        print metrics.roc_auc_score(y_valid, y_pred_valid)
        print metrics.log_loss(y_valid, y_pred_valid)
Example #7
0
    def test_tiny_robot(self):
        """End-to-end smoke test of Automaton.tiny_robot on the adult data
        (prints AUC/log-loss, no assertions)."""
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(
            x, y, test_size=0.2, random_state=42)

        ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()

        # Tune each model once on the numerized training split.
        x_train_num, _ = Automaton.numerize(x_train, x_valid, **robot_args)
        rf_rp = Misc.enhance_param(
            Cvs.get_best_sklopt(x_train_num, y_train, rf_ip), **robot_args)
        ext_rp = Misc.enhance_param(
            Cvs.get_best_etopt(x_train_num, y_train, ext_ip), **robot_args)
        xgb_rp = Misc.enhance_param(
            Cvs.get_best_xgbopt(x_train_num, y_train, xgb_ip), **robot_args)

        res = Automaton.tiny_robot(x_train, y_train, x_valid,
                                   rf_rp, ext_rp, xgb_rp, **robot_args)
        y_pred_valid = Misc.stacking_res_to_one_pred(res)

        print('Tiny Robot')
        print(metrics.roc_auc_score(y_valid, y_pred_valid))
        print(metrics.log_loss(y_valid, y_pred_valid))
Example #8
0
def small_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp,
                **robot_kwargs):
    """Two-level pipeline: per outer fold, generate chaos features and
    numerize, then cross-val-stack the three pre-tuned models.

    :param x_train: training features (pandas-style, indexed with .iloc)
    :param y_train: training target aligned with x_train
    :param x_valid: validation features to score
    :param rf_rp: pre-tuned random-forest parameters
    :param ext_rp: pre-tuned extra-trees parameters
    :param xgb_rp: pre-tuned xgboost parameters
    :param robot_kwargs: robot_cv_feat, robot_cv_stack, robot_nb_auto_max,
        robot_rand_state, plus options forwarded to chaosize/numerize
    :return: list with one list of y_probas per completed outer fold
    """
    cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)
    rand_state = robot_kwargs.get('robot_rand_state', 42)

    res = []
    nb_auto = 0

    outer_cv = KFold(len(x_train), n_folds=cv_feat, shuffle=True,
                     random_state=rand_state)
    for keep_idx, feat_idx in tqdm(outer_cv, desc='cv1'):
        x_keep, y_keep = x_train.iloc[keep_idx], y_train.iloc[keep_idx]
        x_feat, y_feat = x_train.iloc[feat_idx], y_train.iloc[feat_idx]

        logging.info(" >>> DGH >>> Chaos feature generation")
        x_keep, x_valid = chaosize(x_feat, x_keep, x_valid, y_feat,
                                   **robot_kwargs)

        logging.info(" >>> DGH >>> Feature cleaning")
        x_num, x_valid_num = numerize(x_keep, x_valid, **robot_kwargs)

        fold_probas = []
        logging.info(" >>> DGH >>> Cross-val-stacking")
        # No random_state on the stacking folds, as in the original.
        inner_cv = KFold(len(x_num), n_folds=cv_stack, shuffle=True)
        for fit_idx, stack_idx in tqdm(inner_cv, nested=True, desc='cv2'):
            fold_probas.append(Cvs.stack_that(x_num, y_keep, x_valid_num,
                                              fit_idx, stack_idx,
                                              rf_rp, ext_rp, xgb_rp))
        res.append(fold_probas)

        nb_auto += 1
        if nb_auto == nb_auto_max:
            break
    return res
Example #9
0
def small_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_kwargs):
    """Two-level pipeline: per outer fold, generate chaos features and
    numerize, then cross-val-stack the three pre-tuned models.

    :param x_train: training features (pandas-style, indexed with .iloc)
    :param y_train: training target aligned with x_train
    :param x_valid: validation features to score
    :param rf_rp: pre-tuned random-forest parameters
    :param ext_rp: pre-tuned extra-trees parameters
    :param xgb_rp: pre-tuned xgboost parameters
    :param robot_kwargs: robot_cv_feat, robot_cv_stack, robot_nb_auto_max
        (stop after this many outer folds, -1 = never), robot_rand_state,
        plus options forwarded to chaosize/numerize
    :return: list with one list of y_probas per completed outer fold
    """

    robot_cv_feat = robot_kwargs.get('robot_cv_feat', 6)
    robot_cv_stack = robot_kwargs.get('robot_cv_stack', 5)
    robot_nb_auto_max = robot_kwargs.get('robot_nb_auto_max', -1)
    robot_rand_state = robot_kwargs.get('robot_rand_state', 42)

    res = []
    nb_auto = 0

    nb_samples = len(x_train)

    # Outer CV: hold out one fold for feature generation per pass.
    for train1_idx, feat_idx in KFold(nb_samples, n_folds=robot_cv_feat, shuffle=True,
                                      random_state=robot_rand_state):
        x_train1 = x_train.iloc[train1_idx]
        y_train1 = y_train.iloc[train1_idx]
        x_feat = x_train.iloc[feat_idx]
        y_feat = y_train.iloc[feat_idx]

        logging.info("Chaos feature generation")
        # NOTE(review): x_valid is rebound each outer fold, so generated
        # features accumulate across folds — presumably intentional; confirm.
        x_train1, x_valid = chaosize(x_feat, x_train1, x_valid, y_feat, **robot_kwargs)

        logging.info("Feature cleaning")
        x_train_num, x_valid_num = numerize(x_train1, x_valid, **robot_kwargs)

        stack_res = []
        logging.info("Starting cross-val-stacking")
        # Stacking folds carry no random_state, so they reshuffle every pass.
        for train2_idx, stack_idx in KFold(len(x_train_num), n_folds=robot_cv_stack, shuffle=True):
            y_probas = Cvs.stack_that(x_train_num, y_train1, x_valid_num, train2_idx, stack_idx,
                                      rf_rp, ext_rp, xgb_rp)
            stack_res.append(y_probas)
        res.append(stack_res)

        # Early stop after robot_nb_auto_max outer folds.
        nb_auto += 1
        if nb_auto == robot_nb_auto_max:
            return res
    return res