Esempi in Python per DataGenerator, esempi in Python per dataghobot.utils.DataGenerator

Esempio n. 1

0

Mostra file

File: CrossValStackUnitTests.py Progetto: AshtonIzmev/dataghobot

    def test_cross_val_stack(self):
        """Smoke-test ``CrossValStack.cross_val_stack`` on the digits data.

        Tunes the XGBoost, random-forest and extra-trees parameters on the
        training split, runs the stacking routine, and prints ROC AUC scores
        for the individual optimised models, for each of the first three
        stacked outputs and for their plain average. The only hard assertion
        is that the stacking result holds five entries.
        """
        x, y = DataGenerator.get_digits_data()

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

        xgb_initparam = ParamsGenerator.get_xgb_init_param()
        rf_initparam = ParamsGenerator.get_rf_init_param()
        ext_initparam = ParamsGenerator.get_ext_init_param()

        xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
        rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
        ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

        res = CrossValStack.cross_val_stack(x_train, y_train, x_test, xgb_bestparam, rf_bestparam, ext_bestparam)
        # Keep column 1 of the first three result arrays, one column per model.
        # NOTE(review): column 1 is presumably the positive-class probability -- confirm.
        dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
        dfres.columns = ['p1', 'p2', 'p3']

        y_test_xgb = CrossValStack.predict_opt_clf(XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
        y_test_skl = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
        y_test_ext = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        # Scores of the individually optimised models.
        print(metrics.roc_auc_score(y_test, y_test_xgb))
        print(metrics.roc_auc_score(y_test, y_test_skl))
        print(metrics.roc_auc_score(y_test, y_test_ext))

        # Scores of each stacked output taken on its own.
        print(metrics.roc_auc_score(y_test, dfres.p1.values))
        print(metrics.roc_auc_score(y_test, dfres.p2.values))
        print(metrics.roc_auc_score(y_test, dfres.p3.values))

        # Score of the unweighted mean of the three stacked outputs.
        print(metrics.roc_auc_score(y_test, (dfres.p1+dfres.p2+dfres.p3).values/3))

        self.assertEqual(len(res), 5)

Esempio n. 2

0

Mostra file

    def test_cross_val_stack(self):
        """Run chaos feature importance on a train frame plus a shadow copy.

        Two digit columns are cast to strings so the data contains
        categorical columns. The training split is duplicated into a
        "shadow" frame; both get a ``source`` marker (0 for the training
        rows, 1 for the shadow rows) and are concatenated. The selector
        passed to ``chaos_feature_importance`` is true for the ``source == 0``
        rows. The test asserts that the importance dictionary filled by the
        call ends up with more entries than either frame has columns.
        """
        x, y = DataGenerator.get_digits_data()

        # In order to obtain some categorical columns
        x['i63'] = x['i63'].map(str)
        x['i62'] = x['i62'].map(str)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        dic = {}
        # Copy before tagging so each frame gets its own 'source' value.
        x_shadow = x_train.copy()

        x_train.loc[:, 'source'] = 0
        x_shadow.loc[:, 'source'] = 1
        x_all = pd.concat([x_train, x_shadow])
        shadow_selector = x_all['source'] == 0
        # dic is passed as feat_dic and read back by the assertions below.
        ChaosGeneration.chaos_feature_importance(x_all,
                                                 y_train,
                                                 shadow_selector,
                                                 feat_dic=dic,
                                                 feat_iter=10,
                                                 nb_features=20,
                                                 chaos_gen_iter=30)
        self.assertGreater(len(dic), len(x_train.columns))
        self.assertGreater(len(dic), len(x_shadow.columns))

Esempio n. 3

0

Mostra file

    def test_cross_val_meta_stack(self):
        """Smoke-test ``CrossValStack.cross_val_meta_stack`` on the digits data.

        Tunes the XGBoost, random-forest and extra-trees parameters on the
        training split, runs the meta-stacking routine with
        ``csvstack_cv=3``, and prints ROC AUC scores for the individually
        optimised models and for the plain average of the three stacked
        outputs. The only hard assertion is that the result holds three
        entries.
        """
        x, y = DataGenerator.get_digits_data()

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

        xgb_initparam = ParamsGenerator.get_xgb_init_param()
        rf_initparam = ParamsGenerator.get_rf_init_param()
        ext_initparam = ParamsGenerator.get_ext_init_param()

        xgb_bestparam = CrossValStack.get_best_xgbopt(x_train, y_train, xgb_initparam)
        rf_bestparam = CrossValStack.get_best_sklopt(x_train, y_train, rf_initparam)
        ext_bestparam = CrossValStack.get_best_etopt(x_train, y_train, ext_initparam)

        res = CrossValStack.cross_val_meta_stack(x_train, y_train, x_test, xgb_bestparam, rf_bestparam, ext_bestparam,
                                                 csvstack_cv=3)
        # Keep column 1 of each result array, one column per model.
        # NOTE(review): column 1 is presumably the positive-class probability -- confirm.
        dfres = pd.DataFrame([res[0][:, 1], res[1][:, 1], res[2][:, 1]]).transpose()
        dfres.columns = ['p1', 'p2', 'p3']

        y_test_xgb = CrossValStack.predict_opt_clf(XGBOpt.XGBOpt(x_train, y_train), xgb_bestparam, x_test, x_test)[0]
        y_test_skl = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), rf_bestparam, x_test, x_test)[0]
        y_test_ext = CrossValStack.predict_opt_clf(SklearnOpt.SklearnOpt(x_train, y_train), ext_bestparam, x_test, x_test)[0]

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print(metrics.roc_auc_score(y_test, y_test_xgb))
        print(metrics.roc_auc_score(y_test, y_test_skl))
        print(metrics.roc_auc_score(y_test, y_test_ext))

        # Score of the unweighted mean of the three stacked outputs.
        print(metrics.roc_auc_score(y_test, (dfres.p1+dfres.p2+dfres.p3).values/3))

        self.assertEqual(len(res), 3)

Esempio n. 4

0

Mostra file

 def test_kerasopt_auc(self):
     """Check that the Keras optimiser reaches a score below -0.85 with AUC.

     NOTE(review): the negative threshold suggests the optimiser stores the
     negated AUC as ``score`` -- confirm against KerasOpt.
     """
     x, y = DataGenerator.get_digits_data()
     kerasopt = KerasOpt.KerasOpt(x, y)
     # Work on a copy so the override below does not leak into the shared
     # class-level parameter space used by other tests.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_reg_keras_dnn)
     param['eval_metric'] = 'auc'
     best = kerasopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(kerasopt.score, -0.85)

Esempio n. 5

0

Mostra file

 def test_handle_columns_entropy_inf(self):
     """Check column classification when ``numerize_entropy_max`` is infinite.

     Two string columns are derived from the first iris column; with an
     infinite entropy ceiling both are expected in the "otherisation"
     bucket and none in the "to be defined" bucket.
     """
     x, _ = DataGenerator.get_iris_data()
     first_feature = x[x.columns[0]]
     x['feat_tobedef'] = first_feature.map(str)
     x['feat_toother'] = first_feature.map(lambda s: str(int(3 * s)))
     buckets = Numerisation.handle_columns(x, [], [], numerize_entropy_max=np.inf)
     _num, _dummy, to_other, to_define = buckets
     self.assertEqual(len(to_other), 2)
     self.assertEqual(len(to_define), 0)

Esempio n. 6

0

Mostra file

 def test_xgbopt_tree_auc(self):
     """Check that the XGBoost tree optimiser reaches a score below -0.99 with AUC.

     NOTE(review): the negative threshold suggests the optimiser stores the
     negated AUC as ``score`` -- confirm against XGBOpt.
     """
     x, y = DataGenerator.get_digits_data()
     xgbopt = XGBOpt.XGBOpt(x, y)
     # Work on a copy so the override below does not leak into the shared
     # class-level parameter space used by other tests.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_reg_xgb_tree)
     param['eval_metric'] = 'auc'
     best = xgbopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(xgbopt.score, -0.99)

Esempio n. 7

0

Mostra file

 def test_lropt_logloss(self):
     """Check that the logistic-regression optimiser reaches a log-loss score below 0.011."""
     x, y = DataGenerator.get_digits_data()
     skopt = SklearnOpt.SklearnOpt(x, y)
     # Work on a copy so the overrides below do not leak into the shared
     # class-level parameter space used by other tests.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_clf_skl_lr)
     param['eval_metric'] = 'logloss'
     param['type'] = 'logistic_regression'
     best = skopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(skopt.score, 0.011)

Esempio n. 8

0

Mostra file

 def test_etopt_logloss(self):
     """Check that the extra-trees optimiser reaches a log-loss score below 0.03."""
     x, y = DataGenerator.get_digits_data()
     skopt = SklearnOpt.SklearnOpt(x, y)
     # The random-forest space is reused here with 'type' switched to
     # extra trees. Copy it first so the overrides stay local to this test
     # instead of changing the shared class-level space.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_reg_skl_rf)
     param['eval_metric'] = 'logloss'
     param['type'] = 'extra_trees'
     best = skopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(skopt.score, 0.03)

Esempio n. 9

0

Mostra file

 def test_rfopt_auc(self):
     """Check that the random-forest optimiser reaches a score below -0.99 with AUC.

     NOTE(review): the negative threshold suggests the optimiser stores the
     negated AUC as ``score`` -- confirm against SklearnOpt.
     """
     x, y = DataGenerator.get_digits_data()
     skopt = SklearnOpt.SklearnOpt(x, y)
     # Work on a copy so the overrides below do not leak into the shared
     # class-level parameter space used by other tests.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_reg_skl_rf)
     param['eval_metric'] = 'auc'
     param['type'] = 'random_forest'
     best = skopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(skopt.score, -0.99)

Esempio n. 10

0

Mostra file

 def test_xgbopt_tree_logloss(self):
     """Check that the XGBoost tree optimiser reaches a log-loss score below 0.04."""
     x, y = DataGenerator.get_digits_data()
     xgbopt = XGBOpt.XGBOpt(x, y)
     # Work on a copy: this test also overrides 'max_evals', which would
     # otherwise stay set on the shared class-level space for every later
     # user of it.
     # Assumes the space is a plain mapping -- TODO confirm.
     param = dict(HyperoptParam.HyperoptParam.param_space_reg_xgb_tree)
     param['max_evals'] = 10
     param['eval_metric'] = 'logloss'
     best = xgbopt.run_hp(param)
     self.assertIsNotNone(best)
     self.assertLess(xgbopt.score, 0.04)

Esempio n. 11

0

Mostra file

 def test_treat_dataframe(self):
     """Check that ``Numerisation.treat_dataframe`` yields only real-valued cells.

     The iris frame is extended with a constant string column and two
     string columns derived from the first feature; after column
     classification and treatment every cell must satisfy ``np.isreal``.
     """
     x, _ = DataGenerator.get_iris_data()
     first_feature = x[x.columns[0]]
     x['feat_todummy'] = 'test'
     x['feat_tobedef'] = first_feature.map(str)
     x['feat_toother'] = first_feature.map(lambda s: str(int(3 * s)))
     buckets = Numerisation.handle_columns(x, [], [])
     num_cols, todummy_cols, tootherisation_cols, _to_define = buckets
     treated = Numerisation.treat_dataframe(
         x, num_cols, [], todummy_cols, tootherisation_cols)
     # Row-wise all(), then all() over the rows: true only if every cell is real.
     every_cell_real = treated.applymap(np.isreal).all(1).all()
     self.assertTrue(every_cell_real)

Esempio n. 12

0

Mostra file

 def test_handle_columns(self):
     """Check how ``Numerisation.handle_columns`` buckets the iris columns.

     With one constant string column and two string columns derived from
     the first feature added, the expected split is four numeric columns
     and one column in each of the dummy, otherisation and to-be-defined
     buckets.
     """
     x, _ = DataGenerator.get_iris_data()
     first_feature = x[x.columns[0]]
     x['feat_todummy'] = 'test'
     x['feat_tobedef'] = first_feature.map(str)
     x['feat_toother'] = first_feature.map(lambda s: str(int(3 * s)))
     buckets = Numerisation.handle_columns(x, [], [])
     numeric, to_dummy, to_other, to_define = buckets
     self.assertEqual(len(numeric), 4)
     self.assertEqual(len(to_dummy), 1)
     self.assertEqual(len(to_other), 1)
     self.assertEqual(len(to_define), 1)

Esempio n. 13

0

Mostra file

File: ChaosGenerationUnitTests.py Progetto: AshtonIzmev/dataghobot

    def test_cross_val_stack_none(self):
        """Run chaos feature importance on the training frame without a shadow copy.

        Two digit columns are cast to strings so the data contains
        categorical columns. Every training row is tagged ``source == 0``,
        so the selector passed to ``chaos_feature_importance`` is true for
        all rows. The test asserts that the importance dictionary filled by
        the call ends up with more entries than the frame has columns.
        """
        x, y = DataGenerator.get_digits_data()

        # In order to obtain some categorical columns
        x["i63"] = x["i63"].map(str)
        x["i62"] = x["i62"].map(str)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
        dic = {}

        x_train.loc[:, "source"] = 0
        shadow_selector = x_train["source"] == 0
        # dic is passed as feat_dic and read back by the assertion below.
        ChaosGeneration.chaos_feature_importance(
            x_train, y_train, shadow_selector, feat_dic=dic, feat_iter=10, nb_features=20, chaos_gen_iter=30
        )
        self.assertGreater(len(dic), len(x_train.columns))

Esempio n. 14

0

Mostra file

    def test_random_forest(self):
        """Fit an optimised random forest on the adult data and print its scores.

        The test makes no assertions; it prints the ROC AUC and log loss of
        the validation predictions and fails only if a step raises.
        """
        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

        x_train_1, x_valid_1 = Automaton.numerize(x_train, x_valid)

        sklparam = Cvs.get_best_sklopt(x_train_1, y_train, ParamsGenerator.get_rf_init_param())
        skopt = SklearnOpt.SklearnOpt(x_train_1, y_train)
        y_pred_valid, _ = Cvs.predict_opt_clf(skopt, sklparam, x_valid_1, x_valid_1)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print('Random Forest')
        print(metrics.roc_auc_score(y_valid, y_pred_valid))
        print(metrics.log_loss(y_valid, y_pred_valid))

Esempio n. 15

0

Mostra file

    def test_full_robot(self):
        """Run ``Automaton.robot`` end to end on the adult data and print its scores.

        The test makes no assertions; it prints the ROC AUC and log loss of
        the combined validation prediction and fails only if a step raises.
        """
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

        ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()

        res = Automaton.robot(x_train, y_train, x_valid, rf_ip, ext_ip, xgb_ip, **robot_args)

        # Collapse the stacking result into a single prediction vector.
        y_pred_valid = Misc.stacking_res_to_one_pred(res)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print('Full Robot')
        print(metrics.roc_auc_score(y_valid, y_pred_valid))
        print(metrics.log_loss(y_valid, y_pred_valid))

Esempio n. 16

0

Mostra file

    def test_handle_nocategoric_nonreg(self):
        """Non-regression check: chaos feature importance on purely numeric data.

        Unlike the sibling chaos tests, no column is cast to string here.
        The importance dictionary filled by the call must end up with more
        entries than the training frame has columns.
        """
        x, y = DataGenerator.get_digits_data()
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=42)

        # Tag every training row, then select the rows carrying that tag.
        x_train.loc[:, 'source'] = 0
        shadow_selector = x_train['source'] == 0

        importances = {}
        chaos_options = dict(feat_dic=importances,
                             feat_iter=10,
                             nb_features=20,
                             chaos_gen_iter=30)
        ChaosGeneration.chaos_feature_importance(
            x_train, y_train, shadow_selector, **chaos_options)

        self.assertGreater(len(importances), len(x_train.columns))

Esempio n. 17

0

Mostra file

    def test_tiny_robot(self):
        """Run ``Automaton.tiny_robot`` on the adult data and print its scores.

        Model parameters are tuned on a numerised copy of the training
        split and enhanced with the robot arguments before being handed to
        ``tiny_robot`` together with the raw splits. The test makes no
        assertions; it prints the ROC AUC and log loss of the combined
        validation prediction and fails only if a step raises.
        """
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
        # loading
        x, y = DataGenerator.get_adult_data()

        # cleaning
        MissingValues.add_miss_val_indicator(x)

        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

        ext_ip, rf_ip, robot_args, xgb_ip = self.get_params()

        # Only the numerised training frame is needed for parameter search.
        x_train_num, _ = Automaton.numerize(x_train, x_valid, **robot_args)
        rf_rp = Misc.enhance_param(Cvs.get_best_sklopt(x_train_num, y_train, rf_ip), **robot_args)
        ext_rp = Misc.enhance_param(Cvs.get_best_etopt(x_train_num, y_train, ext_ip), **robot_args)
        xgb_rp = Misc.enhance_param(Cvs.get_best_xgbopt(x_train_num, y_train, xgb_ip), **robot_args)

        res = Automaton.tiny_robot(x_train, y_train, x_valid, rf_rp, ext_rp, xgb_rp, **robot_args)

        # Collapse the stacking result into a single prediction vector.
        y_pred_valid = Misc.stacking_res_to_one_pred(res)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print('Tiny Robot')
        print(metrics.roc_auc_score(y_valid, y_pred_valid))
        print(metrics.log_loss(y_valid, y_pred_valid))