Example #1
 def run_dt(self, config):
     df_train = dsutils.load_adult().head(1000)
     y = df_train.pop(14).values
     X = df_train
     dt = deeptable.DeepTable(config=config)
     dm, history = dt.fit(X, y, epochs=1)
     return dt, dm, history
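
These snippets omit their imports. A minimal, self-contained sketch of the same pattern follows, with the import paths assumed from the deeptables package layout (treat them as assumptions, not part of the original example):

# Assumed imports; the ModelConfig arguments mirror those used elsewhere on this page.
from deeptables.models import deeptable
from deeptables.datasets import dsutils

config = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False)
df_train = dsutils.load_adult().head(1000)
y = df_train.pop(14).values  # column 14 holds the binary income label
dt = deeptable.DeepTable(config=config)
model, history = dt.fit(df_train, y, epochs=1)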
Example #2
    def test_zero_testset_cross_validation(self):
        data = dsutils.load_adult().head(1000)
        conf = deeptable.ModelConfig(
            # dnn_units=((256, 0, False), (128, 0, False)),
            # dnn_activation='relu',
            fixed_embedding_dim=False,
            embeddings_output_dim=0,
            apply_gbm_features=False,
            auto_discrete=False,
        )
        bt = batch_trainer.BatchTrainer(
            data,
            'x_14',
            eval_size=0,
            validation_size=0.2,
            eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
            dt_config=conf,
            verbose=0,
            dt_epochs=1,
            cross_validation=True,
            num_folds=2,
            retain_single_model=False,
        )
        assert len(bt.X_train), 1000
        assert bt.X_eval is None

        ms = bt.start(models=['dt'])
        assert len(ms.get_models()), 1
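
After start() returns, the resulting model set can be inspected further. A short sketch, reusing only calls that appear in other examples on this page (leaderboard in Examples #4 and #6, get_models in Example #25):

# Sketch: inspect the trained model set returned by bt.start().
print(len(ms.get_models()))  # number of retained models
print(ms.leaderboard())      # per-model metric table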
Example #3
    def test_ensemble_predict_proba(self):
        data = dsutils.load_adult().head(1000)
        conf = deeptable.ModelConfig(
            # dnn_units=((256, 0, False), (128, 0, False)),
            # dnn_activation='relu',
            fixed_embedding_dim=False,
            embeddings_output_dim=0,
            apply_gbm_features=False,
            auto_discrete=False,
        )
        bt = batch_trainer.BatchTrainer(
            data,
            'x_14',
            eval_size=0.2,
            validation_size=0.2,
            eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
            # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
            dt_config=conf,
            verbose=0,
            dt_epochs=1,
            # seed=9527,
            cross_validation=True,
            num_folds=5,
        )
        ms = bt.start()
        proba, preds, score, submission = bt.ensemble_predict_proba('all')

        assert proba.shape, (6513, )
Example #4
    def test_leaderboard(self):
        data = dsutils.load_adult().head(1000)
        conf = deeptable.ModelConfig(
            # dnn_units=((256, 0, False), (128, 0, False)),
            # dnn_activation='relu',
            fixed_embedding_dim=False,
            embeddings_output_dim=0,
            apply_gbm_features=False,
            auto_discrete=False,
        )
        bt = batch_trainer.BatchTrainer(
            data,
            'x_14',
            eval_size=0.2,
            validation_size=0.2,
            eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
            dt_config=conf,
            verbose=0,
            dt_epochs=1,
            cross_validation=True,
            num_folds=2,
            retain_single_model=True,
        )

        ms = bt.start()
        eval_lb = ms.leaderboard(type='eval')
        oof_lb = ms.leaderboard(type='oof')
        val_lb = ms.leaderboard(type='val')
        assert len(eval_lb), 5
        assert len(oof_lb), 1
        assert val_lb is None
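
A compact way to compare the three leaderboard views used above (a sketch, reusing only the leaderboard call already shown; 'val' can be None when cross-validation is enabled, as the last assertion indicates):

# Sketch: print whichever leaderboard views exist for this run.
for lb_type in ('eval', 'oof', 'val'):
    lb = ms.leaderboard(type=lb_type)
    print(lb_type, None if lb is None else lb.shape)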
Example #5
 def test_probe_evaluation(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         # dnn_units=((256, 0, False), (128, 0, False)),
         # dnn_activation='relu',
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         auto_discrete=False,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         cross_validation=False,
     )
     ms = bt.start(models=['dt'])
     result = bt.probe_evaluate(
         'all', layers=['flatten_embeddings', 'dnn_dense_1', 'dnn_dense_2'])
     assert len(result), 1
     assert len(result["conf-1 - ['dnn_nets'] - eval"]), 3
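
Judging from the assertions above, probe_evaluate returns one entry per "<config> - <nets> - <dataset>" key, with one score per probed layer inside each entry (this structure is inferred from the test, not documented here). A sketch to dump it:

# Sketch: walk the probe-evaluation results keyed as in the assertions above.
for model_key in result:
    print(model_key, result[model_key])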
Example #6
 def test_run_cross_validation(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         # dnn_units=((256, 0, False), (128, 0, False)),
         # dnn_activation='relu',
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         auto_discrete=False,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         data_test=data,
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         # seed=9527,
         cross_validation=True,
         num_folds=5,
     )
     ms = bt.start(models=['dt'])
     assert ms.leaderboard().shape[1], 7
Example #7
 def test_run_lgbm(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         dnn_params={
             'dnn_units': ((256, 0, False), (256, 0, False)),
             'dnn_activation': 'relu'
         },
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         # auto_discrete=True,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         # seed=9527,
         lightgbm_params={
             'learning_rate': 0.01,
             'colsample_bytree': 0.95,
             'reg_alpha': 0.04,
             'reg_lambda': 0.07
         },
     )
     lgbm, score = bt.train_lgbm(conf)
     assert lgbm
     assert score['auc'] > 0
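
The score dict returned by train_lgbm appears to hold the metrics requested via eval_metrics, keyed in lower case (inferred from the 'auc' lookup above). A small sketch to report them all:

# Sketch: report every metric returned for the fitted LightGBM model.
for metric_name, value in score.items():
    print(metric_name, value)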
Example #8
 def test_run_catboost(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         dnn_params={
             'dnn_units': ((256, 0, False), (256, 0, False)),
             'dnn_activation': 'relu'
         },
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         # auto_discrete=True,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         catboost_params={'iterations': 5}
         # seed=9527,
     )
     cb, score = bt.train_catboost(conf)
     assert cb
     assert score['auc'] > 0
Example #9
 def test_run_binary(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         dnn_params={
             'dnn_units': ((256, 0, False), (256, 0, False)),
             'dnn_activation': 'relu'
         },
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         # auto_discrete=True,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         # seed=9527,
     )
     ms = bt.start()
     assert ms.leaderboard().shape[1], 7
Example #10
    def test_multi_config(self):
        data = dsutils.load_adult().head(1000)
        conf1 = deeptable.ModelConfig(
            name='conf001',
            fixed_embedding_dim=False,
            embeddings_output_dim=0,
            apply_gbm_features=False,
            auto_discrete=False,
        )
        conf2 = deeptable.ModelConfig(
            name='conf002',
            fixed_embedding_dim=False,
            embeddings_output_dim=0,
            apply_gbm_features=False,
            auto_discrete=False,
        )
        bt = batch_trainer.BatchTrainer(
            data,
            'x_14',
            eval_size=0,
            validation_size=0.2,
            eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
            dt_config=[conf1, conf2],
            verbose=0,
            dt_epochs=1,
            cross_validation=True,
            num_folds=2,
            retain_single_model=False,
        )

        ms = bt.start(models=['dt'])
        assert len(ms.get_models()), 2
Example #11
    def run_nets(self, nets, **kwargs):
        df_train = dsutils.load_adult().head(100)
        y = df_train.pop(14).values
        X = df_train

        conf = deeptable.ModelConfig(nets=nets,
                                     metrics=['AUC'],
                                     fixed_embedding_dim=True,
                                     embeddings_output_dim=2,
                                     apply_gbm_features=False,
                                     apply_class_weight=True,
                                     **kwargs)

        dt = deeptable.DeepTable(config=conf)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
        model, history = dt.fit(X_train, y_train, epochs=1)

        result = dt.evaluate(X_test, y_test)
        assert result['AUC'] >= 0.0

        # test reload from disk
        # model_path = os.path.join("/tmp/dt_model", str(uuid.uuid4()))
        # dt.save(model_path)
        #
        # p = multiprocessing.Process(target=self.run_load_model, args=(model_path, X_test, y_test, ))
        # p.start()
        # p.join()

        return dt, result
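
The commented-out block above hints at a save-and-reload round trip. A hedged sketch of the save side only, mirroring the call shown in the comment (the reload counterpart is omitted because its API is not shown here):

# Sketch: persist the fitted DeepTable model to a fresh directory, as the comment suggests.
import os
import uuid

model_path = os.path.join('/tmp/dt_model', str(uuid.uuid4()))
dt.save(model_path)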
Example #12
    def test_custom_dnn(self):
        df_train = dsutils.load_adult().head(100)
        y = df_train.pop(14).values
        X = df_train

        conf = deeptable.ModelConfig(nets=['dnn_nets'],
                                     dnn_params={
                                         'custom_dnn_fn':
                                         deepnets.custom_dnn_D_A_D_B,
                                         'hidden_units':
                                         ((128, 0.2, True), (64, 0, False)),
                                     },
                                     metrics=['AUC'],
                                     fixed_embedding_dim=True,
                                     embeddings_output_dim=2,
                                     apply_gbm_features=False,
                                     apply_class_weight=True)
        dt = deeptable.DeepTable(config=conf)
        model, history = dt.fit(X, y, epochs=1)
        l1 = model.model.get_layer('dnn_custom_dense_1')
        l2 = model.model.get_layer('dnn_custom_dropout_1')
        l3 = model.model.get_layer('dnn_custom_bn_1')
        l4 = model.model.get_layer('dnn_custom_dense_2')

        assert l1
        assert l2
        assert l3
        assert l4
Example #13
 def test_transform(self):
     df_train = dsutils.load_adult()
     df_train = dd.from_pandas(df_train, npartitions=2)
     y = df_train.pop(14)  # .values
     X = df_train
     X_train, X_test, y_train, y_test = get_tool_box(X, y).train_test_split(
         X, y, test_size=0.2, random_state=42)
     conf = deeptable.ModelConfig(auto_discrete=True,
                                  auto_imputation=True,
                                  auto_encode_label=True,
                                  auto_categorize=True,
                                  apply_gbm_features=False)
     processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
     X1, y1 = processor.fit_transform(X_train, y_train)
     X2, y2 = processor.transform(X_test, y_test)
     assert len(
         set(X1.columns.tolist()) - set([
             'x_1', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_13',
             'x_0_cat', 'x_4_cat', 'x_10_cat', 'x_11_cat', 'x_12_cat',
             'x_2', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12', 'x_2_discrete',
             'x_0_discrete', 'x_4_discrete', 'x_10_discrete',
             'x_11_discrete', 'x_12_discrete'
         ])) == 0
     assert len(set(X1.columns) - set(X2.columns)) == 0
     assert X1.shape, (X_train.shape[0], 25)
     assert X2.shape, (X_test.shape[0], 25)
     assert y1.sum(), 6297
     assert y2.sum(), 1544
Example #14
    def setup_class(self):
        setup_dask(self)

        print("Loading datasets...")
        df_train = dd.from_pandas(dsutils.load_adult().head(1000),
                                  npartitions=2)
        self.y = df_train.pop(14)
        self.X = df_train

        conf = deeptable.ModelConfig(metrics=['AUC'],
                                     apply_gbm_features=False,
                                     auto_categorize=False,
                                     auto_discrete=False)
        self.dt = deeptable.DeepTable(config=conf)

        self.X_train, \
        self.X_eval, \
        self.y_train, \
        self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        self.oof_proba, self.eval_proba, self.test_proba = \
            self.dt.fit_cross_validation(self.X_train,
                                         self.y_train,
                                         self.X_eval,
                                         num_folds=3,
                                         epochs=1,
                                         n_jobs=1)
Example #15
 def test_load_data(self):
     df_adult = dsutils.load_adult()
     df_glass = dsutils.load_glass_uci()
     df_hd = dsutils.load_heart_disease_uci()
     df_bank = dsutils.load_bank()
     df_boston = dsutils.load_boston()
     assert df_adult.shape, (32561, 15)
     assert df_glass.shape, (214, 11)
     assert df_hd.shape, (303, 14)
     assert df_bank.shape, (108504, 18)
     assert df_boston.shape, (506, 14)
Example #16
    def setup_class(cls):
        setup_dask(cls)

        print("Loading datasets...")
        row_count = 1000
        df = dsutils.load_adult().head(row_count)

        cls.df = dex.dd.from_pandas(df, npartitions=2)
        cls.df_row_count = row_count
        cls.target = 14

        print(f'Class {cls.__name__} setup.')
Example #17
    def test_cache_preprocessed_data(self):
        config = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, apply_class_weight=True)
        df_train = dsutils.load_adult().head(100)
        y = df_train.pop(14).values
        X = df_train
        cache_home = homedir + '/cache'
        preprocessor = DefaultPreprocessor(config, cache_home=cache_home, use_cache=True)
        dt = deeptable.DeepTable(config=config, preprocessor=preprocessor)
        dt.fit(X, y, epochs=1)

        dt = deeptable.DeepTable(config=config, preprocessor=preprocessor)
        dt.fit(X, y, epochs=1)
Example #18
    def setup_class(self):
        print("Loading datasets...")
        df_train = dsutils.load_adult().head(1000)
        self.y = df_train.pop(14).values
        self.X = df_train

        conf = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, apply_class_weight=True)
        self.dt = deeptable.DeepTable(config=conf)

        self.X_train, \
        self.X_test, \
        self.y_train, \
        self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        self.model, self.history = self.dt.fit(self.X_train, self.y_train, epochs=1)
Example #19
    def test_modelinfo(self):
        df_train = dsutils.load_adult()
        y = df_train.pop(14).values
        X = df_train

        conf = deepmodels.ModelConfig(metrics=['AUC'])
        dt = deeptable.DeepTable(config=conf)
        model, history = dt.fit(X, y, epochs=2)
        mi = modelset.ModelInfo('val',
                                'm1',
                                model, {},
                                history=history.history)
        assert mi.score['val_auc'] > 0
        assert len(mi.meta['history']['AUC']) == 2
Example #20
    def test_embeddings_output_dim(self):
        print("Loading datasets...")
        df_train = dsutils.load_adult().head(1000)
        y = df_train.pop(14).values
        X = df_train

        conf = deeptable.ModelConfig(fixed_embedding_dim=False,
                                     embeddings_output_dim=0)
        dt = deeptable.DeepTable(config=conf)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
        model, history = dt.fit(X_train, y_train, epochs=1)
Example #21
    def test_categorical_columns_config(self):
        df_train = dsutils.load_adult().head(1000)
        y = df_train.pop(14).values

        conf = deeptable.ModelConfig(categorical_columns=['x_1', 'x_2', 'x_3'],
                                     auto_discrete=False,
                                     auto_imputation=True,
                                     auto_encode_label=True,
                                     auto_categorize=False,
                                     apply_gbm_features=False)
        processor = DefaultPreprocessor(conf)
        X, y = processor.fit_transform(df_train, y)
        assert len(
            set(X.columns) -
            set(['x_1', 'x_2', 'x_3', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12'])
        ) == 0
Example #22
    def test_use_cache(self):
        config = deeptable.ModelConfig(metrics=['AUC'],
                                       apply_gbm_features=False,
                                       apply_class_weight=True)
        df_train = dsutils.load_adult().head(1000)
        y = df_train.pop(14).values
        X = df_train

        X, X_val, y, y_val = train_test_split(X, y, test_size=0.2)
        cache_home = homedir + '/preprocessor_cache'
        preprocessor = DefaultPreprocessor(config,
                                           cache_home=cache_home,
                                           use_cache=True)
        preprocessor.clear_cache()

        sign = preprocessor.get_X_y_signature(X, y)
        sign_val = preprocessor.get_X_y_signature(X_val, y_val)
        X_t, y_t = preprocessor.get_transformed_X_y_from_cache(sign)

        assert X_t is None and y_t is None

        preprocessor.fit_transform(X, y)
        preprocessor.transform(X_val, y_val)
        X_t2, y_t2 = preprocessor.get_transformed_X_y_from_cache(sign)
        assert X_t2 is not None and y_t2 is not None

        preprocessor = DefaultPreprocessor(config,
                                           cache_home=cache_home,
                                           use_cache=True)

        assert len(preprocessor.X_transformers) == 0
        assert preprocessor.y_lable_encoder is None

        assert preprocessor.load_transformers_from_cache() == True

        assert len(preprocessor.X_transformers) == 3
        assert preprocessor.y_lable_encoder is not None

        X_t, y_t = preprocessor.get_transformed_X_y_from_cache(sign)
        assert X_t is not None and y_t is not None

        X_val_t, y_val_t = preprocessor.get_transformed_X_y_from_cache(
            sign_val)
        assert X_val_t is not None and y_val_t is not None
Example #23
    def test_categorical_columns_config_2(self):
        df_train = dsutils.load_adult().head(1000)
        df_train = dd.from_pandas(df_train, npartitions=2)
        y = df_train.pop(14)

        conf = deeptable.ModelConfig(categorical_columns=['x_1', 'x_2', 'x_3'],
                                     auto_discrete=True,
                                     auto_imputation=True,
                                     auto_encode_label=True,
                                     auto_categorize=False,
                                     apply_gbm_features=False)
        processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
        X, y = processor.fit_transform(df_train, y)
        assert len(
            set(X.columns) - set([
                'x_1', 'x_2', 'x_3', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12',
                'x_0_discrete', 'x_4_discrete', 'x_10_discrete',
                'x_11_discrete', 'x_12_discrete'
            ])) == 0
Example #24
    def run_nets(self, nets):
        df_train = dsutils.load_adult().head(100)
        y = df_train.pop(14).values
        X = df_train

        conf = deeptable.ModelConfig(nets=nets,
                                     metrics=['AUC'],
                                     fixed_embedding_dim=True,
                                     embeddings_output_dim=2,
                                     apply_gbm_features=False,
                                     apply_class_weight=True)
        dt = deeptable.DeepTable(config=conf)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
        model, history = dt.fit(X_train, y_train, epochs=1)
        result = dt.evaluate(X_test, y_test)
        assert result['AUC'] >= 0.0
        return dt, result
Example #25
 def test_get_models_retian_single_model(self):
     data = dsutils.load_adult().head(1000)
     conf = deeptable.ModelConfig(
         # dnn_units=((256, 0, False), (128, 0, False)),
         # dnn_activation='relu',
         fixed_embedding_dim=False,
         embeddings_output_dim=0,
         apply_gbm_features=False,
         auto_discrete=False,
     )
     bt = batch_trainer.BatchTrainer(
         data,
         'x_14',
         eval_size=0.2,
         validation_size=0.2,
         eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
         # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
         dt_config=conf,
         verbose=0,
         dt_epochs=1,
         # seed=9527,
         cross_validation=True,
         num_folds=2,
         retain_single_model=True,
     )
     ms = bt.start()
     mis_all = bt.get_models('all')
     mis_top2 = bt.get_models('top2')
     mis_modelindex = bt.get_models([1, 3])
     mis_modelnames = bt.get_models([
         'conf-1 - [\'dnn_nets\'] - CV - oof',
         'conf-1 - [\'dnn_nets\'] - dnn_nets-kfold-1 - eval', 'LightGBM',
         'CatBoost'
     ])
     assert len(mis_all), 6
     assert len(mis_top2), 2
     assert len(mis_modelnames), 4
     assert len(mis_modelindex), 2
     assert mis_modelnames[0].name, 'conf-1 - [\'dnn_nets\'] - CV - oof'
Example #26
    def test_fit_cv(self):
        df_train = dsutils.load_adult().head(1000)

        y = df_train.pop(14).values
        X = df_train
        cols = X.columns
        num_cols = X._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))

        le = LabelEncoder()
        for c in cat_cols:
            X[c] = le.fit_transform(X[c])
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.8)

        oof_proba = BatchTrainer.fit_cross_validation(
            'lightGBM',
            lgbm_fit,
            X_train,
            y_train,
            X_test,
            score_fn=roc_auc_score,
            estimator_params={
                'max_depth': 3,
                'learning_rate': 0.01
            },
            categorical_feature=cols,
            task_type='binary',
            num_folds=5,
            stratified=True,
            iterators=None,
            batch_size=None,
            preds_filepath=None,
        )
        auc = roc_auc_score(y_train, oof_proba)
        assert auc > 0
Example #27
    def test_callback_injection(self):
        print("Loading datasets...")
        df_train = dsutils.load_adult()
        self.y = df_train.pop(14).values
        self.X = df_train
        path = tempfile.mkdtemp()
        conf = deeptable.ModelConfig(
            metrics=['AUC'],
            apply_gbm_features=False,
            auto_discrete=False,
            home_dir=path,
        )

        self.dt = deeptable.DeepTable(config=conf)

        mcp = ModelCheckpoint(
            path,
            'val_auc',
            verbose=0,
            save_best_only=False,
            save_weights_only=False,
            mode='max',
            save_freq='epoch',
        )
        callbacks = [mcp]
        self.X_train, \
        self.X_test, \
        self.y_train, \
        self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        self.model, self.history = self.dt.fit(self.X_train,
                                               self.y_train,
                                               epochs=1,
                                               callbacks=callbacks)

        files = os.listdir(path)
        assert 'saved_model.pb' in files
Example #28
hdt = HyperDT(searcher,
              callbacks=[
                  SummaryCallback(),
                  FileStorageLoggingCallback(searcher,
                                             output_dir=f'hotexamples_com/hyn_logs')
              ],
              reward_metric='AUC',
              earlystopping_patience=1)

space = mini_dt_space()
assert space.combinations == 589824
space2 = default_dt_space()
assert space2.combinations == 3559292928

df = dsutils.load_adult()
# df.drop(['id'], axis=1, inplace=True)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
X = df_train
y = df_train.pop(14)
y_test = df_test.pop(14)
# dataset_id='adult_whole_data',
hdt.search(
    df_train,
    y,
    df_test,
    y_test,
    max_trials=3,
    batch_size=256,
    epochs=1,
    verbose=1,
)
Example #29
    def test_opt_lightgbm(self):
        df_train = dsutils.load_adult().head(1000)
        y = df_train.pop(14).values
        X = df_train
        cols = X.columns
        num_cols = X._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))

        le = LabelEncoder()
        for c in cat_cols:
            X[c] = le.fit_transform(X[c])

        clf = LGBMClassifier(n_estimators=10,
                             boosting_type='gbdt',
                             categorical_feature=cat_cols,
                             num_leaves=31)
        fit_params = {'eval_metric': 'roc_auc'}
        # randomized_search
        param_distributions = {
            # 'iterations': sp_randint(10, 1000),
            'max_depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': sp_uniform(0.01, 1.0),
        }
        best_params1 = BatchTrainer.randomized_search(clf,
                                                      param_distributions,
                                                      X,
                                                      y,
                                                      fit_params=fit_params,
                                                      scoring='roc_auc',
                                                      n_jobs=1,
                                                      cv=5)

        # grid_search
        param_grid = {
            # 'iterations': [10, 30],
            'max_depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': [0.01, 0.05, 0.1],
        }
        best_params2 = BatchTrainer.grid_search(clf,
                                                param_grid,
                                                X,
                                                y,
                                                fit_params=fit_params,
                                                scoring='roc_auc',
                                                n_jobs=1,
                                                cv=5)

        # bayes_search
        search_spaces = {
            'max_depth': Integer(1, 5),
            'learning_rate': Real(0.02, 0.6, 'log-uniform'),
        }
        best_params3 = BatchTrainer.bayes_search(clf,
                                                 search_spaces,
                                                 X,
                                                 y,
                                                 fit_params=fit_params,
                                                 scoring='roc_auc',
                                                 n_jobs=1,
                                                 cv=5,
                                                 n_iter=10)

        assert best_params1['max_depth'] > 0
        assert best_params2['max_depth'] > 0
        assert best_params3['max_depth'] > 0
Example #30
    def test_opt_catboost(self):
        df_train = dsutils.load_adult().head(1000)
        y = df_train.pop(14).values
        X = df_train
        cols = X.columns
        num_cols = X._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))
        clf = CatBoostClassifier(thread_count=4,
                                 loss_function='Logloss',
                                 cat_features=cat_cols,
                                 od_type='Iter',
                                 nan_mode='Min',
                                 iterations=1,
                                 eval_metric='AUC',
                                 metric_period=50,
                                 verbose=False)
        fit_params = {'early_stopping_rounds': 10}
        # randomized_search
        param_distributions = {
            # 'iterations': sp_randint(10, 1000),
            'depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': sp_uniform(0.01, 1.0),
        }
        best_params1 = BatchTrainer.randomized_search(clf,
                                                      param_distributions,
                                                      X,
                                                      y,
                                                      fit_params=fit_params,
                                                      scoring='roc_auc',
                                                      n_jobs=1,
                                                      cv=5)

        # grid_search
        param_grid = {
            # 'iterations': [10, 30],
            'depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': [0.01, 0.05, 0.1],
        }
        best_params2 = BatchTrainer.grid_search(clf,
                                                param_grid,
                                                X,
                                                y,
                                                fit_params=fit_params,
                                                scoring='roc_auc',
                                                n_jobs=1,
                                                cv=5)

        # bayes_search
        search_spaces = {
            'depth': Integer(1, 5),
            'learning_rate': Real(0.02, 0.6, 'log-uniform'),
        }
        best_params3 = BatchTrainer.bayes_search(clf,
                                                 search_spaces,
                                                 X,
                                                 y,
                                                 fit_params=fit_params,
                                                 scoring='roc_auc',
                                                 n_jobs=1,
                                                 cv=5,
                                                 n_iter=10)

        assert best_params1['depth'] > 0
        assert best_params2['depth'] > 0
        assert best_params3['depth'] > 0