def run_dt(self, config):
    """Fit a DeepTable built from ``config`` for one epoch.

    The first 1000 rows returned by ``dsutils.load_adult()`` are used;
    column ``14`` is popped off the frame and serves as the target.

    Returns:
        ``(dt, dm, history)`` -- the DeepTable instance followed by the two
        values returned by ``dt.fit``.
    """
    features = dsutils.load_adult().head(1000)
    # pop() removes the column in place, so `features` no longer contains it.
    target = features.pop(14).values
    dt = deeptable.DeepTable(config=config)
    dm, history = dt.fit(features, target, epochs=1)
    return dt, dm, history
def test_zero_testset_cross_validation(self):
    """Run 2-fold cross-validation with ``eval_size=0``.

    Checks that no evaluation split is created (``bt.X_eval is None``), that
    all 1000 loaded rows stay in ``bt.X_train``, and that training the
    ``'dt'`` model yields a single entry in the resulting model set.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=False,
    )
    # The original used `assert actual, expected`, which Python reads as
    # `assert <condition>, <failure message>`; the expected value was never
    # compared.  Use explicit equality instead.
    assert len(bt.X_train) == 1000
    assert bt.X_eval is None
    ms = bt.start(models=['dt'])
    # NOTE(review): the expected count comes from the old message operand and
    # was never enforced before -- confirm against a real run.
    assert len(ms.get_models()) == 1
def test_ensemble_predict_proba(self):
    """Ensemble every trained model and check the size of the probabilities.

    A BatchTrainer is configured with a 20% evaluation split and 5-fold
    cross-validation, trained with its default model list, and then asked
    for ensemble probabilities over ``'all'`` models.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=5,
    )
    bt.start()
    # Unpacking into four names also verifies the arity of the return value.
    proba, preds, score, submission = bt.ensemble_predict_proba('all')
    # The original `assert proba.shape, (6513, )` only tested that the shape
    # tuple is truthy; `(6513, )` was the failure message.  That literal could
    # not match anyway, because only 1000 rows are loaded above.
    # NOTE(review): assumes the probabilities are computed for the held-out
    # evaluation split when no test data is supplied -- confirm.
    assert proba.shape[0] == len(bt.X_eval)
def test_leaderboard(self):
    """Check the per-type leaderboards produced after a cross-validated run.

    Trains with 2-fold cross-validation and ``retain_single_model=True``,
    then requests the ``'eval'``, ``'oof'`` and ``'val'`` leaderboards from
    the returned model set.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=True,
    )
    ms = bt.start()
    eval_lb = ms.leaderboard(type='eval')
    oof_lb = ms.leaderboard(type='oof')
    val_lb = ms.leaderboard(type='val')
    # `assert len(x), n` treats `n` as the failure message and never compares
    # it; use explicit equality so the row counts are actually checked.
    # NOTE(review): the expected counts come from the old message operands and
    # were never enforced before -- confirm against a real run.
    assert len(eval_lb) == 5
    assert len(oof_lb) == 1
    assert val_lb is None
def test_probe_evaluation(self):
    """Probe-evaluate three named layers of the trained ``'dt'`` model.

    Trains without cross-validation and calls ``bt.probe_evaluate`` for the
    layers ``flatten_embeddings``, ``dnn_dense_1`` and ``dnn_dense_2``.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=False,
    )
    bt.start(models=['dt'])
    probed_layers = ['flatten_embeddings', 'dnn_dense_1', 'dnn_dense_2']
    result = bt.probe_evaluate('all', layers=probed_layers)
    # `assert len(x), n` only checks that len(x) is non-zero (n is the failure
    # message); compare explicitly instead.
    # NOTE(review): expected sizes were never enforced before -- confirm.
    assert len(result) == 1
    # One entry is expected per probed layer.
    assert len(result["conf-1 - ['dnn_nets'] - eval"]) == len(probed_layers)
def test_run_cross_validation(self):
    """Run 5-fold cross-validation with an explicit test set.

    The same 1000-row frame is passed both as training data and as
    ``data_test``; afterwards the leaderboard's column count is checked.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        data_test=data,
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=5,
    )
    ms = bt.start(models=['dt'])
    # The original `assert shape[1], 7` used 7 as the failure message and so
    # passed for any non-zero column count; compare explicitly.
    # NOTE(review): 7 was never enforced before -- confirm against a real run.
    assert ms.leaderboard().shape[1] == 7
def test_run_lgbm(self):
    """Train the LightGBM model through ``BatchTrainer.train_lgbm``.

    Passes when the returned model object is truthy and the ``'auc'`` entry
    of the returned score is positive.
    """
    frame = dsutils.load_adult().head(1000)
    dnn_settings = {
        'dnn_units': ((256, 0, False), (256, 0, False)),
        'dnn_activation': 'relu',
    }
    conf = deeptable.ModelConfig(
        dnn_params=dnn_settings,
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )
    gbm_settings = {
        'learning_rate': 0.01,
        'colsample_bytree': 0.95,
        'reg_alpha': 0.04,
        'reg_lambda': 0.07,
    }
    metric_names = ['AUC', 'accuracy', 'recall', 'precision', 'f1']
    bt = batch_trainer.BatchTrainer(
        frame,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=metric_names,
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        lightgbm_params=gbm_settings,
    )
    trained = bt.train_lgbm(conf)
    lgbm, score = trained
    assert lgbm
    assert score['auc'] > 0
def test_run_catboost(self):
    """Train the CatBoost model through ``BatchTrainer.train_catboost``.

    CatBoost is limited to 5 iterations.  Passes when the returned model
    object is truthy and the ``'auc'`` entry of the score is positive.
    """
    frame = dsutils.load_adult().head(1000)
    dnn_settings = {
        'dnn_units': ((256, 0, False), (256, 0, False)),
        'dnn_activation': 'relu',
    }
    conf = deeptable.ModelConfig(
        dnn_params=dnn_settings,
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )
    metric_names = ['AUC', 'accuracy', 'recall', 'precision', 'f1']
    bt = batch_trainer.BatchTrainer(
        frame,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=metric_names,
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        catboost_params={'iterations': 5},
    )
    trained = bt.train_catboost(conf)
    cb, score = trained
    assert cb
    assert score['auc'] > 0
def test_run_binary(self):
    """Run the BatchTrainer with its default model list on the binary target.

    No cross-validation is requested; after training, the leaderboard's
    column count is checked.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        dnn_params={
            'dnn_units': ((256, 0, False), (256, 0, False)),
            'dnn_activation': 'relu',
        },
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
    )
    ms = bt.start()
    # The original `assert shape[1], 7` used 7 as the failure message and so
    # passed for any non-zero column count; compare explicitly.
    # NOTE(review): 7 was never enforced before -- confirm against a real run.
    assert ms.leaderboard().shape[1] == 7
def test_multi_config(self):
    """Train the ``'dt'`` model with two named configurations.

    Two otherwise identical ModelConfigs (``conf001`` and ``conf002``) are
    passed as a list via ``dt_config``; the resulting model set is expected
    to hold one model per configuration.
    """
    data = dsutils.load_adult().head(1000)
    conf1 = deeptable.ModelConfig(
        name='conf001',
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    conf2 = deeptable.ModelConfig(
        name='conf002',
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    configs = [conf1, conf2]
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=configs,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=False,
    )
    ms = bt.start(models=['dt'])
    # `assert len(x), 2` used 2 as the failure message and never compared it;
    # check the count explicitly.
    # NOTE(review): the count was never enforced before -- confirm.
    assert len(ms.get_models()) == len(configs)
def run_nets(self, nets, **kwargs):
    """Fit and evaluate a DeepTable that uses the given ``nets``.

    Args:
        nets: forwarded as ``ModelConfig(nets=...)``.
        **kwargs: extra keyword arguments forwarded to ``ModelConfig``.

    Returns:
        ``(dt, result)`` -- the fitted DeepTable and the value returned by
        ``dt.evaluate`` on the 20% hold-out split.
    """
    frame = dsutils.load_adult().head(100)
    target = frame.pop(14).values
    conf = deeptable.ModelConfig(
        nets=nets,
        metrics=['AUC'],
        fixed_embedding_dim=True,
        embeddings_output_dim=2,
        apply_gbm_features=False,
        apply_class_weight=True,
        **kwargs,
    )
    dt = deeptable.DeepTable(config=conf)
    split = train_test_split(frame, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    _model, _history = dt.fit(X_train, y_train, epochs=1)
    result = dt.evaluate(X_test, y_test)
    assert result['AUC'] >= 0.0
    return dt, result
def test_custom_dnn(self):
    """Fit with ``deepnets.custom_dnn_D_A_D_B`` and look up its layers.

    After one epoch of training, the four ``dnn_custom_*`` layers are fetched
    by name from the underlying model and each must be truthy.
    """
    frame = dsutils.load_adult().head(100)
    target = frame.pop(14).values
    custom_params = {
        'custom_dnn_fn': deepnets.custom_dnn_D_A_D_B,
        'hidden_units': ((128, 0.2, True), (64, 0, False)),
    }
    conf = deeptable.ModelConfig(
        nets=['dnn_nets'],
        dnn_params=custom_params,
        metrics=['AUC'],
        fixed_embedding_dim=True,
        embeddings_output_dim=2,
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(frame, target, epochs=1)
    layer_names = (
        'dnn_custom_dense_1',
        'dnn_custom_dropout_1',
        'dnn_custom_bn_1',
        'dnn_custom_dense_2',
    )
    # Fetch every layer first, then assert, mirroring the lookup-then-check
    # order of the individual statements this loop replaces.
    layers = [model.model.get_layer(layer_name) for layer_name in layer_names]
    for layer in layers:
        assert layer
def test_transform(self):
    """Fit/transform a dask frame with ``DefaultDaskPreprocessor``.

    The full frame from ``dsutils.load_adult()`` is converted to a
    two-partition dask frame, split 80/20, and run through the preprocessor
    with discretisation, imputation, label encoding and categorisation on.
    The test then checks the produced column names, the column count, that
    row counts are preserved, and the sums of the transformed targets.
    """
    df_train = dd.from_pandas(dsutils.load_adult(), npartitions=2)
    y = df_train.pop(14)
    X = df_train
    X_train, X_test, y_train, y_test = get_tool_box(X, y).train_test_split(
        X, y, test_size=0.2, random_state=42)
    conf = deeptable.ModelConfig(
        auto_discrete=True,
        auto_imputation=True,
        auto_encode_label=True,
        auto_categorize=True,
        apply_gbm_features=False,
    )
    processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
    X1, y1 = processor.fit_transform(X_train, y_train)
    X2, y2 = processor.transform(X_test, y_test)

    expected_columns = {
        'x_1', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_13',
        'x_0_cat', 'x_4_cat', 'x_10_cat', 'x_11_cat', 'x_12_cat',
        'x_2', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12',
        'x_2_discrete', 'x_0_discrete', 'x_4_discrete',
        'x_10_discrete', 'x_11_discrete', 'x_12_discrete',
    }
    # No unexpected column may appear, and everything present after
    # fit_transform must also be present after transform.
    assert len(set(X1.columns.tolist()) - expected_columns) == 0
    assert len(set(X1.columns) - set(X2.columns)) == 0

    # The original shape/sum checks were written as `assert actual, expected`,
    # where `expected` is only the failure message, so nothing was compared.
    # len() is used for the row counts because it yields a plain int for the
    # split frames as well as for the transformed output.
    assert X1.shape[1] == 25
    assert X2.shape[1] == 25
    assert len(X1) == len(X_train)
    assert len(X2) == len(X_test)
    # NOTE(review): these sums come from the old message operands and were
    # never enforced; they depend on the exact random split -- confirm.
    assert y1.sum() == 6297
    assert y2.sum() == 1544
def setup_class(self):
    """Prepare a dask-backed dataset and run a 3-fold cross-validated fit.

    Stores the features/target, an 80/20 split, the DeepTable instance and
    the three probability arrays returned by ``fit_cross_validation`` as
    attributes on the object passed in.
    """
    setup_dask(self)
    print("Loading datasets...")
    pandas_frame = dsutils.load_adult().head(1000)
    df_train = dd.from_pandas(pandas_frame, npartitions=2)
    self.y = df_train.pop(14)
    self.X = df_train

    conf = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        auto_categorize=False,
        auto_discrete=False,
    )
    self.dt = deeptable.DeepTable(config=conf)

    split = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.X_train, self.X_eval, self.y_train, self.y_test = split

    cv_output = self.dt.fit_cross_validation(
        self.X_train,
        self.y_train,
        self.X_eval,
        num_folds=3,
        epochs=1,
        n_jobs=1,
    )
    self.oof_proba, self.eval_proba, self.test_proba = cv_output
def test_load_data(self):
    """Load each bundled dataset and verify its ``(rows, columns)`` shape."""
    df_adult = dsutils.load_adult()
    df_glass = dsutils.load_glass_uci()
    df_hd = dsutils.load_heart_disease_uci()
    df_bank = dsutils.load_bank()
    df_boston = dsutils.load_boston()
    # The original `assert df.shape, (rows, cols)` used the tuple as the
    # failure message, so the shapes were never compared.  Use explicit
    # equality.
    # NOTE(review): the expected shapes come from the old message operands
    # and were never enforced before -- confirm against the shipped data.
    assert df_adult.shape == (32561, 15)
    assert df_glass.shape == (214, 11)
    assert df_hd.shape == (303, 14)
    assert df_bank.shape == (108504, 18)
    assert df_boston.shape == (506, 14)
def setup_class(cls):
    """Load 1000 rows as a two-partition dask frame and store it on ``cls``.

    Sets ``cls.df``, ``cls.df_row_count`` and ``cls.target`` (column ``14``).
    """
    setup_dask(cls)
    print("Loading datasets...")
    cls.df_row_count = 1000
    pandas_frame = dsutils.load_adult().head(cls.df_row_count)
    cls.df = dex.dd.from_pandas(pandas_frame, npartitions=2)
    cls.target = 14
    print(f'Class {cls.__name__} setup.')
def test_cache_preprocessed_data(self):
    """Fit twice with one cache-enabled preprocessor.

    Two DeepTable instances share the same ``DefaultPreprocessor`` (created
    with ``use_cache=True``) and are fitted on identical data; the test
    passes when neither fit raises.
    """
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    frame = dsutils.load_adult().head(100)
    target = frame.pop(14).values
    preprocessor = DefaultPreprocessor(
        config,
        cache_home=homedir + '/cache',
        use_cache=True,
    )
    # First fit, then a second fit on a fresh DeepTable that reuses the same
    # preprocessor instance.
    first = deeptable.DeepTable(config=config, preprocessor=preprocessor)
    first.fit(frame, target, epochs=1)
    second = deeptable.DeepTable(config=config, preprocessor=preprocessor)
    second.fit(frame, target, epochs=1)
def setup_class(self):
    """Load 1000 rows, split them 80/20 and fit a DeepTable for one epoch.

    Stores the data, the split, the DeepTable instance and the two values
    returned by ``fit`` as attributes on the object passed in.
    """
    print("Loading datasets...")
    frame = dsutils.load_adult().head(1000)
    self.y = frame.pop(14).values
    self.X = frame

    conf = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    self.dt = deeptable.DeepTable(config=conf)

    split = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = split

    fit_output = self.dt.fit(self.X_train, self.y_train, epochs=1)
    self.model, self.history = fit_output
def test_modelinfo(self):
    """Wrap a fitted model in ``modelset.ModelInfo`` and inspect it.

    The model is trained for two epochs; the ModelInfo must expose a positive
    ``'val_auc'`` score and a two-entry ``'AUC'`` history in its meta data.
    """
    frame = dsutils.load_adult()
    target = frame.pop(14).values
    conf = deepmodels.ModelConfig(metrics=['AUC'])
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(frame, target, epochs=2)
    info = modelset.ModelInfo('val', 'm1', model, {}, history=history.history)
    assert info.score['val_auc'] > 0
    auc_history = info.meta['history']['AUC']
    assert len(auc_history) == 2
def test_embeddings_output_dim(self):
    """Fit with ``fixed_embedding_dim=False`` and ``embeddings_output_dim=0``.

    There is no explicit assertion; the test passes when the one-epoch fit on
    the training split completes without raising.
    """
    print("Loading datasets...")
    frame = dsutils.load_adult().head(1000)
    target = frame.pop(14).values
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
    )
    dt = deeptable.DeepTable(config=conf)
    split = train_test_split(frame, target, test_size=0.2, random_state=42)
    # Only the training portion of the split is used by this test.
    X_train, _X_test, y_train, _y_test = split
    _model, _history = dt.fit(X_train, y_train, epochs=1)
def test_categorical_columns_config(self):
    """Preprocess with an explicit ``categorical_columns`` list.

    With auto-discretisation and auto-categorisation switched off, every
    column produced by ``DefaultPreprocessor.fit_transform`` must belong to
    the eight allowed column names.
    """
    frame = dsutils.load_adult().head(1000)
    target = frame.pop(14).values
    conf = deeptable.ModelConfig(
        categorical_columns=['x_1', 'x_2', 'x_3'],
        auto_discrete=False,
        auto_imputation=True,
        auto_encode_label=True,
        auto_categorize=False,
        apply_gbm_features=False,
    )
    processor = DefaultPreprocessor(conf)
    X, y = processor.fit_transform(frame, target)
    allowed = {'x_1', 'x_2', 'x_3', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12'}
    # Subset test: no produced column may fall outside the allowed names.
    assert set(X.columns).issubset(allowed)
def test_use_cache(self):
    """Exercise the on-disk cache of ``DefaultPreprocessor``.

    Steps: clear the cache and confirm a miss; fit/transform to populate it
    and confirm a hit; then create a second preprocessor on the same
    ``cache_home``, load its transformers from the cache and confirm that
    both the training and validation entries can be read back.
    """
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    frame = dsutils.load_adult().head(1000)
    target = frame.pop(14).values
    X, X_val, y, y_val = train_test_split(frame, target, test_size=0.2)
    cache_home = homedir + '/preprocessor_cache'

    writer = DefaultPreprocessor(config, cache_home=cache_home, use_cache=True)
    writer.clear_cache()
    sign = writer.get_X_y_signature(X, y)
    sign_val = writer.get_X_y_signature(X_val, y_val)

    # Nothing has been cached yet.
    missing_X, missing_y = writer.get_transformed_X_y_from_cache(sign)
    assert missing_X is None
    assert missing_y is None

    writer.fit_transform(X, y)
    writer.transform(X_val, y_val)
    cached_X, cached_y = writer.get_transformed_X_y_from_cache(sign)
    assert cached_X is not None
    assert cached_y is not None

    # A brand-new preprocessor starts empty and restores itself from disk.
    reader = DefaultPreprocessor(config, cache_home=cache_home, use_cache=True)
    assert len(reader.X_transformers) == 0
    assert reader.y_lable_encoder is None
    assert reader.load_transformers_from_cache() == True
    assert len(reader.X_transformers) == 3
    assert reader.y_lable_encoder is not None

    train_X, train_y = reader.get_transformed_X_y_from_cache(sign)
    assert train_X is not None
    assert train_y is not None
    val_X, val_y = reader.get_transformed_X_y_from_cache(sign_val)
    assert val_X is not None
    assert val_y is not None
def test_categorical_columns_config_2(self):
    """Dask variant of the explicit ``categorical_columns`` check.

    Auto-discretisation is enabled here, so the allowed set additionally
    contains the ``*_discrete`` column names.
    """
    pandas_frame = dsutils.load_adult().head(1000)
    frame = dd.from_pandas(pandas_frame, npartitions=2)
    target = frame.pop(14)
    conf = deeptable.ModelConfig(
        categorical_columns=['x_1', 'x_2', 'x_3'],
        auto_discrete=True,
        auto_imputation=True,
        auto_encode_label=True,
        auto_categorize=False,
        apply_gbm_features=False,
    )
    processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
    X, y = processor.fit_transform(frame, target)
    allowed = {
        'x_1', 'x_2', 'x_3',
        'x_0', 'x_4', 'x_10', 'x_11', 'x_12',
        'x_0_discrete', 'x_4_discrete', 'x_10_discrete',
        'x_11_discrete', 'x_12_discrete',
    }
    # Subset test: no produced column may fall outside the allowed names.
    assert set(X.columns).issubset(allowed)
def run_nets(self, nets):
    """Fit and evaluate a DeepTable that uses the given ``nets``.

    Args:
        nets: forwarded as ``ModelConfig(nets=...)``.

    Returns:
        ``(dt, result)`` -- the fitted DeepTable and the value returned by
        ``dt.evaluate`` on the 20% hold-out split.
    """
    frame = dsutils.load_adult().head(100)
    target = frame.pop(14).values
    conf = deeptable.ModelConfig(
        nets=nets,
        metrics=['AUC'],
        fixed_embedding_dim=True,
        embeddings_output_dim=2,
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    dt = deeptable.DeepTable(config=conf)
    split = train_test_split(frame, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    _model, _history = dt.fit(X_train, y_train, epochs=1)
    result = dt.evaluate(X_test, y_test)
    assert result['AUC'] >= 0.0
    return dt, result
def test_get_models_retian_single_model(self):
    """Select trained models by ``'all'``, ``'top2'``, index list and name list.

    Trains with 2-fold cross-validation and ``retain_single_model=True``,
    then exercises the different selector forms of ``bt.get_models``.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=True,
    )
    bt.start()
    model_names = [
        "conf-1 - ['dnn_nets'] - CV - oof",
        "conf-1 - ['dnn_nets'] - dnn_nets-kfold-1 - eval",
        'LightGBM',
        'CatBoost',
    ]
    mis_all = bt.get_models('all')
    mis_top2 = bt.get_models('top2')
    mis_modelindex = bt.get_models([1, 3])
    mis_modelnames = bt.get_models(model_names)
    # The original checks were written as `assert actual, expected`; Python
    # treats the second operand as the failure message, so nothing was ever
    # compared.  Use explicit equality.
    # NOTE(review): the expected values come from the old message operands
    # and were never enforced before -- confirm against a real run.
    assert len(mis_all) == 6
    assert len(mis_top2) == 2
    assert len(mis_modelnames) == 4
    assert len(mis_modelindex) == 2
    assert mis_modelnames[0].name == model_names[0]
def test_fit_cv(self):
    """Run ``BatchTrainer.fit_cross_validation`` with the ``lgbm_fit`` callback.

    Non-numeric columns are label-encoded in place, the data is split 80/20,
    and the out-of-fold probabilities returned by the 5-fold stratified run
    must give a positive ROC AUC against the training targets.
    """
    frame = dsutils.load_adult().head(1000)
    y = frame.pop(14).values
    X = frame
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    cat_cols = list(set(cols) - set(num_cols))

    # The same encoder object is refitted for every non-numeric column.
    encoder = LabelEncoder()
    for name in cat_cols:
        X[name] = encoder.fit_transform(X[name])

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    lgbm_params = {'max_depth': 3, 'learning_rate': 0.01}
    # NOTE(review): `categorical_feature` receives every column (`cols`),
    # not just `cat_cols` -- confirm that this is intended.
    oof_proba = BatchTrainer.fit_cross_validation(
        'lightGBM',
        lgbm_fit,
        X_train,
        y_train,
        X_test,
        score_fn=roc_auc_score,
        estimator_params=lgbm_params,
        categorical_feature=cols,
        task_type='binary',
        num_folds=5,
        stratified=True,
        iterators=None,
        batch_size=None,
        preds_filepath=None,
    )
    assert roc_auc_score(y_train, oof_proba) > 0
def test_callback_injection(self):
    """Pass a ``ModelCheckpoint`` callback into ``fit`` and check its output.

    A temporary directory serves both as the config's ``home_dir`` and as the
    checkpoint target.  After a one-epoch fit the directory must contain
    ``saved_model.pb``.  The directory is removed when the test finishes;
    the previous ``tempfile.mkdtemp()`` call left it on disk after every run.
    """
    print("Loading datasets...")
    df_train = dsutils.load_adult()
    self.y = df_train.pop(14).values
    self.X = df_train
    # TemporaryDirectory deletes the directory and its contents on exit,
    # even when an assertion or the fit itself fails.
    with tempfile.TemporaryDirectory() as path:
        conf = deeptable.ModelConfig(
            metrics=['AUC'],
            apply_gbm_features=False,
            auto_discrete=False,
            home_dir=path,
        )
        self.dt = deeptable.DeepTable(config=conf)
        mcp = ModelCheckpoint(
            path,
            'val_auc',
            verbose=0,
            save_best_only=False,
            save_weights_only=False,
            mode='max',
            save_freq='epoch',
        )
        callbacks = [mcp]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42)
        self.model, self.history = self.dt.fit(
            self.X_train, self.y_train, epochs=1, callbacks=callbacks)
        # Must be inspected before the with-block exits and removes the files.
        files = os.listdir(path)
        assert 'saved_model.pb' in files
hdt = HyperDT(searcher, callbacks=[ SummaryCallback(), FileStorageLoggingCallback(searcher, output_dir=f'hotexamples_com/hyn_logs') ], reward_metric='AUC', earlystopping_patience=1) space = mini_dt_space() assert space.combinations == 589824 space2 = default_dt_space() assert space2.combinations == 3559292928 df = dsutils.load_adult() # df.drop(['id'], axis=1, inplace=True) df_train, df_test = train_test_split(df, test_size=0.2, random_state=42) X = df_train y = df_train.pop(14) y_test = df_test.pop(14) # dataset_id='adult_whole_data', hdt.search( df_train, y, df_test, y_test, max_trials=3, batch_size=256, epochs=1, verbose=1,
def test_opt_lightgbm(self):
    """Tune an ``LGBMClassifier`` with randomized, grid and Bayesian search.

    Each ``BatchTrainer`` search helper is run with 5-fold CV and ROC-AUC
    scoring; every returned parameter dict must contain a positive
    ``'max_depth'``.
    """
    frame = dsutils.load_adult().head(1000)
    y = frame.pop(14).values
    X = frame
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    cat_cols = list(set(cols) - set(num_cols))

    # The same encoder object is refitted for every non-numeric column.
    encoder = LabelEncoder()
    for name in cat_cols:
        X[name] = encoder.fit_transform(X[name])

    clf = LGBMClassifier(
        n_estimators=10,
        boosting_type='gbdt',
        categorical_feature=cat_cols,
        num_leaves=31,
    )
    # Keyword arguments common to all three search helpers.
    shared = dict(
        fit_params={'eval_metric': 'roc_auc'},
        scoring='roc_auc',
        n_jobs=1,
        cv=5,
    )

    # Randomized search.
    param_distributions = {
        'max_depth': [1, 3, 5],
        'learning_rate': sp_uniform(0.01, 1.0),
    }
    best_params1 = BatchTrainer.randomized_search(
        clf, param_distributions, X, y, **shared)

    # Grid search.
    param_grid = {
        'max_depth': [1, 3, 5],
        'learning_rate': [0.01, 0.05, 0.1],
    }
    best_params2 = BatchTrainer.grid_search(clf, param_grid, X, y, **shared)

    # Bayesian search.
    search_spaces = {
        'max_depth': Integer(1, 5),
        'learning_rate': Real(0.02, 0.6, 'log-uniform'),
    }
    best_params3 = BatchTrainer.bayes_search(
        clf, search_spaces, X, y, n_iter=10, **shared)

    for best in (best_params1, best_params2, best_params3):
        assert best['max_depth'] > 0
def test_opt_catboost(self):
    """Tune a ``CatBoostClassifier`` with randomized, grid and Bayesian search.

    Each ``BatchTrainer`` search helper is run with 5-fold CV and ROC-AUC
    scoring; every returned parameter dict must contain a positive
    ``'depth'``.
    """
    frame = dsutils.load_adult().head(1000)
    y = frame.pop(14).values
    X = frame
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    cat_cols = list(set(cols) - set(num_cols))

    clf = CatBoostClassifier(
        thread_count=4,
        loss_function='Logloss',
        cat_features=cat_cols,
        od_type='Iter',
        nan_mode='Min',
        iterations=1,
        eval_metric='AUC',
        metric_period=50,
        verbose=False,
    )
    # Keyword arguments common to all three search helpers.
    shared = dict(
        fit_params={'early_stopping_rounds': 10},
        scoring='roc_auc',
        n_jobs=1,
        cv=5,
    )

    # Randomized search.
    param_distributions = {
        'depth': [1, 3, 5],
        'learning_rate': sp_uniform(0.01, 1.0),
    }
    best_params1 = BatchTrainer.randomized_search(
        clf, param_distributions, X, y, **shared)

    # Grid search.
    param_grid = {
        'depth': [1, 3, 5],
        'learning_rate': [0.01, 0.05, 0.1],
    }
    best_params2 = BatchTrainer.grid_search(clf, param_grid, X, y, **shared)

    # Bayesian search.
    search_spaces = {
        'depth': Integer(1, 5),
        'learning_rate': Real(0.02, 0.6, 'log-uniform'),
    }
    best_params3 = BatchTrainer.bayes_search(
        clf, search_spaces, X, y, n_iter=10, **shared)

    for best in (best_params1, best_params2, best_params3):
        assert best['depth'] > 0