def test_multi_config(self):
    """Train two DeepTable configs in one batch and expect one model per config.

    Both configs share the same settings and differ only in ``name``. The
    trainer runs 2-fold cross validation with ``retain_single_model=False``.

    Bug fixed: the original ``assert len(ms.get_models()), 2`` used ``2`` as
    the assertion *message*, so it only verified that the model list was
    non-empty. It now compares the count.
    """
    data = dsutils.load_adult().head(1000)
    conf1 = deeptable.ModelConfig(
        name='conf001',
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    conf2 = deeptable.ModelConfig(
        name='conf002',
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=[conf1, conf2],
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=False,
    )
    ms = bt.start(models=['dt'])
    assert len(ms.get_models()) == 2
def setup_class(self):
    """Prepare a dask-backed adult dataset and run 3-fold cross validation once.

    The first 1000 adult rows are wrapped in a 2-partition dask DataFrame,
    column ``14`` becomes the target, and an 80/20 split is stored on the
    class. The out-of-fold, eval and test probabilities returned by
    ``fit_cross_validation`` are stored as well.
    """
    setup_dask(self)
    print("Loading datasets...")
    adult = dsutils.load_adult().head(1000)
    frame = dd.from_pandas(adult, npartitions=2)
    self.y = frame.pop(14)
    self.X = frame

    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        auto_categorize=False,
        auto_discrete=False,
    )
    self.dt = deeptable.DeepTable(config=config)

    parts = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.X_train, self.X_eval, self.y_train, self.y_test = parts

    cv_outputs = self.dt.fit_cross_validation(
        self.X_train,
        self.y_train,
        self.X_eval,
        num_folds=3,
        epochs=1,
        n_jobs=1,
    )
    self.oof_proba, self.eval_proba, self.test_proba = cv_outputs
def test_run_binary(self):
    """Run ``BatchTrainer.start()`` on 1000 adult rows and check the leaderboard width.

    Bug fixed: the original ``assert ms.leaderboard().shape[1], 7`` passed
    ``7`` as the assertion message, so any non-zero column count succeeded.
    The column count is now compared against the author's expected value.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        dnn_params={
            'dnn_units': ((256, 0, False), (256, 0, False)),
            'dnn_activation': 'relu',
        },
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        # Metric names noted by the original author:
        # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
    )
    ms = bt.start()
    assert ms.leaderboard().shape[1] == 7
def test_var_categorical_feature(self):
    """Fit a regression DNN with ``genres`` declared as a variable-length categorical.

    ``self.df`` supplies the features and the ``rating`` target. ``genres``
    is configured through the tuple ``('genres', "|", "max")``. The test
    passes when the built Keras model has an input named ``genres``.
    """
    features = self.df.copy()
    target = features.pop('rating').values.astype('float32')

    fixed_len_categoricals = [
        "movie_id", "user_id", "gender", "occupation", "zip", "title", "age"
    ]
    config = deeptable.ModelConfig(
        nets=['dnn_nets'],
        task=consts.TASK_REGRESSION,
        categorical_columns=fixed_len_categoricals,
        metrics=['mse'],
        fixed_embedding_dim=True,
        embeddings_output_dim=4,
        apply_gbm_features=False,
        apply_class_weight=True,
        earlystopping_patience=5,
        var_len_categorical_columns=[('genres', "|", "max")],
    )
    table = deeptable.DeepTable(config=config)

    X_fit, X_val, y_fit, y_val = train_test_split(features, target, test_size=0.2)
    fitted, _history = table.fit(
        X_fit,
        y_fit,
        validation_data=(X_val, y_val),
        epochs=10,
        batch_size=32,
    )

    input_names = fitted.model.input_names
    assert 'genres' in input_names
def test_run_binary_heart_disease_CV(self):
    """Run the batch trainer with 3-fold CV on the heart-disease data and check the leaderboard width.

    Bug fixed: the original ``assert ms.leaderboard().shape[1], 7`` used
    ``7`` as the assertion message and therefore never compared anything.
    """
    data = dsutils.load_heart_disease_uci()
    conf = deeptable.ModelConfig(
        dnn_params={
            'dnn_units': ((256, 0, False), (256, 0, False)),
            'dnn_activation': 'relu',
        },
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=True,
        auto_categorize=False,
        cat_exponent=0.4,
        cat_remain_numeric=True,
        monitor_metric='val_loss',
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'target',
        eval_size=0.2,
        validation_size=0.2,
        # Metric names noted by the original author:
        # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=3,
    )
    ms = bt.start()
    assert ms.leaderboard().shape[1] == 7
def test_custom_dnn(self):
    """Fit a DNN built by ``deepnets.custom_dnn_D_A_D_B`` and look up its custom layers.

    After one epoch on 100 adult rows, the four layers named below are
    fetched from the Keras model and each is asserted to be truthy.
    """
    features = dsutils.load_adult().head(100)
    labels = features.pop(14).values

    custom_dnn_params = {
        'custom_dnn_fn': deepnets.custom_dnn_D_A_D_B,
        'hidden_units': ((128, 0.2, True), (64, 0, False)),
    }
    config = deeptable.ModelConfig(
        nets=['dnn_nets'],
        dnn_params=custom_dnn_params,
        metrics=['AUC'],
        fixed_embedding_dim=True,
        embeddings_output_dim=2,
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    table = deeptable.DeepTable(config=config)
    fitted, _history = table.fit(features, labels, epochs=1)

    expected_layer_names = (
        'dnn_custom_dense_1',
        'dnn_custom_dropout_1',
        'dnn_custom_bn_1',
        'dnn_custom_dense_2',
    )
    # Fetch every layer before asserting, in the same order as the lookups.
    found_layers = [fitted.model.get_layer(name) for name in expected_layer_names]
    for layer in found_layers:
        assert layer
def test_run_lgbm(self):
    """Train the trainer's LightGBM model with custom parameters and check its AUC.

    ``train_lgbm`` must return a truthy model and a score dict whose
    ``'auc'`` entry is positive.
    """
    adult_sample = dsutils.load_adult().head(1000)

    dnn_settings = {
        'dnn_units': ((256, 0, False), (256, 0, False)),
        'dnn_activation': 'relu',
    }
    config = deeptable.ModelConfig(
        dnn_params=dnn_settings,
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )

    lgbm_settings = {
        'learning_rate': 0.01,
        'colsample_bytree': 0.95,
        'reg_alpha': 0.04,
        'reg_lambda': 0.07,
    }
    trainer = batch_trainer.BatchTrainer(
        adult_sample,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=config,
        verbose=0,
        dt_epochs=1,
        lightgbm_params=lgbm_settings,
    )

    booster, metrics = trainer.train_lgbm(config)
    assert booster
    assert metrics['auc'] > 0
def test_probe_evaluation(self):
    """Probe three intermediate layers of a trained DeepTable model.

    ``probe_evaluate('all', ...)`` is expected to return one entry, keyed
    ``"conf-1 - ['dnn_nets'] - eval"``, holding one result per probed layer.

    Bug fixed: both original asserts had the form ``assert len(x), n``,
    which uses ``n`` as the failure message and never compares the length.
    The unused ``ms`` binding was dropped.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=False,
    )
    bt.start(models=['dt'])
    probed_layers = ['flatten_embeddings', 'dnn_dense_1', 'dnn_dense_2']
    result = bt.probe_evaluate('all', layers=probed_layers)
    assert len(result) == 1
    assert len(result["conf-1 - ['dnn_nets'] - eval"]) == len(probed_layers)
def test_zero_testset_cross_validation(self):
    """Check that ``eval_size=0`` keeps every row for training and yields a single CV model.

    With no eval split the trainer must expose ``X_eval`` as ``None`` and
    keep all 1000 rows in ``X_train``.

    Bug fixed: ``assert len(bt.X_train), 1000`` and
    ``assert len(ms.get_models()), 1`` used the expected value as the
    assertion message. Both now compare with ``==``.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=False,
    )
    assert len(bt.X_train) == 1000
    assert bt.X_eval is None
    ms = bt.start(models=['dt'])
    assert len(ms.get_models()) == 1
def test_run_catboost(self):
    """Train the trainer's CatBoost model for 5 iterations and check its AUC.

    ``train_catboost`` must return a truthy model and a score dict whose
    ``'auc'`` entry is positive.
    """
    adult_sample = dsutils.load_adult().head(1000)

    dnn_settings = {
        'dnn_units': ((256, 0, False), (256, 0, False)),
        'dnn_activation': 'relu',
    }
    config = deeptable.ModelConfig(
        dnn_params=dnn_settings,
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
    )

    trainer = batch_trainer.BatchTrainer(
        adult_sample,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=config,
        verbose=0,
        dt_epochs=1,
        catboost_params={'iterations': 5},
    )

    catboost_model, metrics = trainer.train_catboost(config)
    assert catboost_model
    assert metrics['auc'] > 0
def test_ensemble_predict_proba(self):
    """Ensemble all trained models and check the shape of the averaged probabilities.

    Bugs fixed:

    * ``assert proba.shape, (6513, )`` used the tuple as the assertion
      message, so the shape was never checked.
    * The literal ``6513`` does not match this test's data, which is only
      the first 1000 rows with ``eval_size=0.2``. The expected length is
      now taken from the trainer's own eval split.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        # Metric names noted by the original author:
        # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=5,
    )
    bt.start()
    proba, preds, score, submission = bt.ensemble_predict_proba('all')
    # NOTE(review): assumes that without an explicit X the ensemble scores
    # the eval split (no data_test is supplied here) -- confirm.
    assert proba.shape == (len(bt.X_eval),)
def test_class_weights(self):
    """Fit one epoch with ``apply_class_weight=True`` and expect a positive first-epoch AUC."""
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    table = deeptable.DeepTable(config=config)
    _model, history = table.fit(self.X_train, self.y_train, epochs=1)
    first_epoch_auc = history.history['AUC'][0]
    assert first_epoch_auc > 0
def run_nets(self, nets, **kwargs):
    """Fit and evaluate a DeepTable with the given ``nets`` on 100 adult rows.

    Args:
        nets: Value forwarded as ``ModelConfig(nets=...)``.
        **kwargs: Extra keyword arguments forwarded to ``ModelConfig``.

    Returns:
        A ``(dt, result)`` tuple: the fitted ``DeepTable`` and the dict
        returned by ``dt.evaluate`` on the held-out 20%.
    """
    features = dsutils.load_adult().head(100)
    labels = features.pop(14).values

    config = deeptable.ModelConfig(
        nets=nets,
        metrics=['AUC'],
        fixed_embedding_dim=True,
        embeddings_output_dim=2,
        apply_gbm_features=False,
        apply_class_weight=True,
        **kwargs,
    )
    dt = deeptable.DeepTable(config=config)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    _model, _history = dt.fit(X_fit, y_fit, epochs=1)

    result = dt.evaluate(X_holdout, y_holdout)
    assert result['AUC'] >= 0.0

    # A save/reload round trip in a separate process used to live here but
    # was disabled by the original author.
    return dt, result
def run(distribute_strategy=None, batch_size=32, epochs=5):
    """Train a DeepFM DeepTable on the bank dataset and print its score and predictions.

    Args:
        distribute_strategy: Forwarded to ``ModelConfig(distribute_strategy=...)``.
        batch_size: Batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.
    """
    # Load the data and hold out 20% for evaluation.
    bank = dsutils.load_bank()
    train_part, test_part = train_test_split(bank, test_size=0.2, random_state=42)
    y_train = train_part.pop('y')
    y_test = test_part.pop('y')

    # Train.
    table = deeptable.DeepTable(
        config=deeptable.ModelConfig(
            nets=deepnets.DeepFM,
            earlystopping_patience=999,
            apply_class_weight=True,
            distribute_strategy=distribute_strategy,
        )
    )
    _model, _history = table.fit(train_part, y_train, batch_size=batch_size, epochs=epochs)

    # Evaluate on the held-out rows.
    score = table.evaluate(test_part, y_test, verbose=0)
    print('score:', score)

    # Print how often each predicted label occurs.
    predictions = table.predict(test_part)
    labels, counts = np.unique(predictions, return_counts=True)
    print(dict(zip(labels, counts)))
def test_only_1_categorical(self, net):
    """Fit ``net`` on MovieLens using ``movie_id`` as the only (categorical) feature.

    Notes carried over from the original author on which nets can work
    with a single embedding:

    * ``afm_nets`` needs an embedding array with at least 2 elements.
    * ``opnn_nets``, ``ipnn_nets`` and ``pnn_nets`` need at least 2
      embeddings to build ``layers.InnerProduct``.
    * ``dnn_nets``, ``cross_dnn_nets``, ``cross_nets`` and ``dcn_nets`` do
      not use embeddings.
    * ``fibi_nets`` and ``fibi_dnn_nets`` need at least 2 embeddings
      because of ``BilinearInteraction``.
    """
    movielens = dsutils.load_movielens()
    target = movielens['rating'].values.astype('float32')
    features = movielens[['movie_id']]

    config = deeptable.ModelConfig(
        nets=[net],
        task=consts.TASK_REGRESSION,
        categorical_columns=["movie_id"],
        metrics=['mse'],
        fixed_embedding_dim=True,
        embeddings_output_dim=4,
        apply_gbm_features=False,
        apply_class_weight=True,
        earlystopping_patience=5,
    )
    table = deeptable.DeepTable(config=config)
    fitted, _history = table.fit(
        features,
        target,
        validation_split=0.2,
        epochs=10,
        batch_size=32,
    )
    assert fitted
def test_transform(self):
    """Fit/transform the adult data with ``DefaultDaskPreprocessor`` and check the output layout.

    The transformed training frame may only contain the 25 expected column
    names, the test frame must contain every training column, both frames
    must keep their row count, and the label sums must match the author's
    expected values.

    Bugs fixed: the four trailing asserts had the form ``assert a, b``,
    which uses ``b`` as the failure message and never compares. They now
    compare explicitly. Row counts use ``len()`` because ``.shape[0]`` of
    a dask DataFrame is lazy and cannot be compared inside a tuple.
    """
    df_train = dd.from_pandas(dsutils.load_adult(), npartitions=2)
    y = df_train.pop(14)
    X = df_train
    X_train, X_test, y_train, y_test = get_tool_box(X, y).train_test_split(
        X, y, test_size=0.2, random_state=42)
    conf = deeptable.ModelConfig(
        auto_discrete=True,
        auto_imputation=True,
        auto_encode_label=True,
        auto_categorize=True,
        apply_gbm_features=False,
    )
    processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
    X1, y1 = processor.fit_transform(X_train, y_train)
    X2, y2 = processor.transform(X_test, y_test)

    expected_columns = {
        'x_1', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_13',
        'x_0_cat', 'x_4_cat', 'x_10_cat', 'x_11_cat', 'x_12_cat',
        'x_2', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12',
        'x_2_discrete', 'x_0_discrete', 'x_4_discrete',
        'x_10_discrete', 'x_11_discrete', 'x_12_discrete',
    }
    assert set(X1.columns.tolist()) <= expected_columns
    assert set(X1.columns) <= set(X2.columns)

    assert len(X1) == len(X_train)
    assert X1.shape[1] == 25
    assert len(X2) == len(X_test)
    assert X2.shape[1] == 25

    # NOTE(review): these label sums are the original author's expectations
    # and depend on the exact train/test split -- confirm they still hold
    # for the dask tool box split.
    assert y1.sum() == 6297
    assert y2.sum() == 1544
def test_predict_unseen_data(self):
    """Predict on dask data whose value ranges exceed those seen during fit.

    Training uses ``x1`` in ``[0, 10)`` and ``x2`` in ``{'0', '1'}``. The
    prediction frame draws ``x1`` from ``[0, 50)`` and ``x2`` from ten
    string values, so it can contain categories the model never saw.

    Bug fixed: ``assert len(preds), 10`` used ``10`` as the assertion
    message. The prediction count is now compared to the 10 input rows.
    """
    x1 = np.random.randint(0, 10, size=(100), dtype='int')
    x2 = np.random.randint(0, 2, size=(100)).astype('str')
    x3 = np.random.normal(0.0, 1.0, size=(100))
    y = np.random.randint(0, 2, size=(100), dtype='int')
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'y': y})
    df = dd.from_pandas(df, npartitions=1)
    y = df.pop('y')

    dt = deeptable.DeepTable(config=deeptable.ModelConfig(
        apply_gbm_features=False,
        auto_categorize=True,
        auto_discrete=True,
    ))
    dt.fit(df, y)

    xt_1 = np.random.randint(0, 50, size=(10), dtype='int')
    xt_2 = np.random.randint(0, 10, size=(10)).astype('str')
    xt_3 = np.random.normal(0.0, 2.0, size=(10))
    dft = pd.DataFrame({'x1': xt_1, 'x2': xt_2, 'x3': xt_3})
    dft = dd.from_pandas(dft, npartitions=2)

    preds = dt.predict(dft)
    assert len(preds) == 10
def test_fit(self):
    """Fit a ``task='multilabel'`` DeepTable on random data with two label columns.

    The test has no assertion; it passes when ``fit`` completes without
    raising.
    """
    print("Loading datasets...")
    n_rows = 100
    # Draw order is kept: x1, x2, x3, then the two label columns.
    feature_1 = np.random.randint(0, 10, size=(n_rows), dtype='int')
    feature_2 = np.random.normal(0.0, 1.0, size=(n_rows))
    feature_3 = np.random.normal(0.0, 1.0, size=(n_rows))
    label_1 = np.random.randint(0, 2, size=(n_rows), dtype='int')
    label_2 = np.random.randint(0, 2, size=(n_rows), dtype='int')

    features = pd.DataFrame({'x1': feature_1, 'x2': feature_2, 'x3': feature_3})
    labels = pd.DataFrame({'y1': label_1, 'y2': label_2})

    config = deeptable.ModelConfig(
        metrics=['AUC'],
        nets=['dnn_nets'],
        apply_gbm_features=False,
        task='multilabel',
    )
    table = deeptable.DeepTable(config=config)

    X_fit, _X_holdout, y_fit, _y_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    _model, _history = table.fit(X_fit, y_fit.values, batch_size=10, epochs=1)
def test_leaderboard(self):
    """Check the ``eval``, ``oof`` and ``val`` leaderboards after a 2-fold CV run.

    Bug fixed: ``assert len(eval_lb), 5`` and ``assert len(oof_lb), 1``
    used the expected count as the assertion message, so they only checked
    that the leaderboards were non-empty. They now compare the row counts
    against the author's expected values.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        eval_size=0.2,
        validation_size=0.2,
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=2,
        retain_single_model=True,
    )
    ms = bt.start()
    eval_lb = ms.leaderboard(type='eval')
    oof_lb = ms.leaderboard(type='oof')
    val_lb = ms.leaderboard(type='val')
    assert len(eval_lb) == 5
    assert len(oof_lb) == 1
    assert val_lb is None
def test_run_cross_validation(self):
    """Run 5-fold CV for the DeepTable model with a test set and check the leaderboard width.

    The same 1000 rows are passed as both training data and ``data_test``.

    Bug fixed: ``assert ms.leaderboard().shape[1], 7`` used ``7`` as the
    assertion message. The column count is now compared.
    """
    data = dsutils.load_adult().head(1000)
    conf = deeptable.ModelConfig(
        fixed_embedding_dim=False,
        embeddings_output_dim=0,
        apply_gbm_features=False,
        auto_discrete=False,
    )
    bt = batch_trainer.BatchTrainer(
        data,
        'x_14',
        data_test=data,
        eval_size=0.2,
        validation_size=0.2,
        # Metric names noted by the original author:
        # AUC/recall/precision/f1/mse/mae/msle/rmse/r2
        eval_metrics=['AUC', 'accuracy', 'recall', 'precision', 'f1'],
        dt_config=conf,
        verbose=0,
        dt_epochs=1,
        cross_validation=True,
        num_folds=5,
    )
    ms = bt.start(models=['dt'])
    assert ms.leaderboard().shape[1] == 7
def setup_class(self):
    """Load the data, split it 80/20 and fit a regression DeepTable for 100 epochs.

    The data, the split, the ``DeepTable``, the fitted model and its
    training history are all stored on the class for the tests.
    """
    self.X, self.y = self.load_data()

    regression_config = deeptable.ModelConfig(
        task=consts.TASK_REGRESSION,
        metrics=[r2_c, 'RootMeanSquaredError'],
        apply_gbm_features=False,
    )
    self.dt = deeptable.DeepTable(config=regression_config)

    toolbox = get_tool_box(self.X)
    parts = toolbox.train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = parts

    fit_result = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=100)
    self.model, self.history = fit_result
def test_gbm_feature_embedding(self):
    """Check that embedding-type GBM features add one ``lgbm_leaf`` column per estimator.

    The GBM is configured with ``n_estimators=10``, so ten categorical
    columns containing ``'lgbm_leaf'`` are expected.

    Bug fixed: ``assert len(lgbm_leaves), 10`` used ``10`` as the
    assertion message. The count is now compared.
    """
    conf = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=True,
        gbm_feature_type=consts.GBM_FEATURE_TYPE_EMB,
        gbm_params={
            'learning_rate': 0.01,
            'colsample_bytree': 0.95,
            'reg_alpha': 0.04,
            'reg_lambda': 0.07,
            'n_estimators': 10,
        },
    )
    dt, dm, history = self.run_dt(conf)
    lgbm_leaves = [c for c in dt.preprocessor.get_categorical_columns() if 'lgbm_leaf' in c]
    assert len(lgbm_leaves) == 10
def test_cache_preprocessed_data(self):
    """Fit twice through one cache-enabled preprocessor.

    Both ``DeepTable`` instances share the same ``DefaultPreprocessor``
    (``use_cache=True``) and the same data. The test has no assertion; it
    passes when both fits complete.
    """
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    features = dsutils.load_adult().head(100)
    labels = features.pop(14).values

    cache_dir = homedir + '/cache'
    shared_preprocessor = DefaultPreprocessor(config, cache_home=cache_dir, use_cache=True)

    # Two independent DeepTable instances, fitted one after the other.
    for _attempt in range(2):
        table = deeptable.DeepTable(config=config, preprocessor=shared_preprocessor)
        table.fit(features, labels, epochs=1)
def setup_class(self):
    """Load the glass dataset as a dask frame, split and persist it, then fit for 3 epochs.

    Column ``10`` is the target. The four pieces of the 80/20 split are
    persisted before being stored on the class.
    """
    setup_dask(self)
    print("Loading datasets...")
    data = dd.from_pandas(dsutils.load_glass_uci(), npartitions=2)
    self.y = data.pop(10).values
    self.X = data

    config = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False)
    self.dt = deeptable.DeepTable(config=config)

    split = get_tool_box(data).train_test_split(
        self.X, self.y, test_size=0.2, random_state=42
    )
    persisted = [part.persist() for part in split]
    self.X_train, self.X_test, self.y_train, self.y_test = persisted

    fit_result = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=3)
    self.model, self.history = fit_result
def test_default_settings(self):
    """Train with near-default settings, then save, reload and predict.

    The model is saved under a path built from the test class name and a
    timestamp. ``dt.pkl`` and ``dnn_nets.h5`` must exist there, and the
    reloaded model must produce one prediction per row of ``self.df``.

    Bug fixed: ``assert preds.shape, (self.df_row_count,)`` used the tuple
    as the assertion message. The shape is now compared.
    """
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    dt, _ = self.run_dt(config)

    # Save and reload.
    filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
    dt.save(filepath)
    assert fs.exists(f'{filepath}/dt.pkl')
    assert fs.exists(f'{filepath}/dnn_nets.h5')
    newdt = deeptable.DeepTable.load(filepath)

    X_eval = self.df.copy()
    X_eval.pop(self.target)
    preds = newdt.predict(X_eval)
    assert preds.shape == (self.df_row_count,)
def setup_class(self):
    """Fit a class-weighted DeepTable for one epoch on 1000 adult rows.

    Column ``14`` is the target. The 80/20 split, the ``DeepTable``, the
    fitted model and its history are stored on the class.
    """
    print("Loading datasets...")
    adult = dsutils.load_adult().head(1000)
    self.y = adult.pop(14).values
    self.X = adult

    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        apply_class_weight=True,
    )
    self.dt = deeptable.DeepTable(config=config)

    parts = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = parts

    fit_result = self.dt.fit(self.X_train, self.y_train, epochs=1)
    self.model, self.history = fit_result
def test_gbm_features_with_params(self):
    """Check that ``gbm_params`` reach the LightGBM model and yield ten leaf columns.

    Every key in ``params`` must appear as an attribute with the same
    value on the fitted ``lgbm`` object. ``n_estimators=10`` should produce
    ten categorical columns containing ``'lgbm_leaf'``.

    Bug fixed: ``assert len(lgbm_leaves), 10`` used ``10`` as the
    assertion message. The count is now compared. ``all([...])`` was also
    replaced by a generator expression.
    """
    params = {
        'learning_rate': 0.01,
        'colsample_bytree': 0.95,
        'reg_alpha': 0.04,
        'reg_lambda': 0.07,
        'n_estimators': 10,
    }
    config = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=True,
        gbm_params=params,
    )
    dt, _ = self.run_dt(config)

    lgbm = dt.preprocessor.X_transformers['gbm_features'].lgbm
    assert all(getattr(lgbm, k, None) == v for k, v in params.items())

    lgbm_leaves = [c for c in dt.preprocessor.get_categorical_columns() if 'lgbm_leaf' in c]
    assert len(lgbm_leaves) == 10
def test_gbm_feature_dense(self):
    """Check that dense-type GBM features add the expected layers to the Keras model.

    The model must contain a layer named
    ``consts.INPUT_PREFIX_NUM + 'gbm_leaves'`` and a layer named
    ``'concat_continuous_inputs'``.

    Cleanup: the unused local ``layers`` and the commented-out lookups
    were removed.
    """
    conf = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=True,
        gbm_feature_type=consts.GBM_FEATURE_TYPE_DENSE,
        gbm_params={
            'learning_rate': 0.01,
            'colsample_bytree': 0.95,
            'reg_alpha': 0.04,
            'reg_lambda': 0.07,
            'n_estimators': 10,
        },
    )
    dt, dm, history = self.run_dt(conf)
    dense_lgbm_input = dm.model.get_layer(consts.INPUT_PREFIX_NUM + 'gbm_leaves')
    concat_continuous_inputs = dm.model.get_layer('concat_continuous_inputs')
    assert dense_lgbm_input
    assert concat_continuous_inputs
def test_only_var_len_categorical_feature(self):
    """Train a regression DNN with ``genres`` as the only feature column.

    The feature frame, the ``rating`` target and the config are handed to
    ``self._train_and_asset``, which performs the training and checks.
    """
    movielens: pd.DataFrame = self.df_movielens.copy()
    features = movielens[['genres']]
    target = movielens['rating']

    config = deeptable.ModelConfig(
        nets=['dnn_nets'],
        task=consts.TASK_REGRESSION,
        metrics=['mse'],
        fixed_embedding_dim=True,
        embeddings_output_dim=4,
        apply_gbm_features=False,
        apply_class_weight=True,
        earlystopping_patience=3,
    )
    self._train_and_asset(features, target, config)
def test_embeddings_output_dim(self):
    """Fit one epoch with ``fixed_embedding_dim=False`` and ``embeddings_output_dim=0``.

    The test has no assertion; it passes when ``fit`` completes on the
    training 80% of 1000 adult rows.
    """
    print("Loading datasets...")
    adult = dsutils.load_adult().head(1000)
    labels = adult.pop(14).values
    features = adult

    config = deeptable.ModelConfig(fixed_embedding_dim=False, embeddings_output_dim=0)
    table = deeptable.DeepTable(config=config)

    X_fit, _X_holdout, y_fit, _y_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    _model, _history = table.fit(X_fit, y_fit, epochs=1)