def test_cache_preprocessed_data(self):
    config = deeptable.ModelConfig(metrics=['AUC'],
                                   apply_gbm_features=False,
                                   apply_class_weight=True)
    df_train = dsutils.load_adult().head(100)
    y = df_train.pop(14).values
    X = df_train
    cache_home = homedir + '/cache'
    preprocessor = DefaultPreprocessor(config, cache_home=cache_home, use_cache=True)

    # The first fit populates the cache; the second fit with the same
    # preprocessor should reuse the cached preprocessed data.
    dt = deeptable.DeepTable(config=config, preprocessor=preprocessor)
    dt.fit(X, y, epochs=1)

    dt = deeptable.DeepTable(config=config, preprocessor=preprocessor)
    dt.fit(X, y, epochs=1)

def test_predict_unseen_data(self):
    x1 = np.random.randint(0, 10, size=(100), dtype='int')
    x2 = np.random.randint(0, 2, size=(100)).astype('str')
    x3 = np.random.normal(0.0, 1.0, size=(100))
    y = np.random.randint(0, 2, size=(100), dtype='int')

    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'y': y})
    df = dd.from_pandas(df, npartitions=1)
    y = df.pop('y')
    dt = deeptable.DeepTable(config=deeptable.ModelConfig(
        apply_gbm_features=False,
        auto_categorize=True,
        auto_discrete=True,
        # nets=['linear', 'cin_nets', 'fm_nets', 'afm_nets', 'pnn_nets',
        #       'dnn2_nets', 'dcn_nets', 'autoint_nets', 'fibi_dnn_nets',
        #       'fg_nets', 'fgcnn_cin_nets', 'fgcnn_fm_nets', 'fgcnn_ipnn_nets',
        #       'fgcnn_dnn_nets'],
    ))
    dt.fit(df, y)

    # Predict on data drawn from wider distributions than the training data,
    # i.e. values the model has never seen.
    xt_1 = np.random.randint(0, 50, size=(10), dtype='int')
    xt_2 = np.random.randint(0, 10, size=(10)).astype('str')
    xt_3 = np.random.normal(0.0, 2.0, size=(10))

    dft = pd.DataFrame({'x1': xt_1, 'x2': xt_2, 'x3': xt_3})
    dft = dd.from_pandas(dft, npartitions=2)
    preds = dt.predict(dft)
    assert len(preds) == 10

def run_nets(self, nets, **kwargs):
    df_train = dsutils.load_adult().head(100)
    y = df_train.pop(14).values
    X = df_train
    conf = deeptable.ModelConfig(nets=nets,
                                 metrics=['AUC'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=2,
                                 apply_gbm_features=False,
                                 apply_class_weight=True,
                                 **kwargs)
    dt = deeptable.DeepTable(config=conf)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model, history = dt.fit(X_train, y_train, epochs=1)
    result = dt.evaluate(X_test, y_test)
    assert result['AUC'] >= 0.0

    # test reload from disk
    # model_path = os.path.join("/tmp/dt_model", str(uuid.uuid4()))
    # dt.save(model_path)
    #
    # p = multiprocessing.Process(target=self.run_load_model, args=(model_path, X_test, y_test,))
    # p.start()
    # p.join()

    return dt, result

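# The reload block commented out above refers to a `run_load_model` helper that is
# not shown in this excerpt. A minimal sketch of what such a helper could look like
# (an assumption, using only the save/load API exercised elsewhere in these tests):
#
# def run_load_model(self, model_path, X_test, y_test):
#     dt = deeptable.DeepTable.load(model_path)  # restore from disk in a fresh process
#     result = dt.evaluate(X_test, y_test)
#     assert result['AUC'] >= 0.0
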
def run(distribute_strategy=None, batch_size=32, epochs=5):
    # loading data
    df = dsutils.load_bank()
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    y = df_train.pop('y')
    y_test = df_test.pop('y')

    # training
    config = deeptable.ModelConfig(
        nets=deepnets.DeepFM,
        earlystopping_patience=999,
        apply_class_weight=True,
        distribute_strategy=distribute_strategy,
    )
    dt = deeptable.DeepTable(config=config)
    model, history = dt.fit(df_train, y, batch_size=batch_size, epochs=epochs)

    # evaluation
    result = dt.evaluate(df_test, y_test, verbose=0)
    print('score:', result)

    # scoring
    preds = dt.predict(df_test)
    uniques = np.unique(preds, return_counts=True)
    print({k: v for k, v in zip(*uniques)})

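# Minimal usage sketch for `run` above (an assumption about how this example script
# is driven). Passing a strategy such as tf.distribute.MirroredStrategy() would train
# across all visible GPUs; None keeps single-device training.
if __name__ == '__main__':
    run(distribute_strategy=None, batch_size=32, epochs=5)
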
def run_dt(self, config):
    df_train = dsutils.load_adult().head(1000)
    y = df_train.pop(14).values
    X = df_train
    dt = deeptable.DeepTable(config=config)
    dm, history = dt.fit(X, y, epochs=1)
    return dt, dm, history

def test_fit(self):
    print("Loading datasets...")
    x1 = np.random.randint(0, 10, size=(100), dtype='int')
    x2 = np.random.normal(0.0, 1.0, size=(100))
    x3 = np.random.normal(0.0, 1.0, size=(100))
    y1 = np.random.randint(0, 2, size=(100), dtype='int')
    y2 = np.random.randint(0, 2, size=(100), dtype='int')
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})
    df_y = pd.DataFrame({'y1': y1, 'y2': y2})

    conf = deeptable.ModelConfig(metrics=['AUC'],
                                 nets=['dnn_nets'],
                                 apply_gbm_features=False,
                                 task='multilabel')
    dt = deeptable.DeepTable(config=conf)
    X_train, X_test, y_train, y_test = train_test_split(df, df_y, test_size=0.2, random_state=42)
    model, history = dt.fit(X_train, y_train.values, batch_size=10, epochs=1)

def test_custom_dnn(self):
    df_train = dsutils.load_adult().head(100)
    y = df_train.pop(14).values
    X = df_train
    conf = deeptable.ModelConfig(nets=['dnn_nets'],
                                 dnn_params={
                                     'custom_dnn_fn': deepnets.custom_dnn_D_A_D_B,
                                     # each spec: (units, dropout_rate, use_batch_norm)
                                     'hidden_units': ((128, 0.2, True), (64, 0, False)),
                                 },
                                 metrics=['AUC'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=2,
                                 apply_gbm_features=False,
                                 apply_class_weight=True)
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(X, y, epochs=1)

    l1 = model.model.get_layer('dnn_custom_dense_1')
    l2 = model.model.get_layer('dnn_custom_dropout_1')
    l3 = model.model.get_layer('dnn_custom_bn_1')
    l4 = model.model.get_layer('dnn_custom_dense_2')
    assert l1
    assert l2
    assert l3
    assert l4

def setup_class(self): setup_dask(self) print("Loading datasets...") df_train = dd.from_pandas(dsutils.load_adult().head(1000), npartitions=2) self.y = df_train.pop(14) self.X = df_train conf = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, auto_categorize=False, auto_discrete=False) self.dt = deeptable.DeepTable(config=conf) self.X_train, \ self.X_eval, \ self.y_train, \ self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42) self.oof_proba, self.eval_proba, self.test_proba = \ self.dt.fit_cross_validation(self.X_train, self.y_train, self.X_eval, num_folds=3, epochs=1, n_jobs=1)
def test_class_weights(self):
    conf = deeptable.ModelConfig(metrics=['AUC'],
                                 apply_gbm_features=False,
                                 apply_class_weight=True)
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(self.X_train, self.y_train, epochs=1)
    assert history.history['AUC'][0] > 0

def test_only_1_categorical(self, net):
    # Note: afm_nets needs an embedding array with at least 2 elements
    # Note: opnn_nets/ipnn_nets/pnn_nets need at least 2 embeddings to build `layers.InnerProduct`
    # Note: dnn_nets/cross_dnn_nets/cross_nets/dcn_nets do not use embeddings
    # Note: fibi_nets/fibi_dnn_nets need at least 2 embeddings because of `BilinearInteraction`
    df = dsutils.load_movielens()
    y = df['rating'].values.astype('float32')
    X = df[['movie_id']]

    conf = deeptable.ModelConfig(nets=[net],
                                 task=consts.TASK_REGRESSION,
                                 categorical_columns=["movie_id"],
                                 metrics=['mse'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=4,
                                 apply_gbm_features=False,
                                 apply_class_weight=True,
                                 earlystopping_patience=5)
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(X, y, validation_split=0.2, epochs=10, batch_size=32)
    assert model

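# `net` is injected per test case; a typical driver (an assumption, the actual
# parametrization is outside this excerpt) would iterate over nets that can be
# built from a single embedding, consistent with the notes above, e.g.:
#
# @pytest.mark.parametrize('net', ['dnn_nets', 'linear', 'fm_nets', 'cin_nets'])
# def test_only_1_categorical(self, net): ...
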
def test_var_categorical_feature(self):
    X = self.df.copy()
    y = X.pop('rating').values.astype('float32')
    conf = deeptable.ModelConfig(nets=['dnn_nets'],
                                 task=consts.TASK_REGRESSION,
                                 categorical_columns=["movie_id", "user_id", "gender",
                                                      "occupation", "zip", "title", "age"],
                                 metrics=['mse'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=4,
                                 apply_gbm_features=False,
                                 apply_class_weight=True,
                                 earlystopping_patience=5,
                                 # var-len spec: (column, separator, pooling strategy)
                                 var_len_categorical_columns=[('genres', "|", "max")])
    dt = deeptable.DeepTable(config=conf)
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)
    model, history = dt.fit(X_train, y_train,
                            validation_data=(X_validation, y_validation),
                            epochs=10, batch_size=32)
    assert 'genres' in model.model.input_names

def _train_and_asset(self, X, y, conf: deeptable.ModelConfig):
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(X, y, validation_split=0.2, epochs=2, batch_size=32)
    assert len(model.model.input_names) == 1

def setup_class(self):
    self.X, self.y = self.load_data()
    conf = deeptable.ModelConfig(task=consts.TASK_REGRESSION,
                                 metrics=[r2_c, 'RootMeanSquaredError'],
                                 apply_gbm_features=False)
    self.dt = deeptable.DeepTable(config=conf)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        get_tool_box(self.X).train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.model, self.history = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=100)

def setup_class(self): setup_dask(self) print("Loading datasets...") data = dd.from_pandas(dsutils.load_glass_uci(), npartitions=2) self.y = data.pop(10).values self.X = data conf = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, ) self.dt = deeptable.DeepTable(config=conf) self.X_train, self.X_test, self.y_train, self.y_test = \ [t.persist() for t in get_tool_box(data).train_test_split(self.X, self.y, test_size=0.2, random_state=42)] self.model, self.history = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=3)
def test_modelinfo(self):
    df_train = dsutils.load_adult()
    y = df_train.pop(14).values
    X = df_train
    conf = deeptable.ModelConfig(metrics=['AUC'])
    dt = deeptable.DeepTable(config=conf)
    model, history = dt.fit(X, y, epochs=2)

    mi = modelset.ModelInfo('val', 'm1', model, {}, history=history.history)
    assert mi.score['val_auc'] > 0
    assert len(mi.meta['history']['AUC']) == 2

def setup_class(self): print("Loading datasets...") df_train = dsutils.load_adult().head(1000) self.y = df_train.pop(14).values self.X = df_train conf = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, apply_class_weight=True) self.dt = deeptable.DeepTable(config=conf) self.X_train, \ self.X_test, \ self.y_train, \ self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42) self.model, self.history = self.dt.fit(self.X_train, self.y_train, epochs=1)
def test_embeddings_output_dim(self):
    print("Loading datasets...")
    df_train = dsutils.load_adult().head(1000)
    y = df_train.pop(14).values
    X = df_train

    # fixed_embedding_dim=False with embeddings_output_dim=0 exercises the
    # auto-sized (per-feature) embedding width path.
    conf = deeptable.ModelConfig(fixed_embedding_dim=False, embeddings_output_dim=0)
    dt = deeptable.DeepTable(config=conf)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model, history = dt.fit(X_train, y_train, epochs=1)

def test_duplicate_columns(self):
    x1 = np.random.randint(0, 10, size=(100), dtype='int')
    x2 = np.random.randint(0, 2, size=(100)).astype('str')
    x3 = np.random.normal(0.0, 1.0, size=(100))
    y = np.random.randint(0, 2, size=(100), dtype='int')
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})
    df.columns = ['x1', 'x1', 'x3']  # force a duplicate column name

    dt = deeptable.DeepTable(config=deeptable.ModelConfig(
        apply_gbm_features=False,
        auto_categorize=True,
        auto_discrete=True,
    ))
    with pytest.raises(ValueError) as excinfo:
        dt.fit(df, y)
    assert "Columns with duplicate names in X:" in str(excinfo.value)
    assert excinfo.type == ValueError

def setup_class(self): print("Loading datasets...") boston_dataset = load_boston() df_train = pd.DataFrame(boston_dataset.data) df_train.columns = boston_dataset.feature_names self.y = pd.Series(boston_dataset.target) self.X = df_train conf = deeptable.ModelConfig(task=consts.TASK_REGRESSION, metrics=[r2_c, 'RootMeanSquaredError'], apply_gbm_features=False) self.dt = deeptable.DeepTable(config=conf) self.X_train, \ self.X_test, \ self.y_train, \ self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42) self.model, self.history = self.dt.fit(self.X_train, self.y_train, epochs=100)
def test_importances(self):
    if have_eli5:
        df = dsutils.load_bank().head(100)
        df.drop(['id'], axis=1, inplace=True)
        X, X_test = train_test_split(df, test_size=0.2, random_state=42)
        y = X.pop('y')
        y_test = X_test.pop('y')

        config = deeptable.ModelConfig(nets=['dnn_nets'], auto_discrete=True, metrics=['AUC'])
        dt = deeptable.DeepTable(config=config)
        dt.fit(X, y, epochs=1)

        fi = get_score_importances(dt, X_test, y_test, 'AUC', 1, mode='max')
        assert fi.shape == (16, 2)

        fi2 = get_score_importances(dt, X_test, y_test, 'log_loss', 1, mode='min')
        assert fi2.shape == (16, 2)

def run(distribute_strategy=None, batch_size=32, epochs=5):
    # loading data
    df = dsutils.load_bank_by_dask()
    df_train, df_test = get_tool_box(df).train_test_split(df, test_size=0.2, random_state=42)
    y = df_train.pop('y')
    y_test = df_test.pop('y')
    df_train, y, df_test, y_test = dask.persist(df_train, y, df_test, y_test)

    # training
    config = deeptable.ModelConfig(
        nets=deepnets.DeepFM,
        earlystopping_patience=5,
        distribute_strategy=distribute_strategy,
    )
    dt = deeptable.DeepTable(config=config)
    model, history = dt.fit(df_train, y, batch_size=batch_size, epochs=epochs)

    # save
    model_path = 'model_by_dask'
    dt.save(model_path)
    print(f'saved to {model_path}')

    # evaluation (reload the saved model first)
    dt2 = deeptable.DeepTable.load(model_path)
    result = dt2.evaluate(df_test, y_test, batch_size=512, verbose=0)
    print('score:', result)

    # scoring
    preds = dt2.predict(df_test, batch_size=512)
    proba = dt2.predict_proba(df_test, batch_size=512)
    print(get_tool_box(y_test).metrics.calc_score(y_test, preds, proba,
                                                  metrics=['accuracy', 'auc']))

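# Usage sketch for the dask variant of `run` above (an assumption about how the
# script is driven): a dask scheduler/cluster is expected to be available before
# `run` is called, since the bank data is loaded and persisted via dask.
if __name__ == '__main__':
    run(batch_size=32, epochs=5)
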
def test_shap(self):
    if have_shap:
        df = dsutils.load_bank().head(100)
        df.drop(['id'], axis=1, inplace=True)
        X, X_test = train_test_split(df, test_size=0.2, random_state=42)
        y = X.pop('y')
        y_test = X_test.pop('y')

        config = deeptable.ModelConfig(nets=['dnn_nets'], auto_discrete=True, metrics=['AUC'])
        dt = deeptable.DeepTable(config=config)
        dt.fit(X, y, epochs=1)

        dt_explainer = DeepTablesExplainer(dt, X, num_samples=10)
        shap_values = dt_explainer.get_shap_values(X[:1], nsamples='auto')
        assert shap_values[0].shape == (1, 16)

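# The `have_eli5` / `have_shap` guards in test_importances and test_shap are
# module-level availability flags. A typical definition (an assumption; the
# originals are outside this excerpt):
#
# try:
#     import eli5  # noqa: F401
#     have_eli5 = True
# except ImportError:
#     have_eli5 = False
#
# try:
#     import shap  # noqa: F401
#     have_shap = True
# except ImportError:
#     have_shap = False
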
def test_no_input(self):
    df: pd.DataFrame = self.df_movielens.copy()
    X = pd.DataFrame()
    y = df['rating']
    conf = deeptable.ModelConfig(
        nets=['dnn_nets'],
        task=consts.TASK_REGRESSION,
        metrics=['mse'],
        fixed_embedding_dim=True,
        embeddings_output_dim=4,
        apply_gbm_features=False,
        apply_class_weight=True,
        earlystopping_patience=3,
    )
    dt = deeptable.DeepTable(config=conf)
    with pytest.raises(ValueError) as err_info:
        dt.fit(X, y, validation_split=0.2, epochs=2, batch_size=32)
    print(err_info)

def setup_class(self): print("Loading datasets...") data = dsutils.load_glass_uci() self.y = data.pop(10).values self.X = data conf = deeptable.ModelConfig( metrics=['AUC'], apply_gbm_features=False, ) self.dt = deeptable.DeepTable(config=conf) self.X_train, \ self.X_test, \ self.y_train, \ self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42) self.model, self.history = self.dt.fit(self.X_train, self.y_train, epochs=1)
def setup_class(self):
    self.X = self.load_data()
    self.y = self.X.pop(14)

    conf = deeptable.ModelConfig(metrics=['AUC'],
                                 apply_gbm_features=False,
                                 auto_categorize=False,
                                 auto_discrete=False)
    self.dt = deeptable.DeepTable(config=conf)
    self.X_train, self.X_eval, self.y_train, self.y_test = \
        get_tool_box(self.X).train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.oof_proba, self.eval_proba, self.test_proba = \
        self.dt.fit_cross_validation(self.X_train, self.y_train, self.X_eval,
                                     num_folds=3, epochs=1, n_jobs=1)

def run_nets(self, nets):
    df_train = dsutils.load_adult().head(100)
    y = df_train.pop(14).values
    X = df_train
    conf = deeptable.ModelConfig(nets=nets,
                                 metrics=['AUC'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=2,
                                 apply_gbm_features=False,
                                 apply_class_weight=True)
    dt = deeptable.DeepTable(config=conf)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model, history = dt.fit(X_train, y_train, epochs=1)
    result = dt.evaluate(X_test, y_test)
    assert result['AUC'] >= 0.0
    return dt, result

def test_fit_cross_validation(self):
    print("Loading datasets...")
    x1 = np.random.randint(0, 10, size=(100), dtype='int')
    x2 = np.random.normal(0.0, 1.0, size=(100))
    x3 = np.random.normal(0.0, 1.0, size=(100))
    y1 = np.random.randint(0, 2, size=(100), dtype='int')
    y2 = np.random.randint(0, 2, size=(100), dtype='int')
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})
    df_y = pd.DataFrame({'y1': y1, 'y2': y2})

    conf = deeptable.ModelConfig(metrics=['AUC'],
                                 nets=['dnn_nets'],
                                 apply_gbm_features=False,
                                 task='multilabel')
    dt = deeptable.DeepTable(config=conf)
    oof_predict, _, test_predict = dt.fit_cross_validation(df, df_y, X_test=df, num_folds=3)
    assert oof_predict.shape[-1] == df_y.shape[-1]
    assert test_predict.shape[-1] == df_y.shape[-1]

def test_callback_injection(self):
    print("Loading datasets...")
    df_train = dsutils.load_adult()
    self.y = df_train.pop(14).values
    self.X = df_train

    path = tempfile.mkdtemp()
    conf = deeptable.ModelConfig(
        metrics=['AUC'],
        apply_gbm_features=False,
        auto_discrete=False,
        home_dir=path,
    )
    self.dt = deeptable.DeepTable(config=conf)

    mcp = ModelCheckpoint(
        path,
        'val_auc',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='max',
        save_freq='epoch',
    )
    callbacks = [mcp]
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    self.model, self.history = self.dt.fit(self.X_train, self.y_train,
                                           epochs=1, callbacks=callbacks)
    files = os.listdir(path)
    assert 'saved_model.pb' in files

# -*- encoding: utf-8 -*-
import pickle as pkl

from deeptables.models import deeptable, ModelConfig

nets = ['dnn_nets', 'linear', 'cin_nets', 'fm_nets', 'afm_nets', 'opnn_nets', 'ipnn_nets',
        'pnn_nets', 'cross_nets', 'cross_dnn_nets', 'dcn_nets', 'autoint_nets', 'fg_nets',
        'fgcnn_cin_nets', 'fgcnn_fm_nets', 'fgcnn_ipnn_nets', 'fgcnn_dnn_nets',
        'fibi_nets', 'fibi_dnn_nets']

config = ModelConfig(nets=nets)
with open('/Users/wuhf/Downloads/model_config.pkl', 'rb') as f:
    pass  # config = pkl.load(f)

dt = deeptable.DeepTable(config=config)
print(dt)

with open('/Users/wuhf/Downloads/7c4373fe6880477185d4bb0674f99ba2_1.pkl', 'rb') as f:
    df = pkl.load(f)

X_train = df
y_train = df.pop('y')
dt.fit(X_train, y_train)

def run_dt(self, config, df=None, target=None, fit_kwargs={}):
    if df is None or target is None:
        df = self.df.copy()
        target = self.target
    X_train, X_test = dex.train_test_split(df, test_size=0.2, random_state=9527)
    y_train = X_train.pop(target)
    y_test = X_test.pop(target)
    test_size = dex.compute(X_test.shape)[0][0]
    dt = deeptable.DeepTable(config=config)

    if fit_kwargs is None:
        fit_kwargs = {'epochs': 1}
    else:
        fit_kwargs = {'epochs': 1, **fit_kwargs}
    dm, history = dt.fit(X_train, y_train, **fit_kwargs)
    assert dm is not None
    assert history is not None

    # test evaluate
    result = dt.evaluate(X_test, y_test)
    assert result.get(config.metrics[0]) is not None
    print('evaluate:', result)

    # test model selector
    m1 = dt.get_model(consts.MODEL_SELECTOR_CURRENT)
    m2 = dt.get_model(consts.MODEL_SELECTOR_BEST)
    m3 = dt.get_model('dnn_nets')
    assert isinstance(m1, deepmodel.DeepModel)
    assert m1 is m2
    assert m2 is m3

    # test best_model
    model = dt.best_model
    assert isinstance(model, deepmodel.DeepModel)

    if dt.task in [consts.TASK_BINARY, consts.TASK_MULTICLASS]:
        # test predict_proba
        num_classes = dt.num_classes
        proba = dt.predict_proba(X_test)
        assert proba.shape == (test_size, num_classes)

        # test proba2predict
        proba = dt.predict_proba(X_test)
        preds = dt.predict(X_test)
        preds2 = dt.proba2predict(proba)
        assert proba.shape == (test_size, num_classes)
        assert (preds == preds2).sum() == test_size
        assert preds2.shape == (test_size,)
    elif dt.task in [consts.TASK_REGRESSION, ]:
        preds = dt.predict(X_test)
        assert preds.shape == (test_size, 1)

    # test apply
    features = dt.apply(X_test,
                        output_layers=['flatten_embeddings', 'dnn_dense_1', 'dnn_dense_2'])
    assert len(features) == 3
    assert len(features[0].shape) == 2

    features = dt.apply(X_test, output_layers=['flatten_embeddings'])
    assert len(features.shape) == 2

    # test apply with transformer
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    out1 = random.sample(range(test_size), test_size // 2)
    # X_sample = X_test.iloc[out1, ]
    X_test_values = X_test.to_dask_array(lengths=True)
    samples = dex.make_chunk_size_known(X_test_values[out1])
    X_sample = dex.dd.from_array(samples, columns=X_test.columns)
    features = dt.apply(X_sample,
                        output_layers=['flatten_embeddings', 'dnn_dense_1'],
                        transformer=tsne)
    assert len(features) == 2
    assert len(features[0].shape) == 2
    assert features[0].shape[1] == 2
    assert features[1].shape[1] == 2

    # test probe_evaluate
    result = deeptable.probe_evaluate(dt, X_train, y_train, X_test, y_test,
                                      layers=['flatten_embeddings'], score_fn={})
    assert len(result) == 1
    assert result['flatten_embeddings']['accuracy'] > 0

    scores = {'MSE': mean_squared_error}
    if dt.task in [consts.TASK_BINARY, consts.TASK_MULTICLASS]:
        scores = {'AUC': roc_auc_score, 'F1': f1_score, **scores}
    result = deeptable.probe_evaluate(dt, X_train, y_train, X_test, y_test,
                                      layers=['flatten_embeddings', 'dnn_dense_1', 'dnn_dense_2'],
                                      score_fn=scores)
    assert len(result) == 3
    assert len(result['flatten_embeddings']) == len(scores)
    if dt.task in [consts.TASK_BINARY, consts.TASK_MULTICLASS]:
        assert result['flatten_embeddings']['AUC'] > 0
        assert result['dnn_dense_2']['AUC'] > 0

    return dt, dm

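# Usage sketch for `run_dt` (an assumption about a concrete config; the test class
# is expected to supply `self.df` and `self.target`):
#
# config = deeptable.ModelConfig(nets=['dnn_nets'], metrics=['AUC'],
#                                apply_gbm_features=False)
# dt, dm = self.run_dt(config)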