def test_save_load(self):
    """Round-trip the fitted model through ``save``/``load`` and predict.

    Saves ``self.dt`` into a timestamped directory, checks that the pickled
    model stub and one weight file per fold (folds 1-3) were written, then
    reloads the model and verifies the shape of its predictions on
    ``self.X_eval``.
    """
    import time

    from deeptables.utils import fs

    filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
    self.dt.save(filepath)

    assert fs.exists(f'{filepath}/dt.pkl')
    for fold in (1, 2, 3):
        assert fs.exists(f'{filepath}/dnn_nets-kfold-{fold}.h5')

    newdt = deeptable.DeepTable.load(filepath)
    preds = newdt.predict(self.X_eval)
    # Compare explicitly: `assert preds.shape, (200, )` treated the tuple as
    # the assertion *message* and therefore could never fail on a wrong shape.
    assert preds.shape == (200,)
def test_default_settings(self):
    """Train with a basic config, then verify a save/load/predict round trip.

    Builds a ``ModelConfig`` with the AUC metric, GBM features disabled and
    class weighting enabled, fits it through ``self.run_dt``, saves the model
    into a timestamped directory, reloads it and checks the shape of the
    predictions made on ``self.df`` with the target column removed.
    """
    config = deeptable.ModelConfig(metrics=['AUC'],
                                   apply_gbm_features=False,
                                   apply_class_weight=True)
    dt, _ = self.run_dt(config)

    # test save and load
    filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
    dt.save(filepath)
    assert fs.exists(f'{filepath}/dt.pkl')
    assert fs.exists(f'{filepath}/dnn_nets.h5')

    newdt = deeptable.DeepTable.load(filepath)
    X_eval = self.df.copy()
    X_eval.pop(self.target)
    preds = newdt.predict(X_eval)
    # Compare explicitly: `assert preds.shape, (...)` treated the tuple as the
    # assertion *message* and therefore could never fail on a wrong shape.
    assert preds.shape == (self.df_row_count,)
def load(model_path):
    """Restore a pickled estimator stub and re-attach its DeepTable model.

    Args:
        model_path: Directory holding ``dt_estimator.pkl`` together with the
            files consumed by ``DeepTable.load``. A trailing ``fs.sep`` is
            appended when missing.

    Returns:
        The unpickled estimator with its ``model`` attribute set to the
        ``DeepTable`` loaded from the same directory.

    Raises:
        ValueError: If ``model_path`` or the estimator pickle does not exist.
    """
    if not fs.exists(model_path):
        raise ValueError(f'Not found storage path: {model_path}')

    base_dir = model_path if model_path.endswith(fs.sep) else model_path + fs.sep
    stub_path = base_dir + 'dt_estimator.pkl'
    if not fs.exists(stub_path):
        raise ValueError(
            f'Not found storage path of estimator: {stub_path}')

    # NOTE: unpickling can execute arbitrary code; only load trusted paths.
    with fs.open(stub_path, 'rb') as stub_file:
        estimator = pickle.load(stub_file)

    estimator.model = DeepTable.load(base_dir)
    return estimator
def fit_cross_validation(estimator_type,
                         fit_fn,
                         X,
                         y,
                         X_test=None,
                         score_fn=roc_auc_score,
                         estimator_params=None,
                         categorical_feature=None,
                         task_type=consts.TASK_BINARY,
                         num_folds=5,
                         stratified=True,
                         iterators=None,
                         batch_size=None,
                         preds_filepath=None,
                         ):
    """Fit one model per fold and collect out-of-fold probabilities.

    For every fold produced by ``iterators`` a model is trained through
    ``fit_fn``, scored on the validation split and, when ``X_test`` is given,
    used to predict the test set; those test predictions are written to
    ``<preds_filepath><score>_fold<k>.csv``.

    Args:
        estimator_type: Label used only to build the default output directory
            name.
        fit_fn: Callable invoked as ``fit_fn(x_train, y_train, x_val, y_val,
            cat_vars=..., task=..., estimator_params=...)``. It must return an
            object exposing ``predict_proba``.
        X: Training features; rows are selected with ``.iloc``.
        y: Training targets; must expose ``.shape`` and is converted with
            ``np.array`` before being indexed.
        X_test: Optional test features. When ``None`` no test predictions are
            made and no files or directories are written.
        score_fn: ``score_fn(y_true, proba)`` metric, used per fold and for
            the final out-of-fold score.
        estimator_params: Dict forwarded to ``fit_fn``. ``None`` (the default)
            means a fresh empty dict for this call.
        categorical_feature: Forwarded to ``fit_fn`` as ``cat_vars``.
        task_type: Forwarded to ``fit_fn`` as ``task``.
        num_folds: Number of splits for the default splitter.
        stratified: Use ``StratifiedKFold`` instead of ``KFold`` when
            ``iterators`` is not supplied.
        iterators: Optional pre-built splitter exposing ``split(X, y)``.
        batch_size: Only reported in the start-up message.
        preds_filepath: Output directory for test predictions. Falls back to
            the ``consts.ENV_DEEPTABLES_HOME`` environment variable, then to a
            timestamped ``./preds_<estimator_type>_...`` directory.

    Returns:
        The out-of-fold probabilities, flattened to 1-D when they have a
        single column.
    """
    print("Start cross validation")
    print(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}')

    # A literal `{}` default would be one dict shared by every call.
    if estimator_params is None:
        estimator_params = {}

    # Cross validation model
    if iterators is None:
        splitter_cls = StratifiedKFold if stratified else KFold
        iterators = splitter_cls(n_splits=num_folds, shuffle=True, random_state=1001)
    print(f'Iterators:{iterators}')

    if len(y.shape) > 1:
        oof_proba = np.zeros(y.shape)
    else:
        oof_proba = np.zeros((y.shape[0], 1))
    y = np.array(y)

    # The output directory is only needed when there is a test set to export.
    if X_test is not None:
        if preds_filepath is None:
            preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
        if preds_filepath is None:
            preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().__format__("%Y_%m_%d %H:%M:%S")}/'
        # File names are appended by plain concatenation below, so the
        # directory must end with a separator or the name gets glued onto
        # the last path component.
        if preds_filepath and not preds_filepath.endswith(fs.sep):
            preds_filepath = preds_filepath + fs.sep
        if not fs.exists(preds_filepath):
            fs.makedirs(preds_filepath, exist_ok=True)

    for fold_no, (train_idx, valid_idx) in enumerate(iterators.split(X, y), start=1):
        print(f'\nFold:{fold_no}\n')

        x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
        x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

        model = fit_fn(
            x_train_fold,
            y_train_fold,
            x_val_fold,
            y_val_fold,
            cat_vars=categorical_feature,
            task=task_type,
            estimator_params=estimator_params,
        )
        print(f'Fold {fold_no} finished.')

        # Keep only the second probability column (presumably the positive
        # class for binary tasks -- confirm against fit_fn's estimator).
        proba = model.predict_proba(x_val_fold)[:, 1:2]
        oof_proba[valid_idx] = proba
        score = round(score_fn(y_val_fold, proba), 5)

        if X_test is not None:
            test_fold_proba = model.predict_proba(X_test)
            file = f'{preds_filepath}{score}_fold{fold_no}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_fold_proba).to_csv(f, index=False)

        print(f'Fold {fold_no} Score:{score}')

    if oof_proba.shape[-1] == 1:
        oof_proba = oof_proba.reshape(-1)
    print(f'OOF score:{score_fn(y, oof_proba)}')
    return oof_proba