def save(self, model_path):
    """Persist the DeepTable model, then pickle a model-free stub next to it."""
    if not model_path.endswith(fs.sep):
        model_path = model_path + fs.sep
    self.model.save(model_path)

    # Pickle a shallow copy with the Keras-backed model stripped out;
    # DeepTable handles its own (non-picklable) persistence above.
    stub = copy.copy(self)
    stub.model = None
    stub_path = model_path + 'dt_estimator.pkl'
    with fs.open(stub_path, 'wb') as f:
        pickle.dump(stub, f, protocol=pickle.HIGHEST_PROTOCOL)
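# The stub pattern above, in isolation: a minimal, self-contained sketch (the
# `Wrapper` class and its attributes are hypothetical, not deeptables API).
# The heavyweight member is saved by its own mechanism, while a shallow copy
# with that member nulled out is pickled, to be re-joined at load time.
import copy
import pickle


class Wrapper:
    def __init__(self, model, params):
        self.model = model    # heavyweight member, persisted separately
        self.params = params  # lightweight state, safe to pickle


w = Wrapper(model=object(), params={'lr': 0.01})
stub = copy.copy(w)
stub.model = None  # strip the member that pickle should not carry
blob = pickle.dumps(stub, protocol=pickle.HIGHEST_PROTOCOL)
restored = pickle.loads(blob)
assert restored.model is None and restored.params == {'lr': 0.01}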
def save(self, filepath):
    """Serialize the Keras model to HDF5 in memory, then write the bytes
    through the deeptables filesystem abstraction (local or remote)."""
    import h5py
    from deeptables.utils import fs

    with fs.open(filepath, 'wb') as f:
        buf = io.BytesIO()
        with h5py.File(buf, 'w') as h:
            save_model(self.model, h, save_format='h5')
        data = buf.getvalue()
        buf.close()
        f.write(data)
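# Why the BytesIO indirection: h5py can target an in-memory buffer, and the
# resulting bytes can then be pushed through any file-like object that `fs`
# returns (local file, S3, HDFS, ...). A self-contained sketch of the same
# round trip with plain tf.keras (TF 2.x / Keras 2 semantics, matching the
# `save_format='h5'` call above; the model architecture is illustrative):
import io

import h5py
from tensorflow.keras import layers, models

demo_model = models.Sequential([layers.Dense(1, input_shape=(4,))])

mem = io.BytesIO()
with h5py.File(mem, 'w') as h:
    models.save_model(demo_model, h, save_format='h5')

mem.seek(0)
with h5py.File(mem, 'r') as h:
    round_tripped = models.load_model(h)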
def _load_model(filepath, custom_objects):
    """Read the HDF5 bytes through `fs`, then deserialize from memory."""
    import h5py
    from deeptables.utils import fs

    with fs.open(filepath, 'rb') as f:
        data = f.read()
    buf = io.BytesIO(data)
    del data  # release the intermediate copy early
    with h5py.File(buf, 'r') as h:
        return load_model(h, custom_objects)
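# `custom_objects` matters because Keras only records the *names* of custom
# layers, losses, and metrics in the HDF5 file; the callables themselves must
# be supplied again at deserialization time. A hypothetical example of the
# mapping this loader expects (`focal_loss` is illustrative):
#
#   def focal_loss(y_true, y_pred): ...   # used when the model was built
#   model = _load_model('exp/model.h5', custom_objects={'focal_loss': focal_loss})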
def load(model_path):
    """Unpickle the estimator stub and re-attach the DeepTable model."""
    if not fs.exists(model_path):
        raise ValueError(f'Storage path not found: {model_path}')
    if not model_path.endswith(fs.sep):
        model_path = model_path + fs.sep

    stub_path = model_path + 'dt_estimator.pkl'
    if not fs.exists(stub_path):
        raise ValueError(f'Estimator storage path not found: {stub_path}')
    with fs.open(stub_path, 'rb') as f:
        stub = pickle.load(f)

    model = DeepTable.load(model_path)
    stub.model = model
    return stub
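# End-to-end sketch tying `save` and `load` together (`DTEstimator` is a
# hypothetical name for the class these methods belong to; the paths are
# illustrative). Because both directions resolve paths through `fs`, local
# directories and remote fsspec-style URLs behave identically:
#
#   est.save('exp/run1/')                    # DeepTable files + dt_estimator.pkl
#   est2 = DTEstimator.load('exp/run1/')     # stub unpickled, model re-attached
#   assert est2.model is not None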
def fit_cross_validation(estimator_type, fit_fn, X, y, X_test=None, score_fn=roc_auc_score,
                         estimator_params=None, categorical_feature=None,
                         task_type=consts.TASK_BINARY, num_folds=5, stratified=True,
                         iterators=None, batch_size=None, preds_filepath=None):
    print("Start cross validation")
    print(f'X.shape={np.shape(X)}, y.shape={np.shape(y)}, batch_size={batch_size}')

    if estimator_params is None:  # avoid a shared mutable default argument
        estimator_params = {}

    # Cross-validation iterator
    if iterators is None:
        if stratified:
            iterators = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
        else:
            iterators = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    print(f'Iterators:{iterators}')

    if len(y.shape) > 1:
        oof_proba = np.zeros(y.shape)
    else:
        oof_proba = np.zeros((y.shape[0], 1))

    y = np.array(y)

    # Resolve the output directory for per-fold test predictions.
    if preds_filepath is None and os.environ.get(consts.ENV_DEEPTABLES_HOME) is not None:
        preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
    if preds_filepath is None:
        preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'
    if not preds_filepath.endswith(fs.sep):
        preds_filepath = preds_filepath + fs.sep
    if not fs.exists(preds_filepath):
        fs.makedirs(preds_filepath, exist_ok=True)

    for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
        print(f'\nFold:{n_fold + 1}\n')
        x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
        x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

        model = fit_fn(
            x_train_fold,
            y_train_fold,
            x_val_fold,
            y_val_fold,
            cat_vars=categorical_feature,
            task=task_type,
            estimator_params=estimator_params,
        )
        print(f'Fold {n_fold + 1} finished.')

        # Out-of-fold probabilities of the positive class
        proba = model.predict_proba(x_val_fold)[:, 1:2]
        oof_proba[valid_idx] = proba
        score = round(score_fn(y_val_fold, proba), 5)
        print(f'Fold {n_fold + 1} Score:{score}')

        # Only predict on the test set when one was supplied.
        if X_test is not None:
            test_fold_proba = model.predict_proba(X_test)
            file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_fold_proba).to_csv(f, index=False)

    if oof_proba.shape[-1] == 1:
        oof_proba = oof_proba.reshape(-1)
    print(f'OOF score:{score_fn(y, oof_proba)}')
    return oof_proba
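# Sketch of a `fit_fn` that satisfies the contract used above: it must accept
# the four fold arrays plus `cat_vars`, `task` and `estimator_params`, and
# return a fitted object exposing `predict_proba`. The LightGBM choice and the
# handling of the extra keywords are illustrative, not deeptables API.
from lightgbm import LGBMClassifier


def lgbm_fit_fn(x_train, y_train, x_val, y_val,
                cat_vars=None, task='binary', estimator_params=None):
    est = LGBMClassifier(**(estimator_params or {}))
    # cat_vars/task are accepted for signature compatibility; wiring them into
    # LightGBM's categorical handling is left out of this sketch.
    est.fit(x_train, y_train, eval_set=[(x_val, y_val)])
    return est

# oof = fit_cross_validation('lgbm', lgbm_fit_fn, X, y, X_test=X_test)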
def train_dt(self, model_set, config, nets=['dnn_nets']):
    print(f'Start training DT model.{nets}')

    conf = config
    fixed_embedding_dim = conf.fixed_embedding_dim
    if 'fm_nets' in nets:
        # FM nets require embeddings of equal width.
        fixed_embedding_dim = True
    print(f'train metrics:{config.metrics}')
    print(f'eval metrics:{self.eval_metrics}')
    # Apply the requested nets and embedding policy to this run's config;
    # without this, the `nets` argument would never reach the model.
    conf = conf._replace(nets=nets,
                         metrics=[self.eval_metrics[0]],
                         fixed_embedding_dim=fixed_embedding_dim)
    dt = deeptable.DeepTable(config=conf)

    print('Fitting model...')
    if self.cross_validation:
        oof_proba, eval_proba, test_proba = dt.fit_cross_validation(self.X_train, self.y_train,
                                                                    self.X_eval, self.X_test,
                                                                    verbose=self.verbose,
                                                                    batch_size=self.dt_batch_size,
                                                                    epochs=self.dt_epochs,
                                                                    num_folds=self.num_folds,
                                                                    stratified=self.stratified,
                                                                    random_state=self.seed,
                                                                    n_jobs=self.n_jobs)
        print('Scoring...')
        oof_preds = dt.proba2predict(oof_proba)
        oof_score = calc_score(self.y_train, oof_preds, oof_proba, self.eval_metrics, self.task, dt.pos_label)
        model_set.push(
            modelset.ModelInfo('oof', f'{config.name} - {nets} - CV - oof', dt, oof_score,
                               model_selector=consts.MODEL_SELECTOR_ALL))
        print(f'\n------------OOF------------ score:\n{oof_score}')

        if eval_proba is not None:
            eval_preds = dt.proba2predict(eval_proba)
            eval_cv_score = calc_score(self.y_eval, eval_preds, eval_proba, self.eval_metrics, self.task, dt.pos_label)
            model_set.push(
                modelset.ModelInfo('cv-eval', f'{config.name} - {nets} - CV - eval', dt, eval_cv_score,
                                   model_selector=consts.MODEL_SELECTOR_ALL))
            print(f'\n------------CV------------ Eval score:\n{eval_cv_score}')

        if self.retain_single_model:
            # Score each fold's model on the hold-out set individually.
            all_model_proba = dt.predict_proba_all(self.X_eval)
            for fold_name, fold_proba in all_model_proba.items():
                fold_preds = dt.proba2predict(fold_proba)
                fold_score = calc_score(self.y_eval, fold_preds, fold_proba, self.eval_metrics, self.task, dt.pos_label)
                print(f'\n------------{fold_name} -------------Eval score:\n{fold_score}')
                model_set.push(
                    modelset.ModelInfo('eval', f'{config.name} - {nets} - {fold_name} - eval', dt, fold_score,
                                       model_selector=fold_name))
    else:
        print(f'X_train.shape:{self.X_train.shape}, y_train.shape:{self.y_train.shape}')
        model, history = dt.fit(self.X_train, self.y_train,
                                epochs=self.dt_epochs,
                                validation_split=self.validation_size,
                                verbose=self.verbose)
        print('Scoring...')
        if self.X_eval is not None:
            proba = dt.predict_proba(self.X_eval, model_selector=consts.MODEL_SELECTOR_BEST)
            preds = dt.proba2predict(proba)
            score = calc_score(self.y_eval, preds, proba, self.eval_metrics, self.task, dt.pos_label)
            print(f'\n------------{nets} -------------Eval score:\n{score}')
            model_set.push(
                modelset.ModelInfo('eval', f'{config.name} - {nets} - eval', dt, score,
                                   model_selector=consts.MODEL_SELECTOR_BEST))
        else:
            print(f'\n------------{nets} -------------Val score:\n{history.history}')
            model_set.push(
                modelset.ModelInfo('val', f'{config.name} - {nets} - val', dt, {},
                                   model_selector=consts.MODEL_SELECTOR_BEST, history=history.history))

        if self.X_test is not None:
            # Save test-set probabilities, tagging the file with the last
            # training value of the first metric.
            test_proba = dt.predict_proba(self.X_test)
            score = str(round(history.history[self.first_metric_name][-1], 5))
            file = f'{dt.output_path}{score}_{"_".join(nets)}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_proba).to_csv(f, index=False)

    print('DT finished.')
    return dt
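# Usage sketch for `train_dt` (the trainer construction is hypothetical;
# `ModelConfig`, `DeepTable` and `modelset` are real deeptables names):
#
#   from deeptables.models.deeptable import ModelConfig
#
#   conf = ModelConfig(nets=['dnn_nets'], metrics=['AUC'])
#   trainer.train_dt(model_set, conf, nets=['linear', 'dnn_nets'])
#
# Each trained variant is pushed into `model_set` as a ModelInfo, so the best
# model can be selected afterwards by comparing the recorded scores.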