def _load_cache(toolbox, cache_path):
    """Load a cached value and its metadata from ``cache_path``.

    The metadata is read from ``{cache_path}.meta``; its ``kind`` entry selects
    how the payload named by ``items`` is restored:

    * ``_KIND_NONE``    -> ``None``
    * ``_KIND_LIST``    -> a list, each element loaded recursively from
      ``{cache_path}{item}``
    * ``_KIND_DEFAULT`` -> unpickled from ``{cache_path}{items[0]}``
    * ``_KIND_PARQUET`` -> loaded through ``toolbox.parquet()``

    :param toolbox: object providing ``parquet()`` for parquet payloads
    :param cache_path: path prefix shared by the meta file and payload files
    :return: tuple ``(data, meta)``
    :raises EnvironmentError: the cache was written by another ``__version__``
    :raises ValueError: the metadata names an unknown data kind
    """
    with fs.open(f'{cache_path}.meta', 'rb') as f:
        # NOTE: pickle.load executes code from the file; the cache location must be trusted.
        meta = pickle.load(f)

    if meta['version'] != __version__:
        raise EnvironmentError(
            f'Incompatible version: {meta["version"]}, please clear cache and try again.'
        )

    data_kind = meta['kind']
    items = meta['items']

    if data_kind == _KIND_NONE:
        data = None
    elif data_kind == _KIND_LIST:
        # Fix: the recursive call must forward `toolbox`; without it the call
        # raised TypeError (missing positional argument) for every cached list.
        data = [_load_cache(toolbox, f'{cache_path}{i}')[0] for i in items]
    elif data_kind == _KIND_DEFAULT:  # pickle
        with fs.open(f'{cache_path}{items[0]}', 'rb') as f:
            data = pickle.load(f)
    elif data_kind == _KIND_PARQUET:
        pq = toolbox.parquet()
        data = pq.load(f'{cache_path}{items[0]}', filesystem=fs)
    else:
        raise ValueError(f'Unexpected cache data kind "{data_kind}"')

    return data, meta
def save(self, model_path):
    """Persist this ensemble into the directory ``model_path``.

    Each non-None estimator is pickled to ``{i}.pkl``; estimators exposing both
    ``save`` and ``load`` additionally write ``{i}.model`` through their own
    ``save``. A shallow copy of the ensemble whose estimator slots are all
    ``None`` is pickled to ``ensemble.pkl``.

    :param model_path: target directory; a trailing ``fs.sep`` is appended if missing
    """
    base = model_path if model_path.endswith(fs.sep) else model_path + fs.sep
    if not fs.exists(base):
        fs.mkdirs(base, exist_ok=True)

    stub = copy.copy(self)
    estimators = self.estimators
    if estimators is not None:
        stub.estimators = [None for _ in estimators]  # keep size

        for index, estimator in enumerate(estimators):
            pickle_path = f'{base}{index}.pkl'
            native_path = f'{base}{index}.model'

            # Remove leftovers from a previous save before writing this slot.
            for stale in [pickle_path, native_path]:
                if fs.exists(stale):
                    fs.rm(stale)

            if estimator is None:
                continue

            with fs.open(pickle_path, 'wb') as out:
                pickle.dump(estimator, out, protocol=pickle.HIGHEST_PROTOCOL)
            if hasattr(estimator, 'save') and hasattr(estimator, 'load'):
                estimator.save(native_path)

    with fs.open(f'{base}ensemble.pkl', 'wb') as out:
        pickle.dump(stub, out, protocol=pickle.HIGHEST_PROTOCOL)
def _store_cache(toolbox, cache_path, data, meta):
    """Write ``data`` and its metadata under the path prefix ``cache_path``.

    ``meta`` is copied (or created when ``None``), stamped with ``__version__``
    and extended with ``kind``/``items`` describing how the payload was stored:

    * ``None``            -> ``_KIND_NONE``, no payload file
    * ``list``/``tuple``  -> ``_KIND_LIST``; element ``i`` is stored recursively
      under ``{cache_path}_{i}``
    * parquet-acceptable  -> ``_KIND_PARQUET`` at ``{cache_path}.parquet``
    * anything else       -> ``_KIND_DEFAULT``, pickled to ``{cache_path}.pkl``

    The metadata is finally pickled to ``{cache_path}.meta``.

    :param toolbox: object providing ``parquet()`` (with ``acceptable_types`` and ``store``)
    :param cache_path: path prefix for the payload and meta files
    :param data: value to cache
    :param meta: optional dict of extra metadata; never modified in place
    """
    meta = meta.copy() if meta is not None else {}
    meta['version'] = __version__

    if data is None:
        meta.update({'kind': _KIND_NONE, 'items': []})
    elif isinstance(data, (list, tuple)):
        items = [f'_{i}' for i in range(len(data))]
        for d, i in zip(data, items):
            # Fix: forward `toolbox`; the original recursive call dropped it and
            # raised TypeError for every list/tuple payload.
            _store_cache(toolbox, f'{cache_path}{i}', d, meta)
        meta.update({'kind': _KIND_LIST, 'items': items})
    else:
        pq = toolbox.parquet()
        if isinstance(data, pq.acceptable_types):
            item = '.parquet'
            pq.store(data, f'{cache_path}{item}', filesystem=fs)
            meta.update({'kind': _KIND_PARQUET, 'items': [item]})
        else:
            item = '.pkl'
            with fs.open(f'{cache_path}{item}', 'wb') as f:
                pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
            meta.update({'kind': _KIND_DEFAULT, 'items': [item]})

    with fs.open(f'{cache_path}.meta', 'wb') as f:
        pickle.dump(meta, f, protocol=pickle.HIGHEST_PROTOCOL)
def load(filepath):
    """Restore an object previously pickled to ``{filepath}/dt.pkl``.

    After unpickling, ``restore_modelset`` is called on the object with the
    normalised directory path (always ending in ``'/'``).

    :param filepath: directory containing ``dt.pkl``
    :return: the unpickled object
    """
    base_dir = filepath if filepath[-1] == '/' else filepath + '/'
    with fs.open(f'{base_dir}dt.pkl', 'rb') as pkl_file:
        restored = pickle.load(pkl_file)
        restored.restore_modelset(base_dir)
        return restored
def load(model_path):
    """Restore an ensemble from the directory ``model_path``.

    ``ensemble.pkl`` provides the ensemble object; for every estimator slot a
    matching ``{i}.pkl`` (if present) is unpickled, and when ``{i}.model`` also
    exists and the estimator has a ``load`` attribute, the result of
    ``estimator.load('{i}.model')`` replaces it. Slots without a ``{i}.pkl``
    file are left as stored in ``ensemble.pkl``.

    :param model_path: directory path; a trailing ``fs.sep`` is appended if missing
    :return: the restored ensemble object
    """
    base = model_path if model_path.endswith(fs.sep) else model_path + fs.sep

    with fs.open(f'{base}ensemble.pkl', 'rb') as src:
        ensemble = pickle.load(src)

    if ensemble.estimators is None:
        return ensemble

    for index in range(len(ensemble.estimators)):
        pickle_path = f'{base}{index}.pkl'
        if not fs.exists(pickle_path):
            continue

        with fs.open(pickle_path, 'rb') as src:
            estimator = pickle.load(src)

        native_path = f'{base}{index}.model'
        if fs.exists(native_path) and hasattr(estimator, 'load'):
            estimator = estimator.load(native_path)
        ensemble.estimators[index] = estimator

    return ensemble
def save(self, filepath):
    """Serialise ``self.model`` in HDF5 format and write it to ``filepath``.

    The model is written by ``save_model`` into an in-memory ``h5py.File``;
    the resulting bytes are then written to the file opened through ``fs``.

    :param filepath: destination path understood by ``fs``
    """
    import h5py
    from hypernets.utils import fs

    with fs.open(filepath, 'wb') as target:
        with io.BytesIO() as buffer:
            with h5py.File(buffer, 'w') as h5_file:
                save_model(self.model, h5_file, save_format='h5')
            payload = buffer.getvalue()
        target.write(payload)
def _load_model(filepath, custom_objects):
    """Read an HDF5 model file through ``fs`` and rebuild it with ``load_model``.

    The whole file is read into memory and opened as an in-memory ``h5py.File``.

    :param filepath: source path understood by ``fs``
    :param custom_objects: forwarded as the second argument of ``load_model``
    :return: whatever ``load_model`` returns
    """
    import h5py
    from hypernets.utils import fs

    with fs.open(filepath, 'rb') as source:
        buffer = io.BytesIO(source.read())

    with h5py.File(buffer, 'r') as h5_file:
        return load_model(h5_file, custom_objects)
def load_transformers_from_cache(self):
    """Restore this object's attributes from ``{cache_dir}/transformers.pkl``.

    On success the unpickled object's ``__dict__`` is merged into ``self``.
    Any exception is logged and the cache file is removed.

    :return: ``True`` when the state was restored, ``False`` otherwise
    """
    transformer_path = f'{self.cache_dir}/transformers.pkl'
    if not fs.exists(transformer_path):
        return False

    try:
        with fs.open(transformer_path, 'rb') as cached:
            restored = pickle.load(cached)
            self.__dict__.update(restored.__dict__)
            return True
    except Exception as e:
        # A cache entry that cannot be read is discarded.
        logger.error(e)
        fs.rm(transformer_path)
    return False
def save_transformed_X_y_to_cache(self, sign, X, y):
    """Store ``X`` together with ``y`` as a gzip pickle in the cache directory.

    ``y`` is stored as an extra first column named ``saved__y__`` in the file
    ``{cache_dir}/X_y_{sign}.pkl.gz``. Any exception is logged and a partially
    written file is removed.

    :param sign: identifier used in the cache file name
    :param X: DataFrame of features; left unmodified
    :param y: target values, inserted as column ``saved__y__`` of the stored frame
    :return: ``True`` on success, ``False`` on failure
    """
    filepath = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
    try:
        # Fix: insert the target into a shallow copy instead of the caller's
        # frame. The original `X.insert(...)` left a 'saved__y__' column in the
        # caller's X and made a second save of the same frame fail. A shallow
        # copy shares the column data, so no feature data is duplicated.
        cached = X.copy(deep=False)
        cached.insert(0, 'saved__y__', y)
        with fs.open(filepath, mode='wb') as f:
            cached.to_pickle(f, compression='gzip')
        return True
    except Exception as e:
        logger.error(e)
        if fs.exists(filepath):
            fs.rm(filepath)
    return False
def get_transformed_X_y_from_cache(self, sign):
    """Read ``(X, y)`` back from ``{cache_dir}/X_y_{sign}.pkl.gz``.

    The stored frame's ``saved__y__`` column is popped and returned as ``y``;
    the remaining frame is ``X``. Any exception is logged and the cache file
    is removed.

    :param sign: identifier used in the cache file name
    :return: ``(X, y)``; ``(None, None)`` when the file is missing or unreadable
    """
    file_x_y = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
    if not fs.exists(file_x_y):
        return None, None

    X_t = y_t = None
    try:
        with fs.open(file_x_y, mode='rb') as cached:
            frame = pd.read_pickle(cached, compression='gzip')
            y_t = frame.pop('saved__y__')
            X_t = frame
    except Exception as e:
        logger.error(e)
        fs.rm(file_x_y)
    return X_t, y_t
def add(self, trial_no, space_sample):
    """Register a new trial: persist its space sample and queue a ``TrialItem``.

    The sample is pickled to ``{spaces_dir}/space-{trial_no}.pkl``; the item is
    put on ``queued_pool`` and indexed in ``all_items`` by the sample's
    ``space_id``.

    :param trial_no: trial number, used in the space and model file names
    :param space_sample: object with a ``space_id`` attribute; must not be registered yet
    """
    space_id = space_sample.space_id
    assert space_id not in self.all_items.keys()

    space_file = f'{self.spaces_dir}/space-{trial_no}.pkl'
    with fs.open(space_file, 'wb') as out:
        pickle.dump(space_sample, out)

    model_file = '%s/%05d_%s.pkl' % (self.models_dir, trial_no, space_id)
    item = TrialItem(trial_no, space_file, space_sample, model_file)

    detail = f'trial_no={item.trial_no}, space_id={item.space_id}, space_file={space_file}'
    if logger.is_info_enabled():
        logger.info(f'[{self.search_id}] [search] {detail}')

    self.queued_pool.put(item)
    self.all_items[space_id] = item
def save(self, filepath, deepmodel_basename=None):
    """Save every model of the model set plus this object into ``filepath``.

    Each model is written to ``{filepath}{name}.h5`` and its ``ModelInfo.model``
    is then replaced by that file path, so the object pickled to ``dt.pkl``
    references files rather than in-memory models. Models held as path strings
    are loaded with ``load_deepmodel`` first.

    :param filepath: target directory; ``'/'`` is appended if missing
    :param deepmodel_basename: when given and exactly one model exists, that
        model is renamed to this value and it becomes the current model
    :raises ValueError: a model is not a ``DeepModel``
    """
    if filepath[-1] != '/':
        filepath += '/'
    if not fs.exists(filepath):
        fs.makedirs(filepath, exist_ok=True)

    single_model = len(self.__modelset.get_modelinfos()) == 1
    for info in self.__modelset.get_modelinfos():
        if isinstance(info.model, str):
            info.model = self.load_deepmodel(info.model)
        if not isinstance(info.model, DeepModel):
            raise ValueError(
                'Currently does not support saving non-DeepModel models.')

        if single_model and deepmodel_basename is not None:
            info.name = deepmodel_basename
            self.__current_model = deepmodel_basename

        h5_path = f'{filepath}{info.name}.h5'
        info.model.save(h5_path)
        # Keep only the path so pickling `self` below does not include the model.
        info.model = h5_path

    with fs.open(f'{filepath}dt.pkl', 'wb') as out:
        pickle.dump(self, out, protocol=4)
def save(self, model_file):
    """Pickle this object (protocol 4) to ``model_file`` through ``fs``.

    :param model_file: destination path understood by ``fs``
    """
    with fs.open(model_file, 'wb') as out:
        pickle.dump(self, out, protocol=4)
def dispatch(self, hyper_model, X, y, X_val, y_val, max_trails, dataset_id, trail_store, **fit_kwargs):
    """Pull trial items from the search driver and run each with ``hyper_model``.

    A ``SearchDriverClient`` is connected to ``self.driver_address`` and the
    generator returned by ``client.search(search_id)`` is driven with
    ``next``/``send``: every received item is answered with a copy carrying
    ``success``, ``reward`` and ``message``, and the value returned by ``send``
    is the next item to process.

    :param hyper_model: provides ``callbacks``, ``history`` and ``_run_trial``
    :param X, y, X_val, y_val: data forwarded to ``hyper_model._run_trial``
    :param max_trails: not used in this method body
    :param dataset_id: key passed to ``trail_store.put``
    :param trail_store: optional store receiving every finished trail
    :param fit_kwargs: forwarded to ``_run_trial``; an optional ``search_id``
        entry is popped and used as the search identifier
    :return: the number of the last trail handled (0 if none)
    """
    if 'search_id' in fit_kwargs:
        search_id = fit_kwargs.pop('search_id')
    else:
        # No id supplied: derive one from the module-level counter.
        global _search_counter
        _search_counter += 1
        search_id = 'search-%02d' % _search_counter

    if logger.is_info_enabled():
        logger.info(f'[{search_id}] started')
    print(f'[{search_id}] connect to driver {self.driver_address}', end='')
    client = SearchDriverClient(self.driver_address, search_id)
    client.ping(wait=True)

    def response(item_x, success, reward=0.0, message=''):
        # Build the reply as a shallow copy so the received item is not modified.
        res = copy.copy(item_x)
        res.success = success
        res.reward = reward
        res.message = message
        return res

    trail_no = 0
    sch = client.search(search_id)
    try:
        item = next(sch)
        while item:
            if item.is_waiting():
                # Nothing to run yet: pause for a second and ask again.
                if logger.is_info_enabled():
                    logger.info(
                        f'[{search_id}] not found search, wait and continue'
                    )
                time.sleep(1)
                item = sch.send(response(item, True))
                continue

            if item.is_finished():
                if logger.is_info_enabled():
                    logger.info(f'[{search_id}] search finished, exit.')
                # sch.send(None)
                break

            if not item.is_ok():
                if logger.is_info_enabled():
                    logger.info(
                        f'[{search_id}] dispatched with {item.code}, exit.'
                    )
                # sch.send(None)
                break

            # Prefer the trail number assigned by the driver; otherwise count locally.
            trail_no = item.trail_no if item.trail_no is not None else trail_no + 1
            detail = f'trail_no={trail_no}, space_id={item.space_id}, space_file={item.space_file}'
            if logger.is_info_enabled():
                logger.info(f'[{search_id}] new trail:' + detail)
            try:
                with fs.open(item.space_file, 'rb') as f:
                    space_sample = pickle.load(f)

                for callback in hyper_model.callbacks:
                    # callback.on_build_estimator(hyper_model, space_sample, estimator, trail_no)
                    callback.on_trail_begin(hyper_model, space_sample, trail_no)

                model_file = item.model_file
                trail = hyper_model._run_trial(space_sample, trail_no, X, y, X_val, y_val, model_file, **fit_kwargs)

                # A reward of 0 is treated as a failed trail.
                if trail.reward != 0:
                    improved = hyper_model.history.append(trail)
                    for callback in hyper_model.callbacks:
                        callback.on_trail_end(hyper_model, space_sample, trail_no, trail.reward, improved, trail.elapsed)
                else:
                    for callback in hyper_model.callbacks:
                        callback.on_trail_error(hyper_model, space_sample, trail_no)

                if trail_store is not None:
                    trail_store.put(dataset_id, trail)

                item = sch.send(
                    response(item, trail.reward != 0.0, trail.reward))
            except StopIteration:
                # The generator is exhausted: no more items.
                break
            except KeyboardInterrupt:
                if logger.is_info_enabled():
                    logger.info('KeyboardInterrupt')
                break
            except Exception as e:
                # Report the failure to the driver and continue with the next item.
                # A StopIteration raised by this send is not caught here; it
                # propagates to the outer handler below.
                import traceback
                msg = f'[{search_id}] {e.__class__.__name__}: {e}'
                logger.error(msg + '\n' + traceback.format_exc())
                item = sch.send(response(item, False, 0.0, msg))
    except StopIteration as e:
        pass
    finally:
        sch.close()
        client.close()

    if logger.is_info_enabled():
        logger.info(f'[{search_id}] search done, last trail_no={trail_no}')
    return trail_no
def save_transformers_to_cache(self):
    """Pickle this object (protocol 4) to ``{cache_dir}/transformers.pkl``."""
    cache_file = f'{self.cache_dir}/transformers.pkl'
    with fs.open(cache_file, 'wb') as out:
        pickle.dump(self, out, protocol=4)
def train_dt(self, model_set, config, nets=['dnn_nets']):
    """Train a ``DeepTable`` with ``config`` and push its scores to ``model_set``.

    With ``self.cross_validation`` set, ``fit_cross_validation`` is run and the
    out-of-fold score, the averaged eval score (if available) and, when
    ``self.retain_single_model`` is set, one eval score per entry of
    ``predict_proba_all`` are pushed. Otherwise a single ``fit`` is run and
    either the eval score or the training history is pushed; test predictions
    are written to a CSV under ``dt.output_path``.

    :param model_set: collector receiving ``modelset.ModelInfo`` entries via ``push``
    :param config: configuration passed unchanged to ``deeptable.DeepTable``
    :param nets: net names, used here only for messages, model names and the
        CSV file name. NOTE: the default is a shared list object; this method
        does not modify it.
    :return: the fitted ``DeepTable`` instance
    """
    print(f'Start training DT model.{nets}')
    conf = config

    # NOTE: fixed_embedding_dim is only referenced by the commented-out
    # `_replace` call below, so it currently has no effect.
    fixed_embedding_dim = conf.fixed_embedding_dim
    if 'fm_nets' in nets:
        fixed_embedding_dim = True
    print(f'train metrics:{config.metrics}')
    print(f'eval metrics:{self.eval_metrics}')
    # conf = conf._replace(nets=nets, metrics=[self.eval_metrics[0]],
    #                      fixed_embedding_dim=fixed_embedding_dim,
    #                      )

    dt = deeptable.DeepTable(config=conf)

    print(f'Fitting model...')
    if self.cross_validation:
        oof_proba, eval_proba, test_proba = dt.fit_cross_validation(
            self.X_train, self.y_train, self.X_eval, self.X_test,
            verbose=self.verbose, batch_size=self.dt_batch_size,
            epochs=self.dt_epochs, num_folds=self.num_folds,
            stratified=self.stratified, random_state=self.seed,
            n_jobs=self.n_jobs)
        print(f'Scoring...')
        # Out-of-fold predictions are scored against the training targets.
        oof_preds = dt.proba2predict(oof_proba)
        oof_score = calc_score(self.y_train, oof_proba, oof_preds, self.eval_metrics, self.task, dt.pos_label)
        model_set.push(
            modelset.ModelInfo('oof', f'{config.name} - {nets} - CV - oof', dt, oof_score, model_selector=consts.MODEL_SELECTOR_ALL))
        print(f'\n------------OOF------------ score:\n{oof_score}')

        if eval_proba is not None:
            eval_preds = dt.proba2predict(eval_proba)
            eval_cv_score = calc_score(self.y_eval, eval_proba, eval_preds, self.eval_metrics, self.task, dt.pos_label)
            model_set.push(
                modelset.ModelInfo(
                    'cv-eval', f'{config.name} - {nets} - CV - eval', dt, eval_cv_score, model_selector=consts.MODEL_SELECTOR_ALL))
            print(
                f'\n------------CV------------ Eval score:\n{eval_cv_score}'
            )

        # NOTE(review): this branch reads self.X_eval / self.y_eval without a
        # None check — confirm callers always provide eval data when
        # retain_single_model is enabled.
        if self.retain_single_model:
            all_model_proba = dt.predict_proba_all(self.X_eval)
            for fold_name, fold_proba in all_model_proba.items():
                fold_preds = dt.proba2predict(fold_proba)
                fold_score = calc_score(self.y_eval, fold_proba, fold_preds, self.eval_metrics, self.task, dt.pos_label)
                print(
                    f'\n------------{fold_name} -------------Eval score:\n{fold_score}'
                )
                model_set.push(
                    modelset.ModelInfo(
                        'eval', f'{config.name} - {nets} - {fold_name} - eval', dt,
                        fold_score, model_selector=fold_name))
    else:
        print(
            f'X_train.shape:{self.X_train.shape},y_train.shape:{self.y_train.shape}'
        )
        model, history = dt.fit(
            self.X_train, self.y_train, epochs=self.dt_epochs, validation_split=self.validation_size, verbose=self.verbose, )
        print(f'Scoring...')
        if self.X_eval is not None:
            proba = dt.predict_proba(
                self.X_eval, model_selector=consts.MODEL_SELECTOR_BEST)
            preds = dt.proba2predict(proba)
            score = calc_score(self.y_eval, proba, preds, self.eval_metrics, self.task, dt.pos_label)
            # score = dt.evaluate(self.X_test, self.y_test)
            print(
                f'\n------------{nets} -------------Eval score:\n{score}')
            model_set.push(
                modelset.ModelInfo(
                    'eval', f'{config.name} - {nets} - eval', dt, score, model_selector=consts.MODEL_SELECTOR_BEST))
        else:
            # No eval set: record the training history instead of a score.
            print(
                f'\n------------{nets} -------------Val score:\n{history.history}'
            )
            model_set.push(
                modelset.ModelInfo(
                    'val', f'{config.name} - {nets} - val', dt, {}, model_selector=consts.MODEL_SELECTOR_BEST, history=history.history))

        # `history` exists only in this non-CV branch, so the test CSV (named
        # after the last value of the first metric) is written here.
        if self.X_test is not None:
            test_proba = dt.predict_proba(self.X_test)
            score = str(
                round(history.history[self.first_metric_name][-1], 5))
            file = f'{dt.output_path}{score}_{"_".join(nets)}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_proba).to_csv(f, index=False)

    print(f'DT finished.')
    return dt
def load(model_file):
    """Unpickle and return the object stored at ``model_file``.

    :param model_file: source path understood by ``fs``
    :return: the unpickled object
    """
    with fs.open(model_file, 'rb') as src:
        restored = pickle.load(src)
    return restored
def _fit_and_score(task, num_classes, config, categorical_columns, continuous_columns,
                   n_fold, valid_idx, X_train, y_train, X_val, y_val,
                   X_eval=None, X_test=None, model_file=None,
                   batch_size=128, epochs=1, verbose=0, callbacks=None,
                   shuffle=True, class_weight=None, sample_weight=None,
                   initial_epoch=0, steps_per_epoch=None, validation_steps=None,
                   validation_freq=1, max_queue_size=10, workers=1,
                   use_multiprocessing=False):
    """Fit one cross-validation fold and predict on its validation/eval/test data.

    A new ``DeepModel`` is fitted on ``(X_train, y_train)`` with
    ``(X_val, y_val)`` as validation data. When ``model_file`` is given the
    model is saved there, and test predictions (if ``X_test`` is given) are
    written to ``{model_file}.test_proba.csv``. The model is released before
    returning.

    :param n_fold: zero-based fold index (messages show ``n_fold + 1``)
    :param valid_idx: validation row indices, returned unchanged
    :return: ``(n_fold, valid_idx, history.history, oof_proba, eval_proba, test_proba)``;
        ``eval_proba`` / ``test_proba`` are ``None`` without ``X_eval`` / ``X_test``
    """
    fold_label = n_fold + 1
    print(f'\nFold:{fold_label}\n')

    model = DeepModel(task, num_classes, config, categorical_columns, continuous_columns)
    fit_options = dict(
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
        validation_data=(X_val, y_val),
        shuffle=shuffle,
        class_weight=class_weight,
        sample_weight=sample_weight,
        initial_epoch=initial_epoch,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        validation_freq=validation_freq,
        max_queue_size=max_queue_size,
        workers=workers,
        use_multiprocessing=use_multiprocessing,
    )
    history = model.fit(X_train, y_train, **fit_options)
    print(f'Fold {fold_label} fitting over.')

    oof_proba = model.predict(X_val)
    eval_proba = model.predict(X_eval) if X_eval is not None else None

    test_proba = None
    if X_test is not None:
        test_proba = model.predict(X_test)
        if model_file is not None:
            csv_path = f'{model_file}.test_proba.csv'
            with fs.open(csv_path, 'w', encoding='utf-8') as out:
                pd.DataFrame(test_proba).to_csv(out, index=False)
    print(f'Fold {fold_label} scoring over.')

    if model_file is not None:
        model.save(model_file)
        print(f'Save model to:{model_file}.')
    model.release()

    return n_fold, valid_idx, history.history, oof_proba, eval_proba, test_proba
def fit_cross_validation(
        estimator_type,
        fit_fn,
        X,
        y,
        X_test=None,
        score_fn=roc_auc_score,
        estimator_params=None,
        categorical_feature=None,
        task_type=consts.TASK_BINARY,
        num_folds=5,
        stratified=True,
        iterators=None,
        batch_size=None,
        preds_filepath=None,
):
    """Cross-validate an estimator built by ``fit_fn`` and return OOF probabilities.

    For each fold, ``fit_fn`` is called with the fold's train/validation data
    and must return a model exposing ``predict_proba``. Column 1 of the
    validation probabilities is stored as the out-of-fold prediction and
    scored with ``score_fn``. When ``X_test`` is given, each fold's test
    probabilities are written to ``{preds_filepath}{score}_fold{k}.csv``.

    :param estimator_type: label used in the default predictions directory name
    :param fit_fn: callable ``(x_train, y_train, x_val, y_val, cat_vars=, task=,
        estimator_params=)`` returning a fitted model
    :param X: features; indexed with ``.iloc``
    :param y: targets; must have a ``shape`` attribute
    :param X_test: optional test features; when ``None`` no test predictions
        are computed or written
    :param score_fn: callable ``(y_true, proba)`` returning a number
    :param estimator_params: dict forwarded to ``fit_fn``; defaults to a new empty dict
    :param categorical_feature: forwarded to ``fit_fn`` as ``cat_vars``
    :param task_type: forwarded to ``fit_fn`` as ``task``
    :param num_folds: number of splits when ``iterators`` is not supplied
    :param stratified: use ``StratifiedKFold`` instead of ``KFold``
    :param iterators: optional splitter object providing ``split(X, y)``
    :param batch_size: only reported in the start-up message
    :param preds_filepath: directory for per-fold test predictions; falls back
        to the ``consts.ENV_DEEPTABLES_HOME`` environment variable, then to a
        timestamped ``./preds_*`` directory
    :return: out-of-fold probabilities (flattened when they have one column)
    """
    # Fix: a `{}` default is shared between calls and handed to `fit_fn`,
    # which may modify it; create a fresh dict per call instead.
    if estimator_params is None:
        estimator_params = {}

    print("Start cross validation")
    print(
        f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}'
    )

    # Cross validation model
    if iterators is None:
        if stratified:
            iterators = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
        else:
            iterators = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    print(f'Iterators:{iterators}')

    if len(y.shape) > 1:
        oof_proba = np.zeros(y.shape)
    else:
        oof_proba = np.zeros((y.shape[0], 1))

    y = np.array(y)
    if preds_filepath is None:
        # May still be None when the environment variable is not set.
        preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
    if preds_filepath is None:
        preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().__format__("%Y_%m_%d %H:%M:%S")}/'
    # Fix: the path is used as a directory prefix below; without a trailing
    # separator (e.g. a value taken from the environment variable) the CSV
    # files were written next to the directory instead of inside it.
    if not preds_filepath.endswith('/'):
        preds_filepath = preds_filepath + '/'
    if not fs.exists(preds_filepath):
        fs.makedirs(preds_filepath, exist_ok=True)

    for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
        print(f'\nFold:{n_fold + 1}\n')

        x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
        x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

        model = fit_fn(
            x_train_fold,
            y_train_fold,
            x_val_fold,
            y_val_fold,
            cat_vars=categorical_feature,
            task=task_type,
            estimator_params=estimator_params,
        )
        print(f'Fold {n_fold + 1} finished.')

        # Keep column 1 of the probabilities as an (n, 1) array.
        proba = model.predict_proba(x_val_fold)[:, 1:2]
        oof_proba[valid_idx] = proba
        score = round(score_fn(y_val_fold, proba), 5)

        # Fix: X_test defaults to None; only predict and export when it is given
        # (the original called predict_proba(None) unconditionally).
        if X_test is not None:
            test_fold_proba = model.predict_proba(X_test)
            file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_fold_proba).to_csv(f, index=False)
        print(f'Fold {n_fold + 1} Score:{score}')

    if oof_proba.shape[-1] == 1:
        oof_proba = oof_proba.reshape(-1)
    print(f'OOF score:{score_fn(y, oof_proba)}')
    return oof_proba
def fit_cross_validation(self, X, y, X_eval=None, X_test=None, num_folds=5, stratified=False, iterators=None, batch_size=None, epochs=1, verbose=1, callbacks=None, n_jobs=1, random_state=9527, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False):
    """Fit one model per fold and return out-of-fold, eval and test probabilities.

    The model set is cleared, ``X``/``y`` are fitted and transformed by
    ``self.preprocessor`` (``X_eval``/``X_test`` are transformed only), and
    ``_fit_and_score`` is run for every fold through joblib's ``Parallel``.
    Each fold's model file path is registered via ``__push_model``. Eval and
    test probabilities are accumulated as ``fold_proba / num_folds``.

    :param X, y: training data
    :param X_eval: optional evaluation features, predicted by every fold model
    :param X_test: optional test features, predicted by every fold model
    :param num_folds: number of splits when ``iterators`` is ``None``; also the
        divisor used for averaging eval/test probabilities
    :param stratified: use ``StratifiedKFold`` unless the task is regression
    :param iterators: optional splitter object providing ``split(X, y)``
    :param n_jobs: passed to ``Parallel``
    :param random_state: seed for the default splitters
    :param class_weight: when ``None``, computed with ``get_class_weight`` if
        ``config.apply_class_weight`` is set and the task is binary
    :param batch_size, epochs, verbose, callbacks, shuffle, sample_weight,
        initial_epoch, steps_per_epoch, validation_steps, validation_freq,
        max_queue_size, workers, use_multiprocessing: forwarded to
        ``_fit_and_score``
    :return: ``(oof_proba_fixed, eval_proba_mean_fixed, test_proba_mean_fixed)``;
        the last two are ``None`` without ``X_eval`` / ``X_test``
    """
    print("Start cross validation")
    start = time.time()
    logger.info(
        f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}, config={self.config}'
    )
    logger.info(f'metrics:{self.config.metrics}')
    # NOTE: n_rows is not read anywhere else in this method.
    n_rows = np.shape(X)[0]
    self.__modelset.clear()

    X, y = self.preprocessor.fit_transform(X, y)

    if X_eval is not None:
        print(f'transform X_eval')
        X_eval = self.preprocessor.transform_X(X_eval)
    if X_test is not None:
        print(f'transform X_test')
        X_test = self.preprocessor.transform_X(X_test)

    if iterators is None:
        if stratified and self.task != consts.TASK_REGRESSION:
            iterators = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
        else:
            iterators = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    print(f'Iterators:{iterators}')

    test_proba_mean = None
    eval_proba_mean = None
    # OOF buffer starts as NaN so rows never assigned by a fold stay detectable.
    if self.task in (consts.TASK_MULTICLASS, consts.TASK_MULTILABEL):
        oof_proba = np.full((y.shape[0], self.num_classes), np.nan)
    else:
        oof_proba = np.full((y.shape[0], 1), np.nan)

    y = np.array(y)
    if class_weight is None and self.config.apply_class_weight and self.task == consts.TASK_BINARY:
        class_weight = self.get_class_weight(y)

    callbacks = self.__inject_callbacks(callbacks)

    parallel = Parallel(n_jobs=n_jobs, verbose=verbose)

    fit_and_score_kwargs = dict(batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks=callbacks, class_weight=class_weight, shuffle=shuffle, sample_weight=sample_weight, validation_steps=validation_steps, validation_freq=validation_freq, initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch, max_queue_size=max_queue_size, workers=workers,
                                use_multiprocessing=use_multiprocessing)
    with parallel:
        # One delayed task per fold; each task saves its model to the .h5 path built here.
        out = parallel(
            delayed(_fit_and_score)
            (self.task, self.num_classes, self.config, self.preprocessor.categorical_columns, self.preprocessor.continuous_columns, n_fold, valid_idx, X.iloc[train_idx], y[train_idx], X.iloc[valid_idx], y[valid_idx], X_eval, X_test, f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5', **fit_and_score_kwargs) for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)))

        for n_fold, idx, history, fold_oof_proba, fold_eval_proba, fold_test_proba in out:
            oof_proba[idx] = fold_oof_proba
            # NOTE(review): averaging divides by num_folds even when a custom
            # `iterators` yields a different number of splits — confirm callers
            # keep the two consistent.
            if X_eval is not None:
                if eval_proba_mean is None:
                    eval_proba_mean = fold_eval_proba / num_folds
                else:
                    eval_proba_mean += fold_eval_proba / num_folds
            if X_test is not None:
                if test_proba_mean is None:
                    test_proba_mean = fold_test_proba / num_folds
                else:
                    test_proba_mean += fold_test_proba / num_folds
            self.__push_model(
                'val', f'{"+".join(self.nets)}-kfold-{n_fold + 1}', f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5', history)

    oof_proba_origin = oof_proba.copy()
    # Rows that still contain NaN are restored to NaN after the binary
    # adjustment below.
    nan_idx = np.argwhere(np.isnan(oof_proba).any(1)).ravel()
    if self.task == consts.TASK_BINARY:
        # NOTE(review): _fix_softmax_proba is defined elsewhere; its exact
        # output shape is not visible here.
        oof_proba_fixed = self._fix_softmax_proba(X.shape[0], oof_proba_origin.copy())
    else:
        oof_proba_fixed = oof_proba
    if len(nan_idx) > 0:
        oof_proba_fixed[nan_idx] = np.nan

    if eval_proba_mean is not None:
        if self.task == consts.TASK_BINARY:
            eval_proba_mean_fixed = self._fix_softmax_proba(
                X_eval.shape[0], eval_proba_mean.copy())
        else:
            eval_proba_mean_fixed = eval_proba_mean
    else:
        eval_proba_mean_fixed = eval_proba_mean

    if test_proba_mean is not None:
        if self.task == consts.TASK_BINARY:
            test_proba_mean_fixed = self._fix_softmax_proba(
                X_test.shape[0], test_proba_mean.copy())
            # Only the binary task exports the averaged (unadjusted) test
            # probabilities to CSV.
            file = f'{self.output_path}{"_".join(self.nets)}-cv-{num_folds}.csv'
            with fs.open(file, 'w', encoding='utf-8') as f:
                pd.DataFrame(test_proba_mean.reshape(-1)).to_csv(
                    f, index=False)
        else:
            test_proba_mean_fixed = test_proba_mean
    else:
        test_proba_mean_fixed = test_proba_mean

    logger.info(f'fit_cross_validation taken {time.time() - start}s')
    return oof_proba_fixed, eval_proba_mean_fixed, test_proba_mean_fixed