def test_dataframe_fs(self):
    file_path = f'/{type(self).__name__}/test_df_fs.parquet'
    fs.makedirs(file_path, exist_ok=True)

    df = dsutils.load_bank_by_dask().repartition(npartitions=3)
    p.store(df, file_path, filesystem=fs)
    assert len(fs.glob(f'{file_path}/*.parquet')) == df.npartitions

    # read it
    df_read = p.load(file_path, filesystem=fs)
    assert self.is_same_df(df, df_read)
def _prepare_output_dir(self, home_dir, nets):
    if home_dir is None:
        home_dir = 'dt_output'
    if home_dir[-1] == '/':
        home_dir = home_dir[:-1]

    running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
    output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
    if not fs.exists(output_path):
        fs.makedirs(output_path, exist_ok=True)
    return output_path
def __init__(self, work_dir):
    try:
        default_client()
    except ValueError:
        # create default Client
        # client = Client("tcp://127.0.0.1:55208")
        # client = Client(processes=False, threads_per_worker=5, n_workers=1, memory_limit='4GB')
        Client()  # detect env: DASK_SCHEDULER_ADDRESS

    super(DaskDispatcher, self).__init__()

    self.work_dir = work_dir
    self.models_dir = f'{work_dir}/models'
    fs.makedirs(self.models_dir, exist_ok=True)
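# Usage sketch, assuming the DaskDispatcher above and a reachable Dask
# scheduler (the address and work_dir below are placeholders): the bare
# Client() call honors DASK_SCHEDULER_ADDRESS, so pointing that variable at
# an existing scheduler is enough to run the dispatcher on a cluster instead
# of a local, in-process one.
import os

os.environ['DASK_SCHEDULER_ADDRESS'] = 'tcp://127.0.0.1:8786'
dispatcher = DaskDispatcher(work_dir='workdir')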
def _prepare_cache_dir(self, cache_home, clear_cache=False):
    if cache_home is None:
        cache_home = 'cache'
    if cache_home[-1] == '/':
        cache_home = cache_home[:-1]

    cache_home = os.path.expanduser(cache_home)
    if not fs.exists(cache_home):
        fs.makedirs(cache_home, exist_ok=True)
    elif clear_cache:
        fs.rm(cache_home, recursive=True)
        fs.makedirs(cache_home, exist_ok=True)

    cache_dir = f'{cache_home}/{self.signature}'
    if not fs.exists(cache_dir):
        fs.makedirs(cache_dir, exist_ok=True)
    return cache_dir
def __init__(self, search_id, spaces_dir, models_dir, on_next, on_report, on_summary):
    super(SearchHolder, self).__init__()

    fs.makedirs(spaces_dir, exist_ok=True)
    fs.makedirs(models_dir, exist_ok=True)

    self.search_id = search_id
    self.spaces_dir = spaces_dir
    self.models_dir = models_dir
    self.on_next = on_next
    self.on_report = on_report
    self.on_summary = on_summary

    self.start_at = time.time()
    self.finish_at = None

    self.queued_pool = queue.Queue()  # TrailItem
    self.running_items = {}  # space_id -> TrailItem
    self.reported_items = {}  # space_id -> TrailItem
    self.all_items = {}  # space_id -> TrailItem
def save(self, filepath, deepmodel_basename=None):
    if filepath[-1] != '/':
        filepath = filepath + '/'
    if not fs.exists(filepath):
        fs.makedirs(filepath, exist_ok=True)

    num_model = len(self.__modelset.get_modelinfos())
    for mi in self.__modelset.get_modelinfos():
        if isinstance(mi.model, str):
            dm = self.load_deepmodel(mi.model)
            mi.model = dm
        if not isinstance(mi.model, DeepModel):
            raise ValueError('Currently does not support saving non-DeepModel models.')

        if num_model == 1 and deepmodel_basename is not None:
            mi.name = deepmodel_basename
            self.__current_model = deepmodel_basename

        modelfile = f'{filepath}{mi.name}.h5'
        mi.model.save(modelfile)
        mi.model = modelfile

    with fs.open(f'{filepath}dt.pkl', 'wb') as output:
        pickle.dump(self, output, protocol=4)
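# Usage sketch (assumptions: `dt` is a fitted DeepTable instance and a
# companion DeepTable.load(filepath) restores what save() wrote; the path
# below is a placeholder). save() stores each DeepModel as <name>.h5 and the
# pickled DeepTable itself as dt.pkl in the same directory.
dt.save('output/dt_model/')
dt_restored = DeepTable.load('output/dt_model/')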
def clear_cache(self):
    fs.rm(self.cache_dir, recursive=True)
    fs.makedirs(self.cache_dir, exist_ok=True)
def fit_cross_validation(estimator_type,
                         fit_fn,
                         X,
                         y,
                         X_test=None,
                         score_fn=roc_auc_score,
                         estimator_params=None,
                         categorical_feature=None,
                         task_type=consts.TASK_BINARY,
                         num_folds=5,
                         stratified=True,
                         iterators=None,
                         batch_size=None,
                         preds_filepath=None):
    # avoid the mutable-default-argument pitfall
    if estimator_params is None:
        estimator_params = {}

    print("Start cross validation")
    print(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}')

    # Cross validation model
    if iterators is None:
        if stratified:
            iterators = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
        else:
            iterators = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    print(f'Iterators:{iterators}')

    if len(y.shape) > 1:
        oof_proba = np.zeros(y.shape)
    else:
        oof_proba = np.zeros((y.shape[0], 1))

    y = np.array(y)

    if preds_filepath is None and os.environ.get(consts.ENV_DEEPTABLES_HOME) is not None:
        preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
    if preds_filepath is None:
        preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'
    if not fs.exists(preds_filepath):
        fs.makedirs(preds_filepath, exist_ok=True)

    for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
        print(f'\nFold:{n_fold + 1}\n')
        x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
        x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

        model = fit_fn(x_train_fold,
                       y_train_fold,
                       x_val_fold,
                       y_val_fold,
                       cat_vars=categorical_feature,
                       task=task_type,
                       estimator_params=estimator_params)
        print(f'Fold {n_fold + 1} finished.')

        # out-of-fold probabilities of the positive class
        proba = model.predict_proba(x_val_fold)[:, 1:2]
        oof_proba[valid_idx] = proba
        test_fold_proba = model.predict_proba(X_test)
        score = round(score_fn(y_val_fold, proba), 5)

        file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
        with fs.open(file, 'w', encoding='utf-8') as f:
            pd.DataFrame(test_fold_proba).to_csv(f, index=False)
        print(f'Fold {n_fold + 1} Score:{score}')

    if oof_proba.shape[-1] == 1:
        oof_proba = oof_proba.reshape(-1)
    print(f'OOF score:{score_fn(y, oof_proba)}')
    return oof_proba
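# Usage sketch, assuming a pandas DataFrame X with binary labels y and a
# test set X_test. `fit_lgbm` is a hypothetical callable matching the fit_fn
# contract above: fit_fn(x_train, y_train, x_val, y_val, cat_vars=...,
# task=..., estimator_params=...) -> fitted model exposing predict_proba().
oof_proba = fit_cross_validation(
    estimator_type='lightgbm',
    fit_fn=fit_lgbm,
    X=X,
    y=y,
    X_test=X_test,
    estimator_params={'n_estimators': 200},
    categorical_feature=['job', 'education'],
    num_folds=5,
)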