Example 1
    def test_dataframe_fs(self):
        file_path = f'/{type(self).__name__}/test_df_fs.parquet'
        fs.makedirs(file_path, exist_ok=True)
        df = dsutils.load_bank_by_dask().repartition(npartitions=3)
        p.store(df, file_path, filesystem=fs)
        assert len(fs.glob(f'{file_path}/*.parquet')) == df.npartitions

        # read it
        df_read = p.load(file_path, filesystem=fs)
        assert self.is_same_df(df, df_read)
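This test leans on project internals (`p` for persistence, `dsutils` for data, a module-level `fs`). A minimal, self-contained sketch of the same directory-per-dataset pattern, assuming `fs` is an fsspec filesystem, could look like this (paths and partition count are illustrative):

import fsspec

fs = fsspec.filesystem('file')            # local filesystem; 's3' etc. also work
dataset_dir = '/tmp/test_df_fs.parquet'   # a directory, despite the suffix

fs.makedirs(dataset_dir, exist_ok=True)
for i in range(3):                        # one file per partition, as dask produces
    with fs.open(f'{dataset_dir}/part-{i}.parquet', 'wb') as f:
        f.write(b'')                      # placeholder; real code writes parquet bytes

assert len(fs.glob(f'{dataset_dir}/*.parquet')) == 3   # the test's partition-count check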
Example 2
    def _prepare_output_dir(self, home_dir, nets):
        if home_dir is None:
            home_dir = 'dt_output'
        if home_dir[-1] == '/':
            home_dir = home_dir[:-1]

        running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
        output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
        if not fs.exists(output_path):
            fs.makedirs(output_path, exist_ok=True)
        return output_path
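Stripped of class context, the logic is: normalize the base directory, append a timestamped run directory, and create it. A hypothetical standalone version (function name and defaults are assumptions):

import datetime
import os
import fsspec

fs = fsspec.filesystem('file')

def prepare_output_dir(home_dir=None, nets=('dnn',)):
    home_dir = (home_dir or 'dt_output').rstrip('/')
    stamp = datetime.datetime.now().strftime('%Y%m%d %H%M%S')
    output_path = os.path.expanduser(f'{home_dir}/dt_{stamp}_{"_".join(nets)}/')
    fs.makedirs(output_path, exist_ok=True)   # exist_ok=True makes a prior exists() check redundant
    return output_path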
Example 3
    def __init__(self, work_dir):
        try:
            default_client()
        except ValueError:
            # create default Client
            # client = Client("tcp://127.0.0.1:55208")
            # client = Client(processes=False, threads_per_worker=5, n_workers=1, memory_limit='4GB')
            Client()  # detect env: DASK_SCHEDULER_ADDRESS

        super(DaskDispatcher, self).__init__()

        self.work_dir = work_dir
        self.models_dir = f'{work_dir}/models'

        fs.makedirs(self.models_dir, exist_ok=True)
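The try/except around default_client() is the usual way to reuse a Dask client if the process already has one, and create one only otherwise. The same idiom in isolation:

from dask.distributed import Client, default_client

try:
    client = default_client()   # raises ValueError when no client exists yet
except ValueError:
    client = Client()           # honors DASK_SCHEDULER_ADDRESS if it is set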
Example 4
    def _prepare_cache_dir(self, cache_home, clear_cache=False):
        if cache_home is None:
            cache_home = 'cache'
        if cache_home[-1] == '/':
            cache_home = cache_home[:-1]

        cache_home = os.path.expanduser(cache_home)
        if not fs.exists(cache_home):
            fs.makedirs(cache_home, exist_ok=True)
        else:
            if clear_cache:
                fs.rm(cache_home, recursive=True)
                fs.makedirs(cache_home, exist_ok=True)
        cache_dir = f'{cache_home}/{self.signature}'
        if not fs.exists(cache_dir):
            fs.makedirs(cache_dir, exist_ok=True)
        return cache_dir
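The branches above reduce to: expand and normalize cache_home, optionally wipe it, then create a per-signature subdirectory. A condensed sketch under the same fsspec assumption (argument names are mine):

import os
import fsspec

fs = fsspec.filesystem('file')

def prepare_cache_dir(cache_home, signature, clear_cache=False):
    cache_home = os.path.expanduser(cache_home.rstrip('/'))
    if clear_cache and fs.exists(cache_home):
        fs.rm(cache_home, recursive=True)    # drop stale cache contents
    cache_dir = f'{cache_home}/{signature}'
    fs.makedirs(cache_dir, exist_ok=True)    # creates missing parents as well
    return cache_dir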
Example 5
    def __init__(self, search_id, spaces_dir, models_dir, on_next, on_report, on_summary):
        super(SearchHolder, self).__init__()

        fs.makedirs(spaces_dir, exist_ok=True)
        fs.makedirs(models_dir, exist_ok=True)

        self.search_id = search_id
        self.spaces_dir = spaces_dir
        self.models_dir = models_dir

        self.on_next = on_next
        self.on_report = on_report
        self.on_summary = on_summary

        self.start_at = time.time()
        self.finish_at = None

        self.queued_pool = queue.Queue()  # TrailItem
        self.running_items = {}  # space_id -> TrailItem
        self.reported_items = {}  # space_id -> TrailItem
        self.all_items = {}  # space_id -> TrailItem
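Beyond the directory setup, the constructor wires up a queue plus three dicts that track an item's lifecycle: queued, running, reported. A toy illustration of that flow (the Item class is a stand-in for the TrailItem mentioned in the comments):

import queue

class Item:                          # stand-in for TrailItem
    def __init__(self, space_id):
        self.space_id = space_id

pool, running, reported = queue.Queue(), {}, {}

pool.put(Item('space-1'))                             # queued
item = pool.get()
running[item.space_id] = item                         # dispatched
reported[item.space_id] = running.pop(item.space_id)  # result received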
Example 6
    def save(self, filepath, deepmodel_basename=None):
        if filepath[-1] != '/':
            filepath = filepath + '/'

        if not fs.exists(filepath):
            fs.makedirs(filepath, exist_ok=True)
        num_model = len(self.__modelset.get_modelinfos())
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                dm = self.load_deepmodel(mi.model)
                mi.model = dm
            if not isinstance(mi.model, DeepModel):
                raise ValueError('Currently does not support saving non-DeepModel models.')

            if num_model == 1 and deepmodel_basename is not None:
                mi.name = deepmodel_basename
                self.__current_model = deepmodel_basename
            modelfile = f'{filepath}{mi.name}.h5'
            mi.model.save(modelfile)
            mi.model = modelfile

        with fs.open(f'{filepath}dt.pkl', 'wb') as output:
            pickle.dump(self, output, protocol=4)
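The save routine replaces each in-memory model with the path of its saved .h5 file before pickling, so the pickle stays small and the heavy weights live in their native format. A sketch of just that swap, with a plain dict standing in for the model-info object:

import pickle
import fsspec

fs = fsspec.filesystem('file')
filepath = '/tmp/dt_save/'
fs.makedirs(filepath, exist_ok=True)

info = {'name': 'model_1', 'model': object()}   # stand-in for a model-info entry
model_file = f'{filepath}{info["name"]}.h5'
with fs.open(model_file, 'wb') as f:
    f.write(b'')               # real code calls mi.model.save(model_file)
info['model'] = model_file     # keep only the reference to the saved file

with fs.open(f'{filepath}dt.pkl', 'wb') as output:
    pickle.dump(info, output, protocol=4)       # now pickles cheaply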
Example 7
    def clear_cache(self):
        fs.rm(self.cache_dir, recursive=True)
        fs.makedirs(self.cache_dir, exist_ok=True)
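clear_cache resets the cache by deleting the whole tree and recreating it empty. A guarded standalone version (the exists() check is an addition for filesystems whose rm raises on missing paths):

import fsspec

fs = fsspec.filesystem('file')
cache_dir = '/tmp/demo_cache'

fs.makedirs(cache_dir, exist_ok=True)
if fs.exists(cache_dir):
    fs.rm(cache_dir, recursive=True)
fs.makedirs(cache_dir, exist_ok=True)
assert fs.ls(cache_dir) == []    # directory exists and is empty again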
Example 8
    def fit_cross_validation(
        estimator_type,
        fit_fn,
        X,
        y,
        X_test=None,
        score_fn=roc_auc_score,
        estimator_params=None,
        categorical_feature=None,
        task_type=consts.TASK_BINARY,
        num_folds=5,
        stratified=True,
        iterators=None,
        batch_size=None,
        preds_filepath=None,
    ):
        if estimator_params is None:   # avoid a shared mutable default argument
            estimator_params = {}
        print("Start cross validation")
        print(
            f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}'
        )

        # Cross validation model
        if iterators is None:
            if stratified:
                iterators = StratifiedKFold(n_splits=num_folds,
                                            shuffle=True,
                                            random_state=1001)
            else:
                iterators = KFold(n_splits=num_folds,
                                  shuffle=True,
                                  random_state=1001)
        print(f'Iterators:{iterators}')

        if len(y.shape) > 1:
            oof_proba = np.zeros(y.shape)
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        y = np.array(y)
        if preds_filepath is None:
            preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
        if preds_filepath is None:
            preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'

        if not fs.exists(preds_filepath):
            fs.makedirs(preds_filepath, exist_ok=True)

        for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
            print(f'\nFold:{n_fold + 1}\n')

            x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
            x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

            model = fit_fn(
                x_train_fold,
                y_train_fold,
                x_val_fold,
                y_val_fold,
                cat_vars=categorical_feature,
                task=task_type,
                estimator_params=estimator_params,
            )
            print(f'Fold {n_fold + 1} finished.')
            proba = model.predict_proba(x_val_fold)[:, 1:2]
            oof_proba[valid_idx] = proba
            score = round(score_fn(y_val_fold, proba), 5)
            if X_test is not None:  # skip test predictions when no test set is given
                test_fold_proba = model.predict_proba(X_test)
                file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
                with fs.open(file, 'w', encoding='utf-8') as f:
                    pd.DataFrame(test_fold_proba).to_csv(f, index=False)
            print(f'Fold {n_fold + 1} Score:{score}')

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        print(f'OOF score:{score_fn(y, oof_proba)}')
        return oof_proba
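The out-of-fold bookkeeping is the heart of the function: each row receives exactly one prediction, produced by the fold in which it was held out, so score_fn(y, oof_proba) is an honest out-of-sample estimate. A tiny numeric illustration:

import numpy as np
from sklearn.model_selection import KFold

y = np.array([0, 1, 0, 1, 0, 1])
oof = np.zeros((len(y), 1))
for train_idx, valid_idx in KFold(n_splits=3, shuffle=True, random_state=1001).split(y):
    oof[valid_idx] = 0.5         # stand-in for model.predict_proba(...)[:, 1:2]
assert (oof == 0.5).all()        # every row was filled by exactly one fold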