Example 1
    def save(self, model_path):
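        # Layout on disk: '{model_path}ensemble.pkl' holds a stub of self with the
        # estimator slots emptied; each estimator goes to '{model_path}{i}.pkl'
        # (plus '{model_path}{i}.model' when it supports save/load).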
        if not model_path.endswith(fs.sep):
            model_path = model_path + fs.sep
        if not fs.exists(model_path):
            fs.mkdirs(model_path, exist_ok=True)

        stub = copy.copy(self)
        estimators = self.estimators
        if estimators is not None:
            stub.estimators = [None for _ in estimators]  # keep size, drop contents

            for i, est in enumerate(estimators):
                est_pkl = f'{model_path}{i}.pkl'
                est_model = f'{model_path}{i}.model'
                for t in [est_pkl, est_model]:
                    if fs.exists(t):
                        fs.rm(t)

                if est is None:
                    continue
                with fs.open(est_pkl, 'wb') as f:
                    pickle.dump(est, f, protocol=pickle.HIGHEST_PROTOCOL)

                if hasattr(est, 'save') and hasattr(est, 'load'):
                    est.save(est_model)

        with fs.open(f'{model_path}ensemble.pkl', 'wb') as f:
            pickle.dump(stub, f, protocol=pickle.HIGHEST_PROTOCOL)
Example 2
    def test_save_load(self):
        import time
        from hypernets.utils import fs
        from deeptables.models import deeptable  # needed for DeepTable.load below

        filepath = f'{type(self).__name__}_{time.strftime("%Y%m%d%H%M%S")}'
        self.dt.save(filepath)
        assert fs.exists(f'{filepath}/dt.pkl')
        assert fs.exists(f'{filepath}/dnn_nets.h5')
        newdt = deeptable.DeepTable.load(filepath)

        print(newdt.config)
        preds = newdt.predict(self.X_test)
        assert preds.shape == (200,)
Example 3
    def _prepare_cache_dir(self, cache_home, clear_cache=False):
        if cache_home is None:
            cache_home = 'cache'
        if cache_home.endswith('/'):
            cache_home = cache_home[:-1]

        cache_home = os.path.expanduser(cache_home)
        if not fs.exists(cache_home):
            fs.makedirs(cache_home, exist_ok=True)
        elif clear_cache:
            fs.rm(cache_home, recursive=True)
            fs.makedirs(cache_home, exist_ok=True)
        cache_dir = f'{cache_home}/{self.signature}'
        if not fs.exists(cache_dir):
            fs.makedirs(cache_dir, exist_ok=True)
        return cache_dir
Example 4
    def test_dataframe_fs(self):
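        # assumption: 'p' is a dataframe persistence helper that writes parquet onto 'fs'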
        file_path = f'/{type(self).__name__}/test_df_fs.parquet'
        df = dsutils.load_bank()
        p.store(df, file_path, filesystem=fs)
        assert fs.exists(file_path)

        # read it
        df_read = p.load(file_path, filesystem=fs)
        assert self.is_same_df(df, df_read)
Example 5
    def load(model_path):
        if not model_path.endswith(fs.sep):
            model_path = model_path + fs.sep

        with fs.open(f'{model_path}ensemble.pkl', 'rb') as f:
            stub = pickle.load(f)

        if stub.estimators is not None:
            for i in range(len(stub.estimators)):
                est_pkl = f'{model_path}{i}.pkl'
                est_model = f'{model_path}{i}.model'
                if fs.exists(est_pkl):
                    with fs.open(est_pkl, 'rb') as f:
                        est = pickle.load(f)
                    if fs.exists(est_model) and hasattr(est, 'load'):
                        est = est.load(est_model)
                    stub.estimators[i] = est

        return stub
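
Examples 1 and 5 are the two halves of one round trip. A minimal usage sketch, assuming an ensemble instance `ens` of the same class (the instance name and directory are hypothetical):

    ens.save('some_dir/')                   # writes ensemble.pkl plus {i}.pkl/.model files
    restored = type(ens).load('some_dir/')  # rebuilds the stub and re-attaches estimators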
Example 6
def clear(cache_dir=None, fn=None):
    assert fn is None or callable(fn)
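    # With fn given, clear only that function's cache subdirectory; otherwise clear cache_dir.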

    if cache_dir is None:
        cache_dir = cfg.cache_dir
    if callable(fn):
        cache_dir = f'{cache_dir}{fs.sep}{".".join([fn.__module__, fn.__qualname__])}'

    if fs.exists(cache_dir):
        fs.rm(cache_dir, recursive=True)
        fs.mkdirs(cache_dir, exist_ok=True)
Example 7
    def _prepare_output_dir(self, home_dir, nets):
        if home_dir is None:
            home_dir = 'dt_output'
        if home_dir.endswith('/'):
            home_dir = home_dir[:-1]

        running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
        output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
        if not fs.exists(output_path):
            fs.makedirs(output_path, exist_ok=True)
        return output_path
Example 8
    def load_transformers_from_cache(self):
        transformer_path = f'{self.cache_dir}/transformers.pkl'
        if fs.exists(transformer_path):
            try:
                with fs.open(transformer_path, 'rb') as f:  # 'f', not 'input', to avoid shadowing the builtin
                    preprocessor = pickle.load(f)
                self.__dict__.update(preprocessor.__dict__)
                return True
            except Exception as e:
                logger.error(e)
                fs.rm(transformer_path)  # drop the unreadable cache entry
        return False
Example 9
    def load_deepmodel(self, filepath):
        if fs.exists(filepath):
            print(f'Load model from: {filepath}.')
            dm = DeepModel(self.task,
                           self.num_classes,
                           self.config,
                           self.preprocessor.categorical_columns,
                           self.preprocessor.continuous_columns,
                           model_file=filepath)
            return dm
        else:
            raise ValueError(f'Invalid model filename: {filepath}.')
Example 10
    def save_transformed_X_y_to_cache(self, sign, X, y):
        filepath = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
        try:
            df = X.copy(deep=False)  # shallow copy so the caller's frame keeps its columns
            df.insert(0, 'saved__y__', y)
            with fs.open(filepath, mode='wb') as f:
                df.to_pickle(f, compression='gzip')
            return True
        except Exception as e:
            logger.error(e)
            if fs.exists(filepath):
                fs.rm(filepath)  # remove the partial file
        return False
Example 11
    def get_transformed_X_y_from_cache(self, sign):
        file_x_y = f'{self.cache_dir}/X_y_{sign}.pkl.gz'
        X_t, y_t = None, None
        if fs.exists(file_x_y):
            try:
                with fs.open(file_x_y, mode='rb') as f:
                    df = pd.read_pickle(f, compression='gzip')
                y_t = df.pop('saved__y__')
                X_t = df
            except Exception as e:
                logger.error(e)
                fs.rm(file_x_y)
        return X_t, y_t
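
Examples 10 and 11 pair up: y is stashed as a sentinel column 'saved__y__' inside the pickled frame and popped back out on load. A usage sketch, assuming a preprocessor instance `pp` (hypothetical name):

    sign = 'a1b2c3'  # hypothetical cache signature
    pp.save_transformed_X_y_to_cache(sign, X_train, y_train)
    X_t, y_t = pp.get_transformed_X_y_from_cache(sign)  # (None, None) on a cache miss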
Example 12
    def save(self, filepath, deepmodel_basename=None):
        if not filepath.endswith('/'):
            filepath = filepath + '/'

        if not fs.exists(filepath):
            fs.makedirs(filepath, exist_ok=True)
        num_model = len(self.__modelset.get_modelinfos())
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                dm = self.load_deepmodel(mi.model)
                mi.model = dm
            if not isinstance(mi.model, DeepModel):
                raise ValueError(
                    'Currently does not support saving non-DeepModel models.')

            if num_model == 1 and deepmodel_basename is not None:
                mi.name = deepmodel_basename
                self.__current_model = deepmodel_basename
            modelfile = f'{filepath}{mi.name}.h5'
            mi.model.save(modelfile)
            mi.model = modelfile

        with fs.open(f'{filepath}dt.pkl', 'wb') as output:
            pickle.dump(self, output, protocol=4)
Example 13
    def _cache_call(*args, **kwargs):
        assert len(args) > 0
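        # Flow: bind the call arguments, hash the selected args/attributes into a
        # cache key, try to load a prior result from cache_dir, and fall back to
        # calling fn (and storing its result) when no usable cache entry exists.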

        obj = None
        cache_path = None
        loaded = False
        result = None
        tb = _get_tool_box_for_cache(*args, **kwargs)

        try:
            for c in callbacks:
                c.on_enter(fn, *args, **kwargs)

            # bind arguments
            bind_args = sig.bind(*args, **kwargs)
            bind_args.apply_defaults()

            obj = bind_args.arguments.get('self', None)

            # calc cache_key
            key_items = {}

            arg_kwargs = bind_args.arguments.get('kwargs', {}).copy()
            arg_items = {k: v for k, v in bind_args.arguments.items()
                         if k != 'self'}  # all bound arguments except self, as a dict
            arg_items.update(arg_kwargs)

            if arg_keys is not None and len(arg_keys) > 0:
                key_items.update({k: arg_items.get(k) for k in arg_keys})
            else:
                key_items.update(arg_items)

            if attr_keys is not None:
                key_items.update({k: getattr(obj, k, None) for k in attr_keys})
            elif isinstance(obj, BaseEstimator) and 'params_' not in key_items:
                key_items['params_'] = obj.get_params(deep=False)

            if attrs_to_restore is not None:
                key_items['attrs_to_restore_'] = attrs_to_restore

            cache_key = tb.data_hasher()(key_items)

            # join cache_path
            if not fs.exists(cache_dir):
                fs.mkdirs(cache_dir, exist_ok=True)
            cache_path = f'{cache_dir}{fs.sep}{cache_key}'

            # detect and load cache
            if fs.exists(f'{cache_path}.meta'):
                # load
                cached_data, meta = _load_cache(tb, cache_path)

                for c in callbacks:
                    c.on_apply(fn, cached_data, *args, **kwargs)

                # restore attributes
                if attrs_to_restore is not None:
                    cached_attributes = meta.get('attributes', {})
                    for k in attrs_to_restore:
                        setattr(obj, k, cached_attributes.get(k))

                if meta['strategy'] == _STRATEGY_DATA:
                    result = cached_data
                else:  # strategy==transform
                    if isinstance(transformer, str):
                        tfn = getattr(obj, transformer)
                        assert callable(tfn)
                        result = tfn(*args[1:],
                                     **kwargs)  # exclude args[0]==self
                    elif callable(transformer):
                        result = transformer(*args, **kwargs)

                loaded = True
        except SkipCache:
            pass
        except Exception as e:
            logger.warning(e)

        if not loaded:
            result = fn(*args, **kwargs)

        if cache_path is not None and not loaded:
            try:
                for c in callbacks:
                    c.on_store(fn, result, *args, **kwargs)

                # store cache
                cache_strategy = strategy if strategy is not None else cfg.cache_strategy
                if cache_strategy == _STRATEGY_TRANSFORM and (
                        result is None or transformer is not None):
                    cache_data = None
                    meta = {'strategy': _STRATEGY_TRANSFORM}
                else:
                    cache_data = result
                    meta = {'strategy': _STRATEGY_DATA}

                if attrs_to_restore is not None:
                    meta['attributes'] = {
                        k: getattr(obj, k, None)
                        for k in attrs_to_restore
                    }
                if isinstance(obj, BaseEstimator):
                    meta['params_'] = obj.get_params(deep=False)  # for info

                _store_cache(tb, cache_path, cache_data, meta=meta)

                for c in callbacks:
                    c.on_leave(fn, *args, **kwargs)

            except Exception as e:
                logger.warning(e)

        return result
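
The closure variables above (fn, cache_dir, arg_keys, attr_keys, attrs_to_restore, transformer, strategy, callbacks) suggest `_cache_call` is the wrapper returned by a caching decorator. A hedged sketch of how such a decorator is typically applied; the decorator name `cache` and the keyword names are inferred from those variables, not confirmed:

    @cache(arg_keys=['X', 'y'], attrs_to_restore=['columns_'])
    def fit_transform(self, X, y, **kwargs):
        ...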
Example 14
    def fit_cross_validation(
        estimator_type,
        fit_fn,
        X,
        y,
        X_test=None,
        score_fn=roc_auc_score,
        estimator_params=None,
        categorical_feature=None,
        task_type=consts.TASK_BINARY,
        num_folds=5,
        stratified=True,
        iterators=None,
        batch_size=None,
        preds_filepath=None,
    ):
        print("Start cross validation")
        print(
            f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}'
        )

        # Cross validation model
        if iterators is None:
            if stratified:
                iterators = StratifiedKFold(n_splits=num_folds,
                                            shuffle=True,
                                            random_state=1001)
            else:
                iterators = KFold(n_splits=num_folds,
                                  shuffle=True,
                                  random_state=1001)
        print(f'Iterators: {iterators}')

        if len(y.shape) > 1:
            oof_proba = np.zeros(y.shape)
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        y = np.array(y)
        if preds_filepath is None and os.environ.get(
                consts.ENV_DEEPTABLES_HOME) is not None:
            preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
        if preds_filepath is None:
            # note: ':' in the timestamp is not portable to Windows paths
            preds_filepath = f'./preds_{estimator_type}_{datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")}/'

        if not fs.exists(preds_filepath):
            fs.makedirs(preds_filepath, exist_ok=True)

        for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
            print(f'\nFold:{n_fold + 1}\n')

            x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
            x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

            model = fit_fn(
                x_train_fold,
                y_train_fold,
                x_val_fold,
                y_val_fold,
                cat_vars=categorical_feature,
                task=task_type,
                estimator_params=estimator_params,
            )
            print(f'Fold {n_fold + 1} finished.')
            proba = model.predict_proba(x_val_fold)[:, 1:2]
            oof_proba[valid_idx] = proba
            score = round(score_fn(y_val_fold, proba), 5)
            if X_test is not None:  # X_test defaults to None, so guard before predicting
                test_fold_proba = model.predict_proba(X_test)
                file = f'{preds_filepath}{score}_fold{n_fold + 1}.csv'
                with fs.open(file, 'w', encoding='utf-8') as f:
                    pd.DataFrame(test_fold_proba).to_csv(f, index=False)
            print(f'Fold {n_fold + 1} score: {score}')

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        print(f'OOF score: {score_fn(y, oof_proba)}')
        return oof_proba
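
A hedged call sketch for the helper above; `my_fit_fn`, `X_train`, `y_train` and `X_test` are hypothetical placeholders matching the signature:

    oof = fit_cross_validation('lightgbm', my_fit_fn, X_train, y_train,
                               X_test=X_test, num_folds=5, stratified=True)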