Esempio n. 1
0
    def save(self, model_path):
        """Persist this estimator under the directory ``model_path``.

        The wrapped model is written through its own ``save`` method; a shallow
        copy of this object with ``model`` cleared is then pickled next to it
        as ``dt_estimator.pkl``.
        """
        # Normalise to a directory-style path so file names can simply be appended.
        directory = model_path if model_path.endswith(fs.sep) else model_path + fs.sep

        self.model.save(directory)

        # Pickle everything except the model, which was stored separately above.
        shell = copy.copy(self)
        shell.model = None
        pickle_file = directory + 'dt_estimator.pkl'
        with fs.open(pickle_file, 'wb') as out:
            pickle.dump(shell, out, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 2
0
    def save(self, filepath):
        """Serialize ``self.model`` in 'h5' format and write it to ``filepath``.

        The model is first written into an in-memory buffer through ``h5py``
        and only afterwards copied to ``filepath`` via ``fs``. The destination
        is therefore not opened for writing until serialization has succeeded,
        so a failure inside ``save_model`` cannot leave a partial or empty file
        behind.
        """
        import h5py
        from deeptables.utils import fs

        # The context manager releases the buffer even if serialization raises.
        with io.BytesIO() as buf:
            with h5py.File(buf, 'w') as h:
                save_model(self.model, h, save_format='h5')
            # Read the bytes only after the HDF5 file has been closed (flushed).
            data = buf.getvalue()

        with fs.open(filepath, 'wb') as f:
            f.write(data)
Esempio n. 3
0
    def _load_model(filepath, custom_objects):
        """Load a model from the HDF5 file stored at ``filepath``.

        The file content is read completely through ``fs`` into memory and then
        opened with ``h5py`` from an in-memory buffer; the opened file and
        ``custom_objects`` are handed to ``load_model``, whose result is returned.
        """
        import h5py
        from deeptables.utils import fs

        # Wrap the raw bytes right away so no extra reference to them is kept.
        with fs.open(filepath, 'rb') as src:
            buffer = io.BytesIO(src.read())

        with h5py.File(buffer, 'r') as h5_file:
            restored = load_model(h5_file, custom_objects)
        return restored
Esempio n. 4
0
    def load(model_path):
        """Restore an estimator previously stored under ``model_path``.

        Reads the pickled stub ``dt_estimator.pkl`` from that directory, loads
        the model with ``DeepTable.load`` and attaches it to the stub as
        ``model``.

        Raises:
            ValueError: if ``model_path`` or the stub file inside it does not exist.
        """
        if not fs.exists(model_path):
            raise ValueError(f'Not found storage path: {model_path}')

        # Normalise to a directory-style path so the file name can be appended.
        if not model_path.endswith(fs.sep):
            model_path += fs.sep

        pickle_file = model_path + 'dt_estimator.pkl'
        if not fs.exists(pickle_file):
            raise ValueError(f'Not found storage path of estimator: {pickle_file}')

        # NOTE: unpickling can execute arbitrary code; only load from trusted storage.
        with fs.open(pickle_file, 'rb') as src:
            estimator = pickle.load(src)

        estimator.model = DeepTable.load(model_path)
        return estimator
Esempio n. 5
0
    def fit_cross_validation(estimator_type,
                             fit_fn,
                             X,
                             y,
                             X_test=None,
                             score_fn=roc_auc_score,
                             estimator_params=None,
                             categorical_feature=None,
                             task_type=consts.TASK_BINARY,
                             num_folds=5,
                             stratified=True,
                             iterators=None,
                             batch_size=None,
                             preds_filepath=None, ):
        """Run K-fold cross validation with ``fit_fn`` and return out-of-fold probabilities.

        Args:
            estimator_type: label used only in the name of the default output directory.
            fit_fn: callable invoked per fold as
                ``fit_fn(x_train, y_train, x_val, y_val, cat_vars=..., task=...,
                estimator_params=...)``; it must return an object with ``predict_proba``.
            X: training features; rows are selected with ``X.iloc``.
            y: training targets; converted with ``np.array``.
            X_test: optional test features. When given, every fold's model predicts
                on it and the result is written to a CSV file; when ``None`` the
                test prediction and the file are skipped.
            score_fn: ``score_fn(y_true, proba)`` used for fold and OOF scores.
            estimator_params: passed through to ``fit_fn``; defaults to a new empty dict.
            categorical_feature: passed to ``fit_fn`` as ``cat_vars``.
            task_type: passed to ``fit_fn`` as ``task``.
            num_folds: number of splits when ``iterators`` is not supplied.
            stratified: choose ``StratifiedKFold`` (True) or ``KFold`` (False).
            iterators: optional pre-built splitter providing ``split(X, y)``.
            batch_size: only printed here.
            preds_filepath: output directory for the per-fold test predictions. Falls
                back to the ``consts.ENV_DEEPTABLES_HOME`` environment variable and
                then to a time-stamped ``./preds_...`` directory.

        Returns:
            The out-of-fold probabilities, flattened to 1-D when they have a
            single column.
        """
        print("Start cross validation")
        print(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, batch_size={batch_size}')

        # A fresh dict per call avoids sharing one mutable default between calls.
        if estimator_params is None:
            estimator_params = {}

        # Cross validation model
        if iterators is None:
            splitter_cls = StratifiedKFold if stratified else KFold
            iterators = splitter_cls(n_splits=num_folds, shuffle=True, random_state=1001)
        print(f'Iterators:{iterators}')

        # Convert first so that plain sequences (without ``.shape``) are accepted too.
        y = np.array(y)
        if len(y.shape) > 1:
            oof_proba = np.zeros(y.shape)
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        if preds_filepath is None:
            preds_filepath = os.environ.get(consts.ENV_DEEPTABLES_HOME)
        if preds_filepath is None:
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
            preds_filepath = f'./preds_{estimator_type}_{timestamp}/'
        # The path is created as a directory and file names are appended to it,
        # so make sure it ends with a separator.
        if preds_filepath and not preds_filepath.endswith(fs.sep):
            preds_filepath = preds_filepath + fs.sep

        if not fs.exists(preds_filepath):
            fs.makedirs(preds_filepath, exist_ok=True)

        for fold_no, (train_idx, valid_idx) in enumerate(iterators.split(X, y), start=1):
            print(f'\nFold:{fold_no}\n')

            x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
            x_val_fold, y_val_fold = X.iloc[valid_idx], y[valid_idx]

            model = fit_fn(
                x_train_fold,
                y_train_fold,
                x_val_fold,
                y_val_fold,
                cat_vars=categorical_feature,
                task=task_type,
                estimator_params=estimator_params,
            )
            print(f'Fold {fold_no} finished.')

            # Only column 1 of the predicted probabilities is kept.
            # NOTE(review): presumably the positive-class probability of a binary
            # task -- confirm the intended behavior for other task types.
            proba = model.predict_proba(x_val_fold)[:, 1:2]
            oof_proba[valid_idx] = proba
            score = round(score_fn(y_val_fold, proba), 5)

            # Test predictions are optional: X_test defaults to None.
            if X_test is not None:
                test_fold_proba = model.predict_proba(X_test)
                file = f'{preds_filepath}{score}_fold{fold_no}.csv'
                with fs.open(file, 'w', encoding='utf-8') as f:
                    pd.DataFrame(test_fold_proba).to_csv(f, index=False)
            print(f'Fold {fold_no} Score:{score}')

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        print(f'OOF score:{score_fn(y, oof_proba)}')
        return oof_proba
Esempio n. 6
0
    def train_dt(self, model_set, config, nets=None):
        """Train a ``DeepTable`` model with ``config`` and register its scores.

        With ``self.cross_validation`` set, the model is fitted through
        ``fit_cross_validation`` and the out-of-fold score, the CV eval score
        (when eval probabilities are returned) and, if
        ``self.retain_single_model`` is set, one eval score per fold model are
        pushed to ``model_set``. Otherwise a single ``fit`` is run, its eval
        score (or, without ``self.X_eval``, its training history) is pushed,
        and test predictions are written to a CSV file in ``dt.output_path``
        when ``self.X_test`` is available.

        Args:
            model_set: collector receiving ``modelset.ModelInfo`` entries via ``push``.
            config: configuration passed unchanged to ``deeptable.DeepTable``.
            nets: list of net names used for log messages, model labels and the
                test prediction file name; defaults to ``['dnn_nets']``.

        Returns:
            The fitted ``DeepTable`` instance.
        """
        # A fresh list per call avoids sharing one mutable default between calls.
        if nets is None:
            nets = ['dnn_nets']

        print(f'Start training DT model.{nets}')
        print(f'train metrics:{config.metrics}')
        print(f'eval metrics:{self.eval_metrics}')

        # NOTE(review): `nets` is not applied to `config` here, so the model is
        # built from `config` as given -- confirm that callers already set the
        # nets on the config they pass in.
        dt = deeptable.DeepTable(config=config)

        print('Fitting model...')
        if self.cross_validation:
            # The test probabilities returned by the CV fit are not used here.
            oof_proba, eval_proba, _ = dt.fit_cross_validation(self.X_train,
                                                               self.y_train,
                                                               self.X_eval,
                                                               self.X_test,
                                                               verbose=self.verbose,
                                                               batch_size=self.dt_batch_size,
                                                               epochs=self.dt_epochs,
                                                               num_folds=self.num_folds,
                                                               stratified=self.stratified,
                                                               random_state=self.seed,
                                                               n_jobs=self.n_jobs)
            print('Scoring...')
            oof_preds = dt.proba2predict(oof_proba)
            oof_score = calc_score(self.y_train, oof_preds, oof_proba, self.eval_metrics, self.task,
                                   dt.pos_label)
            model_set.push(
                modelset.ModelInfo('oof', f'{config.name} - {nets} - CV - oof', dt, oof_score,
                                   model_selector=consts.MODEL_SELECTOR_ALL))
            print(f'\n------------OOF------------ score:\n{oof_score}')

            if eval_proba is not None:
                eval_preds = dt.proba2predict(eval_proba)
                eval_cv_score = calc_score(self.y_eval, eval_preds, eval_proba, self.eval_metrics, self.task,
                                           dt.pos_label)
                model_set.push(
                    modelset.ModelInfo('cv-eval', f'{config.name} - {nets} - CV - eval', dt, eval_cv_score,
                                       model_selector=consts.MODEL_SELECTOR_ALL))
                print(f'\n------------CV------------ Eval score:\n{eval_cv_score}')

            if self.retain_single_model:
                # Score every fold model separately on the eval set.
                all_model_proba = dt.predict_proba_all(self.X_eval)
                for fold_name, fold_proba in all_model_proba.items():
                    fold_preds = dt.proba2predict(fold_proba)
                    fold_score = calc_score(self.y_eval, fold_preds, fold_proba, self.eval_metrics, self.task,
                                            dt.pos_label)
                    print(f'\n------------{fold_name} -------------Eval score:\n{fold_score}')
                    model_set.push(
                        modelset.ModelInfo('eval', f'{config.name} - {nets} - {fold_name} - eval', dt, fold_score,
                                           model_selector=fold_name))

        else:
            print(f'X_train.shape:{self.X_train.shape},y_train.shape:{self.y_train.shape}')
            # The returned model object is not needed; predictions go through `dt`.
            _, history = dt.fit(self.X_train,
                                self.y_train,
                                epochs=self.dt_epochs,
                                validation_split=self.validation_size,
                                verbose=self.verbose,
                                )
            print('Scoring...')
            if self.X_eval is not None:
                proba = dt.predict_proba(self.X_eval, model_selector=consts.MODEL_SELECTOR_BEST)
                preds = dt.proba2predict(proba)
                score = calc_score(self.y_eval, preds, proba, self.eval_metrics, self.task, dt.pos_label)
                print(f'\n------------{nets} -------------Eval score:\n{score}')
                model_set.push(
                    modelset.ModelInfo('eval', f'{config.name} - {nets} - eval', dt, score,
                                       model_selector=consts.MODEL_SELECTOR_BEST))
            else:
                # Without an eval set only the training history can be reported.
                print(f'\n------------{nets} -------------Val score:\n{history.history}')
                model_set.push(
                    modelset.ModelInfo('val', f'{config.name} - {nets} - val', dt, {},
                                       model_selector=consts.MODEL_SELECTOR_BEST,
                                       history=history.history))

            if self.X_test is not None:
                test_proba = dt.predict_proba(self.X_test)
                # The file name starts with the last recorded value of the first metric.
                score = str(round(history.history[self.first_metric_name][-1], 5))
                file = f'{dt.output_path}{score}_{"_".join(nets)}.csv'
                with fs.open(file, 'w', encoding='utf-8') as f:
                    pd.DataFrame(test_proba).to_csv(f, index=False)

        print('DT finished.')
        return dt