Example #1
    def fit(self, train_x, valid_x):
        self.model = TabNetPretrainer(**self.model_params)
        self.model.fit(train_x,
                       eval_set=[valid_x],
                       eval_name=["train"],  # label for the eval_set entry (it actually receives valid_x)
                       **self.fit_params)
        return self.model
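A minimal standalone sketch of the same pattern; the parameter dictionaries and data arrays below are assumed placeholders, not values from the original project:

# Standalone sketch of the pattern above; all values are assumed placeholders.
model_params = dict(n_d=8, n_a=8, n_steps=3)    # any TabNetPretrainer constructor kwargs
fit_params = dict(max_epochs=100, patience=10)  # any TabNetPretrainer.fit kwargs

pretrainer = TabNetPretrainer(**model_params)
pretrainer.fit(train_x,             # train_x / valid_x: hypothetical numpy arrays
               eval_set=[valid_x],
               **fit_params)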
Example #2
def pretrain_model(X, y):  # y is unused: TabNet pretraining is unsupervised
    tabnet_params = dict(n_d=8, n_a=8, n_steps=3, gamma=1.3,
                         n_independent=2, n_shared=2,
                         seed=SEED, lambda_sparse=1e-3, 
                         optimizer_fn=torch.optim.Adam, 
                         optimizer_params=dict(lr=2e-2),
                         mask_type="entmax",
                         scheduler_params=dict(mode="min",
                                               patience=5,
                                               min_lr=1e-5,
                                               factor=0.9,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=10
                        )
    
    pretrainer = TabNetPretrainer(**tabnet_params)

    pretrainer.fit(
        X_train=X.values,
        eval_set=[X.values],
        max_epochs=200,
        patience=20, batch_size=30000, virtual_batch_size=3000,
        num_workers=1, drop_last=True)
    
    return pretrainer
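The returned pretrainer is typically consumed through the `from_unsupervised` argument of a supervised TabNet model; a minimal sketch, assuming a regression target and that `X` and `y` are the same pandas objects passed to `pretrain_model`:

# Assumed follow-up: warm-start a supervised TabNet model from the pretrainer.
from pytorch_tabnet.tab_model import TabNetRegressor

pretrainer = pretrain_model(X, y)
reg = TabNetRegressor(seed=SEED)
reg.fit(X_train=X.values,
        y_train=y.values.reshape(-1, 1),  # TabNetRegressor expects 2-D targets
        max_epochs=100,
        from_unsupervised=pretrainer)     # reuse the pretrained encoder weights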
Example #3
class MyPretrainTabnet:
    def __init__(self, model_params, fit_params):
        self.model_params = model_params
        self.fit_params = fit_params
        self.model = None

    def fit(self, train_x, valid_x):
        self.model = TabNetPretrainer(**self.model_params)
        self.model.fit(train_x,
                       eval_set=[valid_x],
                       eval_name=["train"],
                       **self.fit_params)
        return self.model

    def pretrain_run(self,
                     name: str,
                     train_x: pd.DataFrame,
                     train_y: np.ndarray,
                     cv,
                     output_dir: str = "./"):
        va_idxes = []
        for cv_num, (trn_idx, val_idx) in enumerate(cv):
            print(decorate("start pretraining"))
            tr_x, va_x = train_x.values[trn_idx], train_x.values[val_idx]
            # tr_y, va_y = train_y[trn_idx], train_y[val_idx]
            va_idxes.append(val_idx)
            model = self.fit(tr_x, va_x)
            model_name = f"{name}_FOLD{cv_num}_pretrain_model"
            model.save_model(output_dir + model_name)
Example #4
    def _add_tabnet_unsupervised_model_to_fit_params(self, pipeline,
                                                     fit_params, X_train,
                                                     X_val):
        steps = pipeline.steps

        if steps[-1][1].__class__ not in [
                TabNetClassifier,
                TabNetRegressor,
        ]:
            return fit_params

        eval_X_train = self._preprocess_x(pipeline, X_train, X_train)
        eval_X_val = self._preprocess_x(pipeline, X_train, X_val)

        logger.info('tabnet unsupervised pre-training')
        _uns_fit_params = {}
        for k, v in fit_params.items():
            if f'{steps[-1][0]}__' not in k:
                continue
            if 'from_unsupervised' in k or 'eval' in k:
                continue

            _uns_fit_params[k.replace(f'{steps[-1][0]}__', '')] = v
        unsupervised_model = TabNetPretrainer()
        unsupervised_model.fit(X_train=eval_X_train,
                               eval_set=[eval_X_val],
                               **_uns_fit_params)

        fit_params[f'{steps[-1][0]}__from_unsupervised'] = unsupervised_model
        return fit_params
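The `f'{steps[-1][0]}__'` prefix is scikit-learn's `<step_name>__<param>` convention for routing fit parameters through a `Pipeline`; a minimal illustration of what the filtering loop above produces, assuming the final step is named "tabnet" (a hypothetical name):

# Hypothetical illustration of the prefix filtering above.
fit_params = {
    "tabnet__max_epochs": 50,           # kept: prefix stripped -> "max_epochs"
    "tabnet__eval_metric": ["auc"],     # dropped: key contains 'eval'
    "tabnet__from_unsupervised": None,  # dropped: key contains 'from_unsupervised'
    "scaler__copy": True,               # dropped: belongs to another step
}
# The loop would yield _uns_fit_params == {"max_epochs": 50}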
Example #5
    def crossval_and_predict(self, n_folds: int, df: pd.DataFrame,
                             df_test: pd.DataFrame, feature_col: list,
                             target_col: str, model_params: dict):
        oof = np.zeros((len(df)))
        cv_preds = np.zeros((len(df_test)))
        kfold = KFold(n_splits=n_folds,
                      random_state=self.random_state,
                      shuffle=True)
        for train_idx, valid_idx in kfold.split(df):
            X_train, y_train = df[feature_col].values[train_idx], df[
                target_col].values[train_idx].reshape(-1, 1)
            X_valid, y_valid = df[feature_col].values[valid_idx], df[
                target_col].values[valid_idx].reshape(-1, 1)
            X_test = df_test[feature_col].values

            params = self.default_params()
            params['seed'] = self.random_state
            params['n_d'] = model_params['n_d']
            params['n_a'] = model_params['n_d']  # n_a tied to n_d, a common TabNet heuristic
            params['gamma'] = model_params['gamma']
            params['momentum'] = model_params['momentum']
            params['n_steps'] = model_params['n_steps']
            params['n_shared'] = model_params['n_shared']
            params['n_independent'] = model_params['n_independent']

            logging.info(
                f'Parameters used for TabNet pre-training and training: {params}')

            unsupervised_model = TabNetPretrainer(**params)
            unsupervised_model.fit(X_train=X_train,
                                   eval_set=[X_valid],
                                   pretraining_ratio=0.5,
                                   max_epochs=20)

            model = TabNetRegressor(**params)
            model.fit(X_train=X_train,
                      y_train=y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_name=['valid'],
                      eval_metric=['rmse'],
                      max_epochs=100,
                      patience=10,
                      batch_size=1024,
                      from_unsupervised=unsupervised_model)

            oof[valid_idx] = model.predict(X_valid).squeeze()
            cv_preds += model.predict(X_test).squeeze() / n_folds
            logging.info(
                f'Finished fold with score {rmse(y_valid, oof[valid_idx])}')

        rmse_score = rmse(df[target_col], oof)
        return rmse_score, cv_preds
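A sketch of how the method above might be invoked; the owning class, data frames, and hyperparameter values are all assumed:

# Hypothetical invocation of crossval_and_predict; names are illustrative only.
runner = SomeTabNetCV(random_state=42)  # assumed instance of the (unshown) owning class
score, test_preds = runner.crossval_and_predict(
    n_folds=5,
    df=train_df, df_test=test_df,       # assumed pandas DataFrames
    feature_col=feature_cols,
    target_col="target",
    model_params=dict(n_d=16, gamma=1.5, momentum=0.02,
                      n_steps=3, n_shared=2, n_independent=2))
print(f"CV RMSE: {score:.5f}")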
Example #6
def main():
    # Generate Synthetic Data
    data, test_data, cat_col_names, num_col_names = data_load()
    cat_dims = data[cat_col_names].nunique().to_list()
    cat_idxs = [(cat_col_names + num_col_names).index(cat_col)
                for cat_col in cat_col_names]
    # log-based embedding-size rule, kept for reference but overwritten below
    # cat_emb_dims = np.ceil(np.log(cat_dims)).astype(int).tolist()
    cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1,
                                   a_max=50)).astype(int).tolist()
    FEATURES = cat_col_names + num_col_names
    df_sub = pd.read_csv('Data/sample_submission.csv')

    bsize = 2500 * 2

    # ##########Define the Configs############
    N_D = 16
    N_A = 16
    N_INDEP = 2
    N_SHARED = 2
    N_STEPS = 1  # 2
    MASK_TYPE = "sparsemax"
    GAMMA = 1.5
    BS = 512
    MAX_EPOCH = 21  # 20
    PRETRAIN = True

    X = data[FEATURES].values
    y = data["target"].values

    X_test = test_data[FEATURES].values

    if PRETRAIN:
        pretrain_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            gamma=GAMMA,
            lambda_sparse=0.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            mask_type=MASK_TYPE,
            scheduler_params=dict(
                mode="min",
                patience=3,
                min_lr=1e-5,
                factor=0.5,
            ),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1,
        )

        pretrainer = TabNetPretrainer(**pretrain_params)

        pretrainer.fit(
            X_train=X_test,  # pretraining is unsupervised, so unlabeled test rows can be used
            eval_set=[X],
            max_epochs=MAX_EPOCH,
            patience=25,
            batch_size=BS,
            virtual_batch_size=BS,  # 128,
            num_workers=0,
            drop_last=True,
            pretraining_ratio=0.5,  # the bigger the pretraining_ratio, the harder the reconstruction task
        )
    # Training the Model
    # tabular_mode.fit(train=train, validation=val)
    # # Evaluating the Model
    # # #Loss and Metrics on New Data¶
    # result = tabular_mode.evaluate(test)

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)

    BS = 2048
    MAX_EPOCH = 20
    # skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)

    data['oof_preds'] = np.nan

    for fold_nb, (train_index, valid_index) in enumerate(cv.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        tabnet_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            gamma=GAMMA,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            lambda_sparse=1e-5,
            seed=0,
            clip_value=2,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            mask_type=MASK_TYPE,
            device_name='auto',
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2, weight_decay=1e-5),
            scheduler_params=dict(
                max_lr=5e-2,
                steps_per_epoch=int(X_train.shape[0] / BS),
                epochs=MAX_EPOCH,
                # final_div_factor=100,
                is_batch_level=True),
            scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
            #                               scheduler_params=dict(mode='max',
            #                                                     factor=0.5,
            #                                                     patience=5,
            #                                                     is_batch_level=False,),
            #                               scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1)
        # Defining TabNet model
        model = TabNetClassifier(**tabnet_params)

        model.fit(
            X_train=X_train,
            y_train=y_train,
            from_unsupervised=pretrainer if PRETRAIN else None,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=["train", "valid"],
            eval_metric=["auc"],
            batch_size=BS,
            virtual_batch_size=256,
            max_epochs=MAX_EPOCH,
            drop_last=True,
            pin_memory=True,
            patience=10,
        )

        val_preds = model.predict_proba(X_valid)[:, -1]
        print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))

        data.loc[data.index[valid_index], 'oof_preds'] = val_preds  # avoids chained-assignment issues

        test_preds = model.predict_proba(X_test)[:, -1]
        df_sub[f"fold_{fold_nb+1}"] = test_preds

    df_sub["target"] = df_sub.filter(like="fold_").mean(axis=1).values

    df_sub.to_csv("Analysis/submission_5_tabnet.csv", index=False)

    df_sub = pd.read_csv("Analysis/submission_5_tabnet.csv")

    # df_sub.target = df_sub.target.map(lambda x: 0 if x<=0.5 else 1)
    df_sub.loc[:, ["id", "target"]].to_csv("Analysis/submission_5_2_tabnet.csv",
                                           index=False)
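Since the pretrainer above only lives in memory, running the pretraining and fold-training stages as separate jobs would require persisting it; a minimal sketch using pytorch-tabnet's built-in save/load API (the file path is hypothetical):

# Hypothetical persistence of the pretrainer between runs.
saved_path = pretrainer.save_model("Analysis/tabnet_pretrainer")  # writes a .zip archive

reloaded = TabNetPretrainer()
reloaded.load_model(saved_path)  # restores weights and network parameters
# `reloaded` can now be passed as from_unsupervised=reloaded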
Example #7
no_feature_cols = [
    # ... (earlier entries truncated in this snippet)
    "result_score",
]
feat_cols = [c for c in train.columns if c not in no_feature_cols]

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(train[feat_cols].values)
X_tst = scaler.transform(test[feat_cols].values)
y_binary = train.result.astype("float64").copy().values
X_trn, X_vld, _, _ = train_test_split(X, y_binary, random_state=42, shuffle=True)

# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="sparsemax",  # "sparsemax"
)
max_epochs = 1000
unsupervised_model.fit(
    X_train=X_trn,
    eval_set=[X_vld],
    max_epochs=max_epochs,
    patience=10,
    batch_size=512,
    virtual_batch_size=64,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.8,
)
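This snippet ends after pretraining; a minimal sketch of the usual fine-tuning step follows. The split is re-run to keep the labels the original discarded, and the classifier settings are assumed:

# Assumed follow-up: fine-tune a classifier from the pretrained encoder.
from pytorch_tabnet.tab_model import TabNetClassifier

X_trn, X_vld, y_trn, y_vld = train_test_split(X, y_binary,
                                              random_state=42, shuffle=True)
clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2))
clf.fit(X_trn, y_trn,
        eval_set=[(X_vld, y_vld)],
        eval_metric=["auc"],
        from_unsupervised=unsupervised_model)  # reuse the pretrained weights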
Example #8
import pandas as pd
import torch
from matplotlib import pyplot as plt
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# -- settings
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    mask_type='entmax',  # "sparsemax" is the alternative
    device_name="cuda" if use_cuda else "cpu",  # honor the availability check above
)

clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    scheduler_params={"step_size": 7,  # learning-rate scheduler settings
                      "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',  # overwritten when loading from a pretrained model
    device_name="cuda" if use_cuda else "cpu",
)
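This example stops after constructing the two models; a minimal sketch of the intended pretrain-then-finetune sequence, assuming arrays `X_train`, `y_train`, `X_valid`, `y_valid` exist:

# Assumed continuation: pretrain, then fine-tune with the pretrained weights.
unsupervised_model.fit(X_train=X_train,
                       eval_set=[X_valid],
                       pretraining_ratio=0.8)
clf.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=["auc"],
        from_unsupervised=unsupervised_model)
preds = clf.predict_proba(X_valid)[:, 1]
print("valid AUC:", roc_auc_score(y_valid, preds))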

Example #9
    def fit(self, x_train, y_train, kf_splits=5, tabnet_type=None):
        def _get_tabnet_params(tabnet_type):
            if (tabnet_type is None):
                tabnet_params = dict(
                    verbose=40,
                    optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
                    scheduler_params=dict(max_lr=0.05,
                                          steps_per_epoch=x_train.shape[0] // 128,
                                          epochs=300),
                    scheduler_fn=torch.optim.lr_scheduler.OneCycleLR)
                fit_params = dict(batch_size=1024,
                                  virtual_batch_size=128,
                                  eval_metric='accuracy')
            elif (tabnet_type == 'TabNet-S'):
                tabnet_params = dict(
                    n_d=8,
                    n_a=8,
                    lambda_sparse=0.0001,
                    momentum=0.1,
                    n_steps=3,
                    gamma=1.2,
                    verbose=40,
                    optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=0.01),
                    scheduler_params=dict(step_size=8000, gamma=0.05),
                    scheduler_fn=torch.optim.lr_scheduler.StepLR)
                fit_params = dict(batch_size=4096,
                                  virtual_batch_size=256,
                                  eval_metric='mse')
            else:
                raise ValueError('Unknown tabnet_type: {}'.format(tabnet_type))

            # --- check problem ---
            if fit_params['eval_metric'] in [
                    'auc', 'accuracy', 'balanced_accuracy', 'logloss'
            ]:
                problem = 'classification'
            elif fit_params['eval_metric'] in ['mse', 'mae', 'rmse', 'rmsle']:
                problem = 'regression'
            else:
                raise ValueError(
                    'Unknown eval_metric: {}'.format(fit_params['eval_metric']))

            return tabnet_params, fit_params, problem

        kf = KFold(n_splits=kf_splits, shuffle=False)
        scores = []
        self.tabnet_models = []

        tabnet_params, fit_params, problem = _get_tabnet_params(tabnet_type)

        for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
            if (problem == 'classification'):
                unsupervised_model = TabNetPretrainer(**tabnet_params)
                tabnet_model = TabNetClassifier(**tabnet_params)
            elif (problem == 'regression'):
                unsupervised_model = TabNetPretrainer(**tabnet_params)
                tabnet_model = TabNetRegressor(**tabnet_params)
            else:
                raise ValueError('Unknown problem: {}'.format(problem))

            x_tr = x_train[train_index]
            x_val = x_train[val_index]
            y_tr = y_train[train_index]
            y_val = y_train[val_index]

            unsupervised_model.fit(x_tr,
                                   eval_set=[x_val],
                                   patience=300,
                                   max_epochs=5000,
                                   pretraining_ratio=0.8)

            tabnet_model.fit(
                x_tr,
                y_tr,
                eval_set=[(x_val, y_val)],
                eval_metric=[fit_params['eval_metric']],
                batch_size=fit_params['batch_size'],
                virtual_batch_size=fit_params['virtual_batch_size'],
                patience=300,
                max_epochs=5000,
                from_unsupervised=unsupervised_model)

            self.tabnet_models.append(tabnet_model)
            prediction = tabnet_model.predict(x_val)
            if (problem == 'classification'):
                scores.append(accuracy_score(y_val, prediction))
            elif (problem == 'regression'):
                scores.append(mean_squared_error(y_val, prediction))
            else:
                raise ValueError('Unknown problem: {}'.format(problem))

            if (i == 0):
                feature_importances = tabnet_model.feature_importances_.copy()
            else:
                feature_importances = np.vstack(
                    (feature_importances, tabnet_model.feature_importances_))

        print(scores)
        print(np.mean(scores))

        return scores, feature_importances
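A sketch of consuming the two return values; the wrapper instance and data are assumed:

# Hypothetical usage of the fit() return values above.
scores, importances = wrapper.fit(x_train, y_train, kf_splits=5,
                                  tabnet_type='TabNet-S')
mean_importance = importances.mean(axis=0)      # average importance across folds
top10 = np.argsort(mean_importance)[::-1][:10]  # indices of the strongest features
print(top10, mean_importance[top10])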