def model_train_pred(fold):
     """Train a TabNetRegressor on one CV fold; return OOF and test predictions.

     Returns:
         oof: array shaped like ``df_train_y`` with sigmoid probabilities
              filled in at this fold's validation indices (zeros elsewhere).
         predictions: sigmoid probabilities for the test set.

     NOTE(review): this is a plain function but reads ``self.LEARNING_RATE``,
     ``self.EPOCHS`` and ``self.BATCH_SIZE`` — presumably lifted from a
     method; confirm ``self`` is actually in scope at the call site.
     """
     model_path = os.path.join(model_dir, model_file_name + f"_FOLD{fold}.pth")
     # TabNet hyper-parameters; the LR is reduced on validation-loss plateaus.
     tabnet_params = dict(n_d = 64, n_a = 128, n_steps = 1,
                          gamma = 1.3,lambda_sparse = 0,
                          n_independent = 2,n_shared = 1,optimizer_fn = optim.Adam,
                          optimizer_params = dict(lr = self.LEARNING_RATE, weight_decay = 1e-5),
                          mask_type = "entmax",
                          scheduler_params = dict(mode = "min", patience = 10, min_lr = 1e-5, factor = 0.9),
                          scheduler_fn = ReduceLROnPlateau,verbose = 10)

     x_fold_train, y_fold_train, x_fold_val, y_fold_val, df_test_x_copy, val_idx = \
     preprocess(fold, df_train, df_train_x, df_train_y, df_test_x, no_of_components)
     # Drop near-constant features consistently across train/val/test.
     x_fold_train, x_fold_val, df_test_x_copy = variance_threshold(x_fold_train, x_fold_val, df_test_x_copy)

     ### Fit ###
     model = TabNetRegressor(**tabnet_params)
     model.fit(X_train = x_fold_train.values, y_train = y_fold_train.values,
               eval_set = [(x_fold_val.values, y_fold_val.values)], eval_name = ["val"],
               eval_metric = ["logits_ll"],max_epochs = self.EPOCHS,
               patience = 40,batch_size = self.BATCH_SIZE,
               virtual_batch_size = 32,num_workers = 1,drop_last = False,
               loss_fn = SmoothBCEwLogits(smoothing = 1e-4, pos_weight=pos_weight))

     ###---- Prediction ---
     # The model outputs raw logits; apply a sigmoid to get probabilities.
     oof = np.zeros(df_train_y.shape)
     valid_preds = 1 / (1 + np.exp(-model.predict(x_fold_val.values)))
     oof[val_idx] = valid_preds
     predictions = 1 / (1 + np.exp(-model.predict(df_test_x_copy.values)))
     # save_model returns the path of the archive it wrote.
     model_path = model.save_model(model_path)
     return oof, predictions
Example #2
0
    def crossval_and_predict(self, n_folds: int, df: pd.DataFrame,
                             df_test: pd.DataFrame, feature_col: list,
                             target_col: str, model_params: dict):
        """K-fold cross-validate a pretrained TabNetRegressor and predict.

        For each fold a TabNetPretrainer is fitted on the fold's training
        split and used to warm-start a TabNetRegressor. Out-of-fold
        predictions are collected; test predictions are averaged over folds.

        Args:
            n_folds: number of KFold splits.
            df: training frame holding both features and the target.
            df_test: frame holding the test features.
            feature_col: list of feature column names.
            target_col: name of the target column.
            model_params: tuned TabNet hyper-parameters (n_d, gamma, ...).

        Returns:
            Tuple of (overall OOF RMSE, fold-averaged test predictions).
        """
        oof = np.zeros((len(df)))
        cv_preds = np.zeros((len(df_test)))
        kfold = KFold(n_splits=n_folds,
                      random_state=self.random_state,
                      shuffle=True)
        for train_idx, valid_idx in kfold.split(df):
            X_train, y_train = df[feature_col].values[train_idx], df[
                target_col].values[train_idx].reshape(-1, 1)
            X_valid, y_valid = df[feature_col].values[valid_idx], df[
                target_col].values[valid_idx].reshape(-1, 1)
            X_test = df_test[feature_col].values

            params = self.default_params()
            params['seed'] = self.random_state
            params['n_d'] = model_params['n_d']
            # n_a deliberately mirrors n_d (common TabNet heuristic) —
            # confirm this is intended rather than a typo.
            params['n_a'] = model_params['n_d']
            params['gamma'] = model_params['gamma']
            params['momentum'] = model_params['momentum']
            params['n_steps'] = model_params['n_steps']
            params['n_shared'] = model_params['n_shared']
            params['n_independent'] = model_params['n_independent']

            logging.info(
                f'Parameters used for TabNet supervised training: {params}')

            # Self-supervised pretraining on this fold's training split.
            unsupervised_model = TabNetPretrainer(**params)
            unsupervised_model.fit(X_train=X_train,
                                   eval_set=[X_valid],
                                   pretraining_ratio=0.5,
                                   max_epochs=20)

            model = TabNetRegressor(**params)
            model.fit(X_train=X_train,
                      y_train=y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_name=['valid'],
                      eval_metric=['rmse'],
                      max_epochs=100,
                      patience=10,
                      batch_size=1024,
                      from_unsupervised=unsupervised_model)

            oof[valid_idx] = model.predict(X_valid).squeeze()
            # Accumulate the per-fold average of test predictions.
            cv_preds += model.predict(X_test).squeeze() / n_folds
            logging.info(
                f'Finished fold with score {rmse(y_valid, oof[valid_idx])}')

        rmse_score = rmse(df[target_col], oof)
        return rmse_score, cv_preds
Example #3
0
def fit_tabnet(x_tr, y_tr, x_va, y_va, cat_feats, args):
    """Train a TabNetRegressor and report train/validation RMSE.

    Args:
        x_tr, y_tr: training features (DataFrame) and target (Series).
        x_va, y_va: validation features (DataFrame) and target (Series).
        cat_feats: names of the categorical feature columns.
        args: unused here; kept for a uniform fit_* signature.

    Returns:
        Tuple of (fitted regressor, train RMSE, validation RMSE).
    """
    import torch
    from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

    # Categorical metadata consumed by TabNet's embedding layers.
    cat_idxs = [x_tr.columns.get_loc(name) for name in cat_feats]
    cat_dims = [x_tr[name].nunique() for name in cat_feats]
    cat_emb_dim = [dim // 2 for dim in cat_dims]

    # TabNet wants plain ndarrays and a 2-D target.
    x_tr, x_va = x_tr.values, x_va.values
    y_tr = y_tr.values.reshape(-1, 1)
    y_va = y_va.values.reshape(-1, 1)

    regressor = TabNetRegressor(
        n_d=16,
        n_a=16,
        n_steps=3,
        gamma=1.5,
        n_independent=4,
        n_shared=4,
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=cat_emb_dim,
        lambda_sparse=0.0001,
        momentum=0.95,
        clip_value=2.,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=0.0005),
        scheduler_params={"gamma": 0.95},
        scheduler_fn=torch.optim.lr_scheduler.ExponentialLR,
        epsilon=1e-1,
    )

    regressor.fit(
        x_tr,
        y_tr,
        eval_set=[(x_va, y_va)],
        batch_size=4096,
        virtual_batch_size=1024,
        max_epochs=1000,
        patience=50,
    )

    # Clip predictions to the plausible target range before scoring.
    tr_pred = np.clip(regressor.predict(x_tr), 0, 361)
    va_pred = np.clip(regressor.predict(x_va), 0, 361)
    train_score = np.sqrt(mean_squared_error(tr_pred, y_tr))
    val_score = np.sqrt(mean_squared_error(va_pred, y_va))

    return regressor, train_score, val_score
Example #4
0
class TabNetModel(BaseModel):
    """TabNet wrapper that plugs into the BaseModel train/eval pipeline."""

    def __init__(self,
                 dataset,
                 dataset_path,
                 sample=False,
                 sample_num=None,
                 split_method='6-2-2',
                 use_category=True):
        super().__init__(dataset, dataset_path, sample, sample_num,
                         split_method, use_category)
        self.model_name = 'tabnet'

    def preprocessing(self, train_X, val_X, test_X, cat_cols):
        """Label-encode categoricals, cast to float, and zero-fill NaNs."""
        train_X, val_X, test_X = remove_unseen_category(
            train_X, val_X, test_X, cat_cols)
        train_X, val_X, test_X = label_encoding(train_X, val_X, test_X,
                                                cat_cols)
        # TabNet's predict_proba cannot handle object ndarrays ...
        train_X = train_X.astype(float)
        val_X = val_X.astype(float)
        test_X = test_X.astype(float)
        # ... nor np.nan values, so replace them with zeros.
        train_X, val_X, test_X = (
            np.where(np.isnan(X), 0, X) for X in (train_X, val_X, test_X))
        return train_X, val_X, test_X

    def fit(self, train_X, train_Y, val_X, val_Y, cat_cols):
        """Fit a TabNet classifier or regressor based on dataset_type."""
        # +1 leaves head-room for categories absent from the training split.
        cat_dims = [len(set(train_X[:, idx])) + 1 for idx in cat_cols]
        if self.dataset_type in {'2-class', 'm-class'}:
            self.model = TabNetClassifier(cat_idxs=cat_cols, cat_dims=cat_dims)
            self.model.fit(train_X,
                           train_Y,
                           eval_set=[(val_X, val_Y)],
                           eval_metric=['logloss'],
                           max_epochs=200,
                           patience=20)
        elif self.dataset_type in {'regression'}:
            self.model = TabNetRegressor(cat_idxs=cat_cols, cat_dims=cat_dims)
            # Regression targets must be 2-D for TabNet.
            self.model.fit(train_X,
                           train_Y[:, np.newaxis],
                           eval_set=[(val_X, val_Y[:, np.newaxis])],
                           eval_metric=['rmse'],
                           max_epochs=200,
                           patience=20)
Example #5
0
class ModelTabNetRegressor(Model):
    """TabNet regression model with per-column categorical embeddings.

    NOTE(review): ``train`` fills NaNs on the passed-in frames in place, so
    the callers' DataFrames are mutated.
    """

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        """Fit TabNet, early-stopping on the validation split."""

        # Cardinality per categorical column across ALL splits, so the
        # embeddings cover categories unseen at train time.
        categorical_dims = {}
        for col in self.categorical_features:
            tr_x[col] = tr_x[col].fillna("unk")
            va_x[col] = va_x[col].fillna("unk")
            te_x[col] = te_x[col].fillna("unk")
            categorical_dims[col] = len(set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))

        cat_idxs = [i for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        cat_dims = [categorical_dims[f] for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        # Fixed embedding width of 10 for every categorical column.
        cat_emb_dim = [10 for _ in categorical_dims]

        # Mean-impute remaining NaNs using the *training* mean on all splits.
        # NOTE(review): this loop also touches the categorical columns —
        # confirm .mean() is well-defined for them after the "unk" fill.
        for col in tr_x.columns:
            tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
            va_x[col] = va_x[col].fillna(tr_x[col].mean())
            te_x[col] = te_x[col].fillna(tr_x[col].mean())

        self.model = TabNetRegressor(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)
        # NOTE(review): X_valid/y_valid kwargs are the legacy pytorch-tabnet
        # (<2.0) fit signature — confirm the pinned library version.
        self.model.fit(X_train=tr_x.values, y_train=tr_y.values.reshape(-1, 1),
                       X_valid=va_x.values, y_valid=va_y.values.reshape(-1, 1),
                       max_epochs=1000,
                       patience=50,
                       batch_size=1024,
                       virtual_batch_size=128)

    def predict(self, te_x):
        """Return 1-D predictions for ``te_x``."""
        return self.model.predict(te_x.values).reshape(-1, )

    def save_model(self):
        """Serialize the fitted model under ../output/model."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        """Restore a previously saved model from ../output/model."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
Example #6
0
 class TabNetModel:
     """Preprocessing + TabNet regression bundle.

     Wraps a MinMaxScaler (and, optionally, model-based feature selection)
     in a sklearn Pipeline whose output feeds a TabNetRegressor.
     """

     def __init__(self, feat_sel = None):
         """Build the preprocessing pipeline.

         Args:
             feat_sel: optional estimator for SelectFromModel; when None,
                 only min-max scaling is applied.
         """
         if feat_sel is not None:
             self.pipeline = Pipeline([
                 ('preprocessing', MinMaxScaler()),
                 ('feature_selection', SelectFromModel(feat_sel))
             ])
         else:
             self.pipeline = Pipeline([
                 ('preprocessing', MinMaxScaler())
             ])
         self.feat_sel = feat_sel
         self.params = {'feat_sel': feat_sel}

     def set_params(self, **kwargs):
         """Route kwargs to TabNet vs. the pipeline, then apply them.

         Keys prefixed with TabNetRegression.prefix configure the
         TabNetRegressor; 'feat_sel' rebuilds the preprocessing pipeline;
         everything else goes to the sklearn pipeline. Returns self.
         """
         tabnet_params, proc_params = {}, {}
         for key in kwargs.keys():
             if key == 'feat_sel':
                 # BUG FIX: the original did
                 #   self = TabNetRegression.TabNetModel.__init__(kwargs[key])
                 # which calls __init__ unbound (kwargs[key] becomes *self*)
                 # and rebinds the local ``self`` to its None return value,
                 # breaking the attribute assignments below. Re-initialize
                 # this instance in place instead.
                 self.__init__(kwargs[key])
             elif key.startswith(TabNetRegression.prefix):
                 tabnet_params[key.replace(TabNetRegression.prefix + "__", "")] = kwargs[key]
             else:
                 proc_params[key] = kwargs[key]
         self.tabnet = TabNetRegressor(**tabnet_params)
         self.pipeline.set_params(**proc_params)
         self.params = {**tabnet_params, **proc_params}
         return self

     def get_params(self, deep = False):
         """Return the last-applied parameter dict (sklearn-style accessor)."""
         return self.params

     def fit(self, Xtrain, Ytrain, Xvalid, Yvalid):
         """Fit the pipeline, transform both splits, then fit TabNet."""
         if self.feat_sel is not None:
             self.pipeline.fit(Xtrain, Ytrain)
         else:
             # Scaling alone is unsupervised; no target needed.
             self.pipeline.fit(Xtrain)
         Xtrain_scaled, Xvalid_scaled = self.pipeline.transform(Xtrain), self.pipeline.transform(Xvalid)
         # NOTE(review): positional (X, y, X_valid, y_valid) matches the
         # legacy pytorch-tabnet fit signature — confirm the pinned version.
         self.tabnet.fit(Xtrain_scaled, Ytrain.flatten(), Xvalid_scaled, Yvalid.flatten())

     def predict(self, X):
         """Predict on X after applying the fitted preprocessing."""
         return self.tabnet.predict(self.pipeline.transform(X))
Example #7
0
    def __call__(self, trial):
        """Optuna objective: train TabNet with sampled params, return RMSE.

        Args:
            trial: optuna Trial used to sample hyper-parameters.

        Returns:
            Validation RMSE (lower is better).
        """
        df_train, df_valid = train_test_split(self.df,
                                              test_size=0.1,
                                              random_state=self.random_state)
        X_train, y_train = df_train[self.feature_col].values, df_train[
            self.target_col].values.reshape(-1, 1)
        X_valid, y_valid = df_valid[self.feature_col].values, df_valid[
            self.target_col].values.reshape(-1, 1)
        logging.info(
            f'Train/valid split: {X_train.shape[0]} for training, {X_valid.shape[0]} for validation'
        )

        n_d = trial.suggest_int('n_d', 8, 64)

        # BUG FIX: the original bound ``params = self.default_params`` and
        # then wrote into it, mutating the shared defaults dict in place —
        # every trial leaked its sampled values into self.default_params.
        # Work on a shallow copy instead.
        params = dict(self.default_params)
        params['n_d'] = n_d
        # n_a is tied to n_d (common TabNet heuristic).
        params['n_a'] = n_d
        params['seed'] = self.random_state
        params['n_steps'] = trial.suggest_int('n_steps', 3, 10)
        params['n_shared'] = trial.suggest_int('n_shared', 2, 5)
        params['n_independent'] = trial.suggest_int('n_independent', 2, 5)
        params['momentum'] = trial.suggest_float('momentum', 0.01, 0.4)
        params['gamma'] = trial.suggest_float('gamma', 1.0, 2.0)

        model = TabNetRegressor(**params)

        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric=['rmse'],
                  max_epochs=20,
                  patience=10,
                  batch_size=1024)

        score = rmse(y_valid, model.predict(X_valid).squeeze())
        return score
Example #8
0
def make_model(X_train, y_train, X_valid, y_valid, pretrainer):
    """Train a TabNetRegressor warm-started from an unsupervised pretrainer.

    Args:
        X_train, y_train: training features/target as DataFrames.
        X_valid, y_valid: validation features/target as DataFrames.
        pretrainer: fitted TabNet pretrainer passed to ``from_unsupervised``.

    Returns:
        The fitted TabNetRegressor.
    """
    steps_per_epoch = int(X_train.shape[0] / 256)
    model = TabNetRegressor(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        seed=SEED,
        lambda_sparse=1e-3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type="entmax",
        # One-cycle learning-rate schedule stepped at batch level.
        scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
        scheduler_params=dict(max_lr=0.05,
                              steps_per_epoch=steps_per_epoch,
                              epochs=200,
                              is_batch_level=True),
        verbose=10,
        # cat_idxs / cat_dims / cat_emb_dim intentionally left out here
        # (the original kept them commented out for the unsupervised flow).
    )

    model.fit(
        X_train=X_train.values,
        y_train=y_train.values,
        eval_set=[(X_valid.values, y_valid.values)],
        eval_name=["valid"],
        eval_metric=["rmse"],
        max_epochs=200,
        patience=20,
        batch_size=4096,
        virtual_batch_size=256,
        num_workers=0,
        drop_last=False,
        from_unsupervised=pretrainer,  # drop this for fully-supervised runs
    )
    return model
def run_tabnet(df, max_epochs=20, device='cpu'):
    """Train a TabNet model on ``df`` and report log loss on the test split.

    Args:
        df: source DataFrame; split via ``split_data_tab``.
        max_epochs: training epoch budget.
        device: torch device name forwarded to TabNet.

    Returns:
        The fitted TabNetRegressor.
    """
    X, y = get_X_y_tab(df)
    X_train, X_val, X_test, y_train, y_val, y_test = split_data_tab(X, y)
    reg = TabNetRegressor(device_name=device)
    print('Running the TabNet DNN, this could take a while')
    # BUG FIX: pytorch-tabnet's fit() returns None, so the original
    # ``model = reg.fit(...)`` made the function return None. Fit in place
    # and return the fitted regressor itself.
    reg.fit(X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_name=['train', 'val'],
            eval_metric=['logloss'],
            max_epochs=max_epochs)
    print('Fitting the test data to the model')
    y_pred = reg.predict(X_test)
    ll = round(log_loss(y_test, y_pred), 5)
    print(f'The Log loss is {ll}')
    return reg
Example #10
0
def run_training_tabnet(train,
                        test,
                        trn_idx,
                        val_idx,
                        feature_cols,
                        target_cols,
                        fold,
                        seed,
                        filename="tabnet"):
    """Train (optionally) and predict with TabNet for one CV fold/seed.

    Returns:
        oof: zero-initialized array with sigmoid validation predictions
             filled in at ``val_idx``.
        predictions: sigmoid predictions for the test set.

    NOTE(review): relies on globals ``target``, ``IS_TRAIN``, ``MODEL_DIR``
    and ``NB`` from the surrounding script.
    """

    seed_everything(seed)

    train_ = process_data(train)
    test_ = process_data(test)

    train_df = train_.loc[trn_idx, :].reset_index(drop=True)
    valid_df = train_.loc[val_idx, :].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[
        target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[
        target_cols].values

    model = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=1,
        lambda_sparse=0,
        cat_dims=[3, 2],
        cat_emb_dim=[1, 1],
        cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',  # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),  #)
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)

    loss_fn = LabelSmoothing(0.001)
    #     eval_metric = SmoothedLogLossMetric(0.001)
    #     eval_metric_nosmoothing = SmoothedLogLossMetric(0.)

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))

    if IS_TRAIN:
        #         print("isnan", np.any(np.isnan(x_train)))
        model.fit(X_train=x_train,
                  y_train=y_train,
                  eval_set=[(x_valid, y_valid)],
                  eval_metric=[LogLossMetric, SmoothedLogLossMetric],
                  max_epochs=200,
                  patience=50,
                  batch_size=1024,
                  virtual_batch_size=128,
                  num_workers=0,
                  drop_last=False,
                  loss_fn=loss_fn)
        # NOTE(review): recent pytorch-tabnet save_model appends ".zip" to
        # this path, while the load below expects a ".model" suffix —
        # confirm the two agree for the pinned library version.
        model.save_model(f"{MODEL_DIR}/{NB}_(unknown)_SEED{seed}_FOLD{fold}")

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values

    # Rebuild an identical, untrained model and restore the saved weights,
    # so inference-only runs (IS_TRAIN False) also work.
    model = TabNetRegressor(
        n_d=32,
        n_a=32,
        n_steps=1,
        lambda_sparse=0,
        cat_dims=[3, 2],
        cat_emb_dim=[1, 1],
        cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',  # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),  #)
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)

    model.load_model(
        f"{MODEL_DIR}/{NB}_(unknown)_SEED{seed}_FOLD{fold}.model")

    valid_preds = model.predict(x_valid)

    # Raw outputs are logits; convert to probabilities with a sigmoid.
    valid_preds = torch.sigmoid(
        torch.as_tensor(valid_preds)).detach().cpu().numpy()
    oof[val_idx] = valid_preds

    predictions = model.predict(x_test)
    predictions = torch.sigmoid(
        torch.as_tensor(predictions)).detach().cpu().numpy()

    return oof, predictions
Example #11
0
class TabNetBase(AI_Base):
    """TabNet-based agent wired into the AI_Base scenario framework.

    Data comes from the environment's match loader; the nested scenario
    list drives data registration, model init and training.
    """
    # file_path = os.getcwd() + "\\src\\AIs\\models\\TabNetv1\\"
    # Windows-style path; the model archive is saved next to the CWD.
    file_path = os.getcwd() + "\\"
    save_name = file_path + "test_model"

    def __init__(self, *args, **kwargs):
        _TPI(self, locals())
        super(TabNetBase, self).__init__(*args, **kwargs)
        ACT = self.env.act
        MATCH = self.env.match_loader

        # Splits are populated later by act_register_data.
        self.X_train, self.X_valid, self.X_test = None, None, None
        self.y_train, self.y_valid, self.y_test = None, None, None
        self.cat_idxs, self.cat_dims, self.cat_emb_dim = MATCH.get_categorical(
        )
        self.ai = None

        self._scenario_tactics = None
        self._scenario_matches = None

        # Nested [repeat-count, steps...] structure executed by the
        # scenario framework; commented entries are optional steps.
        self._scenario_learn_from_file = list([[
            1,
            # [self.epochs,
            [
                1,
                # [len(MATCH),
                [
                    1,
                    (self.act_register_data,
                     dict(data=MATCH.act_get(is_flat=True))),
                    self.act_modify_data,
                    self.act_init_ai,
                    # self.act_load_game,
                    self.act_run_ai_with_learn,
                    # self.act_test
                ]
            ],
        ]])

        self.set_mode(self.mode)

    def act_register_data(self, data, is_test=False):
        """Copy train/valid/test arrays out of the match loader."""
        if is_test is True:
            _TPI(self, locals())
        else:
            self.X_train = np.array(self.env.match_loader.train_players)
            self.y_train = np.array(self.env.match_loader.train_plus)
            self.X_valid = np.array(self.env.match_loader.valid_players)
            self.y_valid = np.array(self.env.match_loader.valid_plus)
            self.X_test = np.array(self.env.match_loader.test_players)
            self.y_test = np.array(self.env.match_loader.test_plus)

    def act_init_ai(self, is_test=False):
        """Instantiate the TabNetRegressor with categorical metadata."""
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            self.ai = TabNetRegressor(n_steps=10,
                                      input_dim=MATCH.count_cols *
                                      MATCH.count_players,
                                      cat_dims=self.cat_dims,
                                      cat_emb_dim=self.cat_emb_dim,
                                      cat_idxs=self.cat_idxs)

    def act_modify_data(self, is_test=False):
        """Hook for data post-processing; currently a no-op."""
        if is_test is True:
            _TPI(self, locals())
        else:
            pass

    def act_load_game(self, is_test=False):
        """Load previously saved weights if the archive exists."""
        if is_test is True:
            _TPI(self, locals())
        else:
            save = self.save_name + ".zip"
            if os.path.isfile(save):
                print("Load Network")
                self.ai.load_model(save)

    def act_test(self, is_test=False):
        """Score the model on the held-out test split (prints MSE)."""
        if is_test is True:
            _TPI(self, locals())
        else:
            predictions = self.ai.predict(self.X_test)
            y_true = self.y_test
            test_score = mean_squared_error(y_pred=predictions, y_true=y_true)
            #np.savetxt("predict.txt", predictions, delimiter=',', fmt='%d')
            #np.savetxt("true.txt", y_true, delimiter=',', fmt='%d')
            print(test_score)

    def act_run_ai_with_learn(self, is_test=False):
        """Train the model on the registered splits."""
        if is_test is True:
            _TPI(self, locals())
        else:
            # NOTE(review): X_valid/y_valid kwargs are the legacy
            # pytorch-tabnet fit signature — confirm the pinned version.
            self.ai.fit(X_train=self.X_train,
                        y_train=self.y_train,
                        X_valid=self.X_valid,
                        y_valid=self.y_valid,
                        max_epochs=self.epochs,
                        patience=500,
                        batch_size=512,
                        drop_last=False)
            # self.ai.save_model(self.save_name)

    def act_save_model(self, is_test=False):
        """Persist the current model to ``save_name``."""
        if is_test is True:
            _TPI(self, locals())
        else:
            print(self.save_name)
            self.ai.save_model(self.save_name)
Example #12
0
        print("FOLDS: ", fold_nb + 1, 'seed:', tabnet_params['seed'])
        print('*' * 60)

        X_train, y_train = train.values[train_idx, :], target.values[
            train_idx, :]
        X_val, y_val = train.values[val_idx, :], target.values[val_idx, :]
        ### Model ###
        model = TabNetRegressor(**tabnet_params)

        ### Fit ###
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name=["val"],
                  eval_metric=["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=20,
                  batch_size=1024,
                  virtual_batch_size=32,
                  num_workers=1,
                  drop_last=False,
                  loss_fn=SmoothBCEwLogits(smoothing=5e-5))
        print('-' * 60)

        ### Predict on validation ###
        preds_val = model.predict(X_val)
        # Apply sigmoid to the predictions
        preds = 1 / (1 + np.exp(-preds_val))
        score = np.min(model.history["val_logits_ll"])
        saving_path_name = 'TabNet_seed_' + str(
            tabnet_params['seed']) + '_fold_' + str(fold_nb + 1)
        saved_filepath = model.save_model(saving_path_name)
Example #13
0
    def fit(
        self,
        x_train: AoD,
        y_train: AoS,
        x_valid: AoD,
        y_valid: AoS,
        config: dict,
        **kwargs
    ) -> Tuple[TabNetModel, dict]:
        """Fit a TabNet classifier/regressor according to ``config``.

        Categorical columns are code-mapped (with an explicit "Unknown"
        category for NaNs) and numeric columns mean-imputed before training.

        NOTE(review): ``x_train`` and ``x_valid`` are modified in place.

        Returns:
            Tuple of (fitted model,
            {"valid_score": per-epoch validation losses}).
        """
        model_params = config["model"]["model_params"]
        train_params = config["model"]["train_params"]
        categorical_cols = config["categorical_cols"]
        self.config["categorical_cols"] = categorical_cols

        # Replace NaNs with an explicit "Unknown" category, then switch the
        # columns to their integer codes; record per-column cardinality.
        categorical_dims = {}
        for col in categorical_cols:
            x_train[col] = x_train[col].cat.add_categories("Unknown")
            x_train[col] = x_train[col].fillna("Unknown")
            x_train[col] = x_train[col].cat.codes
            x_valid[col] = x_valid[col].cat.add_categories("Unknown")
            x_valid[col] = x_valid[col].fillna("Unknown")
            x_valid[col] = x_valid[col].cat.codes
            categorical_dims[col] = len(
                set(x_train[col].values) | set(x_valid[col].values)
            )

        cat_idxs = [i for i, f in enumerate(x_train.columns) if f in categorical_cols]
        cat_dims = [
            categorical_dims[f]
            for i, f in enumerate(x_train.columns)
            if f in categorical_cols
        ]
        # Fixed embedding width of 10 for every categorical column.
        cat_emb_dim = [10 for _ in categorical_dims]

        # Impute numeric NaNs with the *training* mean on both splits.
        numerical_cols = [col for col in x_train.columns if col not in categorical_cols]
        for col in numerical_cols:
            x_train[col] = x_train[col].fillna(x_train[col].mean())
            x_valid[col] = x_valid[col].fillna(x_train[col].mean())

        mode = config["model"]["mode"]
        self.mode = mode

        if mode == "regression":
            model = TabNetRegressor(
                cat_dims=cat_dims,
                cat_emb_dim=cat_emb_dim,
                cat_idxs=cat_idxs,
                **model_params,
            )
        else:
            model = TabNetClassifier(
                cat_dims=cat_dims,
                cat_emb_dim=cat_emb_dim,
                cat_idxs=cat_idxs,
                **model_params,
            )

        # NOTE(review): X_valid/y_valid kwargs are the legacy pytorch-tabnet
        # fit signature — confirm the pinned library version.
        model.fit(
            X_train=x_train.values,
            y_train=y_train.reshape(-1, 1),
            X_valid=x_valid.values,
            y_valid=y_valid.reshape(-1, 1),
            **train_params,
        )

        best_score = {"valid_score": model.losses_valid}

        return model, best_score
Example #14
0
def run(try_num, config):
    """End-to-end TabNet training over seeds x folds for the MoA data.

    Trains one TabNetRegressor per seed/fold (unless ``--only_pred``),
    saves the weights, reloads them for prediction, and writes OOF
    predictions plus the final submission CSV under the run's model dir.
    """
    args = get_args()

    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-02-tabnet-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        # Shrink the data and schedule for a quick smoke run.
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat([dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(dict(
            n_folds=3,
            seeds=[222],
            n_epochs=3,
            batch_size=128,
        ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(config, model_dir, train_features,
                                                              train_targets, test_features,
                                                              dae_features)
    # Keep only model features: drop id and the cp_* metadata columns.
    features_columns = [col for col in train_features.columns
                        if col not in ['sig_id', 'cp_type', 'cp_time', 'cp_dose',
                                       'cp_type_ctl_vehicle', 'cp_type_trt_cp']]

    train_features = train_features[features_columns]
    test_features = test_features[features_columns]

    smooth_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42, shuffle=True)

    # One OOF slab per seed; averaged across seeds at the end.
    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = []

    for seed_index, seed in enumerate(config.seeds):
        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(kfold.split(
            train_targets[target_columns].values,
            train_targets[target_columns].values
        )):
            print(f'Train fold {fold_index + 1}', flush=True)
            x_train = train_features.loc[train_indices, features_columns].values
            y_train = train_targets.loc[train_indices, target_columns].values
            x_val = train_features.loc[val_indices, features_columns].values
            y_val = train_targets.loc[val_indices, target_columns].values

            weights_path = f'{model_dir}/weights-{seed}-{fold_index}.pt'

            tabnet_conf = dict(
                seed=seed,
                optimizer_fn=optim.Adam,
                scheduler_fn=optim.lr_scheduler.ReduceLROnPlateau,
                n_d=32,
                n_a=32,
                n_steps=1,
                gamma=1.3,
                lambda_sparse=0,
                momentum=0.02,
                optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
                mask_type="entmax",
                verbose=10,
                n_independent=1,
                n_shared=1,
            )

            if args.only_pred:
                print('Skip training', flush=True)
            else:
                model = TabNetRegressor(**tabnet_conf)

                model.fit(
                    X_train=x_train,
                    y_train=y_train,
                    eval_set=[(x_val, y_val)],
                    eval_name=['val'],
                    eval_metric=['logits_ll'],
                    max_epochs=config.n_epochs,
                    patience=20,
                    batch_size=1024,
                    virtual_batch_size=32,
                    num_workers=1,
                    drop_last=True,
                    loss_fn=smooth_loss_function
                )

                model.save_model(weights_path)
                print('Save weights to: ', weights_path, flush=True)

            # Always reload from disk so --only_pred runs use saved weights
            # (save_model appends ".zip" to the given path).
            model = TabNetRegressor(**tabnet_conf)
            model.load_model(f'{weights_path}.zip')

            val_preds = sigmoid(model.predict(x_val))
            score = mean_log_loss(y_val, val_preds, n_targets)
            print(f'fold_index {fold_index}   -   val_loss: {score:5.5f}', flush=True)

            oof_preds[val_indices, seed_index, :] = val_preds

            preds = sigmoid(model.predict(test_features.values))
            test_preds.append(preds)

        score = mean_log_loss(train_targets[target_columns].values, oof_preds[:, seed_index, :], n_targets)
        print(f'Seed {seed}   -   val_loss: {score:5.5f}', flush=True)

    # Average OOF predictions over seeds and report the overall score.
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets[target_columns].values, oof_preds, n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    oof_pred_df = train_targets.copy()
    oof_pred_df.loc[:, target_columns] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Zero out predictions for control-vehicle rows in the submission.
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = np.mean(test_preds, axis=0)
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
Example #15
0
def train_tabnet(x_train, y_train, x_test, submission, feature_cols,
                 target_cols, seeds, nfolds, save_path):
    """Train one TabNetRegressor per (seed, fold) and collect test predictions.

    For every seed, folds are read from a precomputed ``kfold_{seed}`` column
    in ``x_train`` (not recomputed here). Each fold trains a multi-label
    TabNet model with a smoothed BCE-with-logits loss, saves its weights and
    per-fold prediction CSVs, and accumulates sigmoid-activated predictions.

    Args:
        x_train: training-features DataFrame; must contain one
            ``kfold_{seed}`` column per seed plus all ``feature_cols``.
        y_train: training-targets DataFrame (one binary column per target).
        x_test: test-features DataFrame; must contain ``sig_id`` and
            all ``feature_cols``.
        submission: NOTE(review): accepted but never used in this function —
            confirm whether the parameter can be dropped at the call sites.
        feature_cols: list of feature column names.
        target_cols: list of target column names.
        seeds: iterable of integer seeds (one CV scheme per seed).
        nfolds: number of folds per seed.
        save_path: directory where per-fold model weights are written.

    Returns:
        np.ndarray of shape (n_seeds * nfolds, n_test_rows, n_targets) with
        sigmoid-activated test predictions of every trained model.
    """

    cfg_fe = Config_FeatureEngineer()
    # Global reproducibility; the per-model seed is set separately below.
    seed_everything(seed_value=cfg_fe.seed)

    cfg_tabnet = Config_TabNet()

    test_cv_preds = []
    oof_preds = []
    scores = []

    for seed in seeds:
        # Fold assignments for this seed were written into this column upstream.
        kfold_col = f'kfold_{seed}'
        print("seed: {}".format(seed))
        print('*' * 60)

        for fold in range(nfolds):

            # Zero-filled frame with the same shape/index as y_train, used
            # below to store this fold's predictions before dumping to CSV.
            oof_preds_fold = y_train.copy()
            oof_preds_fold.iloc[:, :] = 0

            print('*' * 60)
            print("FOLD: {}".format(fold + 1))
            print('*' * 60)

            # NOTE(review): trn_idx / val_idx are computed but never used
            # below — candidates for removal.
            trn_idx = x_train[x_train[kfold_col] != fold].index
            val_idx = x_train[x_train[kfold_col] == fold].index

            train_df = x_train[x_train[kfold_col] != fold].reset_index(
                drop=True)
            valid_df = x_train[x_train[kfold_col] == fold].reset_index(
                drop=True)

            x_tr, y_tr = train_df[feature_cols].values, train_df[
                target_cols].values
            x_val, y_val = valid_df[feature_cols].values, valid_df[
                target_cols].values

            # tabnet model (all hyperparameters come from Config_TabNet;
            # only the random seed varies per outer loop)
            model = TabNetRegressor(
                n_d=cfg_tabnet.n_d,
                n_a=cfg_tabnet.n_a,
                n_steps=cfg_tabnet.n_steps,
                n_independent=cfg_tabnet.n_independent,
                n_shared=cfg_tabnet.n_shared,
                gamma=cfg_tabnet.gamma,
                lambda_sparse=cfg_tabnet.lambda_sparse,
                optimizer_fn=cfg_tabnet.optimizer_fn,
                optimizer_params=cfg_tabnet.optimizer_params,
                mask_type=cfg_tabnet.mask_type,
                scheduler_params=cfg_tabnet.scheduler_params,
                scheduler_fn=cfg_tabnet.scheduler_fn,
                seed=seed,
                verbose=cfg_tabnet.verbose)

            # fit model
            model.fit(
                X_train=x_tr,
                y_train=y_tr,
                eval_set=[(x_val, y_val)],
                eval_name=["val"],
                eval_metric=["logits_ll"],
                max_epochs=cfg_tabnet.max_epochs,
                patience=cfg_tabnet.fit_patience,
                batch_size=cfg_tabnet.batch_size,
                virtual_batch_size=cfg_tabnet.virtual_batch_size,
                num_workers=1,
                drop_last=False,
                # To use binary cross entropy because this is not a regression problem
                loss_fn=BCEwLogitsSmooth(smooth=cfg_tabnet.labelsmooth_rate))

            print('-' * 60)

            # save model
            model.save_model(
                os.path.join(save_path, f"TabNet_seed{seed}_FOLD{fold}"))
            print('*' * 60)

            # Predict on validation
            preds_val = model.predict(x_val)
            # Apply sigmoid to the predictions (model outputs raw logits)
            preds = 1 / (1 + np.exp(-preds_val))
            # Best (lowest) validation logits log-loss seen during training.
            score = np.min(model.history["val_logits_ll"])

            oof_preds.append(preds)
            scores.append(score)

            # Save OOF for CV
            # NOTE(review): despite the name, this predicts on ALL training
            # rows (in-fold + out-of-fold), so the saved CSV is not a true
            # OOF file — confirm this is intentional before using it for CV.
            preds_tr = model.predict(x_train[feature_cols].values)
            preds = 1 / (1 + np.exp(-preds_tr))
            oof_preds_fold.loc[:, target_cols] = preds
            oof_preds_fold.to_csv(
                path_or_buf=f"./TabNet_oof_preds_seed{seed}_FOLD{fold}.csv",
                sep=',',
                index=False)

            # Predict on test
            preds_test = model.predict(x_test[feature_cols].values)
            preds_test = 1 / (1 + np.exp(-preds_test))
            test_cv_preds.append(preds_test)
            test_cv_preds_fold = pd.DataFrame(preds_test, columns=target_cols)
            test_cv_preds_fold["sig_id"] = x_test["sig_id"]
            test_cv_preds_fold.to_csv(
                path_or_buf=f"./TabNet_test_preds_seed{seed}_FOLD{fold}.csv",
                sep=',',
                index=False)

    # NOTE(review): oof_preds_all is built but neither returned nor used.
    oof_preds_all = np.concatenate(oof_preds)
    test_preds_all = np.stack(test_cv_preds)
    print("Averaged Best Score for CVs is: {}".format(np.mean(scores)))

    return test_preds_all
Example #16
0
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-1),
            scheduler_params={
                "step_size": 10,
                "gamma": 0.9
            },  # how to use learning rate scheduler
            scheduler_fn=torch.optim.lr_scheduler.StepLR,
            mask_type="sparsemax",
        )

        clf.fit(
            X_train=X_trn,
            y_train=y_trn,
            eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
            eval_name=["train", "valid"],
            eval_metric=["rmse"],
            max_epochs=2000,
            patience=50,
            batch_size=128,
            virtual_batch_size=128,
            num_workers=0,
        )

        fold_preds = clf.predict(X_vld).astype(np.float64)[:, 0]
        _test_preds.append(clf.predict(X_tst)[:, 0])
        oof[vld_index] = fold_preds
        scores.append(np.sqrt(mean_squared_error(y_vld, fold_preds)))
        importances = pd.concat(
            [
                importances,
                pd.DataFrame({
                    "feature": feat_cols,
def objective(trial):
    """Optuna objective: tune TabNet hyperparameters on multilabel CV.

    Samples n_d, n_a, clip_value and gamma (other hyperparameters are
    commented out / fixed), short-circuits exact duplicate parameter sets by
    returning the previous trial's value, then runs a 7-fold
    MultilabelStratifiedKFold CV and returns the mean of the best
    per-fold validation log-losses (lower is better).

    Relies on module-level globals: ``train``, ``target``, ``MAX_EPOCH``,
    ``SmoothBCEwLogits`` and the color-code strings ``m_, b_, r_, y_, g_, c_``.
    """
    SEED = 25  # [20,21,22]
     # all hyperparameters here
#     mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_d = trial.suggest_int("n_d", 8, 32, step=8)
    n_a = trial.suggest_int("n_a", 32, 64, step=8)
    #n_steps = trial.suggest_int("n_steps", 1, 3, step=1)

    #n_shared = trial.suggest_int("n_shared", 1, 2)
    #n_independent = trial.suggest_int("n_independent", 1, 2, step=1)
    clip_value = trial.suggest_int("clip_value", 1, 2, step=1)

    gamma = trial.suggest_float("gamma", 1., 1.6, step=0.2)
    #lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    #batch_size = trial.suggest_int("batch_size", 512, 1024, step=256)
    #momentum = trial.suggest_float("momentum", 0.02, 0.1, step=0.02)

    #factor = trial.suggest_float("factor", 0.5, 0.9,step=0.1)

    # Skip re-evaluating a parameter set an earlier completed trial already ran.
    # NOTE(review): ``optuna.structs`` is deprecated in newer Optuna releases
    # (use ``optuna.trial.TrialState``) — confirm the pinned Optuna version.
    for t in trial.study.trials:
        if t.state != optuna.structs.TrialState.COMPLETE:
            continue

        if t.params == trial.params:
            return t.value  # Return the previous value without re-evaluating it.
    
    tabnet_params = dict(n_d=n_d, 
                         n_a=n_a, 
                         n_steps=1,
                         gamma=gamma,
                         n_shared=1,
                         n_independent=1,
                         lambda_sparse=0,
                         #momentum = momentum,
                         clip_value=clip_value,
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type="entmax", 
                         scheduler_params=dict(mode="min",
                                               patience=5,
                                               min_lr=1e-5,
                                               factor=0.5,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=10,
                         seed=SEED,
                         )
    print(m_,'params:',tabnet_params)
    scores_auc_all = []
    test_cv_preds = []

    NB_SPLITS = 7 # 7
    mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)

    oof_preds = []
    oof_targets = []
    scores = []
    scores_auc = []
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, target)):
        print(b_,"FOLDS: ", r_, fold_nb + 1)
        print(b_, "FOLDS: ", r_, fold_nb + 1, y_, 'seed:', tabnet_params['seed'])
        print(g_, '*' * 60, c_)

        X_train, y_train = train.values[train_idx, :], target.values[train_idx, :]
        X_val, y_val = train.values[val_idx, :], target.values[val_idx, :]
        ### Model ###
        model =  TabNetRegressor(**tabnet_params)
       
        ### Fit ###
        model.fit(
            X_train = X_train,
            y_train = y_train,
            eval_set = [(X_val, y_val)],
            eval_name = ["val"],
            eval_metric = ["logits_ll"],
            max_epochs = MAX_EPOCH,
            patience = 20,
            batch_size = 512, #1024
            virtual_batch_size = 64,#32
            num_workers = 1,
            drop_last = False,
            # To use binary cross entropy because this is not a regression problem
            loss_fn = SmoothBCEwLogits(smoothing =0.0005)
        )
        print(y_, '-' * 60)

        ### Predict on validation ###
        preds_val = model.predict(X_val)
        # Apply sigmoid to the predictions (the regressor outputs raw logits)
        preds = 1 / (1 + np.exp(-preds_val))
        # Best (lowest) validation logits log-loss over all epochs.
        score = np.min(model.history["val_logits_ll"])

        ### Save OOF for CV ###
        oof_preds.append(preds_val)
        oof_targets.append(y_val)
        scores.append(score)

        ### Predict on test ###
        #preds_test = model.predict(X_test)
        #test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    #oof_preds_all = np.concatenate(oof_preds)
    #oof_targets_all = np.concatenate(oof_targets)
    #test_preds_all = np.stack(test_cv_preds)
    return np.mean(scores)
# Top-level training loop: for each seed, run multilabel-stratified CV and
# collect per-fold validation predictions/targets/scores into the
# module-level lists ``oof_preds``, ``oof_targets`` and ``scores``.
# Relies on globals defined earlier: ``mskf``, ``train``,
# ``train_targets_scored``, ``tabnet_params``, ``MAX_EPOCH``.
# NOTE(review): ``seed`` is only printed — it is never written into
# ``tabnet_params`` or ``mskf``, so both iterations train identical models;
# confirm whether ``tabnet_params['seed'] = seed`` was intended.
for seed in [0,1]:
  print('Seed',seed)

  for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, train_targets_scored)):
      print("FOLDS : ", fold_nb)

      ## model
      X_train, y_train = train.values[train_idx, :], train_targets_scored.values[train_idx, :]
      X_val, y_val = train.values[val_idx, :], train_targets_scored.values[val_idx, :]
      model = TabNetRegressor(**tabnet_params)

      model.fit(X_train=X_train,
                y_train=y_train,
                eval_set=[(X_val, y_val)],
                eval_name = ["val"],
                eval_metric = ["logits_ll"],
                max_epochs=MAX_EPOCH,
                patience=20, batch_size=1024, virtual_batch_size=128,
                num_workers=1, drop_last=False,
                # use binary cross entropy as this is not a regression problem
                loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

      preds_val = model.predict(X_val)
      # Apply sigmoid to the predictions (model outputs raw logits)
      preds =  1 / (1 + np.exp(-preds_val))
      # Best (lowest) validation logits log-loss over training epochs.
      score = np.min(model.history["val_logits_ll"])
  #     name = cfg.save_name + f"_fold{fold_nb}"
  #     model.save_model(name)
      ## save oof to compute the CV later
      # NOTE(review): raw logits (preds_val), not sigmoid ``preds``, are
      # stored — downstream log-loss computations would need the sigmoid.
      oof_preds.append(preds_val)
      oof_targets.append(y_val)
      scores.append(score)
Example #19
0
class MyTabNetRegressorModel(BaseModel):
    """Thin BaseModel wrapper around pytorch-tabnet's TabNetRegressor.

    Paramters
    ---------
    ref: https://dreamquark-ai.github.io/tabnet/generated_docs/README.html#model-parameters
    model_params:
        n_d:default=8(range 8 to 64)
        n_a:default=8
        n_steps:default=3(range 3 to 10)
        gamma:default=1.3(range 1.0 to 2.0)
        n_independent:default=2(range 1 to 5)
        n_shared:default=2(range 1 to 5)
        lambda_sparse:default=1e-3
        optimizer_fn:default=Adam
        optimizer_params:default=(lr=2e-2, weight_decay=None),
        mask_type:default=sparsemax or entmax
        scheduler_params:dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
        seed: default=0
        verbose=5,
        cat_dims=cat_dims, cat_idxs=cat_idx, cat_emb_dim=1

    fit_params:
        max_epochs:default=200
        patience:default=15
        loss_fn(torch.loss or list of torch.loss):default to mse for regression and cross entropy for classification
        eval_metric(list or str)
        batch_size:default=1024
        virtual_batch_size:default=128
        pretrain_ratio

    ### Example use:
        >>>nunique = train_feat_df.nunique()
        >>>types = train_feat_df.dtypes
        >>>categorical_columns = []
        >>>categorical_dims = {}
        >>>train_feat_df["is_train"] = 1
        >>>test_feat_df["is_train"] = 0
        >>>all_df = pd.concat([train_feat_df, test_feat_df])
        >>>for col in train_feat_df.drop(["is_train"], axis=1).columns:
            if str(types[col]) == 'category' or nunique[col] < 200:
                l_enc = LabelEncoder()
                all_df[col] = l_enc.fit_transform(all_df[col].values)
                all_df[col] = all_df[col].astype("category")
                categorical_columns.append(col)
                categorical_dims[col] = len(l_enc.classes_ )

        >>>cat_idx = [i for i, f in enumerate(train_feat_df.columns.tolist()) if f in categorical_columns]
        >>>cat_dims = [categorical_dims[f] for i, f in enumerate(train_feat_df.columns.tolist()) if f in categorical_columns]
    """
    def __init__(self, model_params, fit_params: Optional[Dict]):
        """Store model/fit parameter dicts; a missing fit_params becomes {}."""
        self.model_params = model_params
        self.fit_params = fit_params
        if self.fit_params is None:
            self.fit_params = {}

    def build_model(self):
        """Instantiate a fresh TabNetRegressor from model_params."""
        self.model = TabNetRegressor(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        """Build and train the model; returns the fitted TabNetRegressor.

        train_x / valid_x are expected to be pandas DataFrames (``.values``
        is taken). The validation pair is optional: previously the method
        crashed with AttributeError on the declared ``valid_x=None`` default;
        now it simply trains with a train-only eval set in that case.
        """
        train_x = train_x.values
        eval_set = [(train_x, train_y)]
        eval_name = ["train"]
        if valid_x is not None:
            # Only add the validation split when one was actually supplied.
            eval_set.append((valid_x.values, valid_y))
            eval_name.append("valid")
        self.model = self.build_model()
        self.model.fit(train_x,
                       train_y,
                       eval_set=eval_set,
                       eval_name=eval_name,
                       **self.fit_params)
        return self.model

    def predict(self, est, valid_x):
        """Predict with a fitted estimator ``est`` on DataFrame ``valid_x``."""
        valid_x = valid_x.values
        preds = est.predict(valid_x)
        return preds
Example #20
0
                      cat_emb_dim=cat_emb_dim,
                      cat_idxs=cat_idxs)

# Convert pandas objects to NumPy for TabNet; targets are reshaped to
# 2-D column vectors as TabNetRegressor expects (n_samples, n_targets).
X_train_1 = X_train_1.values
X_valid_1 = X_valid_1.values
Y_train_1 = Y_train_1.values.reshape(-1, 1)
Y_valid_1 = Y_valid_1.values.reshape(-1, 1)

max_epochs = 1000

# Train with early stopping (patience=50) monitoring MAE and MSE on both
# the train and validation splits; batch sizes are left at library defaults.
clf.fit(
    X_train=X_train_1,
    y_train=Y_train_1,
    eval_set=[(X_train_1, Y_train_1), (X_valid_1, Y_valid_1)],
    eval_name=['train', 'valid'],
    eval_metric=['mae', 'mse'],
    max_epochs=max_epochs,
    patience=50,
    # batch_size=1024,
    # virtual_batch_size=128,
    num_workers=0,
    drop_last=False)

X_test = X_test.values
preds = clf.predict(X_test)
# NOTE(review): bare expression — a notebook display artifact with no effect
# when run as a script.
preds[:48]

sub = pd.DataFrame(preds)

# Derived submission columns as fixed fractions of the raw prediction.
# NOTE(review): the 0.6/0.7/0.8 factors look like hand-tuned magic numbers —
# confirm their origin before reusing.
sub['1'] = sub[0] * 0.6
sub['2'] = sub[0] * 0.7
sub['3'] = sub[0] * 0.8
    def run_model(self, train_df, targets, X_test):
        """
        Run model.

        Trains one TabNetRegressor per MultilabelStratifiedKFold fold using a
        BCE-with-logits loss, then reports the per-target OOF AUC (computed
        on raw logits, which is valid since AUC is rank-based and the sigmoid
        is monotonic) and the mean of the best per-fold validation losses.

        Args:
          train_df (dataframe): training inputs with dimensions
            [n_observations,n_features]
          targets (dataframe): updated input data of known responses (binary) from MoA targets for train data
          X_test (arr): test inputs with dimensions
            [n_observations,n_features]

        Returns:
          arr: predicted outputs with dimensions
            [n_splits_kfold,n_observations,n_moa_targets]
        """
        test_cv_preds = []
        oof_preds = []
        oof_targets = []
        scores = []

        # Fixed random_state so fold assignments are reproducible across runs.
        mskf = MultilabelStratifiedKFold(n_splits=self.config.NB_SPLITS,
                                         random_state=0,
                                         shuffle=True)

        for fold_nb, (train_idx,
                      val_idx) in enumerate(mskf.split(train_df, targets)):
            print("FOLDS: ", fold_nb + 1)
            print('*' * 60)

            X_train, y_train = train_df.values[train_idx, :], targets.values[
                train_idx, :]
            X_val, y_val = train_df.values[val_idx, :], targets.values[
                val_idx, :]

            model = TabNetRegressor(**self.tabnet_params)

            model.fit(X_train=X_train,
                      y_train=y_train,
                      eval_set=[(X_val, y_val)],
                      eval_name=["val"],
                      eval_metric=["logits_ll"],
                      max_epochs=self.config.MAX_EPOCH,
                      patience=20,
                      batch_size=1024,
                      virtual_batch_size=32,
                      num_workers=1,
                      drop_last=False,
                      # BCE-with-logits: this is multilabel classification,
                      # not a true regression problem.
                      loss_fn=F.binary_cross_entropy_with_logits)
            print('-' * 60)

            # Raw logits; no sigmoid needed for the rank-based AUC below
            # (a previously-computed sigmoid copy was dead code and removed).
            preds_val = model.predict(X_val)
            # Best (lowest) validation logits log-loss over training epochs.
            score = np.min(model.history["val_logits_ll"])

            oof_preds.append(preds_val)
            oof_targets.append(y_val)
            scores.append(score)

            # Test predictions are converted to probabilities via sigmoid.
            preds_test = model.predict(X_test)
            test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

        oof_preds_all = np.concatenate(oof_preds)
        oof_targets_all = np.concatenate(oof_targets)
        test_preds_all = np.stack(test_cv_preds)

        # Per-target AUC over all OOF predictions, averaged across targets.
        aucs = []
        for task_id in range(oof_preds_all.shape[1]):
            aucs.append(
                roc_auc_score(y_true=oof_targets_all[:, task_id],
                              y_score=oof_preds_all[:, task_id]))

        print(f"Overall AUC: {np.mean(aucs)}")
        print(f"Average CV: {np.mean(scores)}")

        return test_preds_all