def pretrain_model(X, y):
    # NOTE: y is accepted but never used; TabNet pretraining is self-supervised.
    tabnet_params = dict(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        seed=SEED,
        lambda_sparse=1e-3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        mask_type="entmax",
        scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=10,
    )
    pretrainer = TabNetPretrainer(**tabnet_params)
    pretrainer.fit(
        X_train=X.values,
        eval_set=[X.values],  # monitors reconstruction on the training data itself
        max_epochs=200,
        patience=20,
        batch_size=30000,
        virtual_batch_size=3000,
        num_workers=1,
        drop_last=True,
    )
    return pretrainer
class MyPretrainTabnet:
    def __init__(self, model_params, fit_params):
        self.model_params = model_params
        self.fit_params = fit_params
        self.model = None

    def fit(self, train_x, valid_x):
        self.model = TabNetPretrainer(**self.model_params)
        # eval_set holds the validation fold, so label it "valid" (not "train")
        self.model.fit(train_x, eval_set=[valid_x], eval_name=["valid"], **self.fit_params)
        return self.model

    def pretrain_run(self, name: str, train_x: pd.DataFrame, train_y: np.ndarray, cv,
                     output_dir: str = "./"):
        va_idxes = []
        for cv_num, (trn_idx, val_idx) in enumerate(cv):
            print(decorate("start pretraining"))
            tr_x, va_x = train_x.values[trn_idx], train_x.values[val_idx]
            # tr_y, va_y = train_y[trn_idx], train_y[val_idx]
            va_idxes.append(val_idx)
            model = self.fit(tr_x, va_x)
            model_name = f"{name}_FOLD{cv_num}_pretrain_model"
            model.save_model(output_dir + model_name)
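# --- Usage sketch (added; not from the original source): drives MyPretrainTabnet
# with a KFold splitter. train_x (DataFrame), train_y (ndarray), and SEED are assumed
# to exist; the parameter values below are illustrative only.
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=SEED).split(train_x)
runner = MyPretrainTabnet(
    model_params=dict(n_d=8, n_a=8, seed=SEED, optimizer_fn=torch.optim.Adam),
    fit_params=dict(max_epochs=100, patience=10, batch_size=1024, virtual_batch_size=128),
)
runner.pretrain_run("tabnet", train_x, train_y, cv, output_dir="./models/")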
def _add_tabnet_unsupervised_model_to_fit_params(self, pipeline, fit_params, X_train, X_val):
    # Only applies when the pipeline's final estimator is a TabNet model.
    steps = pipeline.steps
    if steps[-1][1].__class__ not in [TabNetClassifier, TabNetRegressor]:
        return fit_params
    eval_X_train = self._preprocess_x(pipeline, X_train, X_train)
    eval_X_val = self._preprocess_x(pipeline, X_train, X_val)
    logger.info('tabnet unsupervised pre-training')
    # Collect the final step's fit params (dropping the "<step>__" prefix),
    # excluding keys that only make sense for supervised training.
    _uns_fit_params = {}
    for k, v in fit_params.items():
        if f'{steps[-1][0]}__' not in k:
            continue
        if 'from_unsupervised' in k or 'eval' in k:
            continue
        _uns_fit_params[k.replace(f'{steps[-1][0]}__', '')] = v
    unsupervised_model = TabNetPretrainer()
    unsupervised_model.fit(X_train=eval_X_train, eval_set=[eval_X_val], **_uns_fit_params)
    # Hand the fitted pretrainer to the supervised model via from_unsupervised.
    fit_params[f'{steps[-1][0]}__from_unsupervised'] = unsupervised_model
    return fit_params
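# --- Calling sketch (added; names are illustrative, not from the original source).
# fit_params keys follow sklearn's "<step>__<param>" Pipeline convention that the
# helper above strips; this would run from inside the owning class, with X_train,
# X_val, y_val assumed to exist.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([('scaler', StandardScaler()), ('tabnet', TabNetClassifier())])
fit_params = {
    'tabnet__max_epochs': 100,             # forwarded to the pretrainer as max_epochs=100
    'tabnet__patience': 10,                # forwarded as patience=10
    'tabnet__eval_set': [(X_val, y_val)],  # 'eval' keys are skipped for pretraining
}
fit_params = self._add_tabnet_unsupervised_model_to_fit_params(pipeline, fit_params, X_train, X_val)
# fit_params now also carries 'tabnet__from_unsupervised' -> the fitted TabNetPretrainer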
def crossval_and_predict(self, n_folds: int, df: pd.DataFrame, df_test: pd.DataFrame,
                         feature_col: list, target_col: str, model_params: dict):
    oof = np.zeros(len(df))
    cv_preds = np.zeros(len(df_test))
    kfold = KFold(n_splits=n_folds, random_state=self.random_state, shuffle=True)
    for train_idx, valid_idx in kfold.split(df):
        X_train = df[feature_col].values[train_idx]
        y_train = df[target_col].values[train_idx].reshape(-1, 1)
        X_valid = df[feature_col].values[valid_idx]
        y_valid = df[target_col].values[valid_idx].reshape(-1, 1)
        X_test = df_test[feature_col].values

        params = self.default_params()
        params['seed'] = self.random_state
        params['n_d'] = model_params['n_d']
        params['n_a'] = model_params['n_d']  # n_a deliberately tied to n_d, a common TabNet heuristic
        params['gamma'] = model_params['gamma']
        params['momentum'] = model_params['momentum']
        params['n_steps'] = model_params['n_steps']
        params['n_shared'] = model_params['n_shared']
        params['n_independent'] = model_params['n_independent']
        logging.info(f'Parameters used for TabNet supervised training: {params}')

        # Self-supervised pretraining on this fold's features
        unsupervised_model = TabNetPretrainer(**params)
        unsupervised_model.fit(X_train=X_train, eval_set=[X_valid],
                               pretraining_ratio=0.5, max_epochs=20)

        # Supervised fine-tuning warm-started from the pretrainer
        model = TabNetRegressor(**params)
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_name=['valid'],
                  eval_metric=['rmse'],
                  max_epochs=100,
                  patience=10,
                  batch_size=1024,
                  from_unsupervised=unsupervised_model)

        oof[valid_idx] = model.predict(X_valid).squeeze()
        cv_preds += model.predict(X_test).squeeze() / n_folds
        logging.info(f'Finished fold with score {rmse(y_valid, oof[valid_idx])}')
    rmse_score = rmse(df[target_col], oof)
    return rmse_score, cv_preds
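# --- Sketch of the rmse helper used above (added; the source does not show it).
# A minimal definition consistent with how it is called on arrays and Series:
import numpy as np

def rmse(y_true, y_pred):
    # root-mean-squared error over flattened inputs
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))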
def main():
    # Load data
    data, test_data, cat_col_names, num_col_names = data_load()
    cat_dims = data[cat_col_names].nunique().to_list()
    cat_idxs = [(cat_col_names + num_col_names).index(cat_col) for cat_col in cat_col_names]
    # Embedding sizes: clip(cardinality / 2, 1, 50). np.int was removed in recent
    # NumPy, so plain int is used. (An earlier log-based rule was overwritten:)
    # cat_emb_dims = np.ceil(np.log(cat_dims)).astype(int).tolist()
    cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1, a_max=50)).astype(int).tolist()
    FEATURES = cat_col_names + num_col_names
    df_sub = pd.read_csv('Data/sample_submission.csv')
    bsize = 2500 * 2  # unused in this snippet

    # ########## Define the Configs ############
    N_D = 16
    N_A = 16
    N_INDEP = 2
    N_SHARED = 2
    N_STEPS = 1  # 2
    MASK_TYPE = "sparsemax"
    GAMMA = 1.5
    BS = 512
    MAX_EPOCH = 21  # 20
    PRETRAIN = True

    X = data[FEATURES].values
    y = data["target"].values
    X_test = test_data[FEATURES].values

    if PRETRAIN:
        pretrain_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            gamma=GAMMA,
            lambda_sparse=0.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            mask_type=MASK_TYPE,
            scheduler_params=dict(mode="min", patience=3, min_lr=1e-5, factor=0.5),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1,
        )
        pretrainer = TabNetPretrainer(**pretrain_params)
        # Pretrain on the unlabeled test features, monitor reconstruction on train
        pretrainer.fit(
            X_train=X_test,
            eval_set=[X],
            max_epochs=MAX_EPOCH,
            patience=25,
            batch_size=BS,
            virtual_batch_size=BS,  # 128
            num_workers=0,
            drop_last=True,
            pretraining_ratio=0.5,  # the bigger the pretraining_ratio, the harder the reconstruction
        )

    # Supervised CV training
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)
    BS = 2048
    MAX_EPOCH = 20
    data['oof_preds'] = np.nan
    for fold_nb, (train_index, valid_index) in enumerate(cv.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        tabnet_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            gamma=GAMMA,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            lambda_sparse=1e-5,
            seed=0,
            clip_value=2,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            mask_type=MASK_TYPE,
            device_name='auto',
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2, weight_decay=1e-5),
            scheduler_params=dict(
                max_lr=5e-2,
                steps_per_epoch=int(X_train.shape[0] / BS),
                epochs=MAX_EPOCH,
                # final_div_factor=100,
                is_batch_level=True,  # OneCycleLR must step every batch
            ),
            scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
            # (alternative: ReduceLROnPlateau with mode='max', factor=0.5, patience=5)
            verbose=1,
        )
        # Defining TabNet model
        model = TabNetClassifier(**tabnet_params)
        model.fit(
            X_train=X_train,
            y_train=y_train,
            from_unsupervised=pretrainer if PRETRAIN else None,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=["train", "valid"],
            eval_metric=["auc"],
            batch_size=BS,
            virtual_batch_size=256,
            max_epochs=MAX_EPOCH,
            drop_last=True,
            pin_memory=True,
            patience=10,
        )
        val_preds = model.predict_proba(X_valid)[:, -1]
        print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))
        # write through .loc to avoid pandas chained-assignment issues
        data.loc[data.index[valid_index], 'oof_preds'] = val_preds
        test_preds = model.predict_proba(X_test)[:, -1]
        df_sub[f"fold_{fold_nb + 1}"] = test_preds

    # Average the fold predictions and write the submission files
    df_sub["target"] = df_sub.filter(like="fold_").mean(axis=1).values
    df_sub.to_csv("Analysis/submission_5_tabnet.csv", index=False)
    df_sub = pd.read_csv("Analysis/submission_5_tabnet.csv")
    # df_sub.target = df_sub.target.map(lambda x: 0 if x <= 0.5 else 1)
    df_sub.loc[:, ["id", "target"]].to_csv("Analysis/submission_5_2_tabnet.csv", index=False)
"result_score", ] feat_cols = [c for c in train.columns if c not in no_feature_cols] from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X = scaler.fit_transform(train[feat_cols].values) X_tst = scaler.transform(test[feat_cols].values) y_binary = train.result.astype("float64").copy().values X_trn, X_vld, _, _ = train_test_split(X, y_binary, random_state=42, shuffle=True) # TabNetPretrainer unsupervised_model = TabNetPretrainer( optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=2e-2), mask_type="sparsemax", # "sparsemax" ) max_epochs = 1000 unsupervised_model.fit( X_train=X_trn, eval_set=[X_vld], max_epochs=max_epochs, patience=10, batch_size=512, virtual_batch_size=64, num_workers=0, drop_last=False, pretraining_ratio=0.8, )
import pandas as pd
import torch
from matplotlib import pyplot as plt
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# -- settings
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    mask_type='entmax',  # alternative: "sparsemax"
    device_name='cuda',
)

clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    scheduler_params={"step_size": 7,  # how to use learning rate scheduler
                      "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',  # this will be overwritten when loading from a pretrained model
    device_name='cuda',
)
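# --- Handoff sketch (added; X_train/y_train/X_valid/y_valid are assumed to exist):
# pretrain on the raw features, then warm-start the classifier from the pretrainer
# via from_unsupervised, the flow the pytorch-tabnet README documents.
unsupervised_model.fit(
    X_train=X_train,
    eval_set=[X_valid],
    pretraining_ratio=0.8,  # fraction of features masked for reconstruction
)
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric=['auc'],
    from_unsupervised=unsupervised_model,  # clf's mask_type is replaced by the pretrainer's
)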
def fit(self, x_train, y_train, kf_splits=5, tabnet_type=None):
    def _get_tabnet_params(tabnet_type):
        if tabnet_type is None:
            tabnet_params = dict(
                verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
                scheduler_params=dict(max_lr=0.05,
                                      steps_per_epoch=x_train.shape[0] // 128,
                                      epochs=300),
                scheduler_fn=torch.optim.lr_scheduler.OneCycleLR)
            fit_params = dict(batch_size=1024,
                              virtual_batch_size=128,
                              eval_metric='accuracy')
        elif tabnet_type == 'TabNet-S':
            tabnet_params = dict(
                n_d=8,
                n_a=8,
                lambda_sparse=0.0001,
                momentum=0.1,
                n_steps=3,
                gamma=1.2,
                verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=0.01),
                scheduler_params=dict(step_size=8000, gamma=0.05),
                scheduler_fn=torch.optim.lr_scheduler.StepLR)
            fit_params = dict(batch_size=4096,
                              virtual_batch_size=256,
                              eval_metric='mse')
        else:
            print('[ERROR] Unknown tabnet_type: {}'.format(tabnet_type))
            quit()

        # --- infer the problem type from the eval metric ---
        if fit_params['eval_metric'] in ['auc', 'accuracy', 'balanced_accuracy', 'logloss']:
            problem = 'classification'
        elif fit_params['eval_metric'] in ['mse', 'mae', 'rmse', 'rmsle']:
            problem = 'regression'
        else:
            print('[ERROR] Unknown eval_metric: {}'.format(fit_params['eval_metric']))
            quit()
        return tabnet_params, fit_params, problem

    kf = KFold(n_splits=kf_splits, shuffle=False)
    scores = []
    self.tabnet_models = []
    tabnet_params, fit_params, problem = _get_tabnet_params(tabnet_type)
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # problem is guaranteed to be one of the two values after _get_tabnet_params
        if problem == 'classification':
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetClassifier(**tabnet_params)
        else:  # regression
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetRegressor(**tabnet_params)

        x_tr = x_train[train_index]
        x_val = x_train[val_index]
        y_tr = y_train[train_index]
        y_val = y_train[val_index]

        # Self-supervised pretraining, then supervised fine-tuning from it
        unsupervised_model.fit(x_tr,
                               eval_set=[x_val],
                               patience=300,
                               max_epochs=5000,
                               pretraining_ratio=0.8)
        tabnet_model.fit(x_tr,
                         y_tr,
                         eval_set=[(x_val, y_val)],
                         eval_metric=[fit_params['eval_metric']],
                         batch_size=fit_params['batch_size'],
                         virtual_batch_size=fit_params['virtual_batch_size'],
                         patience=300,
                         max_epochs=5000,
                         from_unsupervised=unsupervised_model)
        self.tabnet_models.append(tabnet_model)

        prediction = tabnet_model.predict(x_val)
        if problem == 'classification':
            scores.append(accuracy_score(y_val, prediction))
        else:  # regression
            scores.append(mean_squared_error(y_val, prediction))

        # Stack per-fold feature importances row by row
        if i == 0:
            feature_importances = tabnet_model.feature_importances_.copy()
        else:
            feature_importances = np.vstack(
                (feature_importances, tabnet_model.feature_importances_))
    print(scores)
    print(np.mean(scores))
    return scores, feature_importances
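# --- Usage sketch (added; "TabnetCV" is a hypothetical name for the class that owns
# this fit method). x_train and y_train are assumed to be numpy arrays.
cv_model = TabnetCV()
scores, feature_importances = cv_model.fit(x_train, y_train, kf_splits=5, tabnet_type='TabNet-S')
print(np.mean(scores))
print(feature_importances.mean(axis=0))  # per-feature importance averaged over folds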