def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    """Fit a TabNet classifier on train/valid/test dataframes.

    NOTE(review): va_x/va_y/te_x default to None but are used
    unconditionally below — calling with the defaults would raise;
    confirm callers always pass all three frames.
    Side effect: mutates the caller's dataframes in place via fillna.
    """
    # Cardinality of each categorical column across all three splits so
    # the embedding tables cover every level seen anywhere.
    categorical_dims = {}
    for col in self.categorical_features:
        # Missing categories become the literal level "unk".
        tr_x[col] = tr_x[col].fillna("unk")
        va_x[col] = va_x[col].fillna("unk")
        te_x[col] = te_x[col].fillna("unk")
        categorical_dims[col] = len(set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))
    cat_idxs = [i for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
    cat_dims = [categorical_dims[f] for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
    # Fixed embedding width of 10 per categorical feature.
    cat_emb_dim = [10 for _ in categorical_dims]
    # Impute remaining NaNs with the TRAIN column mean (train statistics
    # are reused for valid/test to avoid leakage).
    # NOTE(review): this loop also touches the categorical columns; mean()
    # on the string-filled columns may misbehave — confirm intent.
    for col in tr_x.columns:
        tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
        va_x[col] = va_x[col].fillna(tr_x[col].mean())
        te_x[col] = te_x[col].fillna(tr_x[col].mean())
    self.model = TabNetClassifier(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)
    # NOTE(review): X_valid/y_valid is the pre-2.0 pytorch-tabnet fit
    # signature; newer releases expect eval_set=[...] — confirm the
    # pinned library version.
    self.model.fit(X_train=tr_x.values, y_train=tr_y.values, X_valid=va_x.values, y_valid=va_y.values, max_epochs=1000, patience=50, batch_size=1024, virtual_batch_size=128)
def tabtune(Xtrain, Xvalid, ytrain, yvalid, verbose=True, scorefunc=roc_auc_score, predfunc=pred_1dprobs, return_extras=False, return_score=False, **kwargs):
    """Grid-search TabNet hyper-parameters over the module-level TUNE_TAB grid.

    Each grid point is trained on (Xtrain, ytrain), scored on the validation
    split with `scorefunc(yvalid, predfunc(model, Xvalid))`, and the best
    parameter dict (and optionally score / per-combination scores) returned.
    Relies on module globals DEFAULT_TAB, TUNE_TAB and MAX_EPOCHS.
    """
    # Fill in any tuning-independent model kwargs not supplied by the caller.
    for k in DEFAULT_TAB:
        if k not in kwargs:
            kwargs[k] = DEFAULT_TAB[k]
    xg_bestimator = None  # best fitted model; kept but not returned
    xg_best_score = -float('inf')
    xg_best_params = None
    xg_params_scores = {}
    xg_results = {}  # NOTE(review): populated with empty dicts only — appears vestigial
    params = TUNE_TAB.copy()
    xg_keys = params.keys()
    # Cartesian product over every value combination in the grid.
    for vals in tqdm(list(itertools.product(*[params[k] for k in xg_keys]))):
        paramdict = {k: v for k, v in zip(xg_keys, vals)}
        # 'n_d_a' is a single grid axis that sets both n_d and n_a
        # (TabNet convention: keep decision/attention widths equal).
        n_d_a = paramdict.pop('n_d_a')
        paramdict['n_d'] = paramdict['n_a'] = n_d_a
        xg_results[vals] = {}
        bst = TabNetClassifier(**kwargs, **paramdict)
        bst.fit(Xtrain, ytrain, eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)], eval_metric=['auc'], max_epochs=MAX_EPOCHS, patience=20, batch_size=1024, virtual_batch_size=128, num_workers=0, weights=1, drop_last=False)  # ,eval_set=[(Xvalid,yvalid)]
        cur_score = scorefunc(yvalid, predfunc(bst, Xvalid))
        xg_params_scores[vals] = cur_score
        if cur_score > xg_best_score:
            xg_bestimator = bst
            xg_best_score = cur_score
            xg_best_params = paramdict
            # xg_best_params['n_estimators']=bst.booster_.best_iteration
    if not return_extras:
        return (xg_best_params, xg_best_score) if return_score else xg_best_params
    else:
        return ((xg_best_params, xg_best_score), xg_params_scores)
def act_init_ai(self, is_test=False):
    """Create the TabNet classifier for this agent.

    In test mode only the test-probe hook is invoked; otherwise a CUDA
    TabNetClassifier is built from the match loader's dimensions and the
    categorical metadata already stored on self.
    """
    if is_test is True:
        _TPI(self, locals())
        return
    loader = self.env.match_loader
    self.ai = TabNetClassifier(
        n_steps=10,
        input_dim=loader.count_cols * loader.count_players,
        cat_dims=self.cat_dims,
        cat_emb_dim=self.cat_emb_dim,
        cat_idxs=self.cat_idxs,
        device_name='cuda',
    )
def process_data(df, date_valid, date_test, features, target): data = df.copy() # split datasets if "set" not in data.columns: data["set"] = "train" data.loc[data.date > date_valid, "set"] = "valid" data.loc[data.date > date_test, "set"] = "test" train_indices = data[data.set == "train"].index valid_indices = data[data.set == "valid"].index test_indices = data[data.set == "test"].index indices = data.set.values # Select data data = data[features + [target]] # Get categorical features and preprocess nunique = data.nunique() types = data.dtypes categorical_columns = [] categorical_dims = {} for col in data.columns: if types[col] == 'object' or nunique[col] < 200: l_enc = LabelEncoder() data[col] = data[col].fillna("Unknown") data[col] = l_enc.fit_transform(data[col].values) categorical_columns.append(col) categorical_dims[col] = len(l_enc.classes_) else: data[col].fillna(data[col].mean(), inplace=True) scaler = StandardScaler() data[[col]] = scaler.fit_transform(data[[col]]) cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns] cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns ] # Define model clf = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, cat_emb_dim=1, optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=1e-2), scheduler_params={ "step_size": 10, "gamma": 0.9 }, scheduler_fn=torch.optim.lr_scheduler.StepLR, mask_type='entmax') # Datasets X_train = data[features].values[train_indices] y_train = data[target].values[train_indices] X_valid = data[features].values[valid_indices] y_valid = data[target].values[valid_indices] X_test = data[features].values[test_indices] y_test = data[target].values[test_indices] return clf, X_train, y_train, X_valid, y_valid, X_test, y_test, indices
class ModelTabNetClassifier(Model):
    """TabNet binary classifier wrapper with its own NaN imputation and persistence."""

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        """Impute, build and fit the TabNet model.

        NOTE(review): va_x/va_y/te_x default to None yet are used
        unconditionally — confirm callers always pass them.
        Mutates the caller's dataframes in place.
        """
        # Cardinality per categorical column across all splits so the
        # embedding tables cover every observed level.
        categorical_dims = {}
        for col in self.categorical_features:
            tr_x[col] = tr_x[col].fillna("unk")
            va_x[col] = va_x[col].fillna("unk")
            te_x[col] = te_x[col].fillna("unk")
            categorical_dims[col] = len(set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))
        cat_idxs = [i for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        cat_dims = [categorical_dims[f] for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        # Fixed embedding width of 10 per categorical feature.
        cat_emb_dim = [10 for _ in categorical_dims]
        # Impute NaNs with TRAIN means (reused for valid/test to avoid leakage).
        for col in tr_x.columns:
            tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
            va_x[col] = va_x[col].fillna(tr_x[col].mean())
            te_x[col] = te_x[col].fillna(tr_x[col].mean())
        self.model = TabNetClassifier(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)
        # NOTE(review): X_valid/y_valid is the pre-2.0 pytorch-tabnet fit
        # signature — confirm the pinned library version.
        self.model.fit(X_train=tr_x.values, y_train=tr_y.values, X_valid=va_x.values, y_valid=va_y.values, max_epochs=1000, patience=50, batch_size=1024, virtual_batch_size=128)

    def predict(self, te_x):
        """Return positive-class probabilities as a flat 1-D array."""
        return self.model.predict_proba(te_x.values)[:, 1].reshape(-1, )

    def save_model(self):
        """Persist the fitted model under ../output/model/<run_fold_name>.model."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        """Load a previously saved model for this run/fold."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
def get_classifier(cat_idxs, cat_dims):
    """Construct an unfitted TabNetClassifier with the project's standard settings.

    Adam at lr=2e-2 with a StepLR schedule (x0.9 every 50 steps) and
    entmax masking; categorical embeddings are one-dimensional.
    """
    step_lr_conf = {
        "step_size": 50,  # how to use learning rate scheduler
        "gamma": 0.9,
    }
    model = TabNetClassifier(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params=step_lr_conf,
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',  # "sparsemax"
    )
    return model
def fit(self, train_X, train_Y, val_X, val_Y, cat_cols):
    """Fit a TabNet classifier or regressor depending on self.dataset_type.

    cat_cols are positional indices of categorical columns in train_X.
    The +1 on each cardinality leaves room for an unseen/padding level.
    """
    cat_dims = [len(set(train_X[:, idx])) + 1 for idx in cat_cols]
    if self.dataset_type in {'2-class', 'm-class'}:
        self.model = TabNetClassifier(cat_idxs=cat_cols, cat_dims=cat_dims)
        self.model.fit(train_X, train_Y, eval_set=[(val_X, val_Y)], eval_metric=['logloss'], max_epochs=200, patience=20)
    elif self.dataset_type in {'regression'}:
        # TabNetRegressor expects 2-D targets, hence the np.newaxis.
        self.model = TabNetRegressor(cat_idxs=cat_cols, cat_dims=cat_dims)
        self.model.fit(train_X, train_Y[:, np.newaxis], eval_set=[(val_X, val_Y[:, np.newaxis])], eval_metric=['rmse'], max_epochs=200, patience=20)
def build_tabnet():
    """Train (or load) a TabNet emotion classifier on TF-IDF text features.

    Returns (model, y_test, y_pred, test_acc). Uses module globals
    current_dataset, current_dataset_name, LOAD_MODEL, cleaning_text.

    BUG FIX: the tokenizer used to be fitted on X_test first and then
    additionally on X_train (Keras Tokenizer accumulates vocabulary
    across fit_on_texts calls), so the test matrix leaked test-set
    vocabulary and was built from a different tokenizer state than the
    train matrix. It is now fitted on the training split only and the
    same fitted state vectorizes both splits.
    """
    model_file_name = 'tabnet_model_{}'.format(current_dataset_name)
    df = current_dataset.copy()
    cleaning_text(df)
    X = df['clean_content']
    y = df['emotion']
    # Tokenizer: top-1000 vocabulary, out-of-vocabulary token '<UNK>'.
    tok = Tokenizer(num_words=1000, oov_token='<UNK>')
    # Stratified split before any fitting so the tokenizer never sees test text.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=1)
    # Fit the vocabulary on the TRAIN split only, then vectorize both
    # splits with the same fitted state.
    tok.fit_on_texts(X_train)
    X_train = tok.texts_to_matrix(X_train, mode='tfidf')
    X_test = tok.texts_to_matrix(X_test, mode='tfidf')
    # Build model, fit and predict (optionally reload a cached model).
    if LOAD_MODEL and pathlib.Path(model_file_name).exists():
        with open(model_file_name, 'rb') as fh:
            model = pickle.load(fh)
    else:
        model = TabNetClassifier()
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_name=['train', 'valid'],
                  eval_metric=['accuracy', 'balanced_accuracy', 'logloss'])
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    # Persist the (re)trained model; `with` guarantees the handle closes.
    with open(model_file_name, 'wb') as fh:
        pickle.dump(model, fh)
    # model.save_model(model_file_name)
    return model, y_test, y_pred, test_acc
def fit(self, X, y):
    """Fit a bagged ensemble of TabNet classifiers.

    Each estimator trains on a random row sample (with replacement when
    self.bootstrap) restricted to a random feature subset; when
    self.oob_score is set, the rows not drawn for an estimator serve as
    its early-stopping eval set.

    PERF FIX: out-of-bag rows were computed with an O(n^2) list
    comprehension (`i not in samples` scans the array per row); replaced
    with np.setdiff1d, which yields the same sorted unique indices.
    """
    X, y = check_X_y(X, y)
    self.estimators_ = []
    self.features_ = []
    self.classes_ = np.unique(y)
    self.n_samples_ = int(np.round(self.max_samples * X.shape[0]))
    self.n_features_ = int(np.round(self.max_features * X.shape[1]))
    for _ in range(self.n_estimators):
        samples = np.random.choice(X.shape[0], size=self.n_samples_, replace=self.bootstrap)
        features = np.random.choice(X.shape[1], size=self.n_features_, replace=False)
        # Rows never drawn for this estimator (sorted, unique).
        unused_samples = np.setdiff1d(np.arange(X.shape[0]), samples)
        X_train = X[samples][:, features]
        y_train = y[samples]
        estimator = TabNetClassifier(verbose=self.verbose, device_name=self.device_name)
        if self.oob_score and len(unused_samples) > 0:
            X_val = X[unused_samples][:, features]
            y_val = y[unused_samples]
            estimator.fit(X_train,
                          y_train,
                          eval_set=[(X_val, y_val)],
                          eval_metric=['balanced_accuracy'],
                          patience=self.patience,
                          max_epochs=self.max_epochs)
        else:
            estimator.fit(X_train, y_train, patience=self.patience, max_epochs=self.max_epochs)
        self.estimators_.append(estimator)
        self.features_.append(features)
    return self
class MyTabNetClassifierModel(BaseModel):
    """
    Parameters
    ---------
    ref: https://dreamquark-ai.github.io/tabnet/generated_docs/README.html#model-parameters

    model_params:
        n_d:default=8(range 8 to 64)
        n_a:default=8
        n_steps:default=3(range 3 to 10)
        gamma:default=1.3(range 1.0 to 2.0)
        n_independent:default=2(range 1 to 5)
        n_shared:default=2(range 1 to 5)
        lambda_sparse:default=1e3
        optimizer_fn:default=Adam
        optimizer_params:default=(lr=2e2, weight_decay=None),
        mask_type:default=sparsemax or entmax
        scheduler_params:dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
        seed: default=0
        verbose=5,
        cat_dims=cat_dims, cat_idxs=cat_idx, cat_emb_dim=1

    fit_params:
        max_epochs:default=200
        patience:default=15
        loss_fn(torch.loss or list of torch.loss):default to mse for regression and cross entropy for classification
        eval_metric(list or str)
        batch_size:default=1024
        virtual_batch_size:default=128
        pretrain_ratio

    ### Example use:
    >>>nunique = train_feat_df.nunique()
    >>>types = train_feat_df.dtypes
    >>>categorical_columns = []
    >>>categorical_dims = {}
    >>>train_feat_df["is_train"] = 1
    >>>test_feat_df["is_train"] = 0
    >>>all_df = pd.concat([train_feat_df, test_feat_df])
    >>>for col in train_feat_df.drop(["is_train"], axis=1).columns:
        if str(types[col]) == 'category' or nunique[col] < 200:
            l_enc = LabelEncoder()
            all_df[col] = l_enc.fit_transform(all_df[col].values)
            all_df[col] = all_df[col].astype("category")
            categorical_columns.append(col)
            categorical_dims[col] = len(l_enc.classes_)
    >>>cat_idx = [i for i, f in enumerate(train_feat_df.columns.tolist()) if f in categorical_columns]
    >>>cat_dims = [categorical_dims[f] for i, f in enumerate(train_feat_df.columns.tolist()) if f in categorical_columns]
    """

    def __init__(self, model_params, fit_params):
        # model_params feed TabNetClassifier(...); fit_params feed .fit(...).
        self.model_params = model_params
        self.fit_params = fit_params

    def build_model(self):
        """Instantiate a fresh TabNetClassifier from model_params."""
        self.model = TabNetClassifier(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        """Fit on train with (train, valid) monitoring; returns the fitted model.

        NOTE(review): valid_x defaults to None but .values is taken
        unconditionally — confirm callers always pass a valid set.
        """
        train_x = train_x.values
        valid_x = valid_x.values
        self.model = self.build_model()
        self.model.fit(train_x,
                       train_y,
                       eval_set=[(train_x, train_y), (valid_x, valid_y)],
                       eval_name=["train", "valid"],
                       **self.fit_params)
        return self.model

    def predict(self, est, valid_x):
        """Positive-class probabilities from a fitted estimator."""
        valid_x = valid_x.values
        preds = est.predict_proba(valid_x)[:, 1]
        return preds

    def get_feature_importance(self, train_x: pd.DataFrame, is_save=False, filepath=None):
        """Aggregate per-fold TabNet feature importances and boxen-plot the top 50.

        NOTE(review): relies on self.models (fold -> fitted model), which is
        not assigned anywhere in this class — presumably set by a subclass
        or training driver; verify.
        NOTE(review): fig.savefig runs BEFORE sns.boxenplot draws on the
        axes, so the saved PNG is empty; _df.to_csv saves only the last
        fold's frame. Left unchanged here — flagging for a separate fix.
        """
        feature_importance_df = pd.DataFrame()
        num = 0
        for i, model in self.models.items():
            _df = pd.DataFrame()
            _df['feature_importance'] = model.feature_importances_
            _df['column'] = train_x.columns
            _df['fold'] = num + 1
            feature_importance_df = pd.concat([feature_importance_df, _df], axis=0, ignore_index=True)
            num += 1
        # Top-50 columns by total importance across folds.
        order = feature_importance_df.groupby('column')\
            .sum()[['feature_importance']]\
            .sort_values('feature_importance', ascending=False).index[:50]
        fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
        if is_save:
            fig.savefig(filepath + "tabnet_feature_importance.png")
            _df.to_csv(filepath + "tabnet_feature_importance.csv", index=False)
        sns.boxenplot(data=feature_importance_df,
                      x='feature_importance',
                      y='column',
                      order=order,
                      ax=ax,
                      palette='viridis',
                      orient='h')
        ax.tick_params(axis='x', rotation=90)
        ax.set_title('Tabnet Feature Importance')
        ax.grid()
        plt.show()
def build_model(self):
    """Instantiate a fresh TabNetClassifier from self.model_params, cache it on self.model and return it."""
    self.model = TabNetClassifier(**self.model_params)
    return self.model
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Hyper-parameter constants for the (currently commented-out) pretraining
# configuration below; the active TabNetClassifier call uses library defaults.
N_D = 16
N_A = 16
N_INDEP = 2
N_SHARED = 2
N_STEPS = 1  # 2
MASK_TYPE = "sparsemax"
GAMMA = 1.5
BS = 128  # 512
MAX_EPOCH = 20  # 20
PRETRAIN = True

# Fit a default TabNet classifier on module-level X, y (defined elsewhere).
clf = TabNetClassifier()
clf.fit(
    X, y,
    # eval_set=[(X_valid, y_valid)]
)
# if PRETRAIN:
#     pretrain_params = dict(n_d=N_D, n_a=N_A, n_steps=N_STEPS, #0.2,
#                            n_independent=N_INDEP, n_shared=N_SHARED,
#                            cat_idxs=cat_idxs,
#                            cat_dims=cat_dims,
#                            cat_emb_dim=cat_emb_dims,
#                            gamma=GAMMA,
#                            lambda_sparse=0., optimizer_fn=torch.optim.Adam,
#                            optimizer_params=dict(lr=2e-2),
# Notebook cell: split the stacked frame back into train/test by row
# position (train rows come first, then test rows).
X_train = X_all[0:d_train.shape[0]].to_numpy()
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])].to_numpy()
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]

MAX_EPOCH = 10
BS = 1024
# OneCycleLR stepped per batch (is_batch_level=True), so steps_per_epoch
# is derived from the training size and batch size.
md = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, cat_emb_dim=1,
                      ## optimizer_fn=torch.optim.Adam, ## optimizer_params=dict(lr=2e-2),
                      scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                      scheduler_params=dict(max_lr=0.05,
                                            steps_per_epoch=int(X_train.shape[0] / BS),
                                            epochs=MAX_EPOCH,
                                            is_batch_level=True),
                      mask_type='entmax'  # "sparsemax"
                      )
%%time
md.fit(
    X_train=X_train, y_train=y_train,
    max_epochs=MAX_EPOCH, patience=0,
    ## batch_size=1024, virtual_batch_size=128,
    ## weights=0,
    drop_last=True
)
def get_base_estimator(self, model, create_nn_model=None):
    """Map a model-name string to a freshly constructed, unfitted estimator.

    create_nn_model is only used by the keras/torch entries. Seeds keras
    and torch RNGs for reproducibility before construction. Raises a
    plain Exception for unknown names.
    NOTE(review): the key 'preceptron' (sic) is part of the public
    interface — callers use the misspelling, so it is kept.
    """
    # keras config
    tf.random.set_seed(42)
    # torch config
    # for reproducibility
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # gpu or cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if model == 'log_reg':
        return LogisticRegression(solver='lbfgs')
    elif model == 'log_reg_cv':
        return LogisticRegressionCV()
    elif model == 'linear_reg':
        return LinearRegression()
    elif model == 'lasso':
        return Lasso()
    elif model == 'ridge':
        return Ridge()
    elif model == 'svc':
        return SVC()
    elif model == 'svr':
        return SVR()
    elif model == 'l_svc':
        return LinearSVC()
    elif model == 'l_svr':
        return LinearSVR()
    elif model == 'rf_clf':
        return RandomForestClassifier()
    elif model == 'rf_reg':
        return RandomForestRegressor()
    elif model == 'gbdt_clf':
        return GradientBoostingClassifier()
    elif model == 'gbdt_reg':
        return GradientBoostingRegressor()
    elif model == 'knn_clf':
        return KNeighborsClassifier()
    elif model == 'knn_reg':
        return KNeighborsRegressor()
    elif model == 'g_mix':
        return GaussianMixture()
    elif model == 'g_nb':
        return GaussianNB()
    elif model == 'preceptron':
        return Perceptron()
    elif model == 'sgd_clf':
        return SGDClassifier()
    elif model == 'sgd_reg':
        return SGDRegressor()
    elif model == 'dt_clf':
        return DecisionTreeClassifier()
    elif model == 'dt_reg':
        return DecisionTreeRegressor()
    elif model == 'xgb_clf':
        return XGBClassifier()
    elif model == 'xgb_reg':
        return XGBRegressor()
    elif model == 'lgb_clf':
        return LGBMClassifier()
    elif model == 'lgb_reg':
        return LGBMRegressor()
    elif model == 'catb_clf':
        return CatBoostClassifier()
    elif model == 'catb_reg':
        return CatBoostRegressor()
    elif model == 'rgf_clf':
        return RGFClassifier()
    elif model == 'rgf_reg':
        return RGFRegressor()
    elif model == 'keras_clf':
        return MyKerasClassifier(build_fn=create_nn_model)
    elif model == 'keras_reg':
        return MyKerasRegressor(build_fn=create_nn_model)
    elif model == 'torch_clf':
        return NeuralNetClassifier(module=create_nn_model(), device=device, train_split=None)
    elif model == 'torch_reg':
        return NeuralNetRegressor(module=create_nn_model(), device=device, train_split=None)
    elif model == 'tabnet_clf':
        return TabNetClassifier()
    elif model == 'tabnet_reg':
        return TabNetRegressor()
    else:
        logger.error('NOT IMPLEMENTED BASE MODEL: %s' % model)
        raise Exception('NOT IMPLEMENTED')
# Notebook cell: 5-fold CV with a reloaded TabNet pretrainer per fold.
# NOTE(review): this chunk is truncated — the clf.fit(...) call is cut
# off after num_workers=0; the remainder lives outside this view.
# Relies on surrounding-scope names: train, X, y_binary, i, OUTPUTDIR, max_epochs.
oof = train.result_score.astype("float64").copy().values
kfold = KFold(n_splits=5, shuffle=True, random_state=i * 42)
_test_preds = []
for fold, (trn_index, vld_index) in enumerate(kfold.split(X, y_binary)):
    X_trn, y_trn = X[trn_index], y_binary[trn_index]
    X_vld, y_vld = X[vld_index], y_binary[vld_index]
    print(f"Fold {fold}, data split")
    print(f" Shape of train x, y = {X_trn.shape}, {y_trn.shape}")
    print(f" Shape of valid x, y = {X_vld.shape}, {y_vld.shape}")
    # Reload the saved self-supervised pretrainer each fold.
    loaded_pretrain = TabNetPretrainer()
    loaded_pretrain.load_model(f"{OUTPUTDIR}/pretrain.zip")
    clf = TabNetClassifier(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-1),
        scheduler_params={"step_size": 10, "gamma": 0.9},  # how to use learning rate scheduler
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type="sparsemax",  # This will be overwritten if using pretrain model
    )
    clf.fit(
        X_train=X_trn,
        y_train=y_trn,
        eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
        eval_name=["train", "valid"],
        eval_metric=["logloss"],
        max_epochs=max_epochs,
        patience=50,
        batch_size=128,
        virtual_batch_size=128,
        num_workers=0,
# Notebook cell: stratified K-fold TabNet training.
# NOTE(review): truncated — the clf.fit(...) call is cut off after
# virtual_batch_size=128; relies on surrounding names skf, X_train,
# y_train, y_test, cat_idxs, cat_dims, epoch.
test_pred = np.zeros(shape=len(y_test))
for train_index, val_index in skf.split(X_train, y_train):
    X_tr = X_train[train_index]
    y_tr = y_train[train_index]
    X_val = X_train[val_index]
    y_val = y_train[val_index]
    clf = TabNetClassifier(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        verbose=0,
        scheduler_params={
            "step_size": 50,  # how to use learning rate scheduler
            "gamma": 0.9
        },
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax'  # "sparsemax"
    )
    # NOTE(review): X_valid/y_valid is the pre-2.0 pytorch-tabnet fit
    # signature; patience=epoch disables effective early stopping.
    clf.fit(X_train=X_tr, y_train=y_tr, X_valid=X_val, y_valid=y_val,
            max_epochs=epoch, patience=epoch, batch_size=1024, virtual_batch_size=128,
# Notebook cell: screen pairwise feature interactions with TabNet.
# For each feature i, multiply every column by column i, standardize,
# fit a small TabNet, and keep columns whose importance exceeds 8%.
# Relies on surrounding names: data, X_t, X_v.
y_t = data.train_ds.trend_class.astype(dtype=np.int64).squeeze()
y_v = data.val_ds.trend_class.astype(dtype=np.int64).squeeze()
for i in np.arange(X_t.shape[1]):
    # Broadcast column i across all columns: interaction features x_j * x_i.
    interactions_t = X_t * np.repeat(X_t[:, i:(i+1)], X_t.shape[1], axis=1)
    interactions_v = X_v * np.repeat(X_v[:, i:(i+1)], X_v.shape[1], axis=1)
    # Scaler fitted on train only; applied to both splits.
    scaler = StandardScaler().fit(interactions_t)
    interactions_t = scaler.transform(interactions_t)
    interactions_v = scaler.transform(interactions_v)
    # Reseed per iteration so every feature gets an identical training setup.
    torch.manual_seed(0)
    np.random.seed(0)
    clf = TabNetClassifier(
        scheduler_params={"step_size": 5, "gamma": 0.8},
        scheduler_fn=torch.optim.lr_scheduler.StepLR
    )
    clf.fit(
        interactions_t, y_t,
        eval_set=[(interactions_t, y_t), (interactions_v, y_v)],
        eval_name=['train', 'val'],
        batch_size=4096,
        virtual_batch_size=4096 // 8,
        patience=3,
        max_epochs=100
    )
    # print('importances:',clf.feature_importances_*100)
    # NOTE(review): `selected` is overwritten each iteration and not
    # accumulated — confirm downstream use.
    selected = np.where(clf.feature_importances_ * 100 > 8)[0]
    print('multiplying by:', i)
def fit(
    self,
    x_train: AoD,
    y_train: AoS,
    x_valid: AoD,
    y_valid: AoS,
    config: dict,
    **kwargs
) -> Tuple[TabNetModel, dict]:
    """Preprocess categorical/numerical columns and fit a TabNet model.

    config supplies model params, train params, categorical column names
    and the mode ("regression" or classification). Returns the fitted
    model and {"valid_score": validation losses}.
    Mutates x_train/x_valid in place (category codes, NaN imputation).
    """
    model_params = config["model"]["model_params"]
    train_params = config["model"]["train_params"]
    categorical_cols = config["categorical_cols"]
    self.config["categorical_cols"] = categorical_cols
    categorical_dims = {}
    for col in categorical_cols:
        # Add an explicit "Unknown" level for NaNs, then replace the
        # categorical with its integer codes for TabNet embeddings.
        x_train[col] = x_train[col].cat.add_categories("Unknown")
        x_train[col] = x_train[col].fillna("Unknown")
        x_train[col] = x_train[col].cat.codes
        x_valid[col] = x_valid[col].cat.add_categories("Unknown")
        x_valid[col] = x_valid[col].fillna("Unknown")
        x_valid[col] = x_valid[col].cat.codes
        # Embedding table size = number of distinct codes seen in either split.
        categorical_dims[col] = len(
            set(x_train[col].values) | set(x_valid[col].values)
        )
    cat_idxs = [i for i, f in enumerate(x_train.columns) if f in categorical_cols]
    cat_dims = [
        categorical_dims[f]
        for i, f in enumerate(x_train.columns)
        if f in categorical_cols
    ]
    # Fixed embedding width of 10 per categorical feature.
    cat_emb_dim = [10 for _ in categorical_dims]
    numerical_cols = [col for col in x_train.columns if col not in categorical_cols]
    # Numerical NaNs imputed with TRAIN means (reused for valid to avoid leakage).
    for col in numerical_cols:
        x_train[col] = x_train[col].fillna(x_train[col].mean())
        x_valid[col] = x_valid[col].fillna(x_train[col].mean())
    mode = config["model"]["mode"]
    self.mode = mode
    if mode == "regression":
        model = TabNetRegressor(
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            cat_idxs=cat_idxs,
            **model_params,
        )
    else:
        model = TabNetClassifier(
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            cat_idxs=cat_idxs,
            **model_params,
        )
    # NOTE(review): y is reshaped to (n, 1) for BOTH modes, but
    # TabNetClassifier normally expects a 1-D target — confirm against
    # the pinned pytorch-tabnet version. X_valid/y_valid is also the
    # pre-2.0 fit signature.
    model.fit(
        X_train=x_train.values,
        y_train=y_train.reshape(-1, 1),
        X_valid=x_valid.values,
        y_valid=y_valid.reshape(-1, 1),
        **train_params,
    )
    best_score = {"valid_score": model.losses_valid}
    return model, best_score
#we separe between categorical variables and date variables to preprocess it separately categorical_val.remove('answer_creation_date') categorical_val.remove('group_creation_date') categorical_val.remove('request_creation_date') categorical_val.remove('victim_of_violence_type') date_columns = [ 'answer_creation_date', 'group_creation_date', 'request_creation_date' ] X_test, y_test = preprocess(requests_test, categorical_val, date_columns) # Drop Nan value because otherwise there are memory errors X_test['granted_number_of_nights'] = y_test X_test = X_test.dropna() y_test = X_test['granted_number_of_nights'] X_test = X_test.drop(columns=['granted_number_of_nights']) #preprocess the datasets for TabNet X_test_tab, y_test_tab = preprocess_for_tabnet(X_test, y_test) # retrieve model # Not working with version <3.7 PATH = '../model_zoo/TabNet_model.zip' clf = TabNetClassifier() clf.load_model(PATH) #run inference start = time.time() preds = clf.predict_proba(X_test_tab) end = time.time() score = competition_scorer(y_test_tab, preds) print('time per prediction:', (end - start) / len(X_test)) print('The competition score on test data', score)
# Notebook cell. NOTE(review): the first statement looks like the body of
# a label-encoding loop whose header (for col in ...) is outside this view.
d_all[col] = preprocessing.LabelEncoder().fit_transform(d_all[col])
X_all = d_all[vars_num+vars_cat]
# Binary target: 1 when the departure was delayed >= 15 min.
y_all = np.where(d_all["dep_delayed_15min"]=="Y", 1, 0)
cat_idxs = [i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [len(np.unique(X_all.iloc[:, i].values)) for i in cat_idxs]
# Train rows come first in the stacked frame, test rows after.
X_train = X_all[0:d_train.shape[0]].to_numpy()
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])].to_numpy()
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]
md = TabNetClassifier(cat_idxs=cat_idxs,
                      cat_dims=cat_dims,
                      cat_emb_dim=1
                      )
%%time
md.fit(
    X_train=X_train, y_train=y_train,
    max_epochs=10, patience=0
)
y_pred = md.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred))
# Notebook cell: train/test split by row position, then TabNet with
# log-sized embeddings and per-epoch AUC monitoring on the test set.
X_train = X_all[0:d_train.shape[0]].to_numpy()
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])].to_numpy()
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]
cat_idxs = [i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [len(np.unique(X_all.iloc[:, i].values)) for i in cat_idxs]
# Embedding width = floor(log(cardinality)) per categorical feature.
cat_emb_dim = np.floor(np.log(cat_dims)).astype(int)
md = TabNetClassifier(cat_idxs=cat_idxs,
                      cat_dims=cat_dims,
                      cat_emb_dim=cat_emb_dim,
                      ## optimizer_fn=torch.optim.Adam, ## optimizer_params=dict(lr=2e-2), ## mask_type='sparsemax',
                      n_steps=1,
                      )
%%time
# The eval name 'test_EVIL' flags that monitoring the test set here leaks
# test information into early stopping.
md.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'test_EVIL'],
    eval_metric=['auc'],
    max_epochs=100, patience=100,
    ## batch_size=1024, virtual_batch_size=128,
    ## weights=0,
)
y_pred = md.predict_proba(X_test)[:, 1]
# Separate categorical variables from date variables so each group gets
# its own preprocessing path.
categorical_val.remove('answer_creation_date')
categorical_val.remove('group_creation_date')
categorical_val.remove('request_creation_date')
categorical_val.remove('victim_of_violence_type')
date_columns = [
    'answer_creation_date', 'group_creation_date', 'request_creation_date'
]
# We transform the dataframe into encoded features and target
X, y = preprocess(requests_train, categorical_val, date_columns)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, y_train = preprocess_for_tabnet(X_train, y_train)
X_val, y_val = preprocess_for_tabnet(X_val, y_val)
clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-1))
clf.device = device
# Exponential class weights: rarer/longer-stay classes weighted 10^k.
weights = {0: 1, 1: 10, 2: 10**2, 3: 10**3}
# NOTE(review): X_valid/y_valid is the pre-2.0 pytorch-tabnet fit
# signature — confirm the pinned library version.
clf.fit(
    X_train=X_train, y_train=y_train,    # Train features and train targets
    X_valid=X_val, y_valid=y_val,        # Valid features and valid targets
    weights=weights,
    max_epochs=20,   # Maximum number of epochs during training
    patience=5,      # Number of consecutive non-improving epochs before early stopping
    batch_size=16,   # Training batch size
)
# Notebook cell: positional train/test split, TabNet with log-sized
# embeddings, quick 10-epoch fit, AUC on the held-out test rows.
X_train = X_all[0:d_train.shape[0]].to_numpy()
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])].to_numpy()
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]
cat_idxs = [i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [len(np.unique(X_all.iloc[:, i].values)) for i in cat_idxs]
# Embedding width = floor(log(cardinality)) per categorical feature.
cat_emb_dim = np.floor(np.log(cat_dims)).astype(int)
md = TabNetClassifier(cat_idxs=cat_idxs,
                      cat_dims=cat_dims,
                      cat_emb_dim=cat_emb_dim,
                      ## optimizer_fn=torch.optim.Adam, ## optimizer_params=dict(lr=2e-2), ## mask_type='sparsemax',
                      n_steps=1,
                      )
%%time
md.fit(
    X_train=X_train, y_train=y_train,
    max_epochs=10, patience=0,
    ## batch_size=1024, virtual_batch_size=128,
    ## weights=0,
)
y_pred = md.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred))
# Notebook cell. NOTE(review): this chunk is truncated — the md.fit(...)
# call below is missing its closing parenthesis in the source.
# Binary target: 1 when the departure was delayed >= 15 min.
y_all = np.where(d_all["dep_delayed_15min"]=="Y", 1, 0)
cat_idxs = [i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [len(np.unique(X_all.iloc[:, i].values)) for i in cat_idxs]
X_train = X_all[0:d_train.shape[0]].to_numpy()
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])].to_numpy()
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]
md = TabNetClassifier(cat_idxs=cat_idxs,
                      cat_dims=cat_dims,
                      cat_emb_dim=1,
                      optimizer_fn=torch.optim.Adam,
                      optimizer_params=dict(lr=2e-2),
                      scheduler_params={"step_size": 50,  # how to use learning rate scheduler
                                        "gamma": 0.9},
                      scheduler_fn=torch.optim.lr_scheduler.StepLR,
                      mask_type='entmax'  # "sparsemax"
                      )
%%time
md.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train)],
    eval_name=['train'],
    eval_metric=['auc'],
    max_epochs=10, patience=0,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
import torch
from matplotlib import pyplot as plt
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# -- settings
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# NOTE(review): both constructors hard-code device_name='cuda' and will
# fail on CPU-only machines despite the use_cuda check above — confirm.

# TabNetPretrainer: self-supervised pretraining model.
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    mask_type='entmax',  # "sparsemax",
    device_name='cuda'
)
# Supervised classifier; its mask_type is overwritten when warm-started
# from the pretrainer.
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    scheduler_params={"step_size": 7,  # how to use learning rate scheduler
                      "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax',  # This will be overwritten if using pretrain model,
    device_name='cuda'
)
def main():
    """Pretrain TabNet (optionally), run 10-fold CV, and write submission CSVs.

    BUG FIX: `np.int` was removed in NumPy >= 1.24 and raised
    AttributeError; replaced with the builtin `int`. The first, dead
    `cat_emb_dims` assignment (immediately overwritten) was dropped.
    The chained `data['oof_preds'].iloc[...] = ...` write (which can
    silently assign to a copy) now writes positionally via `.iloc`.
    """
    # Generate Synthetic Data
    data, test_data, cat_col_names, num_col_names = data_load()
    cat_dims = data[cat_col_names].nunique().to_list()
    cat_idxs = [(cat_col_names + num_col_names).index(cat_col) for cat_col in cat_col_names]
    # Embedding width: half the cardinality, clipped to [1, 50].
    cat_emb_dims = np.ceil(np.clip((np.array(cat_dims)) / 2, a_min=1, a_max=50)).astype(int).tolist()
    FEATURES = cat_col_names + num_col_names
    df_sub = pd.read_csv('Data/sample_submission.csv')
    bsize = 2500 * 2
    # ##########Define the Configs############
    N_D = 16
    N_A = 16
    N_INDEP = 2
    N_SHARED = 2
    N_STEPS = 1  # 2
    MASK_TYPE = "sparsemax"
    GAMMA = 1.5
    BS = 512
    MAX_EPOCH = 21  # 20
    PRETRAIN = True
    X = data[FEATURES].values
    y = data["target"].values
    X_test = test_data[FEATURES].values
    if PRETRAIN:
        pretrain_params = dict(
            n_d=N_D, n_a=N_A, n_steps=N_STEPS,  # 0.2,
            n_independent=N_INDEP, n_shared=N_SHARED,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            gamma=GAMMA,
            lambda_sparse=0.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            mask_type=MASK_TYPE,
            scheduler_params=dict(
                mode="min",
                patience=3,
                min_lr=1e-5,
                factor=0.5,
            ),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1,
        )
        pretrainer = TabNetPretrainer(**pretrain_params)
        # Pretrain on the unlabeled test features, monitored on train.
        pretrainer.fit(
            X_train=X_test,
            eval_set=[X],
            max_epochs=MAX_EPOCH,
            patience=25,
            batch_size=BS,
            virtual_batch_size=BS,  # 128,
            num_workers=0,
            drop_last=True,
            pretraining_ratio=0.5  # The bigger your pretraining_ratio the harder it is to reconstruct
        )
    # Training the Model
    # tabular_mode.fit(train=train, validation=val)
    # # Evaluating the Model
    # # #Loss and Metrics on New Data
    # result = tabular_mode.evaluate(test)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)
    BS = 2048
    MAX_EPOCH = 20
    # skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
    data['oof_preds'] = np.nan
    for fold_nb, (train_index, valid_index) in enumerate(cv.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        tabnet_params = dict(
            n_d=N_D, n_a=N_A, n_steps=N_STEPS, gamma=GAMMA,
            n_independent=N_INDEP, n_shared=N_SHARED,
            lambda_sparse=1e-5,
            seed=0,
            clip_value=2,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            mask_type=MASK_TYPE,
            device_name='auto',
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2, weight_decay=1e-5),
            scheduler_params=dict(
                max_lr=5e-2,
                steps_per_epoch=int(X_train.shape[0] / BS),
                epochs=MAX_EPOCH,
                # final_div_factor=100,
                is_batch_level=True),
            scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
            # scheduler_params=dict(mode='max', factor=0.5, patience=5, is_batch_level=False,),
            # scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1)
        # Defining TabNet model
        model = TabNetClassifier(**tabnet_params)
        model.fit(
            X_train=X_train, y_train=y_train,
            from_unsupervised=pretrainer if PRETRAIN else None,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=["train", "valid"],
            eval_metric=["auc"],
            batch_size=BS,
            virtual_batch_size=256,
            max_epochs=MAX_EPOCH,
            drop_last=True,
            pin_memory=True,
            patience=10,
        )
        val_preds = model.predict_proba(X_valid)[:, -1]
        print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))
        # Positional write avoids pandas chained assignment on a copy.
        data.iloc[valid_index, data.columns.get_loc('oof_preds')] = val_preds
        test_preds = model.predict_proba(X_test)[:, -1]
        df_sub[f"fold_{fold_nb+1}"] = test_preds
    # Average the per-fold test predictions into the submission target.
    df_sub["target"] = df_sub.filter(like="fold_").mean(axis=1).values
    df_sub.to_csv("Analysis/submission_5_tabnet.csv", index=False)
    df_sub = pd.read_csv("Analysis/submission_5_tabnet.csv")
    # df_sub.target = df_sub.target.map(lambda x: 0 if x<=0.5 else 1)
    df_sub.loc[:, ["id", "target"]].to_csv("Analysis/submission_5_2_tabnet.csv", index=False)
class TabNetCv1(TabNetBase):
    """TabNet agent that classifies a match into loss/draw/win (0/1/2).

    BUG FIX: in act_modify_data the third loop iterated
    `range(len(self.y_valid))` while reading MATCH.test_* and writing
    self.y_test — a copy-paste error that truncated or overran the test
    labels whenever the valid/test splits differ in length. It now
    iterates `range(len(self.y_test))`. The three identical mapping
    loops were also deduplicated into a static helper.
    """

    def act_init_ai(self, is_test=False):
        """Create the CUDA TabNet classifier (or invoke the test probe)."""
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            self.ai = TabNetClassifier(n_steps=10,
                                       input_dim=MATCH.count_cols * MATCH.count_players,
                                       cat_dims=self.cat_dims,
                                       cat_emb_dim=self.cat_emb_dim,
                                       cat_idxs=self.cat_idxs,
                                       device_name='cuda')

    @staticmethod
    def _result_class(plus, minus):
        """Map a (plus, minus) score pair to a class: 0=loss, 1=draw, 2=win."""
        diff = plus - minus
        if diff < 0:
            return 0
        if diff == 0:
            return 1
        return 2

    def act_modify_data(self, is_test=False):
        """Rewrite y_train/y_valid/y_test labels as 3-way result classes."""
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            for i in range(len(self.y_train)):
                self.y_train[i] = self._result_class(MATCH.train_plus[i], MATCH.train_minus[i])
            for i in range(len(self.y_valid)):
                self.y_valid[i] = self._result_class(MATCH.valid_plus[i], MATCH.valid_minus[i])
            # BUG FIX: was range(len(self.y_valid)).
            for i in range(len(self.y_test)):
                self.y_test[i] = self._result_class(MATCH.test_plus[i], MATCH.test_minus[i])

    def load_model(self):
        """Load the persisted TabNet weights from <save_name>.zip."""
        print("Loading Model")
        print(self.save_name + ".zip")
        self.ai.load_model(self.save_name + ".zip")

    def get_result(self, my_team: list, your_team: list):
        """Return the predicted win probability (class 2) for the flattened rosters."""
        # tmp
        '''
        MATCH = self.env.match_loader
        self.act_register_data(data=MATCH.act_get(is_flat=True))
        self.act_modify_data()
        self.ai = TabNetClassifier(
            n_steps=10,
            input_dim=MATCH.count_cols * MATCH.count_players,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            cat_idxs=self.cat_idxs,
            device_name='cuda'
        )
        self.ai.fit(
            X_train=self.X_train, y_train=self.y_train,
            X_valid=self.X_valid, y_valid=self.y_valid,
            max_epochs=self.epochs,
            patience=500, batch_size=512, drop_last=False
        )
        '''
        try:
            # Feature layout: team flag (0=mine, 1=opponent) followed by
            # that player's features, for every player on both teams.
            x = list()
            for player in my_team:
                x.append(0)
                x.extend(player)
            for player in your_team:
                x.append(1)
                x.extend(player)
            x = np.array(x)
            x = np.array([x])
            print(x.shape)
            # predictions = self.ai.predict(x)
            probability = self.ai.predict_proba(x)
            print(probability)
            return probability[0][2]
        except Exception as e:
            # Best-effort: log and fall through (returns None on failure).
            print(e)
def make_tabnet_oof_prediction(train, y, test, features, categorical_features='auto', folds=10):
    """Pretrain a TabNet encoder, then train one classifier per CV fold.

    Parameters
    ----------
    train, test : pd.DataFrame
        Train / test frames; only the `features` columns are used.
    y : array-like
        Binary target aligned with `train`.
    features : list[str]
        Feature column names.
    categorical_features : unused here; kept for interface compatibility.
    folds : int
        Number of StratifiedKFold splits.

    Returns
    -------
    (y_oof, test_preds, fi)
        Out-of-fold validation probabilities, fold-averaged test
        probabilities, and a feature-importance DataFrame with per-fold
        columns plus a mean `importance` column.
    """
    #################### MLFLOW ###########################
    import mlflow
    HOST = "http://localhost"
    mlflow.set_tracking_uri(HOST + ":6006/")
    mlflow.start_run()
    #################### MLFLOW ###########################

    x_train = train[features]
    x_test = test[features]

    # Self-supervised pretraining on the raw train matrix, monitored on test.
    # NOTE(review): `unsupervised_model` is assumed to be a module-level
    # TabNetPretrainer defined outside this function — confirm.
    unsupervised_model.fit(
        X_train=x_train.values,  # .values exposes the underlying np.array
        eval_set=[x_test.values],
        max_epochs=1000,
        patience=50,
        batch_size=2048,
        virtual_batch_size=128,
        drop_last=False,
        pretraining_ratio=0.8,
    )

    # Classifier hyper-parameters, shared by every fold and logged to MLflow.
    # (BUG FIX: the original logged an undefined name `model_params`.)
    model_params = dict(
        n_d=64,
        n_a=64,
        n_steps=5,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size": 20, "gamma": 0.95},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        lambda_sparse=1e-4,
        device_name='auto',
    )

    # Fold-averaged test predictions.
    test_preds = np.zeros((x_test.shape[0]))
    # Out-of-fold validation predictions.
    y_oof = np.zeros((x_train.shape[0]))
    # Mean validation score across folds.
    score = 0
    # Per-fold feature importances.
    fi = pd.DataFrame()
    fi['feature'] = features

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split train data into this fold's train / validation parts.
        x_tr, x_val = np.array(x_train.loc[tr_idx, features]), np.array(x_train.loc[val_idx, features])
        y_tr, y_val = np.array(y[tr_idx]), np.array(y[val_idx])

        # BUG FIX: build a fresh classifier per fold. The original created a
        # single instance before the loop and re-fit it, warm-starting every
        # fold after the first on the previous folds' weights (CV leakage).
        clf = TabNetClassifier(**model_params)
        clf.fit(
            x_tr, y_tr,
            eval_set=[(x_val, y_val)],
            max_epochs=1000,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            # BUG FIX: actually use the pretrained encoder — the original fit
            # the pretrainer but never passed it to the classifier.
            from_unsupervised=unsupervised_model,
        )
        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # Predict the validation fold and store it at its OOF positions.
        val_preds = clf.predict_proba(x_val)[:, 1]
        y_oof[val_idx] = val_preds

        # Report this fold's validation score.
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)

        # Accumulate the fold-averaged validation score.
        score += roc_auc_score(y_val, val_preds) / folds
        # Predict test data and average over folds.
        test_preds += clf.predict_proba(x_test.values)[:, 1] / folds
        # Store this fold's feature importances.
        fi[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")                      # mean per-fold score
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}")       # out-of-fold score

    #################### MLFLOW ###########################
    mlflow.log_param("folds", folds)
    for k, v in model_params.items():
        mlflow.log_param(k, v)
    mlflow.log_metric("Mean AUC", score)
    mlflow.log_metric("OOF AUC", roc_auc_score(y, y_oof))
    mlflow.end_run()
    #################### MLFLOW ###########################

    # Average the per-fold importances into a single column.
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi
def fit(self, x_train, y_train, kf_splits=5, tabnet_type=None):
    """Pretrain and train one TabNet model per K-fold split.

    Parameters
    ----------
    x_train, y_train : np.ndarray
        Full training features / targets, indexed positionally per fold.
    kf_splits : int
        Number of KFold splits (no shuffling, so folds are reproducible).
    tabnet_type : str | None
        None selects the default preset, 'TabNet-S' the small preset;
        anything else aborts the process.

    Returns
    -------
    (scores, feature_importances)
        Per-fold validation scores (accuracy for classification, MSE for
        regression) and a stacked array of per-fold feature importances.
    """

    def _get_tabnet_params(tabnet_type):
        """Resolve (tabnet_params, fit_params, problem) for the preset."""
        if tabnet_type is None:
            tabnet_params = dict(
                verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
                scheduler_params=dict(max_lr=0.05,
                                      steps_per_epoch=x_train.shape[0] // 128,
                                      epochs=300),
                scheduler_fn=torch.optim.lr_scheduler.OneCycleLR)
            fit_params = dict(batch_size=1024,
                              virtual_batch_size=128,
                              eval_metric='accuracy')
        elif tabnet_type == 'TabNet-S':
            tabnet_params = dict(
                n_d=8,
                n_a=8,
                lambda_sparse=0.0001,
                momentum=0.1,
                n_steps=3,
                gamma=1.2,
                verbose=40,
                optimizer_fn=torch.optim.Adam,
                optimizer_params=dict(lr=0.01),
                scheduler_params=dict(step_size=8000, gamma=0.05),
                scheduler_fn=torch.optim.lr_scheduler.StepLR)
            fit_params = dict(batch_size=4096,
                              virtual_batch_size=256,
                              eval_metric='mse')
        else:
            print('[ERROR] Unknown tabnet_type: {}'.format(tabnet_type))
            quit()

        # --- derive problem type from the evaluation metric ---
        if fit_params['eval_metric'] in [
                'auc', 'accuracy', 'balanced_accuracy', 'logloss'
        ]:
            problem = 'classification'
        elif fit_params['eval_metric'] in ['mse', 'mae', 'rmse', 'rmsle']:
            problem = 'regression'
        else:
            # BUG FIX: the original fell through here with `problem` unbound,
            # which raised UnboundLocalError at the return statement below.
            print('[ERROR] Unknown eval_metric: {}'.format(
                fit_params['eval_metric']))
            quit()
        return tabnet_params, fit_params, problem

    kf = KFold(n_splits=kf_splits, shuffle=False)
    scores = []
    self.tabnet_models = []
    tabnet_params, fit_params, problem = _get_tabnet_params(tabnet_type)

    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # One fresh pretrainer + estimator per fold.
        if problem == 'classification':
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetClassifier(**tabnet_params)
        elif problem == 'regression':
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = TabNetRegressor(**tabnet_params)
        else:
            # BUG FIX: the original called the misspelled `pring` here,
            # which would raise NameError instead of reporting the error.
            print('[ERROR] Unknown problem: {}'.format(problem))
            quit()

        x_tr = x_train[train_index]
        x_val = x_train[val_index]
        y_tr = y_train[train_index]
        y_val = y_train[val_index]

        # Self-supervised pretraining on this fold's train split.
        unsupervised_model.fit(x_tr,
                               eval_set=[x_val],
                               patience=300,
                               max_epochs=5000,
                               pretraining_ratio=0.8)
        # Supervised training, warm-started from the pretrained encoder.
        tabnet_model.fit(x_tr,
                         y_tr,
                         eval_set=[(x_val, y_val)],
                         eval_metric=[fit_params['eval_metric']],
                         batch_size=fit_params['batch_size'],
                         virtual_batch_size=fit_params['virtual_batch_size'],
                         patience=300,
                         max_epochs=5000,
                         from_unsupervised=unsupervised_model)
        self.tabnet_models.append(tabnet_model)

        # Score this fold on the held-out split.
        prediction = tabnet_model.predict(x_val)
        if problem == 'classification':
            scores.append(accuracy_score(y_val, prediction))
        elif problem == 'regression':
            scores.append(mean_squared_error(y_val, prediction))
        else:
            # BUG FIX: the original called the misspelled `pring` here too.
            print('[ERROR] Unknown problem: {}'.format(problem))
            quit()

        # Stack per-fold importances into a (fold, feature) matrix.
        if i == 0:
            feature_importances = tabnet_model.feature_importances_.copy()
        else:
            feature_importances = np.vstack(
                (feature_importances, tabnet_model.feature_importances_))

    print(scores)
    print(np.mean(scores))

    return scores, feature_importances