import pathlib
import pickle

import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

# current_dataset, current_dataset_name, cleaning_text and LOAD_MODEL are assumed
# to be defined elsewhere in the module.


def build_tabnet():
    model_file_name = 'tabnet_model_{}'.format(current_dataset_name)

    df = current_dataset.copy()
    cleaning_text(df)
    X = df['clean_content']
    y = df['emotion']

    # tokenize the data
    tok = Tokenizer(num_words=1000, oov_token='<UNK>')

    # split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, stratify=y, random_state=1)
    X_test_save = X_test

    # fit the tokenizer on the training texts only, then vectorize both splits
    # (fitting separately on train and test would give inconsistent vocabularies)
    tok.fit_on_texts(X_train)
    X_train = tok.texts_to_matrix(X_train, mode='tfidf')
    X_test = tok.texts_to_matrix(X_test, mode='tfidf')

    # build the model, fit and predict (pytorch-tabnet expects numpy arrays)
    if LOAD_MODEL and pathlib.Path(model_file_name).exists():
        model = pickle.load(open(model_file_name, 'rb'))
    else:
        model = TabNetClassifier()
        model.fit(X_train=X_train, y_train=y_train.values,
                  eval_set=[(X_train, y_train.values), (X_test, y_test.values)],
                  eval_name=['train', 'valid'],
                  eval_metric=['accuracy', 'balanced_accuracy', 'logloss'])

    preds_mapper = {idx: class_name for idx, class_name in enumerate(model.classes_)}
    preds = model.predict_proba(X_test)
    y_pred_proba = np.vectorize(preds_mapper.get)(np.argmax(preds, axis=1))
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)

    pickle.dump(model, open(model_file_name, 'wb'))
    # model.save_model(model_file_name)

    return model, y_test, y_pred, test_acc
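# Illustrative sketch, not part of the original function: instead of pickling the estimator,
# pytorch-tabnet's own save_model / load_model pair writes a portable .zip archive.
# Assumes build_tabnet() is defined as above and the file prefix matches the one used inside it.
model, y_test, y_pred, test_acc = build_tabnet()
saved_path = model.save_model('tabnet_model_{}'.format(current_dataset_name))  # writes '<prefix>.zip' and returns that path

loaded_model = TabNetClassifier()
loaded_model.load_model(saved_path)  # restores the network weights and model parameters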
class ModelTabNetClassifier(Model):

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        # categorical features: fill missing values and record each column's cardinality
        categorical_dims = {}
        for col in self.categorical_features:
            tr_x[col] = tr_x[col].fillna("unk")
            va_x[col] = va_x[col].fillna("unk")
            te_x[col] = te_x[col].fillna("unk")
            categorical_dims[col] = len(set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))

        cat_idxs = [i for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        cat_dims = [categorical_dims[f] for f in tr_x.columns if f in self.categorical_features]
        cat_emb_dim = [10 for _ in categorical_dims]

        # numerical features: impute with the training-set mean
        # (the original looped over every column, which would also call .mean() on the
        # string-typed categorical columns; those are skipped here)
        for col in tr_x.columns:
            if col in self.categorical_features:
                continue
            tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
            va_x[col] = va_x[col].fillna(tr_x[col].mean())
            te_x[col] = te_x[col].fillna(tr_x[col].mean())

        self.model = TabNetClassifier(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)
        # X_valid / y_valid is the pre-2.0 pytorch-tabnet API;
        # current releases expect eval_set=[(va_x.values, va_y.values)] instead
        self.model.fit(X_train=tr_x.values, y_train=tr_y.values,
                       X_valid=va_x.values, y_valid=va_y.values,
                       max_epochs=1000, patience=50,
                       batch_size=1024, virtual_batch_size=128)

    def predict(self, te_x):
        return self.model.predict_proba(te_x.values)[:, 1].reshape(-1)

    def save_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
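# Illustrative sketch, not part of the original class: pytorch-tabnet expects the columns
# listed in cat_idxs to hold integer codes, so a caller would typically label-encode the
# categorical features before invoking ModelTabNetClassifier.train(tr_x, tr_y, va_x, va_y, te_x).
# The encoder choice and fitting on the union of the three splits are assumptions.
from sklearn.preprocessing import LabelEncoder

for col in categorical_features:
    le = LabelEncoder()
    le.fit(pd.concat([tr_x[col], va_x[col], te_x[col]]).fillna("unk"))
    for frame in (tr_x, va_x, te_x):
        frame[col] = le.transform(frame[col].fillna("unk"))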
def main():
    # load the data
    data, test_data, cat_col_names, num_col_names = data_load()

    cat_dims = data[cat_col_names].nunique().to_list()
    cat_idxs = [(cat_col_names + num_col_names).index(cat_col) for cat_col in cat_col_names]
    # alternative: cat_emb_dims = np.ceil(np.log(cat_dims)).astype(int).tolist()
    cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1, a_max=50)).astype(int).tolist()

    FEATURES = cat_col_names + num_col_names
    df_sub = pd.read_csv('Data/sample_submission.csv')
    bsize = 2500 * 2

    # ########## Define the configs ############
    N_D = 16
    N_A = 16
    N_INDEP = 2
    N_SHARED = 2
    N_STEPS = 1  # 2
    MASK_TYPE = "sparsemax"
    GAMMA = 1.5
    BS = 512
    MAX_EPOCH = 21  # 20
    PRETRAIN = True

    X = data[FEATURES].values
    y = data["target"].values
    X_test = test_data[FEATURES].values

    if PRETRAIN:
        pretrain_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            gamma=GAMMA,
            lambda_sparse=0.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            mask_type=MASK_TYPE,
            scheduler_params=dict(mode="min", patience=3, min_lr=1e-5, factor=0.5),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1,
        )
        # unsupervised pretraining on the (unlabelled) test features,
        # with the training features as the eval set
        pretrainer = TabNetPretrainer(**pretrain_params)
        pretrainer.fit(
            X_train=X_test,
            eval_set=[X],
            max_epochs=MAX_EPOCH,
            patience=25,
            batch_size=BS,
            virtual_batch_size=BS,  # 128
            num_workers=0,
            drop_last=True,
            pretraining_ratio=0.5,  # the bigger the pretraining_ratio, the harder the reconstruction task
        )

    # supervised training with stratified cross-validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)
    BS = 2048
    MAX_EPOCH = 20

    data['oof_preds'] = np.nan
    for fold_nb, (train_index, valid_index) in enumerate(cv.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        tabnet_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            gamma=GAMMA,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            lambda_sparse=1e-5,
            seed=0,
            clip_value=2,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            mask_type=MASK_TYPE,
            device_name='auto',
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2, weight_decay=1e-5),
            scheduler_params=dict(
                max_lr=5e-2,
                steps_per_epoch=int(X_train.shape[0] / BS),
                epochs=MAX_EPOCH,
                is_batch_level=True),
            scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
            verbose=1)

        # define the TabNet model and fit it, warm-started from the pretrainer
        model = TabNetClassifier(**tabnet_params)
        model.fit(
            X_train=X_train,
            y_train=y_train,
            from_unsupervised=pretrainer if PRETRAIN else None,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=["train", "valid"],
            eval_metric=["auc"],
            batch_size=BS,
            virtual_batch_size=256,
            max_epochs=MAX_EPOCH,
            drop_last=True,
            pin_memory=True,
            patience=10,
        )

        val_preds = model.predict_proba(X_valid)[:, -1]
        print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))
        # store out-of-fold predictions without chained-indexing assignment
        data.iloc[valid_index, data.columns.get_loc('oof_preds')] = val_preds

        test_preds = model.predict_proba(X_test)[:, -1]
        df_sub[f"fold_{fold_nb + 1}"] = test_preds

    # average the per-fold test predictions and write the submission
    df_sub["target"] = df_sub.filter(like="fold_").mean(axis=1).values
    df_sub.to_csv("Analysis/submission_5_tabnet.csv", index=False)

    df_sub = pd.read_csv("Analysis/submission_5_tabnet.csv")
    # df_sub.target = df_sub.target.map(lambda x: 0 if x <= 0.5 else 1)
    df_sub.loc[:, ["id", "target"]].to_csv("Analysis/submission_5_2_tabnet.csv", index=False)
y_test = y_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0])]

cat_idxs = [i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [len(np.unique(X_all.iloc[:, i].values)) for i in cat_idxs]
cat_emb_dim = np.floor(np.log(cat_dims)).astype(int)

md = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, cat_emb_dim=cat_emb_dim,
                      ## optimizer_fn=torch.optim.Adam,
                      ## optimizer_params=dict(lr=2e-2),
                      ## mask_type='sparsemax',
                      n_steps=1,
                      )

%%time
# (Jupyter cell magic from the original notebook, timing the fit)
md.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'test_EVIL'],
    eval_metric=['auc'],
    max_epochs=100, patience=100,
    ## batch_size=1024, virtual_batch_size=128,
    ## weights=0,
)

y_pred = md.predict_proba(X_test)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred))
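# Illustrative guard, not part of the original notebook: np.floor(np.log(dim)) is 0 for any
# categorical feature with fewer than three levels, which would request a zero-dimensional
# embedding; clamping the derived sizes to at least 1 avoids that.
cat_emb_dim = np.maximum(np.floor(np.log(cat_dims)), 1).astype(int)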
clf.fit(  # opening of this call reconstructed; the original excerpt begins at the keyword arguments
    X_train=X_trn,
    y_train=y_trn,
    eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
    eval_name=["train", "valid"],
    eval_metric=["logloss"],
    max_epochs=max_epochs,
    patience=50,
    batch_size=128,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=loaded_pretrain,
)

# out-of-fold and test predictions for this fold
fold_preds = clf.predict_proba(X_vld).astype(np.float64)[:, 1]
_test_preds.append(clf.predict_proba(X_tst)[:, 1])
oof[vld_index] = fold_preds
scores.append(log_loss(y_vld, fold_preds))

# accumulate per-fold feature importances
importances = pd.concat(
    [
        importances,
        pd.DataFrame({"feature": feat_cols, "importance": clf.feature_importances_}),
    ],
    axis=0,
)

oof_preds.append(oof)
test_preds.append(np.mean(_test_preds, axis=0))

fig, ax = plt.subplots(figsize=(6, 18))
clf.fit(  # opening of this call reconstructed; the original excerpt begins at the keyword arguments
    X_train=X_trn,
    y_train=y_trn,
    eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
    eval_name=["train", "valid"],
    eval_metric=["logloss"],
    max_epochs=max_epochs,
    patience=25,
    batch_size=128,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=loaded_pretrain,
)

# validation predictions and log-loss for this fold
fold_preds = clf.predict_proba(X_vld).astype(np.float64)
preds[vld_index] = fold_preds[:, 1]
scores.append(log_loss(y_vld, fold_preds[:, 1]))

# accumulate per-fold feature importances
importances = pd.concat(
    [
        importances,
        pd.DataFrame({"feature": feat_cols, "importance": clf.feature_importances_}),
    ],
    axis=0,
)

fig, ax = plt.subplots(figsize=(6, 18))
sns.barplot(
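# Illustrative sketch, not recovered from the original notebook, of one way the truncated
# barplot call above could plot the fold-averaged importances; the groupby, ordering and
# axis assignments here are assumptions.
mean_imp = (
    importances.groupby("feature")["importance"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
fig, ax = plt.subplots(figsize=(6, 18))
sns.barplot(data=mean_imp, x="importance", y="feature", ax=ax)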
# we separate the categorical variables from the date variables so they can be preprocessed separately
categorical_val.remove('answer_creation_date')
categorical_val.remove('group_creation_date')
categorical_val.remove('request_creation_date')
categorical_val.remove('victim_of_violence_type')

date_columns = [
    'answer_creation_date',
    'group_creation_date',
    'request_creation_date',
]

X_test, y_test = preprocess(requests_test, categorical_val, date_columns)

# drop NaN values, otherwise we run into memory errors
X_test['granted_number_of_nights'] = y_test
X_test = X_test.dropna()
y_test = X_test['granted_number_of_nights']
X_test = X_test.drop(columns=['granted_number_of_nights'])

# preprocess the datasets for TabNet
X_test_tab, y_test_tab = preprocess_for_tabnet(X_test, y_test)

# retrieve the saved model (not working with versions < 3.7)
PATH = '../model_zoo/TabNet_model.zip'
clf = TabNetClassifier()
clf.load_model(PATH)

# run inference
start = time.time()
preds = clf.predict_proba(X_test_tab)
end = time.time()

score = competition_scorer(y_test_tab, preds)
print('time per prediction:', (end - start) / len(X_test))
print('The competition score on test data:', score)
class TabNetCv1(TabNetBase):

    def act_init_ai(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            self.ai = TabNetClassifier(n_steps=10,
                                       input_dim=MATCH.count_cols * MATCH.count_players,
                                       cat_dims=self.cat_dims,
                                       cat_emb_dim=self.cat_emb_dim,
                                       cat_idxs=self.cat_idxs,
                                       device_name='cuda')

    def act_modify_data(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            # encode each match outcome as 0 (loss), 1 (draw) or 2 (win)
            MATCH = self.env.match_loader
            for i in range(len(self.y_train)):
                result = MATCH.train_plus[i] - MATCH.train_minus[i]
                if result < 0:
                    result = 0
                elif result == 0:
                    result = 1
                else:
                    result = 2
                self.y_train[i] = result

            for i in range(len(self.y_valid)):
                result = MATCH.valid_plus[i] - MATCH.valid_minus[i]
                if result < 0:
                    result = 0
                elif result == 0:
                    result = 1
                else:
                    result = 2
                self.y_valid[i] = result

            # the original iterated over len(self.y_valid) here; the test targets
            # must be indexed by their own length
            for i in range(len(self.y_test)):
                result = MATCH.test_plus[i] - MATCH.test_minus[i]
                if result < 0:
                    result = 0
                elif result == 0:
                    result = 1
                else:
                    result = 2
                self.y_test[i] = result

    def load_model(self):
        print("Loading Model")
        print(self.save_name + ".zip")
        self.ai.load_model(self.save_name + ".zip")

    def get_result(self, my_team: list, your_team: list):
        # tmp
        '''
        MATCH = self.env.match_loader
        self.act_register_data(data=MATCH.act_get(is_flat=True))
        self.act_modify_data()
        self.ai = TabNetClassifier(
            n_steps=10,
            input_dim=MATCH.count_cols * MATCH.count_players,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            cat_idxs=self.cat_idxs,
            device_name='cuda'
        )
        self.ai.fit(
            X_train=self.X_train, y_train=self.y_train,
            X_valid=self.X_valid, y_valid=self.y_valid,
            max_epochs=self.epochs, patience=500,
            batch_size=512, drop_last=False
        )
        '''
        try:
            # build a single flat feature row: a team flag followed by each player's features
            x = list()
            for player in my_team:
                x.append(0)
                x.extend(player)
            for player in your_team:
                x.append(1)
                x.extend(player)

            x = np.array(x)
            x = np.array([x])
            print(x.shape)

            # predictions = self.ai.predict(x)
            probability = self.ai.predict_proba(x)
            print(probability)
            return probability[0][2]
        except Exception as e:
            print(e)
"gamma": 0.9 }, scheduler_fn=torch.optim.lr_scheduler.StepLR, mask_type='entmax' # "sparsemax" ) clf.fit(X_train=X_tr, y_train=y_tr, X_valid=X_val, y_valid=y_val, max_epochs=epoch, patience=epoch, batch_size=1024, virtual_batch_size=128, num_workers=0, weights=1, drop_last=False) test_pred += clf.predict_proba(X_test)[:, 1] test_pred /= 5 test_auc = roc_auc_score(y_test, test_pred) print('test auc:', test_auc) res[data_name] = test_auc print(res) with open("tabnet_result.pickle", "wb") as f: pickle.dump(res, f)