def model_train_pred(fold):
    # Relies on `self`, model_dir, df_train, etc. being available from the enclosing scope.
    model_path = os.path.join(model_dir, model_file_name + f"_FOLD{fold}.pth")
    tabnet_params = dict(
        n_d=64, n_a=128, n_steps=1, gamma=1.3,
        lambda_sparse=0, n_independent=2, n_shared=1,
        optimizer_fn=optim.Adam,
        optimizer_params=dict(lr=self.LEARNING_RATE, weight_decay=1e-5),
        mask_type="entmax",
        scheduler_params=dict(mode="min", patience=10, min_lr=1e-5, factor=0.9),
        scheduler_fn=ReduceLROnPlateau,
        verbose=10)

    x_fold_train, y_fold_train, x_fold_val, y_fold_val, df_test_x_copy, val_idx = \
        preprocess(fold, df_train, df_train_x, df_train_y, df_test_x, no_of_components)
    x_fold_train, x_fold_val, df_test_x_copy = variance_threshold(
        x_fold_train, x_fold_val, df_test_x_copy)

    ### Fit ###
    model = TabNetRegressor(**tabnet_params)
    model.fit(X_train=x_fold_train.values, y_train=y_fold_train.values,
              eval_set=[(x_fold_val.values, y_fold_val.values)],
              eval_name=["val"], eval_metric=["logits_ll"],
              max_epochs=self.EPOCHS, patience=40,
              batch_size=self.BATCH_SIZE, virtual_batch_size=32,
              num_workers=1, drop_last=False,
              loss_fn=SmoothBCEwLogits(smoothing=1e-4, pos_weight=pos_weight))

    ### Prediction ###
    oof = np.zeros(df_train_y.shape)
    valid_preds = 1 / (1 + np.exp(-model.predict(x_fold_val.values)))  # sigmoid on logits
    oof[val_idx] = valid_preds
    predictions = 1 / (1 + np.exp(-model.predict(df_test_x_copy.values)))
    model_path = model.save_model(model_path)
    return oof, predictions
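# `SmoothBCEwLogits` above is not defined in this snippet. A minimal sketch of a
# label-smoothed BCE-with-logits loss matching the call signature used here
# (smoothing + pos_weight); the exact smoothing scheme of the original may differ.
import torch.nn.functional as F
from torch import nn


class SmoothBCEwLogits(nn.Module):
    def __init__(self, smoothing=0.0, pos_weight=None):
        super().__init__()
        self.smoothing = smoothing
        self.pos_weight = pos_weight

    def forward(self, y_pred, y_true):
        # Shrink hard 0/1 targets toward 0.5 by the smoothing factor,
        # then apply the standard BCE-with-logits loss.
        y_true = y_true.float() * (1.0 - self.smoothing) + 0.5 * self.smoothing
        return F.binary_cross_entropy_with_logits(y_pred, y_true,
                                                  pos_weight=self.pos_weight)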
def crossval_and_predict(self, n_folds: int, df: pd.DataFrame,
                         df_test: pd.DataFrame, feature_col: list,
                         target_col: str, model_params: dict):
    oof = np.zeros(len(df))
    cv_preds = np.zeros(len(df_test))
    kfold = KFold(n_splits=n_folds, random_state=self.random_state, shuffle=True)
    for train_idx, valid_idx in kfold.split(df):
        X_train, y_train = (df[feature_col].values[train_idx],
                            df[target_col].values[train_idx].reshape(-1, 1))
        X_valid, y_valid = (df[feature_col].values[valid_idx],
                            df[target_col].values[valid_idx].reshape(-1, 1))
        X_test = df_test[feature_col].values

        params = self.default_params()
        params['seed'] = self.random_state
        params['n_d'] = model_params['n_d']
        params['n_a'] = model_params['n_d']
        params['gamma'] = model_params['gamma']
        params['momentum'] = model_params['momentum']
        params['n_steps'] = model_params['n_steps']
        params['n_shared'] = model_params['n_shared']
        params['n_independent'] = model_params['n_independent']
        logging.info(f'Parameters used for TabNet supervised training: {params}')

        unsupervised_model = TabNetPretrainer(**params)
        unsupervised_model.fit(X_train=X_train,
                               eval_set=[X_valid],
                               pretraining_ratio=0.5,
                               max_epochs=20)

        model = TabNetRegressor(**params)
        model.fit(X_train=X_train, y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_name=['valid'], eval_metric=['rmse'],
                  max_epochs=100, patience=10, batch_size=1024,
                  from_unsupervised=unsupervised_model)

        oof[valid_idx] = model.predict(X_valid).squeeze()
        cv_preds += model.predict(X_test).squeeze() / n_folds
        logging.info(f'Finished fold with score {rmse(y_valid, oof[valid_idx])}')

    rmse_score = rmse(df[target_col], oof)
    return rmse_score, cv_preds
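# `rmse` is referenced above but not defined in this snippet; a minimal sketch,
# assuming the usual root-mean-squared-error helper:
import numpy as np
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))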
def fit_tabnet(x_tr, y_tr, x_va, y_va, cat_feats, args):
    import torch
    from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

    cat_idxs = [x_tr.columns.get_loc(f) for f in cat_feats]
    cat_dims = x_tr[cat_feats].apply(lambda s: s.nunique()).tolist()
    cat_emb_dim = [i // 2 for i in cat_dims]

    x_tr = x_tr.values
    y_tr = y_tr.values.reshape(-1, 1)
    x_va = x_va.values
    y_va = y_va.values.reshape(-1, 1)

    params = dict(
        n_d=16, n_a=16, n_steps=3, gamma=1.5,
        n_independent=4, n_shared=4,
        cat_idxs=cat_idxs, cat_dims=cat_dims, cat_emb_dim=cat_emb_dim,
        lambda_sparse=0.0001, momentum=0.95, clip_value=2.,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=0.0005),
        # scheduler_params={"gamma": 0.95, "step_size": 500},
        scheduler_params={"gamma": 0.95},
        scheduler_fn=torch.optim.lr_scheduler.ExponentialLR,
        epsilon=1e-1)

    clf = TabNetRegressor(**params)
    fit_params = {
        'batch_size': 4096,
        'virtual_batch_size': 1024,
        'eval_set': [(x_va, y_va)],
        'max_epochs': 1000,
        'patience': 50,
    }
    clf.fit(x_tr, y_tr, **fit_params)

    tr_pred = np.clip(clf.predict(x_tr), 0, 361)
    va_pred = np.clip(clf.predict(x_va), 0, 361)
    train_score = np.sqrt(mean_squared_error(tr_pred, y_tr))
    val_score = np.sqrt(mean_squared_error(va_pred, y_va))
    return clf, train_score, val_score
class TabNetModel(BaseModel):
    def __init__(self, dataset, dataset_path, sample=False, sample_num=None,
                 split_method='6-2-2', use_category=True):
        BaseModel.__init__(self, dataset, dataset_path, sample, sample_num,
                           split_method, use_category)
        self.model_name = 'tabnet'

    def preprocessing(self, train_X, val_X, test_X, cat_cols):
        train_X, val_X, test_X = remove_unseen_category(train_X, val_X, test_X, cat_cols)
        train_X, val_X, test_X = label_encoding(train_X, val_X, test_X, cat_cols)
        # tabnet predict_proba func does not accept object ndarray
        train_X, val_X, test_X = (train_X.astype(float), val_X.astype(float),
                                  test_X.astype(float))
        # tabnet does not accept np.nan
        train_X = np.where(np.isnan(train_X), 0, train_X)
        val_X = np.where(np.isnan(val_X), 0, val_X)
        test_X = np.where(np.isnan(test_X), 0, test_X)
        return train_X, val_X, test_X

    def fit(self, train_X, train_Y, val_X, val_Y, cat_cols):
        cat_dims = [len(set(train_X[:, idx])) + 1 for idx in cat_cols]
        if self.dataset_type in {'2-class', 'm-class'}:
            self.model = TabNetClassifier(cat_idxs=cat_cols, cat_dims=cat_dims)
            self.model.fit(train_X, train_Y,
                           eval_set=[(val_X, val_Y)],
                           eval_metric=['logloss'],
                           max_epochs=200, patience=20)
        elif self.dataset_type in {'regression'}:
            self.model = TabNetRegressor(cat_idxs=cat_cols, cat_dims=cat_dims)
            self.model.fit(train_X, train_Y[:, np.newaxis],
                           eval_set=[(val_X, val_Y[:, np.newaxis])],
                           eval_metric=['rmse'],
                           max_epochs=200, patience=20)
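# `label_encoding` above is an external helper. A rough sketch of what it might do,
# assuming DataFrame inputs keyed by the names in cat_cols and that unseen categories
# were already dropped by remove_unseen_category (names and behaviour are guesses):
from sklearn.preprocessing import LabelEncoder


def label_encoding(train_X, val_X, test_X, cat_cols):
    for col in cat_cols:
        enc = LabelEncoder()
        train_X[col] = enc.fit_transform(train_X[col].astype(str))
        val_X[col] = enc.transform(val_X[col].astype(str))
        test_X[col] = enc.transform(test_X[col].astype(str))
    return train_X, val_X, test_X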
class ModelTabNetRegressor(Model):
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        categorical_dims = {}
        for col in self.categorical_features:
            tr_x[col] = tr_x[col].fillna("unk")
            va_x[col] = va_x[col].fillna("unk")
            te_x[col] = te_x[col].fillna("unk")
            categorical_dims[col] = len(
                set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))

        cat_idxs = [i for i, f in enumerate(tr_x.columns)
                    if f in self.categorical_features]
        cat_dims = [categorical_dims[f] for i, f in enumerate(tr_x.columns)
                    if f in self.categorical_features]
        cat_emb_dim = [10 for _ in categorical_dims]

        for col in tr_x.columns:
            tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
            va_x[col] = va_x[col].fillna(tr_x[col].mean())
            te_x[col] = te_x[col].fillna(tr_x[col].mean())

        self.model = TabNetRegressor(cat_dims=cat_dims,
                                     cat_emb_dim=cat_emb_dim,
                                     cat_idxs=cat_idxs)
        # Note: X_valid/y_valid is the fit signature of older pytorch-tabnet releases;
        # current versions expect eval_set=[(X_valid, y_valid)] instead.
        self.model.fit(X_train=tr_x.values,
                       y_train=tr_y.values.reshape(-1, 1),
                       X_valid=va_x.values,
                       y_valid=va_y.values.reshape(-1, 1),
                       max_epochs=1000,
                       patience=50,
                       batch_size=1024,
                       virtual_batch_size=128)

    def predict(self, te_x):
        return self.model.predict(te_x.values).reshape(-1, )

    def save_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
class TabNetModel:
    def __init__(self, feat_sel=None):
        if feat_sel is not None:
            self.pipeline = Pipeline([
                ('preprocessing', MinMaxScaler()),
                ('feature_selection', SelectFromModel(feat_sel))
            ])
        else:
            self.pipeline = Pipeline([
                ('preprocessing', MinMaxScaler())
            ])
        self.feat_sel = feat_sel
        self.params = {'feat_sel': feat_sel}

    def set_params(self, **kwargs):
        tabnet_params, proc_params = {}, {}
        for key in kwargs.keys():
            if key == 'feat_sel':
                # Re-initialise this instance with the new feature selector.
                TabNetRegression.TabNetModel.__init__(self, feat_sel=kwargs[key])
            elif key.startswith(TabNetRegression.prefix):
                tabnet_params[key.replace(TabNetRegression.prefix + "__", "")] = kwargs[key]
            else:
                proc_params[key] = kwargs[key]
        self.tabnet = TabNetRegressor(**tabnet_params)
        self.pipeline.set_params(**proc_params)
        self.params = {**tabnet_params, **proc_params}
        return self

    def get_params(self, deep=False):
        return self.params

    def fit(self, Xtrain, Ytrain, Xvalid, Yvalid):
        if self.feat_sel is not None:
            self.pipeline.fit(Xtrain, Ytrain)
        else:
            self.pipeline.fit(Xtrain)
        Xtrain_scaled, Xvalid_scaled = (self.pipeline.transform(Xtrain),
                                        self.pipeline.transform(Xvalid))
        self.tabnet.fit(Xtrain_scaled, Ytrain.flatten(), Xvalid_scaled, Yvalid.flatten())

    def predict(self, X):
        return self.tabnet.predict(self.pipeline.transform(X))
def __call__(self, trial):
    df_train, df_valid = train_test_split(self.df, test_size=0.1,
                                          random_state=self.random_state)
    X_train, y_train = (df_train[self.feature_col].values,
                        df_train[self.target_col].values.reshape(-1, 1))
    X_valid, y_valid = (df_valid[self.feature_col].values,
                        df_valid[self.target_col].values.reshape(-1, 1))
    logging.info(
        f'Train/valid split: {X_train.shape[0]} for training, '
        f'{X_valid.shape[0]} for validation')

    n_d = trial.suggest_int('n_d', 8, 64)
    params = self.default_params
    params['n_d'] = n_d
    params['n_a'] = n_d
    params['seed'] = self.random_state
    params['n_steps'] = trial.suggest_int('n_steps', 3, 10)
    params['n_shared'] = trial.suggest_int('n_shared', 2, 5)
    params['n_independent'] = trial.suggest_int('n_independent', 2, 5)
    params['momentum'] = trial.suggest_float('momentum', 0.01, 0.4)
    params['gamma'] = trial.suggest_float('gamma', 1.0, 2.0)

    model = TabNetRegressor(**params)
    model.fit(X_train=X_train, y_train=y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric=['rmse'],
              max_epochs=20, patience=10, batch_size=1024)
    score = rmse(y_valid, model.predict(X_valid).squeeze())
    return score
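# A possible Optuna driver for the __call__ objective above. The class that owns
# __call__ is not shown here, so `objective_instance` stands in for an instance of it;
# the helper name `tune` and the trial budget are illustrative only.
import optuna


def tune(objective_instance, n_trials=50):
    study = optuna.create_study(direction='minimize')  # the objective returns an RMSE
    study.optimize(objective_instance, n_trials=n_trials)
    return study.best_params, study.best_value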
def make_model(X_train, y_train, X_valid, y_valid, pretrainer):
    """Builds a TabNet regressor from the stored training data."""
    tabnet_params = dict(n_d=8, n_a=8, n_steps=3, gamma=1.3,
                         n_independent=2, n_shared=2,
                         seed=SEED, lambda_sparse=1e-3,
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type="entmax",
                         scheduler_params=dict(max_lr=0.05,
                                               steps_per_epoch=int(X_train.shape[0] / 256),
                                               epochs=200,
                                               is_batch_level=True),
                         scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                         verbose=10,
                         # cat_idxs=cat_idxs,   # comment out when unsupervised
                         # cat_dims=cat_dims,   # comment out when unsupervised
                         # cat_emb_dim=1        # comment out when unsupervised
                         )
    model = TabNetRegressor(**tabnet_params)
    model.fit(X_train=X_train.values, y_train=y_train.values,
              eval_set=[(X_valid.values, y_valid.values)],
              eval_name=["valid"],
              eval_metric=["rmse"],
              max_epochs=200, patience=20,
              batch_size=4096, virtual_batch_size=256,
              num_workers=0, drop_last=False,
              from_unsupervised=pretrainer  # comment out when unsupervised
              )
    return model
def run_tabnet(df, max_epochs=20, device='cpu'):
    X, y = get_X_y_tab(df)
    X_train, X_val, X_test, y_train, y_val, y_test = split_data_tab(X, y)
    reg = TabNetRegressor(device_name=device)
    print('Running the TabNet DNN, this could take a while')
    # fit() trains in place and returns None, so return the fitted regressor itself.
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_name=['train', 'val'],
            eval_metric=['logloss'],
            max_epochs=max_epochs)
    print('Fitting the test data to the model')
    y_pred = reg.predict(X_test)
    ll = round(log_loss(y_test, y_pred), 5)
    print(f'The Log loss is {ll}')
    return reg
def run_training_tabnet(train, test, trn_idx, val_idx, feature_cols,
                        target_cols, fold, seed, filename="tabnet"):
    seed_everything(seed)

    train_ = process_data(train)
    test_ = process_data(test)
    train_df = train_.loc[trn_idx, :].reset_index(drop=True)
    valid_df = train_.loc[val_idx, :].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    model = TabNetRegressor(
        n_d=32, n_a=32, n_steps=1,
        lambda_sparse=0,
        cat_dims=[3, 2], cat_emb_dim=[1, 1], cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',
        # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)

    loss_fn = LabelSmoothing(0.001)
    # eval_metric = SmoothedLogLossMetric(0.001)
    # eval_metric_nosmoothing = SmoothedLogLossMetric(0.)

    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))

    if IS_TRAIN:
        # print("isnan", np.any(np.isnan(x_train)))
        model.fit(X_train=x_train, y_train=y_train,
                  eval_set=[(x_valid, y_valid)],
                  eval_metric=[LogLossMetric, SmoothedLogLossMetric],
                  max_epochs=200, patience=50,
                  batch_size=1024, virtual_batch_size=128,
                  num_workers=0, drop_last=False,
                  loss_fn=loss_fn)
        model.save_model(f"{MODEL_DIR}/{NB}_{filename}_SEED{seed}_FOLD{fold}")

    # --------------------- PREDICTION ---------------------
    x_test = test_[feature_cols].values
    model = TabNetRegressor(
        n_d=32, n_a=32, n_steps=1,
        lambda_sparse=0,
        cat_dims=[3, 2], cat_emb_dim=[1, 1], cat_idxs=[0, 1],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',
        # device_name=DEVICE,
        scheduler_params=dict(milestones=[100, 150], gamma=0.9),
        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
        verbose=10,
        seed=seed)
    model.load_model(f"{MODEL_DIR}/{NB}_{filename}_SEED{seed}_FOLD{fold}.model")

    valid_preds = model.predict(x_valid)
    valid_preds = torch.sigmoid(torch.as_tensor(valid_preds)).detach().cpu().numpy()
    oof[val_idx] = valid_preds

    predictions = model.predict(x_test)
    predictions = torch.sigmoid(torch.as_tensor(predictions)).detach().cpu().numpy()

    return oof, predictions
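# `LogLossMetric` / `SmoothedLogLossMetric` passed to eval_metric above are custom
# metrics not shown in this snippet. A minimal sketch of such a metric built on
# pytorch-tabnet's Metric base class (the smoothed variant would be analogous):
import numpy as np
from pytorch_tabnet.metrics import Metric


class LogLossMetric(Metric):
    def __init__(self):
        self._name = "logloss"
        self._maximize = False

    def __call__(self, y_true, y_score):
        # TabNetRegressor outputs raw logits here, so apply a sigmoid before
        # computing the mean binary cross entropy.
        prob = 1.0 / (1.0 + np.exp(-y_score))
        prob = np.clip(prob, 1e-15, 1 - 1e-15)
        return float(np.mean(-y_true * np.log(prob) - (1 - y_true) * np.log(1 - prob)))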
class TabNetBase(AI_Base):
    # file_path = os.getcwd() + "\\src\\AIs\\models\\TabNetv1\\"
    file_path = os.getcwd() + "\\"
    save_name = file_path + "test_model"

    def __init__(self, *args, **kwargs):
        _TPI(self, locals())
        super(TabNetBase, self).__init__(*args, **kwargs)
        ACT = self.env.act
        MATCH = self.env.match_loader

        self.X_train, self.X_valid, self.X_test = None, None, None
        self.y_train, self.y_valid, self.y_test = None, None, None
        self.cat_idxs, self.cat_dims, self.cat_emb_dim = MATCH.get_categorical()
        self.ai = None

        self._scenario_tactics = None
        self._scenario_matches = None
        self._scenario_learn_from_file = list([[
            1,  # [self.epochs,
            [
                1,  # [len(MATCH),
                [
                    1,
                    (self.act_register_data, dict(data=MATCH.act_get(is_flat=True))),
                    self.act_modify_data,
                    self.act_init_ai,
                    # self.act_load_game,
                    self.act_run_ai_with_learn,
                    # self.act_test
                ]
            ],
        ]])
        self.set_mode(self.mode)

    def act_register_data(self, data, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            self.X_train = np.array(self.env.match_loader.train_players)
            self.y_train = np.array(self.env.match_loader.train_plus)
            self.X_valid = np.array(self.env.match_loader.valid_players)
            self.y_valid = np.array(self.env.match_loader.valid_plus)
            self.X_test = np.array(self.env.match_loader.test_players)
            self.y_test = np.array(self.env.match_loader.test_plus)

    def act_init_ai(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            self.ai = TabNetRegressor(n_steps=10,
                                      input_dim=MATCH.count_cols * MATCH.count_players,
                                      cat_dims=self.cat_dims,
                                      cat_emb_dim=self.cat_emb_dim,
                                      cat_idxs=self.cat_idxs)

    def act_modify_data(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            pass

    def act_load_game(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            save = self.save_name + ".zip"
            if os.path.isfile(save):
                print("Load Network")
                self.ai.load_model(save)

    def act_test(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            predictions = self.ai.predict(self.X_test)
            y_true = self.y_test
            test_score = mean_squared_error(y_pred=predictions, y_true=y_true)
            # np.savetxt("predict.txt", predictions, delimiter=',', fmt='%d')
            # np.savetxt("true.txt", y_true, delimiter=',', fmt='%d')
            print(test_score)

    def act_run_ai_with_learn(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            self.ai.fit(X_train=self.X_train, y_train=self.y_train,
                        X_valid=self.X_valid, y_valid=self.y_valid,
                        max_epochs=self.epochs, patience=500,
                        batch_size=512, drop_last=False)
            # self.ai.save_model(self.save_name)

    def act_save_model(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            print(self.save_name)
            self.ai.save_model(self.save_name)
print("FOLDS: ", fold_nb + 1, 'seed:', tabnet_params['seed']) print('*' * 60) X_train, y_train = train.values[train_idx, :], target.values[ train_idx, :] X_val, y_val = train.values[val_idx, :], target.values[val_idx, :] ### Model ### model = TabNetRegressor(**tabnet_params) ### Fit ### model.fit(X_train=X_train, y_train=y_train, eval_set=[(X_val, y_val)], eval_name=["val"], eval_metric=["logits_ll"], max_epochs=MAX_EPOCH, patience=20, batch_size=1024, virtual_batch_size=32, num_workers=1, drop_last=False, loss_fn=SmoothBCEwLogits(smoothing=5e-5)) print('-' * 60) ### Predict on validation ### preds_val = model.predict(X_val) # Apply sigmoid to the predictions preds = 1 / (1 + np.exp(-preds_val)) score = np.min(model.history["val_logits_ll"]) saving_path_name = 'TabNet_seed_' + str( tabnet_params['seed']) + '_fold_' + str(fold_nb + 1) saved_filepath = model.save_model(saving_path_name)
def fit(
    self, x_train: AoD, y_train: AoS, x_valid: AoD, y_valid: AoS,
    config: dict, **kwargs
) -> Tuple[TabNetModel, dict]:
    model_params = config["model"]["model_params"]
    train_params = config["model"]["train_params"]
    categorical_cols = config["categorical_cols"]
    self.config["categorical_cols"] = categorical_cols

    categorical_dims = {}
    for col in categorical_cols:
        x_train[col] = x_train[col].cat.add_categories("Unknown")
        x_train[col] = x_train[col].fillna("Unknown")
        x_train[col] = x_train[col].cat.codes
        x_valid[col] = x_valid[col].cat.add_categories("Unknown")
        x_valid[col] = x_valid[col].fillna("Unknown")
        x_valid[col] = x_valid[col].cat.codes
        categorical_dims[col] = len(
            set(x_train[col].values) | set(x_valid[col].values))

    cat_idxs = [i for i, f in enumerate(x_train.columns) if f in categorical_cols]
    cat_dims = [
        categorical_dims[f] for i, f in enumerate(x_train.columns)
        if f in categorical_cols
    ]
    cat_emb_dim = [10 for _ in categorical_dims]

    numerical_cols = [col for col in x_train.columns if col not in categorical_cols]
    for col in numerical_cols:
        x_train[col] = x_train[col].fillna(x_train[col].mean())
        x_valid[col] = x_valid[col].fillna(x_train[col].mean())

    mode = config["model"]["mode"]
    self.mode = mode

    if mode == "regression":
        model = TabNetRegressor(
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            cat_idxs=cat_idxs,
            **model_params,
        )
    else:
        model = TabNetClassifier(
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            cat_idxs=cat_idxs,
            **model_params,
        )

    model.fit(
        X_train=x_train.values,
        y_train=y_train.reshape(-1, 1),
        X_valid=x_valid.values,
        y_valid=y_valid.reshape(-1, 1),
        **train_params,
    )
    best_score = {"valid_score": model.losses_valid}
    return model, best_score
def run(try_num, config):
    args = get_args()
    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)

    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-02-tabnet-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat([dae_features.iloc[:500],
                                  dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(dict(
            n_folds=3,
            seeds=[222],
            n_epochs=3,
            batch_size=128,
        ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features, dae_features)
    features_columns = [col for col in train_features.columns
                        if col not in ['sig_id', 'cp_type', 'cp_time', 'cp_dose',
                                       'cp_type_ctl_vehicle', 'cp_type_trt_cp']]
    train_features = train_features[features_columns]
    test_features = test_features[features_columns]

    smooth_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42, shuffle=True)
    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = []

    for seed_index, seed in enumerate(config.seeds):
        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(kfold.split(
            train_targets[target_columns].values,
            train_targets[target_columns].values
        )):
            print(f'Train fold {fold_index + 1}', flush=True)
            x_train = train_features.loc[train_indices, features_columns].values
            y_train = train_targets.loc[train_indices, target_columns].values
            x_val = train_features.loc[val_indices, features_columns].values
            y_val = train_targets.loc[val_indices, target_columns].values

            weights_path = f'{model_dir}/weights-{seed}-{fold_index}.pt'
            tabnet_conf = dict(
                seed=seed,
                optimizer_fn=optim.Adam,
                scheduler_fn=optim.lr_scheduler.ReduceLROnPlateau,
                n_d=32, n_a=32, n_steps=1, gamma=1.3,
                lambda_sparse=0, momentum=0.02,
                optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
                mask_type="entmax",
                verbose=10,
                n_independent=1, n_shared=1,
            )

            if args.only_pred:
                print('Skip training', flush=True)
            else:
                model = TabNetRegressor(**tabnet_conf)
                model.fit(
                    X_train=x_train,
                    y_train=y_train,
                    eval_set=[(x_val, y_val)],
                    eval_name=['val'],
                    eval_metric=['logits_ll'],
                    max_epochs=config.n_epochs,
                    patience=20,
                    batch_size=1024,
                    virtual_batch_size=32,
                    num_workers=1,
                    drop_last=True,
                    loss_fn=smooth_loss_function
                )
                model.save_model(weights_path)
                print('Save weights to: ', weights_path, flush=True)

            model = TabNetRegressor(**tabnet_conf)
            model.load_model(f'{weights_path}.zip')

            val_preds = sigmoid(model.predict(x_val))
            score = mean_log_loss(y_val, val_preds, n_targets)
            print(f'fold_index {fold_index} - val_loss: {score:5.5f}', flush=True)
            oof_preds[val_indices, seed_index, :] = val_preds

            preds = sigmoid(model.predict(test_features.values))
            test_preds.append(preds)

        score = mean_log_loss(train_targets[target_columns].values,
                              oof_preds[:, seed_index, :], n_targets)
        print(f'Seed {seed} - val_loss: {score:5.5f}', flush=True)

    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets[target_columns].values, oof_preds, n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    oof_pred_df = train_targets.copy()
    oof_pred_df.loc[:, target_columns] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = np.mean(test_preds, axis=0)
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
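# `sigmoid` and `mean_log_loss` used in run() above are defined elsewhere; a minimal
# sketch consistent with how they are called (column-wise log loss averaged over
# the n_targets MoA targets):
import numpy as np
from sklearn.metrics import log_loss


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def mean_log_loss(y_true, y_pred, n_targets):
    losses = [log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
              for i in range(n_targets)]
    return np.mean(losses)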
def train_tabnet(x_train, y_train, x_test, submission, feature_cols,
                 target_cols, seeds, nfolds, save_path):
    cfg_fe = Config_FeatureEngineer()
    seed_everything(seed_value=cfg_fe.seed)
    cfg_tabnet = Config_TabNet()

    test_cv_preds = []
    oof_preds = []
    scores = []

    for seed in seeds:
        kfold_col = f'kfold_{seed}'
        print("seed: {}".format(seed))
        print('*' * 60)
        for fold in range(nfolds):
            oof_preds_fold = y_train.copy()
            oof_preds_fold.iloc[:, :] = 0

            print('*' * 60)
            print("FOLD: {}".format(fold + 1))
            print('*' * 60)

            trn_idx = x_train[x_train[kfold_col] != fold].index
            val_idx = x_train[x_train[kfold_col] == fold].index
            train_df = x_train[x_train[kfold_col] != fold].reset_index(drop=True)
            valid_df = x_train[x_train[kfold_col] == fold].reset_index(drop=True)

            x_tr, y_tr = train_df[feature_cols].values, train_df[target_cols].values
            x_val, y_val = valid_df[feature_cols].values, valid_df[target_cols].values

            # tabnet model
            model = TabNetRegressor(
                n_d=cfg_tabnet.n_d,
                n_a=cfg_tabnet.n_a,
                n_steps=cfg_tabnet.n_steps,
                n_independent=cfg_tabnet.n_independent,
                n_shared=cfg_tabnet.n_shared,
                gamma=cfg_tabnet.gamma,
                lambda_sparse=cfg_tabnet.lambda_sparse,
                optimizer_fn=cfg_tabnet.optimizer_fn,
                optimizer_params=cfg_tabnet.optimizer_params,
                mask_type=cfg_tabnet.mask_type,
                scheduler_params=cfg_tabnet.scheduler_params,
                scheduler_fn=cfg_tabnet.scheduler_fn,
                seed=seed,
                verbose=cfg_tabnet.verbose)

            # fit model
            model.fit(
                X_train=x_tr,
                y_train=y_tr,
                eval_set=[(x_val, y_val)],
                eval_name=["val"],
                eval_metric=["logits_ll"],
                max_epochs=cfg_tabnet.max_epochs,
                patience=cfg_tabnet.fit_patience,
                batch_size=cfg_tabnet.batch_size,
                virtual_batch_size=cfg_tabnet.virtual_batch_size,
                num_workers=1,
                drop_last=False,
                # Use label-smoothed binary cross entropy: the targets are binary
                # labels even though a TabNetRegressor is used.
                loss_fn=BCEwLogitsSmooth(smooth=cfg_tabnet.labelsmooth_rate))
            print('-' * 60)

            # save model
            model.save_model(os.path.join(save_path, f"TabNet_seed{seed}_FOLD{fold}"))
            print('*' * 60)

            # Predict on validation
            preds_val = model.predict(x_val)
            # Apply sigmoid to the predictions
            preds = 1 / (1 + np.exp(-preds_val))
            score = np.min(model.history["val_logits_ll"])
            oof_preds.append(preds)
            scores.append(score)

            # Save OOF for CV
            preds_tr = model.predict(x_train[feature_cols].values)
            preds = 1 / (1 + np.exp(-preds_tr))
            oof_preds_fold.loc[:, target_cols] = preds
            oof_preds_fold.to_csv(
                path_or_buf=f"./TabNet_oof_preds_seed{seed}_FOLD{fold}.csv",
                sep=',', index=False)

            # Predict on test
            preds_test = model.predict(x_test[feature_cols].values)
            preds_test = 1 / (1 + np.exp(-preds_test))
            test_cv_preds.append(preds_test)
            test_cv_preds_fold = pd.DataFrame(preds_test, columns=target_cols)
            test_cv_preds_fold["sig_id"] = x_test["sig_id"]
            test_cv_preds_fold.to_csv(
                path_or_buf=f"./TabNet_test_preds_seed{seed}_FOLD{fold}.csv",
                sep=',', index=False)

    oof_preds_all = np.concatenate(oof_preds)
    test_preds_all = np.stack(test_cv_preds)
    print("Averaged Best Score for CVs is: {}".format(np.mean(scores)))
    return test_preds_all
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-1),
    scheduler_params={
        "step_size": 10,
        "gamma": 0.9
    },  # how to use learning rate scheduler
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="sparsemax",
)

clf.fit(
    X_train=X_trn,
    y_train=y_trn,
    eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
    eval_name=["train", "valid"],
    eval_metric=["rmse"],
    max_epochs=2000,
    patience=50,
    batch_size=128,
    virtual_batch_size=128,
    num_workers=0,
)

fold_preds = clf.predict(X_vld).astype(np.float64)[:, 0]
_test_preds.append(clf.predict(X_tst)[:, 0])
oof[vld_index] = fold_preds
scores.append(np.sqrt(mean_squared_error(y_vld, fold_preds)))

importances = pd.concat(
    [
        importances,
        pd.DataFrame({
            "feature": feat_cols,
def objective(trial):
    SEED = 25  # [20,21,22]

    # all hyperparameters here
    # mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_d = trial.suggest_int("n_d", 8, 32, step=8)
    n_a = trial.suggest_int("n_a", 32, 64, step=8)
    # n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    # n_shared = trial.suggest_int("n_shared", 1, 2)
    # n_independent = trial.suggest_int("n_independent", 1, 2, step=1)
    clip_value = trial.suggest_int("clip_value", 1, 2, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.6, step=0.2)
    # lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    # batch_size = trial.suggest_int("batch_size", 512, 1024, step=256)
    # momentum = trial.suggest_float("momentum", 0.02, 0.1, step=0.02)
    # factor = trial.suggest_float("factor", 0.5, 0.9, step=0.1)

    # Skip duplicated parameter combinations: return the previous value without re-evaluating.
    for t in trial.study.trials:
        if t.state != optuna.structs.TrialState.COMPLETE:
            continue
        if t.params == trial.params:
            return t.value

    tabnet_params = dict(n_d=n_d,
                         n_a=n_a,
                         n_steps=1,
                         gamma=gamma,
                         n_shared=1,
                         n_independent=1,
                         lambda_sparse=0,
                         # momentum=momentum,
                         clip_value=clip_value,
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type="entmax",
                         scheduler_params=dict(mode="min", patience=5,
                                               min_lr=1e-5, factor=0.5),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=10,
                         seed=SEED)
    print(m_, 'params:', tabnet_params)

    scores_auc_all = []
    test_cv_preds = []

    NB_SPLITS = 7
    mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)

    oof_preds = []
    oof_targets = []
    scores = []
    scores_auc = []
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, target)):
        print(b_, "FOLDS: ", r_, fold_nb + 1, y_, 'seed:', tabnet_params['seed'])
        print(g_, '*' * 60, c_)

        X_train, y_train = train.values[train_idx, :], target.values[train_idx, :]
        X_val, y_val = train.values[val_idx, :], target.values[val_idx, :]

        ### Model ###
        model = TabNetRegressor(**tabnet_params)

        ### Fit ###
        model.fit(
            X_train=X_train,
            y_train=y_train,
            eval_set=[(X_val, y_val)],
            eval_name=["val"],
            eval_metric=["logits_ll"],
            max_epochs=MAX_EPOCH,
            patience=20,
            batch_size=512,         # 1024
            virtual_batch_size=64,  # 32
            num_workers=1,
            drop_last=False,
            # Use binary cross entropy because this is not a regression problem
            loss_fn=SmoothBCEwLogits(smoothing=0.0005))
        print(y_, '-' * 60)

        ### Predict on validation ###
        preds_val = model.predict(X_val)
        # Apply sigmoid to the predictions
        preds = 1 / (1 + np.exp(-preds_val))
        score = np.min(model.history["val_logits_ll"])

        ### Save OOF for CV ###
        oof_preds.append(preds_val)
        oof_targets.append(y_val)
        scores.append(score)

        ### Predict on test ###
        # preds_test = model.predict(X_test)
        # test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    # oof_preds_all = np.concatenate(oof_preds)
    # oof_targets_all = np.concatenate(oof_targets)
    # test_preds_all = np.stack(test_cv_preds)
    return np.mean(scores)
for seed in [0, 1]:
    print('Seed', seed)
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, train_targets_scored)):
        print("FOLDS : ", fold_nb)

        ## model
        X_train, y_train = train.values[train_idx, :], train_targets_scored.values[train_idx, :]
        X_val, y_val = train.values[val_idx, :], train_targets_scored.values[val_idx, :]
        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name=["val"],
                  eval_metric=["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=20,
                  batch_size=1024,
                  virtual_batch_size=128,
                  num_workers=1,
                  drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        preds_val = model.predict(X_val)
        # Apply sigmoid to the predictions
        preds = 1 / (1 + np.exp(-preds_val))
        score = np.min(model.history["val_logits_ll"])

        # name = cfg.save_name + f"_fold{fold_nb}"
        # model.save_model(name)

        ## save oof to compute the CV later
        oof_preds.append(preds_val)
        oof_targets.append(y_val)
        scores.append(score)
class MyTabNetRegressorModel(BaseModel):
    """
    Parameters
    ----------
    ref: https://dreamquark-ai.github.io/tabnet/generated_docs/README.html#model-parameters

    model_params:
        n_d: default=8 (range 8 to 64)
        n_a: default=8
        n_steps: default=3 (range 3 to 10)
        gamma: default=1.3 (range 1.0 to 2.0)
        n_independent: default=2 (range 1 to 5)
        n_shared: default=2 (range 1 to 5)
        lambda_sparse: default=1e-3
        optimizer_fn: default=Adam
        optimizer_params: default=dict(lr=2e-2, weight_decay=None)
        mask_type: default=sparsemax or entmax
        scheduler_params: dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False)
        seed: default=0
        verbose=5,
        cat_dims=cat_dims, cat_idxs=cat_idx, cat_emb_dim=1

    fit_params:
        max_epochs: default=200
        patience: default=15
        loss_fn (torch.loss or list of torch.loss): defaults to mse for regression
            and cross entropy for classification
        eval_metric (list or str)
        batch_size: default=1024
        virtual_batch_size: default=128
        pretrain_ratio

    ### Example use:
    >>> nunique = train_feat_df.nunique()
    >>> types = train_feat_df.dtypes
    >>> categorical_columns = []
    >>> categorical_dims = {}
    >>> train_feat_df["is_train"] = 1
    >>> test_feat_df["is_train"] = 0
    >>> all_df = pd.concat([train_feat_df, test_feat_df])
    >>> for col in train_feat_df.drop(["is_train"], axis=1).columns:
    ...     if str(types[col]) == 'category' or nunique[col] < 200:
    ...         l_enc = LabelEncoder()
    ...         all_df[col] = l_enc.fit_transform(all_df[col].values)
    ...         all_df[col] = all_df[col].astype("category")
    ...         categorical_columns.append(col)
    ...         categorical_dims[col] = len(l_enc.classes_)
    >>> cat_idx = [i for i, f in enumerate(train_feat_df.columns.tolist())
    ...            if f in categorical_columns]
    >>> cat_dims = [categorical_dims[f] for i, f in enumerate(train_feat_df.columns.tolist())
    ...             if f in categorical_columns]
    """

    def __init__(self, model_params, fit_params: Optional[Dict]):
        self.model_params = model_params
        self.fit_params = fit_params
        if self.fit_params is None:
            self.fit_params = {}

    def build_model(self):
        self.model = TabNetRegressor(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        train_x, valid_x = train_x.values, valid_x.values
        self.model = self.build_model()
        self.model.fit(train_x, train_y,
                       eval_set=[(train_x, train_y), (valid_x, valid_y)],
                       eval_name=["train", "valid"],
                       **self.fit_params)
        return self.model

    def predict(self, est, valid_x):
        valid_x = valid_x.values
        preds = est.predict(valid_x)
        return preds
    cat_emb_dim=cat_emb_dim,
    cat_idxs=cat_idxs)

X_train_1 = X_train_1.values
X_valid_1 = X_valid_1.values
Y_train_1 = Y_train_1.values.reshape(-1, 1)
Y_valid_1 = Y_valid_1.values.reshape(-1, 1)

max_epochs = 1000

clf.fit(
    X_train=X_train_1,
    y_train=Y_train_1,
    eval_set=[(X_train_1, Y_train_1), (X_valid_1, Y_valid_1)],
    eval_name=['train', 'valid'],
    eval_metric=['mae', 'mse'],
    max_epochs=max_epochs,
    patience=50,
    # batch_size=1024,
    # virtual_batch_size=128,
    num_workers=0,
    drop_last=False)

X_test = X_test.values
preds = clf.predict(X_test)
preds[:48]

sub = pd.DataFrame(preds)
sub['1'] = sub[0] * 0.6
sub['2'] = sub[0] * 0.7
sub['3'] = sub[0] * 0.8
def run_model(self, train_df, targets, X_test):
    """
    Run model.

    Args:
        train_df (dataframe): training inputs with dimensions [n_observations, n_features]
        targets (dataframe): known binary MoA target responses for the training data
        X_test (arr): test inputs with dimensions [n_observations, n_features]

    Returns:
        arr: predicted outputs with dimensions [n_splits_kfold, n_observations, n_moa_targets]
    """
    test_cv_preds = []
    oof_preds = []
    oof_targets = []
    scores = []

    mskf = MultilabelStratifiedKFold(n_splits=self.config.NB_SPLITS,
                                     random_state=0, shuffle=True)
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_df, targets)):
        print("FOLDS: ", fold_nb + 1)
        print('*' * 60)

        X_train, y_train = train_df.values[train_idx, :], targets.values[train_idx, :]
        X_val, y_val = train_df.values[val_idx, :], targets.values[val_idx, :]

        model = TabNetRegressor(**self.tabnet_params)
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name=["val"],
                  eval_metric=["logits_ll"],
                  max_epochs=self.config.MAX_EPOCH,
                  patience=20,
                  batch_size=1024,
                  virtual_batch_size=32,
                  num_workers=1,
                  drop_last=False,
                  loss_fn=F.binary_cross_entropy_with_logits)
        print('-' * 60)

        preds_val = model.predict(X_val)
        preds = 1 / (1 + np.exp(-preds_val))  # sigmoid on logits
        score = np.min(model.history["val_logits_ll"])

        oof_preds.append(preds_val)
        oof_targets.append(y_val)
        scores.append(score)

        preds_test = model.predict(X_test)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    oof_preds_all = np.concatenate(oof_preds)
    oof_targets_all = np.concatenate(oof_targets)
    test_preds_all = np.stack(test_cv_preds)

    aucs = []
    for task_id in range(oof_preds_all.shape[1]):
        aucs.append(roc_auc_score(y_true=oof_targets_all[:, task_id],
                                  y_score=oof_preds_all[:, task_id]))
    print(f"Overall AUC: {np.mean(aucs)}")
    print(f"Average CV: {np.mean(scores)}")
    return test_preds_all