Example 1
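# Imports this snippet appears to rely on (an assumption; the original file
# defines them, plus the globals current_dataset, current_dataset_name,
# cleaning_text and LOAD_MODEL, elsewhere).
import pathlib
import pickle

import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer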
def build_tabnet():
    model_file_name = 'tabnet_model_{}'.format(current_dataset_name)

    df = current_dataset.copy()
    cleaning_text(df)

    X = df['clean_content']
    y = df['emotion']
    # tokenize the text data
    tok = Tokenizer(num_words=1000, oov_token='<UNK>')
    # the tokenizer is fitted on the training split only, after the split below
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        stratify=y,
                                                        random_state=1)
    X_test_save = X_test  # keep the raw test texts for later inspection
    # fit the tokenizer on the training texts only, then transform both splits;
    # the original fitted on the test texts as well, leaking test vocabulary
    tok.fit_on_texts(X_train)
    X_train = tok.texts_to_matrix(X_train, mode='tfidf')
    X_test = tok.texts_to_matrix(X_test, mode='tfidf')
    # X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, stratify=y)
    # build model, fit and predict
    if LOAD_MODEL and pathlib.Path(model_file_name).exists():
        model = pickle.load(open(model_file_name, 'rb'))
    else:
        model = TabNetClassifier()
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_name=['train', 'valid'],
                  eval_metric=['accuracy', 'balanced_accuracy', 'logloss'])

    preds_mapper = {
        idx: class_name
        for idx, class_name in enumerate(model.classes_)
    }
    preds = model.predict_proba(X_test)
    y_pred_proba = np.vectorize(preds_mapper.get)(np.argmax(preds, axis=1))
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    pickle.dump(model, open(model_file_name, 'wb'))
    # model.save_model(model_file_name)
    return model, y_test, y_pred, test_acc
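# A minimal usage sketch under this example's own assumptions (the globals
# listed above): train, then report the held-out accuracy.
model, y_test, y_pred, test_acc = build_tabnet()
print('test accuracy:', test_acc)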
Example 2
class ModelTabNetClassifier(Model):
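    # Thin wrapper that puts pytorch-tabnet's TabNetClassifier behind this
    # project's generic Model interface: categorical NaNs become "unk", and
    # the per-column cardinalities feed cat_dims / cat_emb_dim below.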

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):

        categorical_dims = {}
        for col in self.categorical_features:
            tr_x[col] = tr_x[col].fillna("unk")
            va_x[col] = va_x[col].fillna("unk")
            te_x[col] = te_x[col].fillna("unk")
            categorical_dims[col] = len(set(tr_x[col].values) | set(va_x[col].values) | set(te_x[col].values))

        cat_idxs = [i for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        cat_dims = [categorical_dims[f] for i, f in enumerate(tr_x.columns) if f in self.categorical_features]
        cat_emb_dim = [10 for _ in categorical_dims]

        # mean-impute the numeric columns with the *train* mean; categorical
        # columns were already filled with "unk" above, and .mean() on them
        # would fail, so they are skipped
        for col in tr_x.columns:
            if col in self.categorical_features:
                continue
            tr_x[col] = tr_x[col].fillna(tr_x[col].mean())
            va_x[col] = va_x[col].fillna(tr_x[col].mean())
            te_x[col] = te_x[col].fillna(tr_x[col].mean())

        self.model = TabNetClassifier(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)
        self.model.fit(X_train=tr_x.values, y_train=tr_y.values,
                       # X_valid/y_valid is the pre-2.0 pytorch-tabnet fit API;
                       # newer releases take eval_set=[(va_x.values, va_y.values)]
                       X_valid=va_x.values, y_valid=va_y.values,
                       max_epochs=1000,
                       patience=50,
                       batch_size=1024,
                       virtual_batch_size=128)

    def predict(self, te_x):
        return self.model.predict_proba(te_x.values)[:, 1].reshape(-1, )

    def save_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
Example 3
def main():
    # load the dataset and the categorical / numeric column metadata
    data, test_data, cat_col_names, num_col_names = data_load()
    cat_dims = data[cat_col_names].nunique().to_list()
    cat_idxs = [(cat_col_names + num_col_names).index(cat_col)
                for cat_col in cat_col_names]
    # embedding size per categorical feature (np.int was removed in NumPy 1.24+,
    # so plain int is used); an alternative heuristic, ceil(log(dims)), was
    # computed and immediately overwritten in the original
    cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1,
                                   a_max=50)).astype(int).tolist()
    FEATURES = cat_col_names + num_col_names
    df_sub = pd.read_csv('Data/sample_submission.csv')

    bsize = 2500 * 2

    # ##########Define the Configs############
    N_D = 16
    N_A = 16
    N_INDEP = 2
    N_SHARED = 2
    N_STEPS = 1  # 2
    MASK_TYPE = "sparsemax"
    GAMMA = 1.5
    BS = 512
    MAX_EPOCH = 21  # 20
    PRETRAIN = True

    X = data[FEATURES].values
    y = data["target"].values

    X_test = test_data[FEATURES].values

    if PRETRAIN:
        pretrain_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            gamma=GAMMA,
            lambda_sparse=0.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            mask_type=MASK_TYPE,
            scheduler_params=dict(
                mode="min",
                patience=3,
                min_lr=1e-5,
                factor=0.5,
            ),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1,
        )

        pretrainer = TabNetPretrainer(**pretrain_params)

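        # Unsupervised pretraining: fit the reconstruction task on the
        # unlabeled test features and monitor it on the training features.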
        pretrainer.fit(
            X_train=X_test,
            eval_set=[X],
            max_epochs=MAX_EPOCH,
            patience=25,
            batch_size=BS,
            virtual_batch_size=BS,  # 128,
            num_workers=0,
            drop_last=True,
            pretraining_ratio=0.5  # the bigger the ratio, the harder the reconstruction task
        )
    # Training the Model
    # tabular_mode.fit(train=train, validation=val)
    # # Evaluating the Model
    # # #Loss and Metrics on New Data¶
    # result = tabular_mode.evaluate(test)

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)

    BS = 2048
    MAX_EPOCH = 20
    # skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)

    data['oof_preds'] = np.nan

    for fold_nb, (train_index, valid_index) in enumerate(cv.split(X, y)):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        tabnet_params = dict(
            n_d=N_D,
            n_a=N_A,
            n_steps=N_STEPS,
            gamma=GAMMA,
            n_independent=N_INDEP,
            n_shared=N_SHARED,
            lambda_sparse=1e-5,
            seed=0,
            clip_value=2,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dims,
            mask_type=MASK_TYPE,
            device_name='auto',
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-2, weight_decay=1e-5),
            scheduler_params=dict(
                max_lr=5e-2,
                steps_per_epoch=int(X_train.shape[0] / BS),
                epochs=MAX_EPOCH,
                # final_div_factor=100,
                is_batch_level=True),
            scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
            #                               scheduler_params=dict(mode='max',
            #                                                     factor=0.5,
            #                                                     patience=5,
            #                                                     is_batch_level=False,),
            #                               scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=1)
        # Defining TabNet model
        model = TabNetClassifier(**tabnet_params)

        model.fit(
            X_train=X_train,
            y_train=y_train,
            from_unsupervised=pretrainer if PRETRAIN else None,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=["train", "valid"],
            eval_metric=["auc"],
            batch_size=BS,
            virtual_batch_size=256,
            max_epochs=MAX_EPOCH,
            drop_last=True,
            pin_memory=True,
            patience=10,
        )

        val_preds = model.predict_proba(X_valid)[:, -1]
        print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))

        # positional assignment avoids pandas chained-indexing pitfalls
        data.iloc[valid_index, data.columns.get_loc('oof_preds')] = val_preds

        test_preds = model.predict_proba(X_test)[:, -1]
        df_sub[f"fold_{fold_nb+1}"] = test_preds

    df_sub["target"] = df_sub.filter(like="fold_").mean(axis=1).values

    df_sub.to_csv("Analysis/submission_5_tabnet.csv", index=False)

    df_sub = pd.read_csv("Analysis/submission_5_tabnet.csv")

    # df_sub.target = df_sub.target.map(lambda x: 0 if x<=0.5 else 1)
    df_sub[["id", "target"]].to_csv("Analysis/submission_5_2_tabnet.csv",
                                    index=False)
Example 4
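# Notebook fragment: X_all, y_all, d_train, d_test, vars_cat, the X_train /
# y_train split and `from sklearn import metrics` are assumed from earlier
# cells. The eval_name 'test_EVIL' deliberately flags that the test set is
# being watched during training.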
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]


cat_idxs = [ i for i, col in enumerate(X_all.columns) if col in vars_cat]
cat_dims = [ len(np.unique(X_all.iloc[:,i].values)) for i in cat_idxs]
cat_emb_dim = np.maximum(1, np.floor(np.log(cat_dims))).astype(int)  # floor(log) can hit 0 for small cardinalities


md = TabNetClassifier(cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=cat_emb_dim,
                       ## optimizer_fn=torch.optim.Adam,
                       ## optimizer_params=dict(lr=2e-2),
                       ## mask_type='sparsemax',
                       n_steps=1,
)

# %%time  (Jupyter cell magic in the original notebook)
md.fit( X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'test_EVIL'],
    eval_metric=['auc'],
    max_epochs=100, patience=100,
    ## batch_size=1024, virtual_batch_size=128,
    ## weights=0,
)

y_pred = md.predict_proba(X_test)[:,1]
print(metrics.roc_auc_score(y_test, y_pred))

Example 5
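# Fragment of a cross-validation loop: clf, the fold split (X_trn/y_trn,
# X_vld/y_vld, vld_index), X_tst, oof, scores, importances, feat_cols and
# loaded_pretrain are defined in code cut off above.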
        clf.fit(
            X_train=X_trn,
            y_train=y_trn,
            eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
            eval_name=["train", "valid"],
            eval_metric=["logloss"],
            max_epochs=max_epochs,
            patience=50,
            batch_size=128,
            virtual_batch_size=128,
            num_workers=0,
            weights=1,
            drop_last=False,
            from_unsupervised=loaded_pretrain,
        )

        fold_preds = clf.predict_proba(X_vld).astype(np.float64)[:, 1]
        _test_preds.append(clf.predict_proba(X_tst)[:, 1])
        oof[vld_index] = fold_preds
        scores.append(log_loss(y_vld, fold_preds))
        importances = pd.concat(
            [
                importances,
                pd.DataFrame({"feature": feat_cols, "importance": clf.feature_importances_}),
            ],
            axis=0,
        )
    oof_preds.append(oof)
    test_preds.append(np.mean(_test_preds, axis=0))


fig, ax = plt.subplots(figsize=(6, 18))
Example 6
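# Same cross-validation fit pattern as the previous example, with a shorter
# early-stopping patience; the surrounding loop and variables were cut off above.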
    clf.fit(
        X_train=X_trn,
        y_train=y_trn,
        eval_set=[(X_trn, y_trn), (X_vld, y_vld)],
        eval_name=["train", "valid"],
        eval_metric=["logloss"],
        max_epochs=max_epochs,
        patience=25,
        batch_size=128,
        virtual_batch_size=128,
        num_workers=0,
        weights=1,
        drop_last=False,
        from_unsupervised=loaded_pretrain,
    )

    fold_preds = clf.predict_proba(X_vld).astype(np.float64)
    preds[vld_index] = fold_preds[:, 1]
    scores.append(log_loss(y_vld, fold_preds[:, 1]))
    importances = pd.concat(
        [
            importances,
            pd.DataFrame({
                "feature": feat_cols,
                "importance": clf.feature_importances_
            })
        ],
        axis=0,
    )

fig, ax = plt.subplots(figsize=(6, 18))
sns.barplot(
Example 7
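# Inference-only snippet: preprocess, preprocess_for_tabnet, competition_scorer
# and requests_test come from the surrounding project.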
# separate the categorical variables from the date variables so each group can be preprocessed on its own
categorical_val.remove('answer_creation_date')
categorical_val.remove('group_creation_date')
categorical_val.remove('request_creation_date')
categorical_val.remove('victim_of_violence_type')
date_columns = [
    'answer_creation_date', 'group_creation_date', 'request_creation_date'
]
X_test, y_test = preprocess(requests_test, categorical_val, date_columns)

# Drop rows with NaN values, which otherwise cause memory errors downstream
X_test['granted_number_of_nights'] = y_test
X_test = X_test.dropna()
y_test = X_test['granted_number_of_nights']
X_test = X_test.drop(columns=['granted_number_of_nights'])
#preprocess the datasets for TabNet
X_test_tab, y_test_tab = preprocess_for_tabnet(X_test, y_test)
# retrieve the saved model (not working with versions < 3.7)

PATH = '../model_zoo/TabNet_model.zip'
clf = TabNetClassifier()
clf.load_model(PATH)

#run inference
start = time.time()
preds = clf.predict_proba(X_test_tab)
end = time.time()
score = competition_scorer(y_test_tab, preds)
print('time per prediction:', (end - start) / len(X_test))
print('The competition score on test data', score)
Example 8
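# TabNetBase, _TPI and self.env.match_loader come from the surrounding
# project; only the TabNet-specific pieces are shown here.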
class TabNetCv1(TabNetBase):
    def act_init_ai(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            self.ai = TabNetClassifier(n_steps=10,
                                       input_dim=MATCH.count_cols *
                                       MATCH.count_players,
                                       cat_dims=self.cat_dims,
                                       cat_emb_dim=self.cat_emb_dim,
                                       cat_idxs=self.cat_idxs,
                                       device_name='cuda')

    def act_modify_data(self, is_test=False):
        if is_test is True:
            _TPI(self, locals())
        else:
            MATCH = self.env.match_loader
            # map each score difference onto a class label
            def to_label(diff):
                # 0: minus side ahead, 1: draw, 2: plus side ahead
                if diff < 0:
                    return 0
                if diff == 0:
                    return 1
                return 2

            for i in range(len(self.y_train)):
                self.y_train[i] = to_label(MATCH.train_plus[i] - MATCH.train_minus[i])

            for i in range(len(self.y_valid)):
                self.y_valid[i] = to_label(MATCH.valid_plus[i] - MATCH.valid_minus[i])

            # the original looped over len(self.y_valid) here, mislabelling the test targets
            for i in range(len(self.y_test)):
                self.y_test[i] = to_label(MATCH.test_plus[i] - MATCH.test_minus[i])

    def load_model(self):
        print("Loading Model")
        print(self.save_name + ".zip")
        self.ai.load_model(self.save_name + ".zip")

    def get_result(self, my_team: list, your_team: list):
        # tmp
        '''
        MATCH = self.env.match_loader
        self.act_register_data(data=MATCH.act_get(is_flat=True))
        self.act_modify_data()
        self.ai = TabNetClassifier(
            n_steps=10,
            input_dim=MATCH.count_cols * MATCH.count_players,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            cat_idxs=self.cat_idxs,
            device_name='cuda'
        )

        self.ai.fit(
            X_train=self.X_train, y_train=self.y_train,
            X_valid=self.X_valid, y_valid=self.y_valid,
            max_epochs=self.epochs,
            patience=500,
            batch_size=512,
            drop_last=False
        )
        '''
        try:
            x = list()
            for player in my_team:
                x.append(0)
                x.extend(player)

            for player in your_team:
                x.append(1)
                x.extend(player)

            x = np.array(x)
            x = np.array([x])
            print(x.shape)

            # predictions = self.ai.predict(x)
            probability = self.ai.predict_proba(x)
            print(probability)

            return probability[0][2]

        except Exception as e:
            print(e)
Example 9
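# Fragment of a loop benchmarking TabNet over several datasets: the
# TabNetClassifier constructor is cut off above, and clf, the fold data
# (X_tr/y_tr, X_val/y_val), X_test/y_test, epoch, test_pred, data_name and
# res come from the missing context. The X_valid/y_valid keywords in fit()
# are the pre-2.0 pytorch-tabnet API.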
                    "gamma": 0.9
                },
                scheduler_fn=torch.optim.lr_scheduler.StepLR,
                mask_type='entmax'  # "sparsemax"
            )

            clf.fit(X_train=X_tr,
                    y_train=y_tr,
                    X_valid=X_val,
                    y_valid=y_val,
                    max_epochs=epoch,
                    patience=epoch,
                    batch_size=1024,
                    virtual_batch_size=128,
                    num_workers=0,
                    weights=1,
                    drop_last=False)

            test_pred += clf.predict_proba(X_test)[:, 1]

        test_pred /= 5  # average the probabilities over the 5 folds

        test_auc = roc_auc_score(y_test, test_pred)

        print('test auc:', test_auc)
        res[data_name] = test_auc

    print(res)
    with open("tabnet_result.pickle", "wb") as f:
        pickle.dump(res, f)