Ejemplo n.º 1
0
def build_tabnet():
    """Train (or load a cached) TabNet classifier on TF-IDF text features.

    Works on a copy of the module-level ``current_dataset``: the copy is
    passed to ``cleaning_text`` (presumably this adds the ``clean_content``
    column in place -- confirm against its definition), then
    ``clean_content`` / ``emotion`` are split into stratified train and test
    partitions (33 % test, ``random_state=1``). Texts are vectorised with
    ``Tokenizer.texts_to_matrix(mode='tfidf')`` over a 1000-word vocabulary.

    The tokenizer is fitted on the training texts only and then applied to
    both partitions, so train and test share one vocabulary / IDF table and
    nothing from the test set influences the features.

    If ``LOAD_MODEL`` is truthy and the cache file
    ``tabnet_model_<current_dataset_name>`` exists, the pickled model is
    loaded instead of training; otherwise a new ``TabNetClassifier`` is
    trained and pickled to that file.

    Returns:
        tuple: ``(model, y_test, y_pred, test_acc)`` -- the classifier, the
        true test labels, the predicted test labels and the test accuracy.
    """
    model_file_name = 'tabnet_model_{}'.format(current_dataset_name)
    model_path = pathlib.Path(model_file_name)

    df = current_dataset.copy()
    cleaning_text(df)

    X = df['clean_content']
    y = df['emotion']

    # Split the raw texts first so the vectoriser never sees the test set.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        stratify=y,
                                                        random_state=1)

    # Fit the vocabulary and IDF statistics on the training texts only, then
    # reuse the same fitted tokenizer for both partitions. Fitting on the
    # test texts as well (or transforming the two partitions at different
    # fit stages) makes the feature columns inconsistent between them.
    tok = Tokenizer(num_words=1000, oov_token='<UNK>')
    tok.fit_on_texts(X_train)
    X_train = tok.texts_to_matrix(X_train, mode='tfidf')
    X_test = tok.texts_to_matrix(X_test, mode='tfidf')

    # Build the model: reuse the cached one when allowed, otherwise train.
    if LOAD_MODEL and model_path.exists():
        # NOTE(security): pickle executes arbitrary code on load; only use
        # cache files that this program wrote itself.
        with open(model_file_name, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        model = TabNetClassifier()
        # NOTE(review): the test partition doubles as the 'valid' eval set,
        # so the reported test accuracy is not from fully held-out data.
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_name=['train', 'valid'],
                  eval_metric=['accuracy', 'balanced_accuracy', 'logloss'])
        # Persist only freshly trained models; a loaded model is already on
        # disk under this exact name.
        with open(model_file_name, 'wb') as model_file:
            pickle.dump(model, model_file)

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    return model, y_test, y_pred, test_acc
Ejemplo n.º 2
0
    def fit(self, x_train, y_train, kf_splits=5, tabnet_type=None):
        """Train one pretrained TabNet model per K-fold split and score it.

        For every fold a ``TabNetPretrainer`` is fitted on the fold's
        training rows, then a ``TabNetClassifier`` or ``TabNetRegressor``
        (selected from the preset's evaluation metric) is fitted starting
        from that pretrainer. The fitted supervised models are stored, in
        fold order, in ``self.tabnet_models``.

        Args:
            x_train: Feature matrix. It is read through ``.shape[0]`` and
                indexed with the integer index arrays produced by
                ``KFold.split`` (presumably a NumPy array -- confirm
                against callers).
            y_train: Targets, indexed the same way as ``x_train``.
            kf_splits: Number of folds passed to ``KFold`` (no shuffling).
            tabnet_type: Preset name. ``None`` selects the default preset
                (OneCycleLR schedule, evaluated with accuracy, hence a
                classifier); ``'TabNet-S'`` selects the small preset
                (StepLR schedule, evaluated with MSE, hence a regressor).

        Returns:
            tuple: ``(scores, feature_importances)``. ``scores`` is a list
            with one validation score per fold (accuracy for
            classification, mean squared error for regression);
            ``feature_importances`` stacks each fold model's
            ``feature_importances_`` with one row per fold.

        Raises:
            ValueError: If ``tabnet_type`` is not a known preset, or a
                preset's ``eval_metric`` cannot be mapped to a problem type.
        """
        classification_metrics = ('auc', 'accuracy', 'balanced_accuracy',
                                  'logloss')
        regression_metrics = ('mse', 'mae', 'rmse', 'rmsle')

        def _get_tabnet_params(tabnet_type):
            """Return (constructor kwargs, fit kwargs, problem) for a preset."""
            if tabnet_type is None:
                tabnet_params = dict(
                    verbose=40,
                    optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=1e-2, weight_decay=1e-5),
                    # NOTE(review): steps_per_epoch is derived from the full
                    # x_train row count and a hard-coded 128, not from the
                    # fold size or batch_size below -- confirm intended.
                    scheduler_params=dict(max_lr=0.05,
                                          steps_per_epoch=x_train.shape[0] //
                                          128,
                                          epochs=300),
                    scheduler_fn=torch.optim.lr_scheduler.OneCycleLR)
                fit_params = dict(batch_size=1024,
                                  virtual_batch_size=128,
                                  eval_metric='accuracy')
            elif tabnet_type == 'TabNet-S':
                tabnet_params = dict(
                    n_d=8,
                    n_a=8,
                    lambda_sparse=0.0001,
                    momentum=0.1,
                    n_steps=3,
                    gamma=1.2,
                    verbose=40,
                    optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=0.01),
                    scheduler_params=dict(step_size=8000, gamma=0.05),
                    scheduler_fn=torch.optim.lr_scheduler.StepLR)
                fit_params = dict(batch_size=4096,
                                  virtual_batch_size=256,
                                  eval_metric='mse')
            else:
                # Raise instead of print + quit() so callers can handle it
                # and the interpreter is not torn down from library code.
                raise ValueError(
                    '[ERROR] Unknown tabnet_type: {}'.format(tabnet_type))

            # The evaluation metric decides which kind of model is trained.
            metric = fit_params['eval_metric']
            if metric in classification_metrics:
                problem = 'classification'
            elif metric in regression_metrics:
                problem = 'regression'
            else:
                raise ValueError(
                    '[ERROR] Unknown eval_metric: {}'.format(metric))

            return tabnet_params, fit_params, problem

        # Resolve the preset first so an invalid tabnet_type fails before any
        # state on self is reset.
        tabnet_params, fit_params, problem = _get_tabnet_params(tabnet_type)

        if problem == 'classification':
            model_cls, score_fn = TabNetClassifier, accuracy_score
        else:
            model_cls, score_fn = TabNetRegressor, mean_squared_error

        kf = KFold(n_splits=kf_splits, shuffle=False)
        scores = []
        fold_importances = []
        self.tabnet_models = []

        for train_index, val_index in kf.split(x_train, y_train):
            # The pretrainer and the supervised model are built from the
            # same constructor kwargs.
            unsupervised_model = TabNetPretrainer(**tabnet_params)
            tabnet_model = model_cls(**tabnet_params)

            x_tr = x_train[train_index]
            x_val = x_train[val_index]
            y_tr = y_train[train_index]
            y_val = y_train[val_index]

            # Self-supervised pretraining on the fold's training rows.
            unsupervised_model.fit(x_tr,
                                   eval_set=[x_val],
                                   patience=300,
                                   max_epochs=5000,
                                   pretraining_ratio=0.8)

            # Supervised training, started from the pretrained model.
            tabnet_model.fit(
                x_tr,
                y_tr,
                eval_set=[(x_val, y_val)],
                eval_metric=[fit_params['eval_metric']],
                batch_size=fit_params['batch_size'],
                virtual_batch_size=fit_params['virtual_batch_size'],
                patience=300,
                max_epochs=5000,
                from_unsupervised=unsupervised_model)

            self.tabnet_models.append(tabnet_model)
            prediction = tabnet_model.predict(x_val)
            scores.append(score_fn(y_val, prediction))
            fold_importances.append(tabnet_model.feature_importances_)

        # One row of feature importances per fold.
        feature_importances = np.vstack(fold_importances)

        print(scores)
        print(np.mean(scores))

        return scores, feature_importances