Beispiel #1
0
def test_clone():
    """Check that an estimator rebuilt from ``get_params()`` reports equal params.

    ``sklearn.base.clone`` uses ``get_params`` to clone an estimator, so the
    constructor must accept everything ``get_params`` returns and hand the
    same values back.
    """
    original = CatBoostClassifier(
        custom_metric="Accuracy",
        loss_function="MultiClass",
        iterations=400,
    )

    original_params = original.get_params()
    rebuilt_params = CatBoostClassifier(**original_params).get_params()

    for name in original_params:
        assert name in rebuilt_params
        assert rebuilt_params[name] == original_params[name]
Beispiel #2
0
def test_clone():
    """Check that an estimator rebuilt from ``get_params()`` reports equal params.

    ``sklearn.base.clone`` uses ``get_params`` to clone an estimator, so the
    constructor must accept everything ``get_params`` returns and hand the
    same values back.
    """
    estimator = CatBoostClassifier(
        custom_metric="Accuracy",
        loss_function="MultiClass",
        iterations=400)

    params = estimator.get_params()
    new_estimator = CatBoostClassifier(**params)
    new_params = new_estimator.get_params()

    for param in params:
        assert param in new_params
        # Compare by value: object identity (`is`) for ints/strings is an
        # interpreter detail and breaks as soon as get_params() copies values.
        assert new_params[param] == params[param]
Beispiel #3
0
    def FineTune_hyperopt(self, X, y, mute=False):
        """Run a short hyperopt TPE search and cross-validate the best point.

        Args:
            X: Features, forwarded to ``self.dataset``.
            y: Targets, forwarded to ``self.dataset``.
            mute: When true, nothing is printed.

        Returns:
            The ``best`` mapping returned by ``hyperopt.fmin``.

        Note:
            ``self.params`` is updated in place with the best values found.
        """
        self.dataset(X, y)

        search_space = {
            'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature': hyperopt.hp.uniform("bagging_temperature",
                                                       0, 0.3),
        }
        history = hyperopt.Trials()
        best = hyperopt.fmin(
            self.hyperopt_objective,
            space=search_space,
            algo=hyperopt.tpe.suggest,
            max_evals=2,
            trials=history,
            rstate=RandomState(self.random_state),
        )
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")

        # The instance-level parameter dict itself receives the tuned values.
        self.params.update(best)

        tuned_model = CatBoostClassifier(**self.params)
        cv_table = catboost.cv(self.all_train_data, tuned_model.get_params())

        if not mute:
            top_accuracy = np.max(cv_table['test-Accuracy-mean'])
            print('\nPrecise validation accuracy score: {}'.format(
                top_accuracy))
        return best
 def hyperopt_objective(params):
     """Hyperopt objective: cross-validated log-loss of one parameter sample.

     Args:
         params: Mapping providing ``n_estimators``, ``depth``,
             ``border_count``, ``learning_rate``, ``l2_leaf_reg``,
             ``bagging_temperature`` and ``rsm``.

     Returns:
         The lowest ``test-Logloss-mean`` reached over the boosting
         iterations of a 4-fold CV on the module-level
         ``train_set`` / ``train_label``.
     """
     model = CatBoostClassifier(
         n_estimators=params["n_estimators"],
         verbose=2,
         eval_metric='AUC',
         od_pval=0.000001,
         depth=params['depth'],
         border_count=params['border_count'],
         learning_rate=params["learning_rate"],
         l2_leaf_reg=params['l2_leaf_reg'],
         bagging_temperature=params['bagging_temperature'],
         rsm=params['rsm'])
     cv_data = cv(Pool(train_set, train_label),
                  model.get_params(),
                  nfold=4,
                  verbose_eval=True)
     # Log-loss is "lower is better": the best iteration is the minimum of the
     # curve. Taking the maximum would score each sample by its worst
     # iteration.
     logloss = np.min(cv_data['test-Logloss-mean'])
     print(logloss)
     return logloss  # as hyperopt minimises
Beispiel #5
0
class CatboostPredictor(PredictionModel):
    """``PredictionModel`` implementation backed by a ``CatBoostClassifier``."""

    def __init__(self, params):
        """Build the classifier from the keyword mapping ``params``."""
        self.model = CatBoostClassifier(**params)

    def fitModel(self, X_train, y_train):
        """Fit the classifier, then cross-validate its parameters.

        Columns 381..383 are passed as categorical features for both steps.

        Returns:
            The result of ``cv`` on a pool built from the training data.
        """
        fit_cat_columns = np.arange(381, 384)
        self.model.fit(X_train,
                       y_train,
                       verbose=True,
                       cat_features=fit_cat_columns)

        cv_cat_columns = np.arange(381, 384)
        training_pool = Pool(X_train, y_train, cat_features=cv_cat_columns)
        return cv(training_pool, self.model.get_params(), verbose=True)
def hyperopt_objective(params):
    """Hyperopt objective: ``1 - best cross-validated accuracy``.

    Args:
        params: Mapping providing ``l2_leaf_reg`` and ``learning_rate``.

    Returns:
        One minus the maximum of ``test-Accuracy-mean`` from ``cv`` on the
        module-level ``X`` / ``y`` (hyperopt minimises).
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        logging_level='Silent',
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_table = cv(full_pool, candidate.get_params())

    return 1 - np.max(cv_table['test-Accuracy-mean'])
def test_serialization_final_fallback(ray_start_regular):
    """Round-trip a CatBoost model through ``ray.put`` / ``ray.get``.

    Skipped unless ``catboost`` is installed. The reconstructed model must
    report the same parameter items as the original.
    """
    pytest.importorskip("catboost")
    from catboost import CatBoostClassifier

    model_kwargs = dict(
        iterations=2,
        depth=2,
        learning_rate=1,
        loss_function="Logloss",
        logging_level="Verbose",
    )
    model = CatBoostClassifier(**model_kwargs)

    object_ref = ray.put(model)
    reconstructed_model = ray.get(object_ref)

    expected_items = set(model.get_params().items())
    actual_items = set(reconstructed_model.get_params().items())
    assert expected_items == actual_items
Beispiel #8
0
    def hyperopt_objective(self, params):
        """Hyperopt objective: ``1 - best cross-validated AUC``.

        Args:
            params: Mapping providing ``l2_leaf_reg``, ``learning_rate`` and
                ``bagging_temperature``.

        Returns:
            One minus the maximum of ``test-AUC-mean`` from ``catboost.cv``
            on ``self.all_train_data``.
        """
        candidate = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss',
        )
        cv_table = catboost.cv(self.all_train_data, candidate.get_params())
        top_auc = np.max(cv_table['test-AUC-mean'])
        return 1 - top_auc
Beispiel #9
0
def hyperopt_objective(params):
    """Hyperopt objective: ``1 - best cross-validated accuracy``.

    Args:
        params: Mapping providing ``l2_leaf_reg`` and ``max_depth``.

    Returns:
        One minus the maximum of ``test-Accuracy-mean`` from ``cv`` on the
        module-level ``X`` / ``y`` (hyperopt minimises).
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        max_depth=int(params['max_depth']),
        iterations=150,
        eval_metric='Accuracy',
        random_seed=164530,
        logging_level='Silent',
        od_type='IncToDec',
        od_wait=20,
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_table = cv(full_pool, candidate.get_params())

    return 1 - np.max(cv_table['test-Accuracy-mean'])
    def objective(space):
        """Hyperopt objective evaluating one point of the search space.

        Depending on the enclosing ``how`` value the score comes from
        ``cv`` ("cv"), a ``CatBoostClassifier`` ("sklearn") or a plain
        ``CatBoost`` model ("native"); the latter two are scored with
        ROC AUC on ``pools.eval``.

        Args:
            space: Mapping of sampled parameters; it is merged over the base
                parameters (eval metric, seed, silent logging).

        Returns:
            ``{"loss": 1.0 - score, "status": STATUS_OK}``.

        Raises:
            co.TennisAbortError: On every fifth trial if ``is_quit_pressed()``
                reports a quit request.
            Exception: If ``how`` is not one of the supported modes.
        """
        global best_score, trials_count
        trials_count += 1
        if (trials_count % 5) == 0 and is_quit_pressed():
            raise co.TennisAbortError
        args_dct = dict(**space)
        params = {
            "eval_metric": metric_name,
            "random_seed": random_state,
            "logging_level": "Silent",
        }
        params.update(args_dct)
        if how == "cv":
            cv_data = cv(pools.train, params, stratified=True)
            scr_val = np.max(cv_data[f"test-{metric_name}-mean"])
        elif how == "sklearn":
            mdl = CatBoostClassifier(**params)
            mdl.fit(pools.train)
            pred = mdl.predict_proba(pools.eval)[:, 1]
            # Read labels the same way as the "native" branch below, which
            # uses get_label() on this very object.
            scr_val = roc_auc_score(pools.eval.get_label(), pred)
        elif how == "native":
            mdl = CatBoost(params)
            mdl.fit(
                pools.train,
                eval_set=None,
                silent=True,
            )
            pred = mdl.predict(pools.eval, prediction_type="Probability")[:, 1]
            scr_val = roc_auc_score(pools.eval.get_label(), pred)
        else:
            raise Exception("bad how arg {}".format(how))

        if scr_val > best_score:
            if how == "cv":
                # No fitted model exists in cv mode, so only params are logged.
                cco.out("achieved best {} at {}".format(scr_val, params))
            else:
                cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                    scr_val, mdl.get_params(), mdl.learning_rate_,
                    mdl.tree_count_))
            best_score = scr_val
        return {"loss": 1.0 - scr_val, "status": STATUS_OK}
Beispiel #11
0
def hyperopt_objective(params):
    """Hyperopt objective: ``1 - best cross-validated F1``.

    Args:
        params: Mapping providing ``l2_leaf_reg`` and ``learning_rate``.

    Returns:
        One minus the maximum of ``test-F1-mean`` from ``cv`` on the
        module-level ``data`` / ``data_label``.
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=1000,
        eval_metric='F1',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )

    full_pool = Pool(data, data_label,
                     cat_features=categorical_features_indices)
    cv_table = cv(full_pool, candidate.get_params())

    return 1 - np.max(cv_table['test-F1-mean'])
Beispiel #12
0
def modelBuilding(X, y, cat_features):
    """Train a GPU CatBoost classifier on an 80/20 split and return it.

    Args:
        X: Feature table.
        y: Targets.
        cat_features: Categorical feature spec forwarded to ``fit``.

    Returns:
        The fitted ``CatBoostClassifier``; its fitted flag and parameters are
        printed before returning.
    """
    split = train_test_split(X, y, train_size=0.8, random_state=1234)
    X_train, X_validation, y_train, y_validation = split

    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.01,
        task_type="GPU",
    )
    holdout = (X_validation, y_validation)
    model.fit(X_train,
              y_train,
              cat_features=cat_features,
              eval_set=holdout,
              verbose=True)

    fitted_flag = str(model.is_fitted())
    print('Model is fitted: ' + fitted_flag)
    print('Model params:')
    print(model.get_params())
    return model
Beispiel #13
0
    def train_all_save_catboost(self, X, y, categorical_features_indices):
        """Cross-validate, train on the whole data set and save the model.

        The saved file (``catboost_model.dump``) is meant to be loaded later
        for new predictions.

        Args:
            X: Feature table.
            y: Targets.
            categorical_features_indices: Categorical feature spec used for
                both the CV pool and the final fit.
        """
        eval_metric = 'TotalF1'
        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric=eval_metric,
                                   random_seed=42,
                                   leaf_estimation_method='Newton')
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     model.get_params())
        # Report the best mean test score of the eval metric. Reducing over
        # the whole CV table would mix in the iteration counter, the train
        # columns and the std columns.
        # NOTE(review): assumes cv() names the column 'test-<metric>-mean',
        # as the other cv() consumers in this code base do - confirm.
        best_cv_score = np.max(cv_data['test-{}-mean'.format(eval_metric)])
        print("precise validation accuracy score:{}".format(best_cv_score))
        model.fit(X, y, cat_features=categorical_features_indices)

        # Feature importance of the final model.
        print(model.get_feature_importance(prettified=True))

        model.save_model('catboost_model.dump')
        print("Catboost model has been saved!")
def hyperopt_objective(params):
    """Hyperopt objective: ``1 - best cross-validated accuracy``.

    Args:
        params: Mapping providing ``l2_leaf_reg`` and ``depth``.

    Returns:
        One minus the maximum of ``test-Accuracy-mean`` from ``cv`` on the
        module-level ``train_pool``. The sample and its accuracy are printed.
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        depth=params['depth'],
        iterations=500,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=40,
        random_seed=42,
        logging_level='Silent',
        allow_writing_files=False,
    )

    cv_table = cv(train_pool, candidate.get_params())
    top_accuracy = np.max(cv_table['test-Accuracy-mean'])

    print(params, top_accuracy)
    # hyperopt minimises, so turn the accuracy into a loss.
    return 1 - top_accuracy
def train_cat(model=False):
    """Build a CatBoost classifier from the grid-search result and train it.

    Args:
        model: When true, return the configured (untrained) classifier
            without logging or training.

    Returns:
        The classifier itself when ``model`` is true, otherwise the result of
        ``train(clf)``. In the latter case a summary of the chosen parameters
        is appended to the module-level ``log`` string.
    """
    global log

    params = grid_search_cat(True)

    clf = CatBoostClassifier().set_params(**params)

    if model:
        return clf

    params = clf.get_params()
    log += 'cat'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', iterations: %d' % params['iterations']
    log += ', depth: %d' % params['depth']
    # %g prints integers unchanged but keeps a fractional regulariser intact.
    log += ', l2_leaf_reg: %g' % params['l2_leaf_reg']
    log += ', border_count: %d' % params['border_count']
    # subsample is a fraction; %d would truncate it to 0.
    log += ', subsample: %.3f' % params['subsample']
    log += ', one_hot_max_size: %d' % params['one_hot_max_size']
    log += '\n\n'

    return train(clf)
Beispiel #16
0
# Hold-out pool used as eval_set by both fits below.
validation_pool = Pool(data=X_validation,
                       label=y_validation,
                       cat_features=cat_features)

#######################
#   BETTER/BEST MODEL
#######################

# First model: only 5 iterations with an explicit learning rate.
# Note: You can tinker with learning rates.
model = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
)
model.fit(train_pool, eval_set=validation_pool, verbose=False)

# Print model info
print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n{}'.format(model.get_params()))

# Choose the best iteration: `model` is rebound to a fresh 100-iteration
# classifier and refitted against the same validation pool.
# Note: There is a parameter: use_best_model ( = True or False)
model = CatBoostClassifier(iterations=100, )

model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
)

# Number of trees kept in the fitted model.
print('Tree count: ' + str(model.tree_count_))
Beispiel #17
0
class modelCatBoost(object):
    """Convenience wrapper around a ``CatBoostClassifier``.

    Bundles pool construction, fitting, cross-validation, persistence,
    feature-importance plotting and two hyper-parameter search helpers.
    Attributes not found on the wrapper are looked up on the wrapped
    classifier (see ``__getattr__``).
    """

    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        """Load the "CatBoost" parameter set and build the classifier.

        Args:
            name: Label used to derive the training directory
                (``"model_<name>/"``).
            random_state: Seed stored on the instance and injected into the
                classifier parameters.
            *args, **kwargs: Accepted but not used.
        """
        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state

        # Parameters come from the module-level `param_file`.
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })

        self.model = CatBoostClassifier(**self.params)

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):
        """Store the data and build the train, eval and full pools.

        Args:
            X: Feature table (its ``.values`` are used for the pools).
            y: Targets; their distinct values are replaced by 0..k-1.
            categorical_columns_indices: Categorical feature spec for the
                pools; overwrites any previously stored value.
            test_size: Hold-out fraction for the train/eval split.
            *args, **kwargs: Accepted but not used.
        """
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):
        """Replace each distinct target value by its first-seen position.

        Returns:
            ``(encoded_targets, original_values)`` where
            ``original_values[i]`` is the value that was replaced by ``i``.
        """
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self,
            X,
            y,
            use_best_model=True,
            plot=True,
            save_snapshot=False,
            verbose=0,
            *args,
            **kwargs):
        """Rebuild the pools from ``X`` / ``y`` and fit the wrapped model.

        Args:
            X, y: Training data, passed to ``dataset`` (which resets the
                stored categorical indices to its default).
            use_best_model, plot, save_snapshot: Forwarded to the model's
                ``fit``.
            verbose: A truthy value makes the model's ``fit`` run with
                ``verbose=0``; otherwise the ``verbose`` value configured on
                the model (or ``None`` when it has none) is used.
            *args: Accepted but not used.
            **kwargs: Forwarded to the model's ``fit``.

        Returns:
            Whatever the wrapped model's ``fit`` returns.
        """
        self.dataset(X, y)
        _params = self.model.get_params()

        if verbose:
            _verbose = 0
        else:
            # .get(): the model may have been built without a "verbose" param.
            _verbose = _params.get("verbose")

        # Statements that used to follow this return were unreachable and
        # read attributes (dvalid / dtrain) this class never sets.
        return self.model.fit(self.train_data,
                              verbose=_verbose,
                              eval_set=self.eval_data,
                              use_best_model=use_best_model,
                              plot=plot,
                              save_snapshot=save_snapshot,
                              **kwargs)

    def fit_cv(self,
               X,
               y,
               fold_count=4,
               shuffle=True,
               stratified=True,
               plot=True,
               verbose=100):
        """Cross-validate the model parameters on the full data set.

        Args:
            X, y: Data, passed to ``dataset``.
            fold_count, shuffle, stratified, plot, verbose: Forwarded to
                ``catboost.cv``.

        Returns:
            The score table returned by ``catboost.cv``. Unless ``verbose``
            is 0 the best ``test-Accuracy-mean`` is also printed.
        """
        self.dataset(X, y)

        _params = self.model.get_params()
        _params['verbose'] = verbose

        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              # Honour the caller's choice.
                              stratified=stratified,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            _accuracy = _scores['test-Accuracy-mean']
            _best_step = np.argmax(_accuracy)
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.
                  format(np.max(_accuracy),
                         _scores['test-Accuracy-std'][_best_step],
                         _best_step))

        return _scores

    def copy(self, *args, **kwargs):
        """Return a new ``CatBoostClassifier`` carrying a copy of the model.

        NOTE(review): the result is a bare classifier with
        ``catboost_classifier`` and ``columns`` attached as attributes, not a
        ``modelCatBoost`` - confirm that callers expect this.
        """
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        """Set each keyword as an attribute on the wrapped model."""
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):
        """Save the model as ``<direct>/<name>_<timestamp>.dump``.

        The directory is created when missing.

        Raises:
            NameError: If the directory cannot be created (the original
                ``OSError`` is kept as the cause).
        """
        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio") from e
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):
        """Load ``<direct>/<name>.dump`` into the wrapped model.

        A missing directory is only reported; the load is still attempted.
        """
        if not os.path.isdir(direct):
            print("no existe el drectorio especificado")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Modelo cargado de la ruta: " + filename)

    def predict(self, X, *args, **kwargs):
        """Predict on the training columns of ``X``, in training order."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        """Like ``predict`` but calls the model's ``predict_proba``."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):
        """Store categorical indices and rebuild the three pools with them.

        Requires ``dataset`` to have been called, since the stored splits
        are reused.
        """
        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)

        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):
        """Translate column names into positional indices of ``self.X``.

        Raises:
            NameError: If ``features`` is empty.
        """
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True):
        """Return the prettified feature-importance table of the model.

        Args:
            display: When true, also draw a bar plot of the importances.
        """
        # Computed once; the result feeds both the plot and the return value.
        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)

        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')

        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):
        """Start a ``MetricVisualizer`` over a set of training directories.

        Args:
            directs: Optional iterable of extra directories.
            visu_model: When true, this model's own ``train_dir`` is listed
                first.

        Raises:
            NameError: If there is nothing to show (no extra directories and
                ``visu_model`` is false).
        """
        # None (the default) and empty iterables both mean "no extras".
        _extra = list(directs) if directs else []
        if not _extra and not visu_model:
            raise NameError("No se ha seleccionado ningun directorio")

        directorios = [self.train_dir] if visu_model else []
        directorios.extend(_extra)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):
        """Hyperopt objective: ``1 - best cross-validated AUC``.

        Args:
            params: Mapping providing ``l2_leaf_reg``, ``learning_rate`` and
                ``bagging_temperature``.
        """
        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_auc = np.max(_cv_data['test-AUC-mean'])

        return 1 - best_auc

    def FineTune_hyperopt(self, X, y, mute=False):
        """Run a short hyperopt TPE search and cross-validate the best point.

        Args:
            X, y: Data, passed to ``dataset``.
            mute: When true, nothing is printed.

        Returns:
            The ``best`` mapping returned by ``hyperopt.fmin``.

        Note:
            ``self.params`` is updated in place with the best values found.
        """
        self.dataset(X, y)

        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")

        # Same dict object as self.params: the update is visible on the
        # instance afterwards.
        _parameters = self.params
        _parameters.update(best)

        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())

        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """Randomized search over CatBoost parameters with scikit-learn.

        Based on
        https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning

        Args:
            X, y: Data, passed to ``dataset``.
            mute: When true, progress and result printing is suppressed.
            n_splits: Folds of the repeated stratified K-fold.
            n_iter: Number of sampled parameter settings.

        Returns:
            ``(results_table, best_estimator)``.

        Note:
            Sets ``use_best_model`` to ``False`` in ``self.params``.
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """Build the ``RandomizedSearchCV``, refit on quadratic kappa."""
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """Turn ``cv_results_`` into a sorted DataFrame.

            Only parameter, mean/std train/test, fit-time and rank columns
            are kept.
            """
            df = pd.DataFrame(cv_results)
            wanted_prefixes = ('param_', 'mean_train', 'std_train',
                               'mean_test', 'std_test', 'mean_fit_time',
                               'rank')
            cols_of_interest = [
                key for key in df.keys() if key.startswith(wanted_prefixes)
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            """Fit the search and return its prettified results."""
            search.fit(X_train, y_train)
            if not mute:
                print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }

        _verbose = 0 if mute else 1

        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)

        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state))
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())

        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """Forward lookups that failed on the wrapper to ``self.model``.

        Raises:
            AttributeError: If the wrapper has no ``model`` yet (for example
                on an instance created without ``__init__``), or if the
                model lacks the attribute.
        """
        # Go through __dict__: evaluating `self.model` here while it is unset
        # would re-enter __getattr__ and recurse until RecursionError.
        try:
            model = self.__dict__["model"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            ) from None
        return getattr(model, attr)
# CatBoost model definition: 200 iterations, Logloss objective, with Accuracy
# tracked as an additional metric.
catboost_model = CatBoostClassifier(iterations=200,
                                    custom_loss=['Accuracy'],
                                    loss_function='Logloss')

# Fit CatBoost model
catboost_model.fit(train_pool)  #,plot=True)

# CatBoost accuracy on the training data, as a percentage with two decimals
acc_catboost = round(catboost_model.score(x_train, y_train) * 100, 2)

# How long will this take?
start_time = time.time()

# Set params for cross-validation as same as initial model
cv_params = catboost_model.get_params()

# Run the cross-validation for 10-folds (same as the other models)
cv_data = cv(train_pool, cv_params, fold_count=10)  #,plot=True)

# How long did it take? (seconds spent in get_params + cv)
catboost_time = (time.time() - start_time)

# CatBoost CV results are saved into a dataframe (cv_data); take the maximum
# mean test accuracy as a percentage with two decimals.
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)
"""
MLP classification
"""
# From here on x_train / x_test / x_valid are rebound to float tensors, so the
# CatBoost code above must run before these lines.
x_train = torch.tensor(x_train.values).float()
x_test = torch.tensor(x_test.values).float()
x_valid = torch.tensor(x_valid.values).float()
    eval_set=(X_validation, y_validation),
    #     logging_level='Verbose',  # you can uncomment this for text output
    plot=True)

# As you can see, it is possible to watch our model learn through verbose output or with nice plots (personally I would definitely go with the second option - just check out those plots: you can, for example, zoom in on areas of interest!)
#
# With this we can see that the best accuracy value of **0.8341** (on the validation set) was achieved on the **503rd** boosting step.

# ### 2.2 Model Cross-Validation
#
# It is good to validate your model, but cross-validating it is even better. And also with plots! So without further ado:

# In[13]:

cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
             model.get_params(),
             plot=True)

# Now we have values of our loss functions at each boosting step averaged by 10 folds, which should provide us with a more accurate estimation of our model performance:
# NOTE(review): the cv() call above does not set a fold count, so the "10 folds" figure depends on the library default - confirm.

# In[14]:

# Best mean test accuracy, its standard deviation at that step, and the step index.
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])))

# In[15]:

# Same best score, printed without rounding.
print('Precise validation accuracy score: {}'.format(
    np.max(cv_data['test-Accuracy-mean'])))
Beispiel #20
0
from src.data.processed_data import aX_i_train, ay_i_train, bX_i_train, by_i_train, cX_i_train, cy_i_train
from src.data.make_dataset import get_categorical_indices

import numpy as np
from catboost import CatBoostClassifier, cv, Pool
from sklearn.cross_validation import StratifiedKFold

# Categorical column indices for the three data sets (a, b, c).
# NOTE(review): the imports at the top of this script bring in *_i_train
# names, while everything below reads *_h_train - confirm where those come from.
a_indices, b_indices, c_indices = get_categorical_indices(
    aX_h_train), get_categorical_indices(bX_h_train), get_categorical_indices(
        cX_h_train)

# One classifier per data set, each fitted with its categorical columns.
model_a = CatBoostClassifier(nan_mode='Min')
model_a.fit(aX_h_train, ay_h_train, cat_features=a_indices)

model_b = CatBoostClassifier(nan_mode='Min')
model_b.fit(bX_h_train, by_h_train, cat_features=b_indices)

model_c = CatBoostClassifier(nan_mode='Min')
model_c.fit(cX_h_train, cy_h_train, cat_features=c_indices)

# Cross-validate each model's parameters on its own data and keep the last
# entry of the 'Logloss_test_avg' series as the score.
# NOTE(review): the key name and the [-1] indexing presumably match the cv()
# result format of the catboost version this was written for - confirm.
cv_data_a = cv(params=model_a.get_params(),
               pool=Pool(aX_h_train, ay_h_train, cat_features=a_indices))
a_score = cv_data_a['Logloss_test_avg'][-1]

cv_data_b = cv(params=model_b.get_params(),
               pool=Pool(bX_h_train, by_h_train, cat_features=b_indices))
b_score = cv_data_b['Logloss_test_avg'][-1]

cv_data_c = cv(params=model_c.get_params(),
               pool=Pool(cX_h_train, cy_h_train, cat_features=c_indices))
c_score = cv_data_c['Logloss_test_avg'][-1]

# Load a previously saved model from disk
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')


# Feature importance: know which feature contributed the most.
# Importances are paired with the column labels of X_train and printed from
# highest to lowest.
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

print('\n\n\n')
print(model.get_best_score())
print(model.get_params())


# Validation prediction, written to CSV.
# NOTE(review): the variable is called `probabilities` but holds the output of
# predict(), not predict_proba() - confirm which one is wanted.
probabilities = model.predict(eval_pool)
# print(probabilities)
pd.DataFrame(probabilities).to_csv('validation-scores/val-scores-3.csv')


# TEST VALUES

# preped_test_values = np.array(pd.read_csv('preped/preped_test_&_featured.csv'))

# eval_dataset = Pool(test_values)
# test_prediction = model.predict(preped_test_values)
Beispiel #22
0
    iterations=2000,
    learning_rate=0.01,
    
)

# Fit the model on the train/validation split, stopping early after 100
# rounds without improvement on the validation set.
clf_cat.fit(xtrain, ytrain,
            cat_features=cat_feature,
            eval_set=(xvalid, yvalid),
            early_stopping_rounds=100,
            verbose=False)

print('CatBoost model is fitted: ' + str(clf_cat.is_fitted()))
print('CatBoost model parameters:')
print(clf_cat.get_params())

# Evaluate on the validation split: accuracy, normalized Gini, confusion matrix.
predictions = clf_cat.predict(xvalid)
# The arguments need a separating comma; without it this script does not parse.
print("accuracy_score", accuracy_score(yvalid, predictions))
predictions_probas = clf_cat.predict_proba(xvalid)
score = gini_normalized(yvalid, predictions_probas)
print(score)

print('Confusion matrix\n', confusion_matrix(yvalid, predictions))

# Retrain on the whole training set once the Gini value has been inspected.
clf_cat.fit(X_train1, Y_train1, cat_features=cat_feature, verbose=False)

# Second column of predict_proba for the output rows.
result = clf_cat.predict_proba(X_out)[:, 1]
result
Beispiel #23
0
class CatBoostModel(object):
    """Train and query a ``CatBoostClassifier`` on top of a pandas DataFrame.

    The working DataFrame is kept in ``self.df`` and the classifier in
    ``self.model``.  ``Xcolumns``, ``Ycol`` and ``otherColumns`` are mappings
    of column name to a type tag (``"object"``, ``"float"``, ``"datetime"`` or
    ``"bool"``) that :meth:`convertModelFeatures` uses to coerce columns.

    NOTE(review): ``createFeatures`` is looked up as a module-level name and is
    not defined in this class; the surrounding module has to provide it.
    """

    def __init__(self,
                 df,
                 doCreateFeatures=False,
                 Ycol=None,
                 Xcolumns=None,
                 otherColumns=None,
                 requiredColumns=None):
        """Prepare the DataFrame and, optionally, the train/validation split.

        Args:
            df: Source DataFrame; its columns are converted in place.
            doCreateFeatures: When truthy, run feature creation and build
                ``X``/``y`` plus a 75/25 train/validation split.
            Ycol: Mapping ``{target column: type tag}``.
            Xcolumns: Mapping ``{feature column: type tag}``.
            otherColumns: Mapping ``{other column: type tag}``.
            requiredColumns: Columns in which rows with nulls are dropped.
                Defaults to no columns.

        NOTE(review): ``Ycol``, ``Xcolumns`` and ``otherColumns`` default to
        ``None`` but ``fixFeatureTypes`` calls ``.items()`` on each of them, so
        all three must be supplied.  The split attributes used by
        ``fitModel``/``logPrecision`` only exist when ``doCreateFeatures`` is
        truthy.
        """
        self.df = df
        self.model = None
        self.Xcolumns = Xcolumns
        self.Ycol = Ycol
        self.otherColumns = otherColumns
        # A fresh list per instance: a ``[]`` default in the signature would
        # be shared by every instance created without this argument.
        self.requiredColumns = ([] if requiredColumns is None
                                else requiredColumns)
        self.df = self.fixFeatureTypes(df)
        self.df = self.dropColumnsWithNull(df, self.requiredColumns)

        if doCreateFeatures:
            # NOTE(review): the unfiltered ``df`` (not ``self.df``) is passed
            # on here, exactly as before -- confirm this is intended.
            self.df = self.createFeaturesForNewData(df, createFeatures)
            self.y = self.df[Ycol]
            self.X = self.df[list(Xcolumns.keys())]
            # Every column whose dtype is not float is passed to CatBoost as
            # a categorical feature.  ``np.float`` was only an alias of the
            # builtin ``float`` and no longer exists in NumPy >= 1.24.
            self.categorical_features_indices = np.where(
                self.X.dtypes != float)[0]
            (self.X_train, self.X_validation,
             self.y_train, self.y_validation) = train_test_split(
                 self.X, self.y, train_size=0.75, random_state=42)

    def save(self, name):
        """Save the current model to ``./data/<name>``."""
        self.model.save_model(f'./data/{name}')

    def load(self, name):
        """Load a model from ``./data/<name>`` into ``self.model``."""
        self.model = CatBoostClassifier().load_model(f'./data/{name}')

    def fixFeatureTypes(self, df):
        """Coerce the feature, target and other columns of ``df``.

        Returns the same DataFrame after running ``convertModelFeatures`` for
        ``Xcolumns``, ``Ycol`` and ``otherColumns`` in that order.
        """
        for features in (self.Xcolumns, self.Ycol, self.otherColumns):
            df = self.convertModelFeatures(df, features)
        return df

    def predict(self, items):
        """Predict each record in ``items`` and annotate it with the result.

        Every item is turned into a one-row DataFrame, converted, enriched by
        ``createFeatures`` and appended to ``self.df``.  The feature values,
        the predicted label (``'prediction'``) and the second column of
        ``predict_proba`` (``'proba'``) are written back into the item.

        Returns:
            list: The (mutated) input items, in input order.
        """
        results = []
        for item in items:
            df = pd.DataFrame.from_records([item])
            df = self.fixFeatureTypes(df)
            df = self.createFeaturesForNewData(df, createFeatures)
            # DataFrame.append was removed in pandas 2.0.
            self.df = pd.concat([self.df, df], ignore_index=True)
            df = df.drop(self.Ycol, axis=1)
            # Explicit copy so the .loc assignments below write to an
            # independent frame rather than to a slice of ``df``.
            preddf = df[list(self.Xcolumns.keys())].copy()
            predictions = self.model.predict(preddf)
            predictions_probs = self.model.predict_proba(preddf)
            first_row = preddf.index[0]
            preddf.loc[first_row, 'prediction'] = predictions[0]
            preddf.loc[first_row, 'proba'] = predictions_probs[0][1]
            result = preddf.to_dict('records')[0]
            for k, v in result.items():
                item[k] = v
            results.append(item)
        return results

    def train(self):
        """Build the classifier, fit it and log cross-validated accuracy."""
        self.buildModel()
        self.fitModel()
        self.logPrecision()

    def dropColumnsWithNull(self, df, columns):
        """Return ``df`` without the rows that are null in any of ``columns``.

        Despite the name, rows are removed, not columns.
        """
        for col in columns:
            df = df[pd.notnull(df[col])]
        return df

    def buildModel(self):
        """Create an unfitted ``CatBoostClassifier`` in ``self.model``."""
        logging.warning('Building Model')
        self.model = CatBoostClassifier(custom_loss=['Accuracy'],
                                        random_seed=42,
                                        logging_level='Silent')

    def fitModel(self):
        """Fit ``self.model`` on the training split, evaluating on validation."""
        logging.warning('Fitting Model')
        self.model.fit(
            self.X_train,
            self.y_train,
            cat_features=self.categorical_features_indices,
            eval_set=(self.X_validation, self.y_validation),
            # Passed explicitly for this fit call (the model was built with
            # 'Silent').
            logging_level='Verbose'
        )

    def logPrecision(self):
        """Cross-validate on the full ``X``/``y`` and log the best accuracy."""
        cv_params = self.model.get_params()
        cv_params.update({'loss_function': 'Logloss'})
        cv_data = cv(
            Pool(self.X,
                 self.y,
                 cat_features=self.categorical_features_indices), cv_params)
        accuracy_mean = cv_data['test-Accuracy-mean']
        best_step = np.argmax(accuracy_mean)
        logging.warning(
            'Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
                np.max(accuracy_mean),
                cv_data['test-Accuracy-std'][best_step],
                best_step))
        logging.warning('Precise validation accuracy score: {}'.format(
            np.max(accuracy_mean)))

    @staticmethod
    def _to_numeric_or_keep(value):
        """Convert ``value`` with ``pd.to_numeric``; return it unchanged on failure."""
        try:
            return pd.to_numeric(value, downcast='float')
        except (ValueError, TypeError):
            return value

    @staticmethod
    def _to_datetime_or_keep(value):
        """Convert ``value`` with ``pd.to_datetime``; return it unchanged on failure."""
        try:
            return pd.to_datetime(value)
        except (ValueError, TypeError, OverflowError):
            return value

    def convertModelFeatures(self, df, features):
        """Coerce the columns named in ``features`` according to their type tag.

        Args:
            df: DataFrame to convert; columns are reassigned on this object.
            features: Mapping ``{column name: type tag}``.  Supported tags:
                ``"object"`` (nulls and empty strings become ``'Nothing'``),
                ``"float"`` (nulls become 0, values parsed as numbers where
                possible), ``"datetime"`` (values parsed where possible) and
                ``"bool"`` (cast to float).  Other tags are ignored.

        Returns:
            The same DataFrame.  Columns missing from ``df`` are skipped.
        """
        for k, v in features.items():
            try:
                if v == "object":
                    # Assign the result back instead of chaining
                    # ``inplace=True`` on a column selection, which is not
                    # reliable under pandas copy-on-write.
                    df[k] = df[k].fillna('Nothing')
                    df[k] = df[k].replace('', 'Nothing')
                    df[k] = df[k].astype(object)
                elif v == "float":
                    df[k] = df[k].fillna(0)
                    # Per-value conversion that keeps unparseable values;
                    # replaces the deprecated ``errors='ignore'``.
                    df[k] = df[k].apply(self._to_numeric_or_keep)
                elif v == "datetime":
                    df[k] = df[k].apply(self._to_datetime_or_keep)
                elif v == "bool":
                    df[k] = df[k].astype(float)
            except KeyError:
                # Column not present in this frame: deliberately skipped.
                pass
        return df

    def createFeaturesForNewData(self, df, f):
        """Append ``df`` to the stored frame and run ``f`` for every new row.

        Args:
            df: Rows to create features for.
            f: Callable ``f(fulldf, index, row) -> fulldf`` invoked once per
                appended row with the combined frame.

        Returns:
            The last ``len(df)`` rows of the combined frame after all calls.
            ``self.df`` itself is not reassigned here.
        """
        # number of predictions
        nb = len(df)
        logging.warning(f"creating features for {nb} predictions")

        # DataFrame.append was removed in pandas 2.0.
        fulldf = pd.concat([self.df, df], ignore_index=True)
        lastPreds = fulldf.tail(nb)  # we get the correct indexes this way
        for index, row in lastPreds.iterrows():
            fulldf = f(fulldf, index, row)
        return fulldf.tail(nb)
Beispiel #24
0
class CatBoost(Model):
    """CatBoost-based implementation of the project's ``Model`` base class.

    The fitted ``CatBoostClassifier`` is stored in ``self.model``.
    """

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """Fit a default classifier on the training data.

        ``X_val`` and ``y_val`` are accepted but not used by this method.
        """
        self.model = CatBoostClassifier(verbose=False)
        self.model.fit(X_train, y_train)

    def tune_best(self, X_train, y_train, X_val, y_val):
        """Fit a classifier with fixed parameters and ``use_best_model=True``.

        The validation data is used as the evaluation set.  The previous
        implementation first trained an identical model with
        ``use_best_model=False`` and discarded it immediately; that redundant
        training run has been removed.
        """
        params = {
            'iterations': 500,
            'learning_rate': 0.001,
            'eval_metric': 'Logloss',
            'random_seed': 42,
            'logging_level': 'Silent',
            'use_best_model': True
        }
        train_pool = Pool(X_train, y_train)
        validate_pool = Pool(X_val, y_val)
        self.model = CatBoostClassifier(**params)
        self.model.fit(train_pool, eval_set=validate_pool,
                       logging_level='Verbose')

    def tune(self, X_train, y_train, X_val, y_val):
        """Fit with plotting enabled, then cross-validate and print the best Logloss.

        Logloss is a loss, so the best cross-validation step is the one with
        the smallest mean value (the original code reported the largest value
        and labelled it as accuracy).
        """
        self.model = CatBoostClassifier(
            custom_loss=['Logloss'],
            random_seed=42,
            logging_level='Silent'
        )
        self.model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            logging_level='Verbose',
            plot=True
        )
        cv_params = self.model.get_params()
        cv_params.update({
            'loss_function': 'Logloss'
        })
        cv_data = cv(
            Pool(X_train, y_train),
            cv_params,
            plot=True
        )
        logloss_mean = cv_data['test-Logloss-mean']
        best_step = np.argmin(logloss_mean)
        print('Best validation Logloss score: {:.2f}±{:.2f} on step {}'.format(
            np.min(logloss_mean),
            cv_data['test-Logloss-std'][best_step],
            best_step
        ))
        print('Precise validation Logloss score: {}'.format(np.min(logloss_mean)))

    def transform(self, X_test):
        """Return the full ``predict_proba`` output for ``X_test``."""
        return self.model.predict_proba(X_test)

    def predict(self, X_test):
        """Return the second column of ``predict_proba`` for ``X_test``."""
        return self.model.predict_proba(X_test)[:, 1]

    def evaluate(self, X_test, y_test):
        """Print the log loss of the predicted probabilities against ``y_test``."""
        y_pred = self.transform(X_test)
        score = log_loss(y_test, y_pred)
        print("LOG LOSS : ", score)
# Print every (feature, importance) pair on its own line.
for pair in feature_importances_cbc:
    print('Variable: {:20} Importance: {}'.format(*pair))

# HYPERPARAMETER OPTIMISATION - CATBOOST

# Grid search with cross-validation

from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV

classifier_cbc_gscv = CatBoostClassifier()

# Names of the parameters reported by the estimator.
classifier_cbc_gscv.get_params().keys()

# Candidate values explored by the grid search.
grid_param_cbc = dict(
    depth=[4, 7, 10],
    iterations=[100, 200, 300, 400, 500],
    l2_leaf_reg=[1, 4, 9],
    learning_rate=[0.03, 0.1, 0.15],
)

# Grid search scored on accuracy with cv=10.
classifier_cbc_gscv_gd_sr = GridSearchCV(
    estimator=classifier_cbc_gscv,
    param_grid=grid_param_cbc,
    scoring='accuracy',
    cv=10,
)

classifier_cbc_gscv_gd_sr.fit(X_train, Y_train)
# Classifier configuration; the loss function is left at its default.
clf = CatBoostClassifier(iterations=2000, learning_rate=0.1)

# Fit on the training split, using the validation split as eval set.
clf.fit(
    X_train,
    y_train,
    cat_features=categorical_columns,
    eval_set=(X_val, y_val),
    verbose=False,
)

print(f'CatBoost model is fitted: {clf.is_fitted()}')
print('CatBoost model parameters:')
print(clf.get_params())

# Predictions and predict_proba output for the validation split.
predictions = clf.predict(X_val)
print("accuracy_score", accuracy_score(y_val, predictions))

predictions_probas = clf.predict_proba(X_val)
val_roc_auc = roc_auc_score(y_val, predictions_probas[:, 1])
print("roc-auc score for the class 1, from target 'HasDetections' ",
      val_roc_auc)

# Confusion matrix of the validation predictions, drawn as a heatmap.
val_cnf_matrix = confusion_matrix(y_val, predictions)
heatmap_axes = sns.heatmap(val_cnf_matrix, annot=True, fmt='.2f', cmap="BrBG")
heatmap_axes.set_title("Validation")
plt.show()

#completely training the whole train dataset
Beispiel #27
0
def cat_boost():
    """Train a CatBoost classifier, print its metrics and write a submission.

    Uses the module-level helpers ``data_pre_processing`` and
    ``data_processor`` and the global ``categorical_features_pos``.  Reads the
    encoded test set from ``../Data/df_test_enc_2.csv`` and writes the
    predictions to ``../Data/submission_1.csv``.
    """
    print("Start of Cat boost")
    # The full X/y pair is loaded but not used further down.
    X, y = data_pre_processing()
    X_train, X_test, y_train, y_test = data_processor()

    model_params = dict(
        iterations=1375,
        learning_rate=0.109499,
        depth=6,
        thread_count=10,
        eval_metric='AUC',
        bagging_temperature=0.9,
        od_type='Iter',
        metric_period=75,
        loss_function='Logloss',
        od_wait=100,
    )
    classifier = CatBoostClassifier(**model_params)
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        cat_features=categorical_features_pos,
        verbose=True,
    )

    print("Model Evaluation Stage")
    print(classifier.get_params())
    print("\nevaluate predictions")
    test_pred = classifier.predict(X_test)

    # Metrics on the held-out split.
    accuracy = accuracy_score(y_test, test_pred)
    f_score = f1_score(y_test, test_pred)
    print("Accuracy : %.2f%%" % (accuracy * 100.0))
    print("F1 Score : %.2f%%" % (f_score * 100.0))
    print('Confusion Matrix :')
    print(confusion_matrix(y_test, test_pred))
    print('Report : ')
    print(classification_report(y_test, test_pred))
    print(mean_squared_error(y_test, test_pred))

    # Predict the competition test set and write the submission file.
    print('Final Result: Make Prediction and Output Score')
    raw_test_frame = pd.read_csv('../Data/df_test_enc_2.csv')
    # 'Unnamed: 0' is dropped before the frame is converted to an array.
    test_matrix = np.array(raw_test_frame.drop(['Unnamed: 0'], axis=1))
    submission = pd.DataFrame(data=classifier.predict(test_matrix),
                              columns=['accepted'])
    print(submission.shape)

    submission.index.names = ['row_id']
    submission['accepted'] = submission['accepted'].astype(np.int64)
    print(submission.shape)
    print(submission.head())
    submission.to_csv('../Data/submission_1.csv')
    print("End of Cat boost")
def cat_boost():
    """Train a CatBoost classifier, plot its precision-recall curve and submit.

    Uses the module-level helpers ``data_pre_processing`` and
    ``data_processor`` and the global ``categorical_features_pos``.  Reads the
    encoded test set from ``../Data/df_test_enc_3.csv`` and writes the
    predictions to ``../Data/submission_1.csv``.
    """
    print("Start of Cat boost")
    # The full X/y pair is loaded but not used further down.
    X, y = data_pre_processing()
    X_train, X_test, y_train, y_test = data_processor()

    model_params = dict(
        iterations=1375,
        learning_rate=0.1094999,
        depth=6,
        thread_count=10,
        eval_metric='AUC',
        bagging_temperature=0.9,
        od_type='IncToDec',
        metric_period=75,
        random_seed=42,
        random_strength=1.0,
        nan_mode="Min",
        scale_pos_weight=1.0,
        od_wait=100,
    )
    classifier = CatBoostClassifier(**model_params)
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        cat_features=categorical_features_pos,
        verbose=True,
    )

    print("Model Evaluation Stage")
    print(classifier.get_params())
    print("\nevaluate predictions")
    test_pred = classifier.predict(X_test)

    # Metrics on the held-out split.
    accuracy = accuracy_score(y_test, test_pred)
    f_score = f1_score(y_test, test_pred)
    print("Accuracy : %.2f%%" % (accuracy * 100.0))
    print("F1 Score : %.2f%%" % (f_score * 100.0))
    print('Confusion Matrix :')
    print(confusion_matrix(y_test, test_pred))
    print('Report : ')
    print(classification_report(y_test, test_pred))
    print(mean_squared_error(y_test, test_pred))

    # Precision-recall analysis, based on the second predict_proba column.
    positive_probs = classifier.predict_proba(X_test)[:, 1]
    predicted_classes = classifier.predict(X_test)
    precision, recall, thresholds = precision_recall_curve(
        y_test, positive_probs)
    f1 = f1_score(y_test, predicted_classes)
    # Area under the precision-recall curve.
    pr_auc = auc(recall, precision)
    ap = average_precision_score(y_test, positive_probs)
    print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))

    plt.figure(figsize=(12, 6))
    # Horizontal reference line at precision 0.5, labelled "No Skill".
    plt.plot([0, 1], [0.5, 0.5], linestyle='--', label="No Skill")
    plt.plot(recall, precision, marker='.', label="precision-recall curve")
    plt.title("Line Plot of Precision-Recall Curve", {"fontsize": 16})
    plt.ylabel('Precision (y-axis)')
    plt.xlabel('Recall (x-axis)')
    plt.show()

    # Predict the competition test set and write the submission file.
    print('Final Result: Make Prediction and Output Score')
    raw_test_frame = pd.read_csv('../Data/df_test_enc_3.csv')
    # 'Unnamed: 0' is dropped before the frame is converted to an array.
    test_matrix = np.array(raw_test_frame.drop(['Unnamed: 0'], axis=1))
    submission = pd.DataFrame(data=classifier.predict(test_matrix),
                              columns=['accepted'])
    print(submission.shape)

    submission.index.names = ['row_id']
    submission['accepted'] = submission['accepted'].astype(np.int64)
    print(submission.shape)
    print(submission.head())
    submission.to_csv('../Data/submission_1.csv')
    print("End of Cat boost")
Beispiel #29
0
from sklearn.metrics import recall_score,precision_score

# Macro-averaged recall, micro-averaged precision and plain accuracy of the
# test predictions.
macro_recall = recall_score(y_test, y_pred, average='macro')
print(macro_recall)

micro_precision = precision_score(y_test, y_pred, average='micro')
print(micro_precision)

print(accuracy_score(y_test, y_pred))


# Cross validation: reuse the classifier's parameters with Logloss as the
# loss function.
cv_params = clf.get_params()
cv_params['loss_function'] = 'Logloss'
cv_pool = Pool(X, y, cat_features=cat_featuresind)
cv_data = cv(cv_pool, cv_params, plot=True)

# Report the step with the highest mean test accuracy.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    cv_data['test-Accuracy-std'][best_step],
    best_step
))
Beispiel #30
0
    def model_catboost(self, X, y, X_train, y_train, X_test, y_test,
                       categorical_features_indices, target, file):
        """Train a multi-class CatBoost model, cross-validate it and report.

        Args:
            X, y: Full feature matrix and labels, used for cross-validation.
            X_train, y_train: Training split.
            X_test, y_test: Split used both as eval set during training and
                for the printed reports.
            categorical_features_indices: Passed as ``cat_features`` to every
                ``Pool``.
            target: Column name under which ``y_test`` is stored in the
                prediction frame.
            file: Destination passed to ``DataFrame.to_csv`` for that frame.

        Returns:
            tuple: ``(model, cv_data)`` -- the fitted classifier and the
            result of ``cv``.
        """
        print("Processing CATBOOST....")

        # Pools for training and for evaluation during training.
        train_pool = Pool(X_train,
                          y_train,
                          cat_features=categorical_features_indices)
        validate_pool = Pool(X_test,
                             y_test,
                             cat_features=categorical_features_indices)

        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric='TotalF1',
                                   use_best_model=True,
                                   random_seed=42,
                                   leaf_estimation_method='Newton')

        model.fit(train_pool,
                  eval_set=validate_pool,
                  use_best_model=True,
                  verbose=50,
                  plot=False,
                  early_stopping_rounds=100)

        # Cross-validation on the full data set with the model's parameters.
        cv_params = model.get_params()
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     cv_params,
                     fold_count=10,
                     plot=False)
        # NOTE(review): np.max is applied to the whole cv result rather than
        # to a single metric column (e.g. a TotalF1 column) -- confirm.
        print('Precise validation accuracy score: {}'.format(
            np.max(cv_data)))

        print("PRIMER prediccion")
        print()
        print(model)
        # Predict the test split once; the original recomputed the same
        # prediction three times for the reports below.
        expected_y = y_test
        predicted_y = model.predict(X_test)
        # Summarise the fit of the model.
        print()
        print(metrics.classification_report(expected_y, predicted_y))
        print()
        print(metrics.confusion_matrix(expected_y, predicted_y))

        print("SEGUNDO prediccion")
        print(model.best_iteration_, model.best_score_)
        print(model.evals_result_['validation']['MultiClass'][-10:])

        print("PREDICT")
        print(predicted_y)

        # Frame holding the expected labels next to the predictions.
        print("print dataframe predictions:")
        cm = pd.DataFrame()
        cm[target] = y_test
        cm['Predict'] = predicted_y
        print(cm)

        print("SCORES")
        print(model.score(X_test, y_test))
        cm.to_csv(file)

        # Confusion matrix computed by CatBoost on the test split.
        print("confusion matrix:")
        conf_mat = get_confusion_matrix(
            model,
            Pool(X_test, y_test, cat_features=categorical_features_indices))
        print(conf_mat)

        # Feature importances of the fitted model.
        print(model.get_feature_importance(prettified=True))

        return model, cv_data
model = CatBoostClassifier(eval_metric='Accuracy', use_best_model=True, random_seed=42)


# In[11]:


# Fit the model, using the test split as the evaluation set.
model.fit(xtrain, ytrain, cat_features=cate_features_index, eval_set=(xtest, ytest))


# In[12]:


# The data set is small, so the model is also judged by 10-fold
# cross-validation.  params and pool are passed by keyword because the
# positional order of these two parameters of cv is not the same in every
# catboost release.
cv_data = cv(params=model.get_params(),
             pool=Pool(x, y, cat_features=cate_features_index),
             fold_count=10)


# In[13]:


# Show the best cross-validated accuracy.
# NOTE(review): the result key below has to match the naming used by the
# installed catboost release -- confirm against the actual cv output.
print('the best cv accuracy is :{}'.format(np.max(cv_data["b'Accuracy'_test_avg"])))


# In[14]:


# Show the accuracy on the test split.  This is not the cross-validated
# accuracy, which is the recommended figure for judging the model.
print('the test accuracy is :{:.6f}'.format(accuracy_score(ytest, model.predict(xtest))))