from catboost import CatBoostClassifier

def CatBoost_gridsearch(x_train, y_train, categorical_indexes, cat_params):
    """Run CatBoost's built-in grid search over cat_params and return the results."""
    cat = CatBoostClassifier(cat_features=categorical_indexes)
    tune = cat.grid_search(cat_params,
                           cv=5,
                           stratified=True,
                           shuffle=True,
                           search_by_train_test_split=True,
                           X=x_train,
                           y=y_train,
                           plot=True)
    return tune
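# Usage sketch with synthetic data (an illustrative assumption, not from the
# original code): tune['params'] holds the best parameter combination and
# tune['cv_results'] the cross-validated metrics.
import numpy as np
import pandas as pd

_x = pd.DataFrame({'cat_a': np.random.choice(['u', 'v'], 200),
                   'num_b': np.random.rand(200)})
_y = np.random.randint(0, 2, 200)
_grid = {'depth': [4, 6], 'iterations': [50]}
_tune = CatBoost_gridsearch(_x, _y, categorical_indexes=[0], cat_params=_grid)
print(_tune['params'])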
Example #2
from catboost import CatBoostClassifier

def grid_search_catboost(pool):
    """Grid search helper for the CatBoost classifier (scikit-learn-style API)."""
    params = {
        "verbose": False,
        "eval_metric": "AUC",
        "loss_function": "Logloss",
    }
    grid = {
        "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
        "iterations": [1000, 2000],
        "depth": [3, 4, 5, 6],
        "min_data_in_leaf": [3, 5, 7, 10],
    }
    model = CatBoostClassifier(**params)
    search_results = model.grid_search(grid, X=pool, verbose=False)
    return {**search_results["params"], **params}
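# Usage sketch (illustrative assumption: synthetic data; note the grid above
# searches 1000-2000 iterations, so this can take a while to run):
import numpy as np
import pandas as pd
from catboost import Pool

_df = pd.DataFrame({'color': np.random.choice(['r', 'g', 'b'], 300),
                    'size': np.random.rand(300)})
_pool = Pool(data=_df, label=np.random.randint(0, 2, 300), cat_features=['color'])
print(grid_search_catboost(_pool))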
# Alternative grid entries (commented out):
# cat_grid = {
#           'l2_leaf_reg': np.logspace(-20, -19, 3),
#           'leaf_estimation_iterations': [20],
#           'eval_metric': ['Accuracy'],
#           'use_best_model': ['True'],
#           'logging_level': ['Silent'],
#           'random_seed': [0]
#          }

import datetime
from sklearn.model_selection import cross_val_score

categorical_indexes = [0, 3, 5, 6]
cat = CatBoostClassifier(cat_features=categorical_indexes).fit(x_train, y_train)
cross_val_score(cat, x_val, y_val).mean()

cat_grid = {'iterations': [150, 300, 500],
            'depth': [3, 5, 7],
            'random_seed': [0],
            'learning_rate': [0.005, 0.01, 0.1, 0.2],
            'l2_leaf_reg': [3, 5, 7, 9],
            'leaf_estimation_iterations': [10, 30, 50]}
cat = CatBoostClassifier(cat_features=categorical_indexes)
cat_grid = cat.grid_search(cat_grid, cv=5, stratified=True, shuffle=True, search_by_train_test_split=True,
                           X=x_train, y=y_train, plot=True)  # check the test logloss and std to set iterations=??

cat_grid['cv_results'].keys()
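# A minimal sketch of acting on the comment above: pick the iteration count at
# the lowest cross-validated test logloss (key names follow grid_search's
# cv_results convention; verify them against the keys printed above):
import numpy as np

_best = int(np.argmin(cat_grid['cv_results']['test-Logloss-mean']))
print('iterations:', cat_grid['cv_results']['iterations'][_best],
      'std:', cat_grid['cv_results']['test-Logloss-std'][_best])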


cat_fit = CatBoostClassifier(cat_features=categorical_indexes, leaf_estimation_iterations=50, depth=5, 
random_seed=0, l2_leaf_reg=7, iterations=300, learning_rate=0.2)
cat_fit.fit(x_train, y_train)

cat_dict = {}
cat_dict['Catboost'] = {'time': str(datetime.datetime.now()),
                        'name': 'CatBoost',
                        'best_param': cat_grid['params'],
                        'cross_val_score_mean': cross_val_score(cat_fit, x_val, y_val).mean()}


# kfold testing
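# A minimal k-fold sketch using CatBoost's built-in cv (assumes x_train,
# y_train, and categorical_indexes from above; params mirror the tuned model):
from catboost import Pool, cv

cv_results = cv(pool=Pool(x_train, y_train, cat_features=categorical_indexes),
                params={'loss_function': 'Logloss', 'iterations': 300,
                        'depth': 5, 'learning_rate': 0.2, 'l2_leaf_reg': 7,
                        'leaf_estimation_iterations': 50},
                fold_count=5, stratified=True, shuffle=True, verbose=False)
print(cv_results[['iterations', 'test-Logloss-mean', 'test-Logloss-std']].tail(1))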
Example #4
import numpy as np
import shap
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# create_boost_training_data, get_roc_curve, get_confusion_matrix, and
# plot_confusion_matrix are assumed to be project-local / catboost.utils helpers

def gradient_boosting_classifier(data,
                                 excluded_num_cols,
                                 excluded_cat_cols,
                                 target,
                                 sub_category=None,
                                 tune_parameters=False,
                                 display_results=True):
    """
    Create a CatBoost classifier with the given input and display the results.

    :param data: base dataframe
    :type data: pandas.DataFrame
    :param excluded_num_cols: numerical columns to exclude from the analysis
    :type excluded_num_cols: list
    :param excluded_cat_cols: categorical columns to exclude from the analysis
    :type excluded_cat_cols: list
    :param target: categorical column to classify on
    :type target: str
    :param sub_category: optional target category; reduces the task to binary classification
    :param tune_parameters: if True, run a cross-validated grid search instead of the fixed configuration
    :param display_results: if True, print the score and plot SHAP, ROC, or confusion-matrix diagnostics
    :return: the fitted model and its accuracy score
    :rtype: tuple
    """

    if not tune_parameters:

        # Binary classification case when a binary column is picked or a subcategory is set
        if len(data[target].dropna().unique()) == 2 or sub_category:
            # Train the model in standard configuration
            train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
                data,
excluded_num_cols,
                excluded_cat_cols,
                target,
                target_cat=sub_category)
            model = CatBoostClassifier(iterations=40,
                                       learning_rate=1,
                                       depth=8,
                                       loss_function="Logloss",
                                       custom_metric=["Logloss", "AUC"])
        else:
            # Train the model in standard configuration
            train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
data, excluded_num_cols, excluded_cat_cols, target)
            model = CatBoostClassifier(
                iterations=40,
                learning_rate=1,
                depth=8,
                loss_function="MultiClassOneVsAll",
                custom_metric=["MultiClassOneVsAll", "AUC"])
        model.fit(train_pool)
    else:
        # Perform cross validated hyper parameter tuning on the training set
        train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
            data,
excluded_num_cols,
            excluded_cat_cols,
            target,
            target_cat=sub_category)

        # Binary classification case when a binary column is picked or a subcategory is set
        if len(data[target].dropna().unique()) == 2 or sub_category:
            model = CatBoostClassifier(loss_function="Logloss",
                                       custom_metric=["Logloss", "AUC"])
        else:

            model = CatBoostClassifier(
                loss_function="MultiClassOneVsAll",
                custom_metric=["MultiClassOneVsAll", "AUC"])
        grid = {
            "iterations": [40, 60, 100],
            "learning_rate": [0.01, 0.1, 1],
            "depth": [4, 6, 10],
            "l2_leaf_reg": [3, 5, 7]
        }
        # grid_search refits the model on the best parameters afterwards
        # (refit=True is CatBoost's default), so predict below uses the tuned model
        grid_result = model.grid_search(grid,
                                        X=train_pool,
                                        plot=False,
                                        verbose=True)
    pred = model.predict(test_pool)
    if len(data[target].dropna().unique()) == 2:
        # predicted labels come back as strings here, hence the comparison with "True"
        score = accuracy_score(y_test, pred == "True")
    else:
        # TODO implement multiclass scoring
        score = accuracy_score(y_test, np.squeeze(pred))
    if display_results:
        print(score)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(x_train)
        # print(model.get_best_score())
        shap.summary_plot(shap_values, x_train)
        # print(model.eval_metrics(test_pool, metrics=["AUC"]))
        # If binary, plot the ROC curve
        if len(data[target].dropna().unique()) == 2 or sub_category:
            get_roc_curve(model, test_pool, plot=True)
        # otherwise plot confusion matrix
        else:
            confusion_matrix = get_confusion_matrix(model, test_pool)
            plot_confusion_matrix(confusion_matrix)

    return model, score
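# Example call (hypothetical column and target names; create_boost_training_data
# is project-specific, so this only illustrates the intended interface):
# model, score = gradient_boosting_classifier(df,
#                                             excluded_num_cols=['id'],
#                                             excluded_cat_cols=['free_text'],
#                                             target='churned',
#                                             tune_parameters=True)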
# catboost_lon_level1.grid_search(param_grid=catboost_lon_level1_params, X=X, y=y_lon)

# %% Latitude optimization
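# param_grid is assumed to be defined earlier in the script; a minimal sketch
# consistent with the tuned parameters printed below (values are illustrative):
param_grid = {'depth': [4, 6, 8, 10], 'l2_leaf_reg': [1, 3, 5]}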

grid_search_results = catboost_lat_level1.grid_search(
    param_grid, X=X, y=y_lat, shuffle=False, verbose=3
)
catboost_lat_level1_params = grid_search_results["params"]
print(catboost_lat_level1_params)
# catboost_lat_level1_params = {'depth': 4, 'l2_leaf_reg': 1}
# catboost_lat_level1.grid_search(param_grid=catboost_lat_level1_params, X=X, y=y_lat)

# %% Buildings optimization

grid_search_results = catboost_building_level1.grid_search(
    param_grid, X=X, y=y_building, shuffle=False, verbose=3
)
catboost_building_level1_params = grid_search_results["params"]
print(catboost_building_level1_params)
# catboost_building_level1_params = {'depth': 10, 'l2_leaf_reg': 1}
# catboost_building_level1.grid_search(
#     param_grid=catboost_building_level1_params, X=X, y=y_building
# )

# %% Floors optimization

grid_search_results = catboost_floor_level1.grid_search(
    param_grid, X=X, y=y_floor, shuffle=False, verbose=3
)
catboost_floor_level1_params = grid_search_results["params"]
print(catboost_floor_level1_params)
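# A minimal sketch of reusing the tuned parameters on a fresh estimator
# (assumption: the floor model is a CatBoostClassifier; the coordinate models
# would be CatBoostRegressor instead):
from catboost import CatBoostClassifier

tuned_floor_model = CatBoostClassifier(**catboost_floor_level1_params)
tuned_floor_model.fit(X, y_floor, verbose=False)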
Example #6
from catboost import CatBoostClassifier
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC


def cvgrid_search(X, y, X_test, model_type):
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      stratify=y,
                                                      shuffle=True,
                                                      test_size=0.2,
                                                      random_state=42)

    if model_type == 'cat':
        model = CatBoostClassifier(**PARAMS['cat_def'])
        grid_search_result = model.grid_search(PARAMS['cat_cv'],
                                               X=X_train,
                                               y=y_train,
                                               plot=False,
                                               cv=5,
                                               stratified=True,
                                               verbose=0)
        best_model = CatBoostClassifier(**PARAMS['cat_def'],
                                        **grid_search_result['params'])

        best_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)
        get_preds = lambda model, x: model.predict(
            x, prediction_type='Probability')[:, 1]

    elif model_type == 'rf':
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=PARAMS['rf_cv'],
                                       n_iter=50,
                                       cv=5,
                                       verbose=0,
                                       random_state=42,
                                       n_jobs=-1)
        # Tune hyperparams
        rf_random.fit(X_train, y_train)
        best_model = rf_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    elif model_type == 'lr':
        lr = linear_model.LogisticRegression()
        lr_random = RandomizedSearchCV(estimator=lr,
                                       param_distributions=PARAMS['lr_cv'],
                                       n_iter=50,
                                       cv=5,
                                       verbose=0,
                                       random_state=42,
                                       n_jobs=-1)
        # Tune hyperparams
        lr_random.fit(X_train, y_train)
        best_model = lr_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    elif model_type == 'svm':
        svc = SVC()
        svc_random = RandomizedSearchCV(estimator=svc,
                                        param_distributions=PARAMS['svm_cv'],
                                        n_iter=50,
                                        cv=5,
                                        verbose=0,
                                        random_state=42,
                                        n_jobs=-1)
        # Tune hyperparams
        svc_random.fit(X_train, y_train)
        best_model = svc_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.decision_function(x)
    else:
        raise ValueError(f"Unknown model_type: {model_type!r}")

    y_pred_train = get_preds(best_model, X_train)
    y_pred_val = get_preds(best_model, X_val)
    train_auc = roc_auc_score(y_train, y_pred_train)
    val_auc = roc_auc_score(y_val, y_pred_val)
    train_ap = average_precision_score(y_train, y_pred_train)
    val_ap = average_precision_score(y_val, y_pred_val)

    # Final fit
    best_model.fit(X, y)

    # Return predictions on the full training data and the test set, plus metrics
    return ((get_preds(best_model, X), get_preds(best_model, X_test)),
            (train_auc, val_auc),
            (train_ap, val_ap))
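# cvgrid_search assumes a module-level PARAMS dict; a minimal sketch of its
# expected shape (every key and value below is an illustrative assumption):
PARAMS = {
    'cat_def': {'loss_function': 'Logloss', 'verbose': 0, 'random_seed': 42},
    'cat_cv': {'depth': [4, 6, 8], 'learning_rate': [0.03, 0.1]},
    'rf_cv': {'n_estimators': [100, 300], 'max_depth': [None, 10, 30]},
    'lr_cv': {'C': [0.01, 0.1, 1, 10]},
    'svm_cv': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
}

# Example call (hypothetical data):
# (full_preds, test_preds), (train_auc, val_auc), (train_ap, val_ap) = \
#     cvgrid_search(X, y, X_test, model_type='cat')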