predictions)  # Calculating the Recall score.
    recall_list.append(recall)
    kappa = cohen_kappa_score(activity_shuffled,
                              predictions)  # Calculating the Kappa score.
    kappa_list.append(kappa)

    #@ GINI Features Importance:
    gini_importances[
        "Run" +
        str(seed)] = model.feature_importances_  # Features Importance: GINI.

    #@ PERMUTATION Features Importance:
    permutation_important = permutation_importance(
        model,
        data,
        activity_shuffled,
        n_repeats=5,  # Permutation Importance.
        n_jobs=-1,
        random_state=seed)
    permutation_importances["Run" + str(
        seed
    )] = permutation_important.importances_mean  # Features Importance: PERMUTATION.

# 1. Evaluation Metrics:
# I will calculate evaluation metrics such as the OOB score, Recall score and Kappa score using the whole dataset.
# I will be changing the seed of the Random Forest Classifier on each iteration of the loop. The seed values start at 123
# and run up to (but not including) 148, giving 25 iterations of the loop in total.

#@ Dictionary of Evaluation Metrics:
scores = {
    "OOB Score": oob_list,
    "Recall Score": recall_list,
    "Kappa Score": kappa_list
}

X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print("Accuracy on test data: {:.2f}".format(clf.score(X_test, y_test)))

# %%
# Next, we plot the tree-based feature importance and the permutation
# importance. The permutation importance plot shows that permuting a feature
# drops the accuracy by at most `0.012`, which would suggest that none of the
# features are important. This is in contradiction with the high test accuracy
# computed above: some feature must be important. The permutation importance
# is calculated on the training set to show how much the model relies on each
# feature during training.
result = permutation_importance(clf, X_train, y_train, n_repeats=10, random_state=42)
perm_sorted_idx = result.importances_mean.argsort()

tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ax1.barh(tree_indices, clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx])
ax1.set_ylim((0, len(clf.feature_importances_)))
ax2.boxplot(
    result.importances[perm_sorted_idx].T,
    vert=False,
    labels=data.feature_names[perm_sorted_idx],
)
print('Target on test data', predict_test)

# Accuracy Score on test dataset
accuracy_test = accuracy_score(y_Test, predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

print("R2")
print(r2_score(y_Test, predict_test))

print(classification_report(y_Test, predict_test))


def plot(classifier, x, y, title):
    class_names = ["Not Defaulted", "Defaulted"]
    disp = plot_confusion_matrix(classifier,
                                 x,
                                 y,
                                 display_labels=class_names,
                                 cmap=plt.cm.PuRd,
                                 values_format='')
    disp.ax_.set_title(title)
    plt.show()


plot(model, x_Test, y_Test, "Naive Bayes Model-scaled")

imps = permutation_importance(model, x_Test, y_Test)
print(imps.importances_mean)

# In[ ]:
Example 4
def main():

    # Lazy import libraries
    from rlearnlib.utils import (
        predefined_estimators,
        load_training_data,
        save_training_data,
        option_to_list,
        scoring_metrics,
        check_class_weights,
    )
    from rlearnlib.raster import RasterStack

    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd

    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                classif_file))

    # feature importance file selected but no cross-validation scheme used
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')

        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                preds_file))

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:

                if mode == "classification":
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            if y.dtype in (np.object_, object):
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X,
                                          y,
                                          cat,
                                          group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings -----------------------------------------------
    # inner resampling method (3-fold)
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=(y),
                                             y=y)
        fit_params = {"sample_weight": class_weights}

    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(remainder="passthrough",
                                  transformers=[("onehot", enc,
                                                 stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    np.setxor1d(range(stack.count),
                                stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " +
             str(v)) for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (mode == "classification"
                and cv > np.histogram(y, bins=np.unique(y))[0].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples in "
                     "some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(
            estimator=estimator,
            X=X,
            y=y,
            groups=group_id,
            cv=outer,
            n_jobs=n_jobs,
            fit_params=fit_params,
        )

        test_idx = [test for train, test in outer.split(X, y)]
        n_fold = np.zeros((0, ))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold, np.repeat(fold,
                                                  test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])
        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).mean())

            score_std = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).std())

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) + "\t" +
                       str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
Example 5

features = np.array(('pe', 'closestPMT', 'n100', 'reco_wall_r', 'reco_wall_z'))
print(features)
feature_importance = gbr.feature_importances_
print(feature_importance)
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, features[sorted_idx])
plt.title('Feature Importance (MDI)')
print(features[sorted_idx])

result = permutation_importance(gbr, reco_test, mc_test, n_repeats=10,
                                random_state=42, n_jobs=2)
print(result)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=features[sorted_idx])
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()

"""
	
infile = 'wbls_220000_final_ml_output.csv'
file = open(str(infile))
file_read = pd.read_csv(file)
df = file_read[['TrueEnergy', 'RecoE', 'pe']]
Example 6
df1=df.iloc[:, ga.best_fen]
cros=model(X_train=df1, y_train=out,  method=method)


print("кросс-валидация=", cros) 
print("Количество признаков ", np.sum(ga.best_fen))       
X_train, X_test, y_train, y_test = train_test_split(
                df1, out, test_size=0.33, random_state=42, stratify=out)

out_model=model(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
                method=method)
y_pred_test=out_model["y_mod_test"]
y_pred_train=out_model["y_mod_train"]

# Estimate feature importance
results = permutation_importance(out_model["model"], X_train, y_train,
                                 scoring='f1_macro', n_repeats=10)


df_fi1=pd.DataFrame( results.importances_mean, columns=["important"])
df_fi1["features"]=list(X_train.columns)

df_fi1["method"]=method
df_fi1=df_fi1[["method", "features", "important"]]
df_fi.append(df_fi1)

#print("точность по тестовой выборке=", out_model["f1_test"]) 
#print("точность по обучающей выборке=", out_model["f1_train"]) 
#plt.figure()
#plt.plot(y_pred_test, "r*", label="Модель")
#plt.plot(y_test.values, "b*", label="Выборка")
#plt.title("Test")
Example 7
def calculate_permutation_importance(train="b234", test="b261", method="svmk"):
    """
    Calculate the permutation importance of each variable to predict RR-Lyrae stars.
    Persist the results in the local filesystem as pkl files.

    Parameters
    ----------
    train: id of the tile to be used as training dataset
    test: id of the tile to be used as test dataset
    method: either "svmk", "rf" or "linear"
    
    References
    ----------
    [1] https://scikit-learn.org/stable/modules/permutation_importance.html

    """
    X, y = CARPYNCHO.retrieve_tile(train, "full")
    Xt, yt = CARPYNCHO.retrieve_tile(test, "full")

    if method == "rf":
        clf = RandomForestClassifier(n_estimators=400,
                                     criterion="entropy",
                                     min_samples_leaf=2,
                                     max_features="sqrt",
                                     n_jobs=7)
        clf.fit(X, y)

    if method == "linear":
        clf = Pipeline([('disc',
                         KBinsDiscretizer(
                             n_bins=get_optimal_parameters_p("svml")["n_bins"],
                             encode='ordinal',
                             strategy='quantile')),
                        ('scaler', StandardScaler()),
                        ('clf',
                         LinearSVC(verbose=3,
                                   max_iter=100000,
                                   C=get_optimal_parameters_p("svml")["C"],
                                   dual=False))])
        clf.fit(X, y)

    #SVM-K
    if method == "svmk":
        clf = Pipeline([('disc',
                         KBinsDiscretizer(
                             n_bins=get_optimal_parameters_p("svmk")["n_bins"],
                             encode='ordinal',
                             strategy='quantile')),
                        ("scaler", StandardScaler()),
                        ("feature_map",
                         Nystroem(
                             n_components=300,
                             gamma=get_optimal_parameters_p("svmk")["gamma"],
                         )),
                        ("svm",
                         LinearSVC(
                             dual=False,
                             max_iter=100000,
                             C=get_optimal_parameters_p("svmk")["C"],
                         ))])

        clf.fit(X, y)

    result = permutation_importance(clf,
                                    Xt,
                                    yt,
                                    scoring="average_precision",
                                    n_repeats=1)

    with open(
            EXPERIMENTS_OUTPUT_FOLDER_INSPECTION + "Permutation_importance/" +
            method + "_RESULT_train=" + train + "test=" + test + ".pkl",
            'wb') as output:
        pickle.dump(result, output, pickle.HIGHEST_PROTOCOL)
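
# A hedged usage sketch (not part of the original snippet): one way this helper
# might be driven, assuming the CARPYNCHO client, get_optimal_parameters_p,
# EXPERIMENTS_OUTPUT_FOLDER_INSPECTION and the pickle import used above are
# already available in the module; the tile ids are simply the function defaults.
for m in ("rf", "linear", "svmk"):
    calculate_permutation_importance(train="b234", test="b261", method=m)

# Reload one of the persisted results for inspection.
with open(EXPERIMENTS_OUTPUT_FOLDER_INSPECTION + "Permutation_importance/" +
          "rf_RESULT_train=b234test=b261.pkl", "rb") as result_file:
    rf_result = pickle.load(result_file)
print(rf_result.importances_mean)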
Example 8
    [X_train, X_test, y_train,
     y_test] = train_test_split(X,
                                y,
                                test_size=.2,
                                random_state=randint(0, 1000))

    model: KNeighborsRegressor = KNeighborsRegressor(n_neighbors=20,
                                                     n_jobs=1).fit(
                                                         X_train, y_train)

    score_test[j] = model.score(X_test, y_test)
    score_train[j] = model.score(X_train, y_train)

    importances = permutation_importance(model,
                                         X_train,
                                         y_train,
                                         random_state=randint(0, 1000))

    if cofs is None:
        cofs = importances.importances_mean
    else:
        cofs += importances.importances_mean

    # analise_auxiliar.find_prediction_time(model, X.shape[1])
    # exit(0)

end: float = time.time()

analise_auxiliar.print_time_of_each_prediction(start, end, numpy.size(x_axis),
                                               numpy.size(y))
analise_auxiliar.print_score(numpy.mean(score_test), numpy.mean(score_train))
Example 9
    # print("Using Top {} features: {}".format(args.topk, feature_name[topk_list]))
    # bst = xgb.Booster(model_file=args.model_path)

    if args.fsel == 1:
        skl_model_path = args.model_path.replace("_best.model", "_best.skl")
        reg0 = pickle.load(open(skl_model_path, 'rb'))
        print("topk important_fea")
        topk_list = get_topk_important_fea1(reg0, args.topk)
    elif args.fsel == 2:
        skl_model_path = args.model_path.replace("_best.model", "_best.skl")
        reg0 = pickle.load(open(skl_model_path, 'rb'))
        print("topk permutation_importance")

        result = permutation_importance(reg0,
                                        e_x,
                                        e_y,
                                        n_repeats=10,
                                        random_state=42,
                                        n_jobs=2)
        topk_list = result.importances_mean.argsort()[::-1][:
                                                            args.topk]  #[::-1]
    else:
        print("topk gain")
        bst = xgb.Booster(model_file=args.model_path)
        topk_list = get_topk_important_fea2(bst, args.topk)

    feature_list = feature_name[topk_list]
    t_x = t_x[:, topk_list]
    e_x = e_x[:, topk_list]
    if preddata != "":
        p_x = p_x[:, topk_list]
    print("Using Top {} features: {}".format(args.topk, topk_list))
Example 10
def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples):
    # This test checks that the column shuffling logic has the same behavior
    # on both a dataframe and a plain numpy array.
    pd = pytest.importorskip("pandas")

    # regression test to make sure that sequential and parallel calls will
    # output the same results.
    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
    X_df = pd.DataFrame(X)

    # Add a categorical feature that is statistically linked to y:
    binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
    cat_column = binner.fit_transform(y.reshape(-1, 1))

    # Concatenate the extra column to the numpy array: integers will be
    # cast to float values
    X = np.hstack([X, cat_column])
    assert X.dtype.kind == "f"

    # Insert extra column as a non-numpy-native dtype (while keeping backward
    # compat for old pandas versions):
    if hasattr(pd, "Categorical"):
        cat_column = pd.Categorical(cat_column.ravel())
    else:
        cat_column = cat_column.ravel()
    new_col_idx = len(X_df.columns)
    X_df[new_col_idx] = cat_column
    assert X_df[new_col_idx].dtype == cat_column.dtype

    # Stitch an arbitrary index to the dataframe:
    X_df.index = np.arange(len(X_df)).astype(str)

    rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
    rf.fit(X, y)

    n_repeats = 3
    importance_array = permutation_importance(
        rf,
        X,
        y,
        n_repeats=n_repeats,
        random_state=0,
        n_jobs=n_jobs,
        max_samples=max_samples,
    )

    # First check that the problem is structured enough and that the model is
    # complex enough to not yield trivial, constant importances:
    imp_min = importance_array["importances"].min()
    imp_max = importance_array["importances"].max()
    assert imp_max - imp_min > 0.3

    # Now check that the importances computed on the dataframe match the values
    # of those computed on the array with the same data.
    importance_dataframe = permutation_importance(
        rf,
        X_df,
        y,
        n_repeats=n_repeats,
        random_state=0,
        n_jobs=n_jobs,
        max_samples=max_samples,
    )
    assert_allclose(
        importance_array["importances"], importance_dataframe["importances"]
    )
Example 11
def test_permutation_importance_sample_weight():
    # Creating data with 2 features and 1000 samples, where the target
    # variable is a linear combination of the two features, such that
    # in half of the samples the impact of feature 1 is twice the impact of
    # feature 2, and vice versa on the other half of the samples.
    rng = np.random.RandomState(1)
    n_samples = 1000
    n_features = 2
    n_half_samples = n_samples // 2
    x = rng.normal(0.0, 0.001, (n_samples, n_features))
    y = np.zeros(n_samples)
    y[:n_half_samples] = 2 * x[:n_half_samples, 0] + x[:n_half_samples, 1]
    y[n_half_samples:] = x[n_half_samples:, 0] + 2 * x[n_half_samples:, 1]

    # Fitting linear regression with perfect prediction
    lr = LinearRegression(fit_intercept=False)
    lr.fit(x, y)

    # When all samples are weighted with the same weights, the ratio of
    # the two feature importances should equal 1 in expectation (when using
    # mean absolute error as the loss function).
    pi = permutation_importance(
        lr, x, y, random_state=1, scoring="neg_mean_absolute_error", n_repeats=200
    )
    x1_x2_imp_ratio_w_none = pi.importances_mean[0] / pi.importances_mean[1]
    assert x1_x2_imp_ratio_w_none == pytest.approx(1, 0.01)

    # When passing a vector of ones as the sample_weight, results should be
    # the same as in the case that sample_weight=None.
    w = np.ones(n_samples)
    pi = permutation_importance(
        lr,
        x,
        y,
        random_state=1,
        scoring="neg_mean_absolute_error",
        n_repeats=200,
        sample_weight=w,
    )
    x1_x2_imp_ratio_w_ones = pi.importances_mean[0] / pi.importances_mean[1]
    assert x1_x2_imp_ratio_w_ones == pytest.approx(x1_x2_imp_ratio_w_none, 0.01)

    # When the ratio between the weights of the first half of the samples and
    # the second half of the samples approaches infinity, the ratio of
    # the two feature importances should equal 2 in expectation (when using
    # mean absolute error as the loss function).
    w = np.hstack(
        [np.repeat(10.0 ** 10, n_half_samples), np.repeat(1.0, n_half_samples)]
    )
    lr.fit(x, y, w)
    pi = permutation_importance(
        lr,
        x,
        y,
        random_state=1,
        scoring="neg_mean_absolute_error",
        n_repeats=200,
        sample_weight=w,
    )
    x1_x2_imp_ratio_w = pi.importances_mean[0] / pi.importances_mean[1]
    assert x1_x2_imp_ratio_w / x1_x2_imp_ratio_w_none == pytest.approx(2, 0.01)
Example 12
def test_robustness_to_high_cardinality_noisy_feature(n_jobs, max_samples, seed=42):
    # Permutation variable importance should not be affected by the high
    # cardinality bias of traditional feature importances, especially when
    # computed on a held-out test set:
    rng = np.random.RandomState(seed)
    n_repeats = 5
    n_samples = 1000
    n_classes = 5
    n_informative_features = 2
    n_noise_features = 1
    n_features = n_informative_features + n_noise_features

    # Generate a multiclass classification dataset and a set of informative
    # binary features that can be used to predict some classes of y exactly
    # while leaving some classes unexplained to make the problem harder.
    classes = np.arange(n_classes)
    y = rng.choice(classes, size=n_samples)
    X = np.hstack([(y == c).reshape(-1, 1) for c in classes[:n_informative_features]])
    X = X.astype(np.float32)

    # Not all target classes are explained by the binary class indicator
    # features:
    assert n_informative_features < n_classes

    # Add other noisy features with high cardinality (numerical) values
    # that can be used to overfit the training data.
    X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1)
    assert X.shape == (n_samples, n_features)

    # Split the dataset to be able to evaluate on a held-out test set. The
    # test size should be large enough for importance measurements to be
    # stable:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=rng
    )
    clf = RandomForestClassifier(n_estimators=5, random_state=rng)
    clf.fit(X_train, y_train)

    # Variable importances computed by impurity decrease on the tree node
    # splits often use the noisy features in splits. This can give a misleading
    # impression that high cardinality noisy variables are the most important:
    tree_importances = clf.feature_importances_
    informative_tree_importances = tree_importances[:n_informative_features]
    noisy_tree_importances = tree_importances[n_informative_features:]
    assert informative_tree_importances.max() < noisy_tree_importances.min()

    # Let's check that permutation-based feature importances do not have this
    # problem.
    r = permutation_importance(
        clf,
        X_test,
        y_test,
        n_repeats=n_repeats,
        random_state=rng,
        n_jobs=n_jobs,
        max_samples=max_samples,
    )

    assert r.importances.shape == (X.shape[1], n_repeats)

    # Split the importances between informative and noisy features
    informative_importances = r.importances_mean[:n_informative_features]
    noisy_importances = r.importances_mean[n_informative_features:]

    # Because we do not have a binary variable explaining each target class,
    # the RF model will have to use the random variable to make some
    # (overfitting) splits (as max_depth is not set). Therefore the noisy
    # variables will be non-zero but with small values oscillating around
    # zero:
    assert max(np.abs(noisy_importances)) > 1e-7
    assert noisy_importances.max() < 0.05

    # The binary features correlated with y should have a higher importance
    # than the high cardinality noisy features.
    # The maximum test accuracy is 2 / 5 == 0.4, each informative feature
    # contributing approximately a bit more than 0.2 of accuracy.
    assert informative_importances.min() > 0.15
xTreino = xTreino.values
yTreino = yTreino.values

# Define the classifier
classifier = RandomForestClassifier(class_weight="balanced", random_state=1986)

# Train on all records
classifier.fit(xTreino, yTreino) 

# Define the scoring
scoring = ['accuracy', 'balanced_accuracy', 'average_precision', 'recall', 'jaccard']
score = 'average_precision'

#Permutation Importance
print('\nPermutation Importance')
pi = permutation_importance(classifier, x, y, scoring=score, n_jobs=3, random_state=1986)

# Restrict the features
indFeatures = np.where((pi.importances_mean * 1000) >= 0.001)[0]
for i in pi.importances_mean[indFeatures].argsort()[::-1]:
    print('%s: %.2f' % (features[indFeatures[i]], pi.importances_mean[indFeatures[i]] * 1000))

xTreino = xTreino[:, indFeatures]
xTeste = xTeste[xTeste.columns[indFeatures]]
print('Number of selected features: ', len(xTeste.columns))

#K-fold
print('\n========== TUNING PARAMETERS ==========')
arrayYReal = []
arrayYPrediction = []
arrayAcuracia = []
Example 14
    average_precisions_train[i] = average_precision_score(
        clss_train, pred_train)
    average_precisions_test[i] = average_precision_score(clss_test, pred_test)
    precision_train, recall_train, _ = precision_recall_curve(
        clss_train, pred_train)
    precisions_train[i] = np.interp(interval, recall_train[::-1],
                                    precision_train[::-1])
    precision_test, recall_test, _ = precision_recall_curve(
        clss_test, pred_test)
    precisions_test[i] = np.interp(interval, recall_test[::-1],
                                   precision_test[::-1])
    selected_features_mask = (classifier.best_estimator_.
                              named_steps['feature_selection'].get_support())
    feat_imp = permutation_importance(
        classifier.best_estimator_.named_steps['classify'],
        features.values[:, selected_features_mask],
        clss,
        n_jobs=-1,
        random_state=42)
    feature_importances[i, selected_features_mask] += \
        feat_imp.importances_mean

    best_params.append(
        (classifier.best_estimator_.named_steps['feature_selection'].k,
         classifier.best_estimator_.named_steps['classify'].max_depth,
         classifier.best_estimator_.named_steps['classify'].min_samples_leaf))
print(best_params)

stats = {
    'accuracy_train': np.mean(accuracies_train),
    'accuracy_std_train': np.std(accuracies_train),
    'average_precision_train': np.mean(average_precisions_train),
Example 15
###########################
#   IMPORTANCES SORTED   #
###########################

importance=modelos["Tree"].feature_importances_
Tree_Sorted={}
for i,v in enumerate(importance):
    Tree_Sorted[Features_usados[i]]=np.abs(v)

importance=modelos["Random"].feature_importances_
Random_Sorted={}
for i,v in enumerate(importance):
    Random_Sorted[Features_usados[i]]=np.abs(v)

results = permutation_importance(modelos["Knbrs"], X_train, y_train, scoring='accuracy')
importance = results.importances_mean
Knbrs_Sorted={}
for i,v in enumerate(importance):
    Knbrs_Sorted[Features_usados[i]]=np.abs(v)

Vector_Sorted={}
importance = modelos["Vector"].coef_[0]
for i,v in enumerate(importance):
    Vector_Sorted[Features_usados[i]]=np.abs(v)

Tree_Sorted={k: v for k, v in sorted(Tree_Sorted.items(), key=lambda item: item[1])}
Random_Sorted={k: v for k, v in sorted(Random_Sorted.items(), key=lambda item: item[1])}
Knbrs_Sorted={k: v for k, v in sorted(Knbrs_Sorted.items(), key=lambda item: item[1])}
Vector_Sorted={k: v for k, v in sorted(Vector_Sorted.items(), key=lambda item: item[1])}
Sorteados={
Example 16
##############################################################################
# NEURAL NETWORK MODEL
##############################################################################
 
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(features[feature_names],
                                                    labels, train_size = 0.9,
                                                    random_state = 42,
                                                    stratify = labels)
t0 = time.time()

nnet = clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                  hidden_layer_sizes=(15,), random_state=1)

nnet.fit(X_train, y_train)
t1 = time.time()
total_time = t1 - t0
result = permutation_importance(nnet, X_train, y_train, random_state = 8)

predictions = nnet.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

confmat = confusion_matrix(y_test, predictions)
df_confmat = pd.DataFrame(confmat)
plot_confusion_matrix(nnet, X_test, y_test)
Example 17
    102: 'random_1',
    103: 'random_10',
    104: 'random_100',
    105: 'random_1000',
    106: 'random_10000',
    107: 'random_100000'
}

normal_games()
clf_tree = RandomForestClassifier(n_estimators=10)
best_features = dict()
clf_tree.fit(games, results)
start = time.time()
result = permutation_importance(clf_tree,
                                games,
                                results,
                                n_repeats=2000,
                                random_state=0)
print(time.time() - start)

best = result.importances_mean
#best = clf.feature_importances_
for i in range(len(features_dict)):
    best_features[features_dict[i]] = best[i]

sort_orders = sorted(best_features.items(), key=lambda x: x[1], reverse=True)
for i in sort_orders:
    print(i[0], i[1])

clf_rand_100 = RandomForestClassifier()
# print("**************************************************")
plt.xlabel('LR experiment [mm]')
plt.ylabel('LR predict [mm]')
plt.title("mlp")
#plt.xticks(np.arange(6, 11, step=2))
#plt.yticks(np.arange(6, 11, step=2))
plt.tight_layout()
plt.savefig("gurafu5(研究報告).png")

# 3.4 Feature contribution

# Permutation importance vs features

result = permutation_importance(grid1,
                                sX_trainH2,
                                y_trainH2,
                                n_repeats=5,
                                random_state=42)

cols = list(sX_trainH2.columns)  # list of feature names (excluding the target variable CRIM)
f_importance = np.array(result["importances"].mean(axis=1))  # compute the feature importances
f_importance = f_importance / np.sum(f_importance)  # normalize (comment out if not needed)
df_importance = pd.DataFrame({'feature': cols, 'importance': f_importance})
df1 = df_importance
df_importance = df_importance.sort_values("importance", ascending=False)

plt.figure(figsize=(8, 8))

plt.rcParams['font.size'] = 18
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['lines.linewidth'] = 2
Example 19
plt.savefig("gurafu4.png")








# 3.4 Feature contribution

# Permutation importance vs features



result = permutation_importance(grid, X_train, y_train, n_repeats=5, random_state=42)

df_importance = pd.DataFrame(zip(X_train.columns, result["importances"].mean(axis=1)),
                             columns=["feature", "importance"])
df_importance = df_importance.sort_values("importance", ascending=False)
print(df_importance)
df_importance.to_excel('/mnt/c/CEA/df_importance1.xlsx')


plt.figure(figsize=(8,8))

plt.rcParams['font.size'] = 18
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.markersize'] = 4.0
Example 20
    def kfold_cv_LGBMClassifier(
        self,
        lgb_params: dict,
        df: pd.DataFrame,
        num_folds: int,
        target_col: str,
        del_cols=None,
        select_cols=None,
        eval_metric="error",
        stratified=True,  # whether to use StratifiedKFold
        is_submission=False,  # whether to create submission.csv for Home_Credit_Default_Risk
        is_plot_perm_importance=False,  # whether to also compute permutation importance; feature_importance is always produced
        random_state=1001,
    ):
        """
        Cross-validation with LGBMClassifier, plus feature_importance / permutation importance plots.
        """
        # Drop unneeded columns (e.g. ID columns) from the dataframe
        if del_cols is not None:
            df = df.drop(del_cols, axis=1)

        # Keep only the feature columns
        feats = df.columns.to_list()
        feats.remove(target_col)

        # Divide in training/validation and test data
        train_df = df[df[target_col].notnull()].reset_index(drop=True)
        test_df = df[df[target_col].isnull()].reset_index(drop=True)
        print(
            f"INFO: Starting LightGBM. Train shape: {train_df.shape}, test shape: {test_df.shape}"
        )
        del df
        gc.collect()

        ###################################### cross validation ######################################
        # Cross validation model
        if stratified:
            folds = StratifiedKFold(n_splits=num_folds,
                                    shuffle=True,
                                    random_state=random_state)
        else:
            folds = KFold(n_splits=num_folds,
                          shuffle=True,
                          random_state=random_state)

        # Create arrays and dataframes to store results
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        feature_importance_df = pd.DataFrame()
        permutation_importance_df = pd.DataFrame()
        result_scores = {}
        train_probas = {}
        test_probas = {}
        best_threshold = 0.0

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(train_df[feats], train_df[target_col])):
            print(
                f"\n------------------------------------ n_fold={n_fold + 1} ------------------------------------"
            )
            ############################ create fold ############################
            fold_df_base = train_df.iloc[train_idx]  # base data for the encoding
            v_fold_df = train_df.iloc[valid_idx]

            # Run target encoding etc. in one pass
            t_fold_df, v_fold_df = self._encoding(fold_df_base, v_fold_df,
                                                  target_col)
            print(
                f"INFO: Encoded Train shape: {t_fold_df.shape}, valid shape: {v_fold_df.shape}"
            )

            # If specific columns are given, keep only those
            feats = t_fold_df.columns.to_list(
            ) if select_cols is None else select_cols
            if target_col in feats:
                feats.remove(target_col)
            print(f"INFO: select features: {len(feats)}\n")

            train_x, train_y = (
                t_fold_df[feats],
                t_fold_df[target_col],
            )
            valid_x, valid_y = (
                v_fold_df[feats],
                v_fold_df[target_col],
            )

            ############################ train fit ############################
            # LightGBM parameters found by Bayesian optimization
            clf = LGBMClassifier(**lgb_params)
            clf.fit(
                train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric=eval_metric,
                verbose=200,
                early_stopping_rounds=200,
            )
            # Save the model
            joblib.dump(clf,
                        f"{self.output_dir}/lgb-{n_fold + 1}.model",
                        compress=True)
            # Model parameters
            pd.DataFrame.from_dict(clf.get_params(), orient="index").to_csv(
                f"{self.output_dir}/param.tsv",
                sep="\t",
            )

            ############################ valid pred ############################
            oof_preds[valid_idx] = clf.predict_proba(
                valid_x, num_iteration=clf.best_iteration_)[:, 1]
            if eval_metric == "auc":
                fold_auc = roc_auc_score(valid_y, oof_preds[valid_idx])
                print("\nINFO: Fold %2d AUC : %.6f" % (n_fold + 1, fold_auc))
                result_scores[f"fold_auc_{str(n_fold + 1)}"] = fold_auc
            elif eval_metric == "error":
                # accuracy_score() fails unless the predictions are cast to int
                _pred = oof_preds[valid_idx]
                _pred[_pred >= 0.5] = 1
                _pred[_pred < 0.5] = 0
                fold_err = 1.0 - accuracy_score(valid_y, _pred)
                print("\nINFO: Fold %2d error(threshold=0.5) : %.6f" %
                      (n_fold + 1, fold_err))
                result_scores[f"fold_err_{str(n_fold + 1)}"] = fold_err

                # Binarize using best_threshold
                _pred = oof_preds[valid_idx]
                _best_threshold = Model().nelder_mead_th(valid_y, _pred)
                _pred[_pred >= _best_threshold] = 1
                _pred[_pred < _best_threshold] = 0
                fold_err_best_threshold = 1.0 - accuracy_score(valid_y, _pred)
                print(
                    f"\nINFO: Fold %2d error(threshold={_best_threshold}) : %.6f"
                    % (n_fold + 1, fold_err_best_threshold))
                best_threshold += _best_threshold / num_folds

            ############################ test pred ############################
            if test_df.shape[0] > 0:
                # Run target encoding etc. in one pass
                tr_df, te_df = self._encoding(fold_df_base, test_df,
                                              target_col)

                # Predicted probabilities for the test set
                test_probas[f"fold_{str(n_fold + 1)}"] = clf.predict_proba(
                    te_df[feats], num_iteration=clf.best_iteration_)[:, 1]
                sub_preds += test_probas[
                    f"fold_{str(n_fold + 1)}"] / folds.n_splits

                # Also keep the predicted probabilities for the train set
                train_probas[f"fold_{str(n_fold + 1)}"] = clf.predict_proba(
                    tr_df[feats], num_iteration=clf.best_iteration_)[:, 1]

            ############################ importance calculation ############################
            # feature_importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            if is_plot_perm_importance:
                # permutation_importance
                # This is slow, so it is gated by the flag
                # scoring accepts any sklearn scoring parameter,
                # e.g. accuracy or neg_mean_squared_log_error
                # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
                fold_importance_df = pd.DataFrame()
                fold_permutation = permutation_importance(clf,
                                                          valid_x,
                                                          valid_y,
                                                          scoring="roc_auc")
                fold_permutation_df = pd.DataFrame(
                    {
                        "feature": valid_x.columns,
                        "importance":
                        np.abs(fold_permutation["importances_mean"]
                               ),  # some values can be negative, so take the absolute value
                        "fold": n_fold + 1,
                    }, )
                permutation_importance_df = pd.concat(
                    [permutation_importance_df, fold_permutation_df], axis=0)

            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()

        print(
            "\n------------------------------------ mean fold ------------------------------------"
        )
        mean_fold_score = None
        if eval_metric == "auc":
            mean_fold_score = roc_auc_score(train_df[target_col], oof_preds)
            print("INFO: Mean valid AUC score %.6f" % mean_fold_score)
            result_scores["mean_fold_auc"] = mean_fold_score
        elif eval_metric == "error":
            # accuracy_score() fails unless the predictions are cast to int
            _pred = oof_preds
            _pred[_pred >= 0.5] = 1
            _pred[_pred < 0.5] = 0
            mean_fold_score = 1.0 - accuracy_score(train_df[target_col], _pred)
            print("INFO: Mean valid error score %.6f" % mean_fold_score)
            result_scores["mean_fold_err"] = mean_fold_score

        # Write out the model evaluation metrics
        result_scores_df = pd.DataFrame(result_scores.values(),
                                        index=result_scores.keys())
        result_scores_df.to_csv(f"{self.output_dir}/result_scores.tsv",
                                sep="\t")

        # Handle the test set
        if test_df.shape[0] > 0:
            test_probas_df = pd.DataFrame(test_probas)
            test_probas_df.to_csv(f"{self.output_dir}/test_probas.csv",
                                  index=False)
            ############################ Write submission file ############################
            if is_submission:
                # Binarize at threshold=0.5
                te_mean = test_probas_df.apply(lambda x: np.mean(x),
                                               axis=1).values
                te_mean[te_mean >= 0.5] = 1
                te_mean[te_mean < 0.5] = 0
                te_mean = te_mean.astype(int)
                output_csv = f"{self.output_dir}/submission_kernel.csv"
                pd.DataFrame({
                    "id": range(len(te_mean)),
                    "y": te_mean
                }).to_csv(output_csv, index=False)
                print(f"INFO: save csv {output_csv}")

                # Binarize using best_threshold
                print(
                    f"INFO: submission best_threshold(cv mean): {best_threshold}"
                )
                te_mean = test_probas_df.apply(lambda x: np.mean(x),
                                               axis=1).values
                te_mean[te_mean >= best_threshold] = 1
                te_mean[te_mean < best_threshold] = 0
                te_mean = te_mean.astype(int)
                output_csv = f"{self.output_dir}/submission_nelder_mead.csv"
                pd.DataFrame({
                    "id": range(len(te_mean)),
                    "y": te_mean
                }).to_csv(output_csv, index=False)
                print(f"INFO: save csv {output_csv}")

        # Plot feature importance
        png_path = f"{self.output_dir}/lgbm_feature_importances.png"
        Model().display_importances(
            feature_importance_df,
            png_path=png_path,
            title="feature_importance",
        )
        # print(f"INFO: save png {png_path}")
        if is_plot_perm_importance:
            png_path = f"{self.output_dir}/lgbm_permutation_importances.png"
            Model().display_importances(
                permutation_importance_df,
                png_path=png_path,
                title="permutation_importance",
            )
            # print(f"INFO: save png {png_path}")

        return mean_fold_score, feature_importance_df, permutation_importance_df
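
    # A hedged usage sketch (not part of the original snippet): assuming `runner`
    # is an instance of the class that defines this method and `df_all` stacks the
    # training rows (target set) on top of the test rows (target NaN):
    #
    #   lgb_params = {"n_estimators": 10000, "learning_rate": 0.05,
    #                 "num_leaves": 31, "random_state": 1001}
    #   mean_score, feat_imp_df, perm_imp_df = runner.kfold_cv_LGBMClassifier(
    #       lgb_params=lgb_params, df=df_all, num_folds=5, target_col="y",
    #       eval_metric="auc", stratified=True, is_plot_perm_importance=True)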
Example 21
pyplot.show()

# In[27]:

from sklearn.datasets import make_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
from matplotlib import pyplot
# define dataset
X, y = make_regression(n_samples=1000,
                       n_features=10,
                       n_informative=5,
                       random_state=1)
#define the model
model = KNeighborsRegressor()
# fit the model
model.fit(X, y)
# perform permutation importance
results = permutation_importance(model, X, y, scoring='neg_mean_squared_error')
#get importance
importance = results.importances_mean

# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

# In[ ]:
Example 22
 def _get_permutation_importance(self, model, fit_params):
     assert fit_params is not None
     res = permutation_importance(model, **fit_params)
     return res['importances_mean'], res['importances_std']
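
# A hedged usage sketch (not part of the original snippet): fit_params is passed
# straight through to sklearn.inspection.permutation_importance, so it must at
# least supply X and y; `inspector` stands in for an instance of the (not shown)
# class that defines the method above.
#
#   fit_params = {"X": X_valid, "y": y_valid, "n_repeats": 5, "random_state": 0}
#   means, stds = inspector._get_permutation_importance(fitted_model, fit_params)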
Example 23
    'critical': y_pool,
    'confidence': con,
    'confidence_of_1': con_1
}
df_r = pd.DataFrame(data,
                    columns=[
                        'Attack-Stage', 'Port-Service', 'critical',
                        'confidence', 'confidence_of_1'
                    ])
print()
print(df_r.iloc[:, 0:4])

from sklearn.inspection import permutation_importance
r = permutation_importance(learner,
                           X_pool,
                           y_pool,
                           n_repeats=10,
                           random_state=0)
for i in r.importances_mean.argsort()[::-1]:
    print(f"{df.iloc[:,2:7].columns.tolist()[i]:<8}"
          f'    '
          f"{r.importances_mean[i]:.3f}"
          f" +/- {r.importances_std[i]:.3f}")

df_A = df_r.groupby(by='Attack-Stage')['confidence_of_1'].mean()
df_P = df_r.groupby(by='Port-Service')['confidence_of_1'].mean()
print()
print(df_A.sort_values())
print()
print(df_P.sort_values())
print()
Example 24
#     for i, v in enumerate(importance):
#         print('Feature: %0d, Score: %.5f' % (i, v))
#     print('\n')
# except:
#     print('Ridge classifier error' + '\n')
# """*******************************************************************************"""

try:

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X)

    print('Result of Multinomial NB')

    imps = permutation_importance(model, X_train, y_train)
    print(imps.importances_mean)

    print('Accuracy score: ' + str(accuracy_score(y, y_pred)))
    print('Precision score: ' +
          str(precision_score(y, y_pred, average='micro', zero_division=1)))
    print('Recall score: ' +
          str(recall_score(y, y_pred, average='micro', zero_division=1)))
    print('F1 score: ' +
          str(f1_score(y, y_pred, average='micro', zero_division=1)) + '\n')
    print('Precision score (macro): ' +
          str(precision_score(y, y_pred, average='macro', zero_division=1)))
    print('Recall score (macro): ' +
          str(recall_score(y, y_pred, average='macro', zero_division=1)))
    print('F1 score (macro): ' +
          str(f1_score(y, y_pred, average='macro', zero_division=1)) + '\n')
Example no. 25
# ------------------------------------
#
# The :func:`inspection.permutation_importance` can be used to get an
# estimate of the importance of each feature, for any fitted estimator:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = make_classification(random_state=0, n_features=5, n_informative=3)
feature_names = np.array([f'x_{i}' for i in range(X.shape[1])])

rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0,
                                n_jobs=-1)

fig, ax = plt.subplots()
sorted_idx = result.importances_mean.argsort()
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=feature_names[sorted_idx])
ax.set_title("Permutation Importance of each feature")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

# %%
# Native support for missing values for gradient boosting
# -------------------------------------------------------
#
# The :class:`ensemble.HistGradientBoostingClassifier`
Example no. 26
  model.fit(data, target)                                                 # Training the Classifier.

  #@ Evaluation Metrics:
  predictions = model.predict(data)                                       # Making the Predictions.
  oob = model.oob_score_                                                  # Calculating the OOB score.
  oob_list.append(oob)
  recall = recall_score(target, predictions)                              # Calculating the Recall score.
  recall_list.append(recall)
  kappa = cohen_kappa_score(target, predictions)                          # Calculating the Kappa score.
  kappa_list.append(kappa)

  #@ GINI Features Importance:
  gini_importances["Run" + str(seed)] = model.feature_importances_             # Features Importance: GINI.

  #@ PERMUTATION Features Importance:
  permutation_important = permutation_importance(model, data, target, n_repeats=5,    # Permutation Importance.
                                   n_jobs=-1, random_state=seed)
  permutation_importances["Run" + str(seed)] = permutation_important.importances_mean             # Features Importance: PERMUTATION.

# 1. Evaluation Metrics:
# I will calculate evaluation metrics such as the OOB score, Recall score and Kappa score on the whole dataset.
# I will change the seed of the Random Forest Classifier on each pass of the loop. The seed starts at 123 and
# goes up to (but not including) 148, so the loop runs for 25 iterations in total.
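# For context, a minimal sketch of the loop header the comment above refers to (the
# actual loop starts before this snippet): only the seed range follows from the
# comment; n_estimators=800 is assumed from the CSV file name below, and
# oob_score=True is needed for the model.oob_score_ call above.
#
# from sklearn.ensemble import RandomForestClassifier
#
# for seed in range(123, 148):                    # 25 seeds: 123, 124, ..., 147
#     model = RandomForestClassifier(n_estimators=800,
#                                    oob_score=True,
#                                    n_jobs=-1,
#                                    random_state=seed)
#     model.fit(data, target)                     # metric/importance code shown above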

#@ Dictionary of Evaluation Metrics:
scores = {"OOB Score": oob_list,
          "Recall Score": recall_list,
          "Kappa Score": kappa_list}
scores = pd.DataFrame.from_dict(scores, orient="columns")                 # Creating the DataFrame.
scores.to_csv("./Ecoli_evaluation_metrics_nestimator800.csv")                                  # Saving the DataFrame into csv.

# 2. GINI: Features Importance
ax.set_title("Feature importances")

bp = ax.boxplot(importances,
                vert=False,
                labels=y_titles[sorted_idx],
                showbox=False,
                showcaps=False,
                showfliers=False,
                showmeans=True,
                medianprops=dict(color="white"),
                whiskerprops=dict(color="white"),
                meanprops=dict(color="black"))

sresult = permutation_importance(model,
                                 small_test.drop('phase', axis=1),
                                 small_test.phase,
                                 n_repeats=10,
                                 random_state=42)

mresult = permutation_importance(model,
                                 medium_test.drop('phase', axis=1),
                                 medium_test.phase,
                                 n_repeats=10,
                                 random_state=42)

bresult = permutation_importance(model,
                                 big_test.drop('phase', axis=1),
                                 big_test.phase,
                                 n_repeats=10,
                                 random_state=42)
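
# The three calls above differ only in the test frame; a compact equivalent
# (illustrative, assuming the same model and DataFrames from this snippet)
# loops over them instead:
from sklearn.inspection import permutation_importance

results = {}
for name, frame in {"small": small_test, "medium": medium_test, "big": big_test}.items():
    results[name] = permutation_importance(model,
                                           frame.drop('phase', axis=1),
                                           frame.phase,
                                           n_repeats=10,
                                           random_state=42)
sresult, mresult, bresult = (results[k] for k in ("small", "medium", "big"))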
Example no. 28
    def test_features_relevance(self,
                                df,
                                model_name=None,
                                model=None,
                                features=None,
                                normalize=True,
                                method=DEFAULT_IMP_METHOD,
                                target=None,
                                exclude=None,
                                positive_only=True):
        x, y = [], []
        if model is None:
            if model_name is not None:
                self.activate(model_name)
            model = self.model
            if model is None:
                logging.error(
                    ' Please specify an existing model or train a new one.')
                return
        if target is None:
            target = self.target
        if features is None:
            features = self.features
            features, target = self.get_features(df,
                                                 features=features,
                                                 exclude=exclude,
                                                 target=target)
        features_df = pd.DataFrame(columns=features)
        if method == 'all':
            return pd.DataFrame.from_dict({
                m: self.test_features_relevance(df, method=m)
                for m in ('coef', 'permutation', 'score', 'residual')
            })
        if method == 'coef':
            if hasattr(model, '_final_estimator'):
                estimator = model._final_estimator
            else:
                estimator = model
            if hasattr(estimator, 'coef_'):
                importance = estimator.coef_
            elif hasattr(estimator, 'feature_importances_'):
                importance = estimator.feature_importances_
            else:
                logging.error(
                    ' Final estimator for specified model has neither "coef_" nor "feature_importances_" attributes, '
                    'choose another test method.')
                return
            features_df = pd.Series(dict(zip(
                features, abs(importance)))).sort_values(ascending=False)
        else:
            x = df[features].applymap(partial(pd.to_numeric,
                                              errors='coerce')).values
            if method in ('permutation', 'score'):
                y = df[target].apply(partial(pd.to_numeric,
                                             errors='coerce')).values
                x, y = self.trim_xy(x, y)
            else:
                x = self.trim_xy(x)
            if method == 'permutation':
                if hasattr(model, '_final_estimator'):
                    estimator = model._final_estimator
                else:
                    estimator = model
                if hasattr(estimator, '_estimator_type'
                           ) and estimator._estimator_type == 'regressor':
                    scoring = 'neg_mean_squared_error'
                else:
                    scoring = 'accuracy'
                try:
                    results = permutation_importance(model,
                                                     x,
                                                     y,
                                                     scoring=scoring)
                    importance = results.importances_mean
                except ValueError:
                    importance = pd.Series([np.nan] * len(features))
                features_df = pd.Series(dict(zip(features, abs(importance))))
            elif method == 'residual':
                for row_index, row_feats in enumerate(tqdm(x)):
                    features_relevance = {}
                    baseline_prediction = model.predict([row_feats])
                    for index, feature_value in enumerate(row_feats):
                        new_row = row_feats.copy()
                        new_row[
                            index] = -1 / feature_value if feature_value > 1e-6 else 1e6
                        new_prediction = model.predict([new_row])
                        features_relevance.update({
                            index:
                            abs(new_prediction - baseline_prediction)[0]
                        })
                    features_df = features_df.append(dict(
                        zip(features_df.columns, features_relevance.values())),
                                                     ignore_index=True)
                # noinspection PyArgumentList
                features_df = features_df.sum()
            elif method == 'score':
                baseline_score = model.score(x, y)
                for index, feature in enumerate(features_df.columns):
                    new_x = x.copy()
                    new_x[:, index] = x[:, index].mean()
                    features_df.loc[0, feature] = (
                        baseline_score / model.score(new_x, y) - 1) * 100
                features_df = features_df.T[0]
            else:
                logging.error(
                    ' Valid methods are "coef", "permutation", "residual" and "score".'
                )
                return

        if positive_only:
            features_df = features_df[features_df > 0]
        if normalize:
            min_max_scaler = MinMaxScaler()
            features_df = pd.Series((min_max_scaler.fit_transform(
                (lambda values: values.reshape([len(values), 1]))(
                    features_df.values)) * 100).flatten(), features_df.keys())
        features_df = features_df.sort_values(ascending=False)
        return features_df
Example no. 29
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (MDI)")
fig.tight_layout()
plt.show()

# %%
# As an alternative, the permutation importances of ``rf`` are computed on a
# held-out test set. This shows that the low-cardinality categorical feature,
# ``sex``, is the most important feature.
#
# Also note that both random features have very low importances (close to 0) as
# expected.
result = permutation_importance(rf,
                                X_test,
                                y_test,
                                n_repeats=10,
                                random_state=42,
                                n_jobs=2)
sorted_idx = result.importances_mean.argsort()

fig, ax = plt.subplots()
ax.boxplot(result.importances[sorted_idx].T,
           vert=False,
           labels=X_test.columns[sorted_idx])
ax.set_title("Permutation Importances (test set)")
fig.tight_layout()
plt.show()

# %%
# It is also possible to compute the permutation importances on the training
# set. This reveals that ``random_num`` gets a significantly higher importance
# ranking than when computed on the test set.
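# A minimal continuation (not in the original snippet, which is cut off here),
# assuming rf, X_train and y_train from the same example are in scope and
# X_train is a DataFrame as in the test-set block above:
result_train = permutation_importance(
    rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2)
sorted_idx = result_train.importances_mean.argsort()

fig, ax = plt.subplots()
ax.boxplot(result_train.importances[sorted_idx].T,
           vert=False,
           labels=X_train.columns[sorted_idx])
ax.set_title("Permutation Importances (train set)")
fig.tight_layout()
plt.show()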
Example no. 30
def generate_permutation_importance(self):
    """ 11. Generate permutation importance. """
    # NOTE: permutation_importance expects (estimator, X, y); the target is
    # assumed to be stored as self.y_train here.
    self.permutation_importance = permutation_importance(
        self.mdl_(), self.X_train, self.y_train)
    self.next(self.end)