Code example #1
def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=123)

    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
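
These test excerpts omit their import blocks. A minimal sketch of the imports they rely on (the original import lines are not shown in the snippets) is:

# Assumed imports for the surrounding test excerpts (a sketch; the original
# import lines are not included in these snippets).
import numpy as np
from mlxtend.evaluate import feature_importance_permutation
from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error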
Code example #2
def test_regression_custom_mse():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=123)

    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)

    norm_imp_vals = imp_vals / np.abs(imp_vals).max()

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert norm_imp_vals[0] == -1.
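
The final assertion reflects the sign convention seen in these tests: with an error metric such as mean_squared_error, permuting an informative feature worsens the score, so its importance comes out negative (consistent with the importance being the baseline score minus the permuted score). Any callable taking y_true and y_pred can be supplied as the metric; a hypothetical example, not part of the test suite:

# Hypothetical custom metric (sketch): any callable accepting (y_true, y_pred)
# can be passed via the `metric` argument.
def mean_absolute_error_metric(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))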
Code example #3
def test_classification():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0, gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=1,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert imp_vals[2] > 0.2
    assert sum(imp_vals[3:]) <= 0.02
Code example #4
    def setUp(self):
        import Exercise9_03
        self.exercises = Exercise9_03

        self.file_url = '../Dataset/phpYYZ4Qc.csv'
        self.df = pd.read_csv(self.file_url)
        self.df.head()
        self.y = self.df.pop('rej')

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.df, self.y, test_size=0.3, random_state=1)

        self.rf_model = RandomForestRegressor(random_state=1,
                                              n_estimators=50,
                                              max_depth=6,
                                              min_samples_leaf=60)
        self.rf_model.fit(self.X_train, self.y_train)

        self.imp_vals, _ = feature_importance_permutation(
            predict_method=self.rf_model.predict,
            X=self.X_test.values,
            y=self.y_test.values,
            metric='r2',
            num_rounds=1,
            seed=2)

        self.varimp_df = pd.DataFrame({
            'feature': self.df.columns,
            'importance': self.imp_vals
        })
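
A typical follow-up, not shown in Exercise9_03, would rank the features by sorting this DataFrame; a hypothetical sketch:

# Hypothetical follow-up (sketch): rank features by permutation importance,
# where varimp_df is the DataFrame built in setUp above.
varimp_sorted = varimp_df.sort_values('importance', ascending=False)
print(varimp_sorted.head(10))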
Code example #5
def test_classification():

    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)

    svm = SVC(C=1.0, kernel='rbf', random_state=0)
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=1,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert imp_vals[2] > 0.2
    assert sum(imp_vals[3:]) <= 0.02
Code example #6
def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
Code example #7
def test_regression_custom_mse():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)

    norm_imp_vals = imp_vals / np.abs(imp_vals).max()

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert norm_imp_vals[0] == -1.
Code example #8
File: skl_utils.py Project: weklica/pennai
def compute_imp_score(model, metric, training_features, training_classes,
                      random_state):
    """compute importance scores for features.
    If coef_ or feature_importances_ attribute is available for the model,
    the the importance scores will be based on the attribute. If not,
    then permuation importance scores will be estimated
    Parameters
    ----------
    model:  scikit-learn Estimator
        A fitted scikit-learn model
    metric: str, callable
        The metric for evaluating the feature importance through
        permutation. The string 'accuracy' is recommended for
        classifiers and the string 'r2' is recommended for
        regressors. Optionally, a custom scoring function
        (e.g., `metric=scoring_func`) that accepts two arguments,
        y_true and y_pred, with a shape similar to the `y` array,
        can be used.
    training_features: np.ndarray/pd.DataFrame
        Features in training dataset
    training_classes: np.ndarray/pd.DataFrame
        Target in training dataset
    random_state: int
        Random seed for permutation importances

    Returns
    -------
    coefs: np.ndarray
        Feature importance scores
    imp_score_type: string
        Importance score type

    """
    # exporting/computing importance score
    if hasattr(model, 'coef_'):
        coefs = model.coef_
        if coefs.ndim > 1:
            coefs = safe_sqr(coefs).sum(axis=0)
            imp_score_type = "Sum of Squares of Coefficients"
        else:
            coefs = safe_sqr(coefs)
            imp_score_type = "Squares of Coefficients"
    else:
        coefs = getattr(model, 'feature_importances_', None)
        imp_score_type = "Gini Importance"
    if coefs is None or np.isnan(coefs).any():
        coefs, _ = feature_importance_permutation(
            predict_method=model.predict,
            X=training_features,
            y=training_classes,
            num_rounds=5,
            metric=metric,
            seed=random_state,
        )
        imp_score_type = "Permutation Feature Importance"

    return coefs, imp_score_type
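
A hypothetical call to compute_imp_score (a sketch; the estimator and the data names are assumptions, not part of the pennai code above):

# Hypothetical usage (sketch): a fitted random forest exposes
# feature_importances_, so the "Gini Importance" branch is taken; an RBF SVC,
# which has neither coef_ nor feature_importances_, would fall back to
# permutation importance.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42).fit(training_features, training_classes)
coefs, imp_score_type = compute_imp_score(rf, 'accuracy',
                                          training_features, training_classes,
                                          random_state=42)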
Code example #9
def getPermutationImportanceMLxtend(num_rounds, model, X_test, y_test, feature_names, width_perm_imp_plot, \
                                    figure_path, figure_filename, top_k='All'):

    # calculate permutation importance values (hardcoded seed for reproducibility)
    imp_vals, imp_all = feature_importance_permutation(
        predict_method=model.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=num_rounds,
        seed=1597)
    # calculate std dev
    std = np.std(imp_all, axis=1)
    # get indices from ranking
    indices = np.argsort(imp_vals)[::-1]
    # get labels in ranking order
    labels = []
    for i in indices:
        labels += [feature_names[i]]
    # print information
    print(
        "********* Most Important Features (Mean Permutation Importance with Std. Dev.): *********"
    )
    n_features = len(feature_names) if top_k == 'All' else int(top_k)
    if top_k != 'All':
        print("Note: Showing only top-" + str(top_k) + " features")
    for i in range(n_features):
        print("%d. feature %s (%f +/- %f)" %
              (i + 1, labels[i], imp_vals[indices[i]], std[indices[i]]))
    # create figure
    plt.figure(figsize=(width_perm_imp_plot, 7))
    # title
    plt.title("RF Classifier Mean Permutation Importance (with Std. Dev.)")
    # horizontal line for y = 0
    plt.hlines(0, -1, n_features, colors='k', linestyles='dotted')
    # create bars
    if top_k != 'All':
        plt.bar(range(n_features),
                imp_vals[indices[:top_k]],
                yerr=std[indices[:top_k]])
    else:
        plt.bar(range(n_features), imp_vals[indices], yerr=std[indices])
    # set labels on features
    if top_k != 'All':
        plt.xticks(range(n_features), labels[:top_k], rotation=90)
    else:
        plt.xticks(range(n_features), labels, rotation=90)
    # set x axis limits
    plt.xlim([-1, n_features])
    # set axis labels
    plt.xlabel("Feature Name")
    plt.ylabel("Mean Feature Permutation Importance (MLxtend)")
    # save figure
    plt.savefig(figure_path / figure_filename, bbox_inches='tight')
    # return ranked feature names used in the figure
    if top_k != 'All':
        return labels[:top_k]
    else:
        return labels
Code example #10
File: skl_utils.py Project: viabard/pennai
def compute_imp_score(model, metric, features, target, random_state):
    """Compute permuation importance scores for features.

    Parameters
    ----------
    model:  scikit-learn Estimator
        A fitted scikit-learn model
    metric: str, callable
        The metric for evaluating the feature importance through
        permutation. The string 'accuracy' is recommended for
        classifiers and the string 'r2' is recommended for
        regressors. Optionally, a custom scoring function
        (e.g., `metric=scoring_func`) that accepts two arguments,
        y_true and y_pred, with a shape similar to the `y` array,
        can be used.
    features: np.ndarray/pd.DataFrame
        Features in training dataset
    target: np.ndarray/pd.DataFrame
        Target in training dataset
    random_state: int
        Random seed for permutation importances

    Returns
    -------
    coefs: np.ndarray
        Feature importance scores
    imp_score_type: string
        Importance score type

    """

    coefs, _ = feature_importance_permutation(
        predict_method=model.predict,
        X=features,
        y=target,
        num_rounds=5,
        metric=metric,
        seed=random_state,
    )
    imp_score_type = "Permutation Feature Importance"

    return coefs, imp_score_type
Code example #11
def permuation_importance_wrapper(datatuple,
                                  model,
                                  rounds,
                                  metric=balanced_accuracy_score):
    X = datatuple[0]
    y = datatuple[1]
    name = datatuple[2]

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=model.predict,
        X=X,
        y=y,
        metric=metric,
        num_rounds=rounds,
        seed=821996,
    )

    result_dict = {"set": name, "imp_vals": imp_vals, "imp_all": imp_all}
    return result_dict
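
The datatuple argument packs the feature matrix, the labels, and a name for the split. A hypothetical call (a sketch; fitted_model and the split variables are assumptions):

# Hypothetical call (sketch): datatuple is (features, labels, split name);
# the metric defaults to balanced_accuracy_score.
result = permuation_importance_wrapper((X_test, y_test, "test"),
                                        model=fitted_model,
                                        rounds=10)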
Code example #12
# Supervised Feature Importance
# Using mlxtend.evaluate.feature_importance_permutation
# Using sklearn.neighbors.KNeighborsClassifier

# conda install -c conda-forge mlxtend
from mlxtend.evaluate import feature_importance_permutation
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

feature_importances_, _ = feature_importance_permutation(
    predict_method=knn.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',
    num_rounds=100,
    seed=1)

feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': np.round(feature_importances_, 3)
})
feature_importances = feature_importances.sort_values('importance',
                                                      ascending=False)

print("Supervised brute force")
print(feature_importances)
Code example #13
ax = fig.add_subplot()

viz = FeatureImportances(rf, ax=ax, absolute=True)
viz.fit(X, y)
viz.poof()

############## PERMUTATION IMPORTANCE ##############

# mlxtend

from mlxtend.evaluate import feature_importance_permutation

imp_vals, imp_all = feature_importance_permutation(
    predict_method=rf.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',  # use 'r2' or other method for regression
    num_rounds=10,
    seed=1)

std = np.std(imp_all, axis=1)
indices = np.argsort(imp_vals)[::-1]

plt.figure()
plt.title(
    "Random Forest feature importance via permutation importance w. std. dev.")
plt.bar(range(X.shape[1]), imp_vals[indices], yerr=std[indices])
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
Code example #14
def test_model(  # pylint:disable=too-many-arguments
    modelpath: str,
    scalerpath: str,
    Xpath: str,
    ypath: str,
    namepath: str,
    outpath: str,
    featurelabelpath: str = None,
):  # pylint:disable=too-many-locals
    """Takes a trained model and performes some tests on it and calculates statistics.

    Arguments:
        modelpath {str} -- path to sklearn model in .joblib file
        scalerpath {str} -- path to the scaler object
        Xpath {str} -- path to features in npz file
        ypath {str} -- path to labels in npz file
        namepath {str} -- path to names in pickle 3 file
        outpath {str} -- path to which the evaluation metrics are written

    Keyword Arguments:
        featurelabelpath {str} -- path to a pickle file with a list of the feature names; if not None, feature importances are also estimated (default {None})
    """
    lower_quantile = 2.5 / 100
    upper_quantile = 97.5 / 100

    experiment = Experiment(
        api_key=os.getenv("COMET_API_KEY", None), project_name="mof-oxidation-states"
    )
    experiment.add_tag("model evaluation")

    print("*** Loading data ***")
    model = load(modelpath)
    scaler = load(scalerpath)
    X = np.load(Xpath)
    X = scaler.transform(X)
    y = np.load(ypath)
    experiment.log_dataset_hash(X)
    names = read_pickle(namepath)

    print("*** Getting bootstrapped metrics, using 200 folds which takes some time ***")
    scores = bootstrapped_metrics(model, X, y, scoring_funcs=return_scoring_funcs())

    df_metrics = pd.DataFrame(scores)

    means = df_metrics.mean().values
    medians = df_metrics.median().values
    lower = df_metrics.quantile(lower_quantile).values
    upper = df_metrics.quantile(upper_quantile).values
    stds = df_metrics.std().values

    # print(
    #    " *** Running permuation test running 200 folds with 10 fold CV which takes forever ***"
    # )
    # cv = StratifiedKFold(10)
    # balanced_accuracy, balanced_acc_permutation_scores, balanced_accuracy_pvalue = permutation_test(
    #    model, X, y
    # )

    metrics_dict = {}

    # metrics_dict["balanced_accuracy_cv"] = balanced_accuracy
    # metrics_dict[
    #    "balanced_accuracy_permutation_scores"
    # ] = balanced_acc_permutation_scores
    # metrics_dict["balanced_accuracy_p_value"] = balanced_accuracy_pvalue

    prediction = model.predict(X)

    print(" *** Getting misclassified cases ***")
    misclassified = np.where(y != prediction)
    misclassified_w_prediction_true = [
        (names[i], prediction[i], y[i]) for i in list(misclassified[0])
    ]

    metrics_dict["misclassified"] = misclassified_w_prediction_true
    experiment.log_metric("misclassified", misclassified)
    if featurelabelpath is not None:
        feature_labels = read_pickle(featurelabelpath)

        print("*** Getting feature importance ***")
        imp_vals, imp_all = feature_importance_permutation(
            predict_method=model.predict,
            X=X,
            y=y,
            metric="accuracy",
            num_rounds=20,  # to get some errorbars
            seed=1,
        )
        importance_error = np.std(imp_all, axis=-1)
        importance_metrics = [
            (name, value, error)
            for name, value, error in zip(feature_labels, imp_vals, importance_error)
        ]
        experiment.log_metric("feature_importances", importance_metrics)
        metrics_dict["feature_importances"] = importance_metrics

    for i, column in enumerate(df_metrics.columns.values):
        metrics_dict[column] = (means[i], medians[i], stds[i], lower[i], upper[i])
        print((column, means[i], "_".join([column, "mean"])))
        experiment.log_metric("_".join([column, "mean"]), means[i])
        experiment.log_metric("_".join([column, "median"]), medians[i])
        experiment.log_metric("_".join([column, "lower"]), lower[i])
        experiment.log_metric("_".join([column, "upper"]), upper[i])
        experiment.log_metric("_".join([column, "std"]), stds[i])

    # experiment.log_metrics("balanced_accuracy_cv", balanced_accuracy)
    # experiment.log_metrics("balanced_accuracy_p_value", balanced_accuracy_pvalue)
    # experiment.log_metrics("missclassified", misclassified_w_prediction_true)

    print(" *** Getting the calibration curve ***")
    cc = calibration_curve(y, model.predict(X), n_bins=10)

    metrics_dict["calibration_curve_true_probab"] = cc[0]
    metrics_dict["calibration_curve_predicted_probab"] = cc[1]

    # now write a .json with metrics for DVC
    with open(os.path.join(outpath, "test_metrics.json"), "w") as fp:
        json.dump(metrics_dict, fp, cls=NpEncoder)
Code example #15
File: classificator_1.py Project: lg2417/diploma
        align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, 25])
plt.ylim([0, 0.15])
#plt.show()
plt.savefig('./feat_imp_48.png')
#Ytest =numpy.array ([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#         3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#         4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] )
#
X = numpy.array(Xtest)
imp_vals, imp_all = feature_importance_permutation(
    predict_method=model.predict,
    X=numpy.array(Xtest),
    y=numpy.array(Ytest),
    metric='accuracy',
    num_rounds=10,
    seed=1)

std = numpy.std(imp_all, axis=1)
indices = numpy.argsort(imp_vals)[::-1]

plt.figure()
plt.title("Random Forest feature importance via permutation importance")
plt.bar(range(X.shape[1]), imp_vals[indices], yerr=std[indices])
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, 30])
#plt.show()
plt.savefig('./feat_imp_dog_perm.png')
#aa.to_pickle('./conf_matr.pkl')