Code example #1
def test_partial_dependence_unknown_feature(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])
Code example #2
def test_multiclass_multioutput(Estimator):
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3, n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])
Code example #3
def test_warning_recursion_non_constant_init():
    # make sure that passing a non-constant init parameter to a GBDT and using
    # recursion method yields a warning.

    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')
Code example #4
def test_output_shape(Estimator, method, data, grid_resolution,
                      features):
    # Check that partial_dependence has consistent output shape for different
    # kinds of estimators:
    # - classifiers with binary and multiclass settings
    # - regressors
    # - multi-task regressors

    est = Estimator()

    # n_target corresponds to the number of classes (1 for binary classif) or
    # the number of tasks / outputs in multi task settings. It's equal to 1 for
    # classical regression.
    (X, y), n_targets = data

    est.fit(X, y)
    pdp, axes = partial_dependence(est, X=X, features=features,
                                   method=method,
                                   grid_resolution=grid_resolution)

    expected_pdp_shape = (n_targets, *[grid_resolution
                                       for _ in range(len(features))])
    expected_axes_shape = (len(features), grid_resolution)

    assert pdp.shape == expected_pdp_shape
    assert axes is not None
    assert np.asarray(axes).shape == expected_axes_shape
Code example #5
def test_partial_dependence_easy_target(est, power):
    # If the target y only depends on one feature in an obvious way (linear or
    # quadratic) then the partial dependence for that feature should reflect
    # it.
    # We here fit a linear regression model (with polynomial features if
    # needed) and compute r_squared to check that the partial dependence
    # correctly reflects the target.

    rng = np.random.RandomState(0)
    n_samples = 100
    target_variable = 2
    X = rng.normal(size=(n_samples, 5))
    y = X[:, target_variable]**power

    est.fit(X, y)

    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    new_X = values[0].reshape(-1, 1)
    new_y = averaged_predictions[0]
    # add polynomial features if needed
    new_X = PolynomialFeatures(degree=power).fit_transform(new_X)

    lr = LinearRegression().fit(new_X, new_y)
    r2 = r2_score(new_y, lr.predict(new_X))

    assert r2 > .99
Code example #6
def test_recursion_decision_function(target_feature):
    # Make sure the recursion method (implicitly uses decision_function) has
    # the same result as using brute method with
    # response_method=decision_function

    X, y = make_classification(n_classes=2, n_clusters_per_class=1,
                               random_state=1)
    assert np.mean(y) == .5  # make sure the init estimator predicts 0 anyway

    est = GradientBoostingClassifier(random_state=0, loss='deviance')
    est.fit(X, y)

    preds_1, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='recursion')
    preds_2, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='brute')

    assert_allclose(preds_1, preds_2, atol=1e-7)
Code example #7
def test_partial_dependence_sample_weight():
    # Test near perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    # non-regression test for #13193
    N = 1000
    rng = np.random.RandomState(123456)
    mask = rng.randint(2, size=N, dtype=bool)

    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    pdp, values = partial_dependence(clf, X, features=[1])

    assert np.corrcoef(pdp, values)[0, 1] > 0.99
Code example #8
def test_partial_dependence_error(estimator, params, err_msg):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, **params)
Code example #9
def test_partial_dependence_unfitted_estimator(estimator):
    err_msg = "'estimator' parameter must be a fitted estimator"
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [0])
Code example #10
def plot_partial_dependence_bootstrap(model,
                                      X_train,
                                      y_train,
                                      features,
                                      feature_name,
                                      n_boot=5,
                                      random_state=None):
    rng = check_random_state(random_state)

    # fit a model for each bootstrap sample
    all_estimators = [clone(model) for _ in range(n_boot)]
    for est in all_estimators:
        bootstrap_idx = rng.choice(np.arange(X_train.shape[0]),
                                   size=X_train.shape[0],
                                   replace=True)
        X_train_bootstrap = X_train.iloc[bootstrap_idx]
        y_train_bootstrap = y_train[bootstrap_idx]
        est.fit(X_train_bootstrap, y_train_bootstrap)

    # prepare the plotting
    n_fig = 3
    n_rows = (int(len(features) / n_fig) +
              1 if len(features) % n_fig != 0 else int(len(features) / n_fig))
    n_cols = len(features) if n_rows == 1 else n_fig
    fig, axs = plt.subplots(nrows=n_rows,
                            ncols=n_cols,
                            figsize=(n_cols * 5, n_rows * 5))
    for feat, ax in zip(features, np.ravel(axs)):
        # compute the partial dependence for each model
        X_train_preprocessed = model[0].fit_transform(X_train)
        avg_preds_bootstrap = []
        for est in all_estimators:
            avg_preds, values = partial_dependence(est[-1],
                                                   X_train_preprocessed,
                                                   feat,
                                                   grid_resolution=20)
            avg_preds_bootstrap.append(avg_preds)

        if len(values) == 2:
            # compute the mean of the average prediction when plotting contour
            # plots
            mean_avg_preds = np.mean(avg_preds_bootstrap, axis=0)
            Z_level = np.linspace(mean_avg_preds.min(), mean_avg_preds.max(),
                                  8)
            XX, YY = np.meshgrid(values[0], values[1])
            Z = mean_avg_preds[0].T
            CS = ax.contour(XX,
                            YY,
                            Z,
                            levels=Z_level,
                            linewidths=0.5,
                            colors='k')
            ax.contourf(XX,
                        YY,
                        Z,
                        levels=Z_level,
                        vmax=Z_level[-1],
                        vmin=Z_level[0],
                        alpha=0.75)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)
            ax.set_xlabel(feature_name[feat[0]])
            ax.set_ylabel(feature_name[feat[1]])
        else:
            # plot all average predictions and their mean
            mean_avg_preds = np.zeros_like(avg_preds_bootstrap[0])
            for preds in avg_preds_bootstrap:
                mean_avg_preds += preds
                ax.plot(values[0], preds[0], '--k', linewidth=1, alpha=0.5)
            mean_avg_preds /= len(avg_preds_bootstrap)
            ax.plot(values[0],
                    mean_avg_preds[0],
                    'r',
                    alpha=0.8,
                    label='Average')
            ax.set_xlabel(feature_name[feat])
            ax.set_ylabel('WAGE')
            ax.legend()
    plt.tight_layout()
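
A minimal, hypothetical driver for the helper above, assuming an older scikit-learn (roughly 0.22/0.23) whose partial_dependence returns an (averaged_predictions, values) tuple; the wage-style data and column names below are made up.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_train = pd.DataFrame({
    'AGE': rng.uniform(18, 65, size=300),
    'EXPERIENCE': rng.uniform(0, 40, size=300),
    'EDUCATION': rng.randint(8, 20, size=300),
})
# synthetic wage-like target (illustrative only)
y_train = 0.5 * X_train['EXPERIENCE'].to_numpy() + rng.normal(scale=2, size=300)

# the helper expects a Pipeline: model[0] is the preprocessor, model[-1] the regressor
model = make_pipeline(StandardScaler(), GradientBoostingRegressor(random_state=0))

plot_partial_dependence_bootstrap(
    model, X_train, y_train,
    features=[0, 1, (0, 1)],              # two 1-D PDPs and one 2-D PDP
    feature_name=list(X_train.columns),
    n_boot=3,
    random_state=0)
plt.show()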
Code example #11
File: stat_models.py  Project: semvijverberg/RGCPD
def plot_oneway_partial_dependence(GBR_models_split_lags,
                                   keys=None,
                                   lags=None,
                                   grid_resolution=20):
    #%%
    sns.set_style("whitegrid")
    sns.set_style(rc={'axes.edgecolor': 'black'})

    if lags is None:
        lag_keys = GBR_models_split_lags.keys()
        lags = [int(l.split('_')[1]) for l in lag_keys][:3]

    if keys is None:
        keys = set()
        for l, lag in enumerate(lags):
            # get models at lag
            GBR_models_split = GBR_models_split_lags[f'lag_{lag}']
        [
            keys.update(list(r.X_pred.columns))
            for k, r in GBR_models_split.items()
        ]
        masks = ['TrainIsTrue', 'x_fit', 'x_pred', 'y_fit', 'y_pred']
        keys = [k for k in keys if k not in masks]
    keys = keys

    df_lags = []
    for l, lag in enumerate(lags):
        # get models at lag
        GBR_models_split = GBR_models_split_lags[f'lag_{lag}']

        df_keys = []
        keys_in_lag = []
        for i, key in enumerate(keys):
            y = []
            x = []
            for splitkey, regressor in GBR_models_split.items():
                if key in list(regressor.X_pred.columns):
                    X_pred = regressor.X_pred
                    index = list(X_pred.columns).index(key)
                    TrainIsTrue = regressor.df_norm['TrainIsTrue']
                    TestIsTrue = TrainIsTrue.loc[X_pred.index] == False
                    X_test = X_pred[TestIsTrue]
                    # X_test = regressor.X_pred.loc[:,all_keys][regressor.X_pred['x_pred']]
                    _y, _x = partial_dependence(
                        regressor,
                        X=X_test,
                        features=[index],
                        grid_resolution=grid_resolution)
                    y.append(_y[0])
                    x.append(_x[0])
                    keys_in_lag.append(key)
            if len(y) != 0:
                # y has shape (grid_res, splits_key_present)
                y_mean = np.array(y).mean(0)
                y_std = np.std(y, 0).ravel()
                x_vals = np.mean(x, 0)
                count_splits = np.repeat(np.array(y).shape[0], y_mean.shape)
                data = [
                    y_mean[:, None], y_std[:, None], x_vals[:, None],
                    count_splits[:, None]
                ]
                data = np.concatenate(data, axis=1)
                df_key = pd.DataFrame(
                    data,
                    columns=['y_mean', 'y_std', 'x_vals', 'count splits'])
                df_keys.append(df_key)
        df_keys = pd.concat(df_keys, keys=np.unique(keys_in_lag))
        df_lags.append(df_keys)
    df_lags = pd.concat(df_lags, keys=lags)
    # =============================================================================
    # Plotting
    # =============================================================================
    #%%
    col_wrap = 4
    g = sns.FacetGrid(pd.DataFrame(data=keys),
                      col=0,
                      col_wrap=col_wrap,
                      aspect=1.5,
                      sharex=False)
    custom_lines = []
    _legend = []
    for l, lag in enumerate(lags):

        style = line_styles[l]
        color = colors_datasets[l]
        custom_lines.append(
            Line2D([0], [0], linestyle=style, color=color, lw=4,
                   markersize=10))
        _legend.append(f'lag {lag}')
        #        text_lag = []
        for i, key in enumerate(keys):
            ax = g.axes[i]
            df_plot = df_lags.loc[lag, key]
            y_mean = df_plot['y_mean']
            y_std = 2 * df_plot['y_std']
            x_vals = df_plot['x_vals']
            ax.fill_between(x_vals,
                            y_mean - y_std,
                            y_mean + y_std,
                            color=color,
                            linestyle=style,
                            alpha=0.2)
            ax.plot(x_vals, y_mean, color=color, linestyle=style)
            ax.set_title(key)
            if i == 0:
                ax.legend(custom_lines, _legend, handlelength=3)

    return df_lags, g.fig
Code example #12
def test_partial_dependence_X_list(estimator):
    # check that array-like objects are accepted
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)
    partial_dependence(estimator, list(X), [0])
Code example #13
    def test_interpret(self):
        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=42)

        classifier = RandomForestClassifier(random_state=42)
        classifier.fit(X_train, y_train)

        pdp, axes = partial_dependence(classifier,
                                       X, [0],
                                       response_method='predict_proba',
                                       percentiles=(0.05, 0.95),
                                       grid_resolution=100,
                                       method='brute')

        # ensure the original pdp implemented by scikit-learn works as expected
        assert pytest.approx(pdp[0][0]) == 0.34126667
        assert pytest.approx(pdp[0][-1]) == 0.29406667

        # load predictions that have been saved from the exact model trained above and run through serving engine v1.0.2
        predictions = pickle.load(open("./misc/predictions.pickle", "rb"))

        # simulate the responses from serving engine
        model = Mock()
        model.output_name = 'output_probability'
        flatten = lambda l: [item for sublist in l for item in sublist]
        model.estimate = Mock(
            side_effect=[{
                'output_probability': flatten(predictions)
            }])

        sim_pdp, sim_axes = self.pdp.interpret(model=model,
                                               X=X,
                                               features=[0],
                                               percentiles=(0.05, 0.95),
                                               grid_resolution=100)

        # check that pdp values computed from the serving engine's predictions match scikit-learn's pdp implementation
        assert pdp[0][0] == sim_pdp[0][0]
        assert (pdp == sim_pdp).all()

        assert (axes[0] == sim_axes[0]).all()

        # Try with multiple api calls
        model.estimate = Mock(side_effect=[{
            'output_probability': predictions[i]
        } for i in range(len(predictions))])

        sim_pdp, sim_axes = self.pdp.interpret(model=model,
                                               X=X,
                                               features=[0],
                                               percentiles=(0.05, 0.95),
                                               grid_resolution=100,
                                               one_api_call=False)

        # check that pdp values computed from the serving engine's predictions match scikit-learn's pdp implementation
        assert pdp[0][0] == sim_pdp[0][0]
        assert (pdp == sim_pdp).all()

        assert (axes[0] == sim_axes[0]).all()
Code example #14
File: ANN_regress.py  Project: FengHaiF/py_ws
y_predict = scaler_Y.inverse_transform(net.predict(\
                             scaler_X.transform(x_test)))
line = Line()

line.add_xaxis(range(1, len(test_Y) + 1))
line.add_yaxis('samples',test_Y.reshape(-1,),\
               label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('predict',y_predict,\
               label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title="line demo"))
line.render('./html/line.html')

# 3D surface
target_feature = (1, 2)
pdp, axes = partial_dependence(net,\
                               scaler_X.transform(x_train),\
                               target_feature,\
                               grid_resolution=30)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].T

names = ['Num', 'x2', 'x4', 'temp']
fig = plt.figure()
ax = Axes3D(fig)
surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
Code example #15
# MAGIC %md #### Read from db

# COMMAND ----------

predDF_final_done = spark.read.format("jdbc").option("url", "jdbc:mysql://sx2200-gr5069.ccqalx6jsr2n.us-east-1.rds.amazonaws.com/sx2200") \
    .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "test_lr_preds") \
    .option("user", "admin").option("password", "Xs19980312!").load()

# COMMAND ----------

# MAGIC %md #### Marginal Effects 

# COMMAND ----------

from sklearn.inspection import plot_partial_dependence, partial_dependence
from sklearn.datasets import make_friedman1
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
%matplotlib inline

# COMMAND ----------

factors = X_train[['race_count','lag1_avg']]
# plot the partial dependence (marginal effect)
plot_partial_dependence(logreg, X_train, factors)  
# get the partial dependence (marginal effect)
partial_dependence(logreg, X_train_s, [0])  

# COMMAND ----------

# From the plots, the marginal effects of race_count and lag1_avg are shown. The relationship between races completed and whether or not the constructor wins the season is linear with a positive slope, and the line is quite steep. The relationship between average points earned in the previous season and the constructor championship is also positive, although the slope of this curve is smaller than that of race_count.
Code example #16
def plot_pdp(clf,
             X,
             feature,
             scaler=None,
             column_names=None,
             query=None,
             xlabel=None,
             show_deciles=True,
             show_distplot=False,
             y=None,
             pardep_kws={},
             plt_kws={},
             distplot_kws={},
             ax=None):
    """ plots partial dependence plot against `feature` for samples satifying `query`

    Parameters
    ----------
    clf : compatible with sklearn.inspection.partial_dependence
        model
    X : pd.DataFrame or 2D array of numbers
        data used to calculate the partial dependence;
        both the training and hold-out sets (or their combination) are reasonable for this purpose
    feature : str
        name of the feature to compute pdp w.r.t
    scaler : sklearn-compatible scaler e.g. StandardScaler
        scaler used to scale training data
    column_names : iterable of strings
        names of the columns in the dataset,
        if None (default) then X has to be a dataframe with the correct columns
        other args such as `feature` or `query` rely on it
    query : str
        selection criteria, only samples passing it will be used to compute pdp
        has to be valid input for pd.DataFrame.query()
    xlabel : str or None
        xlabel, if None (default) `feature` will be used
    show_deciles : bool
        if small vertical lines (seaborn's rugs) corresponding to deciles of the selected `feature` values should be shown
        (selected meaning: passing `query`)
    show_distplot : bool
        if the distribution of `feature` should be plotted below the pdp
        if `y` is passed, then it's grouped by y=0/1
    y : array of numbers
        sample labels, used to split the distplot,
        used only if show_distplot is True
    pardep_kws : dict
        passed to sklearn.inspection.partial_dependence
    plt_kws : dict
        passed to plt.plot
    distplot_kws : dict
        passed to sns.distplot, 
        has some defaults - see code
    ax : matplotlib.axes._subplots.AxesSubplot object or None
        axes to plot on
        default=None, meaning creating axes inside function

    Returns
    -------
    ax
    """
    if not ax:
        _, ax = plt.subplots(figsize=(7, 5))
    if column_names is None:
        column_names = X.columns

    X_orig = scaler.inverse_transform(X) if scaler else X
    df = pd.DataFrame(X_orig)
    df.columns = column_names
    if query: df = df.query(query)

    df_xgb = pd.DataFrame(scaler.transform(df)) if scaler else df
    if feature not in clf.get_booster().feature_names:
        df_xgb.columns = [f'f{i}' for i in range(df_xgb.shape[1])]
        feat_idx = list(column_names).index(feature)
        feat_name_xgb = f'f{feat_idx}'
    else:
        df_xgb.columns = df.columns
        feat_idx = list(column_names).index(feature)
        feat_name_xgb = feature

    part_dep, feat_vals = partial_dependence(
        clf,
        df_xgb[df_xgb[feat_name_xgb].notna()],
        features=[feat_name_xgb],
        **pardep_kws)
    part_dep, feat_vals = np.array(part_dep[0]), np.array(feat_vals[0])
    if scaler:
        feat_vals_orig = feat_vals * np.sqrt(
            scaler.var_[feat_idx]) + scaler.mean_[feat_idx]
    else:
        feat_vals_orig = feat_vals

    ax.plot(feat_vals_orig, part_dep, lw=3, **plt_kws)
    ax.set_xlim(left=min(ax.get_xlim()[0], min(feat_vals_orig)),
                right=max(ax.get_xlim()[1], max(feat_vals_orig)))

    vals = df[feature]
    if show_deciles:
        xlim = ax.get_xlim()
        deciles = np.nanpercentile(vals, np.arange(0, 101, 10))
        sns.rugplot(deciles, ax=ax)
        ax.set_xlim(xlim)
    if show_distplot:
        distplot_default_kws = dict(bins=np.linspace(*ax.get_xlim(), 100),
                                    distplot_y_frac=0.8)
        distplot_kws = {
            **distplot_default_kws,
            **distplot_kws
        }  # passed `distplot_kws` overwrites defaults
        _add_distplot(ax, vals, y=y, **distplot_kws)

    ax.set_xlabel(xlabel if xlabel else feature)
    ax.set_ylabel('partial dependence')
    return ax
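
An illustrative call of plot_pdp, assuming the classifier is an xgboost.XGBClassifier (the get_booster()/f'f{i}' handling suggests XGBoost) and an older scikit-learn (~0.22/0.23) whose partial_dependence accepts DataFrame column names and returns an (averages, values) tuple; the data and column names are made up.

import numpy as np
import pandas as pd
from xgboost import XGBClassifier  # assumed model family for this helper

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.normal(size=(500, 3)), columns=['pt', 'eta', 'mass'])
y = (X['pt'] + 0.3 * rng.normal(size=500) > 0).astype(int)

clf = XGBClassifier(n_estimators=50).fit(X, y)

# PDP of the predicted probability vs. 'pt', restricted to samples with eta > 0
ax = plot_pdp(clf, X, feature='pt', query='eta > 0',
              show_deciles=False,               # skip the seaborn rug in this sketch
              pardep_kws={'grid_resolution': 50})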
Code example #17
def get(request):
    filename = "titanic_train.csv"
    treeNum = 0
    treeDeep = 0
    if request.method == 'GET':
        filename = request.GET.get('name')
        treeNum = request.GET.get('treeNum')
        treeDeep = request.GET.get('treeDeep')
        print(filename + " " + treeNum + " " + treeDeep)
    # D:\PyCharm 2020.2.3\djangoProject\djangoProject\data
    data = pd.read_csv('D:/PyCharm 2020.2.3/djangoProject/djangoProject/data/' + filename + '1.csv')
    # dftrain = pd.read_csv('data/' + filename)
    y_train = None
    dp = None
    if filename == 'german':
        y_train = data.pop('Creditability')
        dp = pd.read_csv('D:/PyCharm 2020.2.3/djangoProject/djangoProject/data/germandis.csv')
    else:
        y_train = data.pop('survived')
    # Define the random forest model parameters
    rfc = RandomForestClassifier(max_depth=int(treeDeep), n_estimators=int(treeNum), random_state=60)
    # # Preprocess the data, e.g. convert discretized values into numbers
    # data = prepareData(dftrain)

    global totaldata
    totaldata = data.copy()
    # List of feature names
    featureList = totaldata.columns.values.tolist()
    # # Train the model
    rfc.fit(data, y_train)

    global estimator
    estimator = rfc

    importancenData = permutation_importance(rfc, data, y_train, n_repeats=100, random_state=16)

    global mdiFeature
    mdiFeature = rfc.feature_importances_.tolist()

    global feature
    feature = importancenData.importances_mean.tolist()
    print("排列重要性")
    print(feature)
    print("mdi重要性")
    print(mdiFeature)

    # 计算部份依赖
    global pdpData
    pdpData = []
    for index, value in enumerate(data.columns.values):
        pdp, axes = partial_dependence(rfc, data, index)
        pdpData.append({"name": value, "axes": axes[0].tolist(), "pdp": pdp[0].tolist()})
    # print(plot_partial_dependence(rfc, data, ["Account Balance"], target=0))
    # # t-SNE dimensionality reduction
    # tsne = TSNE(n_components=2, perplexity=30, n_iter=500, metric='precomputed')

    tsne = TSNE(learning_rate=100.0)
    array = tsne.fit_transform(dp).tolist()  # reduce the data dimensionality

    # embedding = MDS(n_components=2)
    # array = embedding.fit_transform(dp)

    # Predicted probabilities
    y_pre = rfc.predict(data)
    predict_prob = rfc.predict_proba(data)
    predict0 = []
    predict1 = []
    for i in predict_prob:
        predict0.append(i[0])
        predict1.append(i[1])
    data['predict0'] = predict0
    data['predict1'] = predict1
    # # Add the predicted and true values to the data
    data['predict'] = y_pre
    data['true'] = y_train
    x = []
    y = []
    for i in array:
        x.append(i[0])
        y.append(i[1])
    data['x'] = x
    data['y'] = y
    index = []
    for i in range(len(data)):
        index.append(i)
    data.insert(0, 'id', index)
    # data = pd.read_csv('D:/PyCharm 2020.2.3/djangoProject/djangoProject/data/result_new.csv')
    # return JsonResponse(data, safe=False)

    print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_pre))
    print(metrics.confusion_matrix(y_train, y_pre, labels=None, sample_weight=None))
    da = data.to_dict(orient='records')
    featureListMin = []
    featureListMax = []
    for i in featureList:
        featureListMin.append(min(data[i]))
        featureListMax.append(max(data[i]))
    return JsonResponse({'data': da, 'featureList': featureList, 'featureImportance': feature,
                         'mdiFeatureImportance': mdiFeature, 'featureListMin': featureListMin
                         , 'featureListMax': featureListMax, 'auc': metrics.roc_auc_score(y_train, y_pre),
                         'confusionMatrix': metrics.confusion_matrix(y_train, y_pre, labels=None, sample_weight=None).tolist()}, safe=False)
Code example #18
)


# 2 Training a random forest -------------------------------------------------

# Build the model
# --- create the instance
# --- fit
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)


# 3 Interpretation with PD and ICE -------------------------------------------------------------

# Compute PD and ICE
ice = partial_dependence(estimator=rf, X=X_test, features=["RM"], kind="both")
ice
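# `ice` is a Bunch (dict-like); with kind="both" it holds the averaged curve in
# ice["average"], the per-sample ICE curves in ice["individual"], and the grid
# points in ice["values"] (renamed "grid_values" in newer scikit-learn releases).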


# Define the plotting function
def plot_ice():
    fig, ax = plt.subplots(figsize=(8, 4))
    plot_partial_dependence(estimator=rf, X=X_test, features=["RM"],
                            kind="both", ax=ax)
    fig.show()


# Create the plot
plot_ice()

Code example #19
def main():
    cal_housing = fetch_california_housing()

    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est,
                            X,
                            features,
                            feature_names=names,
                            n_jobs=3,
                            grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=0.1,
                                    loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est,
                            X,
                            features,
                            feature_names=names,
                            n_jobs=3,
                            grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature, grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX,
                           YY,
                           Z,
                           rstride=1,
                           cstride=1,
                           cmap=plt.cm.BuPu,
                           edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
Code example #20
#skplt.metrics.plot_roc_curve(ytest, ypred)
#plt.show()

#Getting scores for each column as partial dependence value
from sklearn.inspection import partial_dependence
#b=partial_dependence(lr,features=[0],X=data,percentiles=(0,1))
#print(b)
#print(b[0].max())
#a = np.array(())
listt = [
]  #Creating a list that will store the maximum dependency value for each column
#listt = ((b[0].max(), 0))
#listt.append([b[0].max(), 0])
#print(data.head(1)) #Here is the head of the data that has column variable names
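# For every column, compute the 1-D partial dependence over the full value range
# (percentiles=(0, 1)) and keep the maximum of the averaged predictions as a
# rough per-feature score.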
for i in range(len(data.columns)):
    b = partial_dependence(clf, features=[i], X=data, percentiles=(0, 1))
    listt.append([b[0].max(), data.columns.values[i]])
#print(listt)
#Write the listt to a file
#conc = np.vstack(listt)
my_df = pd.DataFrame(listt, columns=['PDValues', 'ColumnName'])
#print(my_df.head())
my_df = my_df.sort_values(by='PDValues')
#print(my_df.head())
my_df.to_csv('PDValuesAdaBoost.csv', index=False)
#a.sort(axis=0)
#np.savetxt('columgSig.csv', listt, delimiter=',')
print("Columns Significance Saved to PDValuesAdaBoost.csv")

#Now we go for Precision Recall Curve metrics
precision, recall, thresholds = metrics.precision_recall_curve(ytest, ypred)
Code example #21
def main():
    cal_housing = fetch_california_housing()

    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
Code example #22
# age.
#
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the 2 features interaction,
# this time in 3 dimensions.

import numpy as np
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()

features = ('AveOccup', 'HouseAge')
pdp = partial_dependence(est,
                         X_train,
                         features=features,
                         kind='average',
                         grid_resolution=20)
XX, YY = np.meshgrid(pdp["values"][0], pdp["values"][1])
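# pdp.average[0] has shape (n_grid_feature0, n_grid_feature1); transpose it so it
# matches the meshgrid orientation used for XX and YY.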
Z = pdp.average[0].T
ax = Axes3D(fig)
surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor='k')
ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel('Partial dependence')
Code example #23
]
plot_partial_dependence(est, X_train, features, n_jobs=3, grid_resolution=20)
print("done in {:.3f}s".format(time() - tic))
fig = plt.gcf()
fig.suptitle('Partial dependence of house value on non-location features\n'
             'for the California housing dataset, with Gradient Boosting')
fig.subplots_adjust(wspace=0.4, hspace=0.3)

##############################################################################
# 3D interaction plots (2D PDP)

fig = plt.figure()

features = ('AveOccup', 'HouseAge')
pdp, axes = partial_dependence(est,
                               X_train,
                               features=features,
                               grid_resolution=20)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].T
ax = Axes3D(fig)
surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor='k')
ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel('Partial dependence')
#  pretty init view
Code example #24
def main():
    cal_housing = fetch_california_housing()

    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training SNN_Regressor...")
    est = SNN_Regressor(8,
                        1,
                        10,
                        10,
                        hiddenAct=Activation.Tanh(),
                        error=Error.Mse(),
                        update=Update.RmsProp(0.001, rateDecay=0.9))

    t = [
        (3, lambda e: e.cool()),  # cool
        (6, lambda e: Trainer.prune(e, X, y)),  # prune
        #  ( 18, lambda e: e.cool() ), # cool
        (9,
         lambda e: Trainer.grow(e, max(1, 1 + int(np.log(e.hiddenSize_ + 1))))
         ),  # grow
        #  ( 11, lambda e: e.cool() ), # cool
    ]
    growLoss = Trainer.train(est, X, y, batch=1, maxIter=100, triggers=t)
    est.maxIter_ = 1000
    plt.semilogy(growLoss, label='Grow')
    plt.legend()
    #  plt.show()
    #  pdb.set_trace()

    print("SNN weights:", est.weight_)
    print("SNN dweight:", est.dWeight_)
    print("SNN nHidden:", est.hiddenSize_)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est,
                            X,
                            features,
                            feature_names=names,
                            n_jobs=3,
                            grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with SNN_Regressor...')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('MLP Loss: ', np.average(Error.Mse().f(y, est.predict(X))))
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est,
                            X,
                            features,
                            feature_names=names,
                            n_jobs=3,
                            grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=0.1,
                                    loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est,
                            X,
                            features,
                            feature_names=names,
                            n_jobs=3,
                            grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature, grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX,
                           YY,
                           Z,
                           rstride=1,
                           cstride=1,
                           cmap=plt.cm.BuPu,
                           edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
Code example #26
#
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the 2 features interaction,
# this time in 3 dimensions.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.inspection import partial_dependence

fig = plt.figure()

features = ("AveOccup", "HouseAge")
pdp = partial_dependence(est,
                         X_train,
                         features=features,
                         kind="average",
                         grid_resolution=10)
XX, YY = np.meshgrid(pdp["values"][0], pdp["values"][1])
Z = pdp.average[0].T
ax = Axes3D(fig)
fig.add_axes(ax)

surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor="k")
ax.set_xlabel(features[0])
Code example #29
ens.score(X_val, y_val)

#Once you are confident about your final model, measure its performance on the test set to estimate the generalization error

#Model interpretability
#Feature importance
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(model, random_state=101).fit(X_val, y_val)
eli5.show_weights(perm, feature_names=X_val.columns.tolist())

#Partial dependence plot
#New integration in sklearn, might not work with older versions
from sklearn.inspection import partial_dependence, plot_partial_dependence
partial_dependence(model, X_train, features=['feature'])  # one feature (or one pair) at a time
plot_partial_dependence(model,
                        X_train,
                        features=['feature', ('feat1', 'feat2')])
#With external module for legacy editions
from pdpbox import pdp, get_dataset, info_plots

#Create the data that we will plot
pdp_goals = pdp.pdp_isolate(model=model,
                            dataset=X_val,
                            model_features=X_val.columns,
                            feature='Goals Scored')

#plot it
pdp.pdp_plot(pdp_goals, 'Goals Scored')
plt.show()
Code example #30
File: utils.py  Project: blewy/Insurance_Churn
def plot_pdp(model,
             x,
             feature,
             target=False,
             return_pd=False,
             y_pct=True,
             figsize=(10, 9),
             norm_hist=True,
             dec=.5):
    """
    Plot a partial dependence plot using sklearn and add a bar plot with the distribution of the observations

            Parameters:
                    model (model): fitted estimator compatible with sklearn.inspection.partial_dependence
                    x (dataframe): feature matrix used to compute the partial dependence
                    feature (str): feature (column of x) to compute the partial dependence against

            Returns:
                    plot
    """
    # Get partial dependence
    pardep = partial_dependence(model, x, [feature])

    # Get min & max values
    xmin = pardep[1][0].min()
    xmax = pardep[1][0].max()
    ymin = pardep[0][0].min()
    ymax = pardep[0][0].max()

    # Create figure
    fig, ax1 = plt.subplots(figsize=figsize)
    ax1.grid(alpha=.5, linewidth=1)

    # Plot partial dependence
    color = 'tab:blue'
    ax1.plot(pardep[1][0], pardep[0][0], color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.set_xlabel(feature, fontsize=14)

    tar_ylabel = ': {}'.format(target) if target else ''
    ax1.set_ylabel('Partial Dependence{}'.format(tar_ylabel),
                   color=color,
                   fontsize=14)

    tar_title = target if target else 'Target Variable'
    ax1.set_title('Relationship Between {} and {}'.format(feature, tar_title),
                  fontsize=16)

    if y_pct and ymin >= 0 and ymax <= 1:
        # Display yticks on ax1 as percentages
        fig.canvas.draw()
        labels = [item.get_text() for item in ax1.get_yticklabels()]
        labels = [
            int(float(label.replace('−', '-')) * 100) for label in labels
        ]
        labels = ['{}%'.format(label) for label in labels]
        ax1.set_yticklabels(labels)

    # Plot line for decision boundary
    ax1.hlines(dec,
               xmin=xmin,
               xmax=xmax,
               color='black',
               linewidth=2,
               linestyle='--',
               label='Decision Boundary')
    ax1.legend()

    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.hist(x[feature],
             bins=80,
             range=(xmin, xmax),
             alpha=.25,
             color=color,
             density=norm_hist)
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.set_ylabel('Distribution', color=color, fontsize=14)

    if y_pct and norm_hist:
        # Display yticks on ax2 as percentages
        fig.canvas.draw()
        labels = [item.get_text() for item in ax2.get_yticklabels()]
        labels = [
            int(float(label.replace('−', '-')) * 100) for label in labels
        ]
        labels = ['{}%'.format(label) for label in labels]
        ax2.set_yticklabels(labels)

    plt.show()

    if return_pd:
        return pardep
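
A hypothetical call of the helper above, assuming an older scikit-learn (~0.22/0.23) where partial_dependence returns an (averages, values) tuple and accepts DataFrame column names; the churn-style data and names are synthetic.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
x = pd.DataFrame({
    'tenure_years': rng.uniform(0, 10, size=500),
    'annual_premium': rng.uniform(100, 1000, size=500),
})
churn = (x['tenure_years'] + rng.normal(scale=1.0, size=500) < 3).astype(int).to_numpy()

model = RandomForestClassifier(n_estimators=100, random_state=0).fit(x, churn)

# y_pct=False keeps the raw probability ticks instead of re-labelling them as percentages
plot_pdp(model, x, feature='tenure_years', target='churn', y_pct=False, dec=0.5)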
Code example #31
# two features: for an average occupancy greater than two, the house price is
# nearly independent of the house age, whereas for values less than two there
# is a strong dependence on age.

##############################################################################
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the 2 features interaction,
# this time in 3 dimensions.

fig = plt.figure()

target_feature = (1, 5)
pdp, axes = partial_dependence(est,
                               X_train,
                               target_feature,
                               grid_resolution=20)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].T
ax = Axes3D(fig)
surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
Code example #32
def PartialDependencePlots(estimator,
                           X,
                           features,
                           feature_labels,
                           nrows=None,
                           ncols=4,
                           figsize=None,
                           sharey=True,
                           conf_int=True,
                           show=True,
                           save=False,
                           plot_dir='Output/Plots',
                           title='PDP',
                           save_params={},
                           pdp_params={},
                           plot_params={},
                           plot_ci_params={}):
    """
    INPUT:
     - estimator -> A sklearn tree-based fitted estimator object. (object) 
             Look at sklearn.inspection.partial_dependence doc. for more info.
     - X -> Feature matrix. (array-like or dataframe)
             Look at sklearn.inspection.partial_dependence doc. for more info.
     - features -> Features for which the partial dependency should be computed.
             (int, string or list of ints, strings)
             Look at sklearn.inspection.partial_dependence doc. for more info.
     - feature_labels -> Labels of the model features that will be used in the 
             plot. (string or list of strings).            
     - nrows -> Number of rows of the figure object. (int)
     - ncols -> Number of columns of the figure object. (int)
     - figsize -> Size of the figure object. (tuple of int: (width, height))
     - sharey -> Choose whether or not axes in the figure should share the y 
             axis values. (bool) 
     - conf_int -> Choose whether or not to plot the confidence interval. (bool) 
     - show -> Choose whether or not to display the plot. (bool) 
     - save -> Choose whether or not to save the plot. (bool)
     - plot_dir -> Plot saving directory. (path string)
     - title -> Name of the plot file without file extension. (string)
     - save_params -> Parameters for the saving operation. (dict)
     - pdp_params -> Parameters for sklearn.inspection.partial_dependence. (dict)
             Look at the doc. for more info.
     - plot_params -> Parameters for matplotlib.pylot.plot. (dict)
             Look at the doc. for more info.
     - plot_ci_params -> Parameters for matplotlib.axes.Axes.fill_between. (dict)
             Look at the doc. for more info.   
             
    OUTPUT:
     - fig -> Figure object. (matplotlib figure object)
     - ax, axs -> Axes of the current figure. (matplotlib axes object)
    """

    # Default values for parameter dictionaries:
    # save_params
    SP = {'format': 'jpg'}
    # pdp_params
    PDPP = {'kind': 'both', 'grid_resolution': 100}
    # plot_params
    PP = {}
    # plot_ci_params
    PCIP = {'alpha': 0.2, 'color': '#66C2D7'}

    # Update parameter dictionaries with user choices
    SP.update(save_params)
    PDPP.update(pdp_params)
    PP.update(plot_params)
    PCIP.update(plot_ci_params)

    # If features and feature_labels contains only a string, make them lists.
    features = MakeList(features)
    feature_labels = MakeList(feature_labels)
    n = len(features)

    # Define number of rows and columns and figsize of the figure object
    # containing the plot(s)
    nrows, ncols, figsize = ArrangePlots(n, nrows, ncols, figsize)

    # Output file directory
    file_dir = os.path.join(plot_dir, f"{title}.{SP['format']}")

    # If the plot file already exists and must not be overwritten, then display it.
    if show and os.path.exists(file_dir) and not save:
        fig, ax = plt.subplots(figsize=figsize)
        ax.imshow(plt.imread(file_dir), aspect='equal')
        plt.axis('off')

        return fig, ax

    # If the file must be created or overwritten ...
    else:
        fig, axs = plt.subplots(nrows, ncols, sharey=sharey, figsize=figsize)

        # Go through each feature
        for f, l, ax in zip(features, feature_labels, np.array(axs).flat):
            # Compute partial dependence values
            PDP = partial_dependence(estimator=estimator,
                                     X=X,
                                     features=f,
                                     **PDPP)

            ax.plot(PDP['values'][0], PDP['average'][0], **PP)

            if 'individual' in PDP and conf_int:
                # Compute the standard error on each mean partial dependence
                PDP['sd'] = PDP['individual'][0].std(axis=0).reshape(1, -1)
                # Define upper and lower bounds for the confidence interval
                upper = PDP['average'][0] + PDP['sd'][0]
                lower = PDP['average'][0] - PDP['sd'][0]

                ax.fill_between(PDP['values'][0], upper, lower, **PCIP)

            ax.set_xlabel(l)
            if ax.is_first_col():
                ax.set_ylabel('Target')

        # Remove eventual excessive axes
        if n < nrows * ncols:
            for i in range(1, nrows * ncols - n + 1):
                fig.delaxes(axs.flat[-i])

        fig.tight_layout()

        # Save the plot if needed
        if save:
            plt.savefig(file_dir, **SP)
        # Prevent display of the plot if needed
        if not show:
            plt.close()

        return fig, axs
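
An illustrative call of PartialDependencePlots. It presumes the module already imports os, numpy as np, matplotlib.pyplot as plt and sklearn.inspection.partial_dependence, that the project-level helpers MakeList and ArrangePlots are available, that matplotlib is old enough to provide Axes.is_first_col (pre-3.6), and that scikit-learn is a version (roughly 0.24 to 1.2) whose partial_dependence returns a Bunch with 'average', 'individual' and 'values' keys; the data below is synthetic.

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X_demo, y_demo = make_regression(n_samples=300, n_features=4, random_state=0)
rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_demo, y_demo)

fig, axs = PartialDependencePlots(
    rf, X_demo,
    features=[0, 1, 2],
    feature_labels=['x0', 'x1', 'x2'],
    ncols=3,
    show=False,        # build the figure without displaying it
    save=False)        # nothing is written to plot_dir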