Python prediction_errorの例

プログラミング言語: Python

名前空間/パッケージ名: yellowbrick.regressor

メソッド/関数: prediction_error

hotexamples.comのコード掲載数: 6

Python prediction_error - 6件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのyellowbrick.regressor.prediction_errorの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示


for model in models:
    show_score(x_train, y_train, model)

figure, axes = plt.subplots(nrows=len(models),
                            ncols=2,
                            figsize=(9, 9),
                            sharex=True)

for ind, model in enumerate(models):
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    for index, ax in enumerate(axes):
        residuals_plot(model, x_test, preds, hist=False, ax=ax[index])
        prediction_error(model, x_test, preds, ax=ax)

# Do some scoring on XGB estimators
# Validation curve
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth",
                      param_range=np.arange(1, 11),
                      cv=5,
                      scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)

コード例 #2

ファイルを表示

# Use the quick method and immediately show the figure
alphas(LassoCV(random_state=0), X, y)

# Load a regression dataset
X, y = load_concrete()  # same as above

#X = pd.read_csv(r'C:\Users\Jujin\Desktop\cs-4641-group-44\training_data.csv')
#y = pd.read_csv(r'C:\Users\Jujin\Desktop\cs-4641-group-44\test_data.csv')

# X_train = []                         # makes a list
# y_train = []                         # makes a list
# d = pd.read_csv(r'C:\Users\Jujin\Desktop\cs-4641-group-44\training_data.csv')
# for row in d:
#     X_train.append(row[1])     # selects data from the ith row
#     y_train.append(row[2])     # selects data from the ith row
#
# X_test = []                         # makes a list
# y_test = []                         # makes a list
# da = pd.read_csv(r'C:\Users\Jujin\Desktop\cs-4641-group-44\test_data.csv')
# for row in da:
#     X_test.append(row[1])     # selects data from the ith row
#     y_test.append(row[2])     # selects data from the ith row

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the linear model and visualizer
model = Lasso()
visualizer = prediction_error(model, X_train, y_train, X_test, y_test)

コード例 #3

ファイルを表示

ファイル: linear_model.py プロジェクト: jwattier/nycdsa_housing_ml_project

ols_log = sm.OLS(y_train_log, X_add_constant)
ans_log = ols_log.fit()
print(ans_log.summary())

# - Evaluation of OLS for non-log LR

X_add_constant_non_log = sm.add_constant(X_train)
ols = sm.OLS(y_train, X_add_constant_non_log)
ans = ols.fit()
print(ans.summary())

# + pycharm={"is_executing": false}
# Prediction error plot to further evaluate normality of residual distribution
from yellowbrick.regressor import prediction_error

visualizer = prediction_error(lr_log, X_train_log, y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# - qq plot of prediction error appears to follow in a straight line, which is indicative of a normally distributed error term.
#

# + pycharm={"is_executing": false}
from yellowbrick.regressor import cooks_distance

cd_visualizer = cooks_distance(X=X_train, y=y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Cross Validation through YellowBrick
# ### **Evaluation of R2 over 4-fold Cross-Validation**
# - linear log model is evaluated via 4-k fold

コード例 #4

ファイルを表示

metrics.mean_squared_error(auto_y_test, dt.predict(auto_X_test))

# Residuals plot
# Good for looking at homoskedasticity - variance of errors
regressor.residuals_plot(lr, auto_X_train, auto_y_train, auto_X_test,
                         auto_y_test)

# Residuals plot
# Good for looking at homoskedasticity - variance of errors
regressor.residuals_plot(dt, auto_X_train, auto_y_train, auto_X_test,
                         auto_y_test)

# Prediction Error
# Good for looking at variance and performance at different ends
regressor.prediction_error(lr, auto_X_train, auto_y_train, auto_X_test,
                           auto_y_test)

# Prediction Error
# Good for looking at variance and performance at different ends
regressor.prediction_error(dt, auto_X_train, auto_y_train, auto_X_test,
                           auto_y_test)

# ## Evaluation Exercise
# * Create a decision tree model for the housing data
# * Compare the r2 and MSE for the decision tree and linear regression models.
# * Plot a residuals plot for both models.

lr = linear_model.LinearRegression()
lr.fit(ames_X_train, ames_y_train)
lr.score(ames_X_test, ames_y_test)

コード例 #5

ファイルを表示

def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    # 	csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    # print('loading training files...')
    # X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    # y_train=X_train['class_']
    # X_train.drop(['class_'], axis=1)
    # X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    # y_test=X_test['class_']
    # X_test.drop(['class_'], axis=1)
    # y_train=le.inverse_transform(y_train)
    # y_test=le.inverse_transform(y_test)
    # except:
    # print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
		"lle"
		Locally Linear Embedding (LLE) uses many local linear decompositions to preserve globally non-linear structures.
		"ltsa"
		LTSA LLE: local tangent space alignment is similar to LLE in that it uses locality to preserve neighborhood distances.
		"hessian"
		Hessian LLE an LLE regularization method that applies a hessian-based quadratic form at each neighborhood
		"modified"
		Modified LLE applies a regularization parameter to LLE.
		"isomap"
		Isomap seeks a lower dimensional embedding that maintains geometric distances between each instance.
		"mds"
		MDS: multi-dimensional scaling uses similarity to plot points that are near to each other close in the embedding.
		"spectral"
		Spectral Embedding a discrete approximation of the low dimensional manifold using a graph representation.
		"tsne" (default)
		t-SNE: converts the similarity of points into probabilities then uses those probabilities to create an embedding.
	'''
    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig=umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # 	  FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' % (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e / https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb- plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features),
                                     tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################

    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(),
                                np.array(features),
                                tclass_labels,
                                train_color="maroon",
                                test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features),
                                tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-",
                                markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()

    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans,
                                   X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,  # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]

    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',  # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],  # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]

    plot_roc_curve(y_test, probs, clf_names)
    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''

コード例 #6

ファイルを表示

print('p_pred:', p_pred, sep='\n', end='\n\n')
print('y_pred:', y_pred, end='\n\n')
print('score_:', score_, end='\n\n')
print('conf_m:', conf_m, sep='\n', end='\n\n')
print('report:', report, sep='\n')
print(model.coef_)
print(model.intercept_)

#plt.plot(x[:,0], y)
#plt.show()

print(model.score(xtest, ytest))

alphas(LassoCV(random_state=0), x, y)
model2 = Lasso()
visualizer = prediction_error(model2, x, y, xtest, ytest)

#ROC,AUC
y_score2 = model.predict_proba(xtest)[:, 1]
false_positive_rate2, true_positive_rate2, threshold2 = roc_curve(
    ytest, y_score2)
print('roc_auc_score for Logistic Regression: ',
      roc_auc_score(ytest, y_score2))

plt.subplots(1, figsize=(10, 10))
plt.title('Receiver Operating Characteristic - Logistic regression')
plt.plot(false_positive_rate2, true_positive_rate2)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0], c=".7"), plt.plot([1, 1], c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')