def test_validation_curve_cv_splits_consistency():
    n_samples = 100
    n_splits = 5
    X, y = make_classification(n_samples=n_samples, random_state=0)

    scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=OneTimeSplitter(n_splits=n_splits,
                                                  n_samples=n_samples))
    # The OneTimeSplitter is a non-re-entrant cv splitter. Unless `split` is
    # called for each parameter setting, the following should produce
    # identical results for param setting 1 and param setting 2, as both
    # have the same C value.
    assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :],
                                         2))

    scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=KFold(n_splits=n_splits, shuffle=True))
    # For scores2, compare the 1st and 2nd parameter settings' scores
    # (since the C value for the first two param settings is 0.1, they must
    # be consistent unless the train/test folds differ between the settings).
    assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :],
                                         2))

    scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=[0.1, 0.1, 0.2, 0.2],
                               cv=KFold(n_splits=n_splits))
    # OneTimeSplitter is basically an unshuffled KFold(n_splits=5). Sanity check.
    assert_array_almost_equal(np.array(scores3), np.array(scores1))
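# A tiny standalone demo (made-up numbers) of the fancy-indexing check used
# above. Each scores array has shape (n_params, n_splits); hstack glues the
# train and test scores side by side, the (0, 2, 1, 3) row order puts one
# C=0.1 row and one C=0.2 row in each half, and vsplit(..., 2) separates the
# halves so rows for identical C values are compared element-wise.
import numpy as np

train = np.array([[1., 1.], [1., 1.], [2., 2.], [2., 2.]])  # rows 0/1: C=0.1, rows 2/3: C=0.2
test = train + 10.
stacked = np.hstack((train, test))                      # shape (4, 4)
first, second = np.vsplit(stacked[(0, 2, 1, 3), :], 2)  # [row0, row2] vs [row1, row3]
np.testing.assert_array_almost_equal(first, second)     # passes: row0 == row1, row2 == row3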
def validation_crv(estimator, X, y, title, n_jobs=1):
    param_range = np.logspace(-6, -1, 5)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name="max_features", param_range=param_range,
        cv=10, scoring="accuracy", n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title(title)
    plt.xlabel("max_features")  # the swept parameter
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    return plt
def plot_validation_curve(self, X_train, X_test, y_train, y_test, pipeline,
                          param_name, param_range, title, filename, cv=4,
                          param_range_plot=None):
    train_scores, test_scores = validation_curve(estimator=pipeline,
                                                 X=X_train, y=y_train,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv)
    # train_scores = 1. - train_scores
    # test_scores = 1. - test_scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    if param_range_plot is not None:
        param_range = param_range_plot
    plot_series(param_range, [train_mean, test_mean], [train_std, test_std],
                ['training accuracy', 'validation accuracy'],
                ['blue', 'green'], ['o', 's'], title, param_name, 'Accuracy',
                filename)
def test_validation_curve_clone_estimator():
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)

    param_range = np.linspace(1, 0, 10)
    _, _ = validation_curve(
        MockEstimatorWithSingleFitCallAllowed(), X, y,
        param_name="param", param_range=param_range, cv=2
    )
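# A minimal sketch (hypothetical -- the real MockEstimatorWithSingleFitCallAllowed
# lives in scikit-learn's test suite) of a mock that fails if `fit` is called
# twice on the same instance; the test above passes only because
# validation_curve clones the estimator for every (parameter, split) fit.
from sklearn.base import BaseEstimator

class SingleFitMock(BaseEstimator):
    def __init__(self, param=0.5):
        self.param = param

    def fit(self, X, y):
        assert not hasattr(self, 'fit_called_'), \
            'fit called twice: the estimator was not cloned'
        self.fit_called_ = True
        return self

    def score(self, X=None, y=None):
        return self.param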
def test_validation_curve():
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    param_range = np.linspace(0, 1, 10)
    with warnings.catch_warnings(record=True) as w:
        train_scores, test_scores = validation_curve(
            MockEstimatorWithParameter(), X, y,
            param_name="param", param_range=param_range, cv=2
        )
    if len(w) > 0:
        raise RuntimeError("Unexpected warning: %r" % w[0].message)

    assert_array_almost_equal(train_scores.mean(axis=1), param_range)
    assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range)
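# For context, a rough sketch of the kind of mock this test relies on
# (hypothetical stand-in; the real MockEstimatorWithParameter is in sklearn's
# test utilities): its training score is the parameter value itself and its
# test score is one minus it, which is exactly what the assertions check.
from sklearn.base import BaseEstimator

class ParamScoreMock(BaseEstimator):
    def __init__(self, param=0.5):
        self.param = param

    def fit(self, X_subset, y_subset):
        self.X_subset_ = X_subset  # the exact array object of the training split
        return self

    def score(self, X=None, y=None):
        # _fit_and_score passes the same split object to fit and to the train
        # scorer, so an identity check distinguishes train from test data.
        return self.param if X is self.X_subset_ else 1. - self.param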
def test_validation_curve(self):
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)

    param_range = np.logspace(-2, -1, 2)
    svc = df.svm.SVC(random_state=self.random_state)
    result = df.model_selection.validation_curve(svc, 'gamma', param_range)
    expected = ms.validation_curve(svm.SVC(random_state=self.random_state),
                                   digits.data, digits.target,
                                   param_name='gamma',
                                   param_range=param_range)
    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])
    self.assert_numpy_array_almost_equal(result[1], expected[1])
def validation_curve(estimator, epochs, y, param_name, param_range, cv=None):
    """Validation curve on epochs.

    Parameters
    ----------
    estimator : object that implements "fit" and "predict" method.
        The estimator whose validation curve must be found.
    epochs : instance of mne.Epochs.
        The epochs.
    y : array
        The labels.
    param_name : str
        Name of the parameter that will be varied.
    param_range : array
        The values of the parameter that will be evaluated.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation strategy.

    Returns
    -------
    train_scores : array
        The scores in the training set.
    test_scores : array
        The scores in the test set.
    """
    from sklearn.model_selection import validation_curve
    if not isinstance(estimator, GlobalAutoReject):
        msg = 'No guarantee that it will work on this estimator.'
        raise NotImplementedError(msg)

    BaseEpochs = _get_epochs_type()
    if not isinstance(epochs, BaseEpochs):
        raise ValueError('Only accepts MNE epochs objects.')

    data_picks = _handle_picks(epochs.info, picks=None)
    X = epochs.get_data()[:, data_picks, :]
    n_epochs, n_channels, n_times = X.shape

    estimator.n_channels = n_channels
    estimator.n_times = n_times

    train_scores, test_scores = \
        validation_curve(estimator, X.reshape(n_epochs, -1), y=y,
                         param_name=param_name,  # forward the caller's parameter name
                         param_range=param_range, cv=cv, n_jobs=1, verbose=0)
    return train_scores, test_scores
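# A hedged usage sketch of the wrapper above: assumes `epochs` is an existing
# mne.Epochs object, that GlobalAutoReject ships alongside this module, and a
# made-up threshold grid; the dummy labels are only there because the
# signature requires a `y`.
import numpy as np

thresh_range = np.linspace(10e-6, 500e-6, 15)  # candidate thresholds (volts), an assumption
y_dummy = np.zeros(len(epochs))                # placeholder labels (assumption)
train_scores, test_scores = validation_curve(
    GlobalAutoReject(), epochs, y_dummy,
    param_name='thresh', param_range=thresh_range, cv=5)
best_thresh = thresh_range[np.argmax(test_scores.mean(axis=1))]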
def plot_validation_curve(self, estimator, x_train, y_train, cv, data_label,
                          param_range, param_name, n_jobs=-1):
    # plot the validation curves
    plt.clf()
    train_scores, test_scores = validation_curve(estimator=estimator,
                                                 X=x_train, y=y_train,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv, n_jobs=n_jobs)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', marker='s', markersize=5,
             linestyle='--', label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.title("Validation curve: %s" % (data_label))
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    fn = self.save_path + data_label + '_' + param_name + '_validationcurve.png'
    plt.savefig(fn)
def plot_cv_parameters(classifier, X_train, y_train, param, param_range,
                       cv=10):
    train_scores, test_scores = validation_curve(estimator=classifier,
                                                 X=X_train, y=y_train,
                                                 param_name=param,
                                                 param_range=param_range,
                                                 cv=cv)  # honor the cv argument
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    fig = plt.figure(figsize=(10, 5))
    plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle="--",
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.6, 1.1])
    plt.show()
def ModelComplexity(X, y):
    """Calculates the performance of the model as model complexity increases.
    The training and testing error rates are then plotted."""

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and testing scores
    train_scores, test_scores = validation_curve(DecisionTreeRegressor(), X, y,
                                                 param_name="max_depth",
                                                 param_range=max_depth,
                                                 cv=cv, scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth, train_mean - train_std,
                    train_mean + train_std, alpha=0.15, color='r')
    pl.fill_between(max_depth, test_mean - test_std,
                    test_mean + test_std, alpha=0.15, color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

digits = load_digits()
X, y = digits.data, digits.target

param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name="gamma", param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
# In[ ]:

import numpy as np
from sklearn import svm
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
get_ipython().magic(u'matplotlib inline')

# Find a better 'gamma' at the default C value
param_range = np.logspace(-2, 0, 20)
print(param_range)
train_scores, test_scores = validation_curve(svm.SVC(C=0.6),
                                             whole_data_x_scaled,
                                             whole_data_y,
                                             param_name="gamma",
                                             param_range=param_range,
                                             cv=10, scoring="accuracy",
                                             n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SVM")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.6, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate scikit-learn validation curves.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """
    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.
    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    scorer = model.specs['scorer']
    verbosity = model.specs['verbosity']

    # Get X, Y for correct partition.
    X, y = get_partition_data(model, partition)

    # Define plotting constants.
    spacing = 0.5
    alpha = 0.2

    # Calculate a validation curve for each algorithm.
    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # get estimator
        estimator = model.estimators[algo]
        # compute the validation-curve scores
        train_scores, test_scores = validation_curve(
            estimator, X, y, param_name=pname, param_range=prange,
            cv=cv_folds, scoring=scorer, n_jobs=n_jobs)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        # set up figure
        plt.style.use('classic')
        plt.figure()
        # plot validation curves
        title = BSEP.join([algo, "Validation Curve [", pstring, "]"])
        plt.title(title)
        # x-axis
        x_min, x_max = min(prange) - spacing, max(prange) + spacing
        plt.xlabel(pname)
        plt.xlim(x_min, x_max)
        # y-axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        # plot scores
        plt.plot(prange, train_scores_mean, label="Training Score", color="r")
        plt.fill_between(prange, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=alpha, color="r")
        plt.plot(prange, test_scores_mean, label="Cross-Validation Score",
                 color="g")
        plt.fill_between(prange, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=alpha, color="g")
        plt.legend(loc="best")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'validation_curve', tag, plot_dir)
def validation_curve_model(X, Y, model, param_name, parameters, cv, ylim,
                           log=True):
    train_scores, test_scores = validation_curve(model, X, Y,
                                                 param_name=param_name,
                                                 param_range=parameters,
                                                 cv=cv, scoring="accuracy")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title("Validation curve")
    plt.fill_between(parameters, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(parameters, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")

    if log:
        plt.semilogx(parameters, train_scores_mean, 'o-', color="r",
                     label="Training score")
        plt.semilogx(parameters, test_scores_mean, 'o-', color="g",
                     label="Cross-validation score")
    else:
        plt.plot(parameters, train_scores_mean, 'o-', color="r",
                 label="Training score")
        plt.plot(parameters, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")

    # plt.ylim([0.55, 0.9])
    if ylim is not None:
        plt.ylim(*ylim)
    plt.ylabel('Score')
    plt.xlabel(param_name)  # label the swept hyperparameter
    plt.legend(loc="best")
    return plt
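# A possible invocation of the helper above (hypothetical data and ranges):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
validation_curve_model(X_demo, y_demo, SVC(kernel='linear'), "C",
                       np.logspace(-3, 2, 6), cv=5, ylim=(0.5, 1.05)).show()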
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from hw1_util import create_two_spirals, get_hill_valley_data

X, y = create_two_spirals()
X, y = get_hill_valley_data()  # note: this overwrites the spirals data

param_range = range(1, 60)
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(max_features='sqrt'),  # 'auto' is deprecated; for classifiers it meant sqrt
    X, y, param_name="max_depth", param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with DecisionTreeClassifier")
plt.xlabel("Max depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(param_range, train_scores_mean, label="Training score", lw=lw)
def tune_hyper_parameter(self, param_name, param_range):
    train_scores, valid_scores = validation_curve(self.model, self.X_train,
                                                  self.y_train,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  cv=5, scoring='f1',
                                                  n_jobs=-1, verbose=51)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    title = "MC Curve for " + param_name + "\n" + self.model_type
    title_dic = {'fontsize': 7, 'fontweight': 'bold'}
    fig, ax1 = plt.subplots(1, 1, figsize=(3, 2))
    ax1.set_title(title, title_dic)
    ax1.set_ylabel("Mean F1 Score", title_dic)
    ax1.tick_params(axis="x", labelsize=7)
    ax1.tick_params(axis="y", labelsize=7)
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
    ax1.set_xlabel(param_name, title_dic)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkred", lw=2)
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.2,
                     color="navy", lw=2)
    ax1.plot(param_range, train_scores_mean, "r", linewidth=2, label="train")
    ax1.plot(param_range, valid_scores_mean, "b", linewidth=2,
             label="cross val")
    if param_name == 'max_iter':
        ax1.set_xlabel("iterations", title_dic)
    ax1.legend(loc='best', fontsize=6)
    ax1.grid()
    plt.tight_layout()
    path = OUTPUT_PATH + '/' + self.model_type + "/"
    filename = "MC_Curve_" + param_name + ".png"
    filename = os.path.join(path, filename)
    plt.savefig(filename)
def drawvalidationCurve():
    # X and y are assumed to be defined in the enclosing scope
    train_scores, valid_scores = validation_curve(
        Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3))
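# The function above only computes the scores; a minimal sketch of the
# plotting step its name promises (X, y and matplotlib availability are
# assumptions).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import validation_curve

alphas = np.logspace(-7, 3, 3)
train_scores, valid_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=alphas)
plt.semilogx(alphas, train_scores.mean(axis=1), 'o-', label='Training score')
plt.semilogx(alphas, valid_scores.mean(axis=1), 'o-',
             label='Cross-validation score')
plt.xlabel('alpha')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()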
# %% [markdown]
# Then, create an `AdaBoostRegressor`. Use the function
# `sklearn.model_selection.validation_curve` to get training and test scores
# by varying the number of estimators.
# *Hint: vary the number of estimators between 1 and 60.*

# %%
import numpy as np

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import validation_curve

adaboost = AdaBoostRegressor()
param_range = np.unique(np.logspace(0, 1.8, num=30).astype(int))
train_scores, test_scores = validation_curve(
    adaboost, data_train, target_train,
    param_name="n_estimators", param_range=param_range, n_jobs=-1)

# %% [markdown]
# Plot both the mean training and test scores. You can also plot the
# standard deviation of the scores.

# %%
import matplotlib.pyplot as plt

plt.errorbar(param_range, train_scores.mean(axis=1),
             yerr=train_scores.std(axis=1), label="Training score", alpha=0.7)
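# Presumably the matching test-score curve follows the same errorbar pattern;
# a sketch (axis labels are an assumption -- the default regressor score is R^2):
plt.errorbar(param_range, test_scores.mean(axis=1),
             yerr=test_scores.std(axis=1), label="Cross-validation score",
             alpha=0.7)
plt.legend()
plt.xlabel("Number of estimators")
plt.ylabel("$R^2$ score")
_ = plt.title("Validation curve for AdaBoost regressor")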
def boost_plot(train_set, label_train, validation_set, label_validation,
               depth=8):
    print("Boosting test")
    boost_clf = ensemble.AdaBoostRegressor(
        tree.DecisionTreeRegressor(criterion='mse', max_depth=25),
        n_estimators=600, learning_rate=1.5)
    grad_boost_clf = ensemble.GradientBoostingRegressor(max_depth=7,
                                                        n_estimators=600,
                                                        learning_rate=0.05)
    boost_clf.fit(train_set, label_train)
    grad_boost_clf.fit(train_set, label_train)

    ada_computed_values = boost_clf.predict(validation_set)
    mean_absolute_error = sklearn.metrics.mean_absolute_error(
        label_validation, ada_computed_values, multioutput='uniform_average')
    print("ada mean error is:", mean_absolute_error)
    mean_max_absolute_error_arrays = np.mean(
        np.amax(np.absolute(np.subtract(ada_computed_values,
                                        label_validation)), axis=1))
    print("ada mean max error per row is:", mean_max_absolute_error_arrays)

    grad_computed_values = grad_boost_clf.predict(validation_set)
    mean_absolute_error = sklearn.metrics.mean_absolute_error(
        label_validation, grad_computed_values, multioutput='uniform_average')
    print("grad mean error is:", mean_absolute_error)
    mean_max_absolute_error_arrays = np.mean(
        np.amax(np.absolute(np.subtract(grad_computed_values,
                                        label_validation)), axis=1))
    print("grad mean max error per row is:", mean_max_absolute_error_arrays)

    X = np.concatenate((train_set, validation_set), axis=0)
    y = np.concatenate((label_train, label_validation), axis=0)
    # z = np.transpose(norm_values[:, -1:])[0]
    param_range = np.arange(0.1, 2.5, 0.5)
    # train_scores, test_scores = validation_curve(
    #     ensemble.AdaBoostRegressor(
    #         tree.DecisionTreeRegressor(criterion='mse', max_depth=9),
    #         n_estimators=200),
    #     X, y, param_name="learning_rate", param_range=param_range,
    #     cv=6, scoring="neg_mean_absolute_error", n_jobs=1)
    train_scores, test_scores = validation_curve(
        ensemble.GradientBoostingRegressor(max_depth=9, n_estimators=200),
        X, y, param_name="learning_rate", param_range=param_range,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    print(test_scores)

    plt.title("Eye Validation Curve with Boosting (GradientBoostingRegressor)")
    plt.xlabel("learning_rate")
    plt.ylabel("Score")
    plt.ylim(-10, 10)
    lw = 2
    plt.plot(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.plot(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    plt.show()
print(train_sizes)
print("Max cross-validation score:")
# print((train_sizes[np.argmax(test_mean)]))
print(test_mean[np.argmax(test_mean)])
print("Max training score at the same point as max cross-validation score:")
print(train_mean[np.argmax(test_mean)])

# In[89]:

# Model complexity analysis
print("Max depth model complexity analysis")
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(criterion='entropy'), x_train, y_train,
    param_name="max_depth", param_range=range(2, 51),
    scoring="accuracy", n_jobs=-1, cv=5)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
train_sizes = list(range(2, 51))
plt.close()
plot_data(train_sizes, test_mean,
          title="Figure 5 (Decision Tree Validation Curve - Wine Quality)",
          x_label="Max depth", y_label="Accuracy Score",
          color="blue", label='Cross-validation score')
# load & scale dataset
X, y = load_boston(return_X_y=True)
X = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()  # back to 1-D

# regression by decision trees
subsets = ShuffleSplit(n_splits=5, test_size=0.33, random_state=23)
dtree_model = DecisionTreeRegressor()
dtree_max_depth = range(1, 11)
trn_scores, tst_scores = validation_curve(dtree_model, X, y,
                                          param_name='max_depth',
                                          param_range=dtree_max_depth,
                                          cv=subsets, scoring='r2')
mean_trn_scores = np.mean(trn_scores, axis=1)
mean_tst_scores = np.mean(tst_scores, axis=1)
dtree_scores = pd.concat([
    pd.DataFrame({
        'max_depth': dtree_max_depth,
        'score': mean_trn_scores,
        'subset': 'train'
    }),
    pd.DataFrame({
        'max_depth': dtree_max_depth,
        'score': mean_tst_scores,
        'subset': 'test'
    })
])
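# With the tidy frame assembled, one plausible way to plot it (seaborn here
# is an assumption; any grouped line plot works).
import seaborn as sns
import matplotlib.pyplot as plt

sns.lineplot(data=dtree_scores, x='max_depth', y='score', hue='subset',
             marker='o')
plt.ylabel('$R^2$')
plt.show()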
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

digits = load_digits()
X, y = digits.data, digits.target

param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(SVC(), X, y, param_name="gamma",
                                             param_range=param_range,
                                             cv=10, scoring="accuracy",
                                             n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                             param_name='gamma',
                                             param_range=param_range, cv=3)

print(train_scores)
print(test_scores)

# This code is based on the scikit-learn validation_plot example
# See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
plt.xlabel(r'$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
             color='darkorange', lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color='darkorange', lw=lw)
# ## Addressing over- and underfitting with validation curves

# In[16]:

from sklearn.model_selection import validation_curve

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr, X=X_train, y=y_train,
    param_name='logisticregression__C', param_range=param_range, cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15, color='blue')
plt.scatter(x_test, y_test, color='blue', label='Test set')
plt.title('The data')
plt.legend(loc='best')

############################################################
# Plot a validation curve
from sklearn.model_selection import validation_curve

degrees = np.arange(1, 21)

model = make_pipeline(PolynomialFeatures(), LinearRegression())

# The parameter to vary is the "degree" of the "polynomialfeatures"
# step of the pipeline
train_scores, validation_scores = validation_curve(
    model, x[:, np.newaxis], y,
    param_name='polynomialfeatures__degree',
    param_range=degrees)

# Plot the mean train error and validation error across folds
plt.figure(figsize=(6, 4))
plt.plot(degrees, validation_scores.mean(axis=1), lw=2,
         label='cross-validation')
plt.plot(degrees, train_scores.mean(axis=1), lw=2, label='training')

plt.legend(loc='best')
plt.xlabel('degree of fit')
plt.ylabel('explained variance')
plt.title('Validation curve')
plt.tight_layout()
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
plt.show()

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr, X=X_train, y=y_train,
    param_name='logisticregression__C', param_range=param_range, cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
def plot_validation_curve(estimator, title, X, y, param_name, param_range,
                          ylim=None, cv=None, scoring='accuracy', n_jobs=-1,
                          verbose=1, islog=False):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xticks(param_range)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    train_scores, test_scores = validation_curve(estimator, X, y,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv, scoring=scoring,
                                                 n_jobs=n_jobs,
                                                 verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if islog:
        lw = 2
        plt.semilogx(param_range, train_scores_mean, label="Training score",
                     color="darkorange", lw=lw)
        plt.fill_between(param_range, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2,
                         color="darkorange", lw=lw)
        plt.semilogx(param_range, test_scores_mean,
                     label="Cross-validation score", color="navy", lw=lw)
        plt.fill_between(param_range, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2,
                         color="navy", lw=lw)
    else:
        plt.fill_between(param_range, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(param_range, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
        plt.plot(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
        plt.plot(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
def plot_validation_curve(estimator, X, y, title=None, ylim=None,
                          param_name=None, param_range=[1, 100, 1000, 10000],
                          cv=10, scoring=None, n_jobs=None):
    plt.figure()
    plt.title(title if title else "Validation Curve")
    if ylim is not None:
        plt.ylim(*ylim)
    else:
        plt.ylim(-0.1, 1.1)

    train_scores, test_scores = validation_curve(estimator, X, y,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv, scoring=scoring,
                                                 n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.xlabel(param_name)
    plt.ylabel("Score")
    lw = 2
    # plt.grid()
    # plt.fill_between(param_range, train_scores_mean - train_scores_std,
    #                  train_scores_mean + train_scores_std, alpha=0.01,
    #                  color="r")
    # plt.fill_between(param_range, test_scores_mean - test_scores_std,
    #                  test_scores_mean + test_scores_std, alpha=0.01,
    #                  color="g")
    # plt.plot(param_range, train_scores_mean, 'o-', color="r",
    #          label="Training score")
    # plt.plot(param_range, test_scores_mean, 'o-', color="g",
    #          label="Cross-validation score")
    # plt.legend(loc="best")
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    return plt
kfold = KFold(n_splits=10)
cv = kfold
scores = cross_val_score(rbf_kernel_svm_clf, X, y, cv=cv)
print('gamma : {}'.format(gamma))
print('Cross-Validation scores: {}'.format(scores))

title = "Learning Curves (" + name + ")"
plt = plot_learning_curve(rbf_kernel_svm_clf, title, X, y, cv=10)
plt.savefig(name + '_lc.png')
print(name + '_lc.png')
plt.show()

param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
    rbf_kernel_svm_clf, X, y, param_name='svc__gamma',
    param_range=param_range, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve (" + name + ")")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
def validation_curves(X, Y, alg1, alg2, alg3, CV, score, csv_name):
    """
    Given training data, iteratively runs some ML algorithms (currently,
    this is nearest neighbor, decision tree, and support vector methods),
    varying a relevant hyperparameter of each, for every prediction
    category: reactor type, cooling time, enrichment, and burnup

    Parameters
    ----------
    X : dataframe that includes all training data
    Y : series with labels for training data
    alg1 : optimized learner 1
    alg2 : optimized learner 2
    alg3 : optimized learner 3
    CV : cross-validation generator
    score : scoring metric for the validation curves
    csv_name : string containing the train set, nuc subset, and parameter
               being predicted for naming purposes

    Returns
    -------
    *validation_curve.csv : csv file with val curve results for each
                            prediction category
    """
    # Note: I'm trying to avoid loops here so the code is inelegant

    # Varied alg params for validation curves
    k_list = np.linspace(1, 25, 10).astype(int)
    depth_list = np.linspace(3, 25, 10).astype(int)
    feat_list = np.linspace(5, 47, 10).astype(int)
    gamma_list = np.logspace(-4, -1, 10)
    c_list = np.logspace(0, 5, 10)

    # knn
    train, cv = validation_curve(alg1, X, Y, param_name='n_neighbors',
                                 param_range=k_list, cv=CV,
                                 scoring=score, n_jobs=4)
    train_mean = np.mean(train, axis=1)
    train_std = np.std(train, axis=1)
    cv_mean = np.mean(cv, axis=1)
    cv_std = np.std(cv, axis=1)
    df1 = pd.DataFrame({'ParamList': k_list, 'TrainScore': train_mean,
                        'TrainStd': train_std, 'CV-Score': cv_mean,
                        'CV-Std': cv_std})
    df1['Algorithm'] = 'knn'

    # dtree
    train, cv = validation_curve(alg2, X, Y, param_name='max_depth',
                                 param_range=depth_list, cv=CV,
                                 scoring=score, n_jobs=4)
    train_mean = np.mean(train, axis=1)
    train_std = np.std(train, axis=1)
    cv_mean = np.mean(cv, axis=1)
    cv_std = np.std(cv, axis=1)
    df2 = pd.DataFrame({'ParamList': depth_list, 'TrainScore': train_mean,
                        'TrainStd': train_std, 'CV-Score': cv_mean,
                        'CV-Std': cv_std})
    df2['Algorithm'] = 'dtree'

    train, cv = validation_curve(alg2, X, Y, param_name='max_features',
                                 param_range=feat_list, cv=CV,
                                 scoring=score, n_jobs=4)
    train_mean = np.mean(train, axis=1)
    train_std = np.std(train, axis=1)
    cv_mean = np.mean(cv, axis=1)
    cv_std = np.std(cv, axis=1)
    df3 = pd.DataFrame({'ParamList': feat_list, 'TrainScore': train_mean,
                        'TrainStd': train_std, 'CV-Score': cv_mean,
                        'CV-Std': cv_std})
    df3['Algorithm'] = 'dtree'

    # svr
    train, cv = validation_curve(alg3, X, Y, param_name='gamma',
                                 param_range=gamma_list, cv=CV,
                                 scoring=score, n_jobs=4)
    train_mean = np.mean(train, axis=1)
    train_std = np.std(train, axis=1)
    cv_mean = np.mean(cv, axis=1)
    cv_std = np.std(cv, axis=1)
    df4 = pd.DataFrame({'ParamList': gamma_list, 'TrainScore': train_mean,
                        'TrainStd': train_std, 'CV-Score': cv_mean,
                        'CV-Std': cv_std})
    df4['Algorithm'] = 'svr'

    train, cv = validation_curve(alg3, X, Y, param_name='C',
                                 param_range=c_list, cv=CV,
                                 scoring=score, n_jobs=4)
    train_mean = np.mean(train, axis=1)
    train_std = np.std(train, axis=1)
    cv_mean = np.mean(cv, axis=1)
    cv_std = np.std(cv, axis=1)
    df5 = pd.DataFrame({'ParamList': c_list, 'TrainScore': train_mean,
                        'TrainStd': train_std, 'CV-Score': cv_mean,
                        'CV-Std': cv_std})
    df5['Algorithm'] = 'svr'

    vc_data = pd.concat([df1, df2, df3, df4, df5])
    vc_data.to_csv(csv_name + '_validation_curve.csv')
    return
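# Since the helper above only persists results, a hedged sketch of consuming
# the CSV afterwards (assumes the same `csv_name` and that pandas/matplotlib
# are available; column names follow the frames built above).
import pandas as pd
import matplotlib.pyplot as plt

vc = pd.read_csv(csv_name + '_validation_curve.csv')
knn = vc[vc['Algorithm'] == 'knn']
plt.plot(knn['ParamList'], knn['CV-Score'], 'o-', label='CV score')
plt.fill_between(knn['ParamList'], knn['CV-Score'] - knn['CV-Std'],
                 knn['CV-Score'] + knn['CV-Std'], alpha=0.2)
plt.xlabel('n_neighbors')
plt.ylabel('score')
plt.legend(loc='best')
plt.show()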
def draw_validation_curve(model, param_range, param, name, ax, features,
                          labels, metric='roc_auc', cv=3, n_jobs=1,
                          title=None):
    '''
    A function to draw a validation curve using cross-validation.

    model = the algorithm or pipeline used to model the data
    param_range = the inputs to validate
    param = the hyperparameter to evaluate
    name = name of the hyperparameter to use for the graph axis
    metric = evaluation metric
    title = name of the algorithm to use for the graph title
    '''
    # note: the `ax` argument is accepted but the plotting below goes
    # through the pyplot state machine
    train_scores, test_scores = validation_curve(model, features, labels,
                                                 param_name=param,
                                                 param_range=param_range,
                                                 cv=cv, scoring=metric,
                                                 n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with %s" % title)
    plt.xlabel(name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
# We can use the :func:`~sklearn.model_selection.validation_curve` to inspect
# the impact of varying the parameter `k_neighbors`. In this case, we need a
# scorer to evaluate the generalization performance during the
# cross-validation.

# %%
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import validation_curve

scorer = make_scorer(cohen_kappa_score)
param_range = range(1, 11)
train_scores, test_scores = validation_curve(
    model, X, y, param_name="smote__k_neighbors", param_range=param_range,
    cv=3, scoring=scorer,
)

# %%
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# %% [markdown]
# We can now plot the results of the cross-validation for the different
# parameter values that we tried.
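# %%
# A sketch of the plot the markdown cell announces (band styling is an
# assumption).
import matplotlib.pyplot as plt

plt.plot(param_range, test_scores_mean, label="Cross-validation score")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2)
plt.xlabel("k_neighbors of SMOTE")
plt.ylabel("Cohen's kappa")
plt.legend(loc="best")
plt.show()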
""" from __future__ import print_function from sklearn.model_selection import validation_curve from sklearn.datasets import load_digits from sklearn.svm import SVC import matplotlib.pyplot as plt import numpy as np digits = load_digits() X = digits.data y = digits.target param_range = np.logspace(-6, -2.3, 5) train_loss, test_loss = validation_curve(SVC(), X, y, param_name='gamma', param_range=param_range, cv=10, scoring='neg_mean_squared_error') train_loss_mean = -np.mean(train_loss, axis=1) test_loss_mean = -np.mean(test_loss, axis=1) plt.plot(param_range, train_loss_mean, 'o-', color="r", label="Training") plt.plot(param_range, test_loss_mean, 'o-', color="g", label="Cross-validation") plt.xlabel("gamma") plt.ylabel("Loss")
import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split

boston = datasets.load_boston()
print(boston.data.shape)
# print(boston.data)
X = boston.data
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Fit models across different gamma values and pick the gamma parameter
param_range = np.logspace(-6, -3, 10)
train_loss, test_loss = validation_curve(SVR(), X_train, y_train,
                                         param_name="gamma",
                                         param_range=param_range,
                                         cv=10,
                                         scoring="neg_mean_squared_error")
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

plt.plot(param_range, train_loss_mean, "o-", color="r", label="Training")
plt.plot(param_range, test_loss_mean, "o-", color="g",
         label="Cross_validation")
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

# Fit models across different C values and pick the C parameter
param_range = np.logspace(-1, 2, 10)
# plt.grid(True)
# plt.xlabel("C values")
# plt.ylabel("accuracy Score")
# plt.axis([0, max(x) + 0.001, min(min(train_scores), min(valid_scores)),
#           max(max(train_scores), max(valid_scores))])
# plt.title("Validation Curve SVM-C")
# plt.legend()
# plt.show()
# plt.clf()

x = [1, 2]
train_scores, valid_scores = validation_curve(SVC(gamma=10, random_state=42),
                                              X_train, y_train,
                                              param_name="kernel",
                                              param_range=['linear', 'rbf'],
                                              cv=5, verbose=1000, n_jobs=-1,
                                              scoring='accuracy')
train_scores = np.mean(train_scores, axis=1)
valid_scores = np.mean(valid_scores, axis=1)
plt.plot(x, train_scores, label="Train score")
plt.plot(x, valid_scores, label="Validation score")
plt.grid(True)
plt.xlabel("Kernel")
plt.ylabel("accuracy Score")
plt.axis([0, max(x) + 0.001,
          min(min(train_scores), min(valid_scores)),
          max(max(train_scores), max(valid_scores))])
scorer = metrics.make_scorer(metrics.cohen_kappa_score)

# Generate the dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)
smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)

param_range = range(1, 11)
train_scores, test_scores = ms.validation_curve(
    pipeline, X, y, param_name="smote__k_neighbors",
    param_range=param_range, cv=3, scoring=scorer, n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
plt.plot(param_range, test_scores_mean, label='SMOTE')
ax.fill_between(param_range, test_scores_mean + test_scores_std,
                test_scores_mean - test_scores_std, alpha=0.2)
idx_max = np.argmax(test_scores_mean)
plt.scatter(param_range[idx_max], test_scores_mean[idx_max],
            label=r'Cohen Kappa: ${0:.2f}\pm{1:.2f}$'.format(
                test_scores_mean[idx_max], test_scores_std[idx_max]))
Y_class1_train_LR = Y_probas_train_LR[np.newaxis, :, 1].T
Y_class1_train_DT = Y_probas_train_DT[np.newaxis, :, 1].T
Y_class1_train_RF = Y_probas_train_RF[np.newaxis, :, 1].T
X_meta_train = np.concatenate(
    (Y_class1_train_svm, Y_class1_train_knn, Y_class1_train_LR,
     Y_class1_train_DT, Y_class1_train_RF),
    axis=1)  # concatenate horizontally, final shape (m, 5)
Y_meta_train = Y_train

x = [0.1, 1, 10, 100, 1000, 1100, 1300, 1500]
train_scores, valid_scores = validation_curve(
    LogisticRegression(random_state=42), X_train, Y_meta_train,
    param_name="C", param_range=x, cv=3, verbose=1000, n_jobs=-1,
    scoring='accuracy')
train_scores = np.mean(train_scores, axis=1)
valid_scores = np.mean(valid_scores, axis=1)
plt.plot(x, train_scores, label="Train score")
plt.plot(x, valid_scores, label="Validation score")
plt.grid(True)
plt.xlabel("C values")
plt.ylabel("accuracy Score")
plt.axis([0, max(x) + 0.001,
          min(min(train_scores), min(valid_scores)),
          max(max(train_scores), max(valid_scores))])
]  # 16273 samples that failed the hypothesis test
dataMat = dataMat.drop(delnames, axis=1)
dataMat = StandardScaler().fit_transform(dataMat)

# param_range = np.logspace(-6, -1, 5)
param_range = [0.0001, 0.001, 0.01, 0.1]
train_scores, test_scores = validation_curve(
    estimator=SVC(kernel='rbf', gamma='auto', shrinking=True,
                  probability=True, tol=0.0001, cache_size=1000,
                  max_iter=-1, class_weight='balanced',
                  decision_function_shape='ovr', random_state=None),
    X=dataMat, y=labelMat, param_name='C', param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SVM")
from sklearn.model_selection import validation_curve  # the validation_curve module
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# the digits dataset
digits = load_digits()
X = digits.data
y = digits.target

# build the range of parameter values to test
param_range = np.logspace(-6, -2.3, 5)

# use validation_curve to quickly find the parameter's effect on the model
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range,
    cv=10, scoring='neg_mean_squared_error')

# average squared error of each round
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# visualize
plt.plot(param_range, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(param_range, test_loss_mean, 'o-', color="g",
         label="Cross-validation")
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
def main():
    """
    Machine-learning flow built with a pipeline (scikit-learn's Pipeline class).
    Evaluates the model's generalization performance with learning curves
    and validation curves.
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    # prePro.print( "Breast Cancer Wisconsin dataset" )

    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handle missing data
    # prePro.meanImputationNaN()

    # Encode the class labels
    prePro.encodeClassLabelByLabelEncoder(colum=1)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test \
        = DataPreProcess.DataPreProcess.dataTrainTestSplit(
            X_input=dat_X, y_input=dat_y, ratio_test=0.2)

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register the transformers and the estimator in the pipeline as tuples
    # (arbitrary identifier, transformer or estimator class).
    pipe_logReg = Pipeline(steps=[
        ("scl", StandardScaler()),  # scaling: a transformer class (has fit())
        ("clf", LogisticRegression(penalty='l2', random_state=0))  # logistic regression (L2 regularization): an estimator class (has predict())
    ])

    # Run fit() on the transformers registered in the pipeline.
    # pipe_logReg.fit( X_train, y_train )

    #
    # print( "Test Accuracy: %.3f" % pipe_logReg.score( X_test, y_test ) )

    #============================================
    # Learning Process
    #===========================================
    # Run predict() of the estimator registered in the pipeline.
    # y_predict = pipe_logReg.predict(X_test)
    # print("predict : ", y_predict )

    #===========================================
    # Check generalization performance with a learning curve
    #===========================================
    # Compute cross-validated accuracies with learning_curve()
    train_sizes, train_scores, test_scores \
        = learning_curve(
            estimator=pipe_logReg,  # the logistic-regression pipeline configured above
            X=X_train,
            y=y_train,
            train_sizes=numpy.linspace(0.1, 1.0, 10),  # absolute or relative training-set sizes; here 10 evenly spaced relative values
            cv=10,      # number of cross-validation folds
            n_jobs=-1   # parallelize over all CPUs
        )

    # Compute means and standard deviations
    train_means = numpy.mean(train_scores, axis=1)  # axis=1: along rows
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    # train_sizes: 10 evenly spaced relative values of the training-set size
    print("train_sizes : \n", train_sizes)
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Draw the learning curve
    #-------------------------------------------
    Plot2D.Plot2D.drawLearningCurve(train_sizes=train_sizes,
                                    train_means=train_means,
                                    train_stds=train_stds,
                                    test_means=test_means,
                                    test_stds=test_stds)
    plt.title("Learning Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()
    plt.savefig("./MachineLearningPipeline_scikit-learn_1.png", dpi=300,
                bbox_inches='tight')
    plt.show()

    #===========================================
    # Check generalization performance with a validation curve
    #===========================================
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    # Compute cross-validated accuracies with validation_curve()
    train_scores, test_scores \
        = validation_curve(
            estimator=pipe_logReg,
            X=X_train,
            y=y_train,
            param_name='clf__C',
            param_range=param_range,
            cv=10
        )

    # Overwrite the learning-curve statistics
    train_means = numpy.mean(train_scores, axis=1)
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    #
    print("param_range : \n", param_range)
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Draw the validation curve
    #-------------------------------------------
    Plot2D.Plot2D.drawValidationCurve(param_range=param_range,
                                      train_means=train_means,
                                      train_stds=train_stds,
                                      test_means=test_means,
                                      test_stds=test_stds)
    plt.xscale('log')  # log scale
    plt.title("Validation Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Parameter C [inverse regularization strength]')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()
    plt.savefig("./MachineLearningPipeline_scikit-learn_2.png", dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
         out_performance, color="navy", marker='o',
         label="Mean test Acc. from 5-fold CV")
plt.grid(True)
plt.legend()
plt.title('5-fold CV Curve for Decision Tree on max_depth by cross_val_score')
plt.show()

##########

base_clf = DecisionTreeClassifier(random_state=0)
train_scores, test_scores = validation_curve(base_clf, X, y,
                                             param_name="max_depth",
                                             param_range=candidate_max_depth,
                                             scoring="accuracy",
                                             n_jobs=-1, cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.figure(1, figsize=(8, 8))
plt.ylabel("Accuracy")
plt.xlabel("max_depth")
plt.title("K-fold(k={}) CV Curve for Decision Tree on max_depth "
          "by validation_curve".format(train_scores.shape[1]))
plt.plot(candidate_max_depth, train_scores_mean, marker='o',
         label="Training score")
train_x = np.array(train_x).T  # transpose back into the encoded matrix

# define the model
model = se.RandomForestClassifier(
    max_depth=8,     # maximum depth
    random_state=7   # random seed
)

# build the array of parameter values to validate
n_estimators = np.arange(50, 550, 50)
print('n_estimators:', n_estimators)

# build a random forest for each parameter value and check its accuracy
train_scores, test_scores = ms.validation_curve(
    model, train_x, train_y,
    param_name='n_estimators',  # name of the parameter to validate
    param_range=n_estimators,   # values of the parameter to validate
    cv=5                        # number of folds
)
# print(test_scores)
train_mean = train_scores.mean(axis=1)  # mean training performance across folds
test_mean = test_scores.mean(axis=1)    # mean test performance across folds

# visualization
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# plt.plot(param_range, test_scores_mean, label="Cross-validation score",
#          color="navy", lw=lw)
# plt.fill_between(param_range, test_scores_mean - test_scores_std,
#                  test_scores_mean + test_scores_std, alpha=0.2,
#                  color="navy", lw=lw)
# plt.legend(loc="best")
# print('plot time bro')
# plt.show()

# LEARNING RATE INIT
param_range = np.arange(0.001, 0.05, 0.001)
train_scores, test_scores = validation_curve(
    MLPClassifier(hidden_layer_sizes=(100, 5), solver='sgd',
                  learning_rate='constant'),
    X, y, param_name="learning_rate_init", param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with MLPClassifier")
plt.xlabel("Learning Rate")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

plt.scatter(X.ravel(), y, color='black')
axis = plt.axis()
for degree in [1, 3, 5]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best');

#%%
from sklearn.model_selection import validation_curve
degree = np.arange(0, 21)
train_score, val_score = validation_curve(
    PolynomialRegression(), X, y,
    param_name='polynomialfeatures__degree', param_range=degree, cv=7)

plt.plot(degree, np.median(train_score, 1), color='blue',
         label='training score')
plt.plot(degree, np.median(val_score, 1), color='red',
         label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

#%%
plt.scatter(X.ravel(), y)
lim = plt.axis()
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test.ravel(), y_test);
plt.axis(lim);
def draw_validation_curve(X, y, clf_type):
    param_range = np.logspace(0, 2, 5)
    if str(clf_type) == str(CLF_TYPE.K_SVM):
        train_scores, test_scores = validation_curve(
            SVC(), X, y, param_name="C", verbose=2, param_range=param_range,
            cv=3, scoring="accuracy", n_jobs=2)
    elif str(clf_type) == str(CLF_TYPE.Logistic):
        train_scores, test_scores = validation_curve(
            LogisticRegression(), X, y, param_name="C", verbose=2,
            param_range=param_range, cv=5, scoring="accuracy", n_jobs=1)
    elif str(clf_type) == str(CLF_TYPE.SVM):
        train_scores, test_scores = validation_curve(
            LinearSVC(), X, y, param_name="C", verbose=2,
            param_range=param_range, cv=5, scoring="accuracy", n_jobs=1)
    else:
        print('clf type not implemented')
        return

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with " + str(clf_type))
    plt.xlabel("C")  # the swept parameter
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    plt.show()
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9, 1.0])
plt.show()
# evidence of slight overfit due to gap between curves

# create validation curves for C (the inverse regularization param of logreg)
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = ms.validation_curve(estimator=PIPE_LR,
                                                X=FEAT_TRAIN, y=LABEL_TRAIN,
                                                param_name='clf__C',
                                                param_range=param_range,
                                                cv=10)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# plot validation curves
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')