def plot(cat,clf,clf2,X_train,X_test,y_train,y_test): # Plot for main category from scipy import sparse # plot learning curve from sklearn.learning_curve import learning_curve import matplotlib.pyplot as plt import sklearn.svm import sklearn.metrics #plot title = cat train_sizes=[0.01,0.1,0.2,0.5,1] plt.figure() plt.title(cat) plt.xlabel("Training examples") plt.ylabel(" F1 Score") train_sizes, train_scores, test_scores = learning_curve(clf, sparse.csr_matrix(np.concatenate([X_train, X_test])),np.concatenate([y_train,y_test]), scoring='f1', cv=5, train_sizes=[0.08,0.1,0.2,0.4,0.6,1]) train_scores_mean = np.mean(train_scores, axis=1) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1,color="y") plt.fill_between(train_sizes, test_scores_mean - test_scores_std,test_scores_mean + test_scores_std, alpha=0.1, color="b") plt.plot(train_sizes, train_scores_mean, 'o', color="y",label="Training score: KNN") plt.plot(train_sizes, test_scores_mean, 'o-', color="b",label="Cross-validation score: KNN") print 'done knn' train_sizes, train_scores, test_scores = learning_curve(clf2, sparse.csr_matrix(np.concatenate([X_train, X_test])),np.concatenate([y_train,y_test]), scoring='f1', cv=5, train_sizes=[0.08,0.1,0.2,0.4,0.6,1]) train_scores_mean = np.mean(train_scores, axis=1) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1,color="r") plt.fill_between(train_sizes, test_scores_mean - test_scores_std,test_scores_mean + test_scores_std, alpha=0.1, color="g") plt.plot(train_sizes, 
train_scores_mean, 'o', color="r",label="Training score: SVM") plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score: SVM") print 'done svm' mini=min(test_scores_mean)-0.1 ylim=(mini, 1.01) plt.ylim(*ylim) plt.grid() plt.legend(loc='lower right',prop={'size':10}) plt.savefig(cat+'_learning_curve.png')
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw train/CV learning curves for `estimator` and return plt.

    Adapted from the scikit-learn example at
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean +/- one standard deviation across the CV folds, per size.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def get_learning_curve(model, x, y, cv=3, train_sizes=None, scoring="log_loss"):
    """Get a dataframe representing the learning curve for a model.

    :param model: a sklearn model
    :param x: the full dataframe of features to pass to the model pipeline
    :param y: the full vector of results
    :param cv: number of cross-validation folds per iteration
    :param train_sizes: training set sizes to sweep; defaults to
        range(50, 400, 25)
    :param scoring: sklearn scoring string
    :returns: a dataframe with 'sizes', 'train_score' and 'cv_score' columns
    """
    if train_sizes is None:
        train_sizes = range(50, 400, 25)
    sizes, train_score, cv_score = learning_curve(
        model, x, y, train_sizes=train_sizes, cv=cv, scoring=scoring)
    # Average the per-fold scores for each training-set size.
    train_score = train_score.mean(axis=1)
    cv_score = cv_score.mean(axis=1)
    return DataFrame(
        [sizes, train_score, cv_score],
        index=["sizes", "train_score", "cv_score"],
    ).transpose()
def plot_learning_curve(self): print " + Plotting learning curve (this will take some time)...", (X_train, y_train) = self._train_data plt.figure() plt.title("Learning curve (%s)" % self._learner) plt.xlabel("Training examples") plt.ylabel("Score") train_sizes, train_scores, test_scores = learning_curve(self._clf[self._learner], X_train, y_train, cv=5) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.grid() plt.fill_between( train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r", ) plt.fill_between( train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g" ) plt.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score") plt.plot(train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score") plt.legend(loc="best") plt.show() print "done."
def plot_learning_curve(estimator, title, X, y, ylim=(0, 1.1), cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), filename=None):
    """Draw train/CV learning curves and persist them via save_plot().

    The output file is named 'learning_curve_<filename>.png'.
    """
    plt.clf()
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    fit_mean = np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.2, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.2, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    # save_plot is defined elsewhere in this project.
    save_plot("learning_curve_" + str(filename) + ".png")
def train(clf, train_sizes, cv, params, features, labels):
    """Compute learning-curve statistics for clf on (features, labels).

    `params` is accepted for interface compatibility but is not used here.
    :returns: (clf, actual_train_sizes, train_scores, test_scores)
    """
    sizes, fit_scores, val_scores = learning_curve(
        clf, features, labels, cv=cv, train_sizes=train_sizes, n_jobs=1)
    return clf, sizes, fit_scores, val_scores
def plot_curve(): # Defining our regression algorithm reg = DecisionTreeRegressor() # Fit our model using X and y reg.fit(X, y) print "Regressor score: {:.4f}".format(reg.score(X,y)) # TODO: Use learning_curve imported above to create learning curves for both the # training data and testing data. You'll need reg, X, y, cv and score from above. # Note: Because i didnt use all the parameters in order of function definition for learning_curve fn, # I have to explicitly assign values to the parameters. e.g, from learning_curve fn, after 'y' # comes 'train_sizes'. But since it is optional and I am not using that parameter, for all other parameters # that come after, i have to explicitly assign values to the parameter (e.g cv=cv, scoring=score) # else error train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv, scoring=score) # Taking the mean of the test and training scores train_scores_mean = np.mean(train_scores,axis=1) test_scores_mean = np.mean(test_scores,axis=1) # Plotting the training curves and the testing curves using train_scores_mean and test_scores_mean plt.plot(train_sizes ,train_scores_mean,'-o',color='b',label="train_scores_mean") plt.plot(train_sizes,test_scores_mean ,'-o',color='r',label="test_scores_mean") # Plot aesthetics plt.ylim(-0.1, 1.1) plt.ylabel("Curve Score") plt.xlabel("Training Points") plt.legend(bbox_to_anchor=(1.1, 1.1)) plt.show()
def experience_curve(self, train_sizes=None, cv=5, ylim=None, scoring="r2"): """ Return matplotlib plt object with learning/experience curve using self.estimator. """ print "params: ", self.regressor.get_params() if not train_sizes: train_sizes = np.linspace(.1, 1.0, 10) plt.figure() plt.title("UCI Energy Output") if ylim is not None: plt.ylim(*ylim) plt.xlabel("Training examples") plt.ylabel("Score") train_sizes, train_scores, test_scores = learning_curve( self.regressor, self.X, self.y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring=scoring) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") plt.legend(loc="best") return plt
def make_graph(self, title="Learning curve", cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot train/CV learning curves for self.model on self.inputs/self.outputs.

    BUG FIX: `title`, `cv`, `n_jobs` and `train_sizes` were read as undefined
    free names, so any call raised NameError; they are now keyword parameters
    with defaults (backward compatible — the zero-argument call previously
    could not succeed at all).

    :param title: chart title
    :param cv: folds or CV splitter passed to learning_curve
    :param n_jobs: parallel jobs for learning_curve
    :param train_sizes: fractions of the training data to evaluate
    :returns: the matplotlib.pyplot module, so callers can show/save it
    """
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        self.model, self.inputs, self.outputs, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # One-std-dev bands around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def __init__(self, master, x_train, y_train, x_test, y_test, evaluator):
    """Build a Tk frame that embeds a learning-curve plot for
    evaluator.pipeline fitted on (x_train, y_train).

    Note: x_test and y_test are accepted but not used in this constructor.
    """
    Tk.Frame.__init__(self, master)
    # 10-fold CV over ten training-set fractions from 10% to 100%.
    train_sizes, train_scores, test_scores = learning_curve(estimator=evaluator.pipeline, X=x_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)
    # Mean and std of the fold scores for each training-set size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Container frame for the embedded matplotlib figure.
    frame_lcurve = Tk.Frame(self)
    frame_lcurve.pack(fill="x", expand=1, padx=15, pady=15)
    figure_lcurve = Figure(figsize=(6, 6), dpi=100)
    subplot_lcurve = figure_lcurve.add_subplot(111)
    # Training accuracy curve with a one-std-dev band.
    subplot_lcurve.plot(train_sizes, train_mean, color="blue", marker='o', markersize=5, label="training accuracy")
    subplot_lcurve.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color="blue")
    # Cross-validation accuracy curve with a one-std-dev band.
    subplot_lcurve.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label="cross-validation accuracy")
    subplot_lcurve.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color="green")
    subplot_lcurve.grid()
    subplot_lcurve.set_xlabel("Number of training samples")
    subplot_lcurve.set_ylabel("Accuracy")
    subplot_lcurve.legend(loc="lower right")
    subplot_lcurve.set_ylim([0.8, 1.0])
    # attach_figure is presumably defined on this class elsewhere — it embeds
    # the Figure into the Tk frame (TODO confirm against the full class).
    self.attach_figure(figure_lcurve, frame_lcurve)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=5, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the learning curve of `estimator` on the given data.

    Parameters
    ----------
    estimator : the classifier to evaluate.
    title : chart title.
    X : input features (numpy array).
    y : target vector.
    ylim : optional (ymin, ymax) tuple fixing the y-axis range.
    cv : number of cross-validation splits — one part is held out for
        validation, the remaining n-1 for training. Defaults to 5, which
        matches the previously hard-coded behaviour.
    train_sizes : fractions of the data evaluated for each curve point.
    """
    plt.figure()
    # BUG FIX: cv was documented as a parameter but ignored (hard-coded to 5
    # in the call below); it is now passed through, with default 5 so
    # existing callers see identical behaviour.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Shade one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid("on")
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()
def plot_learning_curve(model, X, y, scorer, sizes=np.linspace(0.1, 1, 5), cv=None, n_jobs=5, ylim=None, title="Xval. learning curve"):
    '''Plot a cross-validated learning curve for `model` on (X, y), shown as
    error (1 - score) versus training-set fraction.

    Note: `scorer` is accepted for interface compatibility but not used.
    :returns: (df, fig) — per-size statistics and the matplotlib figure.
    '''
    stats = pd.DataFrame()
    sample_counts, fit_scores, val_scores = learning_curve(
        model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=sizes)
    stats['sizes_p'] = sizes            # requested fractions
    stats['sizes_n'] = sample_counts    # actual sample counts used
    stats['train_mean'] = 1 - np.mean(fit_scores, axis=1)
    stats['train_std'] = np.std(fit_scores, axis=1)
    stats['test_mean'] = 1 - np.mean(val_scores, axis=1)
    stats['test_std'] = np.std(val_scores, axis=1)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Size of training set")
    ax.set_ylabel("Error (1-score)")
    ax.grid()
    ax.fill_between(sizes, stats.train_mean - stats.train_std,
                    stats.train_mean + stats.train_std, alpha=0.1, color="r")
    ax.fill_between(sizes, stats.test_mean - stats.test_std,
                    stats.test_mean + stats.test_std, alpha=0.1, color="g")
    ax.plot(sizes, stats.train_mean, 'o-', color="r", label="Training")
    ax.plot(sizes, stats.test_mean, 'o-', color="g", label="Test")
    ax.legend(loc="best")
    fig.show()
    return stats, fig
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot weighted-F1 learning curves, raise the plot window, show it,
    and return the plt module."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='f1_weighted')
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    # Bring the figure window to the foreground (backend-specific call).
    plt.get_current_fig_manager().window.raise_()
    plt.show()
    return plt
def plot_learning_curve(estimator, title, dataset, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot test/training learning curves for `estimator` on `dataset`.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"; cloned per CV fold.
    title : string — title for the chart.
    dataset : object exposing X_train and y_train arrays, which are used
        to compute the curves.
    ylim : tuple (ymin, ymax), optional — y-axis limits.
    cv : integer or cross-validation generator, optional.
    n_jobs : integer, optional — parallel jobs for learning_curve.
    train_sizes : fractions of the training set to evaluate.

    Returns
    -------
    The matplotlib.pyplot module, for further customisation by the caller.
    """
    features = dataset.X_train
    targets = dataset.y_train
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, features, targets, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def plot_learning_curves(df):
    """Plot random-forest learning curves for df (last column = labels) and
    save the figure to ./figures/learning_curves.png."""
    features = df.values[:, :-1]
    labels = df.values[:, -1]
    # Hyper-parameters come from module-level globals.
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterion,
                                 max_features=max_features,
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf,
                                 min_samples_split=min_samples_split,
                                 n_jobs=-1, random_state=42)
    sizes, fit_scores, val_scores = learning_curve(
        clf, features, labels, cv=10, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10))
    # Mean and std deviation of the fold scores per training-set size.
    fit_mean = np.mean(fit_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.figure()
    plt.plot(sizes, fit_mean, 'o-', color='b', label='Train')
    plt.plot(sizes, val_mean, 'o-', color='r', label='Test')
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     color='b', alpha=0.1)
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     color='r', alpha=0.1)
    plt.title("Random Forest Classifier")
    plt.legend(loc='best')
    plt.xlabel("Training Samples")
    plt.ylabel("Score")
    plt.ylim(0.6, 1.01)
    # NOTE(review): the y axis is flipped here after setting ylim —
    # confirm the inversion is intentional.
    plt.gca().invert_yaxis()
    plt.grid()
    plt.draw()
    plt.savefig('./figures/learning_curves.png')
    plt.clf()
def plot_learning_curve(clf, cv, X, y):
    """Plot a deviance learning curve for clf and save it to
    learning_curve.png."""
    fractions = np.array([0.04, 0.1, 0.33, 0.55, 0.78, 1.])
    sizes, fit_scores, val_scores = learning_curve(
        clf, X, y, train_sizes=fractions, cv=cv,
        scoring=my_pipeline_deviance_function, verbose=2, n_jobs=14)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    fig, ax = plt.subplots()
    ax.set_title('Learning curve')
    ax.set_ylim(0, 1.1)
    ax.set_xlabel("Training examples", fontsize=14)
    ax.set_ylabel("Loss (deviance)", fontsize=14)
    ax.grid()
    ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                    alpha=0.1, color="r")
    ax.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                    alpha=0.1, color="g")
    ax.plot(sizes, fit_mean, 'o-', color="r", label="Training")
    ax.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation")
    ax.legend(loc="best")
    plt.savefig('learning_curve.png')
def plot_learning_curve(estimator, title, X, y, ylimit, days_tr, train_sizes):
    """Plot MSE learning curves (train vs. 5-fold CV) and return plt.

    Note: `days_tr` is accepted for interface compatibility but unused here.
    """
    plt.figure(facecolor='w', figsize=(6, 5), frameon="True")
    a, b, axes, label_size = plot_params()
    plt.title(title, size=label_size)
    if ylimit is not None:
        axes.set_ylim(ylimit)
    plt.xlabel("Training Samples", size=label_size)
    plt.ylabel("Mean Squared Error", size=label_size)
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=5, train_sizes=train_sizes,
        scoring='mean_squared_error')
    # Means are negated — presumably because this sklearn scorer reports
    # negated MSE, so the plot shows positive error (TODO confirm version).
    fit_mean = -np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    val_mean = -np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation")
    leg = plt.legend(loc=4, fontsize=label_size, frameon='True')
    leg.get_frame().set_facecolor('w')
    return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Draw train/CV learning curves for `estimator` and return the plt
    module for further customisation."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Per-size mean and std across the CV folds.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, "o-", color="r", label="Training score")
    plt.plot(sizes, val_mean, "o-", color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def plot_learning_curve(estimator, title, X, y, ylim = None, cv = 5, train_sizes = np.linspace(0.1, 1.0, 5)):
    '''
    Plot the learning curve of `estimator` on the given data.

    Parameters
    -------------
    estimator: the classifier to evaluate
    title: chart title
    X: input features (numpy array)
    y: target vector
    ylim: optional (ymin, ymax) tuple fixing the y-axis range
    cv: number of cross-validation splits — one part is held out for
        validation, the remaining n-1 for training. Defaults to 5, which
        matches the previously hard-coded behaviour.
    train_sizes: fractions of the data evaluated for each curve point
    -------------
    '''
    plt.figure()
    # BUG FIX: cv was documented as a parameter but ignored (hard-coded to 5
    # in the call); it is now passed through, with default 5 so existing
    # callers see identical behaviour.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv = cv, n_jobs = 1, train_sizes = train_sizes)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)
    # Shade one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha = 0.1, color = 'r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha = 0.1, color = 'g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color = 'r', label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color = 'g', label = 'Cross-validation score')
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc = 'best')
    plt.grid('on')
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()
def plotLearningCurve(X, y, fileStorePath, fileName):
    """Plot train/CV error curves for an AdaBoost classifier and save them
    to <fileStorePath>/<fileName>.png (the directory is created if missing).

    :param X: feature matrix
    :param y: label vector
    :param fileStorePath: output directory
    :param fileName: output file name, without extension
    """
    print("Inside Plot learning curve")
    train_sizes, train_scores, test_scores = learning_curve(
        AdaBoostClassifier(), X, y, train_sizes=np.linspace(.001, 1.0, 5))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # BUG FIX: the title claimed "ExtraTreesClassifier" but the estimator
    # actually fitted above is AdaBoostClassifier.
    plt.title("Learning Curve with AdaBoostClassifier")
    plt.grid()
    # Curves are drawn as error (1 - score) rather than raw score.
    plt.fill_between(train_sizes, 1 - (train_scores_mean - train_scores_std),
                     1 - (train_scores_mean + train_scores_std),
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes, 1 - (test_scores_mean - test_scores_std),
                     1 - (test_scores_mean + test_scores_std),
                     alpha=0.1, color="g")
    plt.plot(train_sizes, 1 - train_scores_mean, 'o-', color="r",
             label="Training set error")
    plt.plot(train_sizes, 1 - test_scores_mean, 'o-', color="g",
             label="Cross-validation error")
    plt.legend(loc="best")
    plt.ylabel('Error')
    plt.xlabel('Training sample size')
    if not os.path.exists(fileStorePath):
        os.makedirs(fileStorePath)
    plt.savefig(fileStorePath + '/' + fileName + '.png')
    plt.clf()  # clear the figure so the next call starts clean
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot accuracy learning curves (Chinese axis labels) and return plt.

    On Windows (os.name == 'nt') the curve is computed single-threaded;
    elsewhere all cores are used.
    """
    n_jobs = 1 if os.name == 'nt' else -1
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("训练数据量")
    plt.ylabel("准确率")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring='accuracy',
        train_sizes=train_sizes)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="训练集")
    plt.plot(sizes, val_mean, 'o-', color="g", label="测试集(交叉校验)")
    plt.legend(loc="best")
    return plt
def plot_learning_curve(estimator, X, y, train_sizes):
    """Plot log-loss learning curves using a single 70/30 shuffle split and
    show the figure."""
    n_jobs = -1
    cv = cross_validation.ShuffleSplit(len(X), n_iter=1, test_size=0.3)
    plt.figure()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, scoring="log_loss", cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=1)
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
def showLearningCurve(clf, X, y):
    """Plot F1 learning curves for clf using 3-fold stratified CV and show
    the figure."""
    print ('calculate to print learning curve' + str(datetime.now()))
    sizes, fit_scores, val_scores = learning_curve(
        clf, X, y, scoring='f1', cv=StratifiedKFold(y, 3), n_jobs=2)
    plt.figure()
    plt.title('learning curve')
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
def learn_curve_plot(estimator, title, X, y, cv=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    '''Plot training vs. cross-validation score curves for an estimator.

    :param estimator: the model/algorithm to evaluate
    :param title: figure title
    :param X: training feature matrix (numpy array)
    :param y: target vector
    :param cv: folds or CV splitter (sklearn default when None)
    :param train_sizes: fractions of the data to train on
    '''
    plt.figure()
    # learning_curve does the scoring work: one row of fold scores per size.
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes)
    fit_mean = np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color='b')
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color='g')
    plt.plot(sizes, fit_mean, 'o-', color='b', label='training score')
    plt.plot(sizes, val_mean, 'o-', color='g', label='cross valid score')
    plt.xlabel('training examples')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.grid('on')
    plt.title(title)
    plt.show()
def plot_learning_curve(outdir, bdt, x, y):
    """Plot a ROC-AUC learning curve for bdt and save it into outdir as both
    PNG and PDF."""
    logging.info("creating learning curve")
    splitter = ShuffleSplit(len(x), n_iter=100, test_size=1.0 / CV)
    sizes, fit_scores, val_scores = learning_curve(
        bdt, x, y, cv=splitter, n_jobs=NJOBS,
        train_sizes=np.linspace(.1, 1., 7), scoring='roc_auc')
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=.2, color='r')
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=.2, color='g')
    plt.plot(sizes, fit_mean, 'o-', color='r', label='Training score')
    plt.plot(sizes, val_mean, 'o-', color='g', label='Cross-validation score')
    plt.xlabel("Sample size")
    plt.ylabel("Score (ROC area)")
    plt.legend()
    plt.savefig(os.path.join(outdir, 'learning-curve.png'))
    plt.savefig(os.path.join(outdir, 'learning-curve.pdf'))
    plt.close()
def plot_learning_curve(clf, X, y, name=None): try: name = clf.__class__.__name__ if name is None else name print name, X.shape, y.shape ssp = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=5557) train_sizes, train_scores, test_scores = learning_curve(clf, X, y, cv=ssp, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10)) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.title('Learning Curve for {}'.format(name)) plt.grid() plt.ylim(-0.05, 1.05) plt.xlabel('Training examples') plt.ylabel('Score') plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r') plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='g') plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score') plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score') plt.legend(loc='best') plt.savefig(name+'_learningcurve.png') plt.clf() except Exception as e: print(e)
def plot_learning_curve(X, Y):
    """Plot KNN learning curves for (X, Y), save the figure as
    'plot_learning_curve_rf_asis' and show it.

    :param X: feature matrix
    :param Y: label vector
    """
    fractions = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
    # BUG FIX: the original ignored the X parameter and read a module-level
    # X_features global instead; use the argument that was passed in.
    train_sizes, train_scores, test_scores = learning_curve(
        KNeighborsClassifier(), X, Y, train_sizes=fractions, cv=7,
        n_jobs=-1, verbose=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    # BUG FIX: the CV std band was computed with np.mean, which collapsed it
    # onto the mean curve; use np.std like the training band.
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.grid()
    plt.title("Learning Curve with KNN", size=15)
    plt.xlabel("Training Examples", size=15)
    plt.ylabel("Score", size=15)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.ylim(0.9, 1.0)
    plt.plot(train_sizes, train_scores_mean, label="Training Score",
             marker="o", color="r")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2, color="r")
    plt.plot(train_sizes, test_scores_mean, label="Cross-Validation Score",
             marker="o", color="g")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.savefig('plot_learning_curve_rf_asis')
    plt.show()
def plot_curve(): reg = LinearRegression() reg.fit(X,y) print "Regressor score: {:.4f}".format(reg.score(X,y)) # TODO: Use learning_curve imported above to create learning curves from X and y. # You will need to use 'cv_sets' and 'scorer' as parameters in the function. # train_sizes, train_scores, test_scores = (None, None, None) train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv_sets, scoring=scorer) # TODO: Plot the learning curves for both the training scores and testing scores. # Use plt.plot() twice -- one for each score. Be sure to give them labels! # NOTE: Using plt.plot(train_scores) will get you 6 lines when we are looking to # plot just 2(mean scores for training and testing). # You can use np.mean(train_scores, axis =1) to get mean train_scores values. # Similarly you can get the mean for the test_scores. train_scores_mean = np.mean(train_scores, axis=1) plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") test_scores_mean = np.mean(test_scores, axis=1) plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") # Plot aesthetics plt.ylim(-0.1, 1.1) plt.ylabel("Curve Score") plt.xlabel("Training Points") plt.legend(bbox_to_anchor=(1.1, 1.1)) plt.show()
def CurvaAprendizaje(self):
    # Plot weighted-F1 learning curves ("curvas de aprendizaje") for every
    # classifier held by this object, one subplot per classifier.
    if self.CV:
        if not self.fitted:
            self.fit()
        a = self.newClassifiers
    else:
        a = self.clasificadores
    figure(self.datos + ', curvas de aprendizaje')
    i=1
    for c in a:
        # GaussianNB supports partial_fit, so learning_curve can grow the
        # model incrementally instead of refitting from scratch each size.
        if c.__class__.__name__ == 'GaussianNB':
            modo = True
        else:
            modo = False
        numero, training, CV = learning_curve(c, self.X, self.y, train_sizes = linspace(0.1,1.,10), cv = 5, scoring = 'f1_weighted', exploit_incremental_learning = modo )
        subplot(2,3,i)  # 2x3 grid: supports up to six classifiers
        ylim((0,1))
        title(c.__class__.__name__)
        xlabel('# de datos')
        ylabel('F1')
        # Mean +/- std across the 5 CV folds, per training size.
        training_mean, training_std = mean(training, axis=1), std(training, axis=1)
        CV_mean, CV_std = mean(CV, axis=1), std(CV, axis=1)
        grid()
        fill_between(numero, training_mean - training_std, training_mean + training_std, color = 'r', alpha = 0.1)
        fill_between(numero, CV_mean - CV_std, CV_mean + CV_std, color = 'g', alpha = 0.1)
        plot(numero, training_mean, 'o-', color='r', label = 'Training')
        plot(numero, CV_mean, 'o-', color='g', label = 'Cross Validation')
        legend(loc = 4)  # loc=4 -> lower right
        i += 1
    show()
def plot_learning_curve(X_train, y_train):
    """Draw train/validation accuracy learning curves for a scaled
    logistic-regression pipeline evaluated with 10-fold CV."""
    pipe_lr = Pipeline([
        ('scl', StandardScaler()),
        ('clf', LogisticRegression(penalty='l2', random_state=0)),
    ])
    sizes, tr_scores, va_scores = learning_curve(
        estimator=pipe_lr,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
    )
    # Collapse the per-fold scores into mean +/- std per training size.
    tr_mean, tr_std = np.mean(tr_scores, axis=1), np.std(tr_scores, axis=1)
    va_mean, va_std = np.mean(va_scores, axis=1), np.std(va_scores, axis=1)
    plt.plot(sizes, tr_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(sizes, tr_mean + tr_std, tr_mean - tr_std,
                     alpha=0.15, color='blue')
    plt.plot(sizes, va_mean, color='green', linestyle='--', marker='s',
             markersize=5, label='validation accuracy')
    plt.fill_between(sizes, va_mean + va_std, va_mean - va_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.8, 1.0])
    plt.show()
# NOTE(review): fragment cut at both ends — annotates a 4x4 confusion matrix,
# then draws per-model learning-curve subplots.  ax.set_xlabel('Score') looks
# like it should be set_ylabel — confirm against the full script.
ax.xaxis.tick_top()
for i in range(4):
    for j in range(4):
        ax.text(j, i, '{:.2f}'.format(cm[i, j]), size='medium', ha='center', va='center')
cv = ShuffleSplit(n_all, n_iter=100, test_size=0.2, random_state=0)
train_sizes = np.linspace(.1, 1.0, 5)
# One subplot per model, appended after the first models_num axes.
for (name, mdl), ax in zip([(x['title'], x['cl']) for x in models.values()], axes.flat[models_num:]):
    train_sizes, train_scores, test_scores = learning_curve( mdl, x_all, y_all, cv=cv, train_sizes=train_sizes )
    ax.set_xlabel('Score')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    ax.grid()
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
X_train_orig # In[ ]: # Perform pre-processing to determine optimal data set size and tune model parameters from sklearn.svm import SVC svm = SVC(kernel='rbf', C=100.0, gamma=0.1, random_state=0) # Determine optimal training data set size using learning curve methods import matplotlib.pyplot as plt from sklearn.learning_curve import learning_curve train_sizes, train_scores, test_scores = learning_curve( estimator=svm, X=X_train_orig, y=y_train_orig, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') plt.fill_between(train_sizes, train_mean + train_std,
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw the train / cross-validation learning curve for ``estimator``.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"
        Cloned for each validation run.
    title : string
        Chart title.
    X : array-like, shape (n_samples, n_features)
        Training vectors.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Targets for X; None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Y-axis limits for the plot.
    cv : int, cross-validation generator or an iterable, optional
        Cross-validation splitting strategy; None means the default
        3-fold split, an integer selects that many folds, and generator
        or iterable objects are used as-is.
    n_jobs : integer, optional
        Number of parallel jobs (default 1).
    train_sizes : array-like
        Relative sizes of the training subsets to evaluate.

    Returns
    -------
    The ``matplotlib.pyplot`` module so callers can keep customizing.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, train_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, test_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
# _*_ coding:utf-8 _*_ import matplotlib.pyplot as plt import numpy as np print("#---------------------------------------#") print(" learning curve ") print("#---------------------------------------#") print("\n") from sklearn.learning_curve import learning_curve fig,ax=plt.subplots(1,2,figsize=(16,6)) fig.subplots_adjust(left=0.0625,right=0.95,wspace=0.1) for i,degree in enumerate([2,9]): N,train_lc,val_lc=learning_curve() pass
# NOTE(review): fragment cut mid-call at the start; the section after ''' is
# a disabled/commented-out SVM evaluation and the closing ''' is outside this
# chunk — confirm against the full script.
markersize=5,label="test_score")
plt.fill_between(train_sizes,test_mean+test_std,test_mean-test_std,alpha=0.15,color="green")
plt.grid()
plt.title('Learning_curve of Random Forest')
plt.xlabel("train_size")
plt.ylabel("Score")
plt.legend(loc="lower right")
#plt.ylim([0.8,1.0])
plt.show()
'''
#evaluation of SVM
#C=100,gamma=0.001,kernel='linear'
pipe_line = Pipeline([("std",StandardScaler()),
                      ("clf",svm.SVC())])
train_sizes,train_score,test_score = learning_curve(estimator=pipe_line,X=train_x,y=train_y,train_sizes=np.linspace(0.1,1.0,10),cv=10,n_jobs=1)
train_mean = np.mean(train_score,axis=1)
train_std = np.std(train_score,axis=1)
test_mean = np.mean(test_score,axis=1)
test_std = np.std(test_score,axis=1)
plt.plot(train_sizes,train_mean,color="blue",marker="o",markersize=5,label="train_score")
plt.fill_between(train_sizes,train_mean+train_std,train_mean-train_std,alpha=0.15,color="blue")
plt.plot(train_sizes,test_mean,color="green",linestyle="--",marker="s",
         markersize=5,label="test_score")
plt.fill_between(train_sizes,test_mean+test_std,test_mean-test_std,alpha=0.15,color="green")
plt.grid()
plt.title('Learning_curve of SVM')
plt.xlabel("train_size")
plt.ylabel("Score")
print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k + 1, np.bincount(y_train[train]), score)) np.mean(scores) np.std(scores) #=============== visualing accuracy curve=============== import matplotlib.pyplot as plt from sklearn.learning_curve import learning_curve pip_lr = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression(penalty='12', random_state=0))]) train_sizes, train_scores,test_scores=\ learning_curve(estimator=pipe_lr, X=X_train, y= y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='TA') plt.fill_between(train_sizes,
##clf = neighbors.KNeighborsClassifier(n_neighbors=10,weights='distance') #clf = neighbors.KNeighborsClassifier(n_neighbors=10) ##-------------------------------------------------Traning------------------ clf = clf.fit(XtrainPos, YtrainPos) print(metrics.classification_report(YtestPos, clf.predict(XtestPos))) ##--------------------------Crossvalidation 5 times using different split------------------------------ #from sklearn import cross_validation #scores = cross_validation.cross_val_score(clf, XtrainAll, label, cv=3, scoring='f1') #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ####---------------------------------Check for overfeat------------------------------------- train_sample_size, train_scores, test_scores = learning_curve(clf, XtrainAll, label, train_sizes=np.arange(0.1,1,0.1), cv=10) #----------------------------------------Visualization--------------------------------------------- plt.xlabel("# Training sample") plt.ylabel("Accuracy") plt.grid(); mean_train_scores = np.mean(train_scores, axis=1) mean_test_scores = np.mean(test_scores, axis=1) std_train_scores = np.std(train_scores, axis=1) std_test_scores = np.std(test_scores, axis=1) gap = np.abs(mean_test_scores - mean_train_scores) g = plt.figure(1) plt.title("Learning curves for %r\n" "Best test score: %0.2f - Gap: %0.2f" %
# 画决策树图 # dot_data = StringIO() # tree.export_graphviz(clf,out_file=dot_data) # graph=pydot.graph_from_dot_data(dot_data.getvalue()) # graph.write_pdf("D:/luheng/mypython/mytree.pdf") # df["try"]=df["job_exp"]-df["workexp_months"].astype(int) # print df # for x in df["try"].unique(): # print x # print df.describe() # df2=df[["latest_workexp_job_spec","latest_workexp_job_position","workexp","projectexp"]] # df3=df[["industry","position"]] # df3.to_csv("D:/luheng/mypython/HR.txt",index=False,header=False) # 画学习曲线图 train_sizes = np.linspace(0.1, 1.0, 20) train_sizes, train_scores, test_scores = learning_curve( clf, x, y, train_sizes=train_sizes) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.title("Learning Curve with Tree") plt.xlabel("Training examples") plt.ylabel("Score") plt.ylim(0.0, 1.1) plt.grid() plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") plt.fill_between(train_sizes,
def train_ml_model(self):
    """
    Train the ML model: when no pickled model exists, run feature
    selection + grid search, evaluate on the held-out split, plot
    importances/learning curves and pickle the result; otherwise load the
    pickled model and features from disk.

    NOTE(review): ``data``, ``target`` and ``k_fold`` are not defined in
    this method (only in commented-out code), so the learning_curve call
    would raise NameError if ``constants.plot_model_importance`` is set;
    ``df_cc`` in the final return is also undefined here — confirm against
    the full class.

    :return:
    """
    logger.info('#########################################################################')
    logger.info('train_ml_model')
    logger.info('#########################################################################')
    ######################################################
    # Load dataset
    ######################################################
    cols, splits = self.get_data()
    data_train, data_test, target_train, target_test = splits
    # clf = ExtraTreesRegressor(500, n_jobs=constants.ncpu)
    # #clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    # #clf = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
    # data = df_train.as_matrix(columns=cols[1:])  # convert dataframe column to matrix
    # #data = preprocessing.scale(data)
    # target = df_train.as_matrix(columns=[self.var_target]).ravel()  # convert dataframe column to matrix
    # clf.fit(data, target)
    #
    # predict_val = clf.predict(after.as_matrix(columns=cols[1:]))
    # results = compute_stats.ols(predict_val.tolist(), after_target.tolist())
    # print results.rsquared
    # import matplotlib.pyplot as plt
    # plt.scatter(after_target, predict_val)
    # plt.show()
    # pdb.set_trace()
    if not os.path.isfile(self.path_pickle_model):
        # For details in scikit workflow: See http://stackoverflow.com/questions/
        # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
        # TODO Separate out a dataset so that even the grid search cv can be tested
        ############################
        # Select features from model
        ############################
        logger.info('Selecting important features from model')
        # NOTE(review): both branches construct the same regressor — the
        # classify branch presumably wanted a classifier; confirm.
        if self.classify:
            rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
        else:
            rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
        feat_selection = SelectFromModel(rf_feature_imp)
        pipeline = Pipeline([
            ('fs', feat_selection),
            ('clf', self.model),
        ])
        #################################
        # Grid search for best parameters
        #################################
        C_range = np.logspace(-2, 10, 13)
        gamma_range = np.logspace(-9, 3, 13)
        logger.info('Tuning hyperparameters')
        param_grid = {
            'fs__threshold': ['mean', 'median'],
            'fs__estimator__max_features': ['auto', 'log2'],
            'clf__max_features': ['auto', 'log2'],
            'clf__n_estimators': [1000, 2000]
            #'clf__gamma': np.logspace(-9, 3, 13),
            #'clf__C': np.logspace(-2, 10, 13)
        }
        gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
        # Fit the data before getting the best parameter combination. Different data sets will have
        # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
        gs.fit(data_train, target_train)
        logger.info(gs.best_params_)
        data_test = pd.DataFrame(data_test, columns=cols[1:])
        # Update features that should be used in model
        selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
        cols = selected_features[0]
        data_test = data_test[cols]
        # Update model with the best parameters learnt in the previous step
        self.model = gs.best_estimator_.named_steps['clf']
        predict_val = self.model.predict(data_test)
        results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
        print results.rsquared
        print cols
        plt.scatter(target_test, predict_val)
        plt.show()
        pdb.set_trace()
        ###################################################################
        # Output and plot importance of model features, and learning curves
        ###################################################################
        self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))
        if constants.plot_model_importance:
            train_sizes, train_scores, test_scores = learning_curve(self.model, data, target, cv=k_fold, n_jobs=constants.ncpu)
            plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve', ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)
        # Save the model to disk
        logger.info('Saving model and features as pickle on disk')
        with open(self.path_pickle_model, 'wb') as f:
            cPickle.dump(self.model, f)
        with open(self.path_pickle_features, 'wb') as f:
            cPickle.dump(self.vars_features, f)
    else:
        # Read model from pickle on disk
        with open(self.path_pickle_model, 'rb') as f:
            logger.info('Reading model from pickle on disk')
            self.model = cPickle.load(f)
        logger.info('Reading features from pickle on disk')
        self.vars_features = pd.read_pickle(self.path_pickle_features)
    return df_cc
def plot_learn_curve(model, train, label, ylim=None, cv=None, n_jobs=1,
                     train_sizes=np.linspace(.05, 1., 20),
                     title='learning_curve', verbose=0, plot=True):
    """Plot a learning curve and summarise the final train/CV gap.

    :param model: estimator with fit/predict.
    :param train: feature matrix.
    :param label: target vector.
    :param ylim: optional (ymin, ymax) for the plot.
    :param cv: CV splitting strategy forwarded to ``learning_curve``.
    :param n_jobs: number of parallel jobs.
    :param train_sizes: training-set fractions to evaluate.
    :param title: figure title.
    :param verbose: verbosity forwarded to ``learning_curve``.
    :param plot: when True, draw and show the figure.
    :return: (midpoint, diff) — centre and width of the gap between the top
        of the final training band and the bottom of the final CV band.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        model, train, label, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel('data size')
        plt.ylabel('score')
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color='b', label='train set score')
        plt.plot(train_sizes, test_scores_mean, 'o-', color='r', label='cv set score')
        plt.legend(loc='best')
        plt.draw()
        plt.show()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    # BUG FIX: diff previously added test_scores_std to the train mean; the
    # sibling implementations in this file use the train band's own std
    # (train_mean + train_std) for the top of the gap.
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
        test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
def plot_learning_curve(train_dataset, estimator,
                        train_sizes=np.linspace(.1, 1.0, 5), score_attr=None,
                        n_jobs=-1, save=False, display=True, cv=5,
                        filename="learning_curve"):
    """Compute and plot the learning curve of ``estimator`` on a dataset.

    :param train_dataset: object exposing ``.data`` and ``.target``.
    :param estimator: scikit-learn estimator to evaluate.
    :param train_sizes: training-set fractions to evaluate.
    :param score_attr: scoring name forwarded to ``learning_curve``; also
        shown in the y-axis label.
    :param n_jobs: number of parallel jobs.
    :param save: write the figure to ``<filename>_<timestamp>.png``.
    :param display: show the figure interactively.
    :param cv: number of CV folds.
    :param filename: base name for the saved figure.
    """
    if not save and not display:
        return
    logging.info("Calculating learning curve")
    train_sizes, train_scores, valid_scores = learning_curve(
        estimator=estimator,
        X=train_dataset.data,
        y=train_dataset.target,
        train_sizes=train_sizes,
        cv=cv,
        n_jobs=n_jobs,
        scoring=score_attr,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    logging.debug("Plotting learning curve")
    plt.title("Learning curve")
    plt.xlabel("Training examples")
    # BUG FIX: '+' binds tighter than a conditional expression, so the
    # original `score_attr.upper() if score_attr else "" + "Score"` dropped
    # the word "Score" whenever score_attr was given.
    plt.ylabel((score_attr.upper() + " " if score_attr else "") + "Score")
    plt.grid()
    plt.plot(train_sizes, train_scores_mean, 'o-', label="Training score", color="r")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(train_sizes, valid_scores_mean, 'o-', label="Cross-Validation score", color="g")
    plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    if display:
        plt.show()
    if save:
        plt.savefig(filename + "_" + datetime.now().strftime("%Y_%m_%d_%H_%M") + ".png",
                    dpi=400)
    plt.clf()
def learning_curves(filepath, scoring, eu_adr_only=False, total_instances=0):
    """ Plot learning curves of f-score for training and test data """
    features, labels = load_data(eu_adr_only, total_instances)
    # convert from dict into np array
    vec = DictVectorizer()
    data = vec.fit_transform(features).toarray()
    # set up pipeline to normalise the data then build the model
    clf = Pipeline([('normaliser', preprocessing.Normalizer()),
                    ('svm', SVC(kernel='poly', coef0=3, degree=2, gamma=1, cache_size=1000))])
    #('svm', SVC(kernel='linear'))])
    cv = cross_validation.StratifiedKFold(labels, n_folds=10, shuffle=True, random_state=0)
    # why does this always return results in the same pattern??? something fishy is going on
    # think that including 0.9 ends up in downward slope at the end
    sizes, t_scores, v_scores = learning_curve(clf, data, labels, train_sizes=np.linspace( 0.1, 0.9, 8), cv=cv, scoring=scoring, n_jobs=-1)
    # Average the per-fold scores for each training size (equivalent to
    # np.mean(..., axis=1)).
    train_results = np.array( [np.mean(t_scores[i]) for i in range(len(t_scores))])
    valid_results = np.array( [np.mean(v_scores[i]) for i in range(len(v_scores))])
    '''
    # define new set of points to be used to smooth the plots
    x_new = np.linspace(sizes.min(), sizes.max())
    training_smooth = spline(sizes, training_results, x_new)
    validation_smooth = spline(sizes, validation_results, x_new)
    #plt.plot(sizes, validation_results)
    plt.plot(x_new, validation_smooth)
    #plt.plot(sizes, training_results)
    plt.plot(x_new, training_smooth)
    '''
    # instead lets fit a polynomial of degree ? as this should give a better impression!
    valid_coefs = np.polyfit(sizes, valid_results, deg=2)
    train_coefs = np.polyfit(sizes, train_results, deg=2)
    x_new = np.linspace(sizes.min(), sizes.max())
    valid_new = np.polyval(valid_coefs, x_new)
    train_new = np.polyval(train_coefs, x_new)
    # plot the raw points and the fitted curves
    #plt.plot(x_new, train_new)
    #plt.plot(sizes, train_results)
    plt.plot(x_new, valid_new, label='fitted poly degree 2')
    plt.plot(sizes, valid_results, label='raw points')
    # Pull the SVM hyperparameters back out of the pipeline for the title.
    kernel = str(clf.named_steps['svm'].get_params()['kernel'])
    coef = str(clf.named_steps['svm'].get_params()['coef0'])
    degree = str(clf.named_steps['svm'].get_params()['degree'])
    c_error = str(clf.named_steps['svm'].get_params()['C'])
    plt.title('kernel: ' + kernel + ', degree = ' + degree + ', coef = ' + coef + ', C = ' + c_error)
    plt.xlabel('training_instances')
    plt.ylabel('f_score')
    #plt.show()
    plt.savefig(filepath, format='tif')
    plt.clf()
# shuffle data random_idx = permutation(np.arange(len(y))) X = X[random_idx] y = y[random_idx] # create model model = LogisticRegression(C=1) #model = RandomForestClassifier(n_estimators=10) # plot high-dimensional decision boundary db = DBPlot(model) db.fit(X, y, training_indices=0.5) db.plot(plt, generate_testpoints=True) # set generate_testpoints=False to speed up plotting plt.show() # plot learning curves for comparison N = 10 train_sizes, train_scores, test_scores = learning_curve( model, X, y, cv=5, train_sizes=np.linspace(.2, 1.0, N)) plt.errorbar(train_sizes, np.mean(train_scores, axis=1), np.std(train_scores, axis=1) / np.sqrt(N)) plt.errorbar(train_sizes, np.mean(test_scores, axis=1), np.std(test_scores, axis=1) / np.sqrt(N), c='r') plt.legend(["Accuracies on training set", "Accuracies on test set"]) plt.xlabel("Number of data points") plt.title(str(model)) plt.show()
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """Plot the learning curve of ``estimator`` and summarise the final gap.

    Returns (midpoint, diff): the centre and the width of the gap between
    the top of the final training band and the bottom of the final
    cross-validation band.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=verbose)
    tr_mean = np.mean(train_scores, axis=1)
    tr_std = np.std(train_scores, axis=1)
    te_mean = np.mean(test_scores, axis=1)
    te_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"Training Sample Size")
        plt.ylabel(u"Score")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, tr_mean - tr_std, tr_mean + tr_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, te_mean - te_std, te_mean + te_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, tr_mean, 'o-', color="b", label=u"Training Score")
        plt.plot(train_sizes, te_mean, 'o-', color="r", label=u"Cross-Validation Score")
        plt.legend(loc="best")
        plt.draw()
        # Flip the axis back so the displayed figure reads bottom-up again.
        plt.gca().invert_yaxis()
        plt.show()
    midpoint = ((tr_mean[-1] + tr_std[-1]) + (te_mean[-1] - te_std[-1])) / 2
    diff = (tr_mean[-1] + tr_std[-1]) - (te_mean[-1] - te_std[-1])
    return midpoint, diff
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.05, 1., 20)):
    """Render train vs cross-validation score curves for an estimator.

    estimator: object with fit/predict, cloned per validation.
    title: chart title.
    X: array-like (n_samples, n_features) training vectors.
    y: targets for X, shape (n_samples) or (n_samples, n_features);
       None for unsupervised learning.
    ylim: optional (ymin, ymax) plot bounds.
    cv: fold count (default 3) or a cross-validation object from
        sklearn.cross_validation.
    n_jobs: number of parallel jobs (default 1).
    train_sizes: training-set fractions to sample.
    Returns the ``plt`` module for further tweaking.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, tr, te = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    tr_mean, tr_std = np.mean(tr, axis=1), np.std(tr, axis=1)
    te_mean, te_std = np.mean(te, axis=1), np.std(te, axis=1)
    plt.grid()
    plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std, alpha=0.1, color="r")
    plt.fill_between(sizes, te_mean - te_std, te_mean + te_std, alpha=0.1, color="g")
    plt.plot(sizes, tr_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, te_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def draw_learning_curves(estimator, X, y, ylim=None, cv=None, n_jobs=1,
                         scoring=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot train and cross-validation learning curves (adapted from the
    scikit-learn 0.15 examples).

    Args:
        estimator: object implementing "fit" and "predict"; cloned for
            each validation run.
        X: array of shape (n_samples, ...); trailing dims are flattened
            before being handed to ``learning_curve``.
        y: targets of shape (n_samples,) or (n_samples, n_features).
        ylim: optional (ymin, ymax) bounds for the plot.
        cv: fold count (default 3) or a cross-validation object.
        n_jobs: number of parallel jobs (default 1).
        scoring: metric name forwarded to ``learning_curve``.
        train_sizes: relative (fractions in (0, 1]) or absolute
            training-set sizes to evaluate.

    Returns:
        None; the figure is shown interactively.
    """
    plt.close('all')
    # learning_curve expects 2-D input: collapse all trailing axes.
    flat_shape = (X.shape[0], ) + (np.prod(X.shape[1:]), )
    X_flat = X.reshape(flat_shape)
    plt.figure()
    plt.title('Learning Curves', fontsize=20)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples", fontsize=15)
    plt.ylabel("Score", fontsize=15)
    sizes, tr_scores, te_scores = learning_curve(
        estimator, X_flat, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, scoring=scoring)
    tr_mean = np.mean(tr_scores, axis=1)
    tr_std = np.std(tr_scores, axis=1)
    te_mean = np.mean(te_scores, axis=1)
    te_std = np.std(te_scores, axis=1)
    plt.grid()
    plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std,
                     alpha=0.1, color="#f46d43")
    plt.fill_between(sizes, te_mean - te_std, te_mean + te_std,
                     alpha=0.1, color="#1a9641")
    plt.plot(sizes, tr_mean, 'o-', color="#f46d43", linewidth=2,
             label="Training score")
    plt.plot(sizes, te_mean, 'o-', color="#1a9641", linewidth=2,
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Plot the learning curve of the given model on the data.

    Parameters
    ----------
    estimator : the classifier you are using.
    title : title of the chart.
    X : input feature matrix, numpy array.
    y : input target vector.
    ylim : tuple (ymin, ymax) setting the lowest and highest points of the
        y axis in the figure.
    cv : number of folds for cross-validation; one fold is the CV set and
        the remaining n-1 folds are used for training (default 3).
    n_jobs : number of parallel jobs (default 1).
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    # Mean +/- std across folds at each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")
        plt.legend(loc="best")
        plt.draw()
        # Invert again so the displayed axis reads bottom-up.
        plt.gca().invert_yaxis()
        plt.show()
    # Midpoint/width of the gap between the final train band top and the
    # final CV band bottom.
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - ( test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
# Learning curve of an SVC on the digits dataset, scored by (negated) MSE.
from sklearn import datasets
import numpy as np
from sklearn.learning_curve import learning_curve
from sklearn.svm import SVC
import matplotlib.pyplot as plt

digits = datasets.load_digits()
X, y = digits.data, digits.target

train_sizes, train_mse, test_mse = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
# sklearn reports negative MSE for "greater is better"; negate to plot a loss.
train_loss_mean = -np.mean(train_mse, axis=1)
test_loss_mean = -np.mean(test_mse, axis=1)

plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='training')
plt.plot(train_sizes, test_loss_mean, 'o-', color='g', label='Cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()
# NOTE(review): fragment — relies on X, y, y_kr, svr, kr defined earlier in
# the original file.
plt.plot(X, y, 'c*', label='data')
plt.hold('on')
plt.plot( X, y_kr, c='r', label='Kernel Regression' )
##line -plt.plot(X, y_svr, c='r', label='SVR'),plt.scatter(X, y_kr, c='b', label='Kernel Regression')
#plt.plot(X, y_svr, c='r', label='SVR')
plt.xlabel('height')
plt.ylabel('weight')
plt.title('Kernel regression -boy')
plt.legend()
# Visualize learning curves
plt.figure()
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, X, y, train_sizes=np.linspace(0.1, 1, 10), scoring="mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(kr, X, y, train_sizes=np.linspace(0.1, 1, 10), scoring="mean_squared_error", cv=10)
plt.plot(train_sizes, test_scores_svr.mean(1), 'o-', color="r", label="SVR")
plt.plot(train_sizes, test_scores_kr.mean(1), 'o-', color="g", label="Kernel Regression")
plt.yscale("symlog", linthreshy=1e-7)
plt.ylim(-10, -0.01)
plt.xlabel("Training size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")
# Linear SVM clf_lin = svm.SVC(kernel='linear') linear_train_error = [] linear_test_error = [] training_samples = [] # Radial SVM clf_rad = svm.SVC(kernel='rbf') radial_train_error = [] radial_test_error = [] #polinomial SVM clf_poly = svm.SVC(kernel='poly', degree=3) poly_train_error = [] poly_test_error = [] rad_train_sizes, rad_train_scores, rad_valid_scores = learning_curve(clf_rad, X_train, Y_train, cv=5) print('rad') lin_train_sizes, lin_train_scores, lin_valid_scores = learning_curve(clf_lin, X_train, Y_train, cv=5) print('lin') poly_train_sizes, poly_train_scores, poly_valid_scores = learning_curve( clf_poly, X_train, Y_train, cv=2) #print('poly') stop = timeit.default_timer() print("time to run:", stop - start) fig = plt.figure()
# Learning-curve demo: SVC (gamma=0.01) on digits, accuracy vs. training size.
from sklearn.learning_curve import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()
X, y = digits.data, digits.target

train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.01), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# Average the per-fold accuracy for each training-set size.
train_loss_mean = train_loss.mean(axis=1)
test_loss_mean = test_loss.mean(axis=1)

plt.figure()
for mean_curve, line_color, tag in ((train_loss_mean, 'r', 'Training'),
                                    (test_loss_mean, 'g', 'Cross validation')):
    plt.plot(train_sizes, mean_curve, 'o-', color=line_color, label=tag)
plt.xlabel('Training examples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.show()
# Validation-curve plot: mean training accuracy (as a percentage) vs. depth.
# NOTE(review): parameter_grid and train_scores here come from code above
# this chunk (a validation_curve run) — both are redefined below.
plt.figure()
plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
plt.title('Validation curve')
plt.xlabel('Maximum depth of the tree')
plt.ylabel('Accuracy')
plt.show()
# (Translation of the string below) Learning curves help us understand how
# training-set size affects a machine-learning model, which matters when
# solving compute-constrained problems.
''' 学习曲线帮助我们了解训练集的大小如何影响机器学习模型 这对解决计算约束的问题很重要 '''
# Learning curve
from sklearn.learning_curve import learning_curve
classifier = RandomForestClassifier(random_state = 7)
# Absolute training-set sizes to evaluate the learning curve at.
parameter_grid = np.array([200,500,800,1000])
train_sizes,train_scores,validation_scores = learning_curve(classifier,X,y,train_sizes = parameter_grid,cv = 5)
print ("\n##### LEARNING CURVES #####")
print ("\nTraining scores:\n", train_scores)
print ("\nValidation scores:\n", validation_scores)
plt.figure()
plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
plt.title('Learning curve')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.show()
# (Translation of the string below) Small sample sets can appear to classify
# better but are prone to overfitting.  NOTE(review): this triple-quoted
# string is unterminated within this chunk.
''' 小的样本集看似分类精度更好 但容易出现过度拟合的问题
# Compare learning curves for a second-order and a ninth-order polynomial
# model on the original dataset, side by side (notebook cell 41).
from sklearn.learning_curve import learning_curve
import warnings

warnings.filterwarnings("ignore")

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for panel, degree in zip(ax, [2, 9]):
    N, train_lc, val_lc = learning_curve(
        PolynomialRegression(degree), X, y, cv=7,
        train_sizes=np.linspace(0.3, 1, 25))
    panel.plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    panel.plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    # Dashed guide at the level the two curves converge toward.
    panel.hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                 color='gray', linestyle='dashed')
    panel.set_ylim(0, 1)
    panel.set_xlim(N[0], N[-1])
    panel.set_xlabel('training size', fontsize=30)
    panel.set_ylabel('score', fontsize=30)
    panel.set_title('degree = {0}'.format(degree), size=24)
def plot_learning_curve(estimator, title, X, y, ylin=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 30), verbose=0,
                        plot=True):
    """Plot the learning curve of ``estimator`` on (X, y).

    Parameters
    ----------
    estimator : sklearn-style estimator (fit/predict).
    title : str, figure title.
    X, y : training data and labels.
    ylin : optional y-axis limits, passed to ``plt.ylim``.
    cv : number of CV folds; defaults to 10 when None (previous behavior).
    n_jobs, verbose, train_sizes : forwarded to ``learning_curve``.
    plot : when True, render the curve with matplotlib.

    Returns
    -------
    (midpoint, diff) : floats summarizing the final train/test score bounds.
    """
    # BUG FIX: the cv argument was accepted but silently ignored (the call
    # hard-coded cv=10).  Honor the caller's cv, keeping 10 as the default
    # so existing callers see identical behavior.  Also removed two stray
    # debug prints of len(X)/len(y).
    folds = 10 if cv is None else cv
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=folds,
        n_jobs=n_jobs, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylin is not None:
            plt.ylim(ylin)
        plt.xlabel(u'trainset_count')
        plt.ylabel(u'score')
        plt.gca().invert_yaxis()
        plt.grid()
        # One-standard-deviation bands around each mean curve.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color='b')
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color='r')
        plt.plot(train_sizes, train_scores_mean, 'o-', color='b',
                 label=u'scores of trainSet')
        plt.plot(train_sizes, test_scores_mean, 'o-', color='r',
                 label=u'scores of testSet')
        plt.legend(loc='best')
        plt.draw()
        # NOTE(review): this second inversion cancels the first, so the axis
        # is shown in normal orientation — confirm this is intended.
        plt.gca().invert_yaxis()
        plt.show()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] + test_scores_std[-1])) / 2
    diff = ((train_scores_mean[-1] + train_scores_std[-1]) -
            (test_scores_mean[-1] + test_scores_std[-1])) / 2
    return midpoint, diff
# Merge the dev split into the training data, build the off-line model, and
# record its multi-label learning curve to LearningCurve.txt plus a plot.
frames = [train_data, dev]
train_data = pd.concat(frames)
label, feature = featureEng(train_data, indices[98:])
clf2 = off_model_building()

# Map each class name to a set of label ids for multi-label binarization.
# NOTE(review): the sets appear nested (each class implies a subset of the
# next) — confirm the intended encoding with the model's author.
labelList = {'Clap': [1, 2, 3, 4, 5, 6, 7, 8, 9],
             'Hands': [1, 2, 3, 4, 5, 6, 7, 8],
             'Upside': [1, 2, 3, 4, 5, 6, 7],
             'Think': [0, 1, 2, 3, 4, 5, 6],
             'Neutral': [1, 2, 3, 4, 5],
             'Shrug': [1, 2, 3, 4],
             'FacePalm': [3, 2, 1],
             'Cry': [2, 1],
             'Explode': [1],
             'Disappoint': []}
for i in range(0, len(label)):
    label[i] = labelList[label[i]]
print(label)
label = MultiLabelBinarizer().fit_transform(label)

# BUG FIX: train_sizes was wrapped in an extra list
# ([np.linspace(0.1, 1, 5)]), handing learning_curve one 2-D "size" instead
# of five fractions of the training set.
train_size, train_loss, test_loss = learning_curve(
    clf2, feature, label, train_sizes=np.linspace(0.1, 1, 5), cv=5, n_jobs=3)
print('train done')
train_loss_mean = np.mean(train_loss, axis=1)
test_loss_mean = np.mean(test_loss, axis=1)
# Persist the raw curve data for later inspection.
with open('LearningCurve.txt', 'w') as f:
    f.write(str(train_loss_mean))
    f.write(str(test_loss_mean))
    f.write(str(train_size))
plt.figure()
plt.plot(train_size, train_loss_mean, 'o-', color='r', label='Train_Scores')
plt.plot(train_size, test_loss_mean, 'o-', color='g', label="Valid_Scores")
lr = LogisticRegression(random_state=0) gs = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='accuracy', cv=10) gs = gs.fit(X_train_std, y_train) print(gs.best_score_) print(gs.best_params_) # diagnosing bias an dvariance problems with learning curves. lr = LogisticRegression(penalty='l1', C=10.0) train_sizes, train_scores, test_scores = \ learning_curve(estimator=lr, X=X_train_std, y=y_train, train_sizes=np.linspace(0.1, 1.0, 20), cv=10) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue') plt.plot(train_sizes, test_mean, color='red', marker='o', markersize=5, label='training accuracy') plt.fill_between(train_sizes,
# Features are the first four columns of the sheet; the label is the last.
x1 = df[df.columns[:4]]
y1 = df[df.columns[-1:]]
# Convert the DataFrames to plain arrays.
x1 = np.array(x1)
y1 = np.array(y1)
# Flatten the (n, 1) label column to a 1-D vector; fitting fails otherwise.
y1 = [i[0] for i in y1]
y1 = np.array(y1)
# Build a KNN classifier and compute its cross-validated learning curve.
knn = KNeighborsClassifier()
train_sizes, train_loss, test_loss = learning_curve(
    knn, x1, y1, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75])
# BUG FIX: the accuracy scores were negated (a leftover from neg-MSE style
# scoring), which plotted negative "loss" values for an accuracy metric.
train_loss_mean = np.mean(train_loss, axis=1)
test_loss_mean = np.mean(test_loss, axis=1)
plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_loss_mean, 'o-', color='g',
         label='Cross-validation')
# BUG FIX: xlabel had a typo ('Traing') and ylabel said 'Loss' for
# accuracy data.
plt.xlabel("Training examples")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.show()
def plot_learning_curve(self, estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(0.05, 1, 20),
                        verbose=0, plot=True):
    """Draw train/cross-validation learning curves and summarize the gap.

    Returns (midpoint, diff): the midpoint between, and the spread of, the
    pessimistic train/test score bounds at the largest training size.
    """
    sizes, tr_scores, te_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=verbose)
    tr_mean, tr_std = np.mean(tr_scores, axis=1), np.std(tr_scores, axis=1)
    te_mean, te_std = np.mean(te_scores, axis=1), np.std(te_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel('symbol num')
        plt.ylabel('score')
        plt.grid()
        # One-standard-deviation bands around each mean curve.
        plt.fill_between(sizes, te_mean - te_std, te_mean + te_std,
                         alpha=0.1, color='r')
        plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std,
                         alpha=0.1, color='b')
        plt.plot(sizes, tr_mean, 'o-', color='b',
                 label=u'score on train_data')
        plt.plot(sizes, te_mean, 'o-', color="r",
                 label=u'score on cross validation')
        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()
    # Summarize the last point: optimistic train bound vs. pessimistic
    # cross-validation bound.
    upper_train = tr_mean[-1] + tr_std[-1]
    lower_test = te_mean[-1] - te_std[-1]
    midpoint = (upper_train + lower_test) / 2
    diff = upper_train - lower_test
    return midpoint, diff

# plt = PlotUtil()
# plt.plot()
# pyqtgraph.examples.run()
# NOTE(review): the line below is the tail of a call (presumably a
# cross-validation helper producing `scores`) whose head lies above this
# chunk — verify against the surrounding file.
X=X_train, y=y_train, cv=10, n_jobs=4)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
#learning curve plotting
# Standardize features, then fit L2 logistic regression inside a pipeline.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(penalty='l2', random_state=0))])
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
    X=X_train, y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10), #we set train_sizes=np.linspace(0.1, 1.0, 10) to use 10 evenly spaced relative intervals for the training set sizes.
    cv=10, #k=10, set via the cv param
    n_jobs=4)
# Mean and standard deviation of the per-fold scores at each size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
#we add the standard deviation of the average accuracies to the plot using
# Compare Gaussian-process correlation parameterizations (isotropic, ARD,
# factor analysis) by their cross-validated MSE learning curves.
Xtrain = np.random.random((200, 4)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {1: "Isotropic",
          4: "Automatic Relevance Determination",
          8: "Factor Analysis"}
for line_color, (n, tag) in zip(colors, labels.items()):
    model = GaussianProcess(corr='squared_exponential',
                            theta0=[1.0] * n,
                            thetaL=[1e-4] * n,
                            thetaU=[1e2] * n)
    train_sizes, train_scores, test_scores = learning_curve(
        model, Xtrain, ytrain, scoring="mean_squared_error", cv=10, n_jobs=4)
    test_scores = -test_scores  # Scores correspond to negative MSE
    plt.plot(train_sizes, np.mean(test_scores, axis=1),
             label=tag, color=line_color)
    # Shade the min/max envelope across the CV folds.
    plt.fill_between(train_sizes, np.min(test_scores, axis=1),
                     np.max(test_scores, axis=1),
                     alpha=0.2, color=line_color)
plt.legend(loc="best")