Example No. 1
def plot(cat,clf,clf2,X_train,X_test,y_train,y_test):  # Plot for main category
	import numpy as np
	from scipy import sparse
	# plot learning curve
	from sklearn.learning_curve import learning_curve
	import matplotlib.pyplot as plt
	import sklearn.svm
	import sklearn.metrics
	#plot
	plt.figure()

	plt.title(cat)
	
	plt.xlabel("Training examples")
	plt.ylabel(" F1 Score")

	train_sizes, train_scores, test_scores = learning_curve(clf, sparse.csr_matrix(np.concatenate([X_train, X_test])),np.concatenate([y_train,y_test]), scoring='f1', cv=5, train_sizes=[0.08,0.1,0.2,0.4,0.6,1])
	train_scores_mean = np.mean(train_scores, axis=1)
	train_scores_std = np.std(train_scores, axis=1)
	test_scores_mean = np.mean(test_scores, axis=1)
	test_scores_std = np.std(test_scores, axis=1)			

	plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1,color="y")
	plt.fill_between(train_sizes, test_scores_mean - test_scores_std,test_scores_mean + test_scores_std, alpha=0.1, color="b")
	plt.plot(train_sizes, train_scores_mean, 'o', color="y",label="Training score: KNN")
	plt.plot(train_sizes, test_scores_mean, 'o-', color="b",label="Cross-validation score: KNN")
	print 'done knn'


	train_sizes, train_scores, test_scores = learning_curve(clf2, sparse.csr_matrix(np.concatenate([X_train, X_test])),np.concatenate([y_train,y_test]), scoring='f1', cv=5, train_sizes=[0.08,0.1,0.2,0.4,0.6,1])
	train_scores_mean = np.mean(train_scores, axis=1)
	train_scores_std = np.std(train_scores, axis=1)
	test_scores_mean = np.mean(test_scores, axis=1)
	test_scores_std = np.std(test_scores, axis=1)


	plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=0.1,color="r")
	plt.fill_between(train_sizes, test_scores_mean - test_scores_std,test_scores_mean + test_scores_std, alpha=0.1, color="g")
	plt.plot(train_sizes, train_scores_mean, 'o', color="r",label="Training score: SVM")
	plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score: SVM")

	print 'done svm'
	mini=min(test_scores_mean)-0.1
	ylim=(mini, 1.01)

	plt.ylim(*ylim)
	plt.grid()
	plt.legend(loc='lower right',prop={'size':10})

	plt.savefig(cat+'_learning_curve.png')
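A minimal usage sketch for the function above; the category name, the classifiers, and the generated data are hypothetical placeholders, not part of the original snippet:

# Hypothetical usage of plot() above; all names below are placeholders.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
plot('books', KNeighborsClassifier(n_neighbors=5), LinearSVC(),
     X_train, X_test, y_train, y_test)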
Example No. 2
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
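A sketch of a typical call, mirroring the scikit-learn documentation example the docstring links to; the digits dataset is used purely as an illustration:

# Hypothetical usage of plot_learning_curve() above.
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

digits = load_digits()
plot_learning_curve(GaussianNB(), "Learning Curves (Naive Bayes)",
                    digits.data, digits.target, ylim=(0.7, 1.01), cv=10)
plt.show()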
Example No. 3
def get_learning_curve(
        model,
        x,
        y,
        cv=3,
        train_sizes=None,
        scoring="log_loss"):
    """Get a dataframe representing the learning curve for a model

    :param model: a sklearn model
    :type model: object
    :param x: the full dataframe of features to pass to the model pipeline
    :type x: pandas.DataFrame
    :param y: the full vector of results
    :type y: pandas.DataFrame
    :param cv: the number of cross validation folds to make on each iteration
    :param train_sizes: a list of training set sizes to go through
    :returns: a dataframe
    """

    if train_sizes is None:
        train_sizes = range(50, 400, 25)

    sizes, train_score, cv_score = learning_curve(
        model, x, y, train_sizes=train_sizes, cv=cv, scoring=scoring
    )
    train_score = np.apply_along_axis(np.mean, 1, train_score)
    cv_score = np.apply_along_axis(np.mean, 1, cv_score)
    df = DataFrame(
        [sizes, train_score, cv_score],
        index=["sizes", "train_score", "cv_score"]
    ).transpose()
    return df
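A short sketch of how the returned dataframe might be consumed; `model`, `x`, and `y` are assumed to exist. Each row pairs a training-set size with its mean train and CV scores:

# Hypothetical usage of get_learning_curve() above.
curve = get_learning_curve(model, x, y, cv=3, train_sizes=range(50, 400, 25))
print(curve.head())
curve.plot(x="sizes", y=["train_score", "cv_score"], marker="o")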
Example No. 4
    def plot_learning_curve(self):
        print " + Plotting learning curve (this will take some time)...",

        (X_train, y_train) = self._train_data

        plt.figure()
        plt.title("Learning curve (%s)" % self._learner)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(self._clf[self._learner], X_train, y_train, cv=5)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()

        plt.fill_between(
            train_sizes,
            train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std,
            alpha=0.1,
            color="r",
        )
        plt.fill_between(
            train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g"
        )
        plt.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score")
        plt.plot(train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score")

        plt.legend(loc="best")
        plt.show()

        print "done."
Example No. 5
def plot_learning_curve(estimator, title, X, y, ylim=(0, 1.1), cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), 
                        filename=None):
    plt.clf()
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, 
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    save_plot("learning_curve_"+str(filename)+".png")
Example No. 6
def train(clf, train_sizes, cv, params, features, labels):

    train_sizes, train_scores, test_scores = learning_curve(clf, features, labels, cv = cv, train_sizes = train_sizes,n_jobs = 1)
    #plot_learning_curve(clf, title = "Test", X = features,y = labels, cv = cv, train_sizes = train_sizes,n_jobs = 1)
    #clf_string = pickle.dumps(clf)

    return clf, train_sizes, train_scores, test_scores
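A sketch showing how the arrays returned by train() might be averaged over the cross-validation folds and plotted; `clf`, `features`, and `labels` are assumed to exist:

# Hypothetical usage of train() above.
clf, sizes, train_scores, test_scores = train(clf, np.linspace(0.1, 1.0, 5), 5,
                                              None, features, labels)
plt.plot(sizes, np.mean(train_scores, axis=1), 'o-', label='Training score')
plt.plot(sizes, np.mean(test_scores, axis=1), 'o-', label='Cross-validation score')
plt.legend(loc='best')
plt.show()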
Example No. 7
def plot_curve():
    # Defining our regression algorithm
    reg = DecisionTreeRegressor()
    # Fit our model using X and y
    reg.fit(X, y)
    print "Regressor score: {:.4f}".format(reg.score(X,y))

    # TODO: Use learning_curve imported above to create learning curves for both the
    # training data and testing data. You'll need reg, X, y, cv and score from above.
    # Note: Because the arguments are not passed in the order of the function
    #       definition for learning_curve (the optional 'train_sizes' parameter,
    #       which follows 'y', is skipped), every argument after it must be
    #       passed by keyword (e.g. cv=cv, scoring=score), or the call errors.
    train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv, scoring=score)


    # Taking the mean of the test and training scores
    train_scores_mean = np.mean(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores,axis=1)

    # Plotting the training curves and the testing curves using train_scores_mean and test_scores_mean
    plt.plot(train_sizes ,train_scores_mean,'-o',color='b',label="train_scores_mean")
    plt.plot(train_sizes,test_scores_mean ,'-o',color='r',label="test_scores_mean")

    # Plot aesthetics
    plt.ylim(-0.1, 1.1)
    plt.ylabel("Curve Score")
    plt.xlabel("Training Points")
    plt.legend(bbox_to_anchor=(1.1, 1.1))
    plt.show()
Example No. 8
    def experience_curve(self, train_sizes=None, cv=5, ylim=None, scoring="r2"):
        """ Return matplotlib plt object with learning/experience curve using self.estimator. """

        print "params: ", self.regressor.get_params()

        if not train_sizes:
            train_sizes = np.linspace(.1, 1.0, 10)

        plt.figure()
        plt.title("UCI Energy Output")
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(
            self.regressor, self.X, self.y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring=scoring)

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")

        plt.legend(loc="best")

        return plt
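A hypothetical usage sketch: the method assumes an object exposing `regressor`, `X`, and `y` attributes, so `power_model` below is only a placeholder:

# Hypothetical usage of experience_curve() above.
plot = power_model.experience_curve(cv=5, ylim=(0.6, 1.0), scoring="r2")
plot.show()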
Example No. 9
	def make_graph(self, title="Learning curve", cv=None, n_jobs=1,
	               train_sizes=np.linspace(.1, 1.0, 5)):
		plt.figure()
		plt.title(title)
		plt.xlabel("Training examples")
		plt.ylabel("Score")
		train_sizes, train_scores, test_scores = learning_curve(
			self.model, self.inputs, self.outputs, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
		train_scores_mean = np.mean(train_scores, axis=1)
		train_scores_std = np.std(train_scores, axis=1)
		test_scores_mean = np.mean(test_scores, axis=1)
		test_scores_std = np.std(test_scores, axis=1)
		plt.grid()

		plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
										train_scores_mean + train_scores_std, alpha=0.1,
										color="r")
		plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
										test_scores_mean + test_scores_std, alpha=0.1, color="g")
		plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
						label="Training score")
		plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
						label="Cross-validation score")

		plt.legend(loc="best")
		return plt
Example No. 10
    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator):
        Tk.Frame.__init__(self, master)
        train_sizes, train_scores, test_scores = learning_curve(estimator=evaluator.pipeline,
                                                                X=x_train,
                                                                y=y_train,
                                                                train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        frame_lcurve = Tk.Frame(self)
        frame_lcurve.pack(fill="x", expand=1, padx=15, pady=15)
        figure_lcurve = Figure(figsize=(6, 6), dpi=100)
        subplot_lcurve = figure_lcurve.add_subplot(111)
        subplot_lcurve.plot(train_sizes, train_mean, color="blue", marker='o', markersize=5,
                            label="training accuracy")
        subplot_lcurve.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15,
                                    color="blue")
        subplot_lcurve.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5,
                            label="cross-validation accuracy")
        subplot_lcurve.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15,
                                    color="green")
        subplot_lcurve.grid()
        subplot_lcurve.set_xlabel("Number of training samples")
        subplot_lcurve.set_ylabel("Accuracy")
        subplot_lcurve.legend(loc="lower right")
        subplot_lcurve.set_ylim([0.8, 1.0])
        self.attach_figure(figure_lcurve, frame_lcurve)
Example No. 11
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, train_sizes=np.linspace(.1, 1.0, 5)): 
	""" 画出data在某模型上的learning curve. 参数解释 ---------- 
	estimator : 你用的分类器。 
	title : 表格的标题。 
	X : 输入的feature,numpy类型 y : 输入的target vector 
	ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点 
	cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份) 
	""" 
	plt.figure() 
	train_sizes, train_scores, test_scores = learning_curve( estimator, X, y, cv=5, n_jobs=1, train_sizes=train_sizes) 
	train_scores_mean = np.mean(train_scores, axis=1) 
	train_scores_std = np.std(train_scores, axis=1) 
	test_scores_mean = np.mean(test_scores, axis=1) 
	test_scores_std = np.std(test_scores, axis=1) 
	# fill_between shades the band between the two curves (mean ± std)
	plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") 
	plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") 
	plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") 
	plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") 
	plt.xlabel("Training examples") 
	plt.ylabel("Score") 
	plt.legend(loc="best") 
	plt.grid("on") 
	if ylim: 
		plt.ylim(ylim) 
	plt.title(title) 
	plt.show() 
Example No. 12
def plot_learning_curve(model, X, y, scorer, sizes=np.linspace(0.1, 1, 5), cv=None, n_jobs=5, ylim=None, title="Xval. learning curve"):
    ''' Plot learning curve for model on data '''

    df = pd.DataFrame()
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=sizes, scoring=scorer)
    df['sizes_p'] = sizes
    df['sizes_n'] = train_sizes
    df['train_mean'] = 1 - np.mean(train_scores, axis=1)
    df['train_std'] = np.std(train_scores, axis=1)
    df['test_mean'] = 1 - np.mean(test_scores, axis=1)
    df['test_std'] = np.std(test_scores, axis=1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Size of training set")
    ax.set_ylabel("Error (1-score)")
    ax.grid()
    ax.fill_between(sizes, df.train_mean - df.train_std, df.train_mean + df.train_std, alpha=0.1, color="r")
    ax.fill_between(sizes, df.test_mean - df.test_std, df.test_mean + df.test_std, alpha=0.1, color="g")
    ax.plot(sizes, df.train_mean, 'o-', color="r", label="Training")
    ax.plot(sizes, df.test_mean, 'o-', color="g", label="Test")
    ax.legend(loc="best")
    fig.show()
    return df, fig
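A sketch of a call with an explicit scorer; the classifier and data are placeholders. Note the y-axis plots error (1 - score), so lower is better:

# Hypothetical usage of plot_learning_curve() above; X, y assumed to exist.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score

df, fig = plot_learning_curve(RandomForestClassifier(n_estimators=100), X, y,
                              make_scorer(accuracy_score), cv=5, ylim=(0, 0.5))
print(df)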
Example No. 13
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    # To keep a reference to the figure object, use `figure = plt.figure()`
    # and refer to `figure` subsequently.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='f1_weighted')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.get_current_fig_manager().window.raise_()

    plt.show()
    return plt
Example No. 14
def plot_learning_curve(estimator, title, dataset, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    dataset : object
        Dataset wrapper; its ``X_train`` (shape (n_samples, n_features)) and
        ``y_train`` attributes are used as the training vector X and target y.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    ds = dataset
    X = ds.X_train
    y = ds.y_train
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
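A sketch of the `dataset` argument this variant expects: any object with `X_train` and `y_train` attributes works, so the wrapper below is hypothetical:

# Hypothetical usage of plot_learning_curve() above; X, y assumed to exist.
from sklearn.tree import DecisionTreeClassifier

class Dataset(object):
    def __init__(self, X_train, y_train):
        self.X_train, self.y_train = X_train, y_train

plot_learning_curve(DecisionTreeClassifier(), "Decision tree", Dataset(X, y), cv=5)
plt.show()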
Example No. 15
def plot_learning_curves(df):
    # get learning curves
    X = df.values[:, :-1]
    y = df.values[:, -1]
    clf = RandomForestClassifier(n_estimators=n_estimators,
        criterion=criterion, max_features=max_features,
        max_leaf_nodes=max_leaf_nodes, min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split, n_jobs=-1, random_state=42)
    train_sizes, train_scores, test_scores = learning_curve(clf, X, y, cv=10,
        n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))

    # get mean and std deviation
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # plot learning curves
    plt.figure()
    plt.plot(train_sizes, train_scores_mean, 'o-', color='b', label='Train')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='r', label='Test')
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std, color='b', alpha=0.1)
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std, color='r', alpha=0.1)
    plt.title("Random Forest Classifier")
    plt.legend(loc='best')
    plt.xlabel("Training Samples")
    plt.ylabel("Score")
    plt.ylim(0.6, 1.01)
    plt.gca().invert_yaxis()
    plt.grid()
    plt.draw()
    plt.savefig('./figures/learning_curves.png')
    plt.clf()
Example No. 16
def plot_learning_curve(clf, cv, X, y):

    train_sizes, train_scores, valid_scores = learning_curve(
            clf, 
            X, y,
            train_sizes = np.array([ 0.04, 0.1, 0.33, 0.55, 0.78, 1. ]), 
            cv = cv, scoring = my_pipeline_deviance_function, verbose = 2, n_jobs=14)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    fig, ax = plt.subplots()
    ax.set_title('Learning curve')
    ax.set_ylim(0, 1.1)
    ax.set_xlabel("Training examples", fontsize = 14)
    ax.set_ylabel("Loss (deviance)", fontsize = 14)
    ax.grid()
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                    valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training")
    ax.plot(train_sizes, valid_scores_mean, 'o-', color="g",
                 label="Cross-validation")
    ax.legend(loc="best")
    plt.savefig('learning_curve.png')
Example No. 17
def plot_learning_curve(estimator, title, X, y, ylimit, days_tr, train_sizes):
    plt.figure(facecolor='w', figsize = (6,5), frameon = True)
    a, b, axes, label_size = plot_params()
    plt.title(title, size = label_size)
    if ylimit is not None:
        axes.set_ylim(ylimit)
    plt.xlabel("Training Samples", size = label_size)
    plt.ylabel("Mean Squared Error", size = label_size)
    train_sizes, train_scores, valid_scores = learning_curve(estimator, X, y, cv = 5, train_sizes = train_sizes, scoring = 'mean_squared_error')
    train_scores_mean = -np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = -np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    #plt.grid(b=True, which='major', color='#696969', linestyle=':')
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
        alpha=0.1, color="r")
    plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std,
        alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training")
    plt.plot(train_sizes, valid_scores_mean, 'o-', color="g", label="Cross-validation")

    leg = plt.legend(loc= 4, fontsize = label_size, frameon = True)
    leg.get_frame().set_facecolor('w')
    #fig.savefig('learning_curve.png', bbox_inches= 'tight')
    return plt
Example No. 18
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(
        train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r"
    )
    plt.fill_between(
        train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g"
    )
    plt.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score")

    plt.legend(loc="best")
    return plt
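A sketch of a call that passes a preprocessing pipeline as the estimator, which keeps the scaler fitted inside each cross-validation fold; names are placeholders:

# Hypothetical usage of plot_learning_curve() above; X, y assumed to exist.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression())])
plot_learning_curve(pipe, "Logistic regression", X, y, cv=10)
plt.show()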
Example No. 19
def plot_learning_curve(estimator, title, X, y, ylim = None, cv = None, train_sizes = np.linspace(0.1, 1.0, 5)):

    '''
    Plot the learning curve of the data on a given model.
    Parameters
    -------------
    estimator: the classifier you are using
    title: the title of the chart
    X: input features (numpy array)
    y: input target vector
    ylim: tuple (ymin, ymax) setting the lowest and highest y-axis values
    cv: number of folds the data is split into for cross-validation; one fold
        serves as the cv set and the remaining n-1 as training (default 3)
    -------------
    '''

    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv = cv, n_jobs = 1, train_sizes = train_sizes)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha = 0.1, color = 'r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha = 0.1, color = 'g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color = 'r', label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color = 'g', label = 'Cross-validation score')

    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc = 'best')
    plt.grid('on')
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()
Example No. 20
def plotLearningCurve(X,y,fileStorePath,fileName):

	print("Inside Plot learning curve")

	#train_sizes, train_scores, test_scores =  learning_curve(ExtraTreesClassifier(n_estimators=10, 
	#	max_depth=None, min_samples_split=1,random_state=0), X, y,train_sizes = np.linspace(.001, 1.0, 5))
	train_sizes, train_scores, test_scores =  learning_curve(AdaBoostClassifier(), X, y,train_sizes = np.linspace(.001, 1.0, 5))
	
	train_scores_mean = np.mean(train_scores, axis=1)
	train_scores_std = np.std(train_scores, axis=1)
	test_scores_mean = np.mean(test_scores, axis=1)
	test_scores_std = np.std(test_scores, axis=1)

	plt.title("Learning Curve with ExtraTreesClassifier")
	plt.grid()
	plt.fill_between(train_sizes,1-(train_scores_mean - train_scores_std),1-(train_scores_mean + train_scores_std), alpha=0.1,color="r")
	plt.fill_between(train_sizes,1-(test_scores_mean - test_scores_std),1-(test_scores_mean + test_scores_std), alpha=0.1, color="g")
	plt.plot(train_sizes, 1-train_scores_mean, 'o-', color="r",label="Training set error")
	plt.plot(train_sizes, 1-test_scores_mean, 'o-', color="g",label="Cross-validation error")

	plt.legend(loc="best")
	plt.ylabel('Error')
	plt.xlabel('Training sample size')

	if not os.path.exists(fileStorePath):
		os.makedirs(fileStorePath)

	plt.savefig(fileStorePath+'/'+fileName+'.png')
	plt.clf()#clear the figure for next loop
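A minimal usage sketch; the feature matrix, labels, and output path below are placeholders:

# Hypothetical usage of plotLearningCurve() above; X, y assumed to exist.
plotLearningCurve(X, y, './plots', 'adaboost_learning_curve')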
Example No. 21
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    if os.name == 'nt':
        n_jobs = 1
    else:
        n_jobs = -1
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("训练数据量")
    plt.ylabel("准确率")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        scoring='accuracy', train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="训练集")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="测试集(交叉校验)")

    plt.legend(loc="best")
    return plt
Example No. 22
def plot_learning_curve(estimator, X, y,train_sizes):

    n_jobs = -1

    # cv=3
    cv = cross_validation.ShuffleSplit(len(X), n_iter=1, test_size=0.3)

    plt.figure()

    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,scoring="log_loss", cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,verbose=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")

    plt.show()
Example No. 23
def showLearningCurve(clf, X, y):
    print ('computing learning curve ' + str(datetime.now()))
    train_sizes, train_scores, valid_scores = learning_curve(clf, X, y, 
                                                            scoring = 'f1',
                                                            cv = StratifiedKFold(y, 3),
                                                            n_jobs = 2)
    plt.figure()
    plt.title('learning curve')
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, valid_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
Example No. 24
def learn_curve_plot(estimator,title,X,y,cv=None,train_sizes=np.linspace(0.1,1.0,5)):
    '''
    :param estimator: the model/algorithm you choose
    :param title: plot title
    :param X: training data, as a numpy array
    :param y: target vector
    :param cv: cross-validation strategy (number of folds or a generator)
    :param train_sizes: fractions of the training set to evaluate
    :return: the figure
    '''
    plt.figure()

    train_sizes,train_scores,test_scores=\
        learning_curve(estimator,X,y,cv=cv,train_sizes=train_sizes)
    # learning_curve does the key scoring work here
    train_scores_mean=np.mean(train_scores,axis=1)
    train_scores_std=np.std(train_scores,axis=1)
    test_scores_mean=np.mean(test_scores,axis=1)
    test_scores_std=np.std(test_scores,axis=1)

    plt.fill_between(train_sizes,train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,alpha=0.1,color='b')
    plt.fill_between(train_sizes,test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,alpha=0.1,color='g')
    plt.plot(train_sizes,train_scores_mean,'o-',color='b',label='training score')
    plt.plot(train_sizes,test_scores_mean,'o-',color='g',label='cross valid score')
    plt.xlabel('training examples')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.grid('on')
    plt.title(title)
    plt.show()
Example No. 25
def plot_learning_curve(outdir, bdt, x, y):
    logging.info("creating learning curve")
    train_sizes, train_scores, test_scores = learning_curve(bdt, x, y,
                                                            cv=ShuffleSplit(len(x),
                                                                            n_iter=100,
                                                                            test_size=1.0 / CV),
                                                            n_jobs=NJOBS,
                                                            train_sizes=np.linspace(.1, 1., 7),
                                                            scoring='roc_auc')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=.2, color='r')
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=.2, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')

    plt.xlabel("Sample size")
    plt.ylabel("Score (ROC area)")

    plt.legend()
    plt.savefig(os.path.join(outdir, 'learning-curve.png'))
    plt.savefig(os.path.join(outdir, 'learning-curve.pdf'))
    plt.close()
Example No. 26
def plot_learning_curve(clf, X, y, name=None):
    try:
        name = clf.__class__.__name__ if name is None else name
        print name, X.shape, y.shape
        ssp = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=5557)
        train_sizes, train_scores, test_scores = learning_curve(clf, X, y, cv=ssp,
                n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.title('Learning Curve for {}'.format(name))
        plt.grid()
        plt.ylim(-0.05, 1.05)
        plt.xlabel('Training examples')
        plt.ylabel('Score')

        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color='r')
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color='g')
        plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
        plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')

        plt.legend(loc='best')
        plt.savefig(name+'_learningcurve.png')
        plt.clf()
    except Exception as e:
        print(e)
Example No. 27
def plot_learning_curve(X, Y):
    train_sizes = np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
    train_sizes, train_scores, test_scores = learning_curve(
                                                KNeighborsClassifier(),
                                                 X, Y, train_sizes = train_sizes,
                                                 cv = 7, n_jobs = -1, verbose = True)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)
    
    plt.figure()    
    plt.grid()
    plt.title("Learning Curve with KNN", size = 15)
    plt.xlabel("Training Examples", size = 15)
    plt.ylabel("Score", size = 15)
    plt.xticks(size = 12)
    plt.yticks(size = 12)
    plt.ylim(0.9,1.0)
    plt.plot(train_sizes, train_scores_mean, label = "Training Score", marker = "o", color = "r")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 
                     train_scores_mean + train_scores_std, alpha = 0.2, color = "r")
    plt.plot(train_sizes, test_scores_mean, label = "Cross-Validation Score", marker = "o", color = "g")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 
                     test_scores_mean + test_scores_std, alpha = 0.2, color = "g")
    plt.legend(loc = "best")
    plt.savefig('plot_learning_curve_rf_asis')
    plt.show()
Example No. 28
def plot_curve():
    reg = LinearRegression()
    reg.fit(X,y)
    print "Regressor score: {:.4f}".format(reg.score(X,y))
    
    # TODO: Use learning_curve imported above to create learning curves from X and y.
    # You will need to use 'cv_sets' and 'scorer' as parameters in the function.
    
    # train_sizes, train_scores, test_scores = (None, None, None)
    train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv_sets, scoring=scorer)

    # TODO: Plot the learning curves for both the training scores and testing scores.
    #       Use plt.plot() twice -- one for each score. Be sure to give them labels!
    # NOTE: Using plt.plot(train_scores) will get you 6 lines when we are looking to 
    # plot just 2(mean scores for training and testing). 
    # You can use np.mean(train_scores, axis =1) to get mean train_scores values. 
    # Similarly you can get the mean for the test_scores.
    train_scores_mean = np.mean(train_scores, axis=1)
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
    # Plot aesthetics
    plt.ylim(-0.1, 1.1)
    plt.ylabel("Curve Score")
    plt.xlabel("Training Points")
    plt.legend(bbox_to_anchor=(1.1, 1.1))
    plt.show()
Example No. 29
 def CurvaAprendizaje(self):
     if self.CV:
         if not self.fitted: self.fit()
         a = self.newClassifiers
     else:
         a = self.clasificadores
     figure(self.datos + ', learning curves')
     i=1
     for c in a:
         if c.__class__.__name__ == 'GaussianNB': 
             modo = True
         else:
             modo = False
         numero, training, CV = learning_curve(c, self.X, self.y, 
                                                 train_sizes = linspace(0.1,1.,10), 
                                                 cv = 5, 
                                                 scoring = 'f1_weighted',
                                                 exploit_incremental_learning = modo
                                                 )
         subplot(2,3,i)
         ylim((0,1))
         title(c.__class__.__name__)
         xlabel('# of samples')
         ylabel('F1')
         training_mean, training_std = mean(training, axis=1), std(training, axis=1)
         CV_mean, CV_std = mean(CV, axis=1), std(CV, axis=1)
         grid()
         fill_between(numero, training_mean - training_std, training_mean + training_std, color = 'r', alpha = 0.1)
         fill_between(numero, CV_mean - CV_std, CV_mean + CV_std, color = 'g', alpha = 0.1)
         plot(numero, training_mean, 'o-', color='r', label = 'Training')
         plot(numero, CV_mean, 'o-', color='g', label = 'Cross Validation')
         legend(loc = 4)
         i += 1
     show()
Example No. 30
def plot_learning_curve(X_train, y_train):
    pipe_lr = Pipeline([
        ('scl', StandardScaler()),
        ('clf', LogisticRegression(penalty='l2', random_state=0)),
    ])

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=pipe_lr,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
    )

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(
        train_sizes,
        train_mean,
        color='blue',
        marker='o',
        markersize=5,
        label='training accuracy',
    )
    plt.fill_between(
        train_sizes,
        train_mean + train_std,
        train_mean - train_std,
        alpha=0.15,
        color='blue',
    )

    plt.plot(
        train_sizes,
        test_mean,
        color='green',
        linestyle='--',
        marker='s',
        markersize=5,
        label='validation accuracy',
    )
    plt.fill_between(
        train_sizes,
        test_mean + test_std,
        test_mean - test_std,
        alpha=0.15,
        color='green',
    )

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.8, 1.0])

    plt.show()
Example No. 31
            ax.xaxis.tick_top()

            for i in range(4):
                for j in range(4):
                    ax.text(j, i, '{:.2f}'.format(cm[i, j]),
                            size='medium',
                            ha='center', va='center')

        cv = ShuffleSplit(n_all, n_iter=100, test_size=0.2, random_state=0)
        train_sizes = np.linspace(.1, 1.0, 5)

        for (name, mdl), ax in zip([(x['title'], x['cl'])
                                    for x in models.values()],
                                   axes.flat[models_num:]):
            train_sizes, train_scores, test_scores = learning_curve(
                mdl, x_all, y_all, cv=cv, train_sizes=train_sizes
            )
            ax.set_xlabel('Score')

            train_scores_mean = np.mean(train_scores, axis=1)
            train_scores_std = np.std(train_scores, axis=1)
            test_scores_mean = np.mean(test_scores, axis=1)
            test_scores_std = np.std(test_scores, axis=1)
            ax.grid()

            ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                            train_scores_mean + train_scores_std, alpha=0.1,
                            color="r")
            ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                            test_scores_mean + test_scores_std, alpha=0.1, color="g")
            ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
Example No. 32

# Perform pre-processing to determine optimal data set size and tune model parameters
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=100.0, gamma=0.1, random_state=0)

# Determine optimal training data set size using learning curve methods
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    estimator=svm,
    X=X_train_orig,
    y=y_train_orig,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes,
         train_mean,
         color='blue',
         marker='o',
         markersize=5,
         label='training accuracy')
plt.fill_between(train_sizes,
                 train_mean + train_std,
Example No. 33
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
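A sketch of a typical call following the scikit-learn documentation this example derives from, using a shuffled cross-validation strategy on the digits data as an illustration:

# Hypothetical usage of plot_learning_curve() above.
from sklearn.cross_validation import ShuffleSplit
from sklearn.datasets import load_digits
from sklearn.svm import SVC

digits = load_digits()
X, y = digits.data, digits.target
cv = ShuffleSplit(len(y), n_iter=10, test_size=0.2, random_state=0)
plot_learning_curve(SVC(gamma=0.001), "Learning Curves (SVM, RBF kernel)",
                    X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
plt.show()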
Example No. 34
# _*_ coding:utf-8 _*_

import matplotlib.pyplot as plt
import numpy as np

print("#---------------------------------------#")
print("             learning curve              ")
print("#---------------------------------------#")
print("\n")

from sklearn.learning_curve import learning_curve

fig,ax=plt.subplots(1,2,figsize=(16,6))
fig.subplots_adjust(left=0.0625,right=0.95,wspace=0.1)

for i,degree in enumerate([2,9]):
	# the empty call in the original could not run; a model, data and train
	# sizes are required (PolynomialRegression, X and y assumed defined elsewhere)
	N,train_lc,val_lc=learning_curve(PolynomialRegression(degree),X,y,
	                                 cv=7,train_sizes=np.linspace(0.3,1,25))
	ax[i].plot(N,np.mean(train_lc,1),label='training score')
	ax[i].plot(N,np.mean(val_lc,1),label='validation score')
	ax[i].legend(loc='best')
Example No. 35
             markersize=5,label="test_score")  
    plt.fill_between(train_sizes,test_mean+test_std,test_mean-test_std,alpha=0.15,color="green")  
    plt.grid()  
    plt.title('Learning_curve of Random Forest')
    plt.xlabel("train_size")  
    plt.ylabel("Score")  
    plt.legend(loc="lower right")  
    #plt.ylim([0.8,1.0])  
    plt.show()  
    '''
    
    #evaluation of SVM
    #C=100,gamma=0.001,kernel='linear'
    pipe_line = Pipeline([("std",StandardScaler()),  
                      ("clf",svm.SVC())])  
    train_sizes,train_score,test_score = learning_curve(estimator=pipe_line,X=train_x,y=train_y,train_sizes=np.linspace(0.1,1.0,10),cv=10,n_jobs=1)  

    train_mean = np.mean(train_score,axis=1)  
  
    train_std = np.std(train_score,axis=1)  
    test_mean = np.mean(test_score,axis=1)  
    test_std = np.std(test_score,axis=1)  
    plt.plot(train_sizes,train_mean,color="blue",marker="o",markersize=5,label="train_score")  
    plt.fill_between(train_sizes,train_mean+train_std,train_mean-train_std,alpha=0.15,color="blue")  
    plt.plot(train_sizes,test_mean,color="green",linestyle="--",marker="s",  
             markersize=5,label="test_score")  
    plt.fill_between(train_sizes,test_mean+test_std,test_mean-test_std,alpha=0.15,color="green")  
    plt.grid()  
    plt.title('Learning_curve of SVM')
    plt.xlabel("train_size")  
    plt.ylabel("Score")  
Example No. 36
    print('Fold: %s, Class dist.: %s, Acc: %.3f' %
          (k + 1, np.bincount(y_train[train]), score))

np.mean(scores)
np.std(scores)

#=============== visualizing accuracy curve===============
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(penalty='l2', random_state=0))])

train_sizes, train_scores,test_scores=\
    learning_curve(estimator=pipe_lr,
                   X=X_train,
                   y= y_train,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes,
         train_mean,
         color='blue',
         marker='o',
         markersize=5,
         label='TA')

plt.fill_between(train_sizes,
Example No. 37
##clf = neighbors.KNeighborsClassifier(n_neighbors=10,weights='distance')
#clf = neighbors.KNeighborsClassifier(n_neighbors=10)


##-------------------------------------------------Training------------------
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

##--------------------------Cross-validation 5 times using different splits------------------------------
#from sklearn import cross_validation
#scores = cross_validation.cross_val_score(clf, XtrainAll, label, cv=3, scoring='f1')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

####---------------------------------Check for overfitting-------------------------------------
train_sample_size, train_scores, test_scores = learning_curve(clf,
                                                              XtrainAll, label, 
                                                              train_sizes=np.arange(0.1,1,0.1), cv=10)

#----------------------------------------Visualization---------------------------------------------
plt.xlabel("# Training sample")
plt.ylabel("Accuracy")
plt.grid();
mean_train_scores = np.mean(train_scores, axis=1)
mean_test_scores = np.mean(test_scores, axis=1)
std_train_scores = np.std(train_scores, axis=1)
std_test_scores = np.std(test_scores, axis=1)

gap = np.abs(mean_test_scores - mean_train_scores)
g = plt.figure(1)
plt.title("Learning curves for %r\n"
             "Best test score: %0.2f - Gap: %0.2f" %
Example No. 38
# Plot the decision tree
# dot_data = StringIO()
# tree.export_graphviz(clf,out_file=dot_data)
# graph=pydot.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("D:/luheng/mypython/mytree.pdf")
# df["try"]=df["job_exp"]-df["workexp_months"].astype(int)
# print df
# for x in  df["try"].unique():
#     print x
# print df.describe()
# df2=df[["latest_workexp_job_spec","latest_workexp_job_position","workexp","projectexp"]]
# df3=df[["industry","position"]]
# df3.to_csv("D:/luheng/mypython/HR.txt",index=False,header=False)
# Plot the learning curve
train_sizes = np.linspace(0.1, 1.0, 20)
train_sizes, train_scores, test_scores = learning_curve(
    clf, x, y, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Learning Curve with Tree")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.grid()
plt.fill_between(train_sizes,
                 train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std,
                 alpha=0.1,
                 color="r")
plt.fill_between(train_sizes,
Example No. 39
    def train_ml_model(self):
        """

        :return:
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        # clf =  ExtraTreesRegressor(500, n_jobs=constants.ncpu)
        # #clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        # #clf = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
        # data = df_train.as_matrix(columns=cols[1:])  # convert dataframe column to matrix
        # #data = preprocessing.scale(data)
        # target = df_train.as_matrix(columns=[self.var_target]).ravel()  # convert dataframe column to matrix
        # clf.fit(data, target)
        #
        # predict_val = clf.predict(after.as_matrix(columns=cols[1:]))
        # results = compute_stats.ols(predict_val.tolist(), after_target.tolist())
        # print results.rsquared
        # import matplotlib.pyplot as plt
        # plt.scatter(after_target, predict_val)
        # plt.show()
        # pdb.set_trace()
        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            if self.classify:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            else:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            feat_selection = SelectFromModel(rf_feature_imp)

            pipeline = Pipeline([
                      ('fs', feat_selection),
                      ('clf', self.model),
                    ])

            #################################
            # Grid search for best parameters
            #################################
            C_range = np.logspace(-2, 10, 13)
            gamma_range = np.logspace(-9, 3, 13)
            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
                #'clf__gamma': np.logspace(-9, 3, 13),
                #'clf__C': np.logspace(-2, 10, 13)
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets will have
            # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model
            # apply the fitted selector to the column names to recover which features were kept
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            print(results.rsquared)
            print(cols)
            plt.scatter(target_test, predict_val)
            plt.show()
            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                train_sizes, train_scores, test_scores = learning_curve(self.model, data_train, target_train,
                                                                        cv=5, n_jobs=constants.ncpu)
                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        return self.model
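
# A minimal, self-contained sketch of the SelectFromModel + Pipeline +
# GridSearchCV workflow used above. The dataset, estimator sizes and the
# parameter grid here are illustrative assumptions, not the original
# configuration (imports use the newer sklearn.model_selection paths).
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X_demo, y_demo = make_regression(n_samples=200, n_features=20, random_state=0)
pipe_demo = Pipeline([
    ('fs', SelectFromModel(ExtraTreesRegressor(n_estimators=50, random_state=0))),
    ('clf', RandomForestRegressor(random_state=0)),
])
grid_demo = {
    'fs__threshold': ['mean', 'median'],
    'clf__n_estimators': [50, 100],
}
gs_demo = GridSearchCV(pipe_demo, param_grid=grid_demo, cv=3)
gs_demo.fit(X_demo, y_demo)
print(gs_demo.best_params_)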
Ejemplo n.º 40
0
def plot_learn_curve(model,
                     train,
                     label,
                     ylim=None,
                     cv=None,
                     n_jobs=1,
                     train_sizes=np.linspace(.05, 1., 20),
                     title='learning_curve',
                     verbose=0,
                     plot=True):
    train_sizes, train_scores, test_scores = learning_curve(
        model,
        train,
        label,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel('data size')
        plt.ylabel('score')
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="b")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="r")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color='b',
                 label='train set score')
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color='r',
                 label='cv set score')

        plt.legend(loc='best')

        plt.draw()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
        test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
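
# Hypothetical usage of plot_learn_curve: `diff` approximates the gap between
# the final train and CV scores, so a large value hints at overfitting. The
# classifier and dataset below are illustrative assumptions.
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

digits_demo = load_digits()
mid_demo, gap_demo = plot_learn_curve(GaussianNB(), digits_demo.data,
                                      digits_demo.target, cv=5, plot=False)
print('final-size midpoint: %.3f, train/CV gap: %.3f' % (mid_demo, gap_demo))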
Ejemplo n.º 41
0
def plot_learning_curve(train_dataset,
                        estimator,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        score_attr=None,
                        n_jobs=-1,
                        save=False,
                        display=True,
                        cv=5,
                        filename="learning_curve"):
    if not save and not display:
        return

    logging.info("Calculating learning curve")
    train_sizes, train_scores, valid_scores = learning_curve(
        estimator=estimator,
        X=train_dataset.data,
        y=train_dataset.target,
        train_sizes=train_sizes,
        cv=cv,
        n_jobs=n_jobs,
        scoring=score_attr,
    )

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    logging.debug("Plotting learning curve")
    plt.title("Learning curve")
    plt.xlabel("Training examples")
    plt.ylabel((score_attr.upper() + " " if score_attr else "") + "Score")
    plt.grid()
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             label="Training score",
             color="r")
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2,
                     color="r")
    plt.plot(train_sizes,
             valid_scores_mean,
             'o-',
             label="Cross-Validation score",
             color="g")
    plt.fill_between(train_sizes,
                     valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std,
                     alpha=0.2,
                     color="g")
    plt.legend(loc="best")

    if display:
        plt.show()
    if save:
        plt.savefig(filename + "_" +
                    datetime.now().strftime("%Y_%m_%d_%H_%M") + ".png",
                    dpi=400)
    plt.clf()
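
# A hypothetical call of the helper above (it assumes numpy, matplotlib,
# learning_curve, logging and datetime are already in scope). `score_attr` is
# passed straight through to learning_curve's `scoring`, so any sklearn scorer
# string such as 'f1_macro' should work; the dataset and estimator here are
# assumptions.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris_demo = load_iris()
plot_learning_curve(iris_demo, DecisionTreeClassifier(random_state=0),
                    score_attr='f1_macro', display=False, save=True, cv=3)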
Ejemplo n.º 42
0
def learning_curves(filepath, scoring, eu_adr_only=False, total_instances=0):
    """
    Plot learning curves of f-score for training and test data
    """
    features, labels = load_data(eu_adr_only, total_instances)

    # convert from dict into np array
    vec = DictVectorizer()
    data = vec.fit_transform(features).toarray()

    # set up pipeline to normalise the data then build the model
    clf = Pipeline([('normaliser', preprocessing.Normalizer()),
                    ('svm',
                     SVC(kernel='poly',
                         coef0=3,
                         degree=2,
                         gamma=1,
                         cache_size=1000))])
    #('svm', SVC(kernel='linear'))])

    cv = cross_validation.StratifiedKFold(labels,
                                          n_folds=10,
                                          shuffle=True,
                                          random_state=0)

    # NOTE: the scores come back in a suspiciously regular pattern; including a
    # 0.9 train size seems to produce a downward slope at the end of the curve
    sizes, t_scores, v_scores = learning_curve(clf,
                                               data,
                                               labels,
                                               train_sizes=np.linspace(
                                                   0.1, 0.9, 8),
                                               cv=cv,
                                               scoring=scoring,
                                               n_jobs=-1)

    train_results = np.array(
        [np.mean(t_scores[i]) for i in range(len(t_scores))])
    valid_results = np.array(
        [np.mean(v_scores[i]) for i in range(len(v_scores))])
    '''
    # define a new set of points to smooth the plots
    x_new = np.linspace(sizes.min(), sizes.max())
    training_smooth = spline(sizes, train_results, x_new)
    validation_smooth = spline(sizes, valid_results, x_new)
    #plt.plot(sizes, valid_results)
    plt.plot(x_new, validation_smooth)
    #plt.plot(sizes, train_results)
    plt.plot(x_new, training_smooth)
    '''
    # instead, let's fit a degree-2 polynomial, as this should give a better impression!
    valid_coefs = np.polyfit(sizes, valid_results, deg=2)
    train_coefs = np.polyfit(sizes, train_results, deg=2)
    x_new = np.linspace(sizes.min(), sizes.max())
    valid_new = np.polyval(valid_coefs, x_new)
    train_new = np.polyval(train_coefs, x_new)

    # plot the raw points and the fitted curves
    #plt.plot(x_new, train_new)
    #plt.plot(sizes, train_results)
    plt.plot(x_new, valid_new, label='fitted poly degree 2')
    plt.plot(sizes, valid_results, label='raw points')

    kernel = str(clf.named_steps['svm'].get_params()['kernel'])
    coef = str(clf.named_steps['svm'].get_params()['coef0'])
    degree = str(clf.named_steps['svm'].get_params()['degree'])
    c_error = str(clf.named_steps['svm'].get_params()['C'])
    plt.title('kernel: ' + kernel + ', degree = ' + degree + ', coef = ' +
              coef + ', C = ' + c_error)
    plt.xlabel('training_instances')
    plt.ylabel('f_score')

    #plt.show()
    plt.savefig(filepath, format='tif')
    plt.clf()
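
# Standalone sketch of the degree-2 polynomial smoothing used above, on
# made-up points (the sizes and scores here are illustrative, not results):
import numpy as np
import matplotlib.pyplot as plt

sizes_demo = np.array([100, 200, 400, 600, 800])
scores_demo = np.array([0.62, 0.70, 0.74, 0.75, 0.77])
coefs_demo = np.polyfit(sizes_demo, scores_demo, deg=2)
x_demo = np.linspace(sizes_demo.min(), sizes_demo.max())
plt.plot(x_demo, np.polyval(coefs_demo, x_demo), label='fitted poly degree 2')
plt.plot(sizes_demo, scores_demo, 'o', label='raw points')
plt.legend()
plt.show()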
Ejemplo n.º 43
0
    # shuffle data
    random_idx = permutation(np.arange(len(y)))
    X = X[random_idx]
    y = y[random_idx]

    # create model
    model = LogisticRegression(C=1)
    #model = RandomForestClassifier(n_estimators=10)

    # plot high-dimensional decision boundary
    db = DBPlot(model)
    db.fit(X, y, training_indices=0.5)
    db.plot(plt, generate_testpoints=True)  # set generate_testpoints=False to speed up plotting
    plt.show()

    # plot learning curves for comparison
    N = 10
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, train_sizes=np.linspace(.2, 1.0, N))

    # error bars show the standard error of the mean across the 5 CV folds
    plt.errorbar(train_sizes, np.mean(train_scores, axis=1),
                 np.std(train_scores, axis=1) / np.sqrt(5))
    plt.errorbar(train_sizes, np.mean(test_scores, axis=1),
                 np.std(test_scores, axis=1) / np.sqrt(5), c='r')

    plt.legend(["Accuracies on training set", "Accuracies on test set"])
    plt.xlabel("Number of data points")
    plt.title(str(model))
    plt.show()
Ejemplo n.º 44
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20),
                        verbose=0,
                        plot=True):

    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"Training Sample Size")
        plt.ylabel(u"Score")
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="b")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="r")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="b",
                 label=u"Training Score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="r",
                 label=u"Cross-Validation Score")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
        test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
Ejemplo n.º 45
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
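
# Hypothetical usage of the helper above, in the spirit of the scikit-learn
# docs example it is based on (digits and GaussianNB are assumptions here):
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

digits_demo = load_digits()
plot_learning_curve(GaussianNB(), "Learning Curves (Naive Bayes)",
                    digits_demo.data, digits_demo.target, ylim=(0.7, 1.01), cv=5)
plt.show()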
Ejemplo n.º 46
0
def draw_learning_curves(estimator,
                         X,
                         y,
                         ylim=None,
                         cv=None,
                         n_jobs=1,
                         scoring=None,
                         train_sizes=np.linspace(.1, 1.0, 5)):
    """Code taken from scikit-learn examples for version 0.15.

    Generate a simple plot of the test and training learning curve.

    Args:
        estimator (class): object type that implements the "fit" and "predict"
            methods
            An object of that type which is cloned for each validation.
        X (2D array): array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y (1D array): array-like, shape (n_samples) or (n_samples,
            n_features), optional Target relative to X for classification or
            regression; None for unsupervised learning.
        ylim (tuple, optional): Defines minimum and maximum yvalues plotted.
        cv (int, optional): If an integer is passed, it is the number of folds
            (defaults to 3). Specific cross-validation objects can be passed,
            see sklearn.cross_validation module for the list of possible
            objects
        n_jobs(int, optional) : Number of jobs to run in parallel (default 1).
        train_sizes (float): Relative or absolute numbers of training examples
            that will be used to generate the learning curve. If the dtype is
            float, it is regarded as a fraction of the maximum size of the
            training set (that is determined by the selected validation
            method), i.e. it has to be within (0, 1]. Otherwise it is
            interpreted as absolute sizes of the training sets. Note that for
            classification the number of samples usually has to be big enough
            to contain at least one sample from each class. (default:
            np.linspace(0.1, 1.0, 5))

    Returns:
        A plot of the learning curves for both the training curve and the
        cross-validation curve.
    """
    plt.close('all')
    flat_shape = (X.shape[0], ) + (np.prod(X.shape[1:]), )
    X_flat = X.reshape(flat_shape)
    plt.figure()
    plt.title('Learning Curves', fontsize=20)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples", fontsize=15)
    plt.ylabel("Score", fontsize=15)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X_flat,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="#f46d43")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="#1a9641")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="#f46d43",
             linewidth=2,
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="#1a9641",
             linewidth=2,
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
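
# draw_learning_curves flattens every trailing dimension of X, so image-shaped
# input works directly. A hypothetical call on the 8x8 digits images (the
# classifier choice is an assumption):
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

digits_demo = load_digits()  # digits_demo.images has shape (n_samples, 8, 8)
draw_learning_curves(LogisticRegression(max_iter=1000), digits_demo.images,
                     digits_demo.target, cv=3, scoring='accuracy')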
Ejemplo n.º 47
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20),
                        verbose=0,
                        plot=True):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    n_jobs : 并行的的任务数(默认1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="b")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="r")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="b",
                 label=u"训练集上得分")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="r",
                 label=u"交叉验证集上得分")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
        test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
Ejemplo n.º 48
0
from sklearn import datasets
import numpy as np
from sklearn.learning_curve import learning_curve
from sklearn.svm import SVC
import matplotlib.pyplot as plt

digits = datasets.load_digits()
X = digits.data
y = digits.target

train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001),
    X,
    y,
    cv=10,
    scoring='mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# the MSE scorer returns negated values, so flip the sign to get positive loss
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='training')
plt.plot(train_sizes,
         test_loss_mean,
         'o-',
         color='g',
         label='Cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()
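
# Version note: in scikit-learn >= 0.18 the import path and the scorer name
# change; the equivalent call (not part of the original snippet) would be:
# from sklearn.model_selection import learning_curve
# train_sizes, train_loss, test_loss = learning_curve(
#     SVC(gamma=0.001), X, y, cv=10, scoring='neg_mean_squared_error',
#     train_sizes=[0.1, 0.25, 0.5, 0.75, 1])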
Ejemplo n.º 49
0
plt.plot(X, y, 'c*', label='data')

plt.plot(X, y_kr, c='r', label='Kernel Regression')
# alternative: plt.plot(X, y_svr, c='r', label='SVR')
plt.xlabel('height')
plt.ylabel('weight')
plt.title('Kernel regression -boy')
plt.legend()

# Visualize learning curves
plt.figure()
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, X, y, train_sizes=np.linspace(0.1, 1, 10),
                   scoring="mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(kr, X, y, train_sizes=np.linspace(0.1, 1, 10),
                   scoring="mean_squared_error", cv=10)
plt.plot(train_sizes, test_scores_svr.mean(1), 'o-', color="r", label="SVR")
plt.plot(train_sizes,
         test_scores_kr.mean(1),
         'o-',
         color="g",
         label="Kernel Regression")
plt.yscale("symlog", linthreshy=1e-7)
plt.ylim(-10, -0.01)
plt.xlabel("Training size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")
Ejemplo n.º 50
0
# Linear SVM
clf_lin = svm.SVC(kernel='linear')
linear_train_error = []
linear_test_error = []
training_samples = []
# Radial SVM
clf_rad = svm.SVC(kernel='rbf')
radial_train_error = []
radial_test_error = []
# polynomial SVM
clf_poly = svm.SVC(kernel='poly', degree=3)
poly_train_error = []
poly_test_error = []

rad_train_sizes, rad_train_scores, rad_valid_scores = learning_curve(clf_rad,
                                                                     X_train,
                                                                     Y_train,
                                                                     cv=5)
print('rad')
lin_train_sizes, lin_train_scores, lin_valid_scores = learning_curve(clf_lin,
                                                                     X_train,
                                                                     Y_train,
                                                                     cv=5)
print('lin')
poly_train_sizes, poly_train_scores, poly_valid_scores = learning_curve(
    clf_poly, X_train, Y_train, cv=2)
print('poly')

stop = timeit.default_timer()
print("time to run:", stop - start)

fig = plt.figure()
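
# The snippet cuts off after creating the figure; a plausible completion that
# compares the mean CV scores of the three kernels (a sketch, not the
# original plotting code):
plt.plot(rad_train_sizes, rad_valid_scores.mean(axis=1), 'o-', label='rbf')
plt.plot(lin_train_sizes, lin_valid_scores.mean(axis=1), 'o-', label='linear')
plt.plot(poly_train_sizes, poly_valid_scores.mean(axis=1), 'o-', label='poly')
plt.xlabel('Training examples')
plt.ylabel('CV score')
plt.legend(loc='best')
plt.show()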
Ejemplo n.º 51
0
from sklearn.learning_curve import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()
X = digits.data
y = digits.target

train_sizes, train_scores, test_scores = learning_curve(SVC(gamma=0.01), X, y, cv=10, scoring='accuracy', train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.figure()
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross validation')
plt.xlabel('Training examples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.show()
Ejemplo n.º 52
0
plt.figure()
plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
plt.title('Validation curve')
plt.xlabel('Maximum depth of the tree')
plt.ylabel('Accuracy')
plt.show()

'''
Learning curves help us understand how the size of the training set affects a
machine-learning model; this matters when computation is the constraint.
'''
# Learning curve
from sklearn.learning_curve import learning_curve
classifier = RandomForestClassifier(random_state = 7)
parameter_grid = np.array([200,500,800,1000])
train_sizes,train_scores,validation_scores = learning_curve(classifier,X,y,train_sizes = parameter_grid,cv = 5)

print ("\n##### LEARNING CURVES #####")
print ("\nTraining scores:\n", train_scores)
print ("\nValidation scores:\n", validation_scores)

plt.figure()
plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
plt.title('Learning curve')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.show()

'''
A small training set can appear to give better classification accuracy,
but it is prone to overfitting.
'''
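
# The plots above show only the training average; plotting the validation
# average alongside it makes the size/accuracy trade-off visible (a sketch):
plt.figure()
plt.plot(parameter_grid, 100 * np.average(train_scores, axis=1),
         color='black', label='Training')
plt.plot(parameter_grid, 100 * np.average(validation_scores, axis=1),
         color='red', label='Validation')
plt.title('Learning curve')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.show()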
Ejemplo n.º 53
0
# Scikit-Learn offers a convenient utility for computing such learning curves from your models;
#
# here we will compute a learning curve for our original dataset with a second-order polynomial model and a ninth-order polynomial:

from sklearn.learning_curve import learning_curve
import warnings
warnings.filterwarnings("ignore")

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for i, degree in enumerate([2, 9]):
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X,
                                         y,
                                         cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25))

    ax[i].plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    ax[i].plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    ax[i].hlines(np.mean([train_lc[-1], val_lc[-1]]),
                 N[0],
                 N[-1],
                 color='gray',
                 linestyle='dashed')
    ax[i].set_ylim(0, 1)
    ax[i].set_xlim(N[0], N[-1])
    ax[i].set_xlabel('training size', fontsize=30)
    ax[i].set_ylabel('score', fontsize=30)
    ax[i].set_title('degree = {0}'.format(degree), size=24)
Ejemplo n.º 54
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=10,
                        n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 30),
                        verbose=0,
                        plot=True):
    print(len(X))
    print(len(y))
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        train_sizes=train_sizes,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u'training set size')
        plt.ylabel(u'score')
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color='b')
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color='r')
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color='b',
                 label=u'training set score')
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color='r',
                 label=u'cross-validation score')

        plt.legend(loc='best')

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] + test_scores_std[-1])) / 2
    diff = ((train_scores_mean[-1] + train_scores_std[-1]) -
            (test_scores_mean[-1] + test_scores_std[-1])) / 2
    return midpoint, diff
Ejemplo n.º 55
0
    frames = [train_data, dev]
    train_data = pd.concat(frames)
    label, feature = featureEng(train_data, indices[98:])
    # label, feature = off_feature_extraction(train_data)
    clf2 = off_model_building()

    labelList = {'Clap': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'Hands': [1, 2, 3, 4, 5, 6, 7, 8],
                 'Upside': [1, 2, 3, 4, 5, 6, 7], 'Think': [0, 1, 2, 3, 4, 5, 6],
                 'Neutral': [1, 2, 3, 4, 5], 'Shrug': [1, 2, 3, 4], 'FacePalm': [3, 2, 1],
                 'Cry': [2, 1], 'Explode': [1], 'Disappoint': []}
    for i in range(0, len(label)):
        label[i] = labelList[label[i]]
    print(label)
    label = MultiLabelBinarizer().fit_transform(label)
    #print(label)
    train_size, train_loss, test_loss = learning_curve(
        clf2, feature, label, train_sizes=np.linspace(0.1, 1, 5), cv=5, n_jobs=3)
    print('train done')
    train_loss_mean = np.mean(train_loss, axis=1)
    test_loss_mean = np.mean(test_loss, axis=1)
    with open('LearningCurve.txt', 'w') as f:
        f.write(str(train_loss_mean))
        f.write(str(test_loss_mean))
        f.write(str(train_size))
    plt.figure()
    # max_train_index = np.argmax(train_loss_mean)
    # min_train_index = np.argmin(train_loss_mean)
    # max_test_index = np.argmax(test_loss_mean)
    # min_test_index = np.argmin(test_loss_mean)
    plt.plot(train_size, train_loss_mean, 'o-', color='r', label='Train_Scores')
    plt.plot(train_size, test_loss_mean, 'o-', color='g', label="Valid_Scores")
    # plt.plot(train_size[max_train_index], train_loss_mean[max_train_index], 'ks')
Ejemplo n.º 56
0
lr = LogisticRegression(random_state=0)
gs = GridSearchCV(estimator=lr,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10)
gs = gs.fit(X_train_std, y_train)
print(gs.best_score_)
print(gs.best_params_)

# diagnosing bias and variance problems with learning curves.
lr = LogisticRegression(penalty='l1', C=10.0)
train_sizes, train_scores, test_scores = \
    learning_curve(estimator=lr,
                   X=X_train_std,
                   y=y_train,
                   train_sizes=np.linspace(0.1, 1.0, 20),
                   cv=10)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='red', marker='o',
         markersize=5, label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='red')
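
# A sketch of feeding the tuned estimator straight into the learning curve
# instead of retyping its parameters by hand (assumes the fitted `gs` above):
best_lr = gs.best_estimator_
train_sizes, train_scores, test_scores = learning_curve(estimator=best_lr,
                                                        X=X_train_std,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 20),
                                                        cv=10)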
Ejemplo n.º 57
0
x1 = df[df.columns[:4]]
# the label is the last column of the Excel data
y1 = df[df.columns[-1:]]

# convert the dataframes to arrays
x1 = np.array(x1)
y1 = np.array(y1)
# preprocess the data, otherwise the computation fails
y1 = [i[0] for i in y1]
y1 = np.array(y1)

# create a KNN classifier
knn = KNeighborsClassifier()
# svc = SVC()

train_sizes, train_loss, test_loss = learning_curve(
    knn, x1, y1, cv=10, scoring='accuracy', train_sizes=[0.1, 0.25, 0.5, 0.75])

# scoring='accuracy' returns plain scores, so no sign flip is needed here
train_scores_mean = np.mean(train_loss, axis=1)
test_scores_mean = np.mean(test_loss, axis=1)

plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes,
         test_scores_mean,
         'o-',
         color='g',
         label='Cross-validation')

plt.xlabel("Training examples")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.show()
Ejemplo n.º 58
0
    def plot_learning_curve(self,
                            estimator,
                            title,
                            X,
                            y,
                            ylim=None,
                            cv=None,
                            n_jobs=1,
                            train_sizes=np.linspace(0.05, 1, 20),
                            verbose=0,
                            plot=True):
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            X,
            y,
            cv=cv,
            n_jobs=n_jobs,
            train_sizes=train_sizes,
            verbose=verbose)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        if plot:
            plt.figure()
            plt.title(title)
            if ylim is not None:
                plt.ylim(*ylim)
            plt.xlabel('symbol num')
            plt.ylabel('score')
            # plt.gca().invert_yaxis()
            plt.grid()

            plt.fill_between(train_sizes,
                             test_scores_mean - test_scores_std,
                             test_scores_mean + test_scores_std,
                             alpha=0.1,
                             color='r')
            plt.fill_between(train_sizes,
                             train_scores_mean - train_scores_std,
                             train_scores_mean + train_scores_std,
                             alpha=0.1,
                             color='b')

            plt.plot(train_sizes,
                     train_scores_mean,
                     'o-',
                     color='b',
                     label=u'score on train_data')
            plt.plot(train_sizes,
                     test_scores_mean,
                     'o-',
                     color="r",
                     label=u'score on cross validation')
            plt.legend(loc="best")
            plt.draw()
            plt.gca().invert_yaxis()
            plt.show()

        midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                    (test_scores_mean[-1] - test_scores_std[-1])) / 2
        diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
            test_scores_mean[-1] - test_scores_std[-1])
        return midpoint, diff


Ejemplo n.º 59
0
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=4)

print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

# learning curve plotting

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(penalty='l2', random_state=0))])

# train_sizes=np.linspace(0.1, 1.0, 10) uses 10 evenly spaced relative
# intervals for the training set sizes; k=10 folds are set via the cv param
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr,
                                                        X=X_train,
                                                        y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 10),
                                                        cv=10,
                                                        n_jobs=4)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes,
         train_mean,
         color='blue',
         marker='o',
         markersize=5,
         label='training accuracy')

# we add the standard deviation of the average accuracies to the plot using fill_between:
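plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# a plausible completion of the truncated plot, following the same pattern
# (a sketch, not the original lines):
plt.plot(train_sizes, test_mean, color='green', linestyle='--',
         marker='s', markersize=5, label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()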
Ejemplo n.º 60
0
Xtrain = np.random.random((200, 4)) * 2 - 1
ytrain = f(Xtrain)

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
labels = {
    1: "Isotropic",
    4: "Automatic Relevance Determination",
    8: "Factor Analysis"
}
for i, n in enumerate(labels.keys()):
    train_sizes, train_scores, test_scores = \
        learning_curve(GaussianProcess(corr='squared_exponential',
                                       theta0=[1.0] * n, thetaL=[1e-4] * n,
                                       thetaU=[1e2] * n),
                       Xtrain, ytrain, scoring="mean_squared_error",
                       cv=10, n_jobs=4)
    test_scores = -test_scores  # Scores correspond to negative MSE
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_min = np.min(test_scores, axis=1)
    test_scores_max = np.max(test_scores, axis=1)

    plt.plot(train_sizes, test_scores_mean, label=labels[n], color=colors[i])
    plt.fill_between(train_sizes,
                     test_scores_min,
                     test_scores_max,
                     alpha=0.2,
                     color=colors[i])

plt.legend(loc="best")