def test_learning_curve_with_shuffle():
    # This test case was designed to verify the code changes made in
    # pull request #7506.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14], [15, 16],
                 [17, 18], [19, 20], [7, 8], [9, 10], [11, 12], [13, 14],
                 [15, 16], [17, 18]])
    y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
    groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
    # Splits on these groups fail without shuffle as the first iteration
    # of the learning curve doesn't contain label 4 in the training set.
    estimator = PassiveAggressiveClassifier(shuffle=False)

    cv = GroupKFold(n_splits=2)
    train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
        groups=groups, shuffle=True, random_state=2)
    assert_array_almost_equal(train_scores_batch.mean(axis=1),
                              np.array([0.75, 0.3, 0.36111111]))
    assert_array_almost_equal(test_scores_batch.mean(axis=1),
                              np.array([0.36111111, 0.25, 0.25]))
    assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv, n_jobs=1,
                  train_sizes=np.linspace(0.3, 1.0, 3), groups=groups)

    train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
        groups=groups, shuffle=True, random_state=2,
        exploit_incremental_learning=True)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
def test_learning_curve():
    n_samples = 30
    n_splits = 3
    X, y = make_classification(n_samples=n_samples, n_features=1,
                               n_informative=1, n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits))
    for shuffle_train in [False, True]:
        with warnings.catch_warnings(record=True) as w:
            train_sizes, train_scores, test_scores = learning_curve(
                estimator, X, y, cv=KFold(n_splits=n_splits),
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if len(w) > 0:
            raise RuntimeError("Unexpected warning: %r" % w[0].message)
        assert_equal(train_scores.shape, (10, 3))
        assert_equal(test_scores.shape, (10, 3))
        assert_array_equal(train_sizes, np.linspace(2, 20, 10))
        assert_array_almost_equal(train_scores.mean(axis=1),
                                  np.linspace(1.9, 1.0, 10))
        assert_array_almost_equal(test_scores.mean(axis=1),
                                  np.linspace(0.1, 1.0, 10))

        # Test a custom cv splitter that can iterate only once
        with warnings.catch_warnings(record=True) as w:
            train_sizes2, train_scores2, test_scores2 = learning_curve(
                estimator, X, y,
                cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples),
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if len(w) > 0:
            raise RuntimeError("Unexpected warning: %r" % w[0].message)
        assert_array_almost_equal(train_scores2, train_scores)
        assert_array_almost_equal(test_scores2, test_scores)
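The OneTimeSplitter helper is not shown in this listing; below is a minimal sketch of such a single-pass splitter, closely following the mock used in scikit-learn's test suite (treat the details as an approximation, not the exact helper):

import numpy as np
from sklearn.model_selection import KFold

class OneTimeSplitter:
    """A CV splitter whose split() can be iterated only once."""
    def __init__(self, n_splits=4, n_samples=99):
        self.n_splits = n_splits
        self.n_samples = n_samples
        # The generator is created once, so a second call to split()
        # yields nothing -- learning_curve has to cope with that.
        self.indices = iter(
            KFold(n_splits=n_splits).split(np.ones((n_samples, 1))))

    def split(self, X=None, y=None, groups=None):
        for index in self.indices:
            yield index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits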
Example #3
 def plot_learning_curve(self, estimator, title, X, y, ylim=None, cv=None,
                         n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
                         filename=None):
     
     plt.figure()
     plt.title(title)
     if ylim is not None:
         plt.ylim(*ylim)
     plt.xlabel("Training examples")
     plt.ylabel("Score")
     train_sizes, train_scores, test_scores = learning_curve(
         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)
     plt.grid()
 
     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1, color="g")
     plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
     plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
 
     plt.legend(loc="best")
     
     if filename is not None:
         plt.savefig(filename)
     return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    # Plot the learning curve of `estimator` on the given data.
    # estimator: the classifier/regressor you use
    # title: title of the chart
    # X: input features (numpy array)
    # y: input target vector
    # ylim: (ymin, ymax) tuple setting the lowest and highest points of the y-axis
    # cv: number of folds used for cross-validation; one fold serves as the
    #     validation set and the remaining n-1 folds as training (default 3)
    # n_jobs: number of parallel jobs (default 1)
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None: plt.ylim(*ylim)
        plt.xlabel("训练样本数")
        plt.ylabel("得分")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="交叉验证集上得分")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
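An illustrative call (the dataset and estimator below are placeholders, not from the original); the returned midpoint and diff summarize where the train and cross-validation bands end up and how far apart they are:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
midpoint, diff = plot_learning_curve(LogisticRegression(max_iter=1000),
                                     "Learning curve", X_demo, y_demo, cv=5)
# A small diff means the two bands converge (low variance); a large diff
# suggests the model overfits the training data.
print(midpoint, diff)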
Example #5
def plot(estimator, title, X, y, ylim=None, cv=None,
         n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None: plt.ylim(*ylim)
    plt.xlabel(u'Number of data points')
    plt.ylabel(u"Error")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores = 1.0-train_scores
    test_scores = 1.0-test_scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label=u'Eğitim hatası')
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label=u'Test hatası')
    plt.legend(loc="best")
    return plt
Example #6
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve. Taken
    from the sklearn website.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed; see the
        sklearn.model_selection module for the list of possible objects.

    train_sizes : array-like
        Training set sizes to evaluate.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
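A usage sketch in the spirit of the scikit-learn website example this function was taken from (the dataset, estimator, and cv choice here are illustrative):

from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC

X_digits, y_digits = load_digits(return_X_y=True)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plot_learning_curve(SVC(gamma=0.001), "Learning Curves (SVM, RBF kernel)",
                    X_digits, y_digits, ylim=(0.7, 1.01), cv=cv, n_jobs=1)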
Example #7
 def plot_learning_curve(self):
     # Plot the learning curve
     plt.figure(figsize=(9, 6))
     train_sizes, train_scores, test_scores = learning_curve(
         self.model, X=self.X_train, y=self.y_train,
         cv=3, scoring='neg_mean_squared_error')
     self.plot_learning_curve_helper(train_sizes, train_scores, test_scores, 'Learning Curve')
     plt.show()
    def test_learning_curve(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        result = df.learning_curve.learning_curve(df.naive_bayes.GaussianNB())
        expected = ms.learning_curve(nb.GaussianNB(), digits.data, digits.target)

        self.assertEqual(len(result), 3)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])
        self.assert_numpy_array_almost_equal(result[2], expected[2])
def plot_learning_curve(est, X, y):
    training_set_size, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(.1, 1, 20), cv=KFold(20, shuffle=True, random_state=1))
    estimator_name = est.__class__.__name__
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
                    label="training " + estimator_name)
    plt.plot(training_set_size, test_scores.mean(axis=1), '-',
             label="test " + estimator_name, c=line[0].get_color())
    plt.xlabel('Training set size')
    plt.ylabel('Score (R^2)')
    plt.ylim(0, 1.1)
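Because this function labels each line with the estimator's class name and draws on the current axes, it can be called repeatedly to overlay several models on one figure; a sketch with illustrative estimators and data:

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

X_reg, y_reg = make_regression(n_samples=400, n_features=10, noise=10.0,
                               random_state=0)
plt.figure()
plot_learning_curve(Ridge(), X_reg, y_reg)
plot_learning_curve(RandomForestRegressor(n_estimators=50, random_state=0),
                    X_reg, y_reg)
plt.legend(loc='best')
plt.show()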
Example #10
 def learning_curve(self, graphs, targets,
                    cv=5, n_steps=10, start_fraction=0.1):
     """learning_curve."""
     graphs, targets = paired_shuffle(graphs, targets)
     x = self.transform(graphs)
     train_sizes = np.linspace(start_fraction, 1.0, n_steps)
     scoring = 'roc_auc'
     train_sizes, train_scores, test_scores = learning_curve(
         self.model, x, targets,
         cv=cv, train_sizes=train_sizes,
         scoring=scoring)
     return train_sizes, train_scores, test_scores
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
def test_learning_curve_unsupervised():
    X, _ = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("data_dir")
    parser.add_argument('--method','-m',type=int,default=0,choices=range(5),
        help=
        """chose methods from:
                0:linear_svc
                1:logistic regression
                2:naive bayes
                3:decision  tree
                4:ExtraTreesClassifier
        """)
    args= parser.parse_args()

    silent_feature_vector,threshold_feature_vector,threshold_vector,silent_classification_vector\
        = load_data_set(args.data_dir)
    regr = linear_model.LinearRegression()
    clf = get_classifier(args.method)
    
    #regr_train_sizes = gene_train_sizes(len(threshold_feature_vector))        
    #clf_train_sizes = gene_train_sizes(len(silent_feature_vector)) 
    regr_train_sizes = [0.3,0.6,1.0]
    clf_train_sizes = [0.3,0.6,1.0]

    print "cross validation:"
    regr_train_sizes, regr_train_scores, regr_valid_scores =\
        learning_curve(regr, threshold_feature_vector, threshold_vector, train_sizes=regr_train_sizes, cv=5)
    
    clf_train_sizes, clf_train_scores, clf_valid_scores =\
        learning_curve(clf, silent_feature_vector, silent_classification_vector, train_sizes=clf_train_sizes, cv=5)

    print "Thresholding:"
    print regr_train_scores
    print regr_valid_scores

    print "-"*20

    print "Classification:"
    print clf_train_scores
    print clf_valid_scores
def test_learning_curve_with_boolean_indices():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n_splits=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
Example #15
def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of training data.
        The learning and validation scores for each model are then plotted. """
    
    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)


    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10,7))

    # Create three different models based on max_depth
    for k, depth in enumerate([1,3,6,10]):
        
        # Create a Decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth = depth)

        # Calculate the training and testing scores
        sizes, train_scores, valid_scores = learning_curve(regressor, X, y, \
            cv = cv, train_sizes = train_sizes, scoring = 'r2')
        
        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis = 1)
        train_mean = np.mean(train_scores, axis = 1)
        valid_std = np.std(valid_scores, axis = 1)
        valid_mean = np.mean(valid_scores, axis = 1)

        # Subplot the learning curve 
        ax = fig.add_subplot(2, 2, k+1)
        ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score')
        ax.plot(sizes, valid_mean, 'o-', color = 'g', label = 'Validation Score')
        ax.fill_between(sizes, train_mean - train_std, \
            train_mean + train_std, alpha = 0.15, color = 'r')
        ax.fill_between(sizes, valid_mean - valid_std, \
            valid_mean + valid_std, alpha = 0.15, color = 'g')
        
        # Labels
        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('r2_score')
        ax.set_xlim([0, X.shape[0]*0.8])
        ax.set_ylim([-0.05, 1.05])
    
    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.)
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03)
    fig.tight_layout()
    fig.show()
Example #16
    def __calc_learning_curve(self, algorithm):
        estimator = algorithm.estimator
        train_sizes, train_scores, test_scores = learning_curve(
            estimator,
            self.data.X,
            self.data.y,
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=self.n_jobs)  # parallel run in cross validation
        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)

        return {'x': train_sizes, 'y_train': train_scores_mean,
                'y_cv': test_scores_mean}
def test_learning_curve_incremental_learning():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockIncrementalImprovingEstimator(20)
    for shuffle_train in [False, True]:
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=3, exploit_incremental_learning=True,
            train_sizes=np.linspace(0.1, 1.0, 10), shuffle=shuffle_train)
        assert_array_equal(train_sizes, np.linspace(2, 20, 10))
        assert_array_almost_equal(train_scores.mean(axis=1),
                                  np.linspace(1.9, 1.0, 10))
        assert_array_almost_equal(test_scores.mean(axis=1),
                                  np.linspace(0.1, 1.0, 10))
def test_learning_curve_implementation():
    """
    Test to ensure that the learning curve results match scikit-learn
    """

    # This test is different from the other tests which just use regression data.
    # The reason is that we want this test to fail in case our implementation
    # diverges from the scikit-learn implementation. This test essentially
    # serves as a regression test as well.

    # Load in the digits data set
    digits = load_digits()
    X, y = digits.data, digits.target

    # get the learning curve results from scikit-learn for this data
    cv_folds = 10
    random_state = 123456789
    cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2, random_state=random_state)
    estimator = MultinomialNB()
    train_sizes = np.linspace(.1, 1.0, 5)
    train_sizes1, train_scores1, test_scores1 = learning_curve(estimator,
                                                               X,
                                                               y,
                                                               cv=cv,
                                                               train_sizes=train_sizes,
                                                               scoring='accuracy')

    # get the features from this data into a FeatureSet instance we can use
    # with the SKLL API
    feature_names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))
    fs = FeatureSet('train', features=features, labels=y, ids=list(range(X.shape[0])))

    # we don't want to filter out any features since scikit-learn
    # does not do that either
    learner = Learner('MultinomialNB', min_feature_count=0)
    (train_scores2,
     test_scores2,
     train_sizes2) = learner.learning_curve(fs,
                                            cv_folds=cv_folds,
                                            train_sizes=train_sizes,
                                            metric='accuracy')

    assert np.all(train_sizes1 == train_sizes2)
    assert np.allclose(train_scores1, train_scores2)
    assert np.allclose(test_scores1, test_scores2)
def plot_learning_curve(mod, X, y, cv, n_jobs, title, ax=None, invert=True):
    '''
    Generates a simple plot of test & training learning curves.
    Inspired by https://github.com/cs109/a-2017/blob/master/Sections/Standard/section_9_student.ipynb
    and by lecture/section.
    
    Inputs:
    -----------------------------------------------------------------
     mod: model for which learning curve must be plotted
     X: predictor data 
     y: true labels
     cv: number of cross-validation iterations
     n_jobs: number of cores (-1 for all available)
     title: title for the plot
     ax: optional matplotlib Axes object on which to plot
     invert: whether to invert the y-axis (default True)
    
    Outputs:
    -----------------------------------------------------------------
     None: plotted learning curves
    '''
    plt.style.use('seaborn-whitegrid')
    
    train_sizes, train_scores, test_scores = learning_curve(mod, X=X, y=y, cv=cv, n_jobs=n_jobs)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    if ax is None: fig, ax = plt.subplots(figsize=(12, 7))
    if invert: ax.invert_yaxis()
        
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='test score')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.set_title(title)
    ax.grid(alpha=0.5)
    sns.despine(bottom=True, left=True)
    ax.legend(loc='best')
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    
    return None
def test_learning_curve():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    with warnings.catch_warnings(record=True) as w:
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
    if len(w) > 0:
        raise RuntimeError("Unexpected warning: %r" % w[0].message)
    assert_equal(train_scores.shape, (10, 3))
    assert_equal(test_scores.shape, (10, 3))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
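MockImprovingEstimator is a test helper rather than a real model; below is a sketch along the lines of the scikit-learn test-suite mock, whose training score decays from 2 towards 1 while the test score improves from 0 towards 1 as the training set grows (treat the details as an approximation):

from sklearn.base import BaseEstimator

class MockImprovingEstimator(BaseEstimator):
    """Dummy estimator whose scores change deterministically with train size."""
    def __init__(self, n_max_train_sizes):
        self.n_max_train_sizes = n_max_train_sizes
        self.train_sizes = 0
        self.X_subset = None

    def fit(self, X_subset, y_subset=None):
        self.X_subset = X_subset
        self.train_sizes = X_subset.shape[0]
        return self

    def predict(self, X):
        raise NotImplementedError

    def score(self, X=None, y=None):
        # Training score worsens from 2 to 1; test score improves from 0 to 1.
        if self._is_training_data(X):
            return 2. - float(self.train_sizes) / self.n_max_train_sizes
        else:
            return float(self.train_sizes) / self.n_max_train_sizes

    def _is_training_data(self, X):
        return X is self.X_subset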
def test_learning_curve_verbose():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        train_sizes, train_scores, test_scores = \
            learning_curve(estimator, X, y, cv=3, verbose=1)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    assert("[learning_curve]" in out)
Example #22
    def __plot_learning_curve(self, dname=None):
        for alg in self.algorithms:
            if self.verbose:
                print('    %s' % alg.name)
            estimator = alg.estimator
            train_sizes, train_scores, test_scores = learning_curve(
                estimator,
                self.data.X,
                self.data.y,
                cv=self.cv,
                scoring=self.scoring,
                n_jobs=self.n_jobs)  # parallel run in cross validation
            train_scores_mean = np.mean(train_scores, axis=1)
            train_scores_std = np.std(train_scores, axis=1)
            test_scores_mean = np.mean(test_scores, axis=1)
            test_scores_std = np.std(test_scores, axis=1)

            plt.figure()
            plt.title(estimator.__class__.__name__)
            plt.xlabel("Training examples")
            plt.ylabel("Score")
            plt.grid()

            plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                             train_scores_mean + train_scores_std, alpha=0.1,
                             color="r")
            plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                             test_scores_mean + test_scores_std,
                             alpha=0.1, color="g")
            plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                     label="Training score")
            plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                     label="Cross-validation score")
            plt.legend(loc="lower right")
            if dname is not None and not os.path.exists(dname):
                os.mkdir(dname)
            if dname is not None:
                plt.savefig('%s/learning_curve_%s.png' %
                            (dname, estimator.__class__.__name__),
                            bbox_inches='tight', dpi=75)
            else:
                plt.savefig('learning_curve_%s.png' %
                            estimator.__class__.__name__,
                            bbox_inches='tight', dpi=75)
            plt.close()
def plot_cv_accuracy(classifier,X_train,y_train,cv=10,n_jobs=1):
    train_sizes,train_scores,test_scores =\
                        learning_curve(estimator=classifier,
                                       X=X_train,
                                       y=y_train,
                                       train_sizes=np.linspace(0.1,1.0,10),
                                       cv=10,
                                       n_jobs=1)
    
    train_mean = np.mean(train_scores,axis=1)
    train_std = np.std(train_scores,axis=1)
    test_mean = np.mean(test_scores,axis=1)
    test_std = np.std(test_scores,axis=1)
    
    
    fig = plt.figure(figsize=(10,5))
    plt.plot(train_sizes,
             train_mean,
             color='blue',
             marker='o' ,
             markersize=5,
             label='training accuracy')
    plt.fill_between(train_sizes,
                     train_mean+train_std,
                     train_mean-train_std,
                     alpha=0.15,color='blue')
    plt.plot(train_sizes,
             test_mean,
             color='green',
             linestyle="--",
             marker='s',
             markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes,
                     test_mean+test_std,
                     test_mean-test_std,
                     alpha=0.15,color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.6 ,1.1])
    plt.show()
Example #24
def plot_learning_curve(estimator, X, y, train_sizes=np.linspace(.1, 1.0, 5),
                        cv=None, n_jobs=1, ax=None):
    '''
    Plot the learning curve for `estimator`.

    Parameters
    ----------
    estimator : sklearn.Estimator
    X : array-like
    y : array-like
    train_sizes : array-like
        list of floats between 0 and 1
    cv : int
    n_jobs : int
    ax : matplotlib.axes
    '''
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    if ax is None:
        fig, ax = plt.subplots()
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Draw on the provided/created axes rather than the implicit current
    # figure, so the `ax` argument is actually honored.
    ax.grid()

    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1, color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
            label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
            label="Cross-validation score")

    ax.legend(loc="best")
    return ax
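A sketch showing how the `ax` parameter lets the curve be placed on an existing Axes (the estimator and data below are illustrative):

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_clf, y_clf = make_classification(n_samples=300, n_features=20, random_state=0)
fig, ax = plt.subplots(figsize=(6, 4))
plot_learning_curve(LogisticRegression(max_iter=1000), X_clf, y_clf, cv=5, ax=ax)
plt.show()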
Example #25
 def plot_learning_curve(self, estimator, x_train, y_train, cv, data_label, n_jobs=-1):
         
     # plot the learning curves using sklearn and matplotlib
     plt.clf()
     train_sizes, train_scores, test_scores = learning_curve(estimator=estimator,
                                                             X=x_train,
                                                             y=y_train,
                                                             cv=cv,
                                                             n_jobs=n_jobs)
     
     train_mean = np.mean(train_scores, axis=1)
     train_std = np.std(train_scores, axis=1)
     test_mean = np.mean(test_scores, axis=1)
     test_std = np.std(test_scores, axis=1)
     
     plt.plot(train_sizes, train_mean,
              color='blue', marker='o',
              markersize=5,
              label='training accuracy')
     
     plt.fill_between(train_sizes,
                      train_mean + train_std,
                      train_mean - train_std,
                      alpha=0.15, color='blue')
     
     plt.plot(train_sizes, test_mean,
              color='green', marker='s',
              markersize=5, linestyle='--',
              label='validation accuracy')        
     
     plt.fill_between(train_sizes,
                      test_mean + test_std,
                      test_mean - test_std,
                      alpha=0.15, color='green')
     
     plt.grid()
     plt.title("Learning curve: %s" % (data_label))
     plt.xlabel('Number of training samples')
     plt.ylabel('Accuracy')
     plt.legend(loc='lower right')
     fn = self.save_path + data_label + '_learncurve.png'
     plt.savefig(fn)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("top_data_dir")
    parser.add_argument('--method','-m',type=int,default=0,choices=range(5),
        help=
        """chose methods from:
                0:linear_svc
                1:logistic regression
                2:naive bayes
                3:decision  tree
                4:ExtraTreesClassifier
        """)
    args=parser.parse_args()

    training_dataset, testing_dataset = load_data(args.top_data_dir)
    clf = get_classifier(args.method)

    print "cross validation:"
    clf_train_sizes = [0.05,0.1,0.2,0.4,0.6,0.8,1.0]
    clf_train_sizes, clf_train_scores, clf_valid_scores =\
        learning_curve(clf, testing_dataset.X, testing_dataset.y,
                       train_sizes=clf_train_sizes, cv=5,scoring='f1')

    print "-"*20

    print "Classification:"
    print "Training:"
    print clf_train_scores
    # random.shuffle(clf_train_scores)
    # print clf_train_scores
    print "Validation:"
    print clf_valid_scores
    # random.shuffle(clf_valid_scores)
    # print clf_valid_scores

    print "Average"
    print "Training:"
    for i in clf_train_scores:
        print "%f" %(sum(i)/len(i))
    print "Validation:"
    for i in clf_valid_scores:
        print "%f" %(sum(i)/len(i))
Example #27
def run_nn(X,y):
    # Create CV training and test scores for various training set sizes 
    #this is for Neural Network Classification problem of shots made
    train_sizes1, train_scores1, test_scores1 = learning_curve(MLPClassifier(),
                                                            X, 
                                                            y,
                                                            # Number of folds in cross-validation
                                                            cv=10,
                                                            # Evaluation metric
                                                            scoring='accuracy',
                                                            # Use all computer cores
                                                            n_jobs=-1, 
                                                            # 50 different sizes of the training set
                                                            train_sizes=np.linspace(0.01, 1.0, 50))

    # Create means and standard deviations of training set scores
    train_mean1 = np.mean(train_scores1, axis=1)
    train_std1 = np.std(train_scores1, axis=1)

    print "Avg. Accuracy Score of Training Set: ", np.mean(train_mean1)
    # Create means and standard deviations of test set scores
    test_mean1 = np.mean(test_scores1, axis=1)
    test_std1 = np.std(test_scores1, axis=1)
    print "Avg. Accuracy Score of Test Set: ", np.mean(test_mean1)

    # Draw lines
    plt.plot(train_sizes1, train_mean1, '--', color="#111111",  label="Training score")
    plt.plot(train_sizes1, test_mean1, color="#111111", label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes1, train_mean1 - train_std1, train_mean1 + train_std1, color="#DDDDDD")
    plt.fill_between(train_sizes1, test_mean1 - test_std1, test_mean1 + test_std1, color="#DDDDDD")

    # Create plot
    plt.title("Learning Curve for Shot Made Classification Problem Neural Network")
    plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
Example #28
X = finalfsdf

import lightgbm as lgb

params = {
  'max_depth': 10,
  'n_estimators': 10,
  'objective': 'binary',
  'colsample_bytree': 0.8,
  "class_weight":{0:1 , 1:20},
  "base_score":0.2,
  "n_jobs":-1,
  "metric":"auc",
  "reg_alpha":0.4,
  "reg_lambda":0.18,
}

clf = lgb.LGBMClassifier(**params)

from sklearn.model_selection import learning_curve


train_sizes, train_scores, valid_scores = learning_curve(clf, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5)
train_sizes

# Average the scores across the CV folds before plotting
lcurveplotdf = pd.DataFrame({"train_size": train_sizes, "train_score": train_scores.mean(axis=1), "valid_score": valid_scores.mean(axis=1)})

ggplot(lcurveplotdf ) + \
    geom_line(aes(x="train_size" , y="train_score") , color="red") + \
    geom_line(aes(x="train_size" , y="valid_score") , color="green")
Example #29
clf = KNeighborsClassifier()
grid_object = GridSearchCV(estimator=clf,
                           param_grid=param_dict,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_object.fit(X_train, y_train)
best_params = grid_object.best_params_
print(best_params)

opt_clf = KNeighborsClassifier(**best_params)

# Learning Curve Plots
train_sizes, train_scores, validation_scores = learning_curve(
    opt_clf,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 100),
    n_jobs=-1)
av_train_scores = np.mean(train_scores, axis=1)
av_validation_scores = np.mean(validation_scores, axis=1)
# LC Plot
plt.plot(train_sizes, av_train_scores, label='train scores')
plt.plot(train_sizes, av_validation_scores, label='validation scores')
plt.title("Learning Curve")
plt.xlabel("Training Examples")
plt.ylabel("Scores")
plt.ylim([0.60, 1.02])
plt.legend()
plt.show()

# Validation Curves
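The comment above announces validation curves but the snippet stops there; a minimal sketch of what such a curve might look like for the tuned classifier (the parameter name and range are illustrative assumptions):

from sklearn.model_selection import validation_curve

param_range = np.arange(1, 31)
vc_train_scores, vc_valid_scores = validation_curve(
    opt_clf, X_train, y_train,
    param_name='n_neighbors', param_range=param_range,
    scoring='accuracy', n_jobs=-1)
plt.plot(param_range, np.mean(vc_train_scores, axis=1), label='train scores')
plt.plot(param_range, np.mean(vc_valid_scores, axis=1), label='validation scores')
plt.title("Validation Curve")
plt.xlabel("n_neighbors")
plt.ylabel("Scores")
plt.legend()
plt.show()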
Example #30
model.fit(x_train, y_train, verbose=1) #, callbacks=[tb])

# accuracy
# train_sizes, train_scores_model, test_scores_model = \
#     learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10),
#                    scoring="accuracy", cv=8, shuffle=True, random_state=42, verbose=1)

# train_scores_mean = np.mean(train_scores_model, axis=1)
# train_scores_std = np.std(train_scores_model, axis=1)
# test_scores_mean = np.mean(test_scores_model, axis=1)
# test_scores_std = np.std(test_scores_model, axis=1)

# log loss
train_sizes, train_scores_model, test_scores_model = \
    learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10),
                   scoring='neg_log_loss', cv=8, shuffle=True, random_state=42)

# accuracy
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
#                  label="Training score")
# plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
#                  label="validation score")

# log loss
plt.plot(train_sizes, -train_scores_model.mean(1), 'o-', color="r", label="log_loss")
plt.plot(train_sizes, -test_scores_model.mean(1), 'o-', color="g", label="val log_loss")

plt.xlabel("Train size")
plt.ylabel("Log loss")
# plt.ylabel("Accuracy")
plt.title('lgbm')
def function_plot_learning_curve(estimator, features, target, train_sizes, cv,
                                 title):

    _, axes = plt.subplots(figsize=(8, 5))

    axes.set_title(title)

    axes.set_xlabel("Training examples")
    axes.set_ylabel("MAE")

    train_sizes, train_scores, validation_scores = learning_curve(
        estimator,
        features,
        target,
        train_sizes=train_sizes,
        cv=cv,
        scoring='neg_mean_absolute_error')  # negated below; matches the MAE axis label
    train_scores_mean = -train_scores.mean(axis=1)
    test_scores_mean = -validation_scores.mean(axis=1)

    train_scores_std = np.std(train_scores, axis=1)
    test_scores_std = np.std(validation_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.fill_between(train_sizes,
                      train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std,
                      alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes,
                      test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std,
                      alpha=0.1,
                      color="g")
    axes.plot(train_sizes,
              train_scores_mean,
              'o-',
              color="r",
              label="Training score")
    axes.plot(train_sizes,
              test_scores_mean,
              'o-',
              color="g",
              label="Cross-validation score")
    axes.legend(loc="lower left")

    #    plt.ylim(0,40)


    return plt
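An illustrative call (the estimator and data below are placeholders, not from the original):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X_demo, y_demo = make_regression(n_samples=300, n_features=8, noise=5.0,
                                 random_state=0)
function_plot_learning_curve(RandomForestRegressor(n_estimators=50, random_state=0),
                             X_demo, y_demo,
                             train_sizes=np.linspace(0.1, 1.0, 5), cv=5,
                             title="Random forest learning curve")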
Example #32
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        axes=None,
                        ylim=None,
                        cv=None,
                        n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True, scoring='accuracy')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training score")
    axes[0].plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes,
                         fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std,
                         alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt
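A usage sketch mirroring the scikit-learn documentation example this three-panel function follows (the dataset and estimator are illustrative):

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import GaussianNB

X_digits, y_digits = load_digits(return_X_y=True)
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
plot_learning_curve(GaussianNB(), "Learning Curves (Naive Bayes)",
                    X_digits, y_digits, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
plt.show()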
def runMLP():
    # We use stochastic gradient descent to be able to track learning errors;
    # Adam is better suited for larger datasets, and lbfgs would outperform sgd
    # but does not provide error information during learning.
    solvers = ["lbfgs", "sgd", "adam"]
    # We disregard the tanh activation function: it ranges from -1 to 1, mapping
    # negative inputs distinctly towards -1 and inputs close to 0 towards 0,
    # behavior well suited for classification between two classes, thus not our choice.
    # For much the same reason we disregard the logistic (sigmoid) activation function.
    # We use relu for our activation function. It goes from 0 to infinity and can
    # suffer from 'dying' when the input includes negative values, which is not a
    # great concern for us. It has the advantage of not suffering from the vanishing
    # gradient problem (a benefit over activations such as sigmoid and tanh, which
    # both suffer from this problem, though it would be more relevant if our model
    # included a greater number of layers). The fact that it sets all negative
    # inputs to 0 also helps produce a sparser model, silencing low-importance neurons.
    activationFunction = ["identity", "logistic", "tanh", "relu"]
    # We leave our alpha value as default. Could be optimised further with trial and error.
    alpha = 0.001
    # We set our learning rate to adaptive, a perk of using a gradient descent algorithm.
    learning_rate = ["constant", "invscaling", "adaptive"]
    learning_rate_init = 0.001
    # To avoid needless training we set max_iter to 200.
    max_iters = 200
    hidden_layer_sizes = (50, 10)
    applyStandardization = True
    mlpTrainData = trainData
    mlpTestData = data.instanceAttriTest

    if applyStandardization:
        scaler = StandardScaler()
        scaler.fit(mlpTrainData)
        mlpTrainData = scaler.transform(mlpTrainData)
        mlpTestData = scaler.transform(mlpTestData)

    mlp = MLPClassifier(solver="sgd",
                        activation="relu",
                        alpha=alpha,
                        learning_rate="adaptive",
                        max_iter=max_iters,
                        learning_rate_init=learning_rate_init,
                        hidden_layer_sizes=hidden_layer_sizes)

    mlp.fit(mlpTrainData, trainLabels)
    print(mlp.loss_curve_)

    t_sizes, t_scores, valid_scores = learning_curve(
        mlp,
        mlpTrainData,
        trainLabels,
        train_sizes=np.linspace(0.1, 1.0, 5),
        cv=5,
        scoring='neg_mean_squared_error',
        error_score='raise')

    prediction = mlp.predict(mlpTestData)

    print(
        f"Precision:  {precision_score(data.instanceLabelTest, prediction, average='weighted')}"
    )
    print(f"Accuracy: {accuracy_score(data.instanceLabelTest, prediction)}")
    print(
        f"confusion Matrix:\n {confusion_matrix(data.instanceLabelTest, prediction)}\n"
    )

    fig, axs = plt.subplots(3)
    axs[0].plot(np.arange(0, mlp.n_iter_, 1), mlp.loss_curve_, label='')
    axs[0].set_title("Loss Curve")
    axs[1].plot(
        t_sizes,
        t_scores.mean(axis=1),
    )
    axs[1].set_title("Training Scores")
    axs[2].plot(
        t_sizes,
        valid_scores.mean(axis=1),
    )
    axs[2].set_title("Validation Scores")
    plt.ylabel('Error/Score')
    plt.xlabel('Set Size')
    plt.legend()
    fig.tight_layout()
    plt.show()
Example #34
def eval_lc(model, x, y, train_sizes):
    train_sizes, train_scores, test_scores = sm.learning_curve(
        model, x, y, train_sizes=train_sizes, cv=5)
    print(train_scores)
    print(test_scores)
    return train_sizes, train_scores, test_scores
Example #35
info = np.load(op.join(cfg.path_data, 'info_allch.npy')).item()
picks = mne.pick_types(info, meg=meg)

fname = op.join(cfg.path_outputs, 'covs_allch_oas.h5')
covs = mne.externals.h5io.read_hdf5(fname)
subjects = [d['subject'] for d in covs if 'subject' in d]
covs = [d['covs'][:, picks][:, :, picks] for d in covs if 'subject' in d]
X = np.array(covs)
n_sub, n_fb, n_ch, _ = X.shape

part = pd.read_csv(op.join(cfg.path_data, 'participants.csv'))
y = part.set_index('Observations').age.loc[subjects]

common = ProjCommonSpace(scale=scale, n_compo=n_compo, reg=reg)
riemann = Riemann(n_fb=n_fb, metric=metric)
sc = StandardScaler()
ridge = RidgeCV(alphas=np.logspace(-3, 5, 100))
cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

pipe = make_pipeline(common, riemann, sc, ridge)
train_sizes = np.linspace(0.1, 1, 5)
train_sizes, train_scores, test_scores = learning_curve(
                    pipe, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
                    scoring='neg_mean_absolute_error')

scores = {'train_sizes': train_sizes,
          'train_scores': train_scores,
          'test_scores': test_scores}
np.save(op.join(cfg.path_outputs,
        'all_scores_learning_curves.npy'), scores)
Example #36
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        axes=None,
                        ylim=None,
                        cv=None,
                        n_jobs=None,
                        scoring=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        scoring=scoring,
        train_sizes=train_sizes,
        return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training score")
    axes[0].plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes,
                         fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std,
                         alpha=0.1)
    axes[1].set_xlabel("")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt
Exemple #37
0
    plt.ylim(-.65, .0)
    plt.xlim(.5 * train_sizes.min(), train_sizes.max())
    plt.xticks((100, 1000), ('100', '1000'), size=13)
    plt.yticks(())

    plt.ylabel('Error')
    plt.xlabel('Number of samples      ')
    plt.subplots_adjust(left=.07, bottom=.22, top=.99, right=.99)
    plt.savefig(name, edgecolor='none', facecolor='none')


# Degree 9
model = make_pipeline(PolynomialFeatures(degree=9), LinearRegression())
train_sizes, train_scores, test_scores = model_selection.learning_curve(
    model,
    X,
    y,
    cv=model_selection.ShuffleSplit(n_splits=20),
    train_sizes=np.logspace(-2.5, -.3, 30))

idx_to_plot = [0, 7, 19, 29]

for i in idx_to_plot:
    n_train = train_sizes[i]
    if i > 0:
        symbol_train = '--'
        symbol_test = ''
    else:
        symbol_train = 'o'
        symbol_test = 'o'
    plt.figure(figsize=(4.5, 3))
    test_plot = plt.semilogx(train_sizes[:i + 1],
Exemple #38
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        verbose=0):
    '''
    Generate a simple plot of the test and traning learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    '''
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
f1Score = f1_score(y_test, y_pred, average=None)
print('\n\n\n\n', ' f1 score is : ', f1Score)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve

#Coming up with training sizes
train_sizes = [1, 50, 100, 150, 200, 250, 300, 4000, 8000]

train_sizes, training_scores, test_scores = learning_curve(
    DecisionTreeClassifier(criterion='gini',
                           max_depth=1,
                           min_samples_split=2,
                           min_weight_fraction_leaf=0.0,
                           splitter='best'),
    X,
    y,
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle='True')

print('\n\n\nTraining scores:\n\n', training_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', test_scores)

training_scores_mean = -training_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

print('\n\nMean training scores\n\n',
      pd.Series(training_scores_mean, index=train_sizes))
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20),
                        verbose=0,
                        plot=True):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    n_jobs : 并行的的任务数(默认1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="b")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="r")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="b",
                 label=u"训练集上得分")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="r",
                 label=u"交叉验证集上得分")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (
        test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
Exemple #41
0
                                                    test_size=0.2,
                                                    random_state=0)

# Cross validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
# # Generate C param
C = np.arange(1, 10)

# # LEARNING CURVE SCORE
# # Create three different models based on max_depth
for k, C in enumerate(C):
    # Create a Decision tree regressor at max_depth = depth
    regressor = LogisticRegression(C=C)

    # Calculate the training and testing scores
    sizes, train_scores, test_scores = learning_curve(
        regressor, X, y, cv=cv, n_jobs=4, scoring=make_scorer(accuracy_score))

    print('C:', C)
    print('score train:', np.mean(train_scores))
    print('score test:', np.mean(test_scores))

# MODEL COMPLEX SCORE
# Calculate the training and testing scores
C = np.arange(1, 10)
regressor = LogisticRegression()
train_scores, test_scores = validation_curve(
    regressor,
    X,
    y,
    cv=cv,
    param_name='C',
more_scores = precision_recall_fscore_support(y_test,
                                              y_pred_test,
                                              average='weighted')

print('Precision: ', more_scores[0])
print('Recall: ', more_scores[1])

# Define a 10 fold CV with 11 % data of training set (train_temp) for validation
# 11 %, not 10 %,  because the validation split is being used instead of the test split.
cv = ShuffleSplit(n_splits=10, test_size=0.11, random_state=0)

# Plot learning curves with 10-fold CV
train_sizes, train_scores, test_scores = learning_curve(
    estimator=svm,
    X=X_train_temp_centered,
    y=y_train_temp,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    n_jobs=1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = plt.figure()
plt.plot(train_sizes,
         train_mean,
         color='tab:blue',
         marker='o',
         markersize=5,
         label='training accuracy')
Exemple #43
0
def plot_learning_curve(estimator, x, y, file_name=None, **estimator_info):
    '''
    plot the learning curve of a estimator configured with a specific set of parameters
    
    estimator: unfitted estimator with a specific set of parameters already set
    x: numpy array of shape (N, K), with N samples and K features
    y: numpy array of shape (1,), target
    file_name: the file name to save the plot, if None, plot the graph on a window
    estimator_info: dict, details about the estimator and parameter configuration
    
    return:
    None
    '''

    train_sizes, train_scores, test_scores = learning_curve(estimator,
                                                            x,
                                                            y,
                                                            cv=5,
                                                            scoring=ks_scorer,
                                                            n_jobs=6)

    plt.figure()
    plt.title(dict_to_string(**estimator_info))

    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")

    if file_name is not None:
        plt.savefig(file_name)

    else:
        plt.show()

    plt.close()
Exemple #44
0
            features,
            labels,
            param_name='svc__gamma',
            param_range=param_range)
        plt.title("Validation Curve with SVM")
        plt.xlabel("gamma")
        plt.ylabel("Score")
        plt.plot(param_range,
                 validation_scores.mean(axis=1),
                 label='cross-validation')
        plt.plot(param_range, train_scores.mean(axis=1), label='training')
        plt.legend(loc='best')
        plt.show()

    plt.figure()
    train_sizes, train_scores, validation_scores = learning_curve(
        tree_model, features, labels, train_sizes=np.logspace(-1, 0, 20))
    plt.xlabel('Trainging Examples')
    plt.ylabel('Score')
    plt.title('Learning Curve')
    plt.plot(train_sizes,
             validation_scores.mean(axis=1),
             label='cross-validation')
    plt.plot(train_sizes, train_scores.mean(axis=1), label='training')
    plt.legend(loc='best')
    plt.show()

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(0, len(classes)):
        fpr[i], tpr[i], _ = roc_curve(te_lab[:, i], y_p[:, i])
Exemple #45
0
data = pd.read_excel('D:\SVM\\test_all.xlsx')
pre_data = data.iloc[0:, 1:]
X = pre_data.iloc[:, :53]
y = pre_data.iloc[:, 53]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.75)

clf_rbf = svm.SVC(kernel='rbf')
clf_rbf.fit(x_train, y_train.ravel())
print('rbf_train:%.2f' % clf_rbf.score(x_train, y_train))
print('rbf_test:%.2f' % clf_rbf.score(x_test, y_test))

#绘制学习曲线
X_shuffle, y_shuffle = shuffle(X, y)
plt.figure(figsize=(7, 5))
train_sizes, train_scores, test_scores = learning_curve(clf_rbf,X_shuffle,y_shuffle)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color='red', label='Train_Score = %0.2f' % clf_rbf.score(x_train, y_train) )
plt.plot(train_sizes, test_scores_mean, 'o-', color='blue', label='Test_Score = %0.2f' % clf_rbf.score(x_test, y_test))
plt.xlim([0.0, 200.0])
plt.ylim([0.5, 1.2])
plt.legend(loc="lower right")
plt.title("Learning Curve")


#绘制ROC
metrics.f1_score(y_test,clf_rbf.predict(x_test))
fpr,tpr,thresholds=metrics.roc_curve(y_test,clf_rbf.decision_function(x_test),
pos_label=2)
roc_auc = metrics.auc(fpr,tpr)
Exemple #46
0
def plot_learing_curve(pipeline, title):
    size = 10000
    cv = KFold(size, shuffle=True)

    X = dataprep.train_news["Statement"]
    y = dataprep.train_news["Label"]

    pl = pipeline
    pl.fit(X, y)

    train_sizes, train_scores, test_scores = learning_curve(
        pl,
        X,
        y,
        n_jobs=-1,
        cv=cv,
        train_sizes=np.linspace(.1, 1.0, 5),
        verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    plt.legend(loc="best")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.gca().invert_yaxis()

    # box-like grid
    plt.grid()

    # plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")

    # plot the average training and test score lines at each training set size
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    # sizes the window for readability and displays the plot
    # shows error from 0 to 1.1
    plt.ylim(-.1, 1.1)
    plt.show()
Exemple #47
0
def plot_learning_curve2(estimator,
                         fn,
                         title,
                         X,
                         y,
                         ylim=None,
                         cv=None,
                         n_jobs=1,
                         train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=1,
        train_sizes=train_sizes,
        return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training Score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Validation Score")

    plt.legend(loc="best")

    plt.savefig("./%s" % fn)
    plt.close()

    # timing
    time_mean = np.mean(fit_times, axis=1)

    # Draw lines
    plt.plot(train_sizes, time_mean, label="Fit Time")

    # Create plot
    plt.title("Scalability (w/ regards to time)")
    plt.xlabel("Training Set Size"), plt.ylabel("Time"), plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig("./%s_scale" % fn)
    plt.close()

    return train_scores, train_sizes, test_scores, fit_times
Exemple #48
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    绘制学习曲线,用于判断欠拟合与过拟合
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
Exemple #50
0
def plot_learning_curve(clf,
                        X,
                        y,
                        title='Learning Curve',
                        cv=None,
                        train_sizes=None,
                        n_jobs=1,
                        ax=None):
    """Generates a plot of the train and test learning curves for a given classifier.

    Args:
        clf: Classifier instance that implements ``fit`` and ``predict`` methods.

        X (array-like, shape (n_samples, n_features)):
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y (array-like, shape (n_samples) or (n_samples, n_features)):
            Target relative to X for classification or regression;
            None for unsupervised learning.

        title (string, optional): Title of the generated plot. Defaults to "Learning Curve"

        cv (int, cross-validation generator, iterable, optional): Determines the
            cross-validation strategy to be used for splitting.

            Possible inputs for cv are:
              - None, to use the default 3-fold cross-validation,
              - integer, to specify the number of folds.
              - An object to be used as a cross-validation generator.
              - An iterable yielding train/test splits.

            For integer/None inputs, if ``y`` is binary or multiclass,
            :class:`StratifiedKFold` used. If the estimator is not a classifier
            or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        train_sizes (iterable, optional): Determines the training sizes used to plot the
            learning curve. If None, ``np.linspace(.1, 1.0, 5)`` is used.

        n_jobs (int, optional): Number of jobs to run in parallel. Defaults to 1.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> rf = RandomForestClassifier()
        >>> skplt.plot_learning_curve(rf, X, y)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_learning_curve.png
           :align: center
           :alt: Learning Curve
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    if train_sizes is None:
        train_sizes = np.linspace(.1, 1.0, 5)

    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        clf, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    ax.grid()
    ax.fill_between(train_sizes,
                    train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std,
                    alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes,
                    test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std,
                    alpha=0.1,
                    color="g")
    ax.plot(train_sizes,
            train_scores_mean,
            'o-',
            color="r",
            label="Training score")
    ax.plot(train_sizes,
            test_scores_mean,
            'o-',
            color="g",
            label="Cross-validation score")
    ax.legend(loc="best")

    return ax
Exemple #51
0
    plt.fill_between(max_depth2, train_mean2 - train_std2, \
        train_mean2 + train_std2, alpha = 0.15, color = 'r')
    plt.fill_between(max_depth2, test_mean2 - test_std2, \
        test_mean2 + test_std2, alpha = 0.15, color = 'g')

    # Visual aesthetics
    plt.legend(loc = 'lower right')
    plt.xlabel('Maximum Depth')
    plt.ylabel('Score')
    plt.ylim([-0.05,1.05])
    plt.show()
    """
    regressor = DecisionTreeRegressor(max_depth=depth)
    sizes, train_score, test_score = learning_curve(regressor,
                                                    features,
                                                    price,
                                                    train_sizes=train_sizes,
                                                    cv=cv)

    train_std = np.std(train_score, axis=1)
    train_mean = np.mean(train_score, axis=1)
    test_std = np.std(test_score, axis=1)
    test_mean = np.mean(test_score, axis=1)

    ax = fig.add_subplot(2, 2, K + 1)
    ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
    ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
    ax.fill_between(sizes, train_mean - train_std, \
        train_mean + train_std, alpha = 0.15, color = 'r')
    ax.fill_between(sizes, test_mean - test_std, \
        test_mean + test_std, alpha = 0.15, color = 'g')
Exemple #52
0
print(
    "\n\nBest Accuracy Score %f\n Best Parameters %s\n Best Splits %i" %
    (gridResults.best_score_, gridResults.best_params_, gridResults.n_splits_))

from sklearn.model_selection import learning_curve

#Coming up with training sizes
train_sizes = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

#Features=['Mean of the integrated profile',' Standard deviation of the integrated profile',' Excess kurtosis of the integrated profile',' Skewness of the integrated profile',' Mean of the DM-SNR curve',' Standard deviation of the DM-SNR curve',' Excess kurtosis of the DM-SNR curve',' Skewness of the DM-SNR curve']
#target='target_class'

train_sizes, training_scores, test_scores = learning_curve(
    SVC(kernel='rbf', random_state=0, degree=1, shrinking=True, gamma='auto'),
    X,
    y,
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle='True')

print('Training scores:\n\n', training_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', test_scores)

training_scores_mean = -training_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

print('Mean training scores\n\n',
      pd.Series(training_scores_mean, index=train_sizes))
print('\n', '-' * 20)  # separator
print('\nMean test scores\n\n', pd.Series(test_scores_mean, index=train_sizes))
             label="%s (test)" % name)

plt.xscale("log")
plt.yscale("log")
plt.xlabel("Train size")
plt.ylabel("Time (seconds)")
plt.title('Execution Time')
plt.legend(loc="best")

# Visualize learning curves
plt.figure()

svr = SVR(kernel='rbf', C=1e1, gamma=0.1)
kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1)
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)
train_sizes_abs, train_scores_kr, test_scores_kr = \
    learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)

plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r",
         label="SVR")
plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g",
         label="KRR")
plt.xlabel("Train size")
plt.ylabel("Mean Squared Error")
plt.title('Learning curves')
plt.legend(loc="best")

plt.show()
import numpy as np

from sklearn.model_selection import learning_curve

from sklearn.metrics import make_scorer

from sklearn.metrics import matthews_corrcoef

import time

start = time.time()
train_sizes_lc_lgbm, train_scores_lc_lgbm, test_scores_lc_lgbm = learning_curve( estimator = classifier_lgbm,
                                                                                 X = X,
                                                                                 y = Y,
                                                                                 train_sizes = np.linspace(0.1, 1.0, 20),
                                                                                 cv = 10,
                                                                                 scoring = make_scorer(matthews_corrcoef),
                                                                                 shuffle = True,
                                                                                 random_state = 42 )
end = time.time()
print("Tempo de Execução: {:.2f} min".format((end - start)/60))
#Tempo de Execução: 1058.66 min

train_mean_lc_lgbm = np.mean(train_scores_lc_lgbm, axis = 1)
train_std_lc_lgbm = np.std(train_scores_lc_lgbm, axis = 1)
test_mean_lc_lgbm = np.mean(test_scores_lc_lgbm, axis = 1)
test_std_lc_lgbm = np.std(test_scores_lc_lgbm, axis = 1)

plt.figure(figsize = (14, 7))
plt.plot( train_sizes_lc_lgbm, 
          train_mean_lc_lgbm,
Exemple #55
0
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")

    plt.legend(loc="best")
    return plt
Exemple #56
0
def plot_learning_curve(
    estimator,
    X,
    y,
    ylim=None,
    cv=None,
    n_jobs=multiprocessing.cpu_count() - 1,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None,
    title="Learning Curve",
):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=multiprocessing.cpu_count() - 1)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """

    # learning curves in scikit learn
    # https://devdocs.io/scikit_learn/modules/generated/sklearn.model_selection.learning_curve#sklearn.model_selection.learning_curve
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=cv, n_jobs=n_jobs, scoring=scoring
    )

    # https://devdocs.io/scikit_learn/auto_examples/model_selection/plot_learning_curve#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    if ylim is not None:
        plt.ylim(*ylim)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="#00AAAA",
    )
    plt.fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="#AA00AA",
    )
    plt.plot(
        train_sizes, train_scores_mean, "o-", color="#00AAAA", label="Training score"
    )
    plt.plot(
        train_sizes,
        test_scores_mean,
        "o-",
        color="#AA00AA",
        label="Cross-validation score",
    )

    # plt.axis([0, 80, 0, 3])

    plt.legend(loc="best")
    plt.show()
def createLearningCurve(estimator, x, y, cv=None, name="", keras=False, is_estimator=True, scores=None):
    #if the estimator is the keras model
    if keras == True:
        #model already ran
        plt.plot(estimator.history['acc'])
        plt.plot(estimator.history['val_acc'])
        fname = "./results/" + name + '_learning_curve.png'
        plt.savefig(fname)
        return

    if is_estimator:
        train_sizes, train_scores, test_scores = learning_curve(estimator, x, y, cv=cv, train_sizes=np.linspace(.1, 1.0, 5))

    else:
        #read in scores[x[kfolds], y[kfolds], testx[kfolds], test_y[kfolds]]
        train_sizes = scores[0][0]
        test_scores = []
        train_scores = []
        
        '''
        for i in range(len(scores)):
            train_scores.append(scores[i][1])
            test_scores.append(scores[i][3])
        '''
        time_train = []
        time_test = []
        for i in range(len(scores)):
            #rolls over all train scores of ith run
            for j in range(len(scores[i][1])):
                #create time array 
                #add first time score
                if i == 0:
                    time_train.append([scores[i][1][j]])
                else:
                    train_scores[j].append(scores[i][1][j])
                
            
            #for 1 to batchNum test scores
            for j in range(len(scores[i][3])):
                #add first time score
                if i == 0:
                    time_test.append([scores[i][3][j]])
                else:
                    test_scores[j].append(scores[i][3][j])

            if i == 0:
                #two arrays of batchNum arrays each
                train_scores = time_train
                test_scores = time_test
        

    #create plots
    _, plots = plt.subplots(figsize=(20, 5))

    #set axis names 
    plots.set_xlabel("Training examples")
    plots.set_ylabel("Score")

    #plot curves
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    #fit_times_mean = np.mean(fit_times, axis=1)
    #fit_times_std = np.std(fit_times, axis=1)

    plots.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plots.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")

    #plot train
    plots.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    #plot test
    plots.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross validation score")
    fname = "./results/" + name + '_learning_curve.png'
    plt.savefig(fname)

    plots.legend(loc="best")

    
'''
## Adjust
parameters = {#'n_estimators': [10,20,30,40,50,60,70,80,90,100,120,140,160,180,200]}
              #'min_samples_leaf': [ 3, 10,20,30,40,50,60,70,80,90,100,120,140,160,180,200,300,400,500]}
              #'alpha': [0.1, 0.3, 0.6, 0.9]
               #'max_features':[30,32,34,36,38,40,42,44,46,48,50]
                'max_depth':[i for i in range(10,200)]  }  # 定义要优化的参数信息
model_gs = GridSearchCV(estimator=vr, param_grid=parameters, cv=10)
model_gs.fit(X_train,y_train)
print(model_gs.best_params_, model_gs.best_score_)
'''


# learning_curve
train_sizes, train_scores, test_scores = learning_curve(estimator=gbr, X=X_train, y=y_train,
                                                        train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='test accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.01, 1.0])
Exemple #59
0
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
Exemple #60
0
#dataset = datasets.load_diabetes()
# fit a model to the data
import pandas as pd
mydata = pd.read_csv('winequality-red.csv')
dataset = mydata
dataset.target = mydata["quality"]
#provided your csv has header row, and the label column is named "Label"

#select all but the last column as data
dataset.data = mydata.ix[:, :-1]
model = ensemble.AdaBoostClassifier()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))

train_sizes, train_scores, valid_scores = learning_curve(
    model,
    dataset.data,
    dataset.target,
    train_sizes=[100, 200, 300, 400, 500, 600, 700, 800],
    cv=5)
train_sizes
print(train_scores)
print(valid_scores)