Example 1
import argparse
import csv

import numpy
import pandas
import sklearn.ensemble
import sklearn.model_selection


# clean_data, select_features, learning_curve, print_tuning_scores and
# print_feature_importances are helper functions defined elsewhere in the
# source module.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("training_data", help="Training data (CSV)")
    parser.add_argument("testing_data", help="Testing data (CSV)")
    parser.add_argument("learning_curve_file", help="Learning curve (PNG)")
    parser.add_argument("output", help="Output predictions (CSV)")
    args = parser.parse_args()

    training_data = pandas.read_csv(args.training_data, header=0)
    test_data = pandas.read_csv(args.testing_data, header=0)
    all_data = pandas.concat([training_data, test_data])

    clean_data(all_data)

    # rows with a Survived label are the training set; unlabeled rows are the test set
    training_data = all_data[all_data.Survived.notnull()]
    test_data = all_data[all_data.Survived.isnull()]

    training_x, training_y, columns = select_features(training_data)

    # cross-validation splitter for the classifier
    # (StratifiedShuffleSplit lives in sklearn.model_selection as of
    # scikit-learn 0.18; the old sklearn.cross_validation module is gone)
    split_iterator = sklearn.model_selection.StratifiedShuffleSplit(n_splits=10, random_state=4)

    # learning curve with default settings
    learning_curve(training_x, training_y, args.learning_curve_file)

    print("Hyperparameter tuning")
    base_classifier = sklearn.ensemble.RandomForestClassifier(100, oob_score=True, random_state=13)
    parameter_space = {
        "max_features": [None, "sqrt", 0.5, training_x.shape[1] - 1, training_x.shape[1] - 2, training_x.shape[1] - 3,
                         training_x.shape[1] - 4, training_x.shape[1] - 5],
        "min_samples_split": [3, 5, 10, 20],
        "min_samples_leaf": [1, 2, 3]
    }
    # drop any integer feature counts that the subtractions above pushed to zero or below
    parameter_space["max_features"] = [n for n in parameter_space["max_features"]
                                       if not isinstance(n, int) or n > 0]
    tuned_classifier = sklearn.model_selection.GridSearchCV(base_classifier, parameter_space, n_jobs=-1,
                                                            cv=split_iterator, refit=True)
    tuned_classifier.fit(training_x, training_y)
    print_tuning_scores(tuned_classifier)

    print_feature_importances(columns, tuned_classifier)

    training_predictions = tuned_classifier.predict(training_x)
    diffs = training_predictions - training_y
    print("Training accuracy: {:.3f}".format(1. - numpy.abs(diffs).mean()))

    ids = test_data.PassengerId.values
    test_x, _, _ = select_features(test_data)
    test_predictions = tuned_classifier.predict(test_x)

    # newline="" prevents blank rows in csv output on Windows
    with open(args.output, "w", newline="") as csv_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerow(["PassengerId", "Survived"])
        csv_writer.writerows(zip(ids, test_predictions.astype(int)))
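The script delegates plotting to a learning_curve(x, y, filename) helper that isn't shown. A minimal sketch of what such a helper could look like; the estimator, CV setting, and plot layout inside are assumptions, not the original code.

import matplotlib
matplotlib.use('Agg')  # headless backend so savefig works without a display
import matplotlib.pyplot as plt
import numpy as np
import sklearn.ensemble
import sklearn.model_selection

def learning_curve(x, y, filename):
    # assumed internals: score a random forest over growing training subsets
    train_sizes, train_scores, test_scores = sklearn.model_selection.learning_curve(
        sklearn.ensemble.RandomForestClassifier(100), x, y, cv=5, n_jobs=-1)
    plt.figure()
    plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='training score')
    plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='cross-validation score')
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.savefig(filename)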
Example 2
def main():
    args = parse_args()

    logging.basicConfig(level=logging.INFO)

    original_data = pandas.read_csv(args.input, header=0)
    data = preprocess_features(original_data)

    if args.save_matrix:
        data.to_csv(args.save_matrix)

    global N_JOBS
    N_JOBS = args.n_jobs

    X, y = dataframe_to_ndarrays(data)
    check_data(X, y)

    # NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; modern code
    # uses sklearn.model_selection.StratifiedShuffleSplit(n_splits=5, random_state=4)
    cross_val_splits = sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=5, random_state=4)

    if args.decision_tree:
        decision_tree(X, y, data, cross_val_splits)

    if args.logistic:
        logistic_regression_cv(X, y, data, cross_val_splits)

    if args.learning_curve:
        learning_curve(X, y, args.learning_curve, sklearn.ensemble.RandomForestClassifier(100), "Random Forest Classifier")

    if args.forest:
        random_forest(X, y, data, cross_val_splits)

    if args.predictability:
        predictability_tests(X, y, original_data)

    if args.elastic:
        elastic_net(X, y, cross_val_splits)

    if args.xg:
        gradient_boosting_exp(X, y, data, cross_val_splits)

    if args.xg_hybrid:
        gradient_boosting_exp(X, y, data, cross_val_splits, base_classifier=classifiers.LogisticRegressionCVWrapper(5, solver="lbfgs"))

    if args.neural_network:
        neural_network(X, y, data, cross_val_splits)

    if args.ensemble:
        ensemble(X, y, cross_val_splits)

    if args.logistic_ensemble:
        logistic_ensemble(X, y, cross_val_splits)
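main() leans on a parse_args() helper that isn't shown. Below is a minimal sketch reconstructed from the attributes main() reads; the flag spellings, help strings, and defaults are assumptions.

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Input data (CSV)')
    parser.add_argument('--save-matrix', help='Optionally save the preprocessed feature matrix (CSV)')
    parser.add_argument('--n-jobs', type=int, default=1, help='Parallel jobs for scikit-learn')
    parser.add_argument('--learning-curve', help='Learning curve graph (PNG)')
    # one on/off switch per experiment branch in main()
    for flag in ('decision-tree', 'logistic', 'forest', 'predictability', 'elastic',
                 'xg', 'xg-hybrid', 'neural-network', 'ensemble', 'logistic-ensemble'):
        parser.add_argument('--' + flag, action='store_true')
    return parser.parse_args()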
Example 3
def learning_curve_analysis(estimator=None, issues_train=None, priority_train=None):
    """
    Generates the learning curve for a specific estimator.
    :param estimator: Estimator.
    :param issues_train: Standardized train set.
    :param priority_train: Priorities for train set.
    :return: None.
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator=estimator, X=issues_train,
                                                            y=priority_train,
                                                            train_sizes=np.linspace(0.1, 1.0, 10),
                                                            cv=10,
                                                            n_jobs=1)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    _, _ = plt.subplots(figsize=(2.5, 2.5))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')

    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()
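A usage sketch for learning_curve_analysis, assuming the learning_curve it calls is sklearn.model_selection.learning_curve; the dataset and estimator below are illustrative.

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

iris = load_iris()
issues_train = StandardScaler().fit_transform(iris.data)  # a "standardized train set"
learning_curve_analysis(estimator=SVC(), issues_train=issues_train,
                        priority_train=iris.target)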
Example 4
def plot_learning_curve(X, y, model_type, algorithm, metric, transforms):
    """
    Plots a learning curve showing model performance against both training and
    validation data sets as a function of the number of training samples.
    """
    model = define_model(model_type, algorithm)
    X = apply_transforms(X, transforms)

    t0 = time.time()
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.set_title('Learning Curve')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                    alpha=0.1, color='r')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                    alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    # plot the validation curve in a second color so the two curves are distinguishable
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    ax.legend(loc='best')
    fig.tight_layout()
    t1 = time.time()
    print('Learning curve generated in {0:.3f} s.'.format(t1 - t0))
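plot_learning_curve relies on define_model and apply_transforms helpers that are not shown. A minimal sketch of plausible stand-ins; the (model_type, algorithm) mapping and the transform protocol are assumptions, not the original implementation.

import sklearn.ensemble

def define_model(model_type, algorithm):
    # hypothetical: map a (task, algorithm) pair to a scikit-learn estimator
    if model_type == 'classification' and algorithm == 'forest':
        return sklearn.ensemble.RandomForestClassifier(100)
    raise ValueError('unsupported combination: {} / {}'.format(model_type, algorithm))

def apply_transforms(X, transforms):
    # hypothetical: apply a sequence of scikit-learn transformers in order
    for transform in transforms:
        X = transform.fit_transform(X)
    return X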
Example 5
def plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric):
    """
    Plots a learning curve showing model performance against both training and
    validation data sets as a function of the number of training samples.
    """
    model = define_model(algorithm)
    X = apply_transforms(X, scaler, pca, selector)

    t0 = time.time()
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.set_title('Learning Curve')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                    alpha=0.1, color='r')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                    alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    # plot the validation curve in a second color so the two curves are distinguishable
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    ax.legend(loc='best')
    fig.tight_layout()
    t1 = time.time()
    print('Learning curve generated in {0:.3f} s.'.format(t1 - t0))
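A usage sketch for this variant; note that its define_model takes only an algorithm name and apply_transforms receives the scaler, PCA, and selector directly, so it differs from the helpers sketched after Example 4. The call below is shaped by the signature alone, and every object passed is illustrative.

from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler

data = load_breast_cancer()
plot_learning_curve(data.data, data.target, algorithm='forest',
                    scaler=StandardScaler(), pca=PCA(n_components=10),
                    selector=SelectKBest(k=5), metric='accuracy')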
Example 6
    def learnCurve(self, modelEstimator, title, data, label, cv=None, train_sizes=np.linspace(0.1, 1.0, 5)):
        '''
        :param modelEstimator: the model/algorithm you choose
        :param title: plot title
        :param data: training data, numpy array style
        :param label: target vector
        :param cv: cross-validation splitting strategy
        :param train_sizes: relative sizes of the training subsets
        :return: None (shows the figure)
        '''

        train_sizes, train_scores, test_scores = \
            learning_curve(modelEstimator, data, label, cv=cv, train_sizes=train_sizes)

        # the key scoring step: average the cross-validated scores at each training-set size
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        plt.figure()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color='b')
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color='g')
        plt.plot(train_sizes, train_scores_mean, 'o-', color='b', label='training score')
        plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='cross valid score')
        plt.xlabel('training examples')
        plt.ylabel('score')
        plt.legend(loc='best')
        plt.grid(True)
        plt.title(title)
        plt.show()
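A usage sketch; learnCurve is a method, so this assumes some plotting helper class (named hypothetically below), and the dataset and estimator are illustrative.

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit

wine = load_wine()
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
plotter = PlotHelper()  # hypothetical class that defines learnCurve
plotter.learnCurve(RandomForestClassifier(100), 'Random forest learning curve',
                   wine.data, wine.target, cv=cv)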
Example 7
def learning_curve_analysis(estimator=None,
                            issues_train=None,
                            priority_train=None):
    """
    Generates the learning curve for a specific estimator.
    :param estimator: Estimator.
    :param issues_train: Standardized train set.
    :param priority_train: Priorities for train set.
    :return: None.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator,
        X=issues_train,
        y=priority_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        n_jobs=1)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    _, _ = plt.subplots(figsize=(2.5, 2.5))
    plt.plot(train_sizes,
             train_mean,
             color='blue',
             marker='o',
             markersize=5,
             label='training accuracy')
    plt.fill_between(train_sizes,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15,
                     color='blue')

    plt.plot(train_sizes,
             test_mean,
             color='green',
             linestyle='--',
             marker='s',
             markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15,
                     color='green')

    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()
Example 8
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    n_jobs : 并行的的任务数(默认1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()  # flip the y-axis for display; restored below after show()
        plt.grid()

        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Cross-validation score")

        plt.legend(loc="best")

        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()  # restore the default axis orientation

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
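A usage sketch, assuming the learning_curve called above is sklearn.model_selection.learning_curve. The returned midpoint and diff summarize where the final training and cross-validation bands sit and how far apart they are; a large diff hints at overfitting. The dataset and estimator are illustrative.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

X, y = load_digits(return_X_y=True)
midpoint, diff = plot_learning_curve(LogisticRegression(max_iter=1000),
                                     'Learning curve (logistic regression)', X, y, cv=5)
print('midpoint={:.3f}, train/CV gap={:.3f}'.format(midpoint, diff))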
Example 9
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
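A usage sketch in the style of the scikit-learn documentation example this function mirrors; the dataset, estimator, and CV strategy are illustrative.

from sklearn.datasets import load_digits
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import GaussianNB

X, y = load_digits(return_X_y=True)
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
plot_learning_curve(GaussianNB(), "Learning Curves (Naive Bayes)", X, y,
                    ylim=(0.7, 1.01), cv=cv, n_jobs=4)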
Example 10
get_ipython().magic(u'pinfo sklearn.svm.SVC')
help(sklearn.svm.SVC)
X_test.max()
X_test.min()
from sklearn.learning_curve import learning_curve
get_ipython().magic(u'pinfo learning_curve')
estimator = GaussianNB()
estimator.fit(X_train, y_train)
learning_curve(estimator, data.data, data.target)
print(estimator)
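A self-contained version of what the session above was working toward. The dataset is an assumption (the log never shows where `data` came from), and the import path is the modern one: sklearn.learning_curve was folded into sklearn.model_selection in scikit-learn 0.18 and removed in 0.20.

from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.naive_bayes import GaussianNB

data = load_digits()  # assumption: stands in for whatever `data` was in the session
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)

estimator = GaussianNB()
estimator.fit(X_train, y_train)

# learning_curve returns (train_sizes_abs, train_scores, test_scores)
train_sizes, train_scores, test_scores = learning_curve(estimator, data.data, data.target, cv=5)
print(estimator)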