def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("training_data", help="Training data (CSV)")
    parser.add_argument("testing_data", help="Testing data (CSV)")
    parser.add_argument("learning_curve_file", help="Learning curve (PNG)")
    parser.add_argument("output", help="Output predictions (CSV)")
    args = parser.parse_args()

    training_data = pandas.read_csv(args.training_data, header=0)
    test_data = pandas.read_csv(args.testing_data, header=0)

    # Clean the combined data, then split it back apart on the Survived label.
    all_data = pandas.concat([training_data, test_data])
    clean_data(all_data)
    training_data = all_data[all_data.Survived.notnull()]
    test_data = all_data[all_data.Survived.isnull()]

    training_x, training_y, columns = select_features(training_data)

    # cross-validate the classifier
    split_iterator = sklearn.model_selection.StratifiedShuffleSplit(n_splits=10, random_state=4)

    # learning curve with default settings
    learning_curve(training_x, training_y, args.learning_curve_file)

    print("Hyperparameter tuning")
    base_classifier = sklearn.ensemble.RandomForestClassifier(100, oob_score=True, random_state=13)
    parameter_space = {
        "max_features": [None, "sqrt", 0.5, training_x.shape[1] - 1, training_x.shape[1] - 2,
                         training_x.shape[1] - 3, training_x.shape[1] - 4, training_x.shape[1] - 5],
        "min_samples_split": [3, 5, 10, 20],
        "min_samples_leaf": [1, 2, 3]
    }
    # Drop any numeric max_features candidates that went nonpositive for narrow
    # feature matrices; keep None and string options like "sqrt".
    parameter_space["max_features"] = [n for n in parameter_space["max_features"]
                                       if not isinstance(n, (int, float)) or n > 0]
    tuned_classifier = sklearn.model_selection.GridSearchCV(base_classifier, parameter_space,
                                                            n_jobs=-1, cv=split_iterator, refit=True)
    tuned_classifier.fit(training_x, training_y)

    print_tuning_scores(tuned_classifier)
    print_feature_importances(columns, tuned_classifier)

    training_predictions = tuned_classifier.predict(training_x)
    diffs = training_predictions - training_y
    print("Training accuracy: {:.3f}".format(1. - numpy.abs(diffs).mean()))

    ids = test_data.PassengerId.values
    test_x, _, _ = select_features(test_data)
    test_predictions = tuned_classifier.predict(test_x)

    with open(args.output, "w", newline="") as csv_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerow(["PassengerId", "Survived"])
        csv_writer.writerows(zip(ids, test_predictions.astype(int)))
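# A minimal sketch of how this script would be invoked; the module filename
# (titanic.py) is an assumption, not from the original project:
#
#     python titanic.py train.csv test.csv learning_curve.png predictions.csv
if __name__ == "__main__":
    main()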
def main():
    args = parse_args()
    logging.basicConfig(level=logging.INFO)

    original_data = pandas.read_csv(args.input, header=0)
    data = preprocess_features(original_data)

    if args.save_matrix:
        data.to_csv(args.save_matrix)

    global N_JOBS
    N_JOBS = args.n_jobs

    X, y = dataframe_to_ndarrays(data)
    check_data(X, y)

    cross_val_splits = sklearn.model_selection.StratifiedShuffleSplit(n_splits=5, random_state=4)

    if args.decision_tree:
        decision_tree(X, y, data, cross_val_splits)
    if args.logistic:
        logistic_regression_cv(X, y, data, cross_val_splits)
    if args.learning_curve:
        learning_curve(X, y, args.learning_curve,
                       sklearn.ensemble.RandomForestClassifier(100),
                       "Random Forest Classifier")
    if args.forest:
        random_forest(X, y, data, cross_val_splits)
    if args.predictability:
        predictability_tests(X, y, original_data)
    if args.elastic:
        elastic_net(X, y, cross_val_splits)
    if args.xg:
        gradient_boosting_exp(X, y, data, cross_val_splits)
    if args.xg_hybrid:
        gradient_boosting_exp(X, y, data, cross_val_splits,
                              base_classifier=classifiers.LogisticRegressionCVWrapper(5, solver="lbfgs"))
    if args.neural_network:
        neural_network(X, y, data, cross_val_splits)
    if args.ensemble:
        ensemble(X, y, cross_val_splits)
    if args.logistic_ensemble:
        logistic_ensemble(X, y, cross_val_splits)
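# A sketch of the parse_args helper that main() above assumes; the flag names are
# inferred from the args attributes used there, and the defaults are assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Input data (CSV)")
    parser.add_argument("--save-matrix", help="Optional path to save the feature matrix (CSV)")
    parser.add_argument("--n-jobs", type=int, default=1, help="Parallel jobs for model fitting")
    parser.add_argument("--learning-curve", help="Learning curve output file (PNG)")
    # Boolean switches selecting which experiments to run; argparse converts
    # dashes to underscores, so --decision-tree becomes args.decision_tree.
    for flag in ["decision-tree", "logistic", "forest", "predictability", "elastic",
                 "xg", "xg-hybrid", "neural-network", "ensemble", "logistic-ensemble"]:
        parser.add_argument("--" + flag, action="store_true")
    return parser.parse_args()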
def learning_curve_analysis(estimator=None, issues_train=None, priority_train=None):
    """
    Generates the learning curve for a specific estimator.

    :param estimator: Estimator.
    :param issues_train: Standardized train set.
    :param priority_train: Priorities for the train set.
    :return: None.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator, X=issues_train, y=priority_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    _, _ = plt.subplots(figsize=(2.5, 2.5))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5,
             label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                     alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()
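# A minimal usage sketch on synthetic data; the decision-tree classifier and the
# make_classification parameters are assumptions chosen for illustration, not
# part of the original project.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

issues_train, priority_train = make_classification(n_samples=500, n_features=10, random_state=0)
learning_curve_analysis(estimator=DecisionTreeClassifier(max_depth=4),
                        issues_train=issues_train, priority_train=priority_train)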
def plot_learning_curve(X, y, model_type, algorithm, metric, transforms):
    """
    Plots a learning curve showing model performance against both training and
    validation data sets as a function of the number of training samples.
    """
    model = define_model(model_type, algorithm)
    X = apply_transforms(X, transforms)

    t0 = time.time()
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric,
                                                            cv=3, n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.set_title('Learning Curve')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1, color='r')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    ax.legend(loc='best')
    fig.tight_layout()

    t1 = time.time()
    print('Learning curve generated in {0:.3f} s.'.format(t1 - t0))
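# A hedged usage sketch for the variant above; define_model and apply_transforms
# are project helpers not shown here, so the 'classification'/'forest' argument
# values and the empty transform list are assumptions.
plot_learning_curve(X, y, 'classification', 'forest', metric='accuracy', transforms=[])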
def plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric):
    """
    Plots a learning curve showing model performance against both training and
    validation data sets as a function of the number of training samples.
    """
    model = define_model(algorithm)
    X = apply_transforms(X, scaler, pca, selector)

    t0 = time.time()
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric,
                                                            cv=3, n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.set_title('Learning Curve')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1, color='r')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    ax.legend(loc='best')
    fig.tight_layout()

    t1 = time.time()
    print('Learning curve generated in {0:.3f} s.'.format(t1 - t0))
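# Usage sketch for this second variant; the 'svm' algorithm name and the None
# preprocessing stages are assumptions. Note that this definition shadows the
# preceding one if both are loaded into the same module.
plot_learning_curve(X, y, algorithm='svm', scaler=None, pca=None, selector=None, metric='accuracy')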
def learnCurve(self, modelEstimator, title, data, label, cv=None,
               train_sizes=np.linspace(0.1, 1.0, 5)):
    '''
    :param modelEstimator: the model/algorithm you chose
    :param title: plot title
    :param data: training data, numpy-array style
    :param label: target vector
    :param cv: cross-validation splitting strategy
    :param train_sizes: fractions of the training set used for each curve point
    :return: the figure
    '''
    # learning_curve is the key scoring function here.
    train_sizes, train_scores, test_scores = \
        learning_curve(modelEstimator, data, label, cv=cv, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='b')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='b', label='training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='cross-validation score')
    plt.xlabel('training examples')
    plt.ylabel('score')
    plt.legend(loc='best')
    plt.grid(True)
    plt.title(title)
    plt.show()
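# A minimal sketch of calling learnCurve; the enclosing class is not shown, so
# the Trainer name and the logistic-regression estimator are hypothetical.
from sklearn.linear_model import LogisticRegression

trainer = Trainer()  # hypothetical class that hosts learnCurve
trainer.learnCurve(LogisticRegression(max_iter=1000),
                   'Logistic regression learning curve',
                   X_train, y_train, cv=5)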
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Plot the learning curve of the given data on a model.

    Parameters
    ----------
    estimator : the classifier you use
    title : title of the chart
    X : input features, numpy array
    y : input target vector
    ylim : tuple (ymin, ymax) setting the lowest and highest points of the y axis
    cv : number of folds when cross-validating; one fold serves as the validation
         set and the remaining n-1 folds as training (default 3)
    n_jobs : number of parallel jobs (default 1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Number of training samples")
        plt.ylabel("Score")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Score on training set")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Score on cross-validation set")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - \
           (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
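# A usage sketch: the returned midpoint/diff pair summarizes the gap between the
# final training and cross-validation scores, handy for comparing models
# numerically. The random-forest estimator here is an assumption for illustration.
from sklearn.ensemble import RandomForestClassifier

midpoint, diff = plot_learning_curve(RandomForestClassifier(n_estimators=100),
                                     "Learning curve", X, y, cv=5, n_jobs=-1)
print("midpoint={:.3f}, train/cv gap={:.3f}".format(midpoint, diff))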
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
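# A usage sketch with an explicit ShuffleSplit cross-validation strategy; the SVC
# estimator, split parameters, and y-axis limits are assumptions chosen for
# illustration.
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plot_learning_curve(SVC(gamma=0.001), "Learning Curves (SVM, RBF kernel)", X, y,
                    ylim=(0.7, 1.01), cv=cv, n_jobs=4)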
# Cleaned-up remnant of an interactive IPython session: after inspecting
# sklearn.svm.SVC and the data ranges, it fits Gaussian naive Bayes and computes
# a learning curve for it. The deprecated sklearn.learning_curve import is
# replaced by its sklearn.model_selection successor.
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB

X_test.max()
X_test.min()

estimator = GaussianNB()
estimator.fit(X_train, y_train)
train_sizes, train_scores, test_scores = learning_curve(estimator, data.data, data.target)
print(estimator)