def run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train,
                                               rp_x_train, variance_x_train):
    classifier = MLPClassifier(hidden_layer_sizes=(25,),
                               activation='logistic',
                               max_iter=5000,
                               solver='adam')
    cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
    ann_data = standard_scaler.fit_transform(data[features])
    plot_learning_curve("{}- DR - ANN Learning Curve".format(plot_name),
                        classifier,
                        "{}- DR - ANN Learning Curve Simple".format(plot_name),
                        ann_data,
                        data[target],
                        ylim=(0, 1),
                        cv=cv,
                        n_jobs=4)
    plot_learning_curve("{}-PCA DR - ANN Learning Curve".format(plot_name),
                        classifier,
                        "{}- PCA DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(pca_x_train),
                        data[target],
                        ylim=(0, 1),
                        cv=cv,
                        n_jobs=4)
    plot_learning_curve("{}-ICA DR - ANN Learning Curve".format(plot_name),
                        classifier,
                        "{}- ICA DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(ica_x_train),
                        data[target],
                        ylim=(0, 1),
                        cv=cv,
                        n_jobs=4)
    plot_learning_curve("{}-RP DR - ANN Learning Curve".format(plot_name),
                        classifier,
                        "{}- RP DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(rp_x_train),
                        data[target],
                        ylim=(0, 1),
                        cv=cv,
                        n_jobs=4)
    plot_learning_curve(
        "{}-Variance filter DR - ANN Learning Curve".format(plot_name),
        classifier,
        "{}- Variance filter DR - ANN Learning Curve".format(plot_name),
        standard_scaler.fit_transform(variance_x_train),
        data[target],
        ylim=(0, 1),
        cv=cv,
        n_jobs=4)
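
# The reduced inputs above (pca_x_train, ica_x_train, rp_x_train,
# variance_x_train) are not built in this snippet. A minimal sketch of how they
# could be produced with standard scikit-learn transformers; the component
# counts and the variance threshold below are illustrative assumptions, not
# values taken from this code.
def build_reduced_training_sets(scaled_x, n_components=10):
    from sklearn.decomposition import PCA, FastICA
    from sklearn.random_projection import GaussianRandomProjection
    from sklearn.feature_selection import VarianceThreshold
    pca_x = PCA(n_components=n_components).fit_transform(scaled_x)
    ica_x = FastICA(n_components=n_components).fit_transform(scaled_x)
    rp_x = GaussianRandomProjection(
        n_components=n_components).fit_transform(scaled_x)
    variance_x = VarianceThreshold(threshold=0.1).fit_transform(scaled_x)
    return pca_x, ica_x, rp_x, variance_x
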
def clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train,
                               variance_x_train):
    ica_kmeans_k = 6
    ica_em_k = 5
    pca_em_k = 5
    pca_kmeans_k = 7
    rp_em_k = 6
    rp_kmeans_k = 7
    variance_filter_em_k = 13
    variance_filter_kmeans_k = 6
    cluster_params1 = simple_clustering(
        "{}- ICA".format(plot_name), ica_x_train, ica_kmeans_k, ica_em_k,
        base_experiment.identify_top_2_features(ica_x_train))
    cluster_params2 = simple_clustering(
        "{}- PCA".format(plot_name), pca_x_train, pca_kmeans_k, pca_em_k,
        base_experiment.identify_top_2_features(pca_x_train))
    cluster_params3 = simple_clustering(
        "{}- Random Projection".format(plot_name), rp_x_train, rp_kmeans_k,
        rp_em_k, base_experiment.identify_top_2_features(rp_x_train))
    cluster_params4 = simple_clustering(
        "{}- Variance Filtering".format(plot_name), variance_x_train,
        variance_filter_kmeans_k, variance_filter_em_k,
        base_experiment.identify_top_2_features(variance_x_train))
    final_cluster_array = np.concatenate(
        (cluster_params1, cluster_params2, cluster_params3, cluster_params4),
        axis=1)
    cluster_features = pd.DataFrame(data=final_cluster_array)
    classifier = MLPClassifier(hidden_layer_sizes=(25,),
                               activation='logistic',
                               max_iter=5000,
                               solver='adam')
    cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
    plot_learning_curve(
        "{}-Cluster Features - ANN Learning Curve".format(plot_name),
        classifier,
        "{}- Cluster Features - ANN Learning Curve".format(plot_name),
        standard_scaler.fit_transform(cluster_features),
        data[target],
        ylim=(0, 1),
        cv=cv,
        n_jobs=4)
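
# simple_clustering is defined elsewhere in this project; the arrays it returns
# are assumed to be per-sample cluster assignments. A minimal sketch of how
# such cluster-label feature columns could be built with scikit-learn
# (illustrative only, not the original helper):
def cluster_label_columns(x_train, kmeans_k, em_k, random_state=0):
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.mixture import GaussianMixture
    kmeans_labels = KMeans(n_clusters=kmeans_k,
                           random_state=random_state).fit_predict(x_train)
    em_labels = GaussianMixture(
        n_components=em_k, random_state=random_state).fit(x_train).predict(x_train)
    # Stack the two label vectors as columns so they can serve as new features.
    return np.column_stack((kmeans_labels, em_labels))
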
Example #3
def train(X,
          y,
          config,
          max_epochs,
          batch_iterator='BatchIterator',
          pretrained_model=None,
          name=None,
          debug=True):
    # print globals()['net_name']
    global net_name
    sample = 500 if debug else X.shape[0]
    X_t, y_t = X[:sample], y[:sample, :]
    param_dump_folder = './model_%s' % (m_time.strftime("%m_%d_%H_%M_%S")
                                        if name is None else name)

    print('Model name: %s' % param_dump_folder)
    print('Debug mode:', debug)
    # Load the net and add a function to save the params after every epoch
    nnet = globals()[config](batch_iterator, max_epochs)
    func_save_model = lambda n, h: save_model_params(n, h, param_dump_folder,
                                                     debug)
    nnet.on_epoch_finished.append(func_save_model)
    func_learning_curve = lambda n, h: plot_learning_curve(
        n, h, param_dump_folder, debug)
    nnet.on_epoch_finished.append(func_learning_curve)
    # func_viz_weights = lambda n, h: plot_weight_matrix_grid(
    #     n, h, param_dump_folder, debug)
    # nnet.on_epoch_finished.append(func_viz_weights)

    print('Config: %s' % config)
    print('Max num epochs: %d' % nnet.max_epochs)
    print('Dataset loaded, shape:', X_t.shape, y_t.shape)
    if pretrained_model is not None:
        print('Loading pretrained model %s ...' % pretrained_model)
        with open(pretrained_model, 'rb') as f:
            pretrained_weights = pickle.load(f)
        nnet.load_params_from(pretrained_weights)
        print('Finished loading')
    # Train
    if not debug:
        if not os.path.exists(param_dump_folder):
            os.mkdir(param_dump_folder)
    try:
        nnet.fit(X_t, y_t)
    except KeyboardInterrupt:
        pass
    if not debug:
        nnet.save_params_to(os.path.join(param_dump_folder, 'model_final.pkl'))
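
# save_model_params is not shown in this snippet. A minimal sketch of such an
# on_epoch_finished callback, assuming nolearn's NeuralNet interface (each
# callback receives the net and its train_history after every epoch); the file
# naming below is an illustrative assumption.
def save_model_params(nnet, train_history, folder, debug):
    if debug:
        return
    # train_history is a list of per-epoch dicts; the last entry is the epoch
    # that just finished.
    epoch = train_history[-1]['epoch']
    nnet.save_params_to(os.path.join(folder, 'model_epoch_%04d.pkl' % epoch))
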
Example #4
def train(X, y, config, max_epochs, batch_iterator='BatchIterator',
          pretrained_model=None, name=None, debug=True):
    # print globals()['net_name']
    global net_name
    sample = 500 if debug else X.shape[0]
    X_t, y_t = X[:sample], y[:sample, :]
    param_dump_folder = './model_%s' % (m_time.strftime(
        "%m_%d_%H_%M_%S") if name is None else name)

    print('Model name: %s' % param_dump_folder)
    print('Debug mode:', debug)
    # Load the net and add a function to save the params after every epoch
    nnet = globals()[config](batch_iterator, max_epochs)
    func_save_model = lambda n, h: save_model_params(
        n, h, param_dump_folder, debug)
    nnet.on_epoch_finished.append(func_save_model)
    func_learning_curve = lambda n, h: plot_learning_curve(
        n, h, param_dump_folder, debug)
    nnet.on_epoch_finished.append(func_learning_curve)
    # func_viz_weights = lambda n, h: plot_weight_matrix_grid(
    #     n, h, param_dump_folder, debug)
    # nnet.on_epoch_finished.append(func_viz_weights)

    print('Config: %s' % config)
    print('Max num epochs: %d' % nnet.max_epochs)
    print('Dataset loaded, shape:', X_t.shape, y_t.shape)
    if pretrained_model is not None:
        print('Loading pretrained model %s ...' % pretrained_model)
        with open(pretrained_model, 'rb') as f:
            pretrained_weights = pickle.load(f)
        nnet.load_params_from(pretrained_weights)
        print('Finished loading')
    # Train
    if not debug:
        if not os.path.exists(param_dump_folder):
            os.mkdir(param_dump_folder)
    try:
        nnet.fit(X_t, y_t)
    except KeyboardInterrupt:
        pass
    if not debug:
        nnet.save_params_to(os.path.join(
            param_dump_folder, 'model_final.pkl'))
Example #5
                    'criterion': 'gini',
                    'max_depth': 3,
                    'min_samples_split': 2
                })
}

# Running code with default values
plt = print_results(result_dict)
#plt.show()
plt.savefig(fig_path + 'results.png')

title = "Learning Curves for Decision Tree"
plt = plot_learning_curve(DecisionTreeClassifier(criterion='gini',
                                                 max_depth=3,
                                                 min_samples_split=2),
                          'Survived',
                          FEATURES,
                          titanic_df,
                          title,
                          ylim=(0.4, 1.01))
#plt.show()
plt.savefig(fig_path + 'learning_curve_dt.png')

title = "Learning Curves for Neural Networks"
plt = plot_learning_curve(MLPClassifier(activation='identity',
                                        learning_rate='constant',
                                        solver='lbfgs'),
                          'Survived',
                          FEATURES,
                          titanic_df,
                          title,
                          ylim=(0.4, 1.01))
Example #6
                      title='Wine Quality Neural Network Confusion Matrix')
plt.show()

#DecisionTreeClassifier
singleDT = DecisionTreeClassifier(max_depth=43, max_features=5)
singleDT.fit(X_train, y_train)
DT_predictions = singleDT.predict(X_test)
DT_accuracy = accuracy_score(y_test, DT_predictions)
print('Wine Quality Decision Tree Accuracy:', DT_accuracy)
print('-----------------')

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(singleDT,
                    'Wine Quality Decision Tree Learning Curve',
                    X,
                    y,
                    ylim=(0.5, 1.01),
                    cv=cv,
                    n_jobs=4)
plt.show()

#Boosting
AdaBoost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),
                              n_estimators=40)
AdaBoost.fit(X_train, y_train)
boosted_accuracy = AdaBoost.score(X_test, y_test)
print('Wine Quality AdaBoost Accuracy:', boosted_accuracy)
print('-----------------')

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(AdaBoost,
Example #7
               'income - Linear SVM': build_model(
                   linear_svm_fn, 'income', FEATURES, adult_df,
                   options={'C': 0.1, 'loss': 'hinge'}),
               'income - SVM Linear': build_model(
                   svm_linear_fn, 'income', FEATURES, adult_df,
                   options={'C': 1, 'gamma': 0.1}),
               'income - SVM RBF': build_model(
                   svm_rbf_fn, 'income', FEATURES, adult_df,
                   options={'C': 1, 'gamma': 0.1}),
               'income - Ada Boosting': build_model(
                   ada_boosting_fn, 'income', FEATURES, adult_df,
                   options={'algorithm': 'SAMME.R', 'learning_rate': 1,
                            'n_estimators': 500}),
               'income - Gradient Boosting': build_model(
                   gradient_boosting_fn, 'income', FEATURES, adult_df,
                   options={'criterion': 'friedman_mse', 'learning_rate': 0.1,
                            'loss': 'exponential', 'n_estimators': 100}),
               'income - Neural networks': build_model(
                   neural_network_fn, 'income', FEATURES, adult_df,
                   options={'activation': 'tanh', 'learning_rate': 'invscaling',
                            'solver': 'adam'}),
               'income - Decision_tree': build_model(
                   decision_tree_fn, 'income', FEATURES, adult_df,
                   options={'criterion': 'gini', 'max_depth': 3,
                            'min_samples_split': 2})
               }

# Running code with default values
plt = print_results(result_dict)
#plt.show()
plt.savefig(fig_path + 'results.png')

title = "Learning Curves for Decision Tree"
plt = plot_learning_curve(
    DecisionTreeClassifier(criterion='gini', max_depth=8, min_samples_split=12),
    'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
#plt.show()
plt.savefig(fig_path + 'learning_curve_dt.png')

title = "Learning Curves for Neural Networks"
plt = plot_learning_curve(
    MLPClassifier(activation='tanh', learning_rate='invscaling', solver='adam'),
    'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
#plt.show()
plt.savefig(fig_path + 'learning_curve_neural.png')

title = "Learning Curves for AdaBoost"
plt = plot_learning_curve(
    AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1, n_estimators=500),
    'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
#plt.show()
plt.savefig(fig_path + 'learning_curve_adaboost.png')

title = "Learning Curves for GradientBoost"
plt = plot_learning_curve(
    GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1,
                               loss='deviance', n_estimators=100),
    'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
Example #8
    C=1e10,  # Large C for no regularization
    random_state=33,
    penalty='l1')
pipe_lrn = create_pipeline(lrn_basic)
results_mod = cv_evaluation(pipe_lrn, train_x.question_text, train_y)
print('Mean train CV score: {}; mean test CV score: {}.'.format(
    round(results_mod['train_score'], 2), round(results_mod['test_score'], 2)))

pipe_lrn.fit(train_x.question_text, train_y)
predictions_valid = pipe_lrn.predict(valid_x.question_text)
val_score = metrics.f1_score(valid_y, predictions_valid)
print('The validation F1 score was {}'.format(val_score))

h.plot_learning_curve(pipe_lrn,
                      text,
                      train_y,
                      cv=3,
                      n_jobs=3,
                      title='Learning Curves (Logistic Classifier)')

# Validating preprocessing with the same base learner
results_mod = cv_evaluation(pipe_lrn, train_x.qt_clean_stop, train_y)
print('Mean train CV score: {}; mean test CV score: {}.'.format(
    round(results_mod['train_score'], 2), round(results_mod['test_score'], 2)))

# 5. Feature Engineering ------------------------------------------------------
# 5.1 Feature Engineering: Ngram and noise reduction
# One problem is the vast number of text features generated by CountVectorizer;
# many of them are pure noise
count_vect = CountVectorizer(binary=True)
count_vect.fit(train_x.qt_clean_stop)
xtrain_count = count_vect.transform(train_x.qt_clean_stop)
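
# One hedged way to act on the noise problem noted above: restrict the
# vocabulary to reasonably frequent unigrams and bigrams so rare, noisy tokens
# are dropped. The ngram_range and min_df values here are illustrative
# assumptions, not taken from this snippet.
count_vect_ngram = CountVectorizer(binary=True, ngram_range=(1, 2), min_df=5)
count_vect_ngram.fit(train_x.qt_clean_stop)
xtrain_count_ngram = count_vect_ngram.transform(train_x.qt_clean_stop)
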
Example #9
def main():
    # ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    train_df_ori = pd.read_csv('data/train.csv')
    test_df_ori = pd.read_csv('data/test.csv')

    train_df, test_df = fea_eng(train_df_ori, test_df_ori)

    # colormap = plt.cm.RdBu
    # plt.figure(figsize=(14, 12))
    # sns.heatmap(train_df.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white',
    #             annot=True)
    # plt.title('Pearson Correlation of Features', y=1.05, size=15)
    # plt.show()

    # Some useful parameters which will come in handy later on
    rf = helper.SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=config.rf_params)
    et = helper.SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=config.et_params)
    lr = helper.SklearnHelper(clf=LogisticRegression, seed=SEED, params=config.lr_params)
    gb = helper.SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=config.gb_params)
    # svc = helper.SklearnHelper(clf=SVC, seed=SEED, params=config.svc_params)

    y_train = train_df['Survived'].ravel()
    train_df = train_df.drop(['Survived'], axis=1)
    x_train = train_df.values
    x_test = test_df.values

    # # Grid search for XGBoost
    # x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=SEED)
    #
    # xgb_grid = GridSearchCV(estimator=xgb.XGBClassifier(n_estimators=102,
    #                                                     learning_rate=0.01,
    #                                                     objective='binary:logistic',
    #                                                     max_depth=2,
    #                                                     min_child_weight=1,
    #                                                     gamma=0,
    #                                                     subsample=0.8,
    #                                                     colsample_bytree=0.9,
    #                                                     seed=SEED),
    #                         param_grid=config.xgb_grid_params,
    #                         cv=5)
    # xgb_grid.fit(X=x_train, y=y_train)
    # y_scores = xgb_grid.predict(x_test)
    # # Print model report:
    # print("Model Report")
    # print('best parameter:' + str(xgb_grid.best_params_))
    # print("Accuracy : {}".format(accuracy_score(y_true=y_test, y_pred=y_scores)))
    # print("AUC Score (Test): {}".format(roc_auc_score(y_true=y_test, y_score=y_scores)))
    #
    # feat_imp = pd.Series(xgb_grid.best_estimator_.feature_importances_, index=train_df.columns.values).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')
    # plt.show()

    # # Grid search to find the best parameters
    # helper.GridSearchProcessing(helper.process_build(clf=RandomForestClassifier(),
    #                                                  params=config.rf_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # helper.GridSearchProcessing(helper.process_build(clf=ExtraTreesClassifier(),
    #                                                  params=config.et_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # helper.GridSearchProcessing(helper.process_build(clf=LogisticRegression(),
    #                                                  params=config.lr_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # process = helper.GridSearchProcessing(helper.process_build(clf=GradientBoostingClassifier(),
    #                                                  params=config.gb_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # # Calling join() on the Pool waits for all child processes to finish; close() must be called before join(), and no new Process can be added after close()
    # process.get_pool().close()
    # process.get_pool().join()

    # Create our OOF train and test predictions. These base results will be used as new features
    et_oof_train, et_oof_test = helper.get_oof(et, x_train, y_train, x_test)  # Extra Trees
    rf_oof_train, rf_oof_test = helper.get_oof(rf, x_train, y_train, x_test)  # Random Forest
    lr_oof_train, lr_oof_test = helper.get_oof(lr, x_train, y_train, x_test)  # lr
    gb_oof_train, gb_oof_test = helper.get_oof(gb, x_train, y_train, x_test)  # Gradient Boost
    # svc_oof_train, svc_oof_test = helper.get_oof(svc, x_train, y_train, x_test)  # Support Vector Classifier

    plt.figure(figsize=(10,8))

    plt.subplot(2,2,1)
    rf_feature_importances = pd.Series(rf.clf.feature_importances_, index=train_df.columns.values).sort_values(ascending=False)
    rf_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('rf Feature Importance Score')

    plt.subplot(2,2,2)
    et_feature_importances = pd.Series(et.clf.feature_importances_, index=train_df.columns.values).sort_values(ascending=False)
    et_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('et Feature Importance Score')
    plt.subplot(2,2,3)
    gb_feature_importances = pd.Series(gb.clf.feature_importances_, index=train_df.columns.values).sort_values(ascending=False)
    gb_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('gb Feature Importance Score')
    plt.show()

    print("Training is complete")

    base_predictions_train = pd.DataFrame({'RandomForest': rf_oof_train.ravel(),
                                           'ExtraTrees': et_oof_train.ravel(),
                                           # 'AdaBoost': ada_oof_train.ravel(),
                                           'GradientBoost': gb_oof_train.ravel()
                                           })

    x_train = np.concatenate((et_oof_train, rf_oof_train, lr_oof_train, gb_oof_train), axis=1)
    x_test = np.concatenate((et_oof_test, rf_oof_test, lr_oof_test, gb_oof_test), axis=1)

    xgb_helper = helper.SklearnHelper(clf=xgb.XGBClassifier, seed=SEED, params=config.xgb_params)
    xgb_helper.train(x_train, y_train)
    predictions = xgb_helper.predict(x_test)

    helper.create_feature_map(train_df.columns)
    xgb.plot_tree(xgb_helper.clf, fmap='xgb.fmap', num_trees=0)

    # Cross-validation gives a quick estimate of each model's prediction accuracy
    scores = cross_val_score(rf.clf, x_train, y_train, cv=5)
    print("rf Accuracy: {}".format(scores))
    scores = cross_val_score(et.clf, x_train, y_train, cv=5)
    print("et Accuracy: {}".format(scores))
    scores = cross_val_score(lr.clf, x_train, y_train, cv=5)
    print("lr Accuracy: {}".format(scores))
    scores = cross_val_score(gb.clf, x_train, y_train, cv=5)
    print("gb Accuracy: {}".format(scores))
    # scores = cross_val_score(svc.clf, x_train, y_train, cv=5)
    # print("svc Accuracy: {}".format(scores))
    scores = cross_val_score(xgb_helper.clf, x_train, y_train, cv=5)
    print("stacking xgb Accuracy: {}".format(scores))

    helper.plot_learning_curve(xgb_helper.clf, 'xgb_learn curve', x_train, y_train)
    helper.plot_learning_curve(rf.clf, 'rf_learn curve', x_train, y_train)

    result = pd.DataFrame({"PassengerId": test_df_ori['PassengerId'],
                           "Survived": predictions
                           })

    result.to_csv("data/submission.csv", index=False)
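
# helper.get_oof is not shown in this snippet. A minimal sketch of a typical
# out-of-fold (OOF) stacking helper, assuming a wrapper with train/predict
# methods like SklearnHelper above (illustrative, not the original helper):
def get_oof_sketch(clf, x_train, y_train, x_test, n_folds=5, seed=0):
    from sklearn.model_selection import KFold
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_folds = np.zeros((n_folds, x_test.shape[0]))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for i, (train_idx, holdout_idx) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_idx], y_train[train_idx])
        # Predict on the held-out fold and on the full test set.
        oof_train[holdout_idx] = clf.predict(x_train[holdout_idx])
        oof_test_folds[i, :] = clf.predict(x_test)
    # Average the per-fold test predictions and return column vectors so they
    # can be stacked as new features.
    return oof_train.reshape(-1, 1), oof_test_folds.mean(axis=0).reshape(-1, 1)
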
Example #10
                      title='Pen Digits Neural Network Confusion Matrix')
plt.show()

#DecisionTreeClassifier
singleDT = DecisionTreeClassifier(max_depth=22, max_features=5)
singleDT.fit(X_train, y_train)
DT_predictions = singleDT.predict(X_test)
DT_accuracy = accuracy_score(y_test, DT_predictions)
print('Pen Digit Decision Tree Accuracy:', DT_accuracy)
print('-----------------')

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(singleDT,
                    'Pen Digit Decision Tree Learning Curve',
                    X,
                    y,
                    ylim=(0.5, 1.01),
                    cv=cv,
                    n_jobs=4)
plt.show()

#Boosting
AdaBoost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),
                              n_estimators=22)
AdaBoost.fit(X_train, y_train)
boosted_accuracy = AdaBoost.score(X_test, y_test)
print('Pen Digit AdaBoost Accuracy:', boosted_accuracy)
print('-----------------')

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(AdaBoost,