def run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train, rp_x_train, variance_x_train):
    classifier = MLPClassifier(hidden_layer_sizes=(25,), activation='logistic', max_iter=5000, solver='adam')
    cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

    # Baseline: ANN trained on the original (scaled) feature set
    ann_data = standard_scaler.fit_transform(data[features])
    plot_learning_curve("{}- DR - ANN Learning Curve".format(plot_name), classifier,
                        "{}- DR - ANN Learning Curve Simple".format(plot_name),
                        ann_data, data[target], ylim=(0, 1), cv=cv, n_jobs=4)

    # ANN trained on each dimensionality-reduced feature set
    plot_learning_curve("{}-PCA DR - ANN Learning Curve".format(plot_name), classifier,
                        "{}- PCA DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(pca_x_train), data[target],
                        ylim=(0, 1), cv=cv, n_jobs=4)
    plot_learning_curve("{}-ICA DR - ANN Learning Curve".format(plot_name), classifier,
                        "{}- ICA DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(ica_x_train), data[target],
                        ylim=(0, 1), cv=cv, n_jobs=4)
    plot_learning_curve("{}-RP DR - ANN Learning Curve".format(plot_name), classifier,
                        "{}- RP DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(rp_x_train), data[target],
                        ylim=(0, 1), cv=cv, n_jobs=4)
    plot_learning_curve("{}-Variance filter DR - ANN Learning Curve".format(plot_name), classifier,
                        "{}- Variance filter DR - ANN Learning Curve".format(plot_name),
                        standard_scaler.fit_transform(variance_x_train), data[target],
                        ylim=(0, 1), cv=cv, n_jobs=4)
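# NOTE (editor's sketch): plot_learning_curve as called in this function is a project helper
# that is not part of this excerpt. A minimal, hypothetical version is sketched below, assuming
# it wraps sklearn.model_selection.learning_curve with the (plot title, estimator, file name,
# X, y, ...) argument order used above and saves the figure; the real helper may differ.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve


def plot_learning_curve_sketch(title, estimator, file_name, X, y, ylim=None, cv=None, n_jobs=1):
    """Plot mean train/validation score against training-set size and save the figure."""
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=np.linspace(0.1, 1.0, 5))
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label="Training score")
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label="Cross-validation score")
    plt.legend(loc="best")
    plt.savefig("{}.png".format(file_name))
    plt.close()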
def clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train, variance_x_train):
    # Cluster counts used for each dimensionality-reduction technique
    ica_kmeans_k = 6
    ica_em_k = 5
    pca_em_k = 5
    pca_kmeans_k = 7
    rp_em_k = 6
    rp_kmeans_k = 7
    variance_filter_em_k = 13
    variance_filter_kmeans_k = 6

    cluster_params1 = simple_clustering(
        "{}- ICA".format(plot_name), ica_x_train, ica_kmeans_k, ica_em_k,
        base_experiment.identify_top_2_features(ica_x_train))
    cluster_params2 = simple_clustering(
        "{}- PCA".format(plot_name), pca_x_train, pca_kmeans_k, pca_em_k,
        base_experiment.identify_top_2_features(pca_x_train))
    cluster_params3 = simple_clustering(
        "{}- Random Projection".format(plot_name), rp_x_train, rp_kmeans_k, rp_em_k,
        base_experiment.identify_top_2_features(rp_x_train))
    cluster_params4 = simple_clustering(
        "{}- Variance Filtering".format(plot_name), variance_x_train,
        variance_filter_kmeans_k, variance_filter_em_k,
        base_experiment.identify_top_2_features(variance_x_train))

    # Use the cluster assignments themselves as the feature set for the ANN
    final_cluster_array = np.concatenate(
        (cluster_params1, cluster_params2, cluster_params3, cluster_params4), axis=1)
    cluster_features = pd.DataFrame(data=final_cluster_array)

    classifier = MLPClassifier(hidden_layer_sizes=(25,), activation='logistic', max_iter=5000, solver='adam')
    cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
    plot_learning_curve(
        "{}-Cluster Features - ANN Learning Curve".format(plot_name), classifier,
        "{}- Cluster Features - ANN Learning Curve".format(plot_name),
        standard_scaler.fit_transform(cluster_features), data[target],
        ylim=(0, 1), cv=cv, n_jobs=4)
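# NOTE (editor's sketch): simple_clustering and base_experiment.identify_top_2_features are
# project helpers not included in this excerpt. The hypothetical version below assumes
# simple_clustering fits k-means and an EM (Gaussian mixture) model with the supplied cluster
# counts and returns the two label vectors as columns, which matches how the results are
# concatenated into a feature matrix above; the real helper may differ.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture


def simple_clustering_sketch(name, x_train, kmeans_k, em_k, top_2_features=None):
    """Fit k-means and EM clustering; return the labels as an (n_samples, 2) array."""
    kmeans_labels = KMeans(n_clusters=kmeans_k, random_state=0).fit_predict(x_train)
    em_labels = GaussianMixture(n_components=em_k, random_state=0).fit(x_train).predict(x_train)
    # top_2_features would only be needed for 2-D scatter plots of the clusters, omitted here.
    return np.column_stack((kmeans_labels, em_labels))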
def train(X, y, config, max_epochs, batch_iterator='BatchIterator',
          pretrained_model=None, name=None, debug=True):
    # print globals()['net_name']
    global net_name
    sample = 500 if debug else X.shape[0]
    X_t, y_t = X[:sample], y[:sample, :]
    param_dump_folder = './model_%s' % (m_time.strftime("%m_%d_%H_%M_%S")
                                        if name is None else name)
    print 'Model name: %s' % param_dump_folder
    print 'Debug mode:', debug

    # Load the net and add a function to save the params after every epoch
    nnet = globals()[config](batch_iterator, max_epochs)
    func_save_model = lambda n, h: save_model_params(n, h, param_dump_folder, debug)
    nnet.on_epoch_finished.append(func_save_model)
    func_learning_curve = lambda n, h: plot_learning_curve(n, h, param_dump_folder, debug)
    nnet.on_epoch_finished.append(func_learning_curve)
    # func_viz_weights = lambda n, h: plot_weight_matrix_grid(
    #     n, h, param_dump_folder, debug)
    # nnet.on_epoch_finished.append(func_viz_weights)

    print 'Config: %s' % config
    print 'Max num epochs: %d' % nnet.max_epochs
    print "Dataset loaded, shape:", X_t.shape, y_t.shape

    print "Loading pretrained model %s ..." % pretrained_model
    if pretrained_model is not None:
        pretrained_weights = pickle.load(open(pretrained_model, 'rb'))
        nnet.load_params_from(pretrained_weights)
        print "Finished loading"

    # Train
    if not debug:
        if not os.path.exists(param_dump_folder):
            os.mkdir(param_dump_folder)
    try:
        nnet.fit(X_t, y_t)
    except KeyboardInterrupt:
        pass
    if not debug:
        nnet.save_params_to(os.path.join(param_dump_folder, 'model_final.pkl'))
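# NOTE (editor's sketch): save_model_params is registered above as an on_epoch_finished
# callback but is not part of this excerpt. A minimal hypothetical version is shown below,
# assuming the nolearn convention where such callbacks receive the network and its training
# history; it dumps the current parameters once per epoch unless running in debug mode.
import os


def save_model_params_sketch(nnet, train_history, param_dump_folder, debug):
    """Dump the network parameters after each epoch so training can be resumed later."""
    if debug:
        return
    epoch = train_history[-1]['epoch']
    nnet.save_params_to(os.path.join(param_dump_folder, 'model_epoch_%d.pkl' % epoch))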
                                              'criterion': 'gini',
                                              'max_depth': 3,
                                              'min_samples_split': 2})
}

# Running code with default values
plt = print_results(result_dict)
# plt.show()
plt.savefig(fig_path + 'results.png')

title = "Learning Curves for Decision Tree"
plt = plot_learning_curve(DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=2),
                          'Survived', FEATURES, titanic_df, title, ylim=(0.4, 1.01))
# plt.show()
plt.savefig(fig_path + 'learning_curve_dt.png')

title = "Learning Curves for Neural Networks"
plt = plot_learning_curve(MLPClassifier(activation='identity', learning_rate='constant', solver='lbfgs'),
                          'Survived', FEATURES, titanic_df, title, ylim=(0.4, 1.01))
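# NOTE (editor's sketch): build_model and print_results are project helpers that are not part
# of this excerpt. The hypothetical build_model below assumes the model-factory argument
# (e.g. decision_tree_fn) turns the options dict into an estimator, which is then scored with
# cross-validation on the named target column; the real helpers may differ.
from sklearn.model_selection import cross_val_score


def build_model_sketch(model_fn, target, features, df, options=None):
    """Construct an estimator from `options` and return its mean CV accuracy."""
    estimator = model_fn(**(options or {}))
    scores = cross_val_score(estimator, df[features], df[target], cv=5)
    return {'estimator': estimator, 'mean_cv_score': scores.mean()}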
                      title='Wine Quality Neural Network Confusion Matrix')
plt.show()

# DecisionTreeClassifier
singleDT = DecisionTreeClassifier(max_depth=43, max_features=5)
singleDT.fit(X_train, y_train)
DT_predictions = singleDT.predict(X_test)
DT_accuracy = accuracy_score(y_test, DT_predictions)
print 'Wine Quality Decision Tree Accuracy:', DT_accuracy
print '-----------------'
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(singleDT, 'Wine Quality Decision Tree Learning Curve',
                    X, y, ylim=(0.5, 1.01), cv=cv, n_jobs=4)
plt.show()

# Boosting
AdaBoost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=40)
AdaBoost.fit(X_train, y_train)
boosted_accuracy = AdaBoost.score(X_test, y_test)
print 'Wine Quality AdaBoost Accuracy:', boosted_accuracy
print '-----------------'
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(AdaBoost,
    'income - Linear SVM': build_model(linear_svm_fn, 'income', FEATURES, adult_df,
                                       options={'C': 0.1, 'loss': 'hinge'}),
    'income - SVM Linear': build_model(svm_linear_fn, 'income', FEATURES, adult_df,
                                       options={'C': 1, 'gamma': 0.1}),
    'income - SVM RBF': build_model(svm_rbf_fn, 'income', FEATURES, adult_df,
                                    options={'C': 1, 'gamma': 0.1}),
    'income - Ada Boosting': build_model(ada_boosting_fn, 'income', FEATURES, adult_df,
                                         options={'algorithm': 'SAMME.R', 'learning_rate': 1,
                                                  'n_estimators': 500}),
    'income - Gradient Boosting': build_model(gradient_boosting_fn, 'income', FEATURES, adult_df,
                                              options={'criterion': 'friedman_mse', 'learning_rate': 0.1,
                                                       'loss': 'exponential', 'n_estimators': 100}),
    'income - Neural networks': build_model(neural_network_fn, 'income', FEATURES, adult_df,
                                            options={'activation': 'tanh', 'learning_rate': 'invscaling',
                                                     'solver': 'adam'}),
    'income - Decision_tree': build_model(decision_tree_fn, 'income', FEATURES, adult_df,
                                          options={'criterion': 'gini', 'max_depth': 3,
                                                   'min_samples_split': 2})
}

# Running code with default values
plt = print_results(result_dict)
# plt.show()
plt.savefig(fig_path + 'results.png')

title = "Learning Curves for Decision Tree"
plt = plot_learning_curve(DecisionTreeClassifier(criterion='gini', max_depth=8, min_samples_split=12),
                          'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
# plt.show()
plt.savefig(fig_path + 'learning_curve_dt.png')

title = "Learning Curves for Neural Networks"
plt = plot_learning_curve(MLPClassifier(activation='tanh', learning_rate='invscaling', solver='adam'),
                          'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
# plt.show()
plt.savefig(fig_path + 'learning_curve_neural.png')

title = "Learning Curves for AdaBoost"
plt = plot_learning_curve(AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1, n_estimators=500),
                          'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
# plt.show()
plt.savefig(fig_path + 'learning_curve_adaboost.png')

title = "Learning Curves for GradientBoost"
plt = plot_learning_curve(GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.1,
                                                     loss='deviance', n_estimators=100),
                          'income', FEATURES, adult_df, title, ylim=(0.4, 1.01))
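# NOTE (editor's sketch): the fixed hyperparameters used above (e.g. max_depth=8,
# min_samples_split=12 for the decision tree) are presumably the result of a tuning step not
# shown in this excerpt. One common way to obtain such values is a grid search; a hedged
# illustration using the adult_df/FEATURES names from above:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

dt_grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'),
                       param_grid={'max_depth': [3, 5, 8, 12],
                                   'min_samples_split': [2, 6, 12]},
                       cv=5)
dt_grid.fit(adult_df[FEATURES], adult_df['income'])
print(dt_grid.best_params_)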
                       C=1e10,  # Large C for no regularization
                       random_state=33,
                       penalty='l1')
pipe_lrn = create_pipeline(lrn_basic)

results_mod = cv_evaluation(pipe_lrn, train_x.question_text, train_y)
print('Mean train CV score: {}, mean test CV score: {}.'.format(
    round(results_mod['train_score'], 2), round(results_mod['test_score'], 2)))

pipe_lrn.fit(train_x.question_text, train_y)
predictions_valid = pipe_lrn.predict(valid_x.question_text)
val_score = metrics.f1_score(valid_y, predictions_valid)
print('The validation F1 score was {}'.format(val_score))

h.plot_learning_curve(pipe_lrn, text, train_y, cv=3, n_jobs=3,
                      title='Learning Curves (Logistic Classifier)')

# Validating preprocessing with the same base learner
results_mod = cv_evaluation(pipe_lrn, train_x.qt_clean_stop, train_y)
print('Mean train CV score: {}, mean test CV score: {}.'.format(
    round(results_mod['train_score'], 2), round(results_mod['test_score'], 2)))

# 5. Feature Engineering ------------------------------------------------------
# 5.1 Feature Engineering: Ngram and noise reduction
# One problem is the vast number of text features generated by CountVectorizer;
# much of them are pure noise.
count_vect = CountVectorizer(binary=True)
count_vect.fit(train_x.qt_clean_stop)
xtrain_count = count_vect.transform(train_x.qt_clean_stop)
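# NOTE (editor's sketch): create_pipeline and cv_evaluation are project helpers that are not
# included in this excerpt. The hypothetical versions below assume the pipeline is a
# CountVectorizer feeding the supplied classifier, and that cv_evaluation returns mean train
# and test F1 scores from sklearn's cross_validate; the real helpers may differ.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline


def create_pipeline_sketch(classifier):
    """Bag-of-words features followed by the supplied classifier."""
    return Pipeline([('vect', CountVectorizer(binary=True)), ('clf', classifier)])


def cv_evaluation_sketch(pipeline, texts, labels, cv=3):
    """Return mean train and test F1 scores over the CV folds."""
    results = cross_validate(pipeline, texts, labels, cv=cv,
                             scoring='f1', return_train_score=True)
    return {'train_score': results['train_score'].mean(),
            'test_score': results['test_score'].mean()}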
def main():
    # ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    train_df_ori = pd.read_csv('data/train.csv')
    test_df_ori = pd.read_csv('data/test.csv')
    train_df, test_df = fea_eng(train_df_ori, test_df_ori)

    # colormap = plt.cm.RdBu
    # plt.figure(figsize=(14, 12))
    # sns.heatmap(train_df.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True,
    #             cmap=colormap, linecolor='white', annot=True)
    # plt.title('Pearson Correlation of Features', y=1.05, size=15)
    # plt.show()

    # Some useful parameters which will come in handy later on
    rf = helper.SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=config.rf_params)
    et = helper.SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=config.et_params)
    lr = helper.SklearnHelper(clf=LogisticRegression, seed=SEED, params=config.lr_params)
    gb = helper.SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=config.gb_params)
    # svc = helper.SklearnHelper(clf=SVC, seed=SEED, params=config.svc_params)

    y_train = train_df['Survived'].ravel()
    train_df = train_df.drop(['Survived'], axis=1)
    x_train = train_df.values
    x_test = test_df.values

    # # Grid search over the XGBoost hyperparameters
    # x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=SEED)
    #
    # xgb_grid = GridSearchCV(estimator=xgb.XGBClassifier(n_estimators=102,
    #                                                     learning_rate=0.01,
    #                                                     objective='binary:logistic',
    #                                                     max_depth=2,
    #                                                     min_child_weight=1,
    #                                                     gamma=0,
    #                                                     subsample=0.8,
    #                                                     colsample_bytree=0.9,
    #                                                     seed=SEED),
    #                         param_grid=config.xgb_grid_params,
    #                         cv=5)
    # xgb_grid.fit(X=x_train, y=y_train)
    # y_scores = xgb_grid.predict(x_test)
    # # Print model report:
    # print("Model Report")
    # print('best parameter:' + str(xgb_grid.best_params_))
    # print("Accuracy : {}".format(accuracy_score(y_true=y_test, y_pred=y_scores)))
    # print("AUC Score (Test): {}".format(roc_auc_score(y_true=y_test, y_score=y_scores)))
    #
    # feat_imp = pd.Series(xgb_grid.best_estimator_.feature_importances_,
    #                      index=train_df.columns.values).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')
    # plt.show()

    # # Grid search to find the best parameters for each base learner
    # helper.GridSearchProcessing(helper.process_build(clf=RandomForestClassifier(),
    #                                                  params=config.rf_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # helper.GridSearchProcessing(helper.process_build(clf=ExtraTreesClassifier(),
    #                                                  params=config.et_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # helper.GridSearchProcessing(helper.process_build(clf=LogisticRegression(),
    #                                                  params=config.lr_grid_params,
    #                                                  X=x_train,
    #                                                  y=y_train))
    # process = helper.GridSearchProcessing(helper.process_build(clf=GradientBoostingClassifier(),
    #                                                            params=config.gb_grid_params,
    #                                                            X=x_train,
    #                                                            y=y_train))
    # # Calling join() on the Pool waits for all child processes to finish; close() must be
    # # called before join(), and no new Process can be added after close().
    # process.get_pool().close()
    # process.get_pool().join()

    # Create our OOF train and test predictions.
    # These base results will be used as new features
    et_oof_train, et_oof_test = helper.get_oof(et, x_train, y_train, x_test)  # Extra Trees
    rf_oof_train, rf_oof_test = helper.get_oof(rf, x_train, y_train, x_test)  # Random Forest
    lr_oof_train, lr_oof_test = helper.get_oof(lr, x_train, y_train, x_test)  # Logistic Regression
    gb_oof_train, gb_oof_test = helper.get_oof(gb, x_train, y_train, x_test)  # Gradient Boost
    # svc_oof_train, svc_oof_test = helper.get_oof(svc, x_train, y_train, x_test)  # Support Vector Classifier

    plt.figure(figsize=(10, 8))
    plt.subplot(2, 2, 1)
    rf_feature_importances = pd.Series(rf.clf.feature_importances_,
                                       index=train_df.columns.values).sort_values(ascending=False)
    rf_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('rf Feature Importance Score')
    plt.subplot(2, 2, 2)
    et_feature_importances = pd.Series(et.clf.feature_importances_,
                                       index=train_df.columns.values).sort_values(ascending=False)
    et_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('et Feature Importance Score')
    plt.subplot(2, 2, 3)
    gb_feature_importances = pd.Series(gb.clf.feature_importances_,
                                       index=train_df.columns.values).sort_values(ascending=False)
    gb_feature_importances.plot(kind='bar', title='Feature Importances')
    plt.ylabel('gb Feature Importance Score')
    plt.show()

    print("Training is complete")
    base_predictions_train = pd.DataFrame({'RandomForest': rf_oof_train.ravel(),
                                           'ExtraTrees': et_oof_train.ravel(),
                                           # 'AdaBoost': ada_oof_train.ravel(),
                                           'GradientBoost': gb_oof_train.ravel()})

    # Stack the out-of-fold predictions and train the second-level XGBoost model
    x_train = np.concatenate((et_oof_train, rf_oof_train, lr_oof_train, gb_oof_train), axis=1)
    x_test = np.concatenate((et_oof_test, rf_oof_test, lr_oof_test, gb_oof_test), axis=1)

    xgb_helper = helper.SklearnHelper(clf=xgb.XGBClassifier, seed=SEED, params=config.xgb_params)
    xgb_helper.train(x_train, y_train)
    predictions = xgb_helper.predict(x_test)

    helper.create_feature_map(train_df.columns)
    xgb.plot_tree(xgb_helper.clf, fmap='xgb.fmap', num_trees=0)

    # Cross-validation gives a quick estimate of each model's prediction accuracy
    scores = cross_val_score(rf.clf, x_train, y_train, cv=5)
    print("rf Accuracy: {}".format(scores))
    scores = cross_val_score(et.clf, x_train, y_train, cv=5)
    print("et Accuracy: {}".format(scores))
    scores = cross_val_score(lr.clf, x_train, y_train, cv=5)
    print("lr Accuracy: {}".format(scores))
    scores = cross_val_score(gb.clf, x_train, y_train, cv=5)
    print("gb Accuracy: {}".format(scores))
    # scores = cross_val_score(svc.clf, x_train, y_train, cv=5)
    # print("svc Accuracy: {}".format(scores))
    scores = cross_val_score(xgb_helper.clf, x_train, y_train, cv=5)
    print("stacking xgb Accuracy: {}".format(scores))

    helper.plot_learning_curve(xgb_helper.clf, 'xgb_learn curve', x_train, y_train)
    helper.plot_learning_curve(rf.clf, 'rf_learn curve', x_train, y_train)

    result = pd.DataFrame({"PassengerId": test_df_ori['PassengerId'],
                           "Survived": predictions})
    result.to_csv("data/submission.csv", index=False)
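# NOTE (editor's sketch): helper.SklearnHelper and helper.get_oof are not part of this excerpt.
# The hypothetical versions below follow the stacking pattern the code above relies on: a thin
# wrapper that builds the estimator with a fixed seed, and an out-of-fold routine that produces
# first-level predictions for the training rows (one fold at a time) plus fold-averaged
# predictions for the test set, so the second-level XGBoost model never sees leaked labels.
import numpy as np
from sklearn.model_selection import KFold


class SklearnHelperSketch(object):
    def __init__(self, clf, seed=0, params=None):
        params = dict(params or {})
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


def get_oof_sketch(model, x_train, y_train, x_test, n_folds=5, seed=0):
    """Out-of-fold train predictions and fold-averaged test predictions for stacking."""
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_folds = np.zeros((n_folds, x_test.shape[0]))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for i, (train_idx, oof_idx) in enumerate(kf.split(x_train)):
        model.train(x_train[train_idx], y_train[train_idx])
        oof_train[oof_idx] = model.predict(x_train[oof_idx])
        oof_test_folds[i, :] = model.predict(x_test)
    oof_test = oof_test_folds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)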
                      title='Pen Digits Neural Network Confusion Matrix')
plt.show()

# DecisionTreeClassifier
singleDT = DecisionTreeClassifier(max_depth=22, max_features=5)
singleDT.fit(X_train, y_train)
DT_predictions = singleDT.predict(X_test)
DT_accuracy = accuracy_score(y_test, DT_predictions)
print 'Pen Digit Decision Tree Accuracy:', DT_accuracy
print '-----------------'
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(singleDT, 'Pen Digit Decision Tree Learning Curve',
                    X, y, ylim=(0.5, 1.01), cv=cv, n_jobs=4)
plt.show()

# Boosting
AdaBoost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=22)
AdaBoost.fit(X_train, y_train)
boosted_accuracy = AdaBoost.score(X_test, y_test)
print 'Pen Digit AdaBoost Accuracy:', boosted_accuracy
print '-----------------'
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
plot_learning_curve(AdaBoost,