def try_linear_svc():
    """ SVC with an RBF kernel (C=1000) """
    print("LinearSVC")
    from sklearn.svm import SVC

    ### Note: despite the function name, this fits an SVC with an RBF kernel
    clf_lr = SVC(kernel='rbf', C=1000)
    clf_lr.fit(features_train, labels_train)
    pred = clf_lr.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "linear svc, rbf",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_lr
def try_ada_boost_decision_tree():
    """ AdaBoost applied to a Decision Tree """
    print("AdaBoost to Decision Tree")
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV

    param_grid = {"base_estimator__criterion": ["gini", "entropy"],
                  "base_estimator__splitter": ["best", "random"],
                  "n_estimators": [10, 30]}

    DTC = DecisionTreeClassifier(random_state=11, max_features="auto",
                                 class_weight="balanced", max_depth=None)
    ABC = AdaBoostClassifier(base_estimator=DTC)

    grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring='roc_auc')
    grid_search_ABC.fit(features_train, labels_train)
    pred = grid_search_ABC.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("DecisionTree after applying AdaBoost and GridSearchCV:")
    print("accuracy AdaBoost: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "AdaBoost decision tree",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, grid_search_ABC
def try_k_neighbors():
    """ K Nearest Neighbors classifier """
    print("KNeighborsClassifier")
    from sklearn.neighbors import KNeighborsClassifier

    KNC = KNeighborsClassifier(n_neighbors=2)
    KNC.fit(features_train, labels_train)
    pred = KNC.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    #f1 = f1_score(labels_test, pred)
    print("KNeighborsClassifier:")
    print("accuracy KNC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "K Nearest Neighbors",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, KNC
def try_classifier_GaussianNB():
    """ GaussianNB """
    print("Classifier: GaussianNB:")
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    clf_GaussianNB = GaussianNB()
    clf_GaussianNB.fit(features_train, labels_train)
    pred = clf_GaussianNB.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("GaussianNB accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "GaussianNB",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_GaussianNB
def try_logistic_regression_pipeline():
    """ Logistic Regression with pipeline and PCA """
    print("Logistic Regression with pipeline and PCA:")
    from sklearn import linear_model, decomposition
    from sklearn.pipeline import Pipeline

    logistic = linear_model.LogisticRegression()
    pca = decomposition.PCA()
    pipe_lr = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    pipe_lr.fit(features_train, labels_train)
    pred_lr = pipe_lr.predict(features_test)

    accuracy = accuracy_score(labels_test, pred_lr)
    precision = precision_score(labels_test, pred_lr)
    recall = recall_score(labels_test, pred_lr)
    f1 = f1_score(labels_test, pred_lr)
    print("accuracy LogisticRegression with PCA: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "Logistic regression, pca and pipeline",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, pipe_lr
def try_k_neighbors_pipeline():
    """ K Nearest Neighbors with pipeline """
    print("KNeighborsClassifier with Pipeline")
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.decomposition import PCA
    from sklearn.pipeline import Pipeline

    KNC2 = KNeighborsClassifier()
    pca = PCA()
    pipe = Pipeline([('pca', pca), ('knn', KNC2)])
    pipe.fit(features_train, labels_train)
    pred = pipe.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("KNeighborsClassifier - with Pipe:")
    print("accuracy KNC2: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "K nearest neighbors, pipeline",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, pipe
def try_classifier_Decision_Tree_Pipeline():
    """ Decision Tree Classifier, optimized with Pipeline """
    print("Decision Tree classifier with pipeline:")
    from sklearn import tree, preprocessing
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline

    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k=15)
    dt3 = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
    clf_DT3 = Pipeline(steps=[('scaling', scaler), ("SKB", skb), ("DecisionTree", dt3)])
    clf_DT3.fit(features_train, labels_train)
    pred = clf_DT3.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("accuracy:", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "Decision Tree, pipeline",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_DT3
def try_classifier_Decision_Tree2():
    """ Decision Tree Classifier """
    print("DecisionTree with criterion = entropy:")
    from sklearn import tree, preprocessing
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k='all')
    ### Use entropy as criterion
    dt = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
    clf_DT2 = Pipeline(steps=[('scaling', scaler), ("SKB", skb), ("DecisionTree", dt)])
    clf_DT2.fit(features_train, labels_train)
    pred = clf_DT2.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("DT-entropy with pipeline accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "Decision Tree, entropy",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_DT2
def try_classifier_GaussianNB_pipeline():
    """ GaussianNB improved with Pipeline """
    print("GaussianNB with pipeline:")
    from sklearn.naive_bayes import GaussianNB
    from sklearn import preprocessing
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import precision_score, recall_score, f1_score

    gnb = GaussianNB()
    scaler = preprocessing.MinMaxScaler()
    skb = SelectKBest(k='all')
    pipe_GaussianNB = Pipeline(steps=[('scaling', scaler), ("SKB", skb), ("NaiveBayes", gnb)])
    pipe_GaussianNB.fit(features_train, labels_train)
    pred = pipe_GaussianNB.predict(features_test)

    accuracy = pipe_GaussianNB.score(features_test, labels_test)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("GaussianNB with Pipeline, accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "GaussianNB with pipeline",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    ### Return the fitted pipeline (not the predictions) so the caller gets a
    ### classifier, consistent with the other try_* functions.
    return dict_results, pipe_GaussianNB
def try_random_forest():
    """ Random Forest classifier """
    print("Random Forest")
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier()
    rf.fit(features_train, labels_train)
    score_train = rf.score(features_train, labels_train)
    score_test = rf.score(features_test, labels_test)
    print("score train: ", score_train)
    print("score test: ", score_test)

    clf_rf = RandomForestClassifier(n_estimators=30, criterion='entropy',
                                    max_depth=None, max_features=10)
    clf_rf.fit(features_train, labels_train)
    pred = clf_rf.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    f1 = f1_score(labels_test, pred)
    print("accuracy RandomForest: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print_separator_line()

    dict_results = {
        "classifier": "Random Forest",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_rf
def try_linear_svc_gridsearchcv():
    """ Linear SVC model optimized with Pipeline and GridSearchCV """
    from sklearn.svm import LinearSVC
    from sklearn.decomposition import PCA, NMF
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', LinearSVC())
    ])
    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    grid_linearSVC = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
    grid_linearSVC.fit(features_train, labels_train)
    pred = grid_linearSVC.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("SVM after applying PCA and GridSearchCV:")
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "linear SVC with GridSearchCV",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, grid_linearSVC
def try_logistic_regression_tuned():
    """ Logistic Regression tuned """
    from sklearn.linear_model import LogisticRegression
    from sklearn import preprocessing
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline

    scaler = preprocessing.MinMaxScaler()
    ### Note: 1**19 evaluates to 1; kept as in the original tuning run
    ### (10**19 may have been the intended value).
    clf_winner = Pipeline(steps=[("scaler", scaler),
                                 ("skb", SelectKBest(k='all')),
                                 ("clf_winner", LogisticRegression(tol=0.1, C=1**19,
                                                                   class_weight='balanced'))])
    clf_winner.fit(features_train, labels_train)
    pred = clf_winner.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("accuracy LogisticRegression tuned: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "Logistic regression tuned",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_winner
def try_svm_classifier():
    """ SVM classifier """
    print("svm SVC classifier:")
    from sklearn import svm

    clf_svm = svm.SVC(C=10., cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma='auto',
                      kernel='rbf', max_iter=-1, probability=False,
                      random_state=None, shrinking=True, tol=0.001, verbose=False)
    clf_svm.fit(features_train, labels_train)
    pred = clf_svm.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "svm.SVC",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_svm
def try_svc_tuned():
    """ Apply the tuned parameters generated by GridSearchCV """
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV

    parameters = {'kernel': ['rbf'], 'C': [1, 1000]}
    svc = SVC()
    clf_svm3 = GridSearchCV(svc, parameters)
    clf_svm3.fit(features_train, labels_train)
    ### Keep only the best estimator found by the grid search
    clf_svm3 = clf_svm3.best_estimator_
    pred = clf_svm3.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "svc tuned",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_svm3
def try_logistic_regression():
    """ Logistic Regression classifier """
    print("Logistic Regression")
    from sklearn import linear_model

    logistic = linear_model.LogisticRegression()
    logistic.fit(features_train, labels_train)
    pred = logistic.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("accuracy LogisticRegression: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "Logistic regression",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, logistic
def try_svc_pipeline_gridsearchcv():
    """ SVC with pipeline and GridSearchCV to find the best parameters """
    print("results pipeline svm:")
    from sklearn import decomposition
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    pca = decomposition.PCA()
    svm = SVC()
    pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])

    n_components = [10, 14, 18]
    params_grid = {
        'svm__C': [1, 10, 100, 1000],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': [0.001, 0.0001],
        'pca__n_components': n_components,
    }
    estimator = GridSearchCV(pipe, params_grid)
    estimator.fit(features_train, labels_train)
    print(estimator.best_params_, estimator.best_score_)
    params = estimator.best_params_

    ### Refit a plain SVC with hard-coded parameters (C=100, rbf) taken from
    ### an earlier grid-search run
    clf_svm2 = SVC(C=100, kernel='rbf', decision_function_shape='ovr', degree=3,
                   gamma='auto', coef0=0.0, max_iter=-1, probability=False,
                   random_state=None, shrinking=True)
    clf_svm2.fit(features_train, labels_train)
    pred = clf_svm2.predict(features_test)

    accuracy = accuracy_score(labels_test, pred)
    precision = precision_score(labels_test, pred)
    recall = recall_score(labels_test, pred)
    print("accuracy SVC: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print_separator_line()

    dict_results = {
        "classifier": "svc with pipeline",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }
    return dict_results, clf_svm2
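### Illustrative sketch (not part of the original script): one possible way to
### collect the (dict_results, classifier) pairs returned by the try_* functions
### above and rank the runs. The function name compare_classifiers and the choice
### of ranking by recall are assumptions for illustration only; it relies on the
### module-level features_train / labels_train / features_test / labels_test.
def compare_classifiers():
    """ Run a few of the classifiers above and return the run with the best recall """
    runs = [try_classifier_GaussianNB,
            try_logistic_regression,
            try_random_forest,
            try_k_neighbors]
    all_results = []
    for run in runs:
        dict_results, clf = run()
        all_results.append((dict_results, clf))
    ### Sort by recall, highest first
    all_results.sort(key=lambda pair: pair[0]["recall"], reverse=True)
    return all_results[0]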
from helpers_enron import make_csv
make_csv(data_dict)

### Task: Remove outliers
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)
### This record has NaN in every field
data_dict.pop('LOCKHART EUGENE E', 0)

### Task: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
print_separator_line()

### Combine poi, financial and email features
features_list = poi + features_financial + features_email

### Get the best features
features_list = set_kbest_features_list(data_dict, features_list)

### csv file is written in order to see results as spreadsheet
output_file = "test_results_original.csv"

### *** UNCOMMENT THESE LINES TO RERUN THE CLASSIFIERS WITH ADDED FEATURES ***
### Additional features: log of financial fields and POI email ratio
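### Illustrative sketch (assumption, not the original implementation): how a POI
### e-mail ratio feature could be added to each record in data_dict before
### rerunning the classifiers. The field names "from_poi_to_this_person" and
### "to_messages" come from the Enron dataset; the feature name "poi_email_ratio"
### is hypothetical. Kept commented out, like the lines referenced above.
# for name, record in data_dict.items():
#     from_poi = record.get("from_poi_to_this_person", "NaN")
#     to_msgs = record.get("to_messages", "NaN")
#     if from_poi != "NaN" and to_msgs not in ("NaN", 0):
#         record["poi_email_ratio"] = float(from_poi) / float(to_msgs)
#     else:
#         record["poi_email_ratio"] = 0.0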