def finish():
    """Report the optimization path and final solution, then save the model.

    Reads the module-level ``best_solution_so_far``, ``optimization_path`` and
    ``my_dataset``. Any exception raised while reporting or saving is printed
    instead of being propagated.
    """
    banner = "============================="
    try:
        # Nothing to report: say so and stop early.
        if not best_solution_so_far:
            print("")
            print("No solution found")
            return

        print("")
        print(banner)
        print("Optimization Path:")
        print(banner)
        for step in optimization_path:
            print_result_item(step[0])
            print(step[1])

        print("")
        print(banner)
        print("Final Solution:")
        print(banner)
        print(best_solution_so_far)

        # The classifier is element 3 of the first entry; the feature list is
        # the second entry of the solution.
        dump_classifier_and_data(best_solution_so_far[0][3],
                                 my_dataset,
                                 best_solution_so_far[1])
        print("Model saved with success.")
    except Exception as e:
        print(e)
def setup_and_test(my_dataset, features_list, classifier):
    """Dump the classifier and data, reload them, and run the testing script.

    Args:
        my_dataset: dataset handed to ``dump_classifier_and_data``.
        features_list: feature names handed to ``dump_classifier_and_data``.
        classifier: classifier handed to ``dump_classifier_and_data``.
    """
    # Write everything out first so the test runs on what was actually dumped.
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # Read the classifier, dataset and feature list back in.
    reloaded = load_classifier_and_data()
    clf, dataset, feature_list = reloaded

    # Evaluate the reloaded artefacts.
    test_classifier(clf, dataset, feature_list)
def detect_poi():
    """Build ratio features, pick the best feature list, and dump a GaussianNB.

    Steps, in order:
      1. Load ``final_project_dataset.pkl`` and drop the ``'TOTAL'`` entry.
      2. Add the engineered ratio features in place.
      3. Score every candidate feature list from ``fList_set()`` with
         ``ptest`` and keep the one with the highest score.
      4. Run ``test_classifier`` on decision trees over a small grid of
         ``min_samples_split`` / ``min_samples_leaf`` values.
      5. Dump a fresh ``GaussianNB`` together with the dataset and the
         selected feature list.
    """
    ### Load the dictionary containing the dataset
    # NOTE: pickle executes arbitrary code on load; only use a trusted file.
    with open("final_project_dataset.pkl", "r") as dataset_file:
        data_dict = pickle.load(dataset_file)

    ### Task 1: Remove outliers
    data_dict.pop('TOTAL', 0)

    ### Task 2: Select what features
    ### 'stk_pay_ratio','to_poi_ratio', 'from_poi_ratio','bonus_salary_ratio'
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    my_dataset = data_dict
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)

    ### Task 3: Feature Selection
    ### Generate the set of feature lists built from these 4 features, so
    ### that all possible combinations of these features are tested.
    all_features_list = fList_set()

    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation in tester.py
    clf = GaussianNB()

    ### ptest uses Stratified shuffle split cross validation and calculates
    ### the precision. Score every candidate list rather than a hard-coded
    ### count, so the loop stays correct if fList_set() changes.
    metrics = [ptest(clf, my_dataset, candidate)
               for candidate in all_features_list]

    ### Go for the feature list that produces the best precision.
    ### For this dataset only, it is harder to get a high precision.
    best = np.array(metrics).argmax()

    ### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset, all_features_list[best])

    ### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]

    ### Task 4: Try a variety of classifiers
    samples_split_values = [2, 4]
    samples_leaf_values = [1, 2]
    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,
                                              min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)

    ### Choose best classifier and feature set
    clf = GaussianNB()

    ### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
def main():
    """Load the dataset, prune the feature list, and dump/test a classifier.

    The feature list starts from every key of the first record, minus the
    excluded names, with ``"poi"`` moved to the front.

    NOTE(review): every line that would assign ``clf`` is commented out
    below. Unless one is re-enabled, ``clf`` resolves to a module-level name
    or raises ``NameError`` at the dump call.
    """
    # NOTE: pickle executes arbitrary code on load; only use a trusted file.
    with open("final_project_dataset.pkl", "r") as dataset_file:
        data_dict = pickle.load(dataset_file)
    my_dataset = AddFeatures(data_dict)

    # Exclude using discretion.
    discretionary = ["email_address"]
    # Replaced by creating better versions of the features.
    superseded = [
        "to_messages",
        "from_messages",
        "from_this_person_to_poi",
        "from_poi_to_this_person",
    ]
    # Excluded because highly correlated with stronger features.
    correlated = [
        "deferral_payments",
        "expenses",
        "deferred_income",
        "restricted_stock_deferred",
        "director_fees",
        "long_term_incentive",
        "bonus",
        "total_payments",
        "salary",
        "total_stock_value",
        "restricted_stock",
        "exercised_stock_options",
        "other",
    ]
    exclude = discretionary + superseded + correlated

    # QueryDataSet(my_dataset)
    # ShowCorrel(my_dataset)

    # Start from the keys of an arbitrary record; remove() raises ValueError
    # if an excluded name is unexpectedly absent.
    features_list = next(my_dataset.itervalues()).keys()
    for name in exclude:
        features_list.remove(name)
    # The label must be the first entry.
    features_list.insert(0, features_list.pop(features_list.index("poi")))

    data = featureFormat(my_dataset, features_list, sort_keys=True)

    ### Extract features and labels from dataset for local testing
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels
    )

    # clf=TuneSVM(features, labels,features_list)
    # clf=TuneKNN(features, labels,features_list)
    # clf=NoTuneDT(features, labels,features_list)
    # clf=TuneDT(features,labels,features_list)

    # Re-add the label only if it is no longer first (presumably a tuner may
    # strip it in place - TODO confirm). Inserting it unconditionally put
    # "poi" in the list twice, making the label a predictor as well.
    if not features_list or features_list[0] != "poi":
        features_list.insert(0, "poi")

    dump_classifier_and_data(clf, my_dataset, features_list)
    test_classifier(clf, my_dataset, features_list)
# Report, for the metric named by `score`, the value stored in
# `best_performance` for every classifier name.
print "For optimum",score,":"
for name in classifier_names:
    print " ",name,": ",best_performance[score][name]

# acc = accuracy_score(pred, labels_test)
# print ""
# print "Accuracy:",acc," (Good predictions / All predictions)"
# pre = precision_score(pred, labels_test)
# print "Precision:",pre," (Real POIs / Predicted POIs)"
# rec = recall_score(pred, labels_test)
# print "Recall:",rec," (Identified POIs / All POIs)"

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# dump_classifier_and_data(clf, my_dataset, features_list)
# The export uses `data_dict`; the `my_dataset` variant is kept above,
# commented out.
dump_classifier_and_data(clf, data_dict, features_list)
# Unoptimized F1, then the four headline metrics of the optimized model.
print("F1-score on testing data: {:.4f}".format(results['Unoptimized Model']['F1 Score']))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(results['Optimized Model']['Accuracy']))
print("Final precision on testing data: {:.4f}".format(results['Optimized Model']['Precision']))
print("Final recall on testing data: {:.4f}".format(results['Optimized Model']['Recall']))
print("Final F1-score on the testing data: {:.4f}".format(results['Optimized Model']['F1 Score']))
print('\n')
print(classification_report(labels_test, best_predictions))

#%%
### Evaluate the final model by using 'test_classifier' function in 'tester.py' script.

# 'test_classifier' takes a dictionary, so rebuild one from the arrays:
# labels go in the first column, the selected features after them.
selected_features_df = pd.DataFrame(selected_features, columns=selected_features_list)
labels_df = pd.DataFrame(labels)
my_dataset_df = pd.concat([labels_df, selected_features_df], axis=1)
my_dataset = my_dataset_df.to_dict(orient='index')

print('\nPerformance of the model based on test_classifier function:')
# Pass the optimized model to the 'test_classifier' function
test_classifier(best_estimator, my_dataset, list(my_dataset_df.columns))

#%%
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(best_estimator, my_dataset, list(my_dataset_df.columns))
# Search `params` for `pipeline` using the `cv` splitter, then keep the
# best estimator the search found as `clf`.
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
grid_search.fit(features, labels)
clf = grid_search.best_estimator_

#------------------------------------------------------------------------------
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.
### StratifiedShuffleSplit.html

# Evaluate the selected estimator with the project's testing script.
test_classifier(clf, my_dataset, features_list)

# Example starting point. Try investigating other evaluation techniques!
# The split below is not used by any later statement in this section.
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

#------------------------------------------------------------------------------
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
# decision_function_shape='ovo', degree=3, gamma='auto', # kernel='linear', max_iter=-1, probability=False, # random_state=20160308, shrinking=False, tol=0.001, # verbose=False)) pipe = make_pipeline( Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0), ExtraTreesClassifier(bootstrap=False, class_weight='balanced', criterion='gini', max_depth=None, max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=3, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1, oob_score=False, random_state=20160308, verbose=0, warm_start=False)) #pipe = make_pipeline( # Imputer(axis=0, copy=True, missing_values='NaN', # strategy='median', verbose=0), # SelectFpr(alpha=0.05, score_func=f_classif), # ExtraTreesClassifier(bootstrap=False, class_weight='balanced', # criterion='gini', max_depth=None, # max_features='sqrt', max_leaf_nodes=None, # min_samples_leaf=3, min_samples_split=2, # min_weight_fraction_leaf=0.0, n_estimators=30, # n_jobs=-1, oob_score=False, # random_state=20160308, verbose=0, # warm_start=False)) # Task 6: Dump your classifier, dataset, and features_list dump_classifier_and_data(pipe, df.to_dict(orient='index'), ['poi'] + F_ALL_NEW)
# Pipeline stages: feature selection (FeatureSel), standard scaling, then a
# linear SVC.
sd = StandardScaler()
fsl = FeatureSel(k_best=5, pca_comp=5)
# clf=Pipeline([("fsl",fsl),("sd",sd),("lvc",LinearSVC(C=0.000001))])
clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC())])

# Grid search over the SVC's C and the two FeatureSel settings, scored on
# recall.
gscv=GridSearchCV(clf,{"lvc__C":np.logspace(-6,-1,5),
                       "fsl__k_best":[1,5,10],
                       "fsl__pca_comp":[0,5,10]},
                  scoring="recall",verbose=0)
gscv.fit(np.array(features),np.array(labels))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

test_classifier(gscv.best_estimator_, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.

dump_classifier_and_data(gscv.best_estimator_, my_dataset, features_list)
def dump(clf, my_dataset, features_list):
    """Pass the classifier, dataset and feature list to ``dump_classifier_and_data``.

    Returns ``None``.
    """
    dump_classifier_and_data(clf, my_dataset, features_list)
param_grid=tree_param_grid, scoring="recall")  # tail of a call opened above this section

### Show results of parameter tuning
grid_search.fit(features_train, labels_train)
print "\nbest estimator: \n", (grid_search.best_estimator_),\
      "\n best score:\n",grid_search.best_score_ ,\
      "\n best params:\n",grid_search.best_params_

clf = grid_search.best_estimator_

# Boolean mask from the 'skb' step, one entry per input feature.
features_selected_bool = clf.named_steps['skb'].get_support()
# Keep only the names whose mask entry is true. The first entry of the
# existing list is skipped (presumably the 'poi' label, which is prepended
# again at the dump call below - TODO confirm).
features_selected_list = [x for x, y in zip(features_selected_list[1:], features_selected_bool ) if y]
print "\nselected features: ", features_selected_list

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, ["poi"]+features_selected_list)
# Engineered ratio features.
# Each count is divided by the total it is a part of: mail received from POIs
# is a share of all received mail (`to_messages`), and mail sent to POIs is a
# share of all sent mail (`from_messages`). The denominators were previously
# swapped, which allowed "ratios" greater than one.
enron_df['from_poi_ratio'] = enron_df['from_poi_to_this_person'] / enron_df['to_messages']
enron_df['to_poi_ratio'] = enron_df['from_this_person_to_poi'] / enron_df['from_messages']
enron_df['bonus_ratio'] = enron_df['bonus'] / enron_df['salary']
# Missing values (including 0/0 results) become 0.
enron_df.fillna(0, inplace=True)

# Separate labels and features
enron_df_labels = enron_df['poi']
enron_df_features = enron_df[enron_df.columns.difference(['poi'])]

# Use SelectKBest and GaussianNB
pipeline = Pipeline([
    ('kbest', SelectKBest()),
    ('gnb', GaussianNB()),
])

folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter=folds,
                            random_state=42, test_size=0.20)
parameters = {
    "kbest__k": [1, 2, 3, 5, 8, 13, 19],
    "kbest__score_func": [f_classif],
}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

# Select Features
# Step 0 of the best pipeline is the fitted SelectKBest.
kbest = clf.best_estimator_.steps[0][1]
# (column name, score, selected?) triples, ordered by ascending score.
features = sorted(zip(enron_df_features.columns,
                      kbest.scores_,
                      kbest.get_support()),
                  key=lambda x: x[1])
# 'poi' must come first, followed by the selected feature names.
my_list = ['poi'] + [x[0] for x in features if x[2]]
my_dataset = enron_df[my_list].transpose().to_dict()

# Step 1 of the best pipeline is the GaussianNB; export it with the data.
dump_classifier_and_data(clf.best_estimator_.steps[1][1], my_dataset, my_list)
print('\n ')
print(' ############## Model with Highest Precision Score ############### ')
# Sort descending by precision; element 0 is the best-scoring result.
highest_precision = sorted(grid_search_dict_results, key=lambda k: k['precision'], reverse=True)
# highest_precision = sorted(grid_search_dict_results, key=itemgetter('precision'), reverse=True)
pprint(highest_precision[0])

print('\n ')
print(' ################# Model with Highest Recall Score ################# ')
# Same ranking, this time by recall.
highest_recall = sorted(grid_search_dict_results, key=lambda k: k['recall'], reverse=True)
# highest_recall = sorted(grid_search_dict_results, key=itemgetter('recall'), reverse=True)
pprint(highest_recall[0])

print('\n ')
print(' #### Model with Highest F1, Precision, Recall, Accuracy Score ##### ')
# Rank by F1 first; precision, recall and accuracy break ties in that order.
sorted_grid_search_dict_results = sorted(grid_search_dict_results, key=lambda k: (k['f1'], k['precision'], k['recall'], k['accuracy']), reverse=True)
pprint(sorted_grid_search_dict_results)
pprint(sorted_grid_search_dict_results[0]['best_estimator'])

### Submit / Export files for tester.py
print('\n ')
print('\n ')
# The top-ranked entry's estimator is the one exported and tested.
my_clf = sorted_grid_search_dict_results[0]['best_estimator']
my_feature_list = all_features

from tester import dump_classifier_and_data, test_classifier

### Dump pkl files
dump_classifier_and_data(my_clf, my_dataset, my_feature_list)

### Run my_clf, my_dataset and my_feature_list against tester.test_classifier
print(' ########### Final Results from Best Estimator Options ########### ')
test_classifier(my_clf, my_dataset, my_feature_list)
#### SVM Classifier Fitting - Tuning - clf_svc_t
print("***** Fitting SVM with GridSearchCV Tunning *****")
# Build the splitter from the same labels the search is fitted on. It was
# built from `labels_train` while fit() received the full `features, labels`,
# so the folds were stratified on the wrong labels and covered only part of
# the data. This now matches the GaussianNB search below.
sk_fold_svc_t = StratifiedShuffleSplit(labels, 100, random_state=42)
gs_svc_t = GridSearchCV(pipe_svc_t, param_grid=param_grid_def,
                        cv=sk_fold_svc_t, scoring='f1')
gs_svc_t.fit(features, labels)
clf_svc_t_be = gs_svc_t.best_estimator_

#### Naive Bayes Classifier Fitting - Tunning - clf_gnb8
print("***** Fitting GaussianNB Tunning *****")
sk_fold_gnb8 = StratifiedShuffleSplit(labels, 1000, random_state=42)
gs_gnb8 = GridSearchCV(pipe_gnb8, param_grid=parameters,
                       cv=sk_fold_gnb8, scoring='f1')
gs_gnb8.fit(features, labels)
clf_gnb8_be = gs_gnb8.best_estimator_

# The estimator printed here is the GaussianNB one; the heading used to say
# "SVM", which mislabelled the output.
print("Best Estimator Fitting GaussianNB Tunning")
print(clf_gnb8_be)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#dump_classifier_and_data(clf_nb, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_gs_nbp_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_dt, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_tree_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_svm_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_svc_t_be, my_dataset, features_list_selection)
dump_classifier_and_data(clf_gnb8_be, my_dataset, features_list_selection)
labels05, features05 = targetFeatureSplit(data05) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html # Provided to give you a starting point. Try a variety of classifiers. clf05 = neighbors.KNeighborsClassifier() ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html # Example starting point. Try investigating other evaluation techniques! features_train05, features_test05, labels_train05, labels_test05 = \ train_test_split(features05, labels05, test_size=0.3, random_state=42) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf05, my_dataset, features_list05)
# #tune_random_forest() # # best_features_list_rf = fraudfunctions.get_k_best(my_dataset, features_list, 9) # # clf_rf = Pipeline(steps=[ # ('scaler', StandardScaler()), # ('classifier', RandomForestClassifier(max_depth=5, # n_estimators=25, # random_state=42)) # ]) # # print "Random Forest Classifier : \n", tester.test_classifier(clf_rf, my_dataset, best_features_list_rf) # # # ''' ADA BOOST CLASSIFIER ''' # # #tune_ada_boost() # # best_features_list_ab = fraudfunctions.get_k_best(my_dataset, features_list, 9) # # clf_ab = Pipeline(steps=[ # ('scaler', StandardScaler()), # ('classifier', AdaBoostClassifier(learning_rate=1.5, # n_estimators=30, # algorithm='SAMME.R')) # ]) # # print "Ada Boost Classifier : \n", tester.test_classifier(clf_ab, my_dataset, best_features_list_ab) ''' dump final algorithm classifier, dataset and features in the data directory ''' dump_classifier_and_data(clf_lr, my_dataset, best_features_list_lr)
str('%.2f' % rec_ab), str('%.2f' % prec_ab), str('%.3f' % score_ab), str('%.2f' % (ab_t1 - ab_t0)) ])  # tail of a call opened above this section

# Disabled: automatic selection of the best-scoring classifier.
#score_array = np.array([score_nb, score_svm, score_tree, score_knn, score_rf, score_ab], dtype = float))
#score_array = np.array([score_nb, score_svm, score_tree, score_knn], dtype=float)
#clf_list = [clf_nb, clf_svm, clf_tree, clf_knn, clf_rf, clf_ab]
#clf_list = [clf_nb, clf_svm, clf_tree, clf_knn]
#max_index = np.argmax(score_array)
#clf = clf_list[max_index]

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#clf = clf_tree

# Every classifier is exported; the fourth argument names the pickle file
# for each one.
dump_classifier_and_data(clf_nb, my_dataset, features_list, "my_classifier_nb.pkl")
dump_classifier_and_data(clf_svm, my_dataset, features_list, "my_classifier_svm.pkl")
dump_classifier_and_data(clf_tree, my_dataset, features_list, "my_classifier_tree.pkl")
dump_classifier_and_data(clf_knn, my_dataset, features_list, "my_classifier_knn.pkl")
dump_classifier_and_data(clf_rf, my_dataset, features_list, "my_classifier_rf.pkl")
dump_classifier_and_data(clf_ab, my_dataset, features_list, "my_classifier_ab.pkl")
test_classifier(clf_NB3, my_dataset, features_list) ######################################################################## print("================ DTree ========================") test_classifier(clf_DT, my_dataset, features_list) print("==================== NN ============================") test_classifier(clf_NN, my_dataset, features_list) print("==================== NN Scalled F =================") test_classifier(classifier5, my_dataset, features_list) # dump the best classifier, dataset and features_list so # anyone can run/check your results dump_classifier_and_data(clf_NB, my_dataset, my_best_feature_list)
#print "accuracy score is ",accuracy #print "recall score is ",recall #print "precision score is ",precision # Example starting point. Try investigating other evaluation techniques! #from sklearn.cross_validation import train_test_split #features_train, features_test, labels_train, labels_test = \ # train_test_split(features, labels, test_size=0.3, random_state=42) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, my_dataset, testfeatureswithpoi) ##MYCODE :convert my_dataset dictionary to list #import csv #dictlist=[] #temp=[] #fieldnames=['name'] #for name,detail in my_dataset.iteritems(): # temp.append(name) # for key,value in detail.iteritems(): # temp.append(value)
# Finish the search that was configured just before this section.
algo.fit(feature_train, labels_train)
test_classifier(algo.best_estimator_, my_dataset, features_list)

### SVC
# Min-max scale the features, then redo the 70/30 split on the scaled data.
# NOTE(review): the search is fitted on scaled features, but test_classifier
# receives `my_dataset`; whether it is scaled the same way is not visible
# here - confirm.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)
feature_train, feature_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

print('\nSVM:')
svc_clf = SVC()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['rbf', 'linear', 'poly'],
    'gamma': [0.001, 0.01, 0.1, 1],
}
algo = GridSearchCV(svc_clf, parameters)
algo.fit(feature_train, labels_train)
test_classifier(algo.best_estimator_, my_dataset, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# The exported model is `gnb_clf`, with an explicit feature list.
clf = gnb_clf
dump_classifier_and_data(clf, my_dataset, [
    'poi',
    'exercised_stock_options',
    'total_stock_value',
    'bonus',
    'salary',
    'total',
])

sys.stdout.close()
# Grid search over `parameters` for a decision tree, using the `scoring`
# and `cv` objects defined before this section.
dtc_clf = sklearn.tree.DecisionTreeClassifier()
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring = scoring, cv = cv)
dtcclf.fit(features, labels)
print dtcclf.best_estimator_
print dtcclf.best_score_
# `t0` is expected to have been set before this section started.
print 'Processing time:',round(time()-t0,3) ,'s'

#Classifier validation
##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
test_classifier(dtc_best_clf, enron_data, eng_feature_list)
print 'Processing time:',round(time()-t0,3) ,'s'

##DecisionTreeClassifier Validation 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
evaluate.evaluate_clf(dtc_best_clf, features, labels, num_iters=1000, test_size=0.3)
# NOTE(review): `scores` is not assigned in this section and the result of
# evaluate_clf above is discarded, so this line reports whatever `scores`
# held earlier in the script - confirm that is intended.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:',round(time()-t0,3) ,'s'

#Dump my classifier
dump_classifier_and_data(dtc_best_clf, enron_data, eng_feature_list)
#特征得分 features_score = zip(features_list_6[1:25], clf.scores_[:24]) features_score = sorted(features_score, key=lambda s: s[1], reverse=True) print u'使分类器精确度和召回率最高的六个特征为:' for i in features_score: print i print '*' * 100 # ## 调整算法 # In[60]: #调整算法 print u'调整K近邻分类器参数,提高算法性能:' knc = KNeighborsClassifier(n_neighbors=2, weights='distance', n_jobs=-1) print u'正在计算,预计时间为2分钟..' test_classifier(knc, my_dataset, features_list_3, folds=1000) # In[65]: print u'***************最终算法*********************' print u'调参后的KNeighborsClassifier性能最高' print u'参数为: n_neighbors=2, weights=distance' print u'Precision: 0.49715 Recall: 0.39250 F1: 0.43867' # In[14]: from tester import dump_classifier_and_data dump_classifier_and_data(knc, my_dataset, features_list_3)
# A temporary directory is created here; no later statement in this section
# uses `cachedir`.
cachedir = mkdtemp()
pipe = Pipeline(estimators)
print(str(pipe)+'\n')

#Training the classifier
pipe = pipe.fit(features_train, labels_train)

#Predicting the labels
knn_labels_predicted = pipe.predict(features_test)

#Calculating the accuracy, precision, recall and f1 scores
knn_accuracy = accuracy_score(labels_test, knn_labels_predicted)
knn_classification_report = classification_report(labels_test, knn_labels_predicted)
print("After Tuning and Feature Scaling:")
print("KNearestNeighbors accuracy score: {}.".format(knn_accuracy))
print("KNearestNeighbors classification report:\n{}.".format(knn_classification_report))

### Task 6: Dump your classifier, dataset, and features_list
### You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the exported model is this standalone KNeighborsClassifier,
# not the `pipe` fitted and scored above - confirm that is intended.
knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
                           weights='uniform')
dump_classifier_and_data(clf=knn, dataset=my_dataset, feature_list=features_list)

from tester import test_classifier
test_classifier(clf=knn, dataset=my_dataset, feature_list=features_list)
def main():
    """Score the candidate classifiers, report the best ones and export data.

    Loads ``final_project_dataset.pkl``, scores classifiers through
    ``score_classifiers`` (reusing saved results), pickles the scores to
    ``CLASSIFIER_STATS_FILE``, prints the best entry by ``'f2'`` and every
    entry with precision and recall of at least 0.3, writes the dataset to
    ``my_dataset.json`` and finally calls ``dump_classifier_and_data``.

    NOTE(review): ``best_clf`` is never assigned in this function (the only
    assignment is commented out). Unless it exists at module level, the final
    dump raises ``NameError``.
    """
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    # features_list = ['poi','salary', 'from_poi_to_this_person', 'exercised_stock_options', 'expenses'] # You will need to use more features
    features_list = available_features

    ### Load the dictionary containing the dataset
    # The file is closed as soon as it has been read.
    # NOTE: pickle executes arbitrary code on load; only use a trusted file.
    with open("final_project_dataset.pkl", "r") as dataset_file:
        data_dict = pickle.load(dataset_file)

    ### Task 2: Remove outliers
    # pprint(data_dict)

    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = data_dict
    # pprint(my_dataset[my_dataset.keys()[0]])

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    # pprint(data)
    labels, features = targetFeatureSplit(data)
    # pprint(labels)
    # pprint(features)

    ### Task 4: Try a varity of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html

    # Load previously saved stats to save time if we don't have to
    # recompute some of them
    clf_scores = load_saved_scores()
    # pprint(clf_scores)

    # gather the stats for each classifier if its arguments have changed
    clf_scores = score_classifiers(my_dataset, features_list,
                                   saved_scores=clf_scores)
    with open(CLASSIFIER_STATS_FILE, 'w') as f:
        pickle.dump(clf_scores, f)

    pprint(clf_scores.values())

    # Pick the stats entry with the highest 'f2'; `>=` means a later entry
    # wins a tie.
    best_clf_stats = {}
    for clf_stats in clf_scores.values():
        for inner_stats in clf_stats.get('stats_by_n_features', {}).values():
            if inner_stats.get('f2', 0) >= best_clf_stats.get('f2', 0):
                best_clf_stats = inner_stats
    # best_clf = max([s for s in clf_scores.values() if 'f1' in s], key=itemgetter('f1'))
    pprint(['best classifier: ', best_clf_stats])

    # find classifiers that had >= 0.3 precision/recall
    balanced_clf_stats = []
    for clf_stats in clf_scores.values():
        for inner_stats in clf_stats.get('stats_by_n_features', {}).values():
            if (inner_stats.get('precision', 0) >= 0.3
                    and inner_stats.get('recall', 0) >= 0.3):
                balanced_clf_stats.append(inner_stats)
    pprint(['balanced classifiers/datasets: ', balanced_clf_stats])

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script.
    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    # pprint(clf_scores.values(), indent=2)
    # test_classifier(clf, my_dataset, features_list)

    ### Dump your classifier, dataset, and features_list so
    ### anyone can run/check your results.
    with open('my_dataset.json', 'w') as f:
        f.write(json.dumps(my_dataset, indent=2))

    # See the NOTE(review) in the docstring: `best_clf` is not defined here.
    dump_classifier_and_data(best_clf, my_dataset, features_list)
def evaluate_clasifier(df, extras, algo, dump=False):
    """Evaluate and possibly store classifier and data.

    Args:
        df: input data frame; passed to ``create_features``.
        extras: extra arguments unpacked into ``create_features`` and also
            passed to ``init_logfile`` and ``create_pipeline``.
        algo: algorithm selector passed to ``init_logfile`` and
            ``create_pipeline``.
        dump: when false (the search mode), output is redirected through
            ``init_logfile`` / ``close_logfile``; when true, the best
            estimator, dataset and feature list are dumped at the end.

    The log file is closed even if the evaluation raises, so a failed search
    no longer leaves output redirected.
    """
    if not dump:
        # Only redirect output for the search
        orig_stdout, logfile = init_logfile(extras, algo)

    try:
        ### Task 3: Create new feature(s)
        df = create_features(df, *extras)

        ### Extract features and labels from dataset for local testing
        dfx, dfy = features_split_df(df)

        ### Task 4: Try a varity of classifiers
        ### Please name your classifier clf for easy export below.
        ### Note that if you want to do PCA or other multi-stage operations,
        ### you'll need to use Pipelines. For more info:
        ### http://scikit-learn.org/stable/modules/pipeline.html

        ### Task 5: Tune your classifier to achieve better than .3 precision and recall
        ### using our testing script. Check the tester.py script in the final project
        ### folder for details on the evaluation method, especially the test_classifier
        ### function. Because of the small size of the dataset, the script uses
        ### stratified shuffle split cross validation. For more info:
        ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
        split_indices = StratifiedShuffleSplit(dfy, n_iter=1000, test_size=0.1)

        # The label name goes first, followed by every feature column.
        features_list = ['poi'] + dfx.columns.values.tolist()

        pipeline, params = create_pipeline(algo, extras,
                                           is_search=(not dump),
                                           max_features=len(dfx.columns))
        grid_searcher = GridSearchCV(pipeline, param_grid=params,
                                     cv=split_indices, n_jobs=-1,
                                     scoring=create_scorer(), verbose=0)

        t0 = time()
        # UserWarnings raised while fitting are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            grid_searcher.fit(dfx, y=dfy)
        print('\nTime to fit: {:0>8}\n'.format(
            dt.timedelta(seconds=(time() - t0))))

        print("Best parameters set:")
        print(grid_searcher.best_params_)
        print('')

        print('Grid score:')
        # A distinct loop name avoids clobbering the `params` grid above.
        for grid_params, mean_score, _ in grid_searcher.grid_scores_:
            print("%0.3f for %r" % (mean_score, grid_params))
        print('')

        # Tabulate every feature with its score and whether the 'selection'
        # step kept it, best score first, numbered from 1.
        selector = grid_searcher.best_estimator_.named_steps['selection']
        scored = pd.DataFrame(
            zip(dfx.columns.tolist(), selector.scores_, selector.get_support()))
        scored.columns = ['Feature', 'Score', 'Selected']
        scored = scored.sort_values(by=['Score'], ascending=False)
        scored.index = range(1, len(scored) + 1)
        n_selected = len(scored[scored.Selected])
        print('Scored features: {} selected'.format(n_selected))
        print(scored)
        print('')

        # n_pca_components = grid_searcher.best_estimator_.named_steps[
        #     'reducer'].n_components_
        # print "Reduced to {0} PCA components".format(n_pca_components)

        ### Task 6: Dump your classifier, dataset, and features_list so anyone can
        ### check your results. You do not need to change anything below, but make sure
        ### that the version of poi_id.py that you submit can be run on its own and
        ### generates the necessary .pkl files for validating your results.
        clf = grid_searcher.best_estimator_

        ### Store to my_dataset for easy export below.
        df = features_combine_df(dfx, dfy)
        my_dataset = df.to_dict(orient='index')

        test_classifier(clf, my_dataset, features_list)

        if dump:
            dump_classifier_and_data(clf, my_dataset, features_list)
    finally:
        # Undo the redirection set up above, whether or not evaluation
        # succeeded.
        if not dump:
            close_logfile(orig_stdout, logfile)
def nearestCentroid(orig_dataset=False, fine_tune=False, feature_select=None,
                    folds=1000, dump=False, **kwargs):
    """Evaluate a NearestCentroid classifier through ``test_classifier``.

    Exactly one of three modes runs, checked in this order:

    * ``orig_dataset`` -- score an untuned classifier on ``f.orig_df`` using
      all of its columns (with 'poi' moved to the front), then return.
    * ``not fine_tune`` -- score an untuned classifier on ``f.df``, either on
      all of ``f.featureCols`` or on the subset chosen by ``feature_select``.
    * ``fine_tune`` -- grid-search a scale -> reduce_dim -> classify pipeline
      (scored on precision), score the best estimator and optionally dump it.

    Args:
        orig_dataset: When true, use the original data frame and ignore every
            other option except ``folds``.
        fine_tune: When true, run the grid search instead of a plain test.
        feature_select: One of 'kbest', 'xgboost', 'random_forest' or
            'xgboost_cv', matched case-insensitively. Any other value
            (including None) means "use all feature columns".
        folds: Forwarded to ``test_classifier``.
        dump: In fine-tune mode only, also call ``dump_classifier_and_data``
            with the best estimator.
        **kwargs: 'k', 'eval_func' and 'save' are read and forwarded to the
            feature-importance helpers on ``imp``.

    Returns:
        None. Results are reported by ``test_classifier``.
    """
    clf = NearestCentroid()
    dataset = f.df.to_dict('index')

    if orig_dataset:
        tester_dataset = f.orig_df.to_dict('index')
        # Rebuild the column list with 'poi' as its first entry.
        tester_features = list(f.orig_df.columns.values)
        tester_features.remove('poi')
        tester_features = ['poi'] + tester_features
        test_classifier(clf, tester_dataset, tester_features, folds)
        return

    if not fine_tune:
        # Normalise once so the membership test and the dispatch below agree
        # on case (previously only the dispatch lower-cased the value).
        selector = feature_select
        if hasattr(selector, 'lower'):
            selector = selector.lower()

        if selector not in ('kbest', 'xgboost', 'random_forest',
                            'xgboost_cv'):
            features = [f.targetCol] + f.featureCols
            test_classifier(clf, dataset, features, folds=folds)
            return

        k = kwargs.get('k')
        if selector == 'kbest':
            # No default for k here: it is passed through as given.
            eval_func = kwargs.get('eval_func')
            imp_features = list(
                imp.get_importance_kBest(k=k, eval_func=eval_func).keys())
        else:
            save = kwargs.get('save')
            if not k:
                k = 5
            if selector == 'xgboost':
                imp_features = list(
                    imp.get_importance_xgboost(save=save, k=k).keys())
            elif selector == 'random_forest':
                imp_features = list(
                    imp.get_importance_rf(save=save, k=k).keys())
                print(imp_features)
            else:  # 'xgboost_cv'
                imp_features = list(
                    imp.get_importance_xgboost(save=save, cv=True,
                                               k=k).keys())

        imp_features = [f.targetCol] + imp_features
        test_classifier(clf, dataset, imp_features, folds)
        return

    # ---- fine-tune mode --------------------------------------------------
    tester_features = [f.targetCol] + f.featureCols
    pipe = Pipeline([('scale', MaxAbsScaler()),
                     ('reduce_dim', PCA(random_state=42)),
                     ('classify', NearestCentroid())])

    number_of_features = range(2, f.df.shape[1] - 1)
    shrink_threshold = [None, 0.1, 0.6, 0.7, 0.8, 0.9, 1, 2, 5, 10]
    # Two grids that differ only in the dimensionality-reduction step.
    param_grid = [
        {
            'scale': [None, MaxAbsScaler(), StandardScaler(),
                      MinMaxScaler()],
            'reduce_dim': [PCA(random_state=42)],
            'reduce_dim__n_components': number_of_features,
            'classify__metric': ["euclidean", "manhattan"],
            'classify__shrink_threshold': shrink_threshold,
        },
        {
            'scale': [None, MaxAbsScaler(), StandardScaler(),
                      MinMaxScaler()],
            'reduce_dim': [SelectKBest()],
            'reduce_dim__k': number_of_features,
            'classify__metric': ["euclidean", "manhattan"],
            'classify__shrink_threshold': shrink_threshold,
        },
    ]

    cv = StratifiedShuffleSplit(random_state=42)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv,
                        scoring='precision', n_jobs=-1)

    # DataFrame.values is the supported equivalent of the deprecated
    # as_matrix(); read it once instead of converting twice.
    matrix = f.df.values
    # NOTE(review): 0:-2 leaves out the last TWO columns while the labels come
    # from the last one -- confirm the second-to-last column is meant to be
    # excluded from the search.
    features = matrix[:, 0:-2]
    labels = matrix[:, -1]
    grid.fit(features, labels)

    test_classifier(grid.best_estimator_, dataset, tester_features, folds)
    if dump:
        dump_classifier_and_data(grid.best_estimator_, dataset,
                                 tester_features)
# Hand the grid search configured above this chunk to test_clf (a helper
# defined outside this chunk).
test_clf(grid_search, labels, features, parameters)

# AdaBoost: search over ensemble size, boosting algorithm and learning rate,
# then pass the search object to the same helper.
clf = AdaBoostClassifier()
parameters = {'n_estimators': [10, 20, 30, 40, 50],
              'algorithm': ['SAMME', 'SAMME.R'],
              'learning_rate': [.5,.8, 1, 1.2, 1.5]}
grid_search = GridSearchCV(clf, parameters)
print '\nAdaBoost:'
test_clf(grid_search, labels, features, parameters)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# NOTE(review): the four split arrays below are not used again in this chunk.
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# `clf` is rebound here, so the object dumped below is a fresh, unfitted
# GaussianNB rather than the AdaBoost estimator or a grid-search result.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, my_features)
def svc(orig_dataset=False, fine_tune=False, feature_select=None, folds=1000,
        dump=False, **kwargs):
    """Evaluate a class-weighted SVC through ``test_classifier``.

    Exactly one of three modes runs, checked in this order:

    * ``orig_dataset`` -- score an untuned classifier on ``f.orig_df`` using
      all of its columns (with 'poi' moved to the front), then return.
    * ``not fine_tune`` -- score an untuned classifier on ``f.df``, either on
      all of ``f.featureCols`` or on the subset chosen by ``feature_select``.
    * ``fine_tune`` -- grid-search a scale -> reduce_dim -> classify pipeline
      (scored on F1), score the best estimator and optionally dump it.

    Args:
        orig_dataset: When true, use the original data frame and ignore every
            other option except ``folds``.
        fine_tune: When true, run the grid search instead of a plain test.
        feature_select: One of 'kbest', 'xgboost', 'random_forest' or
            'xgboost_cv', matched case-insensitively. Any other value
            (including None) means "use all feature columns".
        folds: Forwarded to ``test_classifier``.
        dump: In fine-tune mode only, also call ``dump_classifier_and_data``
            with the best estimator.
        **kwargs: 'k', 'eval_func' and 'save' are read and forwarded to the
            feature-importance helpers on ``imp``.

    Returns:
        None. Results are reported by ``test_classifier``.
    """
    # Class 1 is weighted 3.3x relative to class 0.
    clf = SVC(class_weight={0.: 1, 1.: 3.3})
    dataset = f.df.to_dict('index')

    if orig_dataset:
        tester_dataset = f.orig_df.to_dict('index')
        # Rebuild the column list with 'poi' as its first entry.
        tester_features = list(f.orig_df.columns.values)
        tester_features.remove('poi')
        tester_features = ['poi'] + tester_features
        test_classifier(clf, tester_dataset, tester_features, folds)
        return

    if not fine_tune:
        # Normalise once so the membership test and the dispatch below agree
        # on case (previously only the dispatch lower-cased the value).
        selector = feature_select
        if hasattr(selector, 'lower'):
            selector = selector.lower()

        if selector not in ('kbest', 'xgboost', 'random_forest',
                            'xgboost_cv'):
            features = [f.targetCol] + f.featureCols
            test_classifier(clf, dataset, features, folds=folds)
            return

        k = kwargs.get('k')
        if selector == 'kbest':
            # No default for k here: it is passed through as given.
            eval_func = kwargs.get('eval_func')
            imp_features = list(
                imp.get_importance_kBest(k=k, eval_func=eval_func).keys())
        else:
            save = kwargs.get('save')
            if not k:
                k = 5
            if selector == 'xgboost':
                imp_features = list(
                    imp.get_importance_xgboost(save=save, k=k).keys())
            elif selector == 'random_forest':
                imp_features = list(
                    imp.get_importance_rf(save=save, k=k).keys())
                print(imp_features)
            else:  # 'xgboost_cv'
                imp_features = list(
                    imp.get_importance_xgboost(save=save, cv=True,
                                               k=k).keys())

        imp_features = [f.targetCol] + imp_features
        test_classifier(clf, dataset, imp_features, folds)
        return

    # ---- fine-tune mode --------------------------------------------------
    tester_features = [f.targetCol] + f.featureCols
    pipe = Pipeline([('scale', MaxAbsScaler()),
                     ('reduce_dim', PCA(random_state=42)),
                     ('classify', SVC(class_weight={0.: 1, 1.: 3.3}))])

    number_of_features = range(2, f.df.shape[1] - 1)
    C_param = [0.1, 1, 10]
    gamma_param = range(10, 30)
    # Two grids that differ only in the dimensionality-reduction step.
    param_grid = [
        {
            'scale': [None, MaxAbsScaler()],
            'reduce_dim': [PCA(random_state=42)],
            'reduce_dim__n_components': number_of_features,
            'classify__C': C_param,
            'classify__gamma': gamma_param,
        },
        {
            'scale': [None, MaxAbsScaler()],
            'reduce_dim': [SelectKBest()],
            'reduce_dim__k': number_of_features,
            'classify__C': C_param,
            'classify__gamma': gamma_param,
        },
    ]

    cv = StratifiedShuffleSplit(random_state=42)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='f1',
                        n_jobs=-1)

    # DataFrame.values is the supported equivalent of the deprecated
    # as_matrix(); read it once instead of converting twice.
    matrix = f.df.values
    # NOTE(review): 0:-2 leaves out the last TWO columns while the labels come
    # from the last one -- confirm the second-to-last column is meant to be
    # excluded from the search.
    features = matrix[:, 0:-2]
    labels = matrix[:, -1]
    grid.fit(features, labels)

    test_classifier(grid.best_estimator_, dataset, tester_features, folds)
    if dump:
        dump_classifier_and_data(grid.best_estimator_, dataset,
                                 tester_features)
print "-----------------" return (grid_search.best_estimator_, score) best_classifier_score = 0 best_classifier = 0 for classifierKey in classifiers: classifierData = classifiers[classifierKey] classifier = classifierData['classifier'] parameters = classifierData.get('parameters') pipelineData = default_pipline[:] pipelineData.append(('classifier', classifier)) pipe = Pipeline(pipelineData) param = dict(default_param_grid) if (parameters != None): param.update(parameters) (clf, score) = fit_and_score(classifierKey, pipe, param) if (score > best_classifier_score): best_classifier = clf ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(best_classifier, my_dataset, features_list)
# Classifier validation ##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000) from tester import test_classifier t0 = time() decTree_best_clf = decTreeclf.best_estimator_ test_classifier(decTree_best_clf, my_dataset, features_list) print 'Processing time:', round(time() - t0, 3), 's' ##DecisionTreeClassifier Validation 2 (Cross validation) from sklearn.model_selection import cross_val_score t0 = time() decTree_best_clf = decTreeclf.best_estimator_ scores = cross_val_score(decTree_best_clf, features, labels, cv=5, scoring='accuracy') print("Accuracy and Deviation: " + str((scores.mean(), scores.std() * 2))) print 'Processing time:', round(time() - t0, 3), 's' test_classifier(decTree_best_clf, my_dataset, features_list) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(decTree_best_clf, my_dataset, features_list)
#SCALE REDUCED DATA #Scales the data sets that have the reduced numbers of features created above. scaler2 = preprocessing.MinMaxScaler() reduced_features = scaler2.fit_transform(reduced_unscaled_features) #SET UP GRID SEARCH PARAMETERS AND MODELS #Set up range of number of estimators to use in addition to previously defined ranges in random forest with GridsearchCV n_estimators = range(10,100,10) parametersRF = {'n_estimators': n_estimators, 'criterion':('gini','entropy')} #Creates the decision tree, random forest, and SVM classifiers rf=RandomForestClassifier() #Runs GridsearchCV with the selected model and features print "RF" RF=run_algorithm(rf, parametersRF, reduced_unscaled_features, reduced_labels) #Set up parameters for pipeline so that the entire pipeline can be passed to grader scaling = preprocessing.MinMaxScaler() estimators_RF = [('algorithm', RF)] print "Reduced RF" RRF = run_test(estimators_RF, my_dataset, reduced_features_list) #Pickles model, data, and selected features dump_classifier_and_data(RF, data_dict ,reduced_features_list)
#avg / total 0.71 0.84 0.77 38 clf = RandomForestClassifier( n_estimators=33, min_samples_leaf=2 ) #pick odd number of estimators to always get a decision test_classifier(clf, labels, features, test_size=0.3) # precision recall f1-score support # # 0.0 0.89 1.00 0.94 32 # 1.0 1.00 0.33 0.50 6 # #avg / total 0.91 0.89 0.87 38 # Choose NaiveBayes and test with N-fold cross-validation from tester import * clf = GaussianNB() test_classifier(clf, my_dataset, lean_list, folds=100) #GaussianNB(priors=None) #Accuracy: 0.86385 Precision: 0.60952 Recall: 0.32000 F1: 0.41967 F2: 0.35359 #Total predictions: 1300 True positives: 64 False positives: 41 False negatives: 136 True negatives: 1059 # Example starting point. Try investigating other evaluation techniques! ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, my_dataset, lean_list)
def main(): print "==========" import sys #import os import pickle from time import time ## evaluation from sklearn.metrics import precision_score, recall_score import matplotlib.pyplot as plt import pandas as pd #from ggplot import * import numpy as np from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.grid_search import GridSearchCV from sklearn.svm import SVC #from sklearn.datasets import load_iris from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.cross_validation import train_test_split ############################################################################### ############################################################################### ############################################################################### ## current file running print "Running:", sys.argv[0].split("/")[-1] t_start_all = time() ### import helper functions sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit ## make sure 'tester' in same dir from tester import dump_classifier_and_data ## moving loading dict code to be consistent with 'validate.py' ex from prev. ## lesson. 
### Load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) ''' #Example structure of data_dict: >>> data_dict {'METTS MARK': {'salary': 365788, 'to_messages': 807, 'deferral_payments': 'NaN', 'total_payments': 1061827, 'exercised_stock_options': 'NaN', 'bonus': 600000, 'restricted_stock': 585062, 'shared_receipt_with_poi': 702, 'restricted_stock_deferred': 'NaN', 'total_stock_value': 585062, 'expenses': 94299, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740, 'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 'NaN', 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address': '*****@*****.**', 'from_poi_to_this_person': 38 }, 'BAXTER JOHN C': {'salary': 267102, 'to_messages': 'NaN', 'deferral_payments': 1295738, 'total_payments': 5634343, 'exercised_stock_options': 6680544, 'bonus': 1200000, 'restricted_stock': 3942714, 'shared_receipt_with_poi': 'NaN', 'restricted_stock_deferred': 'NaN', 'total_stock_value': 10623258, 'expenses': 11200, 'loan_advances': 'NaN', 'from_messages': 'NaN', 'other': 2660303, 'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 'NaN', 'deferred_income': -1386055, 'long_term_incentive': 1586055, 'email_address': 'NaN', 'from_poi_to_this_person': 'NaN' }, ... ''' ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ##### Task 0. Data Exploration Rubric: --- Data Exploration (related mini-project: Lesson 5) Student response addresses the most important characteristics of the dataset and uses these characteristics to inform their analysis. Important characteristics include: total number of data points allocation across classes (POI/non-POI) number of features are there features with many missing values? etc. 
--- ''' print "START: Task 0 - Explore data." t_start_0 = time() Boolean_doTask0 = False if Boolean_doTask0: ### Following L5, "explore_enron_data_16021614.py", do some data exploration. # How many data points (people) are in the dataset? #print "total number of data points, len(data_dict):", len(data_dict) #>>> 146 # Display all keys: #print data_dict.keys() #>>> ['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN',.....] #print data_dict.items() #get list of dict items #print data_dict['METTS MARK'].keys() #>>> ['salary', 'to_messages', 'deferral_payments', #print "number of features, len(data_dict['METTS MARK'].keys()):", len(data_dict['METTS MARK'].keys()) #>>> 21 ''' The poi feature records whether the person is a person of interest, according to our definition. How many POIs are there in the E+F dataset? In other words, count the number of entries in the dictionary where data[person_name]["poi"]==1 ''' list_count_poi = [key for key, value in data_dict.iteritems() if (data_dict[key]['poi']==True)] #print len(list_count_poi) #>>> 18 #print "(POI/total), pre-outlier removal:", 1.0*len(list_count_poi)/(1.0*len(data_dict)) #>>> 0.1233 #print "(POI/non-POI), pre-outlier removal:", 1.0*len(list_count_poi)/(1.0*len(data_dict) - 1.0*len(list_count_poi)) #>>> 0.1406 #Whats the value of stock options exercised by Jeffrey Skilling? #print data_dict['SKILLING JEFFREY K'].keys() #print data_dict['SKILLING JEFFREY K']['exercised_stock_options'] #>>> 19250000 ''' Of these three individuals (Lay, Skilling and Fastow), who took home the most money (largest value of total_payments feature)? How much money did that person get? ''' #print "Skilling total_payments:", data_dict['SKILLING JEFFREY K']['total_payments'] #>>> 8682716 #print "Lay total_payments:", data_dict['LAY KENNETH L']['total_payments'] #>>> 103559793 #print "Fastow total_payments:", data_dict['FASTOW ANDREW S']['total_payments'] #>>> 2424083 ''' For nearly every person in the dataset, not every feature has a value. 
How is it denoted when a feature doesnt have a well-defined value? #NaN ''' #How many folks in this dataset have a quantified salary? #What about a known email address? list_count_quantifiedSalary = [key for key, value in data_dict.iteritems() if (data_dict[key]['salary']!='NaN')] #print "len(list_count_quantifiedSalary):",len(list_count_quantifiedSalary) #>>> 95 list_count_email_address = [key for key, value in data_dict.iteritems() if (data_dict[key]['email_address']!='NaN')] #print "len(list_count_email_address):",len(list_count_email_address) #>>> 111 ## "allocation across classes" - take this to mean "how many features are #non-NA" dict_summary = {} dict_summary2 = {} for feature in data_dict[data_dict.keys()[0]].keys(): #print name dict_summary[feature] = [key for key, value in data_dict.iteritems() if (data_dict[key][feature]!='NaN')] #print "not NaN: len,", feature,":",len(dict_summary[feature]) dict_summary2[feature] = len(dict_summary[feature]) ## plot via pandas? see forum suggestions. ### can find meaningful trends, outliers in which features to use. #plt.scatter(ages, net_worths) #plt.show() columns = data_dict.keys() index = data_dict[data_dict.keys()[0]].keys() df1 = pd.DataFrame(index=index, columns=columns) df1 = df1.fillna(0) # with 0s rather than NaNs for name in columns: L_temp = [] for feature in index: if data_dict[name][feature] == 'NaN': L_temp.append(0.0) else: L_temp.append(data_dict[name][feature]) df1[name] = L_temp ##plot #df1.transpose().plot(kind='scatter', x='salary', y='total_payments', color = 'poi') #bad df1 = df1.applymap(lambda x: 1 if x else 0) #df1.transpose().plot(kind='scatter', x='salary', y='total_payments', color = 'poi') ##ggplot way; needs ggplot to work #ggplot(aes(x='salary', y='total_payments', color='poi'), data=df1.transpose()) + geom_point() #ggplot not a 64-bit package!! cant use w 64-bit Anaconda ##need a non-ggplot approach to plotting by color. resume to matplotlib. 
df1T = df1.transpose() colors = np.where(df1T['poi'] == True, 'r', 'b') #plt.scatter(x=df1T['salary'], y=df1T['total_stock_value'], color = colors, alpha = 0.5) ##known outliers from manual check of enron.pdf df1T = df1T.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']) #plt.scatter(x=df1T['salary'], y=df1T['total_stock_value'], color = colors, alpha = 0.5) #plt.scatter(x=df1T['exercised_stock_options'], y=df1T['total_payments'], color = colors, alpha = 0.5) #exampple df['race_label'] = df.apply (lambda row: label_race (row),axis=1) df1T['poi_float'] = df1T.apply(lambda row: 1.0 if row['poi'] == True else 0.0, axis = 1) #plt.scatter(x=df1T['exercised_stock_options'], y=df1T['poi_float'], color = colors, alpha = 0.5) ''' [u'salary', u'to_messages', u'deferral_payments', u'total_payments', u'exercised_stock_options', u'bonus', u'restricted_stock', u'shared_receipt_with_poi', u'restricted_stock_deferred', u'total_stock_value', u'expenses', u'loan_advances', u'from_messages', u'other', u'from_this_person_to_poi', u'poi', u'director_fees', u'deferred_income', u'long_term_incentive', u'email_address', u'from_poi_to_this_person', u'poi_float'] ''' #plt.scatter(x=df1T['salary'], y=df1T['poi_float'], color = colors, alpha = 0.5) ## use this approach, but replace x-value with other variables. 
# salary is ok feature # to_messages is ok feature # deferral_payments is ok feature # total_payments is ok feature # exercised_stock_options is ok feature # bonus is ok feature # restricted_stock is ok feature # shared_receipt_with_poi is ok feature # restricted_stock_deferred is NOT ok feature #total_stock_value ok #expenses ok #loan_advances NOT ok #from_messages ~ok #other ~ok #from_this_person_to_poi ~ok #director_fees NOT ok #deferred_income ok #long_term_incentive ~ok #from_poi_to_this_person ~ok ''' ##################### ### Conclusions: ##################### >> Feature selection: ok features: salary, to_messages, deferral_payments, total_payments, exercised_stock_options, bonus, restricted_stock, shared_receipt_with_poi, total_stock_value, expenses, deferred_income not ok features: restricted_stock_deferred, loan_advances, director_fees unsure features: from_messages, other, from_this_person_to_poi, long_term_incentive, long_term_incentive, from_poi_to_this_person >> Outlier detection: 'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK' need to be removed after manual overview of the "enron61702insiderpay.pdf" doc. ''' print "END: Task 0 - Explore data." t_end_0 = time() print "Task 0 run time:", round(t_end_0 - t_start_0, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ##### Task 1: Select what features you'll use. Rubric: --- Intelligently select features (related mini-project: Lesson 11) Univariate or recursive feature selection is deployed, or features are selected by hand (different combinations of features are attempted, and the performance is documented for each one). Features that are selected are reported and the number of features selected is justified. 
For an algorithm that supports getting the feature importances (e.g. decision tree) or feature scores (e.g. SelectKBest), those are documented as well. Properly scale features (related mini-project: Lesson 9) If algorithm calls for scaled features, feature scaling is deployed. --- ''' ## *features_list is a list of strings, each of which is a feature name. ## ** The first feature must be "poi". ## ** You will need to use more features print "START: Task 1 - Feature Selection." t_start_1 = time() ### for brevity, include all features, and deselect with automated tools, such ### as KBest. Then, compare with anticipated features ("ok", "not ok") from T0. Boolean_doTask1 = True if Boolean_doTask1: features_all = data_dict[data_dict.keys()[0]].keys() ## errored. drop email address features_all.remove('email_address') features_all.remove('poi') features_list = ['poi'] + features_all print "END: Task 1 - Feature Selection." t_end_1 = time() print "Task 1 run time:", round(t_end_1 - t_start_1, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ### Task 2: Remove outliers Rubric: --- Outlier Investigation (related mini-project: Lesson 7) Student response identifies outlier(s) in the financial data, and explains how they are removed or otherwise handled. Outliers are removed or retained as appropriate. --- ''' print "START: Task 2 - Remove Outliers." t_start_2 = time() Boolean_doTask2 = True if Boolean_doTask2: del data_dict["TOTAL"] del data_dict["THE TRAVEL AGENCY IN THE PARK"] print "END: Task 2 - Remove Outliers." 
t_end_2 = time() print "Task 2 run time:", round(t_end_2 - t_start_2, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ### Task 3: Create new feature(s) Rubric: --- Create new features (related mini-project: Lesson 11) {} At least one new feature is implemented. Justification for that feature is provided in the written response, and the effect of that feature on the final algorithm performance is tested. --- ''' print "START: Task 3 - Feature creation." t_start_3 = time() Boolean_doTask3 = True if Boolean_doTask3: ### Store to my_dataset for easy export below. my_dataset = data_dict Boolean_doTask3_addNewFeatures = False if Boolean_doTask3_addNewFeatures: ### compute new features here, in "my_dataset", so not to disturb "data_dict" ## start: copy from studentCode_16030217.py, L11 def computeFraction(poi_messages, all_messages): """ given a number messages to/from POI (numerator) and number of all messages to/from a person (denominator), return the fraction of messages to/from that person that are from/to a POI """ ### you fill in this code, so that it returns either ### the fraction of all messages to this person that come from POIs ### or ### the fraction of all messages from this person that are sent to POIs ### the same code can be used to compute either quantity ### beware of "NaN" when there is no known email address (and so ### no filled email features), and integer division! ### in case of poi_messages or all_messages having "NaN" value, return 0. 
if poi_messages == "NaN" or all_messages == "NaN": fraction = 0.0 else: fraction = float(poi_messages)/float(all_messages) return fraction submit_dict = {} for name in my_dataset: data_point = my_dataset[name] ##from POI from_poi_to_this_person = data_point["from_poi_to_this_person"] to_messages = data_point["to_messages"] fraction_from_poi = computeFraction(from_poi_to_this_person, to_messages) ##to POI from_this_person_to_poi = data_point["from_this_person_to_poi"] from_messages = data_point["from_messages"] fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages ) ##populate dummy dict my_dataset[name]["fraction_from_poi"] = fraction_from_poi my_dataset[name]["fraction_to_poi"] = fraction_to_poi ## end: copy from studentCode_16030217.py ## add newly generated features to past "features_list" features_list = features_list + ["fraction_from_poi", "fraction_to_poi"] ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys = True) labels, features = targetFeatureSplit(data) print "END: Task 3 - Feature creation." t_end_3 = time() print "Task 3 run time:", round(t_end_3 - t_start_3, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ### Task 4: Try a variety of classifiers Rubric: --- Pick an algorithm (related mini-project: Lessons 1-3) {} At least 2 different algorithms are attempted and their performance is compared, with the more performant one used in the final analysis. --- ''' print "START: Task 4 - Classifier model study." t_start_4 = time() Boolean_doTask4 = True if Boolean_doTask4: models = [] # append in |("name", clf_pipeline, param_grid)| format for each `models` # entry. 
combine inside the loop, and then do CV, GridSearchCV, .fit, # results.append(clf.score_) ### define scalars,arrays to use in GridSearch feature_min = 1 if Boolean_doTask3_addNewFeatures: feature_max = 22 #change depending on how many new feat. if added = 22 else: feature_max = 20 #20 is max for original num of features. max_svc_max_iter = int(1e5) #1e5 crashed. try 1e3. crashing due to non-scaling. return to 1e5. svm_C = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000] svm_kernel = ["linear", "poly", "rbf", "sigmoid"] svm_gamma = [0.01, 0.1, 0.5, 0.9, 10, 100, 1000] dt_min_samples_split = [2,5,10,20,40,80,100,200,500,1000] knn_n_neighbors = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,25,30,40,50,75,100] Boolean_doTask4_fullGridSearch = False if Boolean_doTask4_fullGridSearch: # SVM variants models.append(('MinMaxSclr_KBest_SVM', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", SelectKBest()), ("svm", SVC(max_iter=max_svc_max_iter))]), #SVM:max_iter=1000 dict(features__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) models.append(('MinMaxSclr_PCA_SVM', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", PCA()), ("svm", SVC(max_iter=max_svc_max_iter))]), dict(features__n_components = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) models.append(('MinMaxSclr_PCAKBest_SVM', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("svm", SVC(max_iter=max_svc_max_iter))]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) models.append(('MinMaxSclr_KBestPCA_SVM', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("svm", 
SVC(max_iter=max_svc_max_iter))]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) ''' models.append(('KBest_SVM', Pipeline([("features", SelectKBest()), ("svm", SVC(max_iter=max_svc_max_iter))]), #SVM:max_iter=1000 dict(features__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) ''' #removed because crash point. ''' models.append(('PCA_SVM', Pipeline([("features", PCA()), ("svm", SVC(max_iter=max_svc_max_iter))]), dict(features__n_components = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) ''' #removed because crash point. ''' models.append(('PCAKBest_SVM', Pipeline([("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("svm", SVC(max_iter=max_svc_max_iter))]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) ''' #removed because crash point. ''' models.append(('KBestPCA_SVM', Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("svm", SVC(max_iter=max_svc_max_iter))]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), svm__C = svm_C, svm__kernel = svm_kernel, svm__gamma = svm_gamma) ) ) ''' #removed because crash point. 
# GNB variants models.append(('MinMaxSclr_KBest_GNB', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", SelectKBest()), ("gnb", GaussianNB())]), dict(features__k = range(feature_min,feature_max)) ) ) models.append(('MinMaxSclr_PCA_GNB', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", PCA()), ("gnb", GaussianNB())]), dict(features__n_components = range(feature_min,feature_max)) ) ) models.append(('MinMaxSclr_PCAKBest_GNB', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("gnb", GaussianNB())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max)) ) ) models.append(('MinMaxSclr_KBestPCA_GNB', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("gnb", GaussianNB())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max)) ) ) models.append(('KBest_GNB', Pipeline([("features", SelectKBest()), ("gnb", GaussianNB())]), dict(features__k = range(feature_min,feature_max)) ) ) models.append(('PCA_GNB', Pipeline([("features", PCA()), ("gnb", GaussianNB())]), dict(features__n_components = range(feature_min,feature_max)) ) ) models.append(('PCAKBest_GNB', Pipeline([("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("gnb", GaussianNB())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max)) ) ) models.append(('KBestPCA_GNB', Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("gnb", GaussianNB())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max)) ) ) # DT variants models.append(('KBest_DT', Pipeline([("features", 
SelectKBest()), ("dt", DecisionTreeClassifier())]), dict(features__k = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('MinMaxSclr_PCA_DT', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", PCA()), ("dt", DecisionTreeClassifier())]), dict(features__n_components = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('MinMaxSclr_PCAKBest_DT', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("dt", DecisionTreeClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('MinMaxSclr_KBestPCA_DT', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("dt", DecisionTreeClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('PCA_DT', Pipeline([("features", PCA()), ("dt", DecisionTreeClassifier())]), dict(features__n_components = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('PCAKBest_DT', Pipeline([("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("dt", DecisionTreeClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) models.append(('KBestPCA_DT', Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("dt", DecisionTreeClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = 
range(feature_min,feature_max), dt__min_samples_split = dt_min_samples_split) ) ) # KNN variants models.append(('MinMaxSclr_KBest_KNN', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", SelectKBest()), ("knn", KNeighborsClassifier())]), dict(features__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('MinMaxSclr_PCA_KNN', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", PCA()), ("knn", KNeighborsClassifier())]), dict(features__n_components = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('MinMaxSclr_KBestPCA_KNN', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("knn", KNeighborsClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('MinMaxSclr_PCAKBest_KNN', Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))), ("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("knn", KNeighborsClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('KBest_KNN', Pipeline([("features", SelectKBest()), ("knn", KNeighborsClassifier())]), dict(features__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('PCA_KNN', Pipeline([("features", PCA()), ("knn", KNeighborsClassifier())]), dict(features__n_components = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('KBestPCA_KNN', Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("knn", KNeighborsClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = 
range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) models.append(('PCAKBest_KNN', Pipeline([("features", FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])), ("knn", KNeighborsClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) else: models.append(('KBestPCA_KNN', Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()), ("pca", PCA())])), ("knn", KNeighborsClassifier())]), dict(features__pca__n_components = range(feature_min,feature_max), features__univ_select__k = range(feature_min,feature_max), knn__n_neighbors = knn_n_neighbors) ) ) # prepare results reports best_estimators = [] best_scores = [] names = [] cv = StratifiedShuffleSplit(y = labels, n_iter = 10, #default is 10; change to 30 for increased fidelity (Rationale: approx min samples needed for good approx of Gaus distr). Failed w large SVM max? reduce and test w cv=1. {} test_size = 0.1, random_state = 2016) # cycle through all grid searches for name, pipeline, param_grid in models: print "Start:", name grid_search = GridSearchCV(estimator = pipeline, param_grid = param_grid, verbose = 1, cv = cv, scoring = None, #default "scoring=None". try 'f1', 'recall' to combine both R and P. n_jobs = 1) # parallelize to lower runtime grid_search.fit(features, labels) best_estimators.append(grid_search.best_estimator_) best_scores.append([grid_search.best_score_]) names.append(name) print "End:", name print "grid_search.best_score_:", grid_search.best_score_ # boxplot algorithm comparison fig = plt.figure() fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) plt.boxplot(best_scores) ax.set_xticklabels(names) ax.tick_params(axis='both', which='major', labelsize=8) plt.setp(ax.xaxis.get_majorticklabels(), rotation=45) plt.show() print "END: Task 4 - Classifier model study." 
t_end_4 = time() print "Task 4 run time:", round(t_end_4 - t_start_4, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ### Task 5: Tune your classifier to achieve better than 0.3 precision and recall ### using our testing script. Rubric: --- Tune the algorithm (related mini-project: Lessons 2, 3, 13) Response addresses what it means to perform parameter tuning and why it is important. {} At least one important parameter tuned, with at least 3 settings investigated systematically, or any of the following are true: GridSearchCV used for parameter tuning Several parameters tuned Parameter tuning incorporated into algorithm selection (i.e. parameters tuned for more than one algorithm, and best algorithm-tune combination selected for final analysis) --- ''' print "START: Task 5 - Classifier tuning." t_start_5 = time() Boolean_doTask5 = False if Boolean_doTask5: pass print "END: Task 5 - Classifier tuning." t_end_5 = time() print "Task 5 run time:", round(t_end_5 - t_start_5, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### print "----------" ''' ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. Rubric: --- Usage of Evaluation Metrics (related mini-project: Lesson 14) At least two appropriate metrics are used to evaluate algorithm performance (e.g. precision and recall), and the student articulates what those metrics measure in context of the project task. Validation Strategy (related mini-project: Lesson 13) Response addresses what validation is and why it is important. 
Performance of the final algorithm selected is assessed by splitting the data into training and testing sets or through the use of cross validation, noting the specific type of validation performed. Algorithm Performance When tester.py is used to evaluate performance, precision and recall are both at least 0.3. --- ''' print "START: Task 6 - Dump classifier, dataset,and features_list." t_start_6 = time() Boolean_doTask6 = True if Boolean_doTask6: ###You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(best_estimators[best_scores.index(max(best_scores))], my_dataset, features_list) #output for display of finals results: print "names:", names print "best_scores:", best_scores print "names[best_scores.index(max(best_scores))]:", names[best_scores.index(max(best_scores))] print "best_scores[best_scores.index(max(best_scores))]:", best_scores[best_scores.index(max(best_scores))] print "best_estimators[best_scores.index(max(best_scores))]:", best_estimators[best_scores.index(max(best_scores))] print "best_estimators[best_scores.index(max(best_scores))].steps:", best_estimators[best_scores.index(max(best_scores))].steps print "END: Task 6 - Dump classifier, dataset,and features_list." t_end_6 = time() print "Task 6 run time:", round(t_end_6 - t_start_6, 16), "s" print "----------" ############################################################################### ############################################################################### ############################################################################### t_end_all = time() print "total run time:", round(t_end_all - t_start_all, 16), "s" print "=========="
### Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
print('Start training')

# Example starting point. Try investigating other evaluation techniques!
from sklearn import model_selection
from sklearn import ensemble

# Hold out 30% of the samples; the grid search only ever sees the training part.
features_train, features_test, labels_train, labels_test = \
    model_selection.train_test_split(features, labels,
                                     test_size=0.3, random_state=42)

# Tune the forest size with GridSearchCV's default cross-validation.
params_grid = {'n_estimators': [25, 50, 100, 150, 200, 300]}
clf = model_selection.GridSearchCV(ensemble.RandomForestClassifier(),
                                   params_grid,
                                   return_train_score=True)
clf.fit(features_train, labels_train)

# Freeze the winner *before* evaluation. Previously the GridSearchCV wrapper
# itself was handed to test_classifier and `clf.best_estimator_` was only read
# afterwards, so the dumped model was whatever the wrapper held after the
# evaluation (test_classifier presumably refits its argument on every
# cross-validation split -- confirm against tester.py) and was not the model
# whose metrics were printed.
best_clf = clf.best_estimator_
test_classifier(best_clf, my_dataset, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(best_clf, my_dataset, features_list)
print('Process finalised')
def main(): ########################################################################### ### Get features from global definitions orig_features_list = getFeaturesList() ### load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) ### Store to my_dataset for easy export below. orig_dataset = data_dict ########################################################################### ### Remove outliers orig_dataset = clean_data(orig_dataset) ########################################################################### ### Create new feature(s) my_features_list = orig_features_list my_dataset, my_features_list = create_new_features(orig_dataset, my_features_list) ### write full data to file data_df = pd.DataFrame(my_dataset) data_df.T.to_csv("full_data.csv", sep=',', encoding='utf-8') ########################################################################### ### Extract features and labels from dataset for local testing my_data = featureFormat(my_dataset, my_features_list, sort_keys=gl_sort_keys) my_labels, my_features = targetFeatureSplit(my_data) ### orig data orig_data = featureFormat(orig_dataset, orig_features_list, sort_keys=gl_sort_keys) orig_labels, orig_features = targetFeatureSplit(orig_data) ########################################################################### ### Preparation of training and testing data # without feature scaling my_features_train, my_features_test, my_labels_train, my_labels_test = \ train_test_split(my_features, my_labels, test_size=gl_test_size, random_state=gl_random_state) orig_features_train, orig_features_test, orig_labels_train, orig_labels_test = \ train_test_split(orig_features, orig_labels, test_size=gl_test_size, random_state=gl_random_state) if show_SelectKBest_results: print "BEST 10 FEATURES" bestTen = SelectKBest(f_classif, k=5) bestTen.fit(my_features_train, my_labels_train) try: scores = bestTen.scores_ indices = np.argsort(scores)[::-1] 
print("Features score ranking based on SelectKBest:") for f in range(np.array(my_features_train).shape[1]): print("%d. feature %s (%f)" % (f + 1, my_features_list[indices[f]], scores[indices[f]])) except: print "no scores available for the given combination" ### with feature scaling scaler = MinMaxScaler() my_features_scaled = scaler.fit_transform(my_features) my_features_train_scaled, my_features_test_scaled, \ my_labels_train_scaled, my_labels_test_scaled = \ train_test_split(my_features_scaled, my_labels, test_size=gl_test_size, random_state=gl_random_state) orig_features_scaled = scaler.fit_transform(orig_features) orig_features_train_scaled, orig_features_test_scaled, \ orig_labels_train_scaled, orig_labels_test_scaled = \ train_test_split(orig_features_scaled, orig_labels, test_size=gl_test_size, random_state=gl_random_state) print "Current test data size: " + str(gl_test_size * 100) + " %" print "Current train data size: " + str(100 - gl_test_size * 100) + " %" ########################################################################### ### PCA # prepare info for later output PCA_info = do_perform_PCA if PCA_info: PCA_info = pca_components # Do a PCA on the features for non scaled data my_features_train, my_features_test = \ do_PCA(my_features_train, my_features_test, pca_components) # Do a PCA on the features for scaled data my_features_train_scaled, my_features_test_scaled = \ do_PCA(my_features_train_scaled, my_features_test_scaled, pca_components) # Do a PCA on the features for non scaled data #orig_features_train, orig_features_test = \ # do_PCA(orig_features_train, orig_features_test, pca_components) # Do a PCA on the features for scaled data #orig_features_train_scaled, orig_features_test_scaled = \ # do_PCA(orig_features_train_scaled, orig_features_test_scaled, pca_components) ########################################################################### ### Train Classifier(s) print "###################################################################" 
print "Start performing selection of best algorithms and configurations " # calling the classifier validation with non-scaled features apply_clfs(my_features_train, my_features_test, my_features_train_scaled, my_features_test_scaled, my_labels_train, my_labels_test, gl_test_size, my_features_list) print "End performing selection of best algorithms and configurations " print "###################################################################" # pick 10 best performing classifier best_clf_config_list = clf_collection.sort_values(['precision','recall', 'accuracy','number of features'], ascending=[False,False,False,True]) clf_collection.sort_values(['precision', 'recall', 'accuracy', 'number of features'], ascending=[False, False, False, True]) # dump the results of all the tested classifiers and related configurtion # and train/test setup clf_collection.to_csv("training_data.csv", sep=',', encoding='utf-8') # iterating through all the classifiers chosen print "Validating list of best classifiers: " for index, best_clf_config in best_clf_config_list.iterrows(): # go for the best, instantiate it and dump the data best_clf_class_id = best_clf_config["class_id"] best_clf_params = best_clf_config["best parameters"] for id, clf_class, clf_kwargs, feat_scaling in gl_clf_list: if id == int(best_clf_class_id): try: # instantiate classifier best_clf = clf_class(**best_clf_params) best_clf_org = clf_class(**best_clf_params) if best_clf_config["features_scaled"]: # train the algorithm best_clf.fit(my_features_train_scaled, my_labels_train) best_clf_org.fit(orig_features_train_scaled, orig_labels_train) else: # train the algorithm best_clf_org.fit(orig_features_train, orig_labels_train) best_clf.fit(my_features_train, my_labels_train) print "start original data set" # test with original data set #v_o_total_predictions, v_o_accuracy, v_o_precision, v_o_recall, v_o_f1, v_o_f2 =\ # test_classifier(best_clf_org, orig_dataset, orig_features_list) #clf_best_collection.loc[1000 + index] 
= (best_clf_config["class_id"], # best_clf_config["clf"], # best_clf_config['features_scaled'], # len(orig_features_list), # str(my_features_list), # v_o_accuracy, v_o_precision, # v_o_recall, best_clf_params, # best_clf_config["best estimator"], # True, # create_new_message_features, # create_new_finance_features, # PCA_info) # dump final information dump_classifier_and_data(best_clf, my_dataset, my_features_list) print "start original data set with new features" #test with newly created features on top of the original data set v_total_predictions, v_accuracy, v_precision, v_recall, v_f1, v_f2 =\ test_classifier(best_clf, my_dataset, my_features_list, do_perform_PCA, pca_components, best_clf_config['features_scaled']) clf_best_collection.loc[index] = (best_clf_config["class_id"], best_clf_config["clf"], best_clf_config['features_scaled'], len(my_features_list), str(my_features_list), v_accuracy, v_precision, v_recall, best_clf_params, best_clf_config["best estimator"], False, create_new_message_features, create_new_finance_features, PCA_info ) except TypeError: clf_best_collection.loc[index] = (best_clf_config["class_id"], best_clf_config["clf"], best_clf_config['features_scaled'], len(my_features_list), str(my_features_list), "Error", "Error", "Error", best_clf_params, best_clf_config["best estimator"], False, create_new_message_features, create_new_finance_features, PCA_info ) cbc = clf_best_collection.sort_values(['precision', 'recall', 'accuracy'], ascending=[False, False, False]) # write classifier validation result to file cbc.to_csv(output_file_results, sep=',', encoding='utf-8') print "###################################################################" print "###################################################################"
from sklearn.naive_bayes import GaussianNB

# Disabled notebook cells: SelectKBest / PCA exploration with GaussianNB.
# Kept as a bare string literal so it has no effect at run time.
'''
parameters = {}
clf = GaussianNB()
find_kbest(clf, features, labels, parameters )

# In[63]:

doPCA(grid_search, features, labels, parameters)
'''

# ## Testing top two classifiers

# In[75]:

# Disabled notebook cell: first candidate, GaussianNB on five financial features.
'''
from tester import dump_classifier_and_data
from tester import main
my_features = ['poi', 'exercised_stock_options', 'total_stock_value', 'bonus', 'salary', 'deferred_income']
clf = GaussianNB()
dump_classifier_and_data(clf, my_dataset, my_features)
main()
'''

# In[76]:

from tester import dump_classifier_and_data
from tester import main

# Feature list for the second candidate; the label 'poi' comes first,
# followed by three financial features.
my_features = ['poi', 'exercised_stock_options', 'total_stock_value', 'bonus']
t= time.time() pipeline = Pipeline([('normalization', scaler), ('classifier', KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=1, metric='minkowski'))]) test_classifier(pipeline, enron_data, features_select(4)) print time.time()-t # ###Data dump # In[45]: ### Dump your classifier, dataset, and features_list so ### anyone can run/check your results. dump_classifier_and_data(pipeline, enron_data, features_select(4)) # ###Additional methods to explore include: # # * using k-fold cross-validation to improve model validation # In[ ]:
clf__C=[0.001, 0.1, 1, 10, 100, 1000, 10000, 1e3, 5e3, 1e4, 5e4, 1e5], clf__gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], reduce_dim__n_components=[1, 2, 4, 6, 8, 10, 12, 13]) param_grid = dict(clf__kernel=['sigmoid'], clf__C=[0.1], clf__gamma=[0.0001], reduce_dim__n_components=[14]) # scoring='%s_macro' % scores[1], grid_search = GridSearchCV(pipe, param_grid=param_grid, refit=True, cv=10) grid_search.fit(features_train, labels_train) labels_predict = grid_search.predict(features_test) from sklearn.ensemble import RandomForestClassifier clf_r = RandomForestClassifier(max_depth=6, random_state=0) clf_r.fit(features_train, labels_train) labels_predict_r = clf_r.predict(features_test) from sklearn.metrics import classification_report print('PCA and SVC', classification_report(labels_test, labels_predict)) print('Random Forest', classification_report(labels_test, labels_predict_r)) print(grid_search.best_estimator_.named_steps['reduce_dim'].n_components) # Task 6: Dump your classifier, dataset, and features_list so anyone can # check your results. You do not need to change anything below, but make sure # that the version of poi_id.py that you submit can be run on its own and # generates the necessary .pkl files for validating your results. dump_classifier_and_data(grid_search.best_estimator_, my_dataset, features_list)
feats.append(y) for x,y in zip(full_features_list[1:], clf_best.named_steps['skb'].scores_): list_scores.append({'feature_list' : x, "scores" : y}) print feats print pd.DataFrame(list_scores) print "---------------------------------------------------------------" for param_name in sorted(grid_search.param_grid.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) print "GridSearch time:" time1 = round(time()-t0,2) print time1 print "test_classifier time:" t1 = time() test_classifier(clf_best, my_dataset, full_features_list) time2 = round(time()-t1, 2) print time2 print "total time:", time2+time1 print "-----------------------------------------------------------------------" ############################################################################### ## Tune classifier ### Generates the necessary .pkl files for validating results. if full_report: for clf in [ dtc, gnc, knn, abc, rfc ]: test_classifier(clf, my_dataset, features_list) dump_classifier_and_data(clf_best, my_dataset, full_features_list)
# Baseline: the original features only, all of them kept by SelectKBest.
_ = build_model(original_features, estimator, {},
                use_kbest=True, k=['all'], use_scaler=True)

# Same model with each engineered feature added on its own, in this order:
# grand_total, from_poi_ratio, to_poi_ratio.
for engineered_feature in ('grand_total', 'from_poi_ratio', 'to_poi_ratio'):
    _ = build_model(original_features + [engineered_feature], estimator, {},
                    use_kbest=True, k=['all'], use_scaler=True)

# ----------------------------------------------------------
# Final Model
# ----------------------------------------------------------
# Original features plus grand_total; `k` is left at build_model's default.
final_model, final_features = build_model(
    original_features + ['grand_total'],
    estimator,
    {},
    use_kbest=True,
    use_scaler=True)
test_classifier(final_model, data_dict, final_features, folds=1000)

# ----------------------------------------------------------
# Dump Classifier and Data
# ----------------------------------------------------------
dump_classifier_and_data(final_model, data_dict, final_features)
### KNN Classifier ### ####################### # start_time = time.time() # params = {'n_neighbors': [3,4,5,6] , 'weights':['uniform','distance'],'leaf_size':[15,20,25,30,40], 'n_jobs':[-1]} # cv_KNN = GridSearchCV(clf_KNN, params) # cv_KNN.fit(features, labels) # clf1 = cv_KNN.best_estimator_ # print cv_KNN.best_score_ # #test_classifier(clf1,my_dataset,features_list) # test_classifier(clf1,enron_data_sub,cols) # elapsed_time= start_time - time.time() # print elapsed_time # print ################################################################################### ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. ################################################################################### dump_classifier_and_data(clf1, enron_data_sub, cols)
('select_features', SelectKBest(f_classif, k=opt_features)), ('reduce_dim', PCA()), ('naive', GaussianNB())]) clf.fit(features_train, labels_train) pred = clf.predict(features_test) print("") print("Efficiency of selected algorithm:") print 'F1 score:\t', '{0:.2f}'.format(f1_score(labels_test, pred)) print 'Accuracy:\t', '{0:.2f}'.format(accuracy_score(labels_test, pred)) print 'Precision:\t', '{0:.2f}'.format(precision_score(labels_test, pred)) print 'Recall:\t', '{0:.2f}'.format(recall_score(labels_test, pred)) scores = clf.named_steps['select_features'].scores_ features_selected_bool = clf.named_steps['select_features'].get_support(indices=True) features_selected = [features_list[i+1] for i in features_selected_bool] features_scores = [scores[i] for i in features_selected_bool] print("") print('Feature scores:') for i in range(len(features_scores)): print features_selected[i], '{0:.2f}'.format(features_scores[i]) features_selected.insert(0, 'poi') ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, my_dataset, features_selected)
# Financial fields summed into the engineered 'combined' feature.
_COMBINED_FIELDS = ('salary', 'bonus', 'total_stock_value', 'total_payments',
                    'exercised_stock_options')


def _value_or_zero(inp):
    """Return ``inp`` as a float, mapping the dataset's 'NaN' marker to 0."""
    if inp == 'NaN':
        return 0
    return float(inp)


def _to_poi_ratio(record):
    """Return the share of a person's sent e-mails that went to a POI.

    Returns the string 'NaN' (the dataset's missing-value marker) when either
    operand is missing or when the person sent no messages, so the division
    is never attempted on unusable data.
    """
    sent_to_poi = record['from_this_person_to_poi']
    sent_total = record['from_messages']
    if sent_to_poi == 'NaN' or sent_total == 'NaN':
        return 'NaN'
    sent_total = float(sent_total)
    if sent_total == 0:
        return 'NaN'
    return float(sent_to_poi) / sent_total


def run_main():
    """Build the dataset, evaluate a gradient-boosting model and dump it.

    Steps: load the project pickle, drop the 'TOTAL' row, merge the
    text-learning results, add the engineered 'to_poi_ratio' and 'combined'
    features, evaluate a MinMaxScaler + GradientBoostingClassifier pipeline
    with ``test_classifier``, print the feature importances and finally call
    ``dump_classifier_and_data``.
    """
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    features_list = ['poi', 'email_subject', 'to_poi_ratio', 'combined',
                     'from_messages', 'expenses', 'deferred_income', 'other',
                     'restricted_stock', 'email_body']
    # Other candidates that were tried: 'long_term_incentive',
    # 'deferral_payments', 'restricted_stock_deferred'.
    #
    # Available features:
    #   bonus, deferral_payments, deferred_income, director_fees,
    #   email_address, email_body, email_subject, exercised_stock_options,
    #   expenses, from_messages, from_poi_to_this_person,
    #   from_this_person_to_poi, loan_advances, long_term_incentive, other,
    #   poi, restricted_stock, restricted_stock_preferred, salary,
    #   shared_receipt_with_poi, to_messages, total_payments,
    #   total_stock_value

    ### Load the dictionary containing the dataset
    with open("final_project_dataset.pkl", "r") as dataset_file:
        data_dict = pickle.load(dataset_file)

    ### Task 2: Remove outliers
    data_dict = remove_key(data_dict, 'TOTAL')

    get_sent_by_date.process_text_learning_features()
    data_dict = text_results_to_dataset.add_text_results(data_dict)

    ### Task 3: Create new feature(s)
    for record in data_dict.values():
        # Fraction of this person's sent mail that was addressed to a POI.
        record['to_poi_ratio'] = _to_poi_ratio(record)
        # Total compensation proxy; missing components count as 0.
        record['combined'] = sum(_value_or_zero(record[name])
                                 for name in _COMBINED_FIELDS)

    # NOTE(review): scale_features is defined elsewhere; it returns the
    # feature list that is used from here on -- confirm what it rewrites.
    features_list = scale_features(data_dict, [], features_list)

    ### Store to my_dataset for easy export below.
    my_dataset = data_dict

    ### Task 4: Try a variety of classifiers
    from sklearn import ensemble
    params = {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 20,
              'learning_rate': .5, 'min_samples_leaf': 1}
    clf = ensemble.GradientBoostingClassifier(**params)
    scaler = MinMaxScaler()
    scaler_clf = Pipeline([('scaler', scaler), ('clf', clf)])

    # Experiment notes:
    # RECALL: .39 features: 'email_subject','email_body','to_poi_ratio',
    #         'combined' max_depth=3, min_samples_split=10
    # RECALL: .41 features: same as above but max_depth = 2
    # RECALL: .36 with just email_body & email_subject

    ### Task 5: Tune your classifier to achieve better than .3 precision and
    ### recall using our testing script (stratified shuffle split CV).
    test_classifier(scaler_clf, my_dataset, features_list)

    # Presumably test_classifier fits scaler_clf in place, which also fits
    # the wrapped clf; feature_importances_ is only available after a fit.
    weights = clf.feature_importances_
    for w, f in zip(weights, features_list[1:]):
        print(str(w) + ' is the weight of ' + f)

    ### Dump your classifier, dataset, and features_list so
    ### anyone can run/check your results.
    # NOTE(review): the bare clf is dumped although scaler_clf was the model
    # evaluated above -- confirm which one should be exported.
    dump_classifier_and_data(clf, my_dataset, features_list)
# Tune a SelectKBest + GaussianNB pipeline with a grid search scored on
# precision, using a 60-iteration stratified shuffle split.
# NOTE(review): labels, clf_params, features_saved, labels_saved, t0 and
# all_features_list_saved are defined earlier in the script (not visible here).
pipe = Pipeline(steps=[('skbest', SelectKBest(score_func=f_classif)), ('clf', GaussianNB())])
cv = StratifiedShuffleSplit(labels,n_iter = 60,random_state = 42)
b_grid_search = grid_search.GridSearchCV(pipe, param_grid = clf_params,cv = cv,scoring = 'precision')
b_grid_search.fit(features_saved,labels_saved)
print 'Time:',round(time()-t0,3) ,'s\n'
t0 = time()
# pick a winner
best_clf_nb = b_grid_search.best_estimator_
print best_clf_nb
# Pull the two pipeline steps apart: step 0 is the selector, step 1 the classifier.
found_skb_nb=best_clf_nb.steps[0][1]
found_clf_nb=best_clf_nb.steps[1][1]
# Re-fit the selector on the saved data so get_support() can be mapped to names.
features=found_skb_nb.fit_transform(features_saved,labels_saved)
features_list_to_use_nb=np.asarray(all_features_list_saved)[found_skb_nb.get_support()].tolist()
print "\nFeatures used:"
print features_list_to_use_nb
test_classifier(found_clf_nb, dataset_to_export, ['poi']+features_list_to_use_nb)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# NOTE(review): this dumps svm_clf with features_list, not the NB model and
# feature subset tuned just above -- confirm that this is intended.
dump_classifier_and_data(svm_clf, dataset_to_export, ['poi']+features_list)
# grid search parameters = _get_parameters() scoring = ['precision', 'recall'] grid_search = GridSearchCV(mypipeline, parameters, scoring=scoring) _evaluate_grid_search(grid_search, mypipeline, parameters, feature_train, label_train) # this is for fixed parameters mypipeline.set_params(feat_select__n_components=5, clf__C=1e6, clf__gamma=1).fit(feature_train, label_train) _cross_validate(mypipeline, feature_train, label_train) prediction = mypipeline.predict(feature_test) # Provided to give you a starting point. Try a variety of classifiers. ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html # Example starting point. Try investigating other evaluation techniques! from sklearn.model_selection import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, data_dict, features_list)
### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html features_train, features_test, labels_train, labels_test = \ df.peform_StratifiedShuffleSplit(features, labels) ## GaussianNB print 'Performing GaussianNB' from sklearn.naive_bayes import GaussianNB clf_NB = GaussianNB() ''' Finding best number of features df.perform_plot_evaluation_metrics(clf_NB, my_dataset, kBest_features, 'GaussianNB') ''' tester.dump_classifier_and_data(clf_NB, my_dataset, kBest_features[:7]) t0 = time() tester.main() print "training time GaussianNB: ", round(time() - t0, 3), "s" ## Decision Tree print 'Performing Decision Tree' from sklearn import tree clf_DT = tree.DecisionTreeClassifier() params = {'criterion': ('gini', 'entropy'), 'splitter': ('best', 'random')} clf = GridSearchCV(clf_DT, params) clf.fit(features_train, labels_train) best_params = clf.best_params_ print 'Best parameters for Decision Tree: ' print best_params clf_DT = tree.DecisionTreeClassifier(splitter='random', criterion='entropy')
# NOTE(review): the first part of this fragment uses i, acc, prec, reca,
# acc_all, prec_all, reca_all and results_dict, none of which are defined
# here; it looks like the body of an enclosing loop over i (possibly inside
# a tuneNB() function). The indentation below is reconstructed -- verify it
# against the original script.
testing_features_list = [u'poi']
# Grow the feature list one feature at a time and score each prefix.
for feature in features_list_score_order:
    testing_features_list.append(feature)
    # Median-impute, then GaussianNB with class priors derived from i.
    pipe = Pipeline([('impute', Imputer(strategy='median')), ('classify', GaussianNB(priors=[(i/2.)*.1, (1 - (i/2.)*.1)]))])
    total_predictions, accuracy, precision, recall, f1, f2 = \
        test_classifier(pipe, my_dataset, testing_features_list, folds=200)
    acc.append(accuracy)
    prec.append(precision)
    reca.append(recall)
# Record the per-prior curves, keyed by i.
acc_all.append(acc)
prec_all.append(prec)
reca_all.append(reca)
results_dict['prec' + str(i)] = prec
results_dict['reca' + str(i)] = reca
results_dict['acc' + str(i)] = acc
#tuneNB()
test_df = pd.DataFrame(results_dict)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results
# Final model: three financial features and fixed priors of .15 / .85.
features_list_score_order = [u'poi', u'exercised_stock_options', u'total_stock_value', u'bonus']
pipe = Pipeline([('impute', Imputer(strategy='median')), ('classify', GaussianNB(priors=[.15, .85]))])
total_predictions, accuracy, precision, recall, f1, f2 = \
    test_classifier(pipe, my_dataset, features_list_score_order, folds=1000)
dump_classifier_and_data(pipe, my_dataset, features_list_score_order)
# Predict on the held-out features and report the PCA step's explained variance.
# NOTE(review): pipe, pca, features_test and the datasets/feature lists used
# below are defined earlier in the script (not visible here).
pred = pipe.predict(features_test)
print "\nPCA - explained variance: ", pca.explained_variance_ratio_
first_pc = pca.components_[0]
#print "\nFirst PC: ", first_pc
# Prints the label followed by whatever test_classifier returns.
print "\ntester result: ", \
    test_classifier(pipe, my_dataset_t, features_list_all, folds=1000)
# Best settings for NB: PCA - n_components = 8
# Best settings for DT: min_samples_split = 10; PCA - n_components = 3
#
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# dump decision tree classifier
clf_sub = clf_dt_1
my_dataset_sub = my_dataset_v5
features_list_sub = features_list_selector_9
dump_classifier_and_data(clf_sub, my_dataset_sub, features_list_sub)
#!/usr/bin/python '''###FINAL RESULTS### KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_neighbors=5, p=2, weights='distance') Accuracy: 0.86969 Precision: 0.63235 Recall: 0.36550 F1: 0.46324 F2: 0.39919 Total predictions: 13000 True positives: 731 False positives: 425 False negatives: 1269 True negatives: 10575 ''' import sys import pickle import pprint #sys.path.append("../tools/") #All files are in the final_project folder #### Comment out the 4 lines below before running import os os.getcwd() os.chdir("/Users/jas/Project-4-Identifying-Fraud-from-Enron-Email/final_project") os.getcwd() #### from feature_format import featureFormat, targetFeatureSplit from tester import test_classifier, dump_classifier_and_data ### Task 1: Select what features you'll use. ### features_list is a list of strings, each of which is a feature name. ### The first feature must be "poi". features_list = ['poi','salary','exercised_stock_options', 'bonus'] #This simpler model gives higher recall and precision than more features and #accuracy only goes down slightly - less than 1 % ### Load the dictionary containing the dataset data_dict = pickle.load(open("final_project_dataset.pkl", "r") ) ### Task 2: Remove outliers '''
### extract features and labels for gridsearch optimization # data extraction using k_best features list data = featureFormat(my_dataset, my_features_list, sort_keys=True) tru, trn = targetFeatureSplit(data) ## scale extracted features scaler = preprocessing.MinMaxScaler() trn = scaler.fit_transform(trn) # Set up cross validator (will be used for tuning all classifiers) cv = cross_validation.StratifiedShuffleSplit(tru, n_iter=10, random_state=42) ## Evaluate Final Adaboost Classifier # load tuned classifier pipeline best_a_pipe = pickle.load(open('best_clf_pipe.pkl', "r")) print 'best_a_clf\n' best_a_pipe test_classifier(best_a_pipe, my_dataset, my_features_list) print sep ### Dump your classifier, dataset, and features_list so ### anyone can run/check your results. dump_classifier_and_data(best_a_pipe, my_dataset, my_features_list)
# #print self.X_fit+X # best_words=self.wt.transform(self.X_fit+X) # word_pca = self.pca.fit_transform(best_words) # qqq = np.array(word_pca)[np.arange(len(self.y_fit)),:] # best_pca_train = self.pt.fit_transform(qqq,self.y_fit) # self.clf.fit(best_pca_train,self.y_fit) # #x=remove_low_frequency_words(X) # best_pca_test = self.pt.transform(np.array(word_pca)[np.arange(len(X))+len(self.X_fit)]) # #word_pca = self.pca.transform(best_words) # #best_pca = self.pt.transform(word_pca) # return self.clf.predict(best_pca_test) ## create filtered_gnb classifier #word_transformer = SelectKBest(f_regression,200) #pca = PCA(n_components=86) #pca_transformer = SelectKBest(f_classif,20) #classifier1 = DecisionTreeClassifier(min_samples_leaf=2) #classifier2 = GaussianNB() #classifier3 = KNeighborsClassifier() #filtered_gnb=FilteredGNB(word_transformer,pca,pca_transformer,classifier1) #print "FILTERED GNB CLASSIFIER USING ALL WORD FEATURES" #test_classifier(filtered_gnb, my_dataset, ["poi"]+ words.tolist(),folds=5) print "Gaussian NB with Word PCA Features:" test_classifier(GaussianNB(), my_dataset, ["poi"]+ best_word_pca_features) ### Dump your classifier, dataset, and features_list so ### anyone can run/check your results. dump_classifier_and_data(GaussianNB(), my_dataset, ["poi"]+best_word_pca_features)
### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html from sklearn.preprocessing import scale from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier from sklearn.svm import SVC from sklearn.cluster import KMeans from sklearn.ensemble import AdaBoostClassifier import tester # 创建GaussianNB clf_an1 = GaussianNB() tester.dump_classifier_and_data(clf_an1, my_dataset, features_list) tester.main() # 创建决策树 clf_an2 = DecisionTreeClassifier() tester.dump_classifier_and_data(clf_an2, my_dataset, features_list) tester.main() # 创建svc clf_an3 = SVC(kernel='linear') tester.dump_classifier_and_data(clf_an3, my_dataset, features_list) tester.main() # 创建kmeans clf_an4 = KMeans(n_clusters=2) tester.dump_classifier_and_data(clf_an4, my_dataset, features_list)
# NOTE(review): Clf, scv, selected_features, labels, RF, my_dataset and
# selected_features_list are defined earlier in the script (not visible here).
test_classifier(Clf, my_dataset, selected_features_list, folds = 1000)
print("------")

# Tuning K-Nearest Neighbors: exhaustive grid search scored on recall.
print("Tuning K-Nearest Neighbors")
t0 = time()
tuning_parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                     'weights': ('uniform', 'distance'),
                     'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                     'leaf_size': [1, 5, 10, 20, 30, 40, 50, 75, 100, 200]}
print("Tuning Parameters for Recall")
KNN = GridSearchCV(KNeighborsClassifier(), tuning_parameters, cv=scv, scoring = 'recall')
KNN.fit(selected_features, labels)
print("Best parameters are:")
print(KNN.best_params_)
print("tunning time: {0}".format(round(time()-t0, 3)))
Clf = KNN.best_estimator_
# The estimator evaluated here is the tuned KNN; the label previously said
# "random forest" (copy-paste from the preceding section).
print("measurements for tuned k-nearest neighbors classifier: ")
test_classifier(Clf, my_dataset, selected_features_list, folds = 1000)

## Final Selection and Evaluation
# The tuned random forest (RF), not the KNN above, is the exported model.
clf = RF.best_estimator_

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, selected_features_list)
print f1_score(pred, labels_test1) CLF_PICKLE_FILENAME = "my_classifier.pkl" DATASET_PICKLE_FILENAME = "my_dataset.pkl" FEATURE_LIST_FILENAME = "my_feature_list.pkl" def dump_classifier_and_data(clf, dataset, feature_list): with open(CLF_PICKLE_FILENAME, "w") as clf_outfile: pickle.dump(clf, clf_outfile) with open(DATASET_PICKLE_FILENAME, "w") as dataset_outfile: pickle.dump(dataset, dataset_outfile) with open(FEATURE_LIST_FILENAME, "w") as featurelist_outfile: pickle.dump(feature_list, featurelist_outfile) dump_classifier_and_data(bestknn, data_dict, features_list) with open("my_classifier.pkl", "r") as file: clf = pickle.load(file) print clf with open("my_dataset.pkl", "r") as file: data = pickle.load(file) for person in data: print person, data[person] break with open("my_feature_list.pkl", "r") as file: features = pickle.load(file) print features
def runTest(self, clf, features_list):
    """Dump ``clf`` with this object's dataset and run the tester script.

    The classifier, ``self.data_dict`` and ``features_list`` are written via
    ``dump_classifier_and_data`` and ``tester.main()`` is then invoked
    (presumably it reloads those files and evaluates them -- see tester.py).
    Returns None.
    """
    print("test result on stratified cross validation data....")
    dump_classifier_and_data(clf, self.data_dict, features_list)
    tester.main()
def evaluate_clasifier(df, extras, algo, dump=False):
    """Evaluate and possibly store classifier and data.

    Builds the engineered features, grid-searches the pipeline returned by
    ``create_pipeline`` over a 1000-iteration stratified shuffle split,
    prints the search results and per-feature selection scores, then runs
    ``test_classifier`` on the best estimator.

    Args:
        df: DataFrame holding the raw dataset.
        extras: extra-feature specification, unpacked into
            ``create_features`` and also passed to ``create_pipeline`` and
            ``init_logfile``.
        algo: algorithm identifier understood by ``create_pipeline``.
        dump: when True, the best estimator, dataset and feature list are
            written with ``dump_classifier_and_data``; when False the run is
            treated as a search and its output goes through the log file
            set up by ``init_logfile``.
    """
    if not dump:
        # Only redirect output for the search
        orig_stdout, logfile = init_logfile(extras, algo)

    try:
        ### Task 3: Create new feature(s)
        df = create_features(df, *extras)

        ### Extract features and labels from dataset for local testing
        dfx, dfy = features_split_df(df)

        ### Task 4/5: build and tune the classifier pipeline. Because of the
        ### small size of the dataset, stratified shuffle split cross
        ### validation is used. For more info:
        ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
        split_indices = StratifiedShuffleSplit(dfy, n_iter=1000, test_size=0.1)
        features_list = ['poi'] + dfx.columns.values.tolist()

        pipeline, params = create_pipeline(
            algo, extras, is_search=(not dump), max_features=len(dfx.columns))
        grid_searcher = GridSearchCV(
            pipeline, param_grid=params, cv=split_indices, n_jobs=-1,
            scoring=create_scorer(), verbose=0)

        t0 = time()
        with warnings.catch_warnings():
            # UserWarnings raised during fitting are deliberately silenced.
            warnings.simplefilter('ignore', UserWarning)
            grid_searcher.fit(dfx, y=dfy)
        print('\nTime to fit: {:0>8}\n'.format(
            dt.timedelta(seconds=(time() - t0))))

        print("Best parameters set:")
        print(grid_searcher.best_params_)
        print('')
        print('Grid score:')
        for candidate, mean_score, _ in grid_searcher.grid_scores_:
            print("%0.3f for %r" % (mean_score, candidate))
        print('')

        # Tabulate the score and selected flag of every input feature,
        # best score first, numbered from 1.
        selector = grid_searcher.best_estimator_.named_steps['selection']
        scored = pd.DataFrame(zip(
            dfx.columns.tolist(), selector.scores_, selector.get_support()))
        scored.columns = ['Feature', 'Score', 'Selected']
        scored = scored.sort_values(by=['Score'], ascending=False)
        scored.index = range(1, len(scored) + 1)
        n_selected = len(scored[scored.Selected])
        print('Scored features: {} selected'.format(n_selected))
        print(scored)
        print('')

        ### Task 6: Dump your classifier, dataset, and features_list so
        ### anyone can check your results.
        clf = grid_searcher.best_estimator_

        ### Store to my_dataset for easy export below.
        df = features_combine_df(dfx, dfy)
        my_dataset = df.to_dict(orient='index')

        test_classifier(clf, my_dataset, features_list)

        if dump:
            dump_classifier_and_data(clf, my_dataset, features_list)
    finally:
        # Undo the log redirect even when the search fails part-way, so the
        # log file is closed and the original stdout is handed back.
        if not dump:
            close_logfile(orig_stdout, logfile)
# Evaluate the tuned random forest and report how long the evaluation took.
# (rft, ab, metric, sss, features, labels, data_dict, my_dataset and
# features_list come from earlier in the script.)
rf = rft.best_estimator_
t0 = time()
test_classifier(rf, data_dict, features_list, folds = 100)
print("Random Forest evaluation time: %rs" % round(time()-t0, 3))

from sklearn.tree import DecisionTreeClassifier

# Candidate base estimators: decision trees of depth 1 through 5.
dt = [DecisionTreeClassifier(max_depth=(i+1)) for i in range(5)]
ab_params = {
    'base_estimator': dt,
    'n_estimators': range(50, 101, 10),
}

# Time the construction of the search object, then the fit itself.
t0 = time()
abt = GridSearchCV(ab, ab_params, scoring=metric, cv=sss)
print("AdaBoost tuning: %r" % round(time()-t0, 3))

t0 = time()
abt = abt.fit(features, labels)
print("AdaBoost fitting time: %rs" % round(time()-t0, 3))

# Evaluate the best AdaBoost configuration found by the search.
ab = abt.best_estimator_
t0 = time()
test_classifier(ab, data_dict, features_list, folds = 100)
print("AdaBoost evaluation time: %rs" % round(time()-t0, 3))

### Select tuned adaboost as best classifier
clf = ab

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
def evaluate(clf, my_dataset, features_list):
    """Dump the classifier and data, print a banner and run the tester.

    Returns whatever ``main()`` (imported from tester.py) returns.
    """
    dump_classifier_and_data(clf, my_dataset, features_list)
    banner = '{1}Udacity\'s Evaluation:{0}'.format(color.Normal, color.BlinkBlue)
    print(banner)
    # main comes from tester.py
    outcome = main()
    return outcome