def finish():
    try:
        if best_solution_so_far:

            print ""
            print "============================="
            print "Optimization Path:"
            print "============================="
            for solution in optimization_path:
                print_result_item(solution[0])
                print solution[1]
            print ""
            print "============================="
            print "Final Solution:"
            print "============================="
            print best_solution_so_far
            clf = best_solution_so_far[0][3]
            features_list = best_solution_so_far[1]
            dump_classifier_and_data(clf, my_dataset, features_list)
            print "Model saved with success."
        else:
            print ""
            print "No solution found"
    except Exception as e:
        print e
Example #2
def setup_and_test(my_dataset, features_list, classifier):
    # Dump classifier and features list, so we can test them
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    # Run testing script
    test_classifier(clf, dataset, feature_list)

    return
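
# For reference, a rough sketch of what the dump/load pair used above amounts
# to: one pickle file per object. The file names are an assumption based on
# the usual course tester.py (my_classifier.pkl, my_dataset.pkl,
# my_feature_list.pkl); they are not defined in this snippet.
import pickle

def dump_sketch(clf, dataset, feature_list):
    # Write the three pickles the grading script expects to find.
    for obj, path in [(clf, "my_classifier.pkl"),
                      (dataset, "my_dataset.pkl"),
                      (feature_list, "my_feature_list.pkl")]:
        with open(path, "w") as f:
            pickle.dump(obj, f)

def load_sketch():
    # Read them back in the same order: classifier, dataset, feature list.
    objs = []
    for path in ["my_classifier.pkl", "my_dataset.pkl", "my_feature_list.pkl"]:
        with open(path, "r") as f:
            objs.append(pickle.load(f))
    return tuple(objs)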
Example #3
def detect_poi():
### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
### Task 1: Remove outliers
    data_dict.pop('TOTAL',0)    
    
### Task 2: Select what features
### 'stk_pay_ratio','to_poi_ratio', 'from_poi_ratio','bonus_salary_ratio'
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
    my_dataset = data_dict
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)
     
### Task 3: Feature Selection
### Generate a set of 15 feature lists from these 4 features
### This way, all possible combinations of these features are tested

    all_features_list = fList_set()

### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation in tester.py
    metrics = []    
    clf = GaussianNB()    
### ptest uses Stratified shuffle split cross validation and calculates the precision
### Find the precision for every list
    for i in range(0,15):
        metrics.append(ptest(clf,my_dataset,all_features_list[i]))
### Choose the feature list that produces the best precision.
### For this dataset, a high precision is harder to achieve.
    best = np.array(metrics).argmax()  
    
### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset,all_features_list[best])

### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]
### Task 4: Try a variety of classifiers
    samples_split_values = [2,4]
    samples_leaf_values = [1,2]

    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,\
            min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)
### Choose the best classifier and feature set
    clf = GaussianNB()   

### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
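
# fList_set() is not shown in this snippet. A plausible sketch of how it could
# build the 15 feature lists iterated above: every non-empty combination of
# the four engineered ratios, each prefixed with 'poi' as featureFormat
# requires. The feature names mirror the calls above, but this implementation
# is only an assumption.
from itertools import combinations

def fList_set_sketch():
    ratios = ['stk_pay_ratio', 'to_poi_ratio', 'from_poi_ratio',
              'bonus_salary_ratio']
    feature_lists = []
    for r in range(1, len(ratios) + 1):
        for combo in combinations(ratios, r):
            feature_lists.append(['poi'] + list(combo))
    return feature_lists  # 4 + 6 + 4 + 1 = 15 lists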
def main():
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
    my_dataset = data_dict
    my_dataset = AddFeatures(my_dataset)
    # Excluded at our discretion.
    Exc1 = ["email_address"]
    # Replaced by creating better versions of the features
    Exc2 = ["to_messages", "from_messages", "from_this_person_to_poi", "from_poi_to_this_person"]
    # Excluded because highly correlated with stronger features
    Exc3 = [
        "deferral_payments",
        "expenses",
        "deferred_income",
        "restricted_stock_deferred",
        "director_fees",
        "long_term_incentive",
        "bonus",
        "total_payments",
        "salary",
        "total_stock_value",
        "restricted_stock",
        "exercised_stock_options",
        "other",
    ]
    exclude = Exc1 + Exc2 + Exc3
    # QueryDataSet(my_dataset)
    # ShowCorrel(my_dataset)
    features_list = next(my_dataset.itervalues()).keys()
    for i in exclude:
        features_list.remove(i)
    features_list.insert(0, features_list.pop(features_list.index("poi")))
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    ### Extract features and labels from dataset for local testing
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels
    )
    # Uncomment one of the tuning helpers below to define clf before dumping.
    # clf=TuneSVM(features, labels,features_list)
    # clf=TuneKNN(features, labels,features_list)
    # clf=NoTuneDT(features, labels,features_list)
    # clf=TuneDT(features,labels,features_list)
    # Note: "poi" was already moved to the front of features_list above, so it
    # is not inserted a second time here.
    dump_classifier_and_data(clf, my_dataset, features_list)
    test_classifier(clf, my_dataset, features_list)
Example #5
    print "For optimum",score,":"
    for name in classifier_names:
        print "  ",name,":  ",best_performance[score][name]


# acc  = accuracy_score(labels_test, pred)   # sklearn metrics expect y_true first
# print ""
# print "Accuracy:",acc," (Good predictions / All predictions)"
# pre = precision_score(labels_test, pred)
# print "Precision:",pre," (Real POIs / Predicted POIs)"
# rec  = recall_score(labels_test, pred)
# print "Recall:",rec," (Identified POIs / All POIs)"
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# features_train, features_test, labels_train, labels_test = \
    # train_test_split(features, labels, test_size=0.3, random_state=42)
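
# A minimal sketch of the kind of evaluation tester.test_classifier performs:
# pool predictions over many stratified shuffle splits and compute precision
# and recall from the aggregated counts. This is an approximation for
# illustration, not the tester.py implementation itself; it uses the old
# sklearn.cross_validation API that the rest of this file relies on.
from sklearn.cross_validation import StratifiedShuffleSplit

def sss_evaluate(clf, features, labels, folds=1000):
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_pos = false_pos = false_neg = 0
    for train_idx, test_idx in cv:
        features_train = [features[i] for i in train_idx]
        labels_train = [labels[i] for i in train_idx]
        features_test = [features[i] for i in test_idx]
        labels_test = [labels[i] for i in test_idx]
        clf.fit(features_train, labels_train)
        for truth, pred in zip(labels_test, clf.predict(features_test)):
            true_pos += int(truth == 1 and pred == 1)
            false_pos += int(truth == 0 and pred == 1)
            false_neg += int(truth == 1 and pred == 0)
    precision = 1.0 * true_pos / max(true_pos + false_pos, 1)
    recall = 1.0 * true_pos / max(true_pos + false_neg, 1)
    return precision, recall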

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# dump_classifier_and_data(clf, my_dataset, features_list)
dump_classifier_and_data(clf, data_dict, features_list)
print "F1-score on testing data: {:.4f}".format(results['Unoptimized Model']['F1 Score'])
print "\nOptimized Model\n------"
print "Final accuracy score on the testing data: {:.4f}".format(results['Optimized Model']['Accuracy'])
print "Final precision on testing data: {:.4f}".format(results['Optimized Model']['Precision'])
print "Final recall on testing data: {:.4f}".format(results['Optimized Model']['Recall'])
print "Final F1-score on the testing data: {:.4f}".format(results['Optimized Model']['F1 Score'])
print '\n'
print classification_report(labels_test, best_predictions)    

#%%
### Evaluate the final model by using 'test_classifier' function in 'tester.py' script.

# Convert the data to dictionary to be compatible with 'test_classifier' input format
selected_features_df = pd.DataFrame(data = selected_features, columns = selected_features_list)
labels_df = pd.DataFrame(data = labels)

my_dataset_df = pd.concat([labels_df, selected_features_df], axis=1)
my_dataset = pd.DataFrame.to_dict(my_dataset_df, orient='index')

print '\nPerformance of the model based on test_classifier function:'
# Pass the optimized model to the 'test_classifier' function
test_classifier(best_estimator, my_dataset, list(my_dataset_df.columns))   
 
#%%
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(best_estimator, my_dataset, list(my_dataset_df.columns))
Example #7
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

grid_search.fit(features, labels)

clf = grid_search.best_estimator_

#------------------------------------------------------------------------------

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(clf, my_dataset, features_list)

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

#------------------------------------------------------------------------------

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
#              decision_function_shape='ovo', degree=3, gamma='auto',
#              kernel='linear', max_iter=-1, probability=False,
#              random_state=20160308, shrinking=False, tol=0.001,
#              verbose=False))
pipe = make_pipeline(
          Imputer(axis=0, copy=True, missing_values='NaN',
                  strategy='median', verbose=0),
          ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
                               criterion='gini', max_depth=None,
                               max_features='sqrt', max_leaf_nodes=None,
                               min_samples_leaf=3, min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=30,
                               n_jobs=-1, oob_score=False,
                               random_state=20160308, verbose=0,
                               warm_start=False))
#pipe = make_pipeline(
#          Imputer(axis=0, copy=True, missing_values='NaN',
#                  strategy='median', verbose=0),
#          SelectFpr(alpha=0.05, score_func=f_classif),
#          ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
#                               criterion='gini', max_depth=None,
#                               max_features='sqrt', max_leaf_nodes=None,
#                               min_samples_leaf=3, min_samples_split=2,
#                               min_weight_fraction_leaf=0.0, n_estimators=30,
#                               n_jobs=-1, oob_score=False,
#                               random_state=20160308, verbose=0,
#                               warm_start=False))

# Task 6: Dump your classifier, dataset, and features_list
dump_classifier_and_data(pipe, df.to_dict(orient='index'), ['poi'] + F_ALL_NEW)
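
# Note: the Imputer class used above was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement. A hedged, roughly
# equivalent pipeline for newer installs, kept under a separate name so it
# does not shadow `pipe` above:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier

pipe_new_sklearn = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='median'),
    ExtraTreesClassifier(class_weight='balanced', criterion='gini',
                         max_features='sqrt', min_samples_leaf=3,
                         min_samples_split=2, n_estimators=30, n_jobs=-1,
                         random_state=20160308))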
Example #9

sd = StandardScaler()
fsl = FeatureSel(k_best=5, pca_comp=5)
# clf=Pipeline([("fsl",fsl),("sd",sd),("lvc",LinearSVC(C=0.000001))])


clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC())])

gscv=GridSearchCV(clf,{"lvc__C":np.logspace(-6,-1,5),
                       "fsl__k_best":[1,5,10],
                       "fsl__pca_comp":[0,5,10]},
                  scoring="recall",verbose=0)


gscv.fit(np.array(features),np.array(labels))

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html


test_classifier(gscv.best_estimator_, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(gscv.best_estimator_, my_dataset, features_list)
def dump(clf, my_dataset, features_list):
    dump_classifier_and_data(clf, my_dataset, features_list)
    return
Example #11
param_grid=tree_param_grid,
scoring="recall")



### Show results of parameter tuning
grid_search.fit(features_train, labels_train)
print "\nbest estimator: \n", (grid_search.best_estimator_),\
"\n best score:\n",grid_search.best_score_ ,\
"\n best params:\n",grid_search.best_params_


clf = grid_search.best_estimator_

features_selected_bool  =  clf.named_steps['skb'].get_support()
features_selected_list = [x for x, y in zip(features_selected_list[1:],
features_selected_bool ) if y]

print "\nselected features: ", features_selected_list



### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.



dump_classifier_and_data(clf, my_dataset, ["poi"]+features_selected_list)
# Fraction of messages received that came from POIs, and fraction of messages
# sent that went to POIs (denominators matched to received/sent counts).
enron_df['from_poi_ratio'] = enron_df['from_poi_to_this_person'] / enron_df['to_messages']
enron_df['to_poi_ratio'] = enron_df['from_this_person_to_poi'] / enron_df['from_messages']
enron_df['bonus_ratio'] = enron_df['bonus'] / enron_df['salary']
enron_df.fillna(0, inplace=True)
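
# One small guard worth keeping in mind for these ratios: dividing by a zero
# message count produces inf, which fillna(0) does not catch. A hedged helper
# (safe_ratio is not part of the original code):
import numpy as np

def safe_ratio(numerator, denominator):
    """Element-wise ratio with NaN and divide-by-zero results mapped to 0."""
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# e.g. enron_df['to_poi_ratio'] = safe_ratio(
#          enron_df['from_this_person_to_poi'], enron_df['from_messages'])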

# Separate labels and features
enron_df_labels = enron_df['poi']
enron_df_features = enron_df[enron_df.columns.difference(['poi'])]

# Use SelectKBest and GaussianNB
pipeline = Pipeline([
    ('kbest', SelectKBest()),
    ('gnb', GaussianNB())])

folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter=folds, random_state=42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], "kbest__score_func": [f_classif]}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

# Select Features
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])
my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list

my_dataset = enron_df[my_list].transpose().to_dict()

dump_classifier_and_data(clf.best_estimator_.steps[1][1], my_dataset, my_list)
print('\n ')
print('    ##############   Model with Highest Precision Score   ###############  ')
highest_precision = sorted(grid_search_dict_results, key=lambda k: k['precision'], reverse=True)
# highest_precision = sorted(grid_search_dict_results, key=itemgetter('precision'), reverse=True)
pprint(highest_precision[0])

print('\n ')
print('    #################   Model with Highest Recall Score ################# ')
highest_recall = sorted(grid_search_dict_results, key=lambda k: k['recall'], reverse=True)
# highest_recall = sorted(grid_search_dict_results, key=itemgetter('recall'), reverse=True)
pprint(highest_recall[0])

print('\n ')
print('    ####  Model with Highest F1, Precision, Recall, Accuracy Score  ##### ')
sorted_grid_search_dict_results = sorted(grid_search_dict_results, key=lambda k: (k['f1'], k['precision'], k['recall'], k['accuracy']), reverse=True)
pprint(sorted_grid_search_dict_results)
pprint(sorted_grid_search_dict_results[0]['best_estimator'])

### Submit / Export files for tester.py
print('\n ')
print('\n ')
my_clf = sorted_grid_search_dict_results[0]['best_estimator']
my_feature_list = all_features
from tester import dump_classifier_and_data, test_classifier
### Dump pkl files
dump_classifier_and_data(my_clf, my_dataset, my_feature_list)
### Run my_clf, my_dataset and my_feature_list against tester.test_classifier
print('   ###########   Final Results from Best Estimator Options    ###########   ')
test_classifier(my_clf, my_dataset, my_feature_list)
Example #14
print "***** Fitting SVM with GridSearchCV Tunning *****"
sk_fold_svc_t = StratifiedShuffleSplit(labels_train, 100, random_state=42)
gs_svc_t = GridSearchCV(pipe_svc_t, param_grid=param_grid_def, cv=sk_fold_svc_t, scoring='f1')
gs_svc_t.fit(features, labels)
clf_svc_t_be = gs_svc_t.best_estimator_

#### Naive Bayes Classifier Fitting - Tuning - clf_gnb8
print "***** Fitting GaussianNB Tuning *****"
sk_fold_gnb8 = StratifiedShuffleSplit(labels, 1000, random_state=42)
gs_gnb8 = GridSearchCV(pipe_gnb8, param_grid=parameters, cv=sk_fold_gnb8, scoring='f1')
gs_gnb8.fit(features, labels)
clf_gnb8_be = gs_gnb8.best_estimator_

print "Best Estimator Fitting SVM with GridSearchCV Tunning"
print clf_gnb8_be


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#dump_classifier_and_data(clf_nb, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_gs_nbp_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_dt, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_tree_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_svm_be, my_dataset, features_list_selection)
#dump_classifier_and_data(clf_svc_t_be, my_dataset, features_list_selection)
dump_classifier_and_data(clf_gnb8_be, my_dataset, features_list_selection)

Example #15
labels05, features05 = targetFeatureSplit(data05)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

clf05 = neighbors.KNeighborsClassifier()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!

features_train05, features_test05, labels_train05, labels_test05 = \
    train_test_split(features05, labels05, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf05, my_dataset, features_list05)
 #    #tune_random_forest()
 #
 #    best_features_list_rf = fraudfunctions.get_k_best(my_dataset, features_list, 9)
 #
 #    clf_rf = Pipeline(steps=[
 #        ('scaler', StandardScaler()),
 #        ('classifier', RandomForestClassifier(max_depth=5,
 #                                              n_estimators=25,
 #                                              random_state=42))
 #    ])
 #
 #    print "Random Forest Classifier : \n", tester.test_classifier(clf_rf, my_dataset, best_features_list_rf)
 #
 #
 #    '''         ADA BOOST CLASSIFIER            '''
 #
 #    #tune_ada_boost()
 #
 #    best_features_list_ab = fraudfunctions.get_k_best(my_dataset, features_list, 9)
 #
 #    clf_ab = Pipeline(steps=[
 #        ('scaler', StandardScaler()),
 #        ('classifier', AdaBoostClassifier(learning_rate=1.5,
 #                                          n_estimators=30,
 #                                          algorithm='SAMME.R'))
 #    ])
 #
 #    print "Ada Boost Classifier : \n", tester.test_classifier(clf_ab, my_dataset, best_features_list_ab)
'''         dump final algorithm classifier, dataset and features in the data directory         '''
dump_classifier_and_data(clf_lr, my_dataset, best_features_list_lr)
    str('%.2f' % rec_ab),
    str('%.2f' % prec_ab),
    str('%.3f' % score_ab),
    str('%.2f' % (ab_t1 - ab_t0))
])

#score_array = np.array([score_nb, score_svm, score_tree, score_knn, score_rf, score_ab], dtype=float)
#score_array = np.array([score_nb, score_svm, score_tree, score_knn], dtype=float)
#clf_list = [clf_nb, clf_svm, clf_tree, clf_knn, clf_rf, clf_ab]
#clf_list = [clf_nb, clf_svm, clf_tree, clf_knn]
#max_index = np.argmax(score_array)

#clf = clf_list[max_index]
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
#clf = clf_tree

dump_classifier_and_data(clf_nb, my_dataset, features_list,
                         "my_classifier_nb.pkl")
dump_classifier_and_data(clf_svm, my_dataset, features_list,
                         "my_classifier_svm.pkl")
dump_classifier_and_data(clf_tree, my_dataset, features_list,
                         "my_classifier_tree.pkl")
dump_classifier_and_data(clf_knn, my_dataset, features_list,
                         "my_classifier_knn.pkl")
dump_classifier_and_data(clf_rf, my_dataset, features_list,
                         "my_classifier_rf.pkl")
dump_classifier_and_data(clf_ab, my_dataset, features_list,
                         "my_classifier_ab.pkl")
Example #18
test_classifier(clf_NB3, my_dataset, features_list)

                                 
########################################################################


print("================   DTree    ========================")

test_classifier(clf_DT, my_dataset, features_list)



print("====================  NN ============================")

test_classifier(clf_NN, my_dataset, features_list)


print("====================  NN Scalled F =================")

test_classifier(classifier5, my_dataset, features_list) 






# dump the best classifier, dataset and features_list so
# anyone can run/check your results

dump_classifier_and_data(clf_NB, my_dataset, my_best_feature_list)
Example #19
#print "accuracy score is ",accuracy
#print "recall score is ",recall
#print "precision score is ",precision


# Example starting point. Try investigating other evaluation techniques!
#from sklearn.cross_validation import train_test_split
#features_train, features_test, labels_train, labels_test = \
#    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, testfeatureswithpoi)


##MYCODE :convert my_dataset dictionary to list
#import csv
#dictlist=[]
#temp=[]
#fieldnames=['name']


#for name,detail in my_dataset.iteritems():
#   temp.append(name)
   
#   for key,value in detail.iteritems():
       
#	   temp.append(value)
Example #20
algo.fit(feature_train, labels_train)
test_classifier(algo.best_estimator_, my_dataset, features_list)

###SVC

scaler = MinMaxScaler()
features = scaler.fit_transform(features)

feature_train, feature_test, labels_train, labels_test = \
 train_test_split( features, labels, test_size=0.3, random_state=42)

print '\nSVM:'
svc_clf = SVC()
parameters = {'C': [0.001, 0.01, 0.1, 1, 10], \
   'kernel': ['rbf', 'linear', 'poly'], \
   'gamma': [0.001, 0.01, 0.1, 1] }
algo = GridSearchCV(svc_clf, parameters)
algo.fit(feature_train, labels_train)
test_classifier(algo.best_estimator_, my_dataset, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

clf = gnb_clf
dump_classifier_and_data(clf, my_dataset, ['poi', \
    'exercised_stock_options', 'total_stock_value', \
    'bonus', 'salary', 'total'])

sys.stdout.close()
Example #21
dtc_clf = sklearn.tree.DecisionTreeClassifier() 
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring = scoring, cv = cv)

dtcclf.fit(features, labels)
print dtcclf.best_estimator_
print dtcclf.best_score_
print 'Processing time:',round(time()-t0,3) ,'s'


#Classifier validation
##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
   
test_classifier(dtc_best_clf, enron_data, eng_feature_list)

print 'Processing time:',round(time()-t0,3) ,'s'


##DecisionTreeClassifier Validation 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
   
scores = evaluate.evaluate_clf(dtc_best_clf, features, labels, num_iters=1000, test_size=0.3)
# (assumes evaluate_clf returns the per-trial scores used below)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:',round(time()-t0,3) ,'s'

#Dump my classifier
dump_classifier_and_data(dtc_best_clf, enron_data, eng_feature_list)

Example #22
#Feature scores
features_score = zip(features_list_6[1:25], clf.scores_[:24])
features_score = sorted(features_score, key=lambda s: s[1], reverse=True)
print u'The six features giving the classifier the highest precision and recall are:'
for i in features_score:
    print i
print '*' * 100

# ## Tune the algorithm

# In[60]:

#Tune the algorithm
print u'Tuning the KNeighborsClassifier parameters to improve performance:'
knc = KNeighborsClassifier(n_neighbors=2, weights='distance', n_jobs=-1)
print u'Computing; estimated run time about 2 minutes...'
test_classifier(knc, my_dataset, features_list_3, folds=1000)

# In[65]:

print u'*************** Final algorithm *********************'
print u'The tuned KNeighborsClassifier performs best'
print u'Parameters:  n_neighbors=2, weights=distance'
print u'Precision: 0.49715   Recall: 0.39250   F1: 0.43867'

# In[14]:

from tester import dump_classifier_and_data
dump_classifier_and_data(knc, my_dataset, features_list_3)
Example #23
cachedir = mkdtemp()
pipe = Pipeline(estimators)
print(str(pipe)+'\n')

#Training the classifier
pipe = pipe.fit(features_train, labels_train)

#Predicting the labels
knn_labels_predicted = pipe.predict(features_test)

#Calculating the accuracy, precision, recall and f1 scores
knn_accuracy = accuracy_score(labels_test, knn_labels_predicted)
knn_classification_report = classification_report(labels_test, knn_labels_predicted)

print("After Tuning and Feature Scaling:")
print("KNearestNeighbors accuracy score: {}.".format(knn_accuracy))
print("KNearestNeighbors classification report:\n{}.".format(knn_classification_report))
### Task 6: Dump your classifier, dataset, and features_list
### You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
           weights='uniform')

dump_classifier_and_data(clf=knn, dataset=my_dataset, feature_list=features_list)
from tester import test_classifier
test_classifier(clf=knn, dataset=my_dataset, feature_list=features_list)
def main():
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    # features_list = ['poi','salary', 'from_poi_to_this_person', 'exercised_stock_options', 'expenses'] # You will need to use more features
    features_list = available_features
    
    ### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

    ### Task 2: Remove outliers
    # pprint(data_dict)
    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = data_dict

    # pprint(my_dataset[my_dataset.keys()[0]])

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    
    # pprint(data)
    labels, features = targetFeatureSplit(data)
    # pprint(labels)
    # pprint(features)

    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html

    # Load previously saved stats to save time if we don't have to 
    # recompute some of them
    clf_scores = load_saved_scores()
    # pprint(clf_scores)
      
    # gather the stats for each classifier if its arguments have changed
    clf_scores = score_classifiers(my_dataset, features_list, saved_scores=clf_scores)

    with open(CLASSIFIER_STATS_FILE, 'w') as f:
      pickle.dump(clf_scores, f)

    pprint(clf_scores.values())
    
    best_clf_stats = {}
    for clf_str, clf_stats in clf_scores.items():
      for num_features, inner_stats in clf_stats.get('stats_by_n_features', {}).items():
        if inner_stats.get('f2', 0) >= best_clf_stats.get('f2', 0):
          best_clf_stats = inner_stats
    
    # best_clf = max([s for s in clf_scores.values() if 'f1' in s], key=itemgetter('f1'))
    pprint(['best classifier: ', best_clf_stats])

    # find classifiers that had >= 0.3 precision/recall
    balanced_clf_stats = []
    for clf_str, clf_stats in clf_scores.items():
      for num_features, inner_stats in clf_stats.get('stats_by_n_features', {}).items():
        if inner_stats.get('precision', 0) >= 0.3 and inner_stats.get('recall', 0) >= 0.3:
          balanced_clf_stats.append(inner_stats)
    pprint(['balanced classifiers/datasets: ', balanced_clf_stats])

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall 
    ### using our testing script.
    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation. For more info: 
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    # pprint(clf_scores.values(), indent=2)
    # test_classifier(clf, my_dataset, features_list)

    ### Dump your classifier, dataset, and features_list so 
    ### anyone can run/check your results.
    
    with open('my_dataset.json', 'w') as f:
        f.write(json.dumps(my_dataset, indent=2))
    dump_classifier_and_data(best_clf, my_dataset, features_list)
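
# The selection loop above ranks classifiers by their 'f2' entry. For
# reference, F-beta can be computed directly from precision and recall; with
# beta=2 recall is weighted more heavily than precision. A minimal sketch
# (the helper name is ours, not from the original code):
def f_beta(precision, recall, beta=2.0):
    if precision + recall == 0:
        return 0.0
    b2 = beta ** 2
    return (1 + b2) * precision * recall / (b2 * precision + recall)

# f_beta(0.4, 0.5) -> ~0.476, closer to recall than the plain F1 of ~0.444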
Example #25
def evaluate_classifier(df, extras, algo, dump=False):
    """Evaluate and possibly store classifier and data"""

    if not dump:
        # Only redirect output for the search
        orig_stdout, logfile = init_logfile(extras, algo)

    ### Task 3: Create new feature(s)
    df = create_features(df, *extras)

    ### Extract features and labels from dataset for local testing
    dfx, dfy = features_split_df(df)

    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    split_indices = StratifiedShuffleSplit(dfy, n_iter=1000, test_size=0.1)

    features_list = ['poi'] + dfx.columns.values.tolist()

    pipeline, params = create_pipeline(algo,
                                       extras,
                                       is_search=(not dump),
                                       max_features=len(dfx.columns))

    grid_searcher = GridSearchCV(pipeline,
                                 param_grid=params,
                                 cv=split_indices,
                                 n_jobs=-1,
                                 scoring=create_scorer(),
                                 verbose=0)

    t0 = time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)

        grid_searcher.fit(dfx, y=dfy)
        print '\nTime to fit: {:0>8}\n'.format(
            dt.timedelta(seconds=(time() - t0)))

        print "Best parameters set:"
        print grid_searcher.best_params_
        print ''

        print 'Grid score:'
        for params, mean_score, scores in grid_searcher.grid_scores_:
            print "%0.3f for %r" % (mean_score, params)
        print ''

        selector = grid_searcher.best_estimator_.named_steps['selection']
        scored = pd.DataFrame(
            zip(dfx.columns.tolist(), selector.scores_,
                selector.get_support()))

        scored.columns = ['Feature', 'Score', 'Selected']
        scored = scored.sort_values(by=['Score'], ascending=False)
        scored.index = range(1, len(scored) + 1)
        n_selected = len(scored[scored.Selected])
        print 'Scored features: {} selected'.format(n_selected)
        print scored
        print ''

        # n_pca_components = grid_searcher.best_estimator_.named_steps[
        #     'reducer'].n_components_

        # print "Reduced to {0} PCA components".format(n_pca_components)

        ### Task 6: Dump your classifier, dataset, and features_list so anyone can
        ### check your results. You do not need to change anything below, but make sure
        ### that the version of poi_id.py that you submit can be run on its own and
        ### generates the necessary .pkl files for validating your results.

        clf = grid_searcher.best_estimator_

        ### Store to my_dataset for easy export below.
        df = features_combine_df(dfx, dfy)
        my_dataset = df.to_dict(orient='index')

        test_classifier(clf, my_dataset, features_list)

        if dump:
            dump_classifier_and_data(clf, my_dataset, features_list)
        else:
            close_logfile(orig_stdout, logfile)
Example #26
def nearestCentroid(orig_dataset=False,
                    fine_tune=False,
                    feature_select=None,
                    folds=1000,
                    dump=False,
                    **kwargs):
    clf = NearestCentroid()
    dataset = f.df.to_dict('index')
    if orig_dataset:
        tester_dataset = f.orig_df.to_dict('index')
        tester_features = list(f.orig_df.columns.values)
        tester_features.remove('poi')
        tester_features = ['poi'] + tester_features
        test_classifier(clf, tester_dataset, tester_features, folds)
        return
    if not fine_tune:
        if feature_select not in [
                'kbest', 'xgboost', 'random_forest', 'xgboost_cv'
        ]:
            features = [f.targetCol] + f.featureCols
            test_classifier(clf, dataset, features, folds=folds)
        else:
            if feature_select.lower() == 'kbest':
                k = kwargs.get('k')
                eval_func = kwargs.get('eval_func')
                imp_features = imp.get_importance_kBest(
                    k=k, eval_func=eval_func).keys()
            elif feature_select.lower() == 'xgboost':
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_xgboost(save=save,
                                                          k=k).keys()
            elif feature_select.lower() == 'random_forest':
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_rf(save=save, k=k).keys()
                print imp_features
            else:
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_xgboost(save=save,
                                                          cv=True,
                                                          k=k).keys()
            imp_features = [f.targetCol] + imp_features
            test_classifier(clf, dataset, imp_features, folds)
    else:
        tester_features = [f.targetCol] + f.featureCols
        pipe = Pipeline([('scale', MaxAbsScaler()),
                         ('reduce_dim', PCA(random_state=42)),
                         ('classify', NearestCentroid())])

        number_of_features = range(2, f.df.shape[1] - 1)
        shrink_threshold = [None, 0.1, 0.6, 0.7, 0.8, 0.9, 1, 2, 5, 10]
        param_grid = [{
            'scale': [None,
                      MaxAbsScaler(),
                      StandardScaler(),
                      MinMaxScaler()],
            'reduce_dim': [PCA(random_state=42)],
            'reduce_dim__n_components':
            number_of_features,
            'classify__metric': ["euclidean", "manhattan"],
            'classify__shrink_threshold':
            shrink_threshold
        }, {
            'scale': [None,
                      MaxAbsScaler(),
                      StandardScaler(),
                      MinMaxScaler()],
            'reduce_dim': [SelectKBest()],
            'reduce_dim__k':
            number_of_features,
            'classify__metric': ["euclidean", "manhattan"],
            'classify__shrink_threshold':
            shrink_threshold
        }]
        cv = StratifiedShuffleSplit(random_state=42)
        grid = GridSearchCV(pipe,
                            param_grid=param_grid,
                            cv=cv,
                            scoring='precision',
                            n_jobs=-1)
        features = f.df.as_matrix()[:, 0:-2]
        labels = f.df.as_matrix()[:, -1]
        grid.fit(features, labels)
        test_classifier(grid.best_estimator_, dataset, tester_features, folds)
        if dump:
            dump_classifier_and_data(grid.best_estimator_, dataset,
                                     tester_features)
Example #27
test_clf(grid_search, labels, features, parameters)

clf = AdaBoostClassifier()
parameters = {'n_estimators': [10, 20, 30, 40, 50],
               'algorithm': ['SAMME', 'SAMME.R'],
               'learning_rate': [.5,.8, 1, 1.2, 1.5]}
grid_search = GridSearchCV(clf, parameters)
print '\nAdaBoost:'
test_clf(grid_search, labels, features, parameters)

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, my_features)
Example #28
def svc(orig_dataset=False,
        fine_tune=False,
        feature_select=None,
        folds=1000,
        dump=False,
        **kwargs):
    clf = SVC(class_weight={0.: 1, 1.: 3.3})
    dataset = f.df.to_dict('index')
    if orig_dataset:
        tester_dataset = f.orig_df.to_dict('index')
        tester_features = list(f.orig_df.columns.values)
        tester_features.remove('poi')
        tester_features = ['poi'] + tester_features
        test_classifier(clf, tester_dataset, tester_features, folds)
        return
    if not fine_tune:
        if feature_select not in [
                'kbest', 'xgboost', 'random_forest', 'xgboost_cv'
        ]:
            features = [f.targetCol] + f.featureCols
            test_classifier(clf, dataset, features, folds=folds)
        else:
            if feature_select.lower() == 'kbest':
                k = kwargs.get('k')
                eval_func = kwargs.get('eval_func')
                imp_features = imp.get_importance_kBest(
                    k=k, eval_func=eval_func).keys()
            elif feature_select.lower() == 'xgboost':
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_xgboost(save=save,
                                                          k=k).keys()
            elif feature_select.lower() == 'random_forest':
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_rf(save=save, k=k).keys()
                print imp_features
            else:
                save = kwargs.get('save')
                k = kwargs.get('k')
                if not k:
                    k = 5
                imp_features = imp.get_importance_xgboost(save=save,
                                                          cv=True,
                                                          k=k).keys()
            imp_features = [f.targetCol] + imp_features
            test_classifier(clf, dataset, imp_features, folds)
    else:
        tester_features = [f.targetCol] + f.featureCols
        pipe = Pipeline([('scale', MaxAbsScaler()),
                         ('reduce_dim', PCA(random_state=42)),
                         ('classify', SVC(class_weight={
                             0.: 1,
                             1.: 3.3
                         }))])

        number_of_features = range(2, f.df.shape[1] - 1)

        C_param = [0.1, 1, 10]
        gamma_param = range(10, 30)
        param_grid = [
            {
                'scale': [None, MaxAbsScaler()],
                'reduce_dim': [PCA(random_state=42)],
                'reduce_dim__n_components': number_of_features,
                'classify__C': C_param,
                'classify__gamma': gamma_param
            },
            {
                'scale': [None, MaxAbsScaler()],
                'reduce_dim': [SelectKBest()],
                'reduce_dim__k': number_of_features,
                'classify__C': C_param,
                'classify__gamma': gamma_param
            },
        ]
        cv = StratifiedShuffleSplit(random_state=42)
        grid = GridSearchCV(pipe,
                            param_grid=param_grid,
                            cv=cv,
                            scoring='f1',
                            n_jobs=-1)
        features = f.df.as_matrix()[:, 0:-2]
        labels = f.df.as_matrix()[:, -1]
        grid.fit(features, labels)
        test_classifier(grid.best_estimator_, dataset, tester_features, folds)
        if dump:
            dump_classifier_and_data(grid.best_estimator_, dataset,
                                     tester_features)
Example #29
    print "-----------------"

    return (grid_search.best_estimator_, score)


best_classifier_score = 0
best_classifier = 0
for classifierKey in classifiers:
    classifierData = classifiers[classifierKey]
    classifier = classifierData['classifier']

    parameters = classifierData.get('parameters')

    pipelineData = default_pipline[:]
    pipelineData.append(('classifier', classifier))
    pipe = Pipeline(pipelineData)
    param = dict(default_param_grid)
    if (parameters != None):
        param.update(parameters)

    (clf, score) = fit_and_score(classifierKey, pipe, param)
    if (score > best_classifier_score):
        best_classifier_score = score
        best_classifier = clf


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(best_classifier, my_dataset, features_list)
# Classifier validation
##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)

from tester import test_classifier
t0 = time()
decTree_best_clf = decTreeclf.best_estimator_
test_classifier(decTree_best_clf, my_dataset, features_list)
print 'Processing time:', round(time() - t0, 3), 's'

##DecisionTreeClassifier Validation 2  (Cross validation)

from sklearn.model_selection import cross_val_score
t0 = time()
decTree_best_clf = decTreeclf.best_estimator_
scores = cross_val_score(decTree_best_clf,
                         features,
                         labels,
                         cv=5,
                         scoring='accuracy')
print("Accuracy and Deviation: " + str((scores.mean(), scores.std() * 2)))
print 'Processing time:', round(time() - t0, 3), 's'
test_classifier(decTree_best_clf, my_dataset, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(decTree_best_clf, my_dataset, features_list)
Example #31
#SCALE REDUCED DATA
#Scales the data sets that have the reduced numbers of features created above. 
scaler2 = preprocessing.MinMaxScaler()
reduced_features = scaler2.fit_transform(reduced_unscaled_features)

#SET UP GRID SEARCH PARAMETERS AND MODELS
#Set up the range of n_estimators to use, in addition to the previously defined ranges, for the random forest GridSearchCV
n_estimators = range(10,100,10)
parametersRF = {'n_estimators': n_estimators, 'criterion':('gini','entropy')}

#Creates the decision tree, random forest, and SVM classifiers
rf=RandomForestClassifier()

#Runs GridSearchCV with the selected model and features

print "RF"
RF=run_algorithm(rf, parametersRF, reduced_unscaled_features, reduced_labels)

#Set up parameters for pipeline so that the entire pipeline can be passed to grader 
scaling = preprocessing.MinMaxScaler()

estimators_RF = [('algorithm', RF)]

print "Reduced RF"
RRF = run_test(estimators_RF, my_dataset, reduced_features_list)

#Pickles model, data, and selected features
dump_classifier_and_data(RF, data_dict, reduced_features_list)


#avg / total       0.71      0.84      0.77        38

clf = RandomForestClassifier(
    n_estimators=33, min_samples_leaf=2
)  #pick odd number of estimators to always get a decision
test_classifier(clf, labels, features, test_size=0.3)
#             precision    recall  f1-score   support
#
#        0.0       0.89      1.00      0.94        32
#        1.0       1.00      0.33      0.50         6
#
#avg / total       0.91      0.89      0.87        38

# Choose NaiveBayes and test with N-fold cross-validation
from tester import *

clf = GaussianNB()
test_classifier(clf, my_dataset, lean_list, folds=100)

#GaussianNB(priors=None)
#Accuracy: 0.86385       Precision: 0.60952      Recall: 0.32000 F1: 0.41967     F2: 0.35359
#Total predictions: 1300 True positives:   64    False positives:   41   False negatives:  136   True negatives: 1059
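
# Hedged sanity check of the GaussianNB numbers above, recomputed from the
# reported confusion counts (TP=64, FP=41, FN=136, TN=1059) with the standard
# formulas; the helper below is only for illustration.
def check_reported_scores(tp=64, fp=41, fn=136, tn=1059):
    accuracy = 1.0 * (tp + tn) / (tp + fp + fn + tn)        # 1123/1300 ~ 0.86385
    precision = 1.0 * tp / (tp + fp)                        # 64/105    ~ 0.60952
    recall = 1.0 * tp / (tp + fn)                           # 64/200    = 0.32000
    f1 = 2 * precision * recall / (precision + recall)      # ~ 0.41967
    f2 = 5 * precision * recall / (4 * precision + recall)  # ~ 0.35359
    return accuracy, precision, recall, f1, f2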

# Example starting point. Try investigating other evaluation techniques!
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, lean_list)
def main():
    print "=========="
    import sys
    #import os
    import pickle
    from time import time
    
    ## evaluation
    from sklearn.metrics import precision_score, recall_score 
    import matplotlib.pyplot as plt
    import pandas as pd
    #from ggplot import *
    import numpy as np
    
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    #from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.cross_validation import train_test_split
    ###############################################################################
    ###############################################################################
    ###############################################################################
    ## current file running
    print "Running:", sys.argv[0].split("/")[-1]
    t_start_all = time()
    
    ### import helper functions
    sys.path.append("../tools/")
    from feature_format import featureFormat, targetFeatureSplit
    ## make sure 'tester' in same dir 
    from tester import dump_classifier_and_data
    
    ## moving loading dict code to be consistent with 'validate.py' ex from prev.
    ## lesson.
    ### Load the dictionary containing the dataset
    with open("final_project_dataset.pkl", "r") as data_file:
        data_dict = pickle.load(data_file)
    
    '''
    #Example structure of data_dict:
    >>> data_dict
            {'METTS MARK': {'salary': 365788, 
                            'to_messages': 807, 
                            'deferral_payments': 'NaN', 
                            'total_payments': 1061827, 
                            'exercised_stock_options': 'NaN', 
                            'bonus': 600000, 
                            'restricted_stock': 585062, 
                            'shared_receipt_with_poi': 702, 
                            'restricted_stock_deferred': 'NaN', 
                            'total_stock_value': 585062, 
                            'expenses': 94299, 
                            'loan_advances': 'NaN', 
                            'from_messages': 29, 
                            'other': 1740, 
                            'from_this_person_to_poi': 1, 
                            'poi': False, 
                            'director_fees': 'NaN', 
                            'deferred_income': 'NaN', 
                            'long_term_incentive': 'NaN', 
                            'email_address': '*****@*****.**', 
                            'from_poi_to_this_person': 38
                            }, 
            'BAXTER JOHN C': {'salary': 267102, 
                              'to_messages': 'NaN', 
                              'deferral_payments': 1295738, 
                              'total_payments': 5634343, 
                              'exercised_stock_options': 6680544, 
                              'bonus': 1200000, 
                              'restricted_stock': 3942714, 
                              'shared_receipt_with_poi': 'NaN', 
                              'restricted_stock_deferred': 'NaN', 
                              'total_stock_value': 10623258, 
                              'expenses': 11200, 
                              'loan_advances': 'NaN', 
                              'from_messages': 'NaN', 
                              'other': 2660303, 
                              'from_this_person_to_poi': 'NaN', 
                              'poi': False, 
                              'director_fees': 'NaN', 
                              'deferred_income': -1386055, 
                              'long_term_incentive': 1586055, 
                              'email_address': 'NaN', 
                              'from_poi_to_this_person': 'NaN'
                              },
            ...
    '''
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ##### Task 0. Data Exploration
    
    Rubric:
        ---
        Data Exploration (related mini-project: Lesson 5)
            Student response addresses the most important characteristics of the 
            dataset and uses these characteristics to inform their analysis. 
            Important characteristics include:
                total number of data points
                allocation across classes (POI/non-POI)
                number of features
                are there features with many missing values? etc.
        ---
    '''
    print "START: Task 0 - Explore data."
    t_start_0 = time()
    
    Boolean_doTask0 = False
    if Boolean_doTask0:
        ### Following L5, "explore_enron_data_16021614.py", do some data exploration.
        
        # How many data points (people) are in the dataset?
        #print "total number of data points, len(data_dict):", len(data_dict)
        #>>> 146
        
        # Display all keys:
        #print data_dict.keys()
        #>>> ['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN',.....]
        
        #print data_dict.items()
        #get list of dict items
        
        #print data_dict['METTS MARK'].keys()
        #>>> ['salary', 'to_messages', 'deferral_payments',
        
        #print "number of features, len(data_dict['METTS MARK'].keys()):", len(data_dict['METTS MARK'].keys())
        #>>> 21
        
        '''
        The poi feature records whether the person is a person of interest, 
        according to our definition. How many POIs are there in the E+F dataset? 
        In other words, count the number of entries in the dictionary where
        data[person_name]["poi"]==1
        '''
        list_count_poi = [key for key, value in data_dict.iteritems() if (data_dict[key]['poi']==True)]
        #print len(list_count_poi)
        #>>> 18
        #print "(POI/total), pre-outlier removal:", 1.0*len(list_count_poi)/(1.0*len(data_dict))
        #>>> 0.1233
        #print "(POI/non-POI), pre-outlier removal:", 1.0*len(list_count_poi)/(1.0*len(data_dict) - 1.0*len(list_count_poi))
        #>>> 0.1406
        
        #Whats the value of stock options exercised by Jeffrey Skilling?
        #print data_dict['SKILLING JEFFREY K'].keys()
        #print data_dict['SKILLING JEFFREY K']['exercised_stock_options']
        #>>> 19250000
        
        
        '''
        Of these three individuals (Lay, Skilling and Fastow), who took home the most 
        money (largest value of total_payments feature)?
        
        How much money did that person get?
        '''
        #print "Skilling total_payments:", data_dict['SKILLING JEFFREY K']['total_payments']
        #>>> 8682716
        #print "Lay total_payments:", data_dict['LAY KENNETH L']['total_payments']
        #>>> 103559793
        #print "Fastow total_payments:", data_dict['FASTOW ANDREW S']['total_payments']
        #>>> 2424083
        
        '''
        For nearly every person in the dataset, not every feature has a value. 
        How is it denoted when a feature doesn't have a well-defined value?
        #NaN
        '''
        
        #How many folks in this dataset have a quantified salary? 
        #What about a known email address?
        list_count_quantifiedSalary = [key for key, value in data_dict.iteritems() if (data_dict[key]['salary']!='NaN')]
        #print "len(list_count_quantifiedSalary):",len(list_count_quantifiedSalary)
        #>>> 95
        list_count_email_address = [key for key, value in data_dict.iteritems() if (data_dict[key]['email_address']!='NaN')]
        #print "len(list_count_email_address):",len(list_count_email_address)
        #>>> 111
        
        ## "allocation across classes" - take this to mean "how many features are 
        #non-NA"
        dict_summary = {}
        dict_summary2 = {}
        for feature in data_dict[data_dict.keys()[0]].keys():
            #print feature
            dict_summary[feature] = [key for key, value in data_dict.iteritems() if (data_dict[key][feature]!='NaN')]
            #print "not NaN: len,", feature,":",len(dict_summary[feature])
            dict_summary2[feature] = len(dict_summary[feature])
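        ## (Hedged sketch, left commented out) print the per-feature non-NaN counts
        ## in descending order, using the dict_summary2 built just above:
        #for feature, count in sorted(dict_summary2.items(), key=lambda kv: kv[1], reverse=True):
        #    print feature, "non-NaN count:", count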
        
        ## plot via pandas? see forum suggestions.
        ### can find meaningful trends, outliers in which features to use.
        #plt.scatter(ages, net_worths)
        #plt.show()
        
        columns = data_dict.keys()
        index = data_dict[data_dict.keys()[0]].keys()
        df1 = pd.DataFrame(index=index, 
                           columns=columns)
        df1 = df1.fillna(0) # with 0s rather than NaNs
        for name in columns:
            L_temp = []
            for feature in index:
                if data_dict[name][feature] == 'NaN':
                    L_temp.append(0.0)
                else:
                    L_temp.append(data_dict[name][feature])
            df1[name] = L_temp
        ##plot    
        #df1.transpose().plot(kind='scatter', x='salary', y='total_payments', color = 'poi')
        #bad df1 = df1.applymap(lambda x: 1 if x else 0)
        #df1.transpose().plot(kind='scatter', x='salary', y='total_payments', color = 'poi')
        
        ##ggplot way; needs ggplot to work
        #ggplot(aes(x='salary', y='total_payments', color='poi'), data=df1.transpose())  + geom_point()
        ## ggplot is not available as a 64-bit package, so it can't be used with 64-bit Anaconda.
        ## Need a non-ggplot approach to colored plotting; revert to matplotlib.
        df1T = df1.transpose()
        colors = np.where(df1T['poi'] == True, 'r', 'b')
        #plt.scatter(x=df1T['salary'], y=df1T['total_stock_value'], color = colors, alpha = 0.5)
        
        ##known outliers from manual check of enron.pdf
        df1T = df1T.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']) 
        
        #plt.scatter(x=df1T['salary'], y=df1T['total_stock_value'], color = colors, alpha = 0.5)
        #plt.scatter(x=df1T['exercised_stock_options'], y=df1T['total_payments'], color = colors, alpha = 0.5)
        
        #example pattern: df['race_label'] = df.apply(lambda row: label_race(row), axis=1)
        df1T['poi_float'] = df1T.apply(lambda row: 1.0 if row['poi'] == True else 0.0, axis = 1)
        #plt.scatter(x=df1T['exercised_stock_options'], y=df1T['poi_float'], color = colors, alpha = 0.5)
        
        '''
        [u'salary', u'to_messages', u'deferral_payments', u'total_payments',
               u'exercised_stock_options', u'bonus', u'restricted_stock',
               u'shared_receipt_with_poi', u'restricted_stock_deferred',
               u'total_stock_value', u'expenses', u'loan_advances', u'from_messages',
               u'other', u'from_this_person_to_poi', u'poi', u'director_fees',
               u'deferred_income', u'long_term_incentive', u'email_address',
               u'from_poi_to_this_person', u'poi_float']
        '''
        #plt.scatter(x=df1T['salary'], y=df1T['poi_float'], color = colors, alpha = 0.5)
        ## use this approach, but replace the x-value with each candidate feature in turn
        ## (a commented loop sketch follows the per-feature notes below).
        # salary                    - ok
        # to_messages               - ok
        # deferral_payments         - ok
        # total_payments            - ok
        # exercised_stock_options   - ok
        # bonus                     - ok
        # restricted_stock          - ok
        # shared_receipt_with_poi   - ok
        # restricted_stock_deferred - NOT ok
        # total_stock_value         - ok
        # expenses                  - ok
        # loan_advances             - NOT ok
        # from_messages             - ~ok
        # other                     - ~ok
        # from_this_person_to_poi   - ~ok
        # director_fees             - NOT ok
        # deferred_income           - ok
        # long_term_incentive       - ~ok
        # from_poi_to_this_person   - ~ok
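        ## (Hedged sketch, left commented out) the same x-vs-poi_float screening can
        ## be looped over every numeric column of df1T instead of editing the x-value by hand:
        #for feature in [c for c in df1T.columns if c not in ('poi', 'poi_float', 'email_address')]:
        #    plt.figure()
        #    plt.scatter(x=df1T[feature], y=df1T['poi_float'],
        #                color=np.where(df1T['poi'] == True, 'r', 'b'), alpha=0.5)
        #    plt.xlabel(feature)
        #    plt.show()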
        
        '''
        ##################### 
        ### Conclusions:
        #####################
        
        >> Feature selection:
        ok features: 
        salary, to_messages, deferral_payments, total_payments, 
        exercised_stock_options, bonus, restricted_stock, shared_receipt_with_poi, 
        total_stock_value, expenses, deferred_income
        
        not ok features: 
        restricted_stock_deferred, loan_advances, director_fees
        
        unsure features:
        from_messages, other, from_this_person_to_poi,
        long_term_incentive, from_poi_to_this_person
        
        >> Outlier detection:
        'TOTAL' and 'THE TRAVEL AGENCY IN THE PARK' need to be removed after manual 
        overview of the "enron61702insiderpay.pdf" doc. 
        '''
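        ## (Hedged sketch, not executed) quick numeric cross-check of the manual
        ## outlier finding: the record with the largest quantified salary is expected
        ## to be the spreadsheet 'TOTAL' row.
        #key_max_salary = max([k for k in data_dict if data_dict[k]['salary'] != 'NaN'],
        #                     key=lambda k: data_dict[k]['salary'])
        #print "largest salary belongs to:", key_max_salary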
    
    print "END: Task 0 - Explore data."
    t_end_0 = time()
    print "Task 0 run time:", round(t_end_0 - t_start_0, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ##### Task 1: Select what features you'll use.
    
    Rubric:
        ---
        Intelligently select features (related mini-project: Lesson 11)
            Univariate or recursive feature selection is deployed, or features are 
            selected by hand (different combinations of features are attempted, and 
            the performance is documented for each one). Features that are selected 
            are reported and the number of features selected is justified. For an 
            algorithm that supports getting the feature importances 
            (e.g. decision tree) or feature scores (e.g. SelectKBest), those are 
            documented as well.
        Properly scale features (related mini-project: Lesson 9)
            If algorithm calls for scaled features, feature scaling is deployed.
        ---
    '''
    ## *features_list is a list of strings, each of which is a feature name.
    ## ** The first feature must be "poi".
    ## ** You will need to use more features
    print "START: Task 1 - Feature Selection."
    t_start_1 = time()
    
    ### For brevity, include all features and deselect with automated tools such as
    ### SelectKBest; then compare with the anticipated features ("ok", "not ok") from Task 0.
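    ### (Hedged sketch, not executed) once `features`/`labels` are extracted in Task 3,
    ### the univariate scores behind SelectKBest could be listed directly, e.g.:
    #skb = SelectKBest(k='all').fit(features, labels)
    #for f_name, f_score in sorted(zip(features_list[1:], skb.scores_),
    #                              key=lambda pair: pair[1], reverse=True):
    #    print f_name, round(f_score, 2)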
    
    Boolean_doTask1 = True
    if Boolean_doTask1:
        features_all = data_dict[data_dict.keys()[0]].keys()
        ## drop the non-numeric 'email_address' field, which caused errors downstream
        features_all.remove('email_address')
        features_all.remove('poi')
        features_list = ['poi'] + features_all
        
    print "END: Task 1 - Feature Selection."
    t_end_1 = time()
    print "Task 1 run time:", round(t_end_1 - t_start_1, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ### Task 2: Remove outliers
    
    Rubric:
        ---
        Outlier Investigation (related mini-project: Lesson 7)
            Student response identifies outlier(s) in the financial data, and 
            explains how they are removed or otherwise handled. Outliers are 
            removed or retained as appropriate.
        ---
    '''
    print "START: Task 2 - Remove Outliers."
    t_start_2 = time()
    
    Boolean_doTask2 = True
    if Boolean_doTask2:  
        del data_dict["TOTAL"]
        del data_dict["THE TRAVEL AGENCY IN THE PARK"]
        
    print "END: Task 2 - Remove Outliers."
    t_end_2 = time()
    print "Task 2 run time:", round(t_end_2 - t_start_2, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ### Task 3: Create new feature(s)
    
    Rubric:
        ---
        Create new features (related mini-project: Lesson 11)
            {} At least one new feature is implemented. Justification for that feature 
            is provided in the written response, and the effect of that feature on 
            the final algorithm performance is tested.
        ---
    '''
    print "START: Task 3 - Feature creation."
    t_start_3 = time()
    
    Boolean_doTask3 = True
    if Boolean_doTask3: 
        ### Store to my_dataset for easy export below.
        my_dataset = data_dict
        Boolean_doTask3_addNewFeatures = False
        if Boolean_doTask3_addNewFeatures:
            ### compute new features here, in "my_dataset", so not to disturb "data_dict"
            ## start: copy from studentCode_16030217.py, L11
            def computeFraction(poi_messages, all_messages):
                """ given a number messages to/from POI (numerator) 
                    and number of all messages to/from a person (denominator),
                    return the fraction of messages to/from that person
                    that are from/to a POI
               """
                ### you fill in this code, so that it returns either
                ###     the fraction of all messages to this person that come from POIs
                ###     or
                ###     the fraction of all messages from this person that are sent to POIs
                ### the same code can be used to compute either quantity
            
                ### beware of "NaN" when there is no known email address (and so
                ### no filled email features), and integer division!
                ### in case of poi_messages or all_messages having "NaN" value, return 0.
                if poi_messages == "NaN" or all_messages == "NaN":
                    fraction = 0.0
                else:
                    fraction = float(poi_messages)/float(all_messages)
                return fraction
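            # informal usage check (illustrative only):
            #   computeFraction(1, 4)     -> 0.25
            #   computeFraction("NaN", 4) -> 0.0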
            submit_dict = {}
            for name in my_dataset:
                data_point = my_dataset[name]
                ##from POI
                from_poi_to_this_person = data_point["from_poi_to_this_person"]
                to_messages = data_point["to_messages"]
                fraction_from_poi = computeFraction(from_poi_to_this_person, to_messages)
                ##to POI
                from_this_person_to_poi = data_point["from_this_person_to_poi"]
                from_messages = data_point["from_messages"]
                fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
                ##populate dummy dict
                my_dataset[name]["fraction_from_poi"] = fraction_from_poi
                my_dataset[name]["fraction_to_poi"] = fraction_to_poi
            ## end: copy from studentCode_16030217.py    
            ## add newly generated features to past "features_list"
            features_list = features_list + ["fraction_from_poi", "fraction_to_poi"]
        ### Extract features and labels from dataset for local testing
        data = featureFormat(my_dataset, features_list, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        
    print "END: Task 3 - Feature creation."
    t_end_3 = time()
    print "Task 3 run time:", round(t_end_3 - t_start_3, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ### Task 4: Try a variety of classifiers
    
    Rubric:
        ---
        Pick an algorithm  (related mini-project: Lessons 1-3)
            {} At least 2 different algorithms are attempted and their performance 
            is compared, with the more performant one used in the final analysis.
        ---
    '''
    print "START: Task 4 - Classifier model study."
    t_start_4 = time()
    
    Boolean_doTask4 = True
    if Boolean_doTask4:
        models = []
        # each `models` entry is a ("name", clf_pipeline, param_grid) tuple; the loop
        # further below builds a StratifiedShuffleSplit CV, runs GridSearchCV on each
        # pipeline, fits it, and records grid_search.best_score_ / best_estimator_.
        ### define scalars,arrays to use in GridSearch
        feature_min = 1
        if Boolean_doTask3_addNewFeatures:
            feature_max = 22 # 22 when the two new ratio features are added
        else:
            feature_max = 20 # 20 is the max for the original number of features
        max_svc_max_iter = int(1e5) # 1e5 crashed without scaling (1e3 was the interim fallback); with MinMax scaling in the pipelines it runs, so keep 1e5
        svm_C = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
        svm_kernel = ["linear", "poly", "rbf", "sigmoid"]
        svm_gamma = [0.01, 0.1, 0.5, 0.9, 10, 100, 1000]            
        dt_min_samples_split = [2,5,10,20,40,80,100,200,500,1000]
        knn_n_neighbors = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,25,30,40,50,75,100]
        Boolean_doTask4_fullGridSearch = False
        if Boolean_doTask4_fullGridSearch:
            # SVM variants
            models.append(('MinMaxSclr_KBest_SVM',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", SelectKBest()),
                                     ("svm", SVC(max_iter=max_svc_max_iter))]), #SVM:max_iter=1000
                           dict(features__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma)
                            )
                           )
            models.append(('MinMaxSclr_PCA_SVM',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", PCA()),
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__n_components = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma)
                            )
                           )
            models.append(('MinMaxSclr_PCAKBest_SVM',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma) 
                            )
                           )
            models.append(('MinMaxSclr_KBestPCA_SVM',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma) 
                            )
                           )
            '''
            models.append(('KBest_SVM',
                           Pipeline([("features", SelectKBest()),
                                     ("svm", SVC(max_iter=max_svc_max_iter))]), #SVM:max_iter=1000
                           dict(features__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma)
                            )
                           )
            ''' #removed because crash point.
            '''            
            models.append(('PCA_SVM',
                           Pipeline([("features", PCA()),
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__n_components = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma)
                            )
                           )
            ''' #removed because crash point.
            '''
            models.append(('PCAKBest_SVM',
                           Pipeline([("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma) 
                            )
                           )
            ''' #removed because crash point.
            '''
            models.append(('KBestPCA_SVM',
                           Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("svm", SVC(max_iter=max_svc_max_iter))]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                svm__C = svm_C, 
                                svm__kernel = svm_kernel, 
                                svm__gamma = svm_gamma) 
                            )
                           )
            ''' #removed because crash point.
            # GNB variants
            models.append(('MinMaxSclr_KBest_GNB',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", SelectKBest()),                                                                          
                                     ("gnb", GaussianNB())]),
                           dict(features__k = range(feature_min,feature_max))
                            )
                           )
            
            models.append(('MinMaxSclr_PCA_GNB',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", PCA()),                                                                          
                                     ("gnb", GaussianNB())]),
                           dict(features__n_components = range(feature_min,feature_max)) 
                            )
                           )
            models.append(('MinMaxSclr_PCAKBest_GNB',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("gnb", GaussianNB())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max)) 
                            )
                           )
            models.append(('MinMaxSclr_KBestPCA_GNB',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("gnb", GaussianNB())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max)) 
                            )
                           )
            models.append(('KBest_GNB',
                           Pipeline([("features", SelectKBest()),                                                                          
                                     ("gnb", GaussianNB())]),
                           dict(features__k = range(feature_min,feature_max))
                            )
                           )
            models.append(('PCA_GNB',
                           Pipeline([("features", PCA()),                                                                          
                                     ("gnb", GaussianNB())]),
                           dict(features__n_components = range(feature_min,feature_max)) 
                            )
                           )               
            models.append(('PCAKBest_GNB',
                           Pipeline([("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("gnb", GaussianNB())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max)) 
                            )
                           )
            models.append(('KBestPCA_GNB',
                           Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("gnb", GaussianNB())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max)) 
                            )
                           )
            # DT variants
            models.append(('KBest_DT',
                           Pipeline([("features", SelectKBest()),                                                                          
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__k = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split)
                            )
                           )
            models.append(('MinMaxSclr_PCA_DT',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", PCA()),                                                                          
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__n_components = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )               
            models.append(('MinMaxSclr_PCAKBest_DT',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )
            models.append(('MinMaxSclr_KBestPCA_DT',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )   
            models.append(('PCA_DT',
                           Pipeline([("features", PCA()),                                                                          
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__n_components = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )               
            models.append(('PCAKBest_DT',
                           Pipeline([("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )
            models.append(('KBestPCA_DT',
                           Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("dt", DecisionTreeClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                dt__min_samples_split = dt_min_samples_split) 
                            )
                           )
            # KNN variants
            models.append(('MinMaxSclr_KBest_KNN',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", SelectKBest()),                                                                          
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors)
                            )
                           )
            models.append(('MinMaxSclr_PCA_KNN',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", PCA()),                                                                          
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__n_components = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )               
            models.append(('MinMaxSclr_KBestPCA_KNN',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )
            models.append(('MinMaxSclr_PCAKBest_KNN',
                           Pipeline([("scale", MinMaxScaler(feature_range=(0, 1))),
                                     ("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )
            models.append(('KBest_KNN',
                           Pipeline([("features", SelectKBest()),                                                                          
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors)
                            )
                           )
            models.append(('PCA_KNN',
                           Pipeline([("features", PCA()),                                                                          
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__n_components = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )               
            models.append(('KBestPCA_KNN',
                           Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )
            models.append(('PCAKBest_KNN',
                           Pipeline([("features", FeatureUnion([("pca", PCA()), 
                                                                ("univ_select", SelectKBest())])),                                                                           
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )
        else:
            models.append(('KBestPCA_KNN',
                           Pipeline([("features", FeatureUnion([("univ_select", SelectKBest()),
                                                                ("pca", PCA())])),                                                                           
                                     ("knn", KNeighborsClassifier())]),
                           dict(features__pca__n_components = range(feature_min,feature_max),
                                features__univ_select__k = range(feature_min,feature_max),
                                knn__n_neighbors = knn_n_neighbors) 
                            )
                           )
        # prepare results reports                   
        best_estimators = []
        best_scores = []
        names = []
        cv = StratifiedShuffleSplit(y = labels, 
                                    n_iter = 10, # default 10; 30 would give higher fidelity (~minimum samples for a rough Gaussian estimate) but failed on the large SVM grids, so keep 10
                                    test_size = 0.1, 
                                    random_state = 2016)
        # cycle through all grid searches
        for name, pipeline, param_grid in models:
            print "Start:", name
            grid_search = GridSearchCV(estimator = pipeline, 
                                       param_grid = param_grid, 
                                       verbose = 1,
                                       cv = cv,
                                       scoring = None, # default scoring=None uses the estimator's own score (accuracy); 'f1' would balance recall and precision
                                       n_jobs = 1) # increase n_jobs to parallelize and lower runtime
            grid_search.fit(features, labels)
            best_estimators.append(grid_search.best_estimator_)
            best_scores.append([grid_search.best_score_])
            names.append(name)
            print "End:", name
            print "grid_search.best_score_:", grid_search.best_score_
        # boxplot algorithm comparison
        fig = plt.figure()
        fig.suptitle('Algorithm Comparison')
        ax = fig.add_subplot(111)
        plt.boxplot(best_scores)
        ax.set_xticklabels(names)
        ax.tick_params(axis='both', which='major', labelsize=8)
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        plt.show()
    
    print "END: Task 4 - Classifier model study."
    t_end_4 = time()
    print "Task 4 run time:", round(t_end_4 - t_start_4, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ### Task 5: Tune your classifier to achieve better than 0.3 precision and recall 
    ### using our testing script. 
    
    Rubric:
        ---
        Tune the algorithm (related mini-project: Lessons 2, 3, 13)
            Response addresses what it means to perform parameter tuning and why it 
            is important. {} At least one important parameter tuned, with at least 
            3 settings investigated systematically, or any of the following are true:
                GridSearchCV used for parameter tuning
                Several parameters tuned
                Parameter tuning incorporated into algorithm selection (i.e. 
                    parameters tuned for more than one algorithm, and best 
                    algorithm-tune combination selected for final analysis)
        ---
    '''
    
    print "START: Task 5 - Classifier tuning."
    t_start_5 = time()
    
    Boolean_doTask5 = False
    if Boolean_doTask5:
        pass
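        ## Parameter tuning is already folded into the Task 4 GridSearchCV runs above.
        ## (Hedged sketch, not executed) a focused re-tune around the winning pipeline
        ## could look like this, assuming that pipeline contains a "knn" step:
        #tune_grid = dict(knn__n_neighbors = [3, 5, 7, 9])
        #tuner = GridSearchCV(estimator = best_estimators[best_scores.index(max(best_scores))],
        #                     param_grid = tune_grid,
        #                     cv = cv,
        #                     scoring = 'f1')
        #tuner.fit(features, labels)
        #print "re-tuned params/score:", tuner.best_params_, tuner.best_score_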
        
    print "END: Task 5 - Classifier tuning."
    t_end_5 = time()
    print "Task 5 run time:", round(t_end_5 - t_start_5, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    print "----------"
    '''
    ### Task 6: Dump your classifier, dataset, and features_list so anyone can
    ### check your results. 
    
    Rubric:
        ---
        Usage of Evaluation Metrics (related mini-project: Lesson 14)
            At least two appropriate metrics are used to evaluate algorithm 
                performance (e.g. precision and recall), and the student 
                articulates what those metrics measure in context of the project 
                task.
        Validation Strategy (related mini-project: Lesson 13)
            Response addresses what validation is and why it is important.
            Performance of the final algorithm selected is assessed by splitting 
            the data into training and testing sets or through the use of cross 
            validation, noting the specific type of validation performed.
        Algorithm Performance
            When tester.py is used to evaluate performance, precision and recall 
            are both at least 0.3.
        ---
    '''
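    ## Informal note on the metrics referenced above, in the POI context:
    ##   precision = TP / (TP + FP): of the people flagged as POIs, the share that truly are POIs
    ##   recall    = TP / (TP + FN): of the true POIs, the share the model actually flags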
    print "START: Task 6 - Dump classifier, dataset,and features_list."
    t_start_6 = time()
    
    Boolean_doTask6 = True
    if Boolean_doTask6:
        ###You do not need to change anything below, but make sure
        ### that the version of poi_id.py that you submit can be run on its own and
        ### generates the necessary .pkl files for validating your results.
        dump_classifier_and_data(best_estimators[best_scores.index(max(best_scores))],
                                 my_dataset,
                                 features_list)
        #output for display of finals results:
        print "names:", names
        print "best_scores:", best_scores
        print "names[best_scores.index(max(best_scores))]:", names[best_scores.index(max(best_scores))]
        print "best_scores[best_scores.index(max(best_scores))]:", best_scores[best_scores.index(max(best_scores))]
        print "best_estimators[best_scores.index(max(best_scores))]:", best_estimators[best_scores.index(max(best_scores))]
        print "best_estimators[best_scores.index(max(best_scores))].steps:", best_estimators[best_scores.index(max(best_scores))].steps
        
    print "END: Task 6 - Dump classifier, dataset,and features_list."
    t_end_6 = time()
    print "Task 6 run time:", round(t_end_6 - t_start_6, 16), "s"
    print "----------"
    ###############################################################################
    ###############################################################################
    ###############################################################################
    t_end_all = time()
    print "total run time:", round(t_end_all - t_start_all, 16), "s"
    print "=========="
Exemple #34
0
### Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

print('Start training')

# Example starting point. Try investigating other evaluation techniques!
from sklearn import model_selection
from sklearn import ensemble
features_train, features_test, labels_train, labels_test = \
model_selection.train_test_split(features, labels, test_size=0.3,
                                 random_state=42)

params_grid = {'n_estimators': [25, 50, 100, 150, 200, 300]}
clf = model_selection.GridSearchCV(ensemble.RandomForestClassifier(),
                                   params_grid,
                                   return_train_score=True)
clf.fit(features_train, labels_train)
test_classifier(clf, my_dataset, features_list)
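
# A hedged, commented-out sketch of one alternative evaluation (not part of the
# original snippet): score the tuned forest with the same stratified-shuffle
# strategy tester.py uses, via the model_selection API imported above.
# cv = model_selection.StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# scores = model_selection.cross_val_score(clf.best_estimator_, features, labels, cv=cv, scoring='f1')
# print('mean F1 over shuffled splits: {:.3f}'.format(scores.mean()))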

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf.best_estimator_, my_dataset, features_list)
print('Process finalised')
Exemple #35
0
def main():

    ###########################################################################
    ### Get features from global definitions
    orig_features_list = getFeaturesList()

    ### load the dictionary containing the dataset
    with open("final_project_dataset.pkl", "r") as data_file:
        data_dict = pickle.load(data_file)

    ### Store to my_dataset for easy export below.
    orig_dataset = data_dict

    ###########################################################################
    ### Remove outliers
    orig_dataset = clean_data(orig_dataset)

    ###########################################################################
    ### Create new feature(s)
    my_features_list = orig_features_list
    my_dataset, my_features_list = create_new_features(orig_dataset, my_features_list)

    ### write full data to file
    data_df = pd.DataFrame(my_dataset)
    data_df.T.to_csv("full_data.csv", sep=',', encoding='utf-8')

    ###########################################################################
    ### Extract features and labels from dataset for local testing

    my_data = featureFormat(my_dataset, my_features_list, sort_keys=gl_sort_keys)
    my_labels, my_features = targetFeatureSplit(my_data)

    ### orig data
    orig_data = featureFormat(orig_dataset, orig_features_list, sort_keys=gl_sort_keys)
    orig_labels, orig_features = targetFeatureSplit(orig_data)

    ###########################################################################
    ### Preparation of training and testing data

    # without feature scaling
    my_features_train, my_features_test, my_labels_train, my_labels_test = \
        train_test_split(my_features, my_labels, test_size=gl_test_size,
                         random_state=gl_random_state)

    orig_features_train, orig_features_test, orig_labels_train, orig_labels_test = \
        train_test_split(orig_features, orig_labels, test_size=gl_test_size,
                         random_state=gl_random_state)

    if show_SelectKBest_results:
        print "SELECTKBEST FEATURE SCORES"
        bestTen = SelectKBest(f_classif, k=5)  # note: k=5 here despite the variable name; the ranking below covers all features
        bestTen.fit(my_features_train, my_labels_train)

        try:
            scores = bestTen.scores_
            indices = np.argsort(scores)[::-1]
            print("Features score ranking based on SelectKBest:")
            for f in range(np.array(my_features_train).shape[1]):
                print("%d. feature %s (%f)" % (f + 1, my_features_list[indices[f]], scores[indices[f]]))
        except Exception:
            print "no scores available for the given combination"

    ### with feature scaling
    scaler = MinMaxScaler()
    my_features_scaled = scaler.fit_transform(my_features)
    my_features_train_scaled, my_features_test_scaled, \
    my_labels_train_scaled, my_labels_test_scaled = \
        train_test_split(my_features_scaled, my_labels, test_size=gl_test_size,
                         random_state=gl_random_state)

    orig_features_scaled = scaler.fit_transform(orig_features)
    orig_features_train_scaled, orig_features_test_scaled, \
    orig_labels_train_scaled, orig_labels_test_scaled = \
        train_test_split(orig_features_scaled, orig_labels, test_size=gl_test_size,
                         random_state=gl_random_state)

    print "Current test  data size:  " + str(gl_test_size * 100) + " %"
    print "Current train data size:  " + str(100 - gl_test_size * 100) + " %"

    ###########################################################################
    ### PCA

    # prepare PCA info for later output: False when PCA is disabled,
    # otherwise the number of components used
    PCA_info = do_perform_PCA
    if PCA_info:
        PCA_info = pca_components

    # Do a PCA on the features for non scaled data
    my_features_train, my_features_test = \
        do_PCA(my_features_train, my_features_test, pca_components)

    # Do a PCA on the features for  scaled data
    my_features_train_scaled, my_features_test_scaled = \
        do_PCA(my_features_train_scaled, my_features_test_scaled, pca_components)

    # Do a PCA on the features for non scaled data
    #orig_features_train, orig_features_test = \
    #    do_PCA(orig_features_train, orig_features_test, pca_components)

    # Do a PCA on the features for  scaled data
    #orig_features_train_scaled, orig_features_test_scaled = \
    #    do_PCA(orig_features_train_scaled, orig_features_test_scaled, pca_components)

    ###########################################################################
    ### Train Classifier(s)

    print "###################################################################"
    print "Start performing selection of best algorithms and configurations "

    # calling the classifier validation with non-scaled features
    apply_clfs(my_features_train, my_features_test,
               my_features_train_scaled, my_features_test_scaled,
               my_labels_train, my_labels_test, gl_test_size, my_features_list)

    print "End performing selection of best algorithms and configurations "
    print "###################################################################"


    # rank all tested classifiers, best performing first
    best_clf_config_list = clf_collection.sort_values(
        ['precision', 'recall', 'accuracy', 'number of features'],
        ascending=[False, False, False, True])

    # dump the results of all the tested classifiers and the related configuration
    # and train/test setup
    clf_collection.to_csv("training_data.csv", sep=',', encoding='utf-8')

    # iterating through all the classifiers chosen
    print "Validating list of best classifiers: "
    for index, best_clf_config in best_clf_config_list.iterrows():
        # go for the best, instantiate it and dump the data
        best_clf_class_id = best_clf_config["class_id"]
        best_clf_params = best_clf_config["best parameters"]

        for id, clf_class, clf_kwargs, feat_scaling in gl_clf_list:
            if id == int(best_clf_class_id):

                try:
                    # instantiate classifier
                    best_clf = clf_class(**best_clf_params)
                    best_clf_org = clf_class(**best_clf_params)

                    if best_clf_config["features_scaled"]:
                        # train the algorithm
                        best_clf.fit(my_features_train_scaled, my_labels_train)
                        best_clf_org.fit(orig_features_train_scaled, orig_labels_train)
                    else:
                        # train the algorithm
                        best_clf_org.fit(orig_features_train, orig_labels_train)
                        best_clf.fit(my_features_train, my_labels_train)

                    print "start original data set"
                    # test with original data set
                    #v_o_total_predictions, v_o_accuracy, v_o_precision, v_o_recall, v_o_f1, v_o_f2 =\
                    #    test_classifier(best_clf_org, orig_dataset, orig_features_list)

                    #clf_best_collection.loc[1000 + index] = (best_clf_config["class_id"],
                    #                                  best_clf_config["clf"],
                    #                                  best_clf_config['features_scaled'],
                    #                                  len(orig_features_list),
                    #                                  str(my_features_list),
                    #                                  v_o_accuracy, v_o_precision,
                    #                                  v_o_recall, best_clf_params,
                    #                                  best_clf_config["best estimator"],
                    #                                  True,
                    #                                  create_new_message_features,
                    #                                  create_new_finance_features,
                    #                                  PCA_info)

                    # dump final information
                    dump_classifier_and_data(best_clf, my_dataset, my_features_list)

                    print "start original data set with new features"
                     #test with newly created features on top of the original data set
                    v_total_predictions, v_accuracy, v_precision, v_recall, v_f1, v_f2 =\
                        test_classifier(best_clf, my_dataset, my_features_list, do_perform_PCA,
                            pca_components, best_clf_config['features_scaled'])

                    clf_best_collection.loc[index] = (best_clf_config["class_id"],
                                                      best_clf_config["clf"],
                                                      best_clf_config['features_scaled'],
                                                      len(my_features_list),
                                                      str(my_features_list),
                                                      v_accuracy, v_precision, v_recall,
                                                      best_clf_params,
                                                      best_clf_config["best estimator"],
                                                      False,
                                                      create_new_message_features,
                                                      create_new_finance_features,
                                                      PCA_info
                                                      )
                except TypeError:
                    clf_best_collection.loc[index] = (best_clf_config["class_id"],
                                                      best_clf_config["clf"],
                                                      best_clf_config['features_scaled'],
                                                      len(my_features_list),
                                                      str(my_features_list),
                                                      "Error", "Error", "Error",
                                                      best_clf_params,
                                                      best_clf_config["best estimator"],
                                                      False,
                                                      create_new_message_features,
                                                      create_new_finance_features,
                                                      PCA_info
                                                     )

    cbc = clf_best_collection.sort_values(['precision', 'recall', 'accuracy'],
                                                 ascending=[False, False,
                                                            False])

    # write classifier validation result to file
    cbc.to_csv(output_file_results, sep=',', encoding='utf-8')

    print "###################################################################"
    print "###################################################################"
Exemple #36
0
from sklearn.naive_bayes import GaussianNB
'''
parameters = {}
clf = GaussianNB()
find_kbest(clf, features, labels, parameters )


# In[63]:

doPCA(grid_search, features, labels, parameters)
'''

# ## Testing top two classifiers

# In[75]:
'''
from tester import dump_classifier_and_data
from tester import main

my_features = ['poi', 'exercised_stock_options', 'total_stock_value', 'bonus',
              'salary', 'deferred_income']
clf = GaussianNB()
dump_classifier_and_data(clf, my_dataset, my_features)
main()
'''

# In[76]:
from tester import dump_classifier_and_data
from tester import main

my_features = ['poi', 'exercised_stock_options', 'total_stock_value', 'bonus']
t= time.time()
pipeline = Pipeline([('normalization', scaler), 
                     ('classifier', KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', 
                                                         leaf_size=30, p=1, metric='minkowski'))])

test_classifier(pipeline, enron_data, features_select(4))

print time.time()-t


# ###Data dump

# In[45]:


### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(pipeline, enron_data, features_select(4))


# ###Additional methods to explore include:
# 
# * using k-fold cross-validation to improve model validation (see the sketch in the next cell)

# In[ ]:
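
# A hedged sketch of that idea (kept disabled, like the quoted blocks above):
# k-fold cross-validation of the same pipeline, assuming `features` and `labels`
# arrays have been extracted with featureFormat/targetFeatureSplit as in the
# other examples.
'''
from sklearn.cross_validation import StratifiedKFold, cross_val_score

skf = StratifiedKFold(labels, n_folds=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, features, labels, cv=skf, scoring='f1')
print "mean F1 over 5 stratified folds:", round(cv_scores.mean(), 3)
'''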



Exemple #38
0
    clf__C=[0.001, 0.1, 1, 10, 100, 1000, 10000, 1e3, 5e3, 1e4, 5e4, 1e5],
    clf__gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    reduce_dim__n_components=[1, 2, 4, 6, 8, 10, 12, 13])

param_grid = dict(clf__kernel=['sigmoid'],
                  clf__C=[0.1],
                  clf__gamma=[0.0001],
                  reduce_dim__n_components=[14])

# scoring='%s_macro' % scores[1],
grid_search = GridSearchCV(pipe, param_grid=param_grid, refit=True, cv=10)
grid_search.fit(features_train, labels_train)
labels_predict = grid_search.predict(features_test)

from sklearn.ensemble import RandomForestClassifier
clf_r = RandomForestClassifier(max_depth=6, random_state=0)
clf_r.fit(features_train, labels_train)
labels_predict_r = clf_r.predict(features_test)

from sklearn.metrics import classification_report
print('PCA and SVC', classification_report(labels_test, labels_predict))
print('Random Forest', classification_report(labels_test, labels_predict_r))
print(grid_search.best_estimator_.named_steps['reduce_dim'].n_components)
# Task 6: Dump your classifier, dataset, and features_list so anyone can
# check your results. You do not need to change anything below, but make sure
# that the version of poi_id.py that you submit can be run on its own and
# generates the necessary .pkl files for validating your results.

dump_classifier_and_data(grid_search.best_estimator_, my_dataset,
                         features_list)
                feats.append(y)
        
        for x,y in zip(full_features_list[1:], clf_best.named_steps['skb'].scores_): 
            list_scores.append({'feature_list' : x, "scores" : y})
        
        print feats
        print pd.DataFrame(list_scores)        
        print "---------------------------------------------------------------"
        for param_name in sorted(grid_search.param_grid.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print "GridSearch time:" 
    time1 = round(time()-t0,2)
    print time1
    print "test_classifier time:" 
    t1 = time()
    test_classifier(clf_best, my_dataset, full_features_list)
    time2 = round(time()-t1, 2)
    print time2
    print "total time:", time2+time1
print "-----------------------------------------------------------------------"
############################################################################### 
## Tune classifier
### Generates the necessary .pkl files for validating results.

if full_report:
    for clf in [ dtc, gnc, knn, abc, rfc ]:
       test_classifier(clf, my_dataset, features_list)

dump_classifier_and_data(clf_best, my_dataset, full_features_list)
Exemple #40
0
# Without new features
_ = build_model(original_features, estimator, {}, use_kbest=True, k=['all'], use_scaler=True)

# With grand_total
_ = build_model(original_features + ['grand_total'], estimator, {}, use_kbest=True, k=['all'], use_scaler=True)

# With from_poi_ratio
_ = build_model(original_features + ['from_poi_ratio'], estimator, {}, use_kbest=True, k=['all'], use_scaler=True)

# With to_poi_ratio
_ = build_model(original_features + ['to_poi_ratio'], estimator, {}, use_kbest=True, k=['all'], use_scaler=True)
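# Hedged sketch (not the project's build_model helper): one way the engineered
# features compared above could be computed from a data_dict-style record.
# Deriving 'grand_total' as payments + stock value is an assumption; the ratio
# features follow the email-count pattern used elsewhere in this document.
def add_engineered_features(data_dict):
    def num(value):
        # treat the dataset's 'NaN' placeholder as zero
        return 0.0 if value == 'NaN' else float(value)

    for person in data_dict.values():
        person['grand_total'] = num(person['total_payments']) + \
                                num(person['total_stock_value'])
        sent = num(person['from_messages'])
        received = num(person['to_messages'])
        person['to_poi_ratio'] = (num(person['from_this_person_to_poi']) / sent
                                  if sent else 0.0)
        person['from_poi_ratio'] = (num(person['from_poi_to_this_person']) / received
                                    if received else 0.0)
    return data_dict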


# ----------------------------------------------------------
#   Final Model
# ----------------------------------------------------------

final_model, final_features = build_model(original_features + ['grand_total'],
                                          estimator, {},
                                          use_kbest=True,
                                          use_scaler=True)

test_classifier(final_model, data_dict, final_features, folds=1000)

# ----------------------------------------------------------
#    Dump Classifier and Data
# ----------------------------------------------------------

dump_classifier_and_data(final_model, data_dict, final_features)
Exemple #41
0


### KNN Classifier ###
#######################

# start_time = time.time()

# params = {'n_neighbors': [3,4,5,6] ,  'weights':['uniform','distance'],'leaf_size':[15,20,25,30,40], 'n_jobs':[-1]}
# cv_KNN = GridSearchCV(clf_KNN, params)
# cv_KNN.fit(features, labels)
# clf1 = cv_KNN.best_estimator_
# print cv_KNN.best_score_

# #test_classifier(clf1,my_dataset,features_list)
# test_classifier(clf1,enron_data_sub,cols)

# elapsed_time= start_time - time.time()
# print elapsed_time
# print


###################################################################################
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
###################################################################################

# NOTE: clf1 is produced by the commented-out GridSearchCV block above; that
# block must be uncommented (or clf1 defined another way) before this dump runs.
dump_classifier_and_data(clf1, enron_data_sub, cols)
Exemple #42
0
                ('select_features', SelectKBest(f_classif, k=opt_features)),
                ('reduce_dim', PCA()),
                ('naive', GaussianNB())])
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("")
print("Efficiency of selected algorithm:")
print 'F1 score:\t', '{0:.2f}'.format(f1_score(labels_test, pred))
print 'Accuracy:\t',  '{0:.2f}'.format(accuracy_score(labels_test, pred))
print 'Precision:\t', '{0:.2f}'.format(precision_score(labels_test, pred))
print 'Recall:\t',  '{0:.2f}'.format(recall_score(labels_test, pred))

scores = clf.named_steps['select_features'].scores_
features_selected_indices = clf.named_steps['select_features'].get_support(indices=True)
features_selected = [features_list[i+1] for i in features_selected_indices]
features_scores = [scores[i] for i in features_selected_indices]

print("")
print('Feature scores:')
for i in range(len(features_scores)):
    print features_selected[i], '{0:.2f}'.format(features_scores[i])

features_selected.insert(0, 'poi')

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_selected)
def run_main():   
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    features_list = ['poi','email_subject','to_poi_ratio','combined', 'from_messages','expenses',
                     'deferred_income','other','restricted_stock', 'email_body']
                     #,'long_term_incentive','deferral_payments','email_body','restricted_stock_deferred'] # You will need to use more features
    
    ''' FEATURE LIST 
    bonus, deferral_payments, deferred_income, director_fees, email_address,
    email_body, email_subject, exercised_stock_options, expenses, from_messages, from_poi_to_this_person,
    from_this_person_to_poi, loan_advances, long_term_incentive, other, poi,
    restricted_stock, restricted_stock_deferred, salary, shared_receipt_with_poi,
    to_messages, total_payments, total_stock_value
        ------------ '''
    
    ### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
    data_dict = remove_key(data_dict, 'TOTAL')
    #data_dict = pickle.load(open("my_dataset.pkl", "r") )
    get_sent_by_date.process_text_learning_features()
    data_dict = text_results_to_dataset.add_text_results(data_dict)
    
    def value_or_zero(inp):
        if inp == 'NaN':
            return 0
        else:
            return float(inp)
    ### Task 2: Remove outliers
    ### Task 3: Create new feature(s)
    # create to_poi_ratio: the share of this person's sent messages that went to a POI
    for key in data_dict.keys():
        if data_dict[key]['from_messages'] == 'NaN' or \
           data_dict[key]['from_this_person_to_poi'] == 'NaN':
            data_dict[key]['to_poi_ratio'] = 'NaN'
        else:
            data_dict[key]['to_poi_ratio'] = float(data_dict[key]['from_this_person_to_poi']) / float(data_dict[key]['from_messages'])
            
        combined = value_or_zero(data_dict[key]['salary']) + value_or_zero(data_dict[key]['bonus']) + \
            value_or_zero(data_dict[key]['total_stock_value']) + value_or_zero(data_dict[key]['total_payments']) + \
            value_or_zero(data_dict[key]['exercised_stock_options'])
        data_dict[key]['combined'] = combined
    ### Store to my_dataset for easy export below.
    features_list = scale_features(data_dict, [], features_list)
    my_dataset = data_dict
    
    #outlier_treatment(my_dataset, 'combined', elim_top=.01)
    
    ### Extract features and labels from dataset for local testing
    #data = featureFormat(my_dataset, features_list, sort_keys = True)
    #labels, features = targetFeatureSplit(data)
    
    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html
    
    #from sklearn.naive_bayes import GaussianNB
    #clf = GaussianNB()    # Provided to give you a starting point. Try a variety of classifiers.
    
    # Fit a gradient boosting classifier (note: out-of-bag estimates would
    # require subsample < 1.0, which is not set in these parameters)
    from sklearn import ensemble
    params = {'n_estimators': 200, 'max_depth': 2, 'min_samples_split': 20,
              'learning_rate': .5, 'min_samples_leaf': 1}
    clf = ensemble.GradientBoostingClassifier(**params)
    scaler = MinMaxScaler()
    scaler_clf = Pipeline([('scaler', scaler), ('clf', clf)])
    #from sklearn.ensemble import AdaBoostClassifier
    #from sklearn.tree import DecisionTreeClassifier
    #clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,min_samples_split=20),algorithm="SAMME",n_estimators=200)
    # RECALL: .39 features: 'email_subject','email_body','to_poi_ratio','combined' max_depth=3, min_samples_split=10
    # RECALL: .41 features  < SAME AS ABOVE but max_depth = 2
    # RECALL: .36 with just email_body & email_subject
    
    #tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
    #                     'C': [1, 10, 100, 1000]},
    #tuned_parameters = [{'C': [.001,1,.01,10]}]
    
    
    #from sklearn.grid_search import GridSearchCV
    #from sklearn.svm import LinearSVC
    #clf = GridSearchCV(LinearSVC(C=1,penalty="l2",class_weight='auto',loss="squared_hinge"), tuned_parameters, scoring='recall', verbose=3, n_jobs=5)
    
    
    
    # Maybe some original features where good, too?
    #fil = SelectKBest(f_regression, k=4)
    # create the pipeline to do the best selection:
    #clf = make_pipeline(fil, clf)
    #from sklearn.svm import LinearSVC      
    #clf = LinearSVC(C=.001,penalty="l2",class_weight='auto',loss="squared_hinge")
    
    
    ### Task 5: Tune your classifier to achieve better than .3 precision and recall 
    ### using our testing script.
    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation. For more info: 
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    
    test_classifier(scaler_clf, my_dataset, features_list)
    weights = clf.feature_importances_
    for w, f in zip(weights,features_list[1:]):
        print str(w) + ' is the weight of '+f
        
    
    ### Dump your classifier, dataset, and features_list so 
    ### anyone can run/check your results.
    
    # dump the scaler+classifier pipeline, since that is what was evaluated above
    dump_classifier_and_data(scaler_clf, my_dataset, features_list)
pipe = Pipeline(steps=[('skbest', SelectKBest(score_func=f_classif)), ('clf', GaussianNB())])
cv = StratifiedShuffleSplit(labels,n_iter = 60,random_state = 42)
b_grid_search = grid_search.GridSearchCV(pipe, param_grid = clf_params,cv = cv,scoring = 'precision')
b_grid_search.fit(features_saved,labels_saved)

print 'Time:',round(time()-t0,3) ,'s\n'
t0 = time()

# pick a winner
best_clf_nb = b_grid_search.best_estimator_
print best_clf_nb

found_skb_nb=best_clf_nb.steps[0][1]
found_clf_nb=best_clf_nb.steps[1][1]

features=found_skb_nb.fit_transform(features_saved,labels_saved)
features_list_to_use_nb=np.asarray(all_features_list_saved)[found_skb_nb.get_support()].tolist()
print "\nFeatures used:"
print features_list_to_use_nb

test_classifier(found_clf_nb, dataset_to_export, ['poi']+features_list_to_use_nb)



### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(svm_clf, dataset_to_export, ['poi']+features_list)
Exemple #45
0
# grid search
parameters = _get_parameters()
scoring = ['precision', 'recall']
grid_search = GridSearchCV(mypipeline, parameters, scoring=scoring)
_evaluate_grid_search(grid_search, mypipeline, parameters, feature_train, label_train)
# this is for fixed parameters
mypipeline.set_params(feat_select__n_components=5, clf__C=1e6, clf__gamma=1).fit(feature_train, label_train)
_cross_validate(mypipeline, feature_train, label_train)
prediction = mypipeline.predict(feature_test)

# Provided to give you a starting point. Try a variety of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
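# Hedged sketch of the stratified-shuffle-split evaluation described in the
# Task 5 note above -- roughly what tester.py does internally, simplified.
# `clf` is assumed to be defined earlier in the script; `features` and
# `labels` are the arrays split just above.
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

sss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
sss_scores = cross_val_score(clf, features, labels, cv=sss, scoring='precision')
print "mean precision over 100 stratified splits:", sss_scores.mean()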

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, data_dict, features_list)
Exemple #46
0
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

features_train, features_test, labels_train, labels_test = \
    df.peform_StratifiedShuffleSplit(features, labels)

## GaussianNB
print 'Performing GaussianNB'
from sklearn.naive_bayes import GaussianNB
clf_NB = GaussianNB()
''' Finding best number of features
df.perform_plot_evaluation_metrics(clf_NB, my_dataset, kBest_features, 'GaussianNB')
'''
tester.dump_classifier_and_data(clf_NB, my_dataset, kBest_features[:7])
t0 = time()
tester.main()
print "training time GaussianNB: ", round(time() - t0, 3), "s"

## Decision Tree
print 'Performing Decision Tree'
from sklearn import tree
clf_DT = tree.DecisionTreeClassifier()
params = {'criterion': ('gini', 'entropy'), 'splitter': ('best', 'random')}
clf = GridSearchCV(clf_DT, params)
clf.fit(features_train, labels_train)
best_params = clf.best_params_
print 'Best parameters for Decision Tree: '
print best_params
clf_DT = tree.DecisionTreeClassifier(splitter='random', criterion='entropy')
        testing_features_list = [u'poi']
        for feature in features_list_score_order:
            testing_features_list.append(feature)
            pipe = Pipeline([('impute', Imputer(strategy='median')), 
                    ('classify', GaussianNB(priors=[(i/2.)*.1, (1 - (i/2.)*.1)]))])
            total_predictions, accuracy, precision, recall, f1, f2 = \
                test_classifier(pipe, my_dataset, testing_features_list, folds=200)
            acc.append(accuracy)
            prec.append(precision)
            reca.append(recall)
        acc_all.append(acc)
        prec_all.append(prec)
        reca_all.append(reca)
        results_dict['prec' + str(i)] = prec
        results_dict['reca' + str(i)] = reca
        results_dict['acc' + str(i)] = acc
#tuneNB()
test_df = pd.DataFrame(results_dict)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results

features_list_score_order = [u'poi', u'exercised_stock_options', u'total_stock_value', u'bonus']
pipe = Pipeline([('impute', Imputer(strategy='median')), 
        ('classify', GaussianNB(priors=[.15, .85]))])
total_predictions, accuracy, precision, recall, f1, f2 = \
    test_classifier(pipe, my_dataset, features_list_score_order, folds=1000)   
        
dump_classifier_and_data(pipe, my_dataset, features_list_score_order)
Exemple #48
0
pred = pipe.predict(features_test)


print "\nPCA - explained variance: ", pca.explained_variance_ratio_

first_pc = pca.components_[0]

#print "\nFirst PC: ", first_pc

print "\ntester result: ", \
test_classifier(pipe, my_dataset_t, features_list_all, folds=1000)

# Best settings for NB: PCA - n_components = 8
# Best settings for DT: min_samples_split = 10; PCA - n_components = 3
#
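# Hedged sketch of the two configurations summarised above (the original `pipe`
# is constructed earlier in this script and is not shown in this excerpt):
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

nb_pipe = Pipeline([('pca', PCA(n_components=8)),
                    ('clf', GaussianNB())])
dt_pipe = Pipeline([('pca', PCA(n_components=3)),
                    ('clf', DecisionTreeClassifier(min_samples_split=10))])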

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# dump decision tree classifier

clf_sub = clf_dt_1
my_dataset_sub = my_dataset_v5
features_list_sub = features_list_selector_9


dump_classifier_and_data(clf_sub, my_dataset_sub, features_list_sub)
#!/usr/bin/python
'''###FINAL RESULTS###
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='distance')
        Accuracy: 0.86969       Precision: 0.63235      Recall: 0.36550 F1: 0.46324     F2: 0.39919
        Total predictions: 13000        True positives:  731    False positives:  425   False negatives: 1269   True negatives: 10575
'''
import sys
import pickle
import pprint
#sys.path.append("../tools/")
#All files are in the final_project folder
#### Comment out the 4 lines below before running
import os
os.getcwd()
os.chdir("/Users/jas/Project-4-Identifying-Fraud-from-Enron-Email/final_project")
os.getcwd()
####
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary','exercised_stock_options', 'bonus']
#This simpler model gives higher recall and precision than more features and
#accuracy only goes down slightly - less than 1 %
### Load the dictionary containing the dataset
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
### Task 2: Remove outliers
'''
Exemple #50
0
### extract features and labels for gridsearch optimization

# data extraction using k_best features list
data = featureFormat(my_dataset, my_features_list, sort_keys=True)

tru, trn = targetFeatureSplit(data)   # tru: labels (target), trn: features

## scale extracted features
scaler = preprocessing.MinMaxScaler()
trn = scaler.fit_transform(trn)

# Set up cross validator (will be used for tuning all classifiers)
cv = cross_validation.StratifiedShuffleSplit(tru, n_iter=10, random_state=42)
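# Hedged sketch of how the cross validator above could have been used to tune
# the AdaBoost pipeline loaded from 'best_clf_pipe.pkl' below; the actual
# tuning code is not part of this excerpt, so names and grids are illustrative.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV  # old-style API, matching this script

a_pipe = Pipeline([('clf', AdaBoostClassifier(random_state=42))])
a_params = {'clf__n_estimators': [50, 100, 200],
            'clf__learning_rate': [0.5, 1.0]}
a_search = GridSearchCV(a_pipe, a_params, scoring='f1', cv=cv)
# a_search.fit(trn, tru)
# pickle.dump(a_search.best_estimator_, open('best_clf_pipe.pkl', 'w'))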

## Evaluate Final Adaboost Classifier

# load tuned classifier pipeline

best_a_pipe = pickle.load(open('best_clf_pipe.pkl', "r"))

print 'best_a_pipe\n'
print best_a_pipe
test_classifier(best_a_pipe, my_dataset, my_features_list)
print sep

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.

dump_classifier_and_data(best_a_pipe, my_dataset, my_features_list)
#        #print self.X_fit+X
#        best_words=self.wt.transform(self.X_fit+X)
#	word_pca = self.pca.fit_transform(best_words)
#        qqq = np.array(word_pca)[np.arange(len(self.y_fit)),:]
#        best_pca_train = self.pt.fit_transform(qqq,self.y_fit)
#	self.clf.fit(best_pca_train,self.y_fit)
#        #x=remove_low_frequency_words(X)
#        best_pca_test = self.pt.transform(np.array(word_pca)[np.arange(len(X))+len(self.X_fit)])
#	#word_pca = self.pca.transform(best_words)
#        #best_pca = self.pt.transform(word_pca)
#        return self.clf.predict(best_pca_test)

## create filtered_gnb classifier
#word_transformer = SelectKBest(f_regression,200)
#pca              = PCA(n_components=86)
#pca_transformer  = SelectKBest(f_classif,20)
#classifier1      = DecisionTreeClassifier(min_samples_leaf=2)
#classifier2      = GaussianNB()
#classifier3      = KNeighborsClassifier()
#filtered_gnb=FilteredGNB(word_transformer,pca,pca_transformer,classifier1)

#print "FILTERED GNB CLASSIFIER USING ALL WORD FEATURES"
#test_classifier(filtered_gnb, my_dataset, ["poi"]+ words.tolist(),folds=5)

print "Gaussian NB with Word PCA Features:"
test_classifier(GaussianNB(), my_dataset, ["poi"]+ best_word_pca_features)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.
dump_classifier_and_data(GaussianNB(), my_dataset, ["poi"]+best_word_pca_features)
Exemple #52
0
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
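# Hedged illustration of the multi-stage Pipeline approach mentioned above:
# PCA and the classifier are tuned together via GridSearchCV using the
# 'step__parameter' naming convention. Names are illustrative only; the code
# below compares standalone classifiers instead.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pca_svc_pipe = Pipeline([('reduce_dim', PCA()), ('clf', SVC())])
pca_svc_grid = {'reduce_dim__n_components': [2, 4, 6],
                'clf__C': [1, 10, 100]}
pca_svc_search = GridSearchCV(pca_svc_pipe, param_grid=pca_svc_grid,
                              cv=5, scoring='f1')
# pca_svc_search.fit(features, labels)  # `features`/`labels` assumed to exist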

from sklearn.preprocessing import scale
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
import tester

# Create a GaussianNB classifier
clf_an1 = GaussianNB()
tester.dump_classifier_and_data(clf_an1, my_dataset, features_list)
tester.main()

# Create a decision tree classifier
clf_an2 = DecisionTreeClassifier()
tester.dump_classifier_and_data(clf_an2, my_dataset, features_list)
tester.main()

# Create an SVC classifier with a linear kernel
clf_an3 = SVC(kernel='linear')
tester.dump_classifier_and_data(clf_an3, my_dataset, features_list)
tester.main()

# Create a KMeans clusterer with two clusters
clf_an4 = KMeans(n_clusters=2)
tester.dump_classifier_and_data(clf_an4, my_dataset, features_list)
test_classifier(Clf, my_dataset, selected_features_list, folds = 1000)
print "------"

# Tuning K-Nearest Neighbors
print "Tuning K-Nearest Neighbors"
t0 = time()
tuning_parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'weights': ('uniform', 'distance'),
                'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                'leaf_size': [1, 5, 10, 20, 30, 40, 50, 75, 100, 200]}
print("Tuning Parameters for Recall")
KNN = GridSearchCV(KNeighborsClassifier(), tuning_parameters, cv=scv, scoring = 'recall')
KNN.fit(selected_features, labels)
print("Best parameters are:")
print(KNN.best_params_)
print "tunning time: {0}".format(round(time()-t0, 3))

Clf = KNN.best_estimator_
print "measurements for tuned random forest classifier: "
test_classifier(Clf, my_dataset, selected_features_list, folds = 1000)

## Final Selection and Evaluation
clf = RF.best_estimator_

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, selected_features_list)
Exemple #54
0
print f1_score(labels_test1, pred)

CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"


def dump_classifier_and_data(clf, dataset, feature_list):
    with open(CLF_PICKLE_FILENAME, "w") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "w") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "w") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)


dump_classifier_and_data(bestknn, data_dict, features_list)

with open("my_classifier.pkl", "r") as file:
    clf = pickle.load(file)
print clf

with open("my_dataset.pkl", "r") as file:
    data = pickle.load(file)
for person in data:
    print person, data[person]
    break

with open("my_feature_list.pkl", "r") as file:
    features = pickle.load(file)
print features
    def runTest(self, clf, features_list):
        print "test result on stratified cross validation data...."
        dump_classifier_and_data(clf, self.data_dict, features_list)
        tester.main()
        return
Exemple #56
0
def evaluate_clasifier(df, extras, algo, dump=False):
    """Evaluate and possibly store classifier and data"""

    if not dump:
        # Only redirect output for the search
        orig_stdout, logfile = init_logfile(extras, algo)

    ### Task 3: Create new feature(s)
    df = create_features(df, *extras)

    ### Extract features and labels from dataset for local testing
    dfx, dfy = features_split_df(df)

    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    split_indices = StratifiedShuffleSplit(dfy, n_iter=1000, test_size=0.1)

    features_list = ['poi'] + dfx.columns.values.tolist()

    pipeline, params = create_pipeline(
        algo,
        extras,
        is_search=(not dump),
        max_features=len(dfx.columns))

    grid_searcher = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=split_indices,
        n_jobs=-1,
        scoring=create_scorer(),
        verbose=0)

    t0 = time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)

        grid_searcher.fit(dfx, y=dfy)
        print '\nTime to fit: {:0>8}\n'.format(dt.timedelta(seconds=(time() - t0)))

        print "Best parameters set:"
        print grid_searcher.best_params_
        print ''

        print 'Grid score:'
        for params, mean_score, scores in grid_searcher.grid_scores_:
            print "%0.3f for %r" % (mean_score, params)
        print ''

        selector = grid_searcher.best_estimator_.named_steps['selection']
        scored = pd.DataFrame(zip(
            dfx.columns.tolist(),
            selector.scores_,
            selector.get_support()))

        scored.columns = ['Feature', 'Score', 'Selected']
        scored = scored.sort_values(by=['Score'], ascending=False)
        scored.index = range(1, len(scored) + 1)
        n_selected = len(scored[scored.Selected])
        print 'Scored features: {} selected'.format(n_selected)
        print scored
        print ''

        # n_pca_components = grid_searcher.best_estimator_.named_steps[
        #     'reducer'].n_components_

        # print "Reduced to {0} PCA components".format(n_pca_components)

        ### Task 6: Dump your classifier, dataset, and features_list so anyone can
        ### check your results. You do not need to change anything below, but make sure
        ### that the version of poi_id.py that you submit can be run on its own and
        ### generates the necessary .pkl files for validating your results.

        clf = grid_searcher.best_estimator_

        ### Store to my_dataset for easy export below.
        df = features_combine_df(dfx, dfy)
        my_dataset = df.to_dict(orient='index')

        test_classifier(clf, my_dataset, features_list)

        if dump:
            dump_classifier_and_data(clf, my_dataset, features_list)
        else:
            close_logfile(orig_stdout, logfile)
Exemple #57
0
rf = rft.best_estimator_
t0 = time()
test_classifier(rf, data_dict, features_list, folds = 100)
print("Random Forest evaluation time: %rs" % round(time()-t0, 3))

from sklearn.tree import DecisionTreeClassifier
dt = []
for i in range(5):
    dt.append(DecisionTreeClassifier(max_depth=(i+1)))
ab_params = {'base_estimator': dt, 'n_estimators': range(50, 101, 10)}
abt = GridSearchCV(ab, ab_params, scoring=metric, cv=sss)
t0 = time()
abt = abt.fit(features, labels)
print("AdaBoost tuning/fitting time: %rs" % round(time()-t0, 3))
ab = abt.best_estimator_
t0 = time()
test_classifier(ab, data_dict, features_list, folds = 100)
print("AdaBoost evaluation time: %rs" % round(time()-t0, 3))

### Select tuned adaboost as best classifier
clf = ab
    
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Exemple #58
0
def evaluate(clf, my_dataset, features_list):
    dump_classifier_and_data(clf, my_dataset, features_list)
    print '{1}Udacity\'s Evaluation:{0}'.format(color.Normal, color.BlinkBlue)
    return main()  # from tester.py