# poi_id.py -- forked from tybyers/Udacity_IntroMachineLearning
# (GitHub file view: 105 lines, 85 loc, 3.74 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import matplotlib.pyplot as plt
import sys
import pickle
import numpy as np
from ggplot import *
sys.path.append("../tools/") # moved this directory to local to make
# it easy to put all in one git repo
sys.path.append("./tools/")
def computeFraction(poi_messages, all_messages):
    """Return the fraction of a person's messages that involve a POI.

    Args:
        poi_messages: number of messages to/from a POI (numerator), or a
            missing-value marker.
        all_messages: number of all messages to/from the person
            (denominator), or a missing-value marker.

    Returns:
        ``poi_messages / all_messages`` as a float, or ``0.0`` when either
        input is missing or the denominator is zero.
    """
    # 'NaN' marks a missing value; 'NaNNaN' shows up when two missing
    # markers are concatenated while building additive features.
    missing_markers = ('NaN', 'NaNNaN')
    for value in (poi_messages, all_messages):
        if value is None or value in missing_markers:
            return 0.0
        # A float NaN is the only value that is not equal to itself.
        if isinstance(value, float) and value != value:
            return 0.0
    if all_messages == 0:
        # Avoid ZeroDivisionError for people with no messages at all.
        return 0.0
    # Multiply by 1.0 so integer counts divide as floats on Python 2 too.
    return 1.0 * poi_messages / all_messages
def scaleFeatures(arr):
    """Min-max scale every column of ``arr`` except the first, in place.

    Not used in the final pipeline; it was written while testing out a
    k-means algorithm.

    Column 0 is skipped and left untouched. Every other column is mapped
    onto [0, 1] via ``(x - min) / (max - min)``. A constant non-zero column
    becomes all ones; an all-zero column is left as zeros (dividing it
    would produce NaN).

    Args:
        arr: 2-D numpy array. It should have a floating-point dtype,
            because the scaled values are written back into ``arr``.

    Returns:
        The same ``arr`` object, modified in place.
    """
    for col in range(1, arr.shape[1]):
        column = arr[:, col]
        col_min = column.min()
        col_max = column.max()
        if col_min == col_max:
            # Constant column: there is no range to scale by. Skip the
            # division entirely when the constant is zero (0/0 -> NaN).
            if col_min != 0:
                arr[:, col] = column / col_min
        else:
            # Divide by (max - min), not (min - max), so the result lies
            # in [0, 1] rather than [-1, 0].
            arr[:, col] = (column - col_min) / (col_max - col_min)
    return arr
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    # Pickle data is binary: "wb" is required on Python 3 and avoids
    # newline translation on Windows.
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def algorithm(data_dict, features_list):
    """Build the classifier and export it with the dataset and feature list.

    Writes three pickle files into the current working directory:
    ``my_classifier.pkl``, ``my_dataset.pkl`` and ``my_feature_list.pkl``.

    Args:
        data_dict: the dataset dictionary to export.
        features_list: feature names to use; "poi" must always be first
            because the first feature is treated as the label.
    """
    from feature_format import featureFormat
    from feature_format import targetFeatureSplit

    ### store to my_dataset for easy export below
    my_dataset = data_dict
    data = featureFormat(my_dataset, features_list)

    # scale features
    #data = scaleFeatures(data)

    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    # NOTE: labels/features are not used below; the classifier is dumped
    # without being fitted in this function.
    labels, features = targetFeatureSplit(data)

    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=1000, random_state=202,
                             learning_rate=1.0, algorithm="SAMME.R")

    ### dump your classifier, dataset and features_list so
    ### anyone can run/check your results
    _dump_pickle(clf, "my_classifier.pkl")
    _dump_pickle(my_dataset, "my_dataset.pkl")
    _dump_pickle(features_list, "my_feature_list.pkl")
def _add_counts(first, second):
    """Add two counts; return 'NaN' if either one is the 'NaN' marker."""
    if first == 'NaN' or second == 'NaN':
        return 'NaN'
    return first + second


def create_features(data_dict):
    """Add engineered ratio features to every person in ``data_dict``.

    For each person the following keys are added:
        * ``fraction_from_poi``: from_poi_to_this_person / to_messages
        * ``fraction_to_poi``: from_this_person_to_poi / from_messages
        * ``expenses_per_salary``: expenses / salary
        * ``fraction_emails_with_poi``: (POI mail in + out) / (all mail
          in + out)

    Args:
        data_dict: mapping of person name to that person's feature dict.

    Returns:
        The same ``data_dict`` object, modified in place.
    """
    for person in data_dict.values():
        poi_msg_to = person['from_poi_to_this_person']
        all_msg_to = person['to_messages']
        person['fraction_from_poi'] = computeFraction(poi_msg_to, all_msg_to)

        poi_msg_from = person['from_this_person_to_poi']
        all_msg_from = person['from_messages']
        person['fraction_to_poi'] = computeFraction(poi_msg_from, all_msg_from)

        person['expenses_per_salary'] = computeFraction(person['expenses'],
                                                        person['salary'])

        # Adding the raw values directly would raise TypeError when only
        # one side is the 'NaN' string, so sum them with a missing-aware
        # helper instead.
        poi_msg_all = _add_counts(poi_msg_to, poi_msg_from)
        all_msg_all = _add_counts(all_msg_to, all_msg_from)
        person['fraction_emails_with_poi'] = computeFraction(poi_msg_all,
                                                             all_msg_all)
    return data_dict
def load_data():
    """Load the project dataset and drop the two unusable records.

    Reads ``final_project_dataset.pkl`` from the current working directory.

    Returns:
        The unpickled dataset dictionary without the 'TOTAL' and
        'LOCKHART EUGENE E' entries.
    """
    ### load the dictionary containing the dataset
    # NOTE: unpickling can execute arbitrary code; only load a trusted copy
    # of the dataset file.
    # Open in binary mode (required by pickle on Python 3) and close the
    # file as soon as it has been read.
    with open("final_project_dataset.pkl", "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)
    data_dict.pop('TOTAL', None)  # spreadsheet phenomenon
    data_dict.pop('LOCKHART EUGENE E', None)  # all data 0 or N/A -- not helpful
    return data_dict
def main():
    """Load the data, add the engineered features and export the results."""
    # "poi" is listed first; algorithm() treats the first feature as the label.
    feature_names = ["poi", "salary", "bonus", "expenses", "deferral_payments"]
    dataset = create_features(load_data())
    algorithm(dataset, feature_names)


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()