/
test.py
215 lines (173 loc) · 10.1 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/python
#this file is for testing data
#it is created by ali makki for that purpose
import sys
from time import time
import nltk
sys.path.append("../tools/")
import pickle
#from email_preprocess import preprocess
### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
#features_train, features_test, labels_train, labels_test = preprocess()
#print len(features_train)
### Here I am experimenting with enron dataset to find best
### features for training and create new features and find outliers
from feature_format import featureFormat, targetFeatureSplit
import numpy as np
#original features
#features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
# 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
# 'expenses', 'exercised_stock_options', 'long_term_incentive',
# 'restricted_stock',
# 'from_poi_to_this_person', 'from_this_person_to_poi',
# 'shared_receipt_with_poi'] # You will need to use more features
#new features
features_list = ['poi', 'salary', 'total_payments', 'bonus', 'director_fees',
'deferred_income', 'total_stock_value', 'restricted_stock_deferred',
'expenses', 'exercised_stock_options', 'long_term_incentive',
'restricted_stock', 'from_messages', 'to_messages','deferral_payments',
'from_poi_to_this_person', 'from_this_person_to_poi',
'shared_receipt_with_poi'] # You will need to use more features
###definition of features:
###restricted stock: certain kind of stock that is not fully transferable until a certain conditions are met.
###
with open("../final_project/final_project_dataset.pkl", "r") as data_file:
data_dict = pickle.load(data_file)
#print data_dict
data_dict.pop("TOTAL")
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)
print "Type of data: ", type(data)
#print data
#print data[:,0]
all_pois = data[data[:,0] == 1]
all_non_pois = data[data[:,0] == 0]
print "Number of pois:", len(all_pois)
new_feature = np.divide(data[:,15],data[:,12])
print "new_feature is: ", new_feature
print "nan changed to zeros: ", np.nan_to_num(new_feature)
### in this section I am comparing the average of different features
### between pois and non_pois I removed director fee because the values for it were all 0
def poi_nonpoi_with_features():
for inx, a_feature in enumerate(features_list):
print "non_pois with (", a_feature, "): ", len(all_non_pois[all_non_pois[:,inx]!=0.0])
print "non_pois with no (", a_feature, "): ", len(all_non_pois[all_non_pois[:,inx]==0.0])
print "pois with (", a_feature, "): ", len(all_pois[all_pois[:,inx]!=0.0])
print "pois with no (", a_feature, "): ", len(all_pois[all_pois[:,inx]==0.0])
print "..................................."
poi_nonpoi_with_features()
###I took out three features: 1- deferral_payment 2- loan_advances 3- restricted stock deferred
###with salary there are 49 non pois without salary and 77 with salary
###with pois it is only 1 without salary (maybe salary should be removed)
###only 2 of the non_pois get "loan advances" and 1 of pois. this feature clearly
###does not give enough information and should be removed.
###deferral_payment has been eliminated because of lack of info for many entries
###all pois have "expenses" and it is not the case with non-pois and also average
###of "expenses" of pois and non-pois are the same.maybe this feature should be removed as well
###there is no poi with "restricted stock differed" so this feature might be taken out
###KAMINSKI WINCENTY J is a non_pois he has a from_messages of 14368 which is way higher that average and
###very different from the next person which has 6759
def showAveMax():
print "Now compute averages, standard deviations, ..... :"
for inx, a_feature in enumerate(features_list):
if a_feature != 'poi':
non_zero_pois = all_pois[:,inx][np.nonzero(all_pois[:,inx])]
if len(non_zero_pois)==0:
print 'all stats zero or NaN.'
else:
print "average",a_feature ,"of pois", np.mean(non_zero_pois), " and max: ", 0 if len(non_zero_pois)==0 else np.max(non_zero_pois)\
,"and Median: ", 0 if len(non_zero_pois)==0 else np.median(non_zero_pois), " and 98 percentile: "\
, np.percentile(non_zero_pois, 90), "standard deviation for pois: ", np.std(non_zero_pois)
#non_zero_non_pois = all_non_pois[all_non_pois[:,1] != 0.0][:,inx][np.nonzero(all_non_pois[all_non_pois[:,1] != 0.0][:,inx])]
non_zero_non_pois = all_non_pois[:,inx][np.nonzero(all_non_pois[:,inx])]
#non_zero_non_pois = np.nonzero(all_non_pois[:,inx])
print 'inx: ',inx
print "average",a_feature , "of non pois", np.mean(non_zero_non_pois), " and max: "\
, np.max(non_zero_non_pois), " and median: "\
, np.median(non_zero_non_pois)\
, " and 90 percentile: ", np.percentile(non_zero_non_pois, 90), "standard deviation for non_pois: ", np.std(non_zero_non_pois)
print "..................................."
#def showAveMaxDev():
# for inx, a_feature in enumerate(features_list):
# if a_feature != 'poi':
# print 'average: ',
showAveMax()
from collections import OrderedDict
pois = { k:v for k, v in data_dict.items() if v['poi'] == True}
non_pois = { k:v for k, v in data_dict.items() if v['poi'] == False}
#print data_dict
def show_persons_features(group, group_name,a_feature, number_of_persons):
ordered_group = OrderedDict(sorted(group.items(), key=lambda x: x[1][a_feature] if x[1][a_feature]!='NaN' else 0, reverse=False))
print a_feature," for ", group_name, ": "
#for person in ordered_group:
# print person, ordered_group[person][a_feature]
for i in xrange(1,number_of_persons+1):
print ordered_group.items()[len(ordered_group)-i][0], ' :', ordered_group.items()[len(ordered_group)-i][1][a_feature]
print '************************************'
print 'top 4 non_pois with highest in a specific feature: '
for feature in features_list:
show_persons_features(non_pois, 'non_pois',feature, 4)
print '************************************'
print 'top 4 pois with highest in a specific feature: '
for feature in features_list:
show_persons_features(pois, 'pois',feature, 4)
#get threshold for outliers in each feature for pois and non-pois separately, using standard deviation
def get_thresholds(numOfStds):
threshold_poi_list = dict()
threshold_non_poi_list = dict()
all_pois = data[data[:,0] == 1]
all_non_pois = data[data[:,0] == 0]
for inx, a_feature in enumerate(features_list):
if a_feature != 'poi':
non_zero_pois = all_pois[:,inx][np.nonzero(all_pois[:,inx])]
stdPois = np.std(non_zero_pois)
threshold_poi_list[a_feature] = stdPois*numOfStds
#print "standard deviation for pois feature ", a_feature,": ", stdPois
non_zero_non_pois = all_non_pois[:,inx][np.nonzero(all_non_pois[:,inx])]
stdNonPois = np.std(non_zero_non_pois)
threshold_non_poi_list[a_feature] = stdNonPois*numOfStds
#print "standard deviation for non_pois feature: ", a_feature, ": ", stdNonPois
#print "..................................."
return threshold_poi_list, threshold_non_poi_list
#using the lists produced by get_thresholds detect outliers for pois and non_pois separately
def detect_outliers(pois_features_threshold, non_pois_features_threshold):
outliers = dict()
for person in data_dict:
for feature in data_dict[person]:
if feature in features_list and feature != 'poi':
if data_dict[person]['poi'] == True:
if data_dict[person][feature] != 'NaN' and float(data_dict[person][feature]) > pois_features_threshold[feature]:
if outliers.has_key(person):
outliers[person][feature] = [pois_features_threshold[feature],data_dict[person][feature],'poi']
else:
outliers[person] = {feature:[pois_features_threshold[feature],data_dict[person][feature],'poi']}
else:
if data_dict[person][feature] != 'NaN' and float(data_dict[person][feature]) > non_pois_features_threshold[feature]:
if outliers.has_key(person):
outliers[person][feature] = [non_pois_features_threshold[feature],data_dict[person][feature],'non_poi']
else:
outliers[person] = {feature:[non_pois_features_threshold[feature],data_dict[person][feature],'non_poi']}
return outliers
pois_features_threshold, non_pois_features_threshold = get_thresholds(5.6)
outliers = detect_outliers(pois_features_threshold, non_pois_features_threshold)
print outliers
for feature in features_list:
od_pois = OrderedDict(sorted(pois.items(), key=lambda x: x[1][feature], reverse=False))
# [~np.isnan(A)].mean()
od_non_pois = OrderedDict(sorted(non_pois.items(), key=lambda x: x[1][feature], reverse=False))
#print "feature: ",feature
for k, v in od_pois.iteritems():
#print k
#print v[feature]
if k == feature:
#print v
m = True
#print od_pois
#my_dataset = od[len(od)*best_results[9]:len(od)*(1-best_results[9])]
#cleaned_pois = dict(od_pois.items()[int(len(od_pois)*best_results[9]):int(len(od_pois)*(1.0-best_results[9]))])
#cleaned_non_pois = dict(od_non_pois.items()[int(len(od_non_pois)*best_results[9]):int(len(od_non_pois)*(1.0-best_results[9]))])
#my_dataset = cleaned_pois.copy()
#my_dataset.update(cleaned_non_pois)
#print "len(my_dataset)",len(my_dataset)