-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
62 lines (45 loc) · 1.84 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/python
import pickle
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest
from tools.feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, load_classifier_and_data, test_classifier
def setup_and_test(my_dataset, features_list, classifier):
    """
    Round-trip a classifier through the tester helpers and evaluate it.

    The classifier, dataset and feature list are first dumped with
    ``dump_classifier_and_data``, then read back with
    ``load_classifier_and_data`` and handed to ``test_classifier``, so the
    evaluation runs on exactly what was dumped.

    my_dataset: dataset to dump alongside the classifier
    features_list: feature names to dump alongside the classifier
    classifier: the classifier to dump and test

    Returns None.
    """
    # Persist everything the testing script needs.
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # Read the dumped artefacts back and evaluate the reloaded copies.
    reloaded_clf, reloaded_dataset, reloaded_features = load_classifier_and_data()
    test_classifier(reloaded_clf, reloaded_dataset, reloaded_features)
def get_original_data():
    """
    Unpickle the project dataset and return it.

    Reads ``final_project_dataset.pkl`` relative to the current working
    directory. The file is opened in binary mode: pickle streams are bytes,
    so text mode fails on Python 3 and can corrupt the stream through
    newline translation on Windows.

    Returns the object stored in the pickle file.
    Raises IOError/OSError if the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load a dataset file
    # that comes from a trusted source.
    with open("final_project_dataset.pkl", "rb") as data_file:
        return pickle.load(data_file)
def computeFraction(poi_messages, all_messages):
    """
    Return the fraction of a person's messages that involve a POI.

    poi_messages: number of messages to/from a POI (numerator), or the
        string 'NaN' when the value is missing
    all_messages: number of all messages to/from the person (denominator),
        or the string 'NaN' when the value is missing

    Returns poi_messages / all_messages as a float. Returns 0. when either
    value is 'NaN' or when all_messages is zero, since no meaningful
    fraction exists in those cases.
    """
    # Missing data is encoded as the literal string 'NaN'.
    if poi_messages == 'NaN' or all_messages == 'NaN':
        return 0.

    total = float(all_messages)
    # Guard against division by zero for people with no messages at all.
    if total == 0:
        return 0.

    return poi_messages / total
def get_k_best_features(data, features_list, k=10):
    """
    Score the features with SelectKBest and return them ranked best-first.

    data: dataset handed to ``featureFormat``
    features_list: feature names; the first entry is the 'poi' label and
        is excluded from the ranking
    k: value passed to ``SelectKBest(k=...)``

    Returns a list of (feature_name, score) tuples pairing the names in
    ``features_list[1:]`` with the fitted scores, sorted by descending score.
    """
    # Split the formatted data into the label column and the feature columns.
    formatted = featureFormat(data, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(formatted)

    # Fit the selector to obtain the per-feature scores.
    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Skip the leading 'poi' label so names line up with the scores,
    # then order the pairs from highest to lowest score.
    ranked = list(zip(features_list[1:], selector.scores_))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked