-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_decision_tree.py
55 lines (47 loc) · 1.63 KB
/
test_decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#! /usr/bin/python
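"""
test_decision_tree.py

Train the DecisionTree classifier on first-letter/last-letter features of the
names in names/female.txt and names/male.txt, then print how it performs on a
held-out test set.
"""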
from decision_tree import DecisionTree
from random import shuffle
def gender_features(name):
    """
    Extract the classification features from a name.

    Args:
        name: the name to featurize, as a string.

    Returns:
        A dict with two keys: "last_letter" (the final character of ``name``)
        and "first_letter" (the first character of ``name``). Both values are
        the empty string when ``name`` is empty.
    """
    # Slice instead of indexing so that an empty name (e.g. a blank line in
    # the corpus) yields empty features rather than raising IndexError.
    return {
        "last_letter": name[-1:],
        "first_letter": name[:1],
    }
def _read_names(path):
    """Return the non-blank lines of the file at ``path``, stripped."""
    with open(path) as textfile:
        stripped = (line.strip() for line in textfile)
        # Blank lines carry no name and would only produce empty features.
        return [name for name in stripped if name]


def main():
    """
    Train a DecisionTree on the names corpus and print evaluation results.

    Reads names/female.txt and names/male.txt, converts every name to a
    feature dict with gender_features, shuffles each list, and uses the first
    4500 female and 2500 male samples for training and the remainder for
    testing. Prints the sizes of both sets and the two values returned by
    DecisionTree.evaluate, each scaled by 100 and rounded to two decimals.
    """
    # build corpus
    females = _read_names("names/female.txt")
    males = _read_names("names/male.txt")
    female_features = [gender_features(name) for name in females]
    male_features = [gender_features(name) for name in males]

    # Shuffle before slicing so the train/test split is not tied to the
    # order of the names in the files.
    shuffle(female_features)
    shuffle(male_features)

    train_set = [(feature, "female") for feature in female_features[:4500]] + \
                [(feature, "male") for feature in male_features[:2500]]
    test_set = [(feature, "female") for feature in female_features[4500:]] + \
               [(feature, "male") for feature in male_features[2500:]]

    # feed corpus into the tree!
    tree = DecisionTree(train_set)

    # print(...) with one parenthesized expression behaves the same under
    # the Python 2 print statement and the Python 3 print function.
    print("Trained decision tree (with ID3 heuristic) using {} samples."
          .format(len(train_set)))
    print("Evaluating accuracy with a test set of {} samples..."
          .format(len(test_set)))

    accuracy, nones = tree.evaluate(test_set)
    print("Percent of items classified correctly: {}%"
          .format(round(accuracy * 100, 2)))
    # The precision argument belongs to round(), not to format().
    print("Percent of items not classified: {}%"
          .format(round(nones * 100, 2)))
# Run the train/evaluate demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()