-
Notifications
You must be signed in to change notification settings - Fork 0
/
tree.py
50 lines (40 loc) · 1.49 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import numpy as np
import sklearn.tree
import sklearn.cross_validation
import data_loader
from sklearn.metrics import make_scorer
# Load the train and test splits; standardize=False is passed for both files.
# NOTE(review): presumably each call returns a (features, labels) pair -- confirm in data_loader.
X_train, Y_train = data_loader.load("Dataset/churn.data.txt", standardize=False)
X_test, Y_test = data_loader.load("Dataset/churn.test.txt", standardize=False)
def custom_scorer(ground_truth, predictions):
    """Print precision, recall and F1 for the predictions, and return F1.

    The three metrics are computed with ``sklearn.metrics`` using the
    library's default arguments, and each is printed on its own line
    prefixed with ``prec: ``, ``rec: `` or ``f1: ``.

    Args:
        ground_truth: True labels; passed as the first argument to each metric.
        predictions: Predicted labels; passed as the second argument.

    Returns:
        The value returned by ``sklearn.metrics.f1_score``.
    """
    prec = sklearn.metrics.precision_score(ground_truth, predictions)
    rec = sklearn.metrics.recall_score(ground_truth, predictions)
    f1 = sklearn.metrics.f1_score(ground_truth, predictions)

    # A parenthesized single-argument print emits the same text under both
    # Python 2 (print statement) and Python 3 (print function).
    print("prec: " + str(prec))
    print("rec: " + str(rec))
    print("f1: " + str(f1))
    return f1
# Fit a decision tree with default settings on the training split, then
# predict the test split and report precision / recall / F1 for it.
model = sklearn.tree.DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = model.predict(X_test)
score = custom_scorer(ground_truth=Y_test, predictions=Y_pred)
# Visualize graph tree
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # Newer scikit-learn releases no longer ship sklearn.externals.six;
    # fall back to the standard-library text buffer.
    from io import StringIO
import pydot

# Write the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
sklearn.tree.export_graphviz(model, out_file=dot_data)

# pydot >= 1.2 returns a list of graphs from graph_from_dot_data, while older
# releases return a single graph object; accept both so write_pdf is always
# called on a graph rather than on a list.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf("tree.pdf")
## Conclusions:
# :: precision, recall and F1 are all ~70% on the test set
# By inspecting the tree, dim 56 (total day minutes) seems to be chosen often (root split and many child splits).
#
# X_train[Y_train,56].mean() = 250
# X_train[1-np.array(Y_train),56].mean() = 176
#
# => Seems to be a big difference there, which makes the split viable
#
# Other split dimensions are: number of customer service calls, number of vmail messages