-
Notifications
You must be signed in to change notification settings - Fork 0
/
Decision_tree.py
245 lines (205 loc) · 10.3 KB
/
Decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import numpy as np
from Node import Node
from graphviz import Graph
from Config import args
class DecisionTree(object):
    """
    Binary decision-tree classifier with per-class weights.

    The maximum depth and the impurity metric are read from ``args.depth`` and
    ``args.metric`` when the tree is constructed.  Labels are used directly as
    list indices, so they must be the integers ``0 .. num_labels - 1``, and
    ``weights`` is indexed by label (the gini branch reads ``weights[c]`` for
    every ``c`` in ``range(num_labels)``).
    """

    def __init__(self, name='DecisionTree', weights=None):
        """
        :param name: name of the tree; also used for the graphviz graph and
                     the rendered file name
        :param weights: one weight per class label; ``None`` selects the
                        historical default ``[.5, 1.5]``
        """
        self.total_depth = args.depth
        self.metric = args.metric
        self.name = name
        # A fresh list per instance: a list literal in the signature would be
        # one object shared by every tree created without explicit weights.
        self.weights = [.5, 1.5] if weights is None else weights
        self.graph = Graph(name=name, format='png')
        self.total_loss = 0

    def score(self, y):
        """
        :param y: numpy array with the labels that reached a node
        :return: unweighted gini impurity when ``self.metric == 'gini'``,
                 otherwise the entropy (base 2); ``0.0`` for an empty array
        """
        n_total = y.size
        if n_total == 0:
            # No samples: avoid dividing by zero and report a pure node.
            return 0.0
        num_y_per_label = [int(np.count_nonzero(y == label)) for label in range(self.num_labels)]
        if self.metric == 'gini':
            return 1 - sum((count / n_total) ** 2 for count in num_y_per_label)
        # Empty classes contribute nothing (the limit of p * log2(p) at 0).
        return sum(-(count / n_total) * np.log2(count / n_total)
                   for count in num_y_per_label if count != 0)

    def _weighted_impurity(self, counts, size, norm_weights, metric):
        """Class-weighted gini (``metric == 'gini'``) or entropy of one side of a split."""
        if metric == 'gini':
            return 1.0 - sum(norm_weights[c] * (counts[c] / size) ** 2
                             for c in range(self.num_labels))
        probs = [(count / size) * w for count, w in zip(counts, norm_weights)]
        return sum(-p * np.log2(p) for p in probs if p != 0)

    def split(self, x, y, metric='gini'):
        """
        Search every feature for the threshold with the lowest weighted loss.

        :param x: 2-D numpy array of features (samples in rows)
        :param y: numpy array of labels, one per row of ``x``
        :param metric: 'gini' for the weighted gini loss, anything else for
                       the weighted entropy loss
        :return: ``[best_loss, best_split_value, best_split_idx]``; the last
                 two are ``None`` (and the loss is ``inf``) when no feature
                 has two distinct values to split between
        """
        n_total = y.size
        y_per_class = [int(np.count_nonzero(y == label)) for label in range(self.num_labels)]

        # Loop invariants: the normalised class weights never change.
        weight_total = sum(self.weights)
        norm_weights = [w / weight_total for w in self.weights]

        best_loss = float('inf')
        best_split_idx = None
        best_split_value = None

        for idx in range(x.shape[1]):
            # Sort the column once so the class counts on each side can be
            # updated incrementally while sweeping the candidate thresholds.
            order = np.argsort(x[:, idx], kind='stable')
            col = x[order, idx]
            y_label = y[order]

            # Sweep starts with every sample on the right-hand side.
            y_left_class = [0] * len(y_per_class)
            y_right_class = list(y_per_class)

            for i in range(1, n_total):
                # Move sample i-1 from the right side to the left side.
                label = int(y_label[i - 1])
                y_left_class[label] += 1
                y_right_class[label] -= 1

                # Equal neighbours cannot be separated by a threshold.
                if col[i - 1] == col[i]:
                    continue

                left_loss = self._weighted_impurity(y_left_class, i, norm_weights, metric)
                right_loss = self._weighted_impurity(y_right_class, n_total - i, norm_weights, metric)
                loss = (i / n_total) * left_loss + ((n_total - i) / n_total) * right_loss

                if loss < best_loss:
                    threshold = (col[i - 1] + col[i]) / 2
                    if threshold <= col[i - 1]:
                        # The midpoint of two adjacent floats can round down
                        # onto the lower value, which would leave the
                        # ``x < threshold`` side empty; use the upper value.
                        threshold = col[i]
                    best_loss = loss
                    best_split_idx = idx
                    best_split_value = threshold

        return [best_loss, best_split_value, best_split_idx]

    def create_tree(self, x, y, depth=0, name='root'):
        """
        Recursively grow the tree in pre-order (node, then left, then right).

        :param x: training features reaching this node
        :param y: training labels reaching this node
        :param depth: depth of the node being created
        :param name: name given to the node (used as its graph identifier)
        :return: the node created for ``(x, y)`` with its subtree attached
        """
        num_y_per_label = [int(np.count_nonzero(y == label)) for label in range(self.num_labels)]
        current_node = Node(score=self.score(y), depth=depth,
                            num_y_per_label=num_y_per_label, name=name)

        # Stop expanding at the depth limit or when the node reports itself
        # as a leaf.
        if depth >= self.total_depth or current_node.IsLeaf():
            return current_node

        curr_loss, cur_split_value, cur_split_idx = self.split(x, y, self.metric)
        if cur_split_value is None:
            # No usable threshold was found; keep the node unsplit.
            return current_node

        # A split node stores the loss of its split instead of its own impurity.
        current_node.score = curr_loss
        current_node.split_index = cur_split_idx
        current_node.split_feature = cur_split_value

        # Nodes on the last split level contribute their loss, weighted by the
        # share of training samples they hold, to the tree's total loss.
        if current_node.depth == self.total_depth - 1:
            self.total_loss += current_node.score * (sum(current_node.num_y_per_label) / self.sample_size)

        # Samples strictly below the threshold go left, all others go right.
        boundry = x[:, cur_split_idx] < cur_split_value
        current_node.left = self.create_tree(
            x[boundry], y[boundry], depth + 1,
            'left_{}_{}'.format(depth + 1, current_node.name))
        current_node.right = self.create_tree(
            x[~boundry], y[~boundry], depth + 1,
            'right_{}_{}'.format(depth + 1, current_node.name))
        return current_node

    def fit(self, x, y):
        """
        Build the tree from training data and store it in ``self.decision_tree``.

        :param x: x_train, 2-D numpy array
        :param y: y_train, numpy array of integer labels starting at 0
        :return: None
        :raises ValueError: if ``y`` is empty
        """
        print('Tree {} starts fitting'.format(self.name))
        if y.size == 0:
            raise ValueError('cannot fit a decision tree on an empty training set')
        # Labels are used as indices, so the number of classes is the largest
        # label + 1.  Equal to the number of distinct labels when they are
        # contiguous, and still correct when a label is absent from ``y``.
        self.num_labels = int(np.max(y)) + 1
        self.sample_size = y.size
        # Start from zero so fitting again does not add to the previous loss.
        self.total_loss = 0
        self.decision_tree = self.create_tree(x, y)
        # Explicit comparison kept on purpose: only a value equal to True
        # enables drawing, exactly as before.
        if args.draw_trees == True:  # noqa: E712
            print("Corresponding Graphs of trees are saved in {}".format(args.dir))
            graph = self.squized_graph(self.decision_tree, {})
            self.draw_graph(graph)
            self.graph.render('{}.gv'.format(self.name), args.dir)

    def squized_graph(self, node, dic=None):
        """
        Collect the parent -> children relation of the subtree under ``node``.

        :param node: node to start from (the root when called from ``fit``)
        :param dic: dictionary to fill; a new one is created when omitted
        :return: dictionary mapping every split node to ``[left, right]``
        """
        if dic is None:
            # Created per call; a ``{}`` default would be shared by all calls.
            dic = {}
        if node.left is not None:
            dic[node] = [node.left, node.right]
            self.squized_graph(node.left, dic)
            self.squized_graph(node.right, dic)
        return dic

    def _node_label(self, node):
        """Text shown inside the graph box of ``node``."""
        return 'depth : {} \n Score {} : {} \n y per sample:{}'.format(
            node.depth, self.metric, node.score, node.num_y_per_label)

    def draw_graph(self, data):
        """
        Add all nodes and edges of the tree to ``self.graph``.

        :param data: dictionary produced by ``squized_graph``
        :return: None; the graph is rendered separately by ``fit``
        """
        # Split nodes are the keys of ``data``; draw them first.
        drawn = set(data)
        for node in data:
            self.graph.node(node.name, self._node_label(node))

        for parent, (left, right) in data.items():
            # Children that are not keys are leaves and still need a box.
            for child in (left, right):
                if child not in drawn:
                    drawn.add(child)
                    self.graph.node(child.name, self._node_label(child))
            self.graph.edge(parent.name, left.name,
                            'x[{}] < {}'.format(parent.split_index, parent.split_feature))
            # Samples equal to the threshold also take the right branch.
            self.graph.edge(parent.name, right.name,
                            'x[{}] >= {}'.format(parent.split_index, parent.split_feature))

    def predict(self, x):
        """
        :param x: x test, iterable of samples
        :return: numpy array with the predicted label of every sample
        """
        predicted_labels = []
        for x_i in x:
            # Walk from the root of the fitted tree down to a leaf.
            node = self.decision_tree
            while node.left is not None:
                if x_i[node.split_index] < node.split_feature:
                    node = node.left
                else:
                    node = node.right
            # The leaf predicts its most frequent training label.
            predicted_labels.append(np.argmax(node.num_y_per_label))
        return np.array(predicted_labels)