def construct_DT(self, X, y):
    self.depth += 1  # track recursion depth
    # Leaf: node too small, tree too deep, or labels already pure.
    if len(y) < self.leafsize or self.depth > 25 or len(set(y)) == 1:
        return [-1, int(round(np.mean(y))), -1, -1]
    max_gain = -1.0
    for index in range(len(X[0])):
        if isinstance(X[0][index], Number):
            # Numeric attribute: split on the column mean.
            val = np.mean([x[index] for x in X])
            l_x, r_x, l_y, r_y = partition_classes(X, y, index, val)
            t_gain = information_gain(y, [l_y, r_y])
            if t_gain > max_gain:
                max_gain = t_gain
                split_val = val
                split_attr = index
        else:
            # Categorical attribute: try every distinct value.
            lv = set([x[index] for x in X])
            for val in lv:
                lx, rx, ly, ry = partition_classes(X, y, index, val)
                t_gain = information_gain(y, [ly, ry])
                if t_gain > max_gain:
                    max_gain = t_gain
                    split_val = val
                    split_attr = index
    x_l, x_r, y_l, y_r = partition_classes(X, y, split_attr, split_val)
    tl = self.construct_DT(x_l, y_l)
    rl = self.construct_DT(x_r, y_r)
    return [split_attr, split_val, tl, rl]
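# Every snippet in this section leans on the same helpers from the assignment's
# utils.py: entropy(class_y), information_gain(previous_y, current_y), and
# partition_classes(X, y, split_attribute, split_val). Their real implementation
# is not shown here; the block below is only a minimal sketch reconstructed from
# how the snippets call them (binary 0/1 labels, numeric splits on "<=",
# categorical splits on "=="), not the authoritative utils.py.
import numpy as np


def entropy(class_y):
    # Shannon entropy of a list of 0/1 labels.
    if len(class_y) == 0:
        return 0.0
    p = float(np.mean(class_y))  # fraction of positive labels
    if p in (0.0, 1.0):
        return 0.0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)


def partition_classes(X, y, split_attribute, split_val):
    # Split the rows of X (and matching labels in y) on one column: numeric
    # columns go left when value <= split_val, categorical columns go left
    # when value == split_val.
    X_left, X_right, y_left, y_right = [], [], [], []
    numeric = not isinstance(X[0][split_attribute], str)
    for row, label in zip(X, y):
        value = row[split_attribute]
        goes_left = (value <= split_val) if numeric else (value == split_val)
        if goes_left:
            X_left.append(row)
            y_left.append(label)
        else:
            X_right.append(row)
            y_right.append(label)
    return X_left, X_right, y_left, y_right


def information_gain(previous_y, current_y):
    # Entropy of the parent labels minus the size-weighted entropy of the
    # child partitions in current_y (a list such as [y_left, y_right]).
    n = float(len(previous_y))
    remainder = sum(len(part) / n * entropy(part) for part in current_y)
    return entropy(previous_y) - remainder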
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y,
    # with the helper functions in utils.py.
    # Each node in self.tree is a dictionary
    # (https://docs.python.org/2/library/stdtypes.html#mapping-types-dict);
    # a non-leaf node has 'left' and 'right' keys plus the split attribute
    # and split value.
    if len(y) == 0:
        self.tree['type'] = 'empty'
        return
    cnt = np.unique(y, return_counts=True)[1]
    if len(cnt) == 1:
        # Only one class left: make a leaf carrying that label.
        self.tree['type'] = 'leaf'
        self.tree['y'] = y[0]
        return
    max_gain = 0
    max_gain_attr = 0
    max_gain_index = 0
    for i in range(len(X[0])):
        if isinstance(X[0][i], str):
            # Categorical attribute: try every distinct value.
            cat = list(set([x[i] for x in X]))
            for j in cat:
                xl, xr, yl, yr = partition_classes(X, y, i, j)
                temp_gain = information_gain(y, [yl, yr])
                if temp_gain > max_gain:
                    max_gain = temp_gain
                    max_gain_attr = j
                    max_gain_index = i
        else:
            # Numeric attribute: split on the column mean.
            cat = [x[i] for x in X]
            cat_mean = sum(cat) / len(cat)
            xl, xr, yl, yr = partition_classes(X, y, i, cat_mean)
            temp_gain = information_gain(y, [yl, yr])
            if temp_gain > max_gain:
                max_gain = temp_gain
                max_gain_attr = cat_mean
                max_gain_index = i
    Xl, Xr, yl, yr = partition_classes(X, y, max_gain_index, max_gain_attr)
    left_subtree = DecisionTree()
    right_subtree = DecisionTree()
    left_subtree.learn(Xl, yl)
    right_subtree.learn(Xr, yr)
    self.tree['type'] = 'node'
    self.tree['left'] = left_subtree
    self.tree['right'] = right_subtree
    self.tree['split_attr'] = max_gain_index
    self.tree['split_val'] = max_gain_attr
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Each node is a dictionary holding the split attribute index, split
    # value, and left/right subtrees.
    root = {
        'attribute_index': None,
        'value': None,
        'left': None,
        'right': None
    }
    if entropy(y) < .2:  # reasonable entropy cutoff to prevent overfitting
        return sp.stats.mode(y)[0][0]
    else:
        info_gain = 0
        split_attribute_index = 0
        split_value = X[0][split_attribute_index]
        for item in X:
            for i in range(len(item)):
                y_left, y_right = partition_classes(X, y, i, item[i])[2:4]
                gain = information_gain(y, [y_left, y_right])
                if gain > info_gain:
                    # Base attribute and split value on the combination that
                    # maximizes information gain.
                    info_gain = gain
                    split_attribute_index = i
                    split_value = item[i]
        root['attribute_index'] = split_attribute_index
        root['value'] = split_value
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, split_attribute_index, split_value)
        root['left'] = self.learn(X_left, y_left)
        root['right'] = self.learn(X_right, y_right)
        self.tree.insert(0, root)
        return root
def treeBuilding(X, y):
    # Stop when the labels are pure.
    if sum(y) == len(y) or sum(y) == 0:
        return classLabel(y)
    tree = {}
    cols = len(X[0])
    info_gain_list = []
    split_val_list = []
    for idx in range(cols):
        best_val = 0
        best_gain = 0
        col = [row[idx] for row in X]
        if isinstance(col[0], str):
            # Categorical attribute: try every distinct value.
            val_set = set(col)
            for val in val_set:
                X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                gain = information_gain(y, [y_left, y_right])
                if gain > best_gain:
                    best_gain = gain
                    best_val = val
        else:
            # Numeric attribute: try evenly spaced candidate thresholds.
            steps = np.linspace(start=np.min(col), stop=np.max(col),
                                num=5, endpoint=False)[1:]
            for val in steps:
                X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                gain = information_gain(y, [y_left, y_right])
                if gain > best_gain:
                    best_gain = gain
                    best_val = val
        info_gain_list.append(best_gain)
        split_val_list.append(best_val)
    best_split_col = np.argmax(info_gain_list)
    best_split_value = split_val_list[best_split_col]
    X_left, X_right, y_left, y_right = partition_classes(X, y, best_split_col, best_split_value)
    tree['split_attribute'] = best_split_col
    tree['split_value'] = best_split_value
    tree['left_child'] = treeBuilding(X_left, y_left)
    tree['right_child'] = treeBuilding(X_right, y_right)
    return tree
def divide_tree(self, val1, val2):
    # val1 is the data matrix X, val2 is the label list y.
    # val_x tracks the best split value, val_y the best split column.
    val_x = val1[0][0]
    val_y, max_info_gain = 0, 0
    for i in range(len(val1[0])):
        caller_row = set(row[i] for row in val1)
        for j in caller_row:
            y_l, y_r = partition_classes(val1, val2, i, j)[2:4]
            gain = information_gain(val2, [y_l, y_r])
            if gain > max_info_gain:
                max_info_gain = gain
                val_y, val_x = i, j
    return val_y, val_x
def get_split(dataset2, labels2, n_features2, side):
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    IG_index = -1.0
    # Randomly pick n_features2 candidate attribute indices.
    features = list()
    while len(features) < n_features2:
        index = np.random.randint(1, len(dataset2[0]) - 1)
        if index not in features:
            features.append(index)
    for index in features:
        col = [row[index] for row in dataset2]
        colMean2 = np.mean(col)
        colMedian2 = np.median(col)
        # Try splitting on both the mean and the median and keep the better one.
        X_left, X_right, y_left, y_right = partition_classes(
            dataset2, labels2, index, colMean2)
        ig_mean = information_gain(labels2, [y_left, y_right])
        X_left_med, X_right_med, y_left_med, y_right_med = partition_classes(
            dataset2, labels2, index, colMedian2)
        ig_median = information_gain(labels2, [y_left_med, y_right_med])
        ig = max(ig_mean, ig_median)
        best_value = colMean2
        if ig_median > ig_mean:
            X_left, X_right, y_left, y_right = X_left_med, X_right_med, y_left_med, y_right_med
            best_value = colMedian2
        if len(y_left) == 0:
            y_left = None
            X_left = None
        if len(y_right) == 0:
            y_right = None
            X_right = None
        if ig > IG_index:
            IG_index = ig
            b_index = index       # attribute (column) the split occurred on
            b_value = best_value  # value the split occurred at
            b_score = ig          # information gain that caused the split
            b_groups = [X_left, y_left, X_right, y_right]
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
def build_tree(self, X, y):
    # Indices of numeric attributes (columns).
    numerical_cols = set([1, 2, 7, 10, 13, 14, 15])
    if y.count(0) == len(y):
        self.tree["label"] = 0
        return
    if y.count(1) == len(y):
        self.tree["label"] = 1
        return
    if len(X) == 0:
        if y.count(0) > y.count(1):
            self.tree["label"] = 0
        else:
            self.tree["label"] = 1
        return
    # Find the best attribute to split on.
    X_arr = np.array(X)
    for col in range(X_arr.shape[1] - 1):
        if col in numerical_cols:
            # Use the column mean as the candidate split value for numeric attributes.
            temp = X_arr[:, col]
            split_val = np.mean([float(a) for a in temp])
        else:
            # Use the column mode as the candidate split value for categorical attributes.
            split_val = stats.mode(X_arr[:, col])[0][0]
        # Calculate the information gain of this candidate split.
        X_left, X_right, y_left, y_right = partition_classes(X, y, col, split_val)
        current_info_gain = information_gain(y, [y_left, y_right])
        # Keep the arguments with the maximum information gain.
        if self.tree['info_gain'] < current_info_gain:
            self.tree['info_gain'] = current_info_gain
            self.tree['split_val'] = split_val
            self.tree['split_attr'] = col
    if len(X) > 0:
        self.tree["left"] = DecisionTree()
        self.tree["right"] = DecisionTree()
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, self.tree['split_attr'], self.tree['split_val'])
        self.tree["left"].build_tree(X_left, y_left)
        self.tree["right"].build_tree(X_right, y_right)
def split_tree_based_on_max_info_gain(self, X, y):
    """
    Finds the split column and split value that maximize information gain.

    Every (column, distinct value) pair is tried as a candidate split and the
    pair with the highest information gain is returned.

    Args:
        X: 2D array of samples
        y: array of labels
    Returns:
        (split_column, split_value)
    """
    split_value = X[0][0]
    row_one = X[0]
    split_column, max_info_gain = 0, 0
    for column in range(len(row_one)):
        unique_column = set([row[column] for row in X])
        for value in unique_column:
            y_left, y_right = partition_classes(X, y, column, value)[2:4]
            info_gain = information_gain(y, [y_left, y_right])
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                split_column, split_value = column, value
    return split_column, split_value
def learnlearn(X, y):
    if entropy(y) == 0:
        # All labels are the same -> leaf node.
        return {'label': y[0]}
    best_split = {}  # split_attr, split_value, left, right
    max_IG = -1
    current_split = None
    for attribute in range(len(X[0])):  # attribute = column index
        unique_value = np.unique([x[attribute] for x in X])
        for value in unique_value:
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, attribute, value)
            IG = information_gain(y, [y_left, y_right])
            if IG > max_IG:
                max_IG = IG
                current_split = [attribute, value]
    if max_IG == 0:
        # No split improves purity -> leaf node with the majority label.
        cnt_0_1 = np.bincount(y)
        return {'label': [1, 0][cnt_0_1[0] > cnt_0_1[1]]}
    # Record the best split and partition the data.
    best_split["split_attr"] = current_split[0]
    best_split["split_value"] = current_split[1]
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, current_split[0], current_split[1])
    # Recurse into the next level.
    best_split['left'] = learnlearn(X_left, y_left)
    best_split['right'] = learnlearn(X_right, y_right)
    return best_split
def lookingforIG(self, X, y):
    a = np.array(X)
    rows = a.shape[0]
    columns = a.shape[1]
    ig_max = 0
    ig_row = 0
    ig_col = 0
    # Search every (row, column) candidate split for the maximum information gain.
    for col in range(columns):
        split_attr = col
        for row in range(rows):
            splitvalue = X[row][col]
            parts = partition_classes(X, y, split_attr, splitvalue)
            y_left = parts[2]
            y_right = parts[3]
            if not y_left or not y_right:
                continue
            ig_temp = information_gain(y, [y_left, y_right])
            if ig_temp > ig_max:
                ig_max = ig_temp
                ig_row = row
                ig_col = col
    return ig_col, X[ig_row][ig_col]
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Each node in self.tree is a dictionary holding the split attribute,
    # split value, and the left/right partitions.
    gain_list = {}
    if entropy(y) != 0:
        for attr in range(len(X[0])):
            values = list(set([item[attr] for item in X]))
            for val in values:
                X_left, X_right, y_left, y_right = partition_classes(
                    X, y, attr, val)
                gain_list[(attr, val)] = information_gain(y, [y_left, y_right])
        # Pick the (attribute, value) pair with the maximum information gain.
        sp_attr, sp_val = max(gain_list, key=gain_list.get)
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, sp_attr, sp_val)
        self.tree['split_attribute'] = sp_attr
        self.tree['split_val'] = sp_val
        self.tree['left'] = (X_left, y_left)
        self.tree['right'] = (X_right, y_right)
def try_split(self, X, y):
    b_index, b_value, b_gain = -1, -1, float('-inf')
    X_arr = np.asarray(X, dtype=object)
    number_of_attr = X_arr.shape[1]
    # Randomly sample 40% of the attributes to consider for this split.
    idx = np.random.choice(range(number_of_attr),
                           int(np.floor(0.4 * number_of_attr)),
                           replace=False)
    for index in idx:
        partition_values = np.unique(X_arr[:, index])
        if not isinstance(partition_values[0], str):
            # Numeric attribute: use a few percentiles as candidate splits.
            partition_values = np.percentile(
                partition_values, [10 * i for i in range(1, 10, 2)])
        gain = np.array([
            information_gain(y, try_partition_classes(X, y, index, value))
            for value in partition_values
        ])
        max_gain = max(gain)
        if max_gain < self.threshold:
            continue
        if max_gain > b_gain:
            b_index, b_value, b_gain = (
                index, partition_values[np.argmax(gain)], max_gain)
    return (b_gain, b_index, b_value)
def make_tree(self, X, y):
    if len(set(y)) == 1:
        return y[0]
    max_info_gain = -1
    index = None
    value = None
    X_left, X_right, y_left, y_right = [], [], [], []
    for i in range(len(X[0])):
        current = [row[i] for row in X]
        for split_val in set(current):
            X_left_t, X_right_t, y_left_t, y_right_t = partition_classes(
                X, y, i, split_val)
            info_gain = information_gain(y, [y_left_t, y_right_t])
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                index, value = i, split_val
                X_left, X_right, y_left, y_right = X_left_t, X_right_t, y_left_t, y_right_t
    node_data = (index, value)
    return {
        node_data: [self.make_tree(X_left, y_left),
                    self.make_tree(X_right, y_right)]
    }
def store_tree(self, X, Y, Z):
    # Z is the list of attribute indices still available for splitting.
    for i in range(0, 2):
        if Y.count(i) == len(Y):
            self.tree["lbl"] = i
            return
    if len(Z) == 0:
        # No attributes left: label with the majority class.
        self.tree["lbl"] = (Y.count(0) <= Y.count(1)) + 0
        return
    # Pick the remaining attribute whose mean-threshold split gives the best gain.
    col_means = np.mean(np.array(X), axis=0)
    for j in Z:
        _, _, y_l, y_r = partition_classes(X, Y, j, col_means[j])
        ig = information_gain(Y, [y_l, y_r])
        if self.tree["Split"] == -1 or ig > self.tree["IG"]:
            self.tree["Split"] = j
            self.tree["IG"] = ig
            self.tree["TH"] = col_means[j]
    Z.remove(self.tree["Split"])
    self.tree["L"] = DecisionTree()
    self.tree["R"] = DecisionTree()
    Data = [[], [], [], []]
    for i in range(len(X)):
        if X[i][self.tree["Split"]] <= self.tree["TH"]:
            Data[0].append(X[i])
            Data[1].append(Y[i])
        else:
            Data[2].append(X[i])
            Data[3].append(Y[i])
    self.tree["L"].store_tree(Data[0], Data[1], Z)
    self.tree["R"].store_tree(Data[2], Data[3], Z)
def select_split(self, X, y):
    """
    Finds the best split attribute and value to maximize information gain.

    :param X: Numpy array of data
    :param y: Numpy array of classification
    :return: split_attr, split_val, split_data
    """
    max_info_gain = 0
    split_attr = 0
    split_val = None
    split_data = {'X_left': None, 'X_right': None, 'y_left': None, 'y_right': None}
    # Loop through each value and find the split that maximizes information gain.
    for row_idx in range(len(X)):
        for col_idx in range(len(X[0])):
            temp_attr = col_idx
            temp_val = X[row_idx][col_idx]
            temp_split_data = partition_classes(X, y, temp_attr, temp_val)
            temp_info_gain = information_gain(y, [temp_split_data[2], temp_split_data[3]])
            if temp_info_gain > max_info_gain:
                max_info_gain = temp_info_gain
                split_attr = temp_attr
                split_val = temp_val
                split_data['X_left'] = temp_split_data[0]
                split_data['X_right'] = temp_split_data[1]
                split_data['y_left'] = temp_split_data[2]
                split_data['y_right'] = temp_split_data[3]
    return split_attr, split_val, split_data
def find_best_numeric(X, y, split_attribute):
    '''
    Find the split value (mean or median) with the largest information gain
    for a continuous variable.

    Inputs:
        X: data containing all attributes
        y: labels
        split_attribute: column index of the attribute to split on
    '''
    column = np.array(X)[:, split_attribute]
    if isinstance(column[0], (np.int32, np.float64)):
        all_values = list(column)
    else:
        # Values arrive as strings, so parse them back into numbers.
        all_values = [ast.literal_eval(val) for val in column]
    median = np.median(all_values)
    mean = np.mean(all_values)
    best_ig = 0
    best_val = 0
    for val in (median, mean):
        y_left = partition_classes(X, y, split_attribute, val)[2]
        y_right = partition_classes(X, y, split_attribute, val)[3]
        if len(y_left) * len(y_right) != 0:
            ig = information_gain(y, [y_left, y_right])
            if ig >= best_ig:
                best_ig = ig
                best_val = val
    return best_ig, best_val
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Leaf case: all labels are identical.
    y_sum = np.sum(y)
    if y_sum == len(y):
        self.tree['output'] = 1
        return self.tree
    elif y_sum == 0:
        self.tree['output'] = 0
        return self.tree
    # Leaf case: all rows of X are identical, so no split can separate them.
    x_same = 0
    for i in range(len(X) - 1):
        if X[i] == X[-1]:
            x_same += 1
    if x_same == (len(X) - 1):
        self.tree['output'] = int(round(y_sum / len(y)))  # majority label
        return self.tree
    # Compute the information gain of each feature and split on the best one.
    split_attribute = 0
    info_gain = []
    split_vals = []
    for col in zip(*X):
        # Split based on the column average.
        split_val = np.average(col)
        split_vals.append(split_val)
        # Split and compute the information gain.
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, split_attribute, split_val)
        info_gain.append(information_gain(y, [y_left, y_right]))
        # Increment the column number.
        split_attribute += 1
    # Find the highest info gain and split the tree there.
    max_val = max(info_gain)
    max_ind = info_gain.index(max_val)
    split_val = split_vals[max_ind]
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, max_ind, split_val)
    # Store the column and value to split on; output == -1 flags a non-leaf node.
    self.tree['split col'] = max_ind
    self.tree['split val'] = split_val
    self.tree['output'] = -1
    # Right and left subtrees.
    self.tree['left'] = DecisionTree().learn(X_left, y_left)
    self.tree['right'] = DecisionTree().learn(X_right, y_right)
    # Return the tree.
    return self.tree
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Starting with the first split feature.
    if len(set(y)) == 1:
        return y[0]
    temp = 0
    split_attribute, split_val = 0, X[0][0]
    x_left, x_right, y_left, y_right = [], [], [], []
    for i in range(len(X[0])):
        for j in range(len(X)):
            x_l, x_r, y_l, y_r = partition_classes(X, y, i, X[j][i])
            infoGain = information_gain(y, [y_l, y_r])
            if infoGain > temp:
                temp = infoGain
                split_attribute = i
                split_val = X[j][i]
                x_left, x_right, y_left, y_right = x_l, x_r, y_l, y_r
    DecisionTree().learn(x_left, y_left)
    DecisionTree().learn(x_right, y_right)
def find_best_split(n, k):
    # Draw n samples from k classes with a skewed prior (the probability
    # vector below assumes k == 4), then exhaustively search every possible
    # left/right split for the best information gain.
    x = list(np.random.choice(range(k), size=n, p=[0.8, 0.05, 0.1, 0.05]))
    print(Counter(x))
    prior_entropy = calculate_entropy(x)
    best_ig = 0
    best_left = None
    best_right = None
    for m in range(1, len(x)):
        for left in set(itertools.combinations(x, m)):
            copy = x.copy()
            for elt in left:
                copy.remove(elt)
            right = copy
            ig = information_gain(left, right, prior_entropy)
            if ig > best_ig:
                best_ig = ig
                best_left = left
                best_right = right
    print(best_ig)
    print(best_left)
    print(best_right)
def helper(tree, X, y):
    if checkEqual(y):
        # Labels are pure (or empty): make a leaf.
        if len(y) == 0:
            tree["rst"] = 1
        else:
            tree["rst"] = y[0]
        tree["index"] = None
    else:
        size = len(X[0])
        gain, index, val = 0, 0, 0
        X_left, X_right, y_left, y_right = [], [], [], []
        for i in range(size):
            X_i = [item[i] for item in X]
            # Discrete columns split on the majority value, numeric on the mean.
            if isDiscrete(X_i):
                m = majority(X_i)
            else:
                m = np.mean(X_i)
            X_l, X_r, y_l, y_r = partition_classes(X, y, i, m)
            temp = information_gain(y, [y_l, y_r])
            if temp > gain:
                index, gain, val = i, temp, m
                X_left, X_right, y_left, y_right = X_l, X_r, y_l, y_r
        tree["index"] = index
        tree["split_val"] = val
        tree["rst"] = None
        tree["left"] = {}
        helper(tree["left"], X_left, y_left)
        tree["right"] = {}
        helper(tree["right"], X_right, y_right)
def chooseBestAttr(self, X, y):
    best_split_attr = -1
    best_split_value = None
    best_info_gain = -1
    for i in range(len(X[0])):
        attr_list = [dt[i] for dt in X]
        if isinstance(attr_list[0], str):
            # Categorical attribute: try every distinct value.
            split_list = sorted(set(attr_list))
        else:
            # Numeric attribute: try the column mean.
            split_list = [np.mean(np.array(attr_list))]
        for split_value in split_list:
            partition_res = partition_classes(X, y, i, split_value)
            y_left = partition_res[2]
            y_right = partition_res[3]
            if not y_left or not y_right:
                continue
            new_info_gain = information_gain(y, [y_left, y_right])
            if new_info_gain > best_info_gain:
                best_info_gain = new_info_gain
                best_split_attr = i
                best_split_value = split_value
    return best_split_attr, best_split_value
def recursive(X, y):
    if y.count(y[0]) == len(y):
        return y[0]
    MaxIG = 0
    for i in range(len(X[0])):
        column = [x[i] for x in X]
        for value in set(column):
            current_y = partition(column, y, value)
            current_IG = information_gain(y, current_y)
            if current_IG > MaxIG:
                MaxIG = current_IG
                split_attribute = i
                split_value = value
    if MaxIG == 0:
        # No split helps: return the majority label.
        a = Counter(y)
        top = a.most_common(1)
        return top[0][0]
    X_left, X_right, y_left, y_right = partition_classes(X, y, split_attribute, split_value)
    return {(split_attribute, split_value): [recursive(X_left, y_left),
                                             recursive(X_right, y_right)]}
def choose_attr_value(self, X, y):
    """Choose the attribute and value to split the tree that maximize the
    information gain criterion."""
    # https://stackoverflow.com/questions/44360162/how-to-access-a-column-in-a-list-of-lists-in-python
    max_split_attribute, max_split_val, max_information_gain = -1, -1, -1
    # The range is the number of attributes in the dataset.
    for split_attribute_temp in range(len(X[0])):
        if self.is_number(X[0][split_attribute_temp]):
            # If the attribute is numerical, use the average of the column as the split value.
            column_item = [float(row[split_attribute_temp]) for row in X]
            split_val_temp = np.mean(column_item)
        else:
            # If the attribute is a string, use the mode of the column as the split value.
            # https://stackoverflow.com/questions/16330831/most-efficient-way-to-find-mode-in-numpy-array
            column_item = [row[split_attribute_temp] for row in X]
            (_, idx, counts) = np.unique(column_item, return_index=True, return_counts=True)
            index = idx[np.argmax(counts)]
            split_val_temp = column_item[index]
        X_left_, X_right_, y_left_, y_right_ = partition_classes(
            X, y, split_attribute_temp, split_val_temp)
        temp_information_gain = information_gain(y, [y_left_, y_right_])
        # Store the split with the best information gain.
        if temp_information_gain > max_information_gain:
            max_split_attribute, max_split_val, max_information_gain = \
                split_attribute_temp, split_val_temp, temp_information_gain
            best_X_left, best_X_right, best_y_left, best_y_right = \
                X_left_, X_right_, y_left_, y_right_
    return max_split_attribute, max_split_val, best_X_left, best_X_right, best_y_left, best_y_right
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y,
    # with the helper functions in utils.py.
    max_ig = 0
    max_index = 0
    if y.count(y[0]) == len(y):
        self.tree['is_leaf'] = True
        self.tree['label'] = y[0]
        return
    if self.depth > 100:
        self.tree['is_leaf'] = True
        self.tree['label'] = self.max_element(y)
        return
    # Try splitting each column on its mean and keep the best information gain.
    col_means = np.mean(X, axis=0)
    for index in range(len(X[0])):
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, index, col_means[index])
        ig = information_gain(y, [y_left, y_right])
        if ig > max_ig:
            max_ig = ig
            max_index = index
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, max_index, col_means[max_index])
    tig = information_gain(y, [y_left, y_right])
    if tig < 0.001:
        self.tree['is_leaf'] = True
        self.tree['label'] = self.max_element(y)
        return
    if len(X_left) == len(X) or len(X_right) == len(X):
        self.tree['is_leaf'] = True
        self.tree['label'] = self.max_element(y)
        return
    self.tree['is_leaf'] = False
    self.tree['split_attr'] = max_index
    self.tree['split_val'] = col_means[max_index]
    # Recurse into fresh child trees, one level deeper.
    left_child = DecisionTree()
    right_child = DecisionTree()
    left_child.depth = self.depth + 1
    right_child.depth = self.depth + 1
    left_child.learn(X_left, y_left)
    right_child.learn(X_right, y_right)
    self.tree['left'] = left_child
    self.tree['right'] = right_child
def _learn(X, y):
    Y_entropy = entropy(y)
    if Y_entropy == 0:
        return [-1, y[0], None, None]
    cur_max_gain = 0
    best_attr = []
    best_index = -1
    # data_length: iterable of attribute (column) indices from the enclosing scope.
    for index in data_length:
        is_str = isinstance(X[0][index], str)
        if is_str:
            # Categorical attribute: try every distinct value.
            attr_X = np.unique([X[i][index] for i in range(len(X))])
            for attr in attr_X:
                X_left, X_right, y_left, y_right = partition_classes(
                    X, y, index, attr)
                gain = information_gain(y, [y_left, y_right])
                if gain > cur_max_gain:
                    cur_max_gain = gain
                    best_index = index
                    best_attr = attr
                    best_X_left, best_X_right = X_left, X_right
                    best_y_left, best_y_right = y_left, y_right
        else:
            # Numeric attribute: split on the column mean.
            attr_X = np.mean([X[i][index] for i in range(len(X))])
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, index, attr_X)
            gain = information_gain(y, [y_left, y_right])
            if gain > cur_max_gain:
                cur_max_gain = gain
                best_index = index
                best_attr = attr_X
                best_X_left, best_X_right = X_left, X_right
                best_y_left, best_y_right = y_left, y_right
    if cur_max_gain <= 0:
        return [-1, np.argmax(np.bincount(y)), None, None]
    left = _learn(best_X_left, best_y_left)
    right = _learn(best_X_right, best_y_right)
    return [best_index, best_attr, left, right]
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Each node in self.tree is a dictionary with 'L'/'R' children plus the
    # split attribute and split value.
    max_info_gain, max_attribute, max_value = -1, 0, 0
    X_left = []
    X_right = []
    y_left = []
    y_right = []
    if not hasattr(self, 'tree_depth'):
        self.tree_depth = 0  # root depth; children get their depth set below
    self.id = None
    if self.tree_depth > 10 or entropy(y) <= 0:
        self.id = y[0]
        return
    for i in range(0, len(X[0])):
        values = [X[j][i] for j in range(0, len(X))]
        # Choose the split value from the column average to reduce running time.
        if isinstance(values[0], str):
            split_avg = values[0]
        else:
            split_avg = sum(values) / len(values)
        xLeft, xRight, yLeft, yRight = partition_classes(X, y, i, split_avg)
        temp = information_gain(y, [yLeft, yRight])
        if temp > max_info_gain:
            max_attribute = i
            max_value = split_avg
            max_info_gain = temp
            X_left = xLeft
            X_right = xRight
            y_left = yLeft
            y_right = yRight
    # Build the tree node.
    self.tree['max_attribute'], self.tree['max_value'] = max_attribute, max_value
    self.tree['L'], self.tree['R'] = DecisionTree(), DecisionTree()
    # Grow the subtrees one level deeper.
    self.tree['L'].tree_depth = self.tree_depth + 1
    self.tree['R'].tree_depth = self.tree_depth + 1
    self.tree['L'].learn(X_left, y_left)
    self.tree['R'].learn(X_right, y_right)
def ___calculate_gain(self, X, y, attribute, values):
    # Calculate the information gain for the given attribute.
    splits = {}
    if isinstance(values[0], str):
        # Categorical attribute: one child partition per distinct value.
        targets = np.unique(values)
        new_y = []
        for s in targets:
            xl, _, yl, _ = partition_classes(X, y, attribute, s)
            splits[s] = [[xl], [yl]]
            new_y.append(yl)
        gain = information_gain(y, new_y)
    else:
        # Numeric attribute: split on the mean.
        s = np.mean(values)
        xl, xr, yl, yr = partition_classes(X, y, attribute, s)
        splits[s] = [[xl, xr], [yl, yr]]
        gain = information_gain(y, [yl, yr])
    return gain, splits
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y,
    # with the functions in utils.py.
    # Each node in self.tree is a dictionary; a non-leaf node has 'left' and
    # 'right' keys plus the split attribute and value.
    leny = len(set(y))
    if leny == 1:
        self.tree['label'] = y[0]
        return
    if leny == 0:
        self.tree['label'] = 0
        return
    x_left = []
    x_right = []
    y_left = []
    y_right = []
    maxInfoGain = 0
    best_split_attr = 0
    split_val = 0
    # Only the first len(X) - 4 rows are used as candidate split values.
    for i in range(len(X) - 4):
        for j in range(len(X[0])):
            split_val_update = X[i][j]
            x_l, x_r, y_l, y_r = partition_classes(X, y, j, split_val_update)
            infoGain = information_gain(y, [y_l, y_r])
            if infoGain > maxInfoGain:
                maxInfoGain = infoGain
                split_val = split_val_update
                best_split_attr = j
                x_left = x_l
                x_right = x_r
                y_left = y_l
                y_right = y_r
    self.tree['left'] = DecisionTree()
    self.tree['right'] = DecisionTree()
    self.tree['left'].learn(x_left, y_left)
    self.tree['right'].learn(x_right, y_right)
    self.tree['attribute'] = best_split_attr
    self.tree['value'] = split_val
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y,
    # with the functions in utils.py.
    # For each split attribute (column), draw candidate split values uniformly
    # from that column's range and keep the (attribute, value) pair with the
    # best information gain.
    best = {'IG': 0, 'sf': 0, 'sv': None}
    columns = list(zip(*X))
    for sf in range(len(X[0])):  # for all split attributes
        sf_column = list(columns[sf])
        sf_range = [min(sf_column), max(sf_column)]
        sv_list = np.random.uniform(sf_range[0], sf_range[1], 25)
        for val in sv_list:  # for candidate split values
            X_left, X_right, y_left, y_right = partition_classes(X, y, sf, val)
            info_gain = information_gain(y, [y_left, y_right])
            if info_gain > best['IG']:
                best['IG'] = info_gain
                best['sf'] = sf
                best['sv'] = val
    # After choosing the best split feature and the best split value,
    # partition the classes and recursively build the left and right subtrees.
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, best['sf'], best['sv'])
    left = addNode(X_left, y_left)
    right = addNode(X_right, y_right)
    self.tree['left'] = left
    self.tree['right'] = right
    self.tree['sv'] = best['sv']
    self.tree['sf'] = best['sf']
def learn(self, X, y):
    # Train the decision tree (self.tree) using the sample X and labels y.
    # Each node in self.tree is a dictionary with 'left'/'right' children plus
    # the split attribute and split value.
    # If only one y label remains, make this node a leaf with that label.
    if len(set(y)) == 1:
        self.tree['label'] = y[0]
        return
    elif len(set(y)) == 0:
        self.tree['label'] = 0
        return
    # Find the maximum information gain by looping through every possible partition.
    max_info_gain = 0
    x_len = len(X)
    row_len = len(X[0])
    for i in range(x_len):
        for j in range(row_len):
            test_split_val = X[i][j]
            xL, xR, yL, yR = partition_classes(X, y, j, test_split_val)
            info_gain = information_gain(y, [yL, yR])
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                split_attr = j
                split_val = test_split_val
                x_left, y_left, x_right, y_right = xL, yL, xR, yR
    # Create left and right subtrees.
    self.tree['left'] = DecisionTree()
    self.tree['right'] = DecisionTree()
    # Train the left and right subtrees.
    self.tree['left'].learn(x_left, y_left)
    self.tree['right'].learn(x_right, y_right)
    # Store the split attribute and split value in the tree.
    self.tree['split_attribute'] = split_attr
    self.tree['split_value'] = split_val
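# A minimal sketch of how a tree built with the node layout from the snippet
# above (internal nodes carrying 'split_attribute', 'split_value', 'left',
# 'right'; leaves carrying 'label'; children stored as DecisionTree objects)
# might be queried at prediction time. The method name `classify` and the
# "<= for numeric, == for categorical" convention are assumptions, not part
# of the original snippet.
def classify(self, record):
    node = self
    while 'label' not in node.tree:
        attr = node.tree['split_attribute']
        split_val = node.tree['split_value']
        value = record[attr]
        if isinstance(value, str):
            go_left = (value == split_val)
        else:
            go_left = (value <= split_val)
        node = node.tree['left'] if go_left else node.tree['right']
    return node.tree['label']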