def split(self):
    """Pick the most informative attribute and recursively grow children.

    For every attribute a table of per-value class counts is built and
    scored with ``Util.Information_Gain`` against ``self.entropy``.  An
    attribute becomes the current best when its gain is non-zero and either
    strictly larger than the best so far, or equal to it while having more
    distinct values.  If no attribute qualifies, the node is marked as not
    splittable.

    ``self.features`` must support 2-D NumPy indexing (``features[:, j]``).

    Side effects: sets ``best_info_gain``, ``best_attribute_values``,
    ``dim_split``, ``feature_uniq_split`` (here the *number* of branches of
    the chosen attribute) and ``splittable``; appends to ``children``.
    """
    self.best_info_gain = float('-inf')
    self.best_attribute_values = []

    # A node without samples or without attributes cannot be split; the
    # unguarded ``self.features[0]`` would raise IndexError on empty data.
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    # Column position of each class is the same for every attribute, so the
    # lookup is built once instead of once per attribute and searched
    # linearly per sample.
    label_position = {
        label: position
        for position, label in enumerate(sorted(set(self.labels)))
    }

    for attribute_index in range(len(self.features[0])):
        column = self.features[:, attribute_index]
        branch = {value: [0] * self.num_cls for value in sorted(set(column))}
        for value, label in zip(column, self.labels):
            branch[value][label_position[label]] += 1

        current_info_gain = Util.Information_Gain(self.entropy,
                                                  list(branch.values()))
        if current_info_gain == 0:
            continue
        is_better = current_info_gain > self.best_info_gain
        is_wider_tie = (current_info_gain == self.best_info_gain
                        and len(branch) > self.feature_uniq_split)
        if is_better or is_wider_tie:
            self.best_info_gain = current_info_gain
            self.dim_split = attribute_index
            self.feature_uniq_split = len(branch)
            self.best_attribute_values = list(branch.keys())

    if self.best_info_gain == float('-inf'):
        # No attribute produced a non-zero gain.
        self.splittable = False
        return

    # Children see every column except the one that was just consumed.
    child_feature_array = np.delete(self.features, self.dim_split, axis=1)
    label_array = np.array(self.labels)
    for attribute_value in self.best_attribute_values:
        subset_of_indices = np.where(
            self.features[:, self.dim_split] == attribute_value)[0]
        child_labels = label_array[subset_of_indices].tolist()
        child = TreeNode(child_feature_array[subset_of_indices].tolist(),
                         child_labels, len(set(child_labels)))
        self.children.append(child)
        if child.splittable:
            child.split()
    return
def get_split_attribute(self):
    """Return the attribute index with the highest information gain.

    For each attribute the samples are grouped by attribute value, the
    labels in each group are counted per class, and the resulting table is
    scored with ``Util.Information_Gain`` against the entropy of this
    node's labels.  Ties on the gain are resolved in favour of the
    attribute with more distinct values; remaining ties keep the lower
    index.

    Returns:
        tuple: ``(A, split_features)`` where ``A`` is the selected attribute
        index and ``split_features`` is that attribute's column (one value
        per sample).  ``(0, [])`` is returned when no attribute reaches a
        gain of at least 0.
    """
    class_counts = [self.labels.count(x) for x in set(self.labels)]
    S = Util.get_entropy(class_counts)

    A = 0
    max_ig = 0
    split_features = []
    best_branch_count = 0
    for a in range(len(self.features[0])):
        column = []
        branches = {}
        for row, label in zip(self.features, self.labels):
            column.append(row[a])
            branches.setdefault(row[a], []).append(label)

        counts = [
            [group.count(x) for x in set(group)]
            for group in branches.values()
        ]
        ig = Util.Information_Gain(S, counts)

        # The tie-break must compare the number of distinct values; comparing
        # column lengths is a no-op because every column has one entry per
        # sample.
        if ig > max_ig or (ig == max_ig
                           and len(branches) > best_branch_count):
            max_ig = ig
            A = a
            split_features = column
            best_branch_count = len(branches)
    return A, split_features
def split(self):
    """Split this node on the best attribute and recurse into the children.

    The best attribute maximises ``Util.Information_Gain`` of the table
    returned by ``self.make_br(index)``.  On equal gain the attribute whose
    ``self.all_attrib_values(...)`` compares greater wins; if that is equal
    too, the lower index is kept.

    Side effects: sets ``splittable`` to ``False`` for empty input, sets
    ``dim_split`` and ``feature_uniq_split`` (sorted branch values), and
    appends one child per branch value to ``children``.
    """
    # Short-circuit so that ``self.features[0]`` is never evaluated on an
    # empty feature list (that lookup would raise IndexError).
    if (len(self.features) == 0 or len(self.labels) == 0
            or len(self.features[0]) == 0):
        self.splittable = False
    if not self.splittable:
        return

    _, counts = np.unique(self.labels, return_counts=True)
    current_entropy = Util.get_entropy(counts)

    max_info = None
    for index in range(len(self.features[0])):
        info = Util.Information_Gain(current_entropy, self.make_br(index))
        if max_info is None or info > max_info:
            max_info = info
            self.dim_split = index
        elif info == max_info:
            current_index_values = self.all_attrib_values(index)
            best_index_values = self.all_attrib_values(self.dim_split)
            if current_index_values > best_index_values:
                self.dim_split = index
            elif current_index_values == best_index_values:
                self.dim_split = min(self.dim_split, index)

    # Group rows (minus the split column) and labels by branch value.
    groups = {}
    for row, label in zip(self.features, self.labels):
        rows, labels = groups.setdefault(row[self.dim_split], ([], []))
        rows.append(np.delete(row, [self.dim_split]))
        labels.append(label)

    # Strings sort before numbers so that mixed-type values stay comparable.
    self.feature_uniq_split = sorted(
        groups,
        key=lambda e: ({int: 1, float: 1, str: 0}.get(type(e), 0), e))

    for key in self.feature_uniq_split:
        rows, labels = groups[key]
        # NOTE(review): the third argument is the array of distinct labels,
        # not a count -- confirm against TreeNode's constructor.
        childn = TreeNode(rows, labels, np.unique(labels))
        if childn.splittable:
            childn.split()
        self.children.append(childn)
def split(self):
    """Split on the attribute with the highest information gain.

    Candidate values of attribute ``f`` are read from
    ``self.features_unique[f]``.  Each attribute is scored as a pair
    ``(information gain, number of distinct values present in this node)``
    and the maximum pair wins.  If every gain is exactly 0.0 the node is
    marked as not splittable.

    One child is appended per entry of the chosen attribute's
    ``features_unique`` row, in sorted order.  Values that do not occur in
    this node get a placeholder child that carries this node's labels and
    ``cls_max``.

    NOTE(review): the indentation below was reconstructed from a flattened
    source; in particular confirm that the final ``if new_child.splittable``
    belongs to the ``else`` branch.
    """
    feature_information_gains = []
    unique_labels, unique_label_count = np.unique(self.labels, return_counts=True)
    for f in range(len(np.array(self.features).T)):
        # Rows = candidate values of attribute f, columns = classes present
        # in this node; each cell counts the matching samples.
        feature_class_count = [[
            len([
                i for i, j in zip(
                    np.array(self.features)[:, f], np.array(self.labels))
                if i == feature and j == label
            ]) for label in unique_labels
        ] for feature in self.features_unique[f]]
        # Entropy of this node's labels (does not depend on f).
        Entropy = sum([(-1) * (float(x) / sum(unique_label_count)) *
                       np.log2(float(x) / sum(unique_label_count))
                       for x in unique_label_count])
        feature_information_gains.append(
            (Util.Information_Gain(Entropy, feature_class_count),
             len(np.unique(np.array(self.features)[:, f]))))
    information_gains = np.array([i[0] for i in feature_information_gains])
    # Also true when there are no attributes at all (empty array).
    if all(information_gains == 0.0):
        self.splittable = False
        return
    # max() returns the first maximal pair, so the lowest index wins ties.
    self.dim_split = feature_information_gains.index(
        max(feature_information_gains, key=lambda x: (x[0], x[1])))
    # Append the labels as the last column, sort rows by the split column,
    # then cut the sorted rows at the first occurrence of each value.
    feature_labels = np.column_stack((self.features, self.labels)).tolist()
    feature_labels.sort(key=lambda x: x[self.dim_split])
    feature_unique, unique_index = np.unique(
        np.array(feature_labels)[:, self.dim_split], return_index=True)
    feature_class_split = np.split(feature_labels, unique_index[1:])
    self.feature_uniq_split = self.features_unique[self.dim_split]
    self.feature_uniq_split = self.feature_uniq_split.tolist()
    self.feature_uniq_split.sort()
    feature_unique = feature_unique.tolist()
    for i in range(len(self.feature_uniq_split)):
        if not self.feature_uniq_split[i] in feature_unique:
            # Value never seen in this node: placeholder child.
            new_child = TreeNode([[]], self.labels, [[]])
            new_child.cls_max = self.cls_max
            self.children.append(new_child)
        else:
            index = feature_unique.index(self.feature_uniq_split[i])
            # Last column holds the labels appended by column_stack above.
            child_labels = feature_class_split[index][:, -1]
            child_features = np.delete(feature_class_split[index], -1, 1)
            child_features = np.delete(child_features, self.dim_split, 1)
            # Drop the consumed attribute from the candidate-value table too.
            child_features_unique = np.delete(self.features_unique, self.dim_split, 0)
            new_child = TreeNode(child_features.tolist(), child_labels.astype(int).tolist(),
                                 child_features_unique)
            self.children.append(new_child)
            if new_child.splittable:
                new_child.split()
def split(self):
    """Split on the best attribute and recurse into the resulting children.

    Every attribute is partitioned with ``Util.get_branches`` and scored
    with ``Util.Information_Gain``.  A candidate replaces the current best
    on a strictly higher gain, or on an equal gain with more branches.

    Side effects: sets ``dim_split``, ``feature_uniq_split`` (sorted branch
    values) and ``children`` (one per branch value, in the same order), and
    assigns each child's ``debug_path`` as this node's path plus the child
    position.
    """
    parent_score = Util.Weighted_Average_Entropy(
        Util.get_amount_cls(self.labels))

    top_gain = -1.0 * np.inf
    top_branches = {}
    top_index = 0
    top_amounts = []
    for index in range(len(self.features[0])):
        candidate = Util.get_branches(self.features, self.labels, index)
        amounts = [Util.get_amount_cls(part[1]) for part in candidate.values()]
        gain = Util.Information_Gain(parent_score, amounts)

        strictly_better = gain > top_gain
        wider_tie = (gain == top_gain
                     and len(candidate.keys()) > len(top_branches.keys()))
        if strictly_better or wider_tie:
            top_index = index
            top_amounts = amounts
            top_gain = gain
            top_branches = candidate

    self.dim_split = top_index

    # Class amounts are aligned with the branch mapping's iteration order;
    # key them by branch value so they can be read back in sorted order.
    amounts_by_value = dict(zip(top_branches.keys(), top_amounts))
    self.feature_uniq_split = sorted(top_branches.keys())

    self.children = [
        TreeNode(top_branches[value][0], top_branches[value][1],
                 len(amounts_by_value[value]))
        for value in self.feature_uniq_split
    ]

    for position, child in enumerate(self.children):
        child.debug_path = self.debug_path + [position]
        if child.splittable:
            child.split()
def split(self):
    """Split on the attribute with the highest information gain.

    Columns that contain ``None`` are skipped; this method itself blanks the
    chosen column with ``None`` in the rows handed to the children, so an
    attribute is not reused further down the tree.  The first attribute with
    the strictly highest gain wins (no further tie-break).

    Side effects: sets ``dim_split`` and ``feature_uniq_split`` when some
    attribute's gain exceeds -1, appends one child per branch value to
    ``children`` and recursively splits the splittable ones.
    """
    feature_matrix = np.array(self.features)
    label_array = np.array(self.labels)

    # Entropy of this node's labels; identical for every attribute, so it is
    # computed once up front.
    node_entropy = 0
    for cls in np.unique(self.labels):
        share = float(np.count_nonzero(label_array == cls)) / len(self.labels)
        node_entropy += share * np.log2(1 / share)

    best_gain = -1
    for idx_dim in range(len(self.features[0])):
        column = feature_matrix[:, idx_dim]
        if None in column:
            # Attribute already consumed by an ancestor.
            continue
        branch_values = np.unique(column)
        # Labels are used directly as column indices of the count table.
        branches = np.zeros((len(branch_values), self.num_cls + 1))
        for row, val in enumerate(branch_values):
            for yi in label_array[np.where(column == val)]:
                branches[row, yi] += 1
        info_gain_current = Util.Information_Gain(node_entropy, branches)
        if info_gain_current > best_gain:
            best_gain = info_gain_current
            self.dim_split = idx_dim
            self.feature_uniq_split = branch_values.tolist()

    # Build the children: same column layout, chosen column blanked out.
    split_column = feature_matrix[:, self.dim_split]
    blanked = np.array(self.features, dtype=object)
    blanked[:, self.dim_split] = None
    for val in self.feature_uniq_split:
        indexes = np.where(split_column == val)
        x_new = blanked[indexes].tolist()
        y_new = label_array[indexes].tolist()
        child = TreeNode(x_new, y_new, self.num_cls)
        # No rows, or no unused attribute left: the child is a leaf.
        if np.array(x_new).size == 0 or all(v is None for v in x_new[0]):
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
    return
def split(self):
    """Split on the attribute with the highest information gain.

    Gains are computed with a parent entropy of 0, which shifts every gain
    by the same constant and therefore does not change which attribute is
    largest.  Among attributes with the maximal gain the one with the most
    distinct values wins; remaining ties keep the lowest index.

    Side effects: converts ``self.features`` to a NumPy array, sets
    ``dim_split`` and ``feature_uniq_split`` (sorted array of branch
    values), appends one child per branch value and calls ``split`` on each
    child.  A node whose rows have no attributes is marked not splittable.
    """
    if len(self.features[0]) == 0:
        self.splittable = False
    if not self.splittable:
        return

    self.features = np.array(self.features)
    labels = np.array(self.labels)
    label_values = np.unique(labels)

    gains = []
    distinct_counts = []
    for j in range(len(self.features[0])):
        column = self.features[:, j]
        values = np.unique(column)  # already sorted
        table = [
            [int(np.count_nonzero((column == value) & (labels == cls)))
             for cls in label_values]
            for value in values
        ]
        gains.append(Util.Information_Gain(0, table))
        distinct_counts.append(len(values))

    top_gain = max(gains)
    candidates = [j for j, gain in enumerate(gains) if gain == top_gain]
    # max() keeps the first candidate on equal counts, i.e. the lowest index.
    self.dim_split = max(candidates, key=lambda j: distinct_counts[j])

    split_column = self.features[:, self.dim_split]
    self.feature_uniq_split = np.unique(split_column)
    reduced = np.delete(self.features, self.dim_split, axis=1)
    for value in self.feature_uniq_split:
        mask = split_column == value
        # NOTE(review): the third argument is the number of branches, not
        # the number of classes -- confirm this is what TreeNode expects.
        child = TreeNode(list(reduced[mask]), labels[mask].tolist(),
                         len(self.feature_uniq_split))
        self.children.append(child)

    for ch in self.children:
        ch.split()
    return
def split(self):
    """Split on the attribute with the highest information gain.

    Each attribute is scored with ``Util.Information_Gain`` using the
    tables from ``self.getbranches`` and the entropy from
    ``self.getparententropy``.  A strictly higher gain wins; on equal gain
    the attribute with more distinct values wins; otherwise the earlier
    (lower-index) attribute is kept.

    The node is marked not splittable when there are no attributes or when
    the best gain is exactly 0.  Otherwise ``dim_split`` and
    ``feature_uniq_split`` are set, one child per branch value is appended
    to ``children``, and splittable children are split recursively.
    """
    feature_matrix = np.array(self.features)
    label_values = np.unique(np.array(self.labels))

    max_ig = float('-inf')
    local_feature = []
    ig_array = []
    for i in range(0, len(self.features[0])):
        values = np.unique(feature_matrix[:, i])
        branches = self.getbranches(values, i, self.features, self.labels,
                                    label_values)
        parent = self.getparententropy(self.labels)
        ig = Util.Information_Gain(parent, branches)
        ig_array.append(ig)

        # Indices are visited in increasing order, so a full tie always
        # keeps the already selected, lower index.
        if max_ig < ig or (max_ig == ig and len(values) > len(local_feature)):
            self.dim_split = i
            max_ig = ig
            local_feature = values.tolist()

    # ``max`` of an empty list raises ValueError, so a node without
    # attributes is handled explicitly.
    if not ig_array or max(ig_array) == 0.0:
        self.splittable = False
        return

    self.feature_uniq_split = local_feature
    feature_selected = feature_matrix[:, self.dim_split]
    modf_feature = np.delete(feature_matrix, self.dim_split, 1)
    label_array = np.array(self.labels)
    for fs in np.sort(self.feature_uniq_split):
        indexes = np.where(fs == feature_selected)
        x_fs = modf_feature[indexes].tolist()
        l_fs = label_array[indexes].tolist()
        child = TreeNode(x_fs, l_fs, self.num_cls)
        self.children.append(child)
        # No rows or no remaining attributes: the child is a leaf.
        if len(x_fs) == 0 or len(x_fs[0]) == 0:
            child.splittable = False

    for child in self.children:
        if child.splittable:
            child.split()
    return
def split(self):
    """Split on the attribute with the highest information gain.

    The entropy of this node's labels is computed from ``np.bincount``
    (labels must therefore be non-negative integers).  For every attribute
    the labels are binned per attribute value and scored with
    ``Util.Information_Gain``; ``np.argmax`` selects the first attribute
    with the largest gain.

    Side effects: sets ``dim_split`` and ``feature_uniq_split`` and appends
    one child per branch value; each splittable child is split before it is
    appended.  A node without samples or attributes is marked not
    splittable.
    """
    # ``np.argmax`` of an empty sequence and ``self.features[0]`` on empty
    # data both raise, so treat such a node as a leaf.
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    # Entropy of this node.
    labels_count = np.bincount(self.labels)
    esum = np.sum(labels_count)
    h = 0.0
    if esum != 0:
        for k in labels_count:
            if k != 0:
                h += -k / esum * np.log2(k / esum)

    label_array = np.array(self.labels)
    columns = np.transpose(self.features)

    # One count table per attribute; rows follow the sorted distinct values.
    branches = [
        [np.bincount(label_array[column == value]).tolist()
         for value in np.unique(column)]
        for column in columns
    ]
    gains = [Util.Information_Gain(h, table) for table in branches]

    self.dim_split = int(np.argmax(gains))
    branch_values = np.unique(columns[self.dim_split])
    self.feature_uniq_split = branch_values.tolist()

    for value in branch_values:
        feature = []
        label = []
        for row, row_label in zip(self.features, self.labels):
            if value == row[self.dim_split]:
                remaining = list(row)
                remaining.pop(self.dim_split)
                feature.append(remaining)
                label.append(row_label)
        node = TreeNode(feature, label, len(np.unique(label)))
        if node.splittable:
            node.split()
        self.children.append(node)
    return
def split(self):
    """Split on the attribute with the highest information gain.

    The features are loaded into a DataFrame with an extra ``'labels'``
    column.  For each attribute the value-by-class count table is built with
    ``groupby(...).size().unstack()`` and scored with
    ``Util.Information_Gain``; the first attribute with the strictly highest
    gain is selected.

    Side effects: sets ``dim_split`` (the DataFrame column label) and
    ``feature_uniq_split`` (sorted distinct values), appends one child per
    value to ``children`` and recursively splits the splittable ones.  A
    node whose feature frame is empty is marked not splittable.  A child is
    marked not splittable when it has no attributes left or at most one
    sample.
    """
    Sn = Util.entropy(self.labels)
    df = pd.DataFrame(self.features)
    if df.empty:
        self.splittable = False
        return

    df['labels'] = self.labels
    max_gain = -np.inf
    for col in df.drop(columns='labels'):
        counts = (df[[col, 'labels']]
                  .groupby(by=[col, 'labels'])
                  .size()
                  .unstack()
                  .fillna(value=0)
                  .values)
        branches = np.nan_to_num(counts)
        gain = Util.Information_Gain(Sn, branches.tolist())
        if gain > max_gain:
            max_gain = gain
            self.dim_split = col
            self.feature_uniq_split = sorted(df[col].unique().tolist())

    # Group by a scalar key: with a length-1 list pandas 2.2+ expects tuple
    # names in ``get_group``.
    grouped = df.groupby(self.dim_split)
    for feature_val in self.feature_uniq_split:
        if feature_val not in grouped.groups:
            continue
        subset = grouped.get_group(feature_val)
        child_features = subset.drop(columns=[self.dim_split, 'labels'])
        new_node = TreeNode(child_features.values.tolist(),
                            subset['labels'].values.tolist(), self.num_cls)
        # ``.empty`` replaces the ``values == []`` comparison, which raises
        # on NumPy >= 1.25 when the shapes do not broadcast.
        if child_features.empty or len(new_node.features) <= 1:
            new_node.splittable = False
        self.children.append(new_node)

    for child in self.children:
        if child.splittable:
            child.split()
    return
def calculate_information_gain(self, features_array, label_array, thisdict, S):
    """Score every attribute and return the scores ranked best-first.

    Args:
        features_array: 2-D array, one row per sample and one column per
            attribute.
        label_array: array of labels aligned with the rows of
            ``features_array``.
        thisdict: mapping from label to its column position in a count row.
        S: parent entropy passed through to ``Util.Information_Gain``.

    Returns:
        list of ``(gain, distinct_values, attribute_index)`` tuples ordered
        by descending gain, then descending number of distinct values, then
        ascending attribute index.
    """
    scored = []
    for column_index in range(len(features_array[0])):
        column = features_array[:, column_index]
        distinct_values = np.unique(column)

        count_table = []
        for value in distinct_values:
            matching_labels = label_array[column == value]
            row = [0] * len(thisdict)
            classes, occurrences = np.unique(matching_labels,
                                             return_counts=True)
            for cls, occurrence in zip(classes, occurrences):
                row[thisdict[cls]] = int(occurrence)
            count_table.append(row)

        gain = Util.Information_Gain(S, count_table)
        scored.append((gain, distinct_values, column_index))

    scored.sort(key=lambda entry: (-entry[0], -len(entry[1]), entry[2]))
    return scored
def split(self):
    """Split this node on the selected attribute and recurse into children.

    Does nothing when ``self.splittable`` is false.  Otherwise every
    attribute column is turned into a value-by-class count table (classes
    taken from ``self.classes``) and scored with ``Util.Information_Gain``.
    The scored tuples ``(gain, distinct values, column index)`` are narrowed
    with ``Util.filter_ties`` and the first remaining tuple is handed to
    ``self.assign_selected_feature``.  One child per value in
    ``self.feature_uniq_split`` is appended to ``self.children`` and
    ``split`` is called on every child.

    NOTE(review): indentation was reconstructed from a flattened source;
    confirm whether the ``return`` in the "no features left" case belongs to
    the outer ``if`` (as written here) or to the inner one.
    """
    if self.splittable:
        labels_split = attribute_split_count(self.labels)
        feat_transpose = Util.transpose_list(self.features)
        # calculate S
        S = Util.calc_entropy(labels_split, sum(labels_split))
        info_gain_all_features = []
        col_index = 0
        for feat_col in feat_transpose:
            # branches
            unique_feat_vals = np.unique(feat_col).tolist()
            # branches[value][class] -> number of samples with that value
            # and class, initialised to 0 for every combination.
            branches = {}
            for item in unique_feat_vals:
                branches[item] = {}
                for cls in self.classes:
                    branches[item][cls] = 0
            for feat_val, feat_label in zip(feat_col, self.labels):
                branches[feat_val][feat_label] += 1
            # convert branches dict to 2D array of counts only
            branches_2d_array = []
            for key, branch in branches.items():
                temp_array = []
                for inner_key, count in branch.items():
                    temp_array.append(count)
                branches_2d_array.append(temp_array)
            info_gain_all_features.append(
                (Util.Information_Gain(S, branches_2d_array), unique_feat_vals, col_index))
            col_index += 1
        # Highest gain first.
        info_gain_all_features.sort(key=lambda tup: tup[0], reverse=True)
        # base case if no features left
        if not info_gain_all_features:
            if self.dim_split is None:
                self.splittable = False
            return
        # filter ties
        # presumably filter_ties keeps the leading run of equal entries --
        # TODO confirm against Util.filter_ties
        info_gain_all_features = Util.filter_ties(info_gain_all_features)
        # NOTE(review): this orders by the value lists themselves
        # (lexicographic list comparison), not by how many values there are.
        info_gain_all_features.sort(key=lambda tup: tup[1], reverse=True)
        info_gain_all_features = Util.filter_ties(info_gain_all_features)
        # Lowest column index first among whatever is left.
        info_gain_all_features.sort(key=lambda tup: tup[2])
        self.assign_selected_feature(info_gain_all_features[0])
        # assign Children
        # The children variable is a list of TreeNode after split
        # the current node based on the best attributes.
        self.feature_uniq_split.sort()
        for feat_val_extract in self.feature_uniq_split:
            extract_feat = []
            extract_labels = []
            for row_feat, row_labels in zip(self.features, self.labels):
                if feat_val_extract == row_feat[self.dim_split]:
                    # Copy the row before removing the consumed column so
                    # this node's own features stay intact.
                    temp_row_feat = row_feat[:]
                    temp_row_feat.pop(self.dim_split)
                    extract_feat.append(temp_row_feat)
                    extract_labels.append(row_labels)
            self.children.append(
                TreeNode(extract_feat, extract_labels, np.unique(extract_labels).size))
        # Children decide for themselves (via splittable) whether to split.
        for node in self.children:
            node.split()
    else:
        return
def split(self):
    """Split on the unused attribute with the highest information gain.

    In this implementation ``self.feature_uniq_split`` holds the indices of
    the attributes already used on the path from the root; rows keep all
    their columns and used columns are simply skipped.  Gains below 1e-15
    are treated as 0.  A strictly higher gain wins; on an equal positive
    gain the attribute with more branches wins.

    Side effects:
      * no unused attribute left, or no attribute with positive gain: the
        node is marked not splittable (in the latter case ``cls_max`` is set
        to the most frequent label);
      * otherwise ``dim_split`` and ``child_class`` (branch values) are set,
        the chosen index is appended to ``feature_uniq_split`` and one child
        per branch value is appended to ``children``.
    """
    if len(self.features[0]) == len(self.feature_uniq_split):
        self.splittable = False
        return

    max_score = 0
    best_child_list = []
    best_label_list = []
    best_class_list = []
    temp_split = -1

    transpose_feature = np.transpose(self.features)
    label_classes = np.unique(self.labels)

    for feature_index in range(len(self.features[0])):
        if feature_index in self.feature_uniq_split:
            continue  # attribute already used higher up the tree

        temp_child_list = []
        temp_label_list = []
        temp_count_list = []
        temp_class_list = []
        for feature_class in np.unique(transpose_feature[feature_index]):
            temp_child = []
            temp_label = []
            for row, label in zip(self.features, self.labels):
                if row[feature_index] == feature_class:
                    temp_child.append(row)
                    temp_label.append(label)
            temp_child_list.append(temp_child)
            temp_label_list.append(temp_label)
            temp_count_list.append(
                [temp_label.count(label_class) for label_class in label_classes])
            temp_class_list.append(feature_class)

        # Column sums of the branch table give the parent's class counts;
        # Information_Gain(0, [counts]) is minus the parent entropy.
        parentlist = [sum(per_class) for per_class in zip(*temp_count_list)]
        parent_score = -1 * Util.Information_Gain(0, [parentlist])
        score = Util.Information_Gain(parent_score, temp_count_list)
        if score < 0.000000000000001:
            score = 0.0

        is_better = score > max_score
        is_wider_tie = (score == max_score and score > 0
                        and len(temp_label_list) > len(best_label_list))
        if is_better or is_wider_tie:
            max_score = score
            best_child_list = temp_child_list
            best_label_list = temp_label_list
            best_class_list = temp_class_list
            temp_split = feature_index

    if max_score == 0:
        # No attribute separates the classes: this node is a leaf.  Raising
        # NotImplementedError at this point would abort tree construction
        # and leave the leaf bookkeeping unreachable.
        self.splittable = False
        values, counts = np.unique(self.labels, return_counts=True)
        self.cls_max = values.tolist()[int(np.argmax(counts))]
        return

    self.dim_split = temp_split
    self.feature_uniq_split.append(temp_split)
    self.child_class = best_class_list

    for child_rows, child_labels in zip(best_child_list, best_label_list):
        if len(child_rows) == 0:
            return
        child_node = TreeNode(child_rows, child_labels,
                              np.unique(child_labels).size)
        # Each child gets its own copy of the used-attribute list; sharing
        # one list would let attributes consumed in one subtree block them
        # for all later siblings.
        child_node.feature_uniq_split = list(self.feature_uniq_split)
        if child_node.splittable:
            child_node.split()
        self.children.append(child_node)
    return
def split(self):
    """Split on the best attribute and recurse into the resulting children.

    Candidate children are built for every attribute while it is scored
    with ``Util.Information_Gain``.  A candidate replaces the current best
    on a strictly higher gain, then on more distinct values, then on a lower
    attribute index.

    Side effects: sets ``children``, ``dim_split`` and
    ``feature_uniq_split`` (array of distinct values) for the winning
    attribute and sets ``parent`` on each created child.  If the best gain
    is exactly 0.0 these are reset (``[]``, ``None``, ``None``) and the node
    is marked not splittable; a node without attributes is marked not
    splittable as well.
    """
    features = np.array(self.features)
    number_of_attributes = features[0].size
    if number_of_attributes == 0:
        self.splittable = False
        return

    max_info_gain = -1
    max_unique_values = 0
    max_attr_number = 0
    node_labels = np.unique(self.labels)

    # Information_Gain(0, [counts]) is minus the entropy of this node.
    root_counts = [self.labels.count(label) for label in node_labels]
    entropy_root = -1 * Util.Information_Gain(0, [root_counts])

    for attr_number in range(number_of_attributes):
        unique_values = np.unique(features[:, attr_number])
        branches = []
        children = []
        for unique_value in unique_values:
            selected = [
                pos for pos, row in enumerate(features)
                if row[attr_number] == unique_value
            ]
            branch_feat = [np.delete(features[pos], attr_number)
                           for pos in selected]
            branch_label = [self.labels[pos] for pos in selected]
            branches.append(
                [branch_label.count(label) for label in node_labels])
            child = TreeNode(branch_feat, branch_label,
                             np.unique(branch_label).size)
            child.parent = self
            children.append(child)

        info_gain = Util.Information_Gain(entropy_root, branches)
        wins_tie = info_gain == max_info_gain and (
            unique_values.size > max_unique_values
            or (unique_values.size == max_unique_values
                and attr_number < max_attr_number))
        if info_gain > max_info_gain or wins_tie:
            max_info_gain = info_gain
            max_unique_values = unique_values.size
            max_attr_number = attr_number
            self.children = children
            self.dim_split = attr_number
            self.feature_uniq_split = unique_values

    if max_info_gain == 0.0:
        self.children = []
        self.dim_split = None
        self.feature_uniq_split = None
        self.splittable = False
        return

    for child in self.children:
        if child.splittable is True:
            child.split()
def split(self):
    """Split on the attribute with the highest information gain.

    For each attribute a ``values x num_cls`` count table is scored with
    ``Util.Information_Gain`` against the entropy of this node's labels.
    Among the attributes with the maximal gain, the one with the most
    distinct values is chosen; remaining ties keep the lowest index.

    Side effects: sets ``dim_split`` and ``feature_uniq_split`` (sorted list
    of branch values), appends one child per branch value to ``children``
    and recursively splits the splittable ones.  When there are no
    attributes (or the chosen gain is <= -1) ``dim_split`` and
    ``feature_uniq_split`` are set to ``None`` and the node is marked not
    splittable.
    """
    # Entropy of this node.
    label_values, label_counts = np.unique(self.labels, return_counts=True)
    root_entropy = 0
    for p in label_counts / len(self.labels):
        root_entropy += -1 * p * np.log2(p)

    # These lookups do not depend on the attribute; build them once.
    label_position = {lab: pos for pos, lab in enumerate(label_values)}
    feature_matrix = np.array(self.features)
    n_columns = len(self.features[0]) if len(self.features) else 0

    information_gains = []
    branch_feature_values = []  # [number of distinct values, column index]
    branch_feature_list = []    # distinct values per column
    for index_col in range(n_columns):
        coli = feature_matrix[:, index_col]
        branch_feature = np.unique(coli)
        value_position = {fea: pos for pos, fea in enumerate(branch_feature)}
        branch = [[0] * self.num_cls for _ in branch_feature]
        for cell, label in zip(coli, self.labels):
            branch[value_position[cell]][label_position[label]] += 1

        information_gains.append(Util.Information_Gain(root_entropy, branch))
        branch_feature_values.append([len(branch_feature), index_col])
        branch_feature_list.append(branch_feature.tolist())

    if not information_gains:
        self.dim_split = None
        self.feature_uniq_split = None
        self.splittable = False
        return

    # Among the columns with the maximal gain pick the one with the most
    # distinct values; the strict comparison keeps the first on a tie.
    gain_max = max(information_gains)
    max_attr = -1
    fea_col = -1
    best_branch_feature_list = []
    best_position = -1
    for ind, gain in enumerate(information_gains):
        if gain == gain_max and max_attr < branch_feature_values[ind][0]:
            best_position = ind
            max_attr = branch_feature_values[ind][0]
            fea_col = branch_feature_values[ind][1]
            best_branch_feature_list = branch_feature_list[ind]
    best_information_gain = information_gains[best_position]

    self.dim_split = fea_col
    self.feature_uniq_split = best_branch_feature_list
    if best_information_gain <= -1:
        self.dim_split = None
        self.feature_uniq_split = None
        self.splittable = False
        return

    # Split the node and add child nodes.
    self.feature_uniq_split.sort()
    for val in self.feature_uniq_split:
        labels_new = []
        features_new = []
        for row, label in zip(self.features, self.labels):
            if row[self.dim_split] == val:
                labels_new.append(label)
                features_new.append(row)
        features_new = np.delete(features_new, self.dim_split, axis=1)
        child = TreeNode(features_new.tolist(), labels_new,
                         len(np.unique(labels_new)))
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
    return
def split(self):
    """Split on the attribute with the highest information gain.

    For each attribute a ``values x classes`` count table is scored with
    ``Util.Information_Gain`` against the entropy of this node's labels.  A
    candidate wins on a strictly higher gain, or on an equal gain with more
    distinct values.

    Side effects: sets ``dim_split`` and ``feature_uniq_split`` (list of
    branch values).  When there are no samples or attributes, or the best
    gain is 0, the node is marked not splittable and no children are
    created.  Otherwise one child per branch value is appended to
    ``children`` and splittable children are split recursively.
    """
    # Without samples or attributes there is nothing to choose from; the
    # selected index would otherwise be read without ever being assigned.
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    features = np.array(self.features)
    labels = np.array(self.labels)

    # Parent entropy and class positions do not depend on the attribute.
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    label_position = {label: pos for pos, label in enumerate(unique_labels)}
    S = 0
    for count in label_counts:
        S += (-1) * (count / len(self.labels)
                     * np.log2(count / len(self.labels)))

    max_IG = 0
    unique_feature_split = np.array([])
    selected_index = None
    for k in range(len(self.features[0])):
        column = features[:, k]
        unique_features = np.unique(column)
        value_position = {
            value: pos for pos, value in enumerate(unique_features)
        }
        branches = [[0] * len(unique_labels) for _ in unique_features]
        for value, label in zip(column, labels):
            branches[value_position[value]][label_position[label]] += 1

        IG = Util.Information_Gain(S, branches)
        if IG > max_IG or (IG == max_IG and
                           len(unique_features) > len(unique_feature_split)):
            max_IG = IG
            unique_feature_split = unique_features
            selected_index = k

    self.dim_split = selected_index
    self.feature_uniq_split = unique_feature_split.tolist()
    if len(self.feature_uniq_split) == 0 or max_IG == 0:
        # A leaf: stop here instead of building children that carry no
        # information.
        self.splittable = False
        return

    split_column = features[:, self.dim_split]
    for value in self.feature_uniq_split:
        mask = split_column == value
        new_features = np.delete(features[mask], self.dim_split,
                                 axis=1).tolist()
        new_labels = labels[mask].tolist()
        # Third argument is the number of distinct classes in the child,
        # not its number of samples.
        chil = TreeNode(new_features, new_labels, len(set(new_labels)))
        self.children.append(chil)

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split on the attribute with the highest information gain.

    Does nothing when the node is not splittable.  Each attribute's count
    table (one row per distinct value, zero-padded to ``num_cls`` entries)
    is scored with ``Util.Information_Gain`` against the entropy of this
    node's labels.  A candidate wins when its gain exceeds the best by more
    than 1e-5, or when the gain is exactly equal and it has more distinct
    values.

    Side effects: sets ``dim_split`` and ``feature_uniq_split``.  The node
    is marked not splittable when the best gain is below 1e-5, when
    ``num_cls`` is 1 or when no attribute was selected.  Otherwise one child
    per branch value is appended to ``children``; a child is marked not
    splittable when this node had a single attribute or the child's labels
    are all equal.  Splittable children are split recursively.
    """
    if not self.splittable:
        return

    features = np.array(self.features)
    n = len(self.features[0])

    # Entropy of this node's labels.
    baseEn = 0.0
    total = float(len(self.labels))
    for label in set(self.labels):
        a = float(self.labels.count(label)) / total
        if a > 0:
            baseEn += -(a * np.log2(a))

    bestIG = -1.0
    bestcc = 0
    for i in range(n):
        column = [row[i] for row in self.features]
        distinct = list(set(column))
        cc = len(distinct)
        branchs = []
        for attr in distinct:
            tally = {}
            for value, label in zip(column, self.labels):
                if value == attr:
                    tally[label] = tally.get(label, 0) + 1
            counts = list(tally.values())
            counts.extend([0] * (self.num_cls - len(counts)))
            branchs.append(counts)

        Info = Util.Information_Gain(baseEn, branchs)
        # NOTE(review): "better" uses a 1e-5 tolerance while "tie" needs
        # exact equality -- confirm this asymmetry is intended.
        if Info - bestIG > 1e-5 or (Info == bestIG and cc > bestcc):
            if Info - bestIG > 1e-5:
                bestIG = Info
            bestcc = cc
            self.dim_split = i
            self.feature_uniq_split = np.unique(
                features[:, self.dim_split]).tolist()

    if (bestIG < 1e-5 or self.num_cls == 1
            or self.feature_uniq_split is None):
        self.splittable = False
        return

    # With a single attribute the children have nothing left to split on.
    last_attribute = n == 1
    c = self.dim_split
    for m in self.feature_uniq_split:
        res = []
        l = []
        for row, label in zip(features, self.labels):
            if m == row[c]:
                l.append(label)
                remaining = list(row)
                # Delete by position: removing by value would drop the first
                # cell equal to ``m``, which may be a different column.
                del remaining[c]
                res.append(remaining)
        child = TreeNode(res, l, len(set(l)))
        if last_attribute or len(set(l)) == 1:
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
    return
def split(self):
    """Split this node on the attribute with the highest information gain.

    Each column of ``self.features`` is scored with ``Util.Information_Gain``
    against the entropy of ``self.labels``.  Among the columns sharing the
    maximum gain, the first one with the most distinct values wins.
    ``self.dim_split`` and ``self.feature_uniq_split`` record the choice, one
    child ``TreeNode`` per distinct value is appended to ``self.children``
    (split column removed), and splittable children are split recursively.

    When there are no columns or the best gain is exactly zero the node is
    turned into a leaf: ``dim_split`` and ``feature_uniq_split`` are set to
    ``None`` and ``splittable`` to ``False``.
    """
    self.feature_uniq_split = []

    np_features = np.array(self.features)
    columns = np.transpose(np_features)
    np_labels = np.array(self.labels)
    unique_labels = np.unique(np_labels)

    entropy = 0
    for label in unique_labels:
        fraction = self.labels.count(label) / len(self.labels)
        entropy -= fraction * np.log2(fraction)

    gains = []
    for column in columns:
        # Count over the labels actually present instead of range(num_cls),
        # which overruns unique_labels when num_cls exceeds their number.
        branches = [
            [int(np.sum((column == value) & (np_labels == label)))
             for label in unique_labels]
            for value in np.unique(column)
        ]
        gains.append(Util.Information_Gain(entropy, branches))

    if not gains or max(gains) == 0:
        self.dim_split = None
        self.feature_uniq_split = None
        self.splittable = False
        return

    best_gain = max(gains)
    tied_dims = [dim for dim, gain in enumerate(gains) if gain == best_gain]
    # max() keeps the first maximal element, i.e. the lowest tied index.
    self.dim_split = max(tied_dims,
                         key=lambda dim: np.unique(columns[dim]).size)
    self.feature_uniq_split = list(np.unique(columns[self.dim_split]))

    for value in self.feature_uniq_split:
        child_features = []
        child_labels = []
        for index in range(np_features.shape[0]):
            if np_features[index][self.dim_split] == value:
                row = self.features[index]
                child_features.append(
                    row[0:self.dim_split] + row[self.dim_split + 1:])
                child_labels.append(self.labels[index])
        # Rows left empty after removing the last attribute are dropped, so
        # such a child carries labels but an empty feature list.
        child_features = [row for row in child_features if row]
        self.children.append(
            TreeNode(child_features, child_labels,
                     np.unique(child_labels).size))

    for child in self.children:
        if child.splittable:
            child.split()
def information_gain_test():
    """Print the gain computed for the sample branch data beside a reference value."""
    sample_branches = data.sample_branch_data()
    # A parent entropy of 0 is passed, so the printed gain is expected to be
    # the negated weighted branch entropy (assumption: confirm against Utils).
    print('Your information gain: ', Utils.Information_Gain(0, sample_branches))
    print('My information gain: ', -0.91829583405448956)
def split(self):
    """Split this node on the best attribute that no ancestor has used.

    Attributes are never removed from the rows; ``self.used_attr`` lists the
    column indices already consumed on the path to this node.  Every remaining
    column is scored with ``Util.Information_Gain``; ties go to the column
    with more distinct values.  ``self.dim_split`` and
    ``self.feature_uniq_split`` record the winner, and one child ``TreeNode``
    per distinct value is appended to ``self.children``.  Each child inherits
    ``used_attr`` plus the new split column and is marked unsplittable once
    every column has been used.  Splittable children are split recursively.

    Does nothing when the node is unsplittable; marks the node unsplittable
    when it has no samples or no unused attribute.
    """
    if not self.splittable:
        return
    if len(self.features) == 0:
        self.splittable = False
        return

    label_classes = np.unique(self.labels)
    node_entropy = 0
    for label in label_classes:
        prop = self.labels.count(label) / len(self.labels)
        node_entropy += -prop * np.log2(prop)

    # Convert once instead of once per attribute.
    features = np.array(self.features)
    labels = np.array(self.labels)
    num_dims = len(self.features[0])

    best_gain = -1
    best_size = -1
    best_dim = None
    for dim in range(num_dims):
        if dim in self.used_attr:
            continue
        column = features[:, dim]
        values = np.unique(column)
        branches = [
            [int(np.sum(labels[column == value] == label))
             for label in label_classes]
            for value in values
        ]
        gain = Util.Information_Gain(node_entropy, branches)
        if gain > best_gain or (gain == best_gain and len(values) > best_size):
            best_gain = gain
            best_size = len(values)
            best_dim = dim

    if best_dim is None:
        # Every attribute is already used: nothing left to split on.
        self.splittable = False
        return

    self.dim_split = best_dim
    split_column = features[:, best_dim]
    split_values = np.unique(split_column)
    self.feature_uniq_split = split_values.tolist()

    for value in split_values:
        indices = np.where(split_column == value)[0]
        # Children keep the full-width rows (the same row objects as here).
        child_features = [self.features[index] for index in indices]
        child_labels = [self.labels[index] for index in indices]
        child = TreeNode(child_features, child_labels,
                         np.unique(child_labels).size)
        child.used_attr.extend(self.used_attr)
        child.used_attr.append(best_dim)
        if len(child.used_attr) == num_dims:
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split this node on the attribute with the highest information gain.

    Each column of ``self.features`` is scored with ``Util.Information_Gain``
    against the entropy of ``self.labels``; the first column with the strictly
    largest positive gain wins.  ``self.dim_split`` stores its index and
    ``self.feature_uniq_split`` its sorted distinct values (a NumPy array).
    One child ``TreeNode`` per distinct value is appended to
    ``self.children`` with the split column removed from copies of the rows,
    and splittable children are split recursively.

    When no column has a positive gain (including when there are no columns)
    the node is marked ``splittable = False`` and gets no children.
    """
    columns = np.transpose(self.features)
    unique_labels = np.unique(self.labels)
    num_examples = len(self.labels)

    parent_entropy = 0
    for label in unique_labels:
        fraction = float(self.labels.count(label)) / num_examples
        parent_entropy += -fraction * np.log2(fraction)

    labels = np.array(self.labels)
    best_gain = 0
    best_dim = None
    for dim, column in enumerate(columns):
        # Count by the labels actually present rather than assuming they
        # are the integers 0..num_cls-1.
        branches = [
            [int(np.sum((column == value) & (labels == label)))
             for label in unique_labels]
            for value in np.unique(column)
        ]
        gain = Util.Information_Gain(parent_entropy, branches)
        if gain > best_gain:
            best_gain = gain
            best_dim = dim

    if best_dim is None:
        self.splittable = False
        return

    self.dim_split = best_dim
    # Use the distinct values of the chosen column, not of the last one scored.
    split_values = np.unique(columns[best_dim])
    self.feature_uniq_split = split_values

    for value in split_values:
        child_features = []
        child_labels = []
        for index, feature_value in enumerate(columns[best_dim]):
            if feature_value == value:
                # Copy the row so the parent's features are left untouched.
                row = list(self.features[index])
                del row[best_dim]
                child_features.append(row)
                child_labels.append(self.labels[index])
        self.children.append(
            TreeNode(child_features, child_labels,
                     np.unique(child_labels).size))

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split this node on the attribute with the highest information gain.

    Returns immediately for an unsplittable node.  A node with no samples or
    no attribute columns is marked unsplittable.  Otherwise every column is
    scored with ``Util.Information_Gain``; ties go to the column with more
    distinct values.  If the gains of all columns sum to zero the node
    becomes a leaf.  Otherwise ``self.dim_split`` and
    ``self.feature_uniq_split`` (a NumPy array of sorted distinct values)
    record the choice, one child ``TreeNode`` per value is appended to
    ``self.children`` with the split column removed, and ``split`` is called
    on every child.

    A child left without attribute columns is marked unsplittable and its
    ``cls_max`` is set to its most frequent label.
    """
    if not self.splittable:
        return
    # Check for "no samples" first: features[0] does not exist in that case.
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    labels = self.labels
    unique_labels = np.unique(labels)
    entropy = 0.0
    for label in unique_labels:
        count = labels.count(label)
        if count == 0:
            continue
        fraction = count / len(labels)
        entropy -= fraction * np.log2(fraction)

    label_position = {label: pos for pos, label in enumerate(unique_labels)}
    best_gain = -1
    best_value_count = -1
    gain_sum = 0.0
    for dim in range(len(self.features[0])):
        column = [row[dim] for row in self.features]
        values = np.unique(column)
        value_position = {value: pos for pos, value in enumerate(values)}
        branches = [[0] * len(unique_labels) for _ in values]
        for feature_value, label in zip(column, labels):
            if feature_value in value_position:
                branches[value_position[feature_value]][
                    label_position[label]] += 1

        gain = Util.Information_Gain(entropy, branches)
        gain_sum += gain
        if gain > best_gain or (gain == best_gain
                                and len(values) > best_value_count):
            best_gain = gain
            best_value_count = len(values)
            self.dim_split = dim
            self.feature_uniq_split = values

    if gain_sum == 0:
        self.splittable = False
        return

    split_column = [row[self.dim_split] for row in self.features]
    for value in self.feature_uniq_split:
        child_rows = []
        child_labels = []
        for row, feature_value, label in zip(self.features, split_column,
                                             labels):
            if value == feature_value:
                child_rows.append(row)
                child_labels.append(label)
        child_features = np.delete(np.asarray(child_rows), self.dim_split,
                                   axis=1).tolist()

        child = TreeNode(child_features, child_labels,
                         len(np.unique(child_labels)))
        if len(child_features) == 0:
            child.cls_max = self.cls_max
            child.splittable = False
        elif len(child_features[0]) == 0:
            # No attributes left: predict the child's majority label.
            count_max = 0
            for label in np.unique(child_labels):
                if child_labels.count(label) > count_max:
                    count_max = child_labels.count(label)
                    child.cls_max = label
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        child.split()
def split(self):
    """Split on the highest-gain attribute and recurse into every child.

    Nothing happens for an unsplittable node or a node whose rows have no
    attribute columns left.  Otherwise each column is scored with
    ``Util.Information_Gain`` against ``self.entropy_root()``.  If several
    columns share the maximum gain, the one with the most distinct values is
    used (lowest index on a further tie).  For each distinct value of the
    chosen column, in ascending order, a child ``TreeNode`` is built from the
    matching rows with that column removed, given ``self.index`` extended by
    the value's position, split immediately, and appended to
    ``self.children``; the value itself is appended to
    ``self.feature_uniq_split``.
    """
    if not self.splittable:
        return
    if len(self.features[0]) == 0:
        # No attributes left; the node keeps its majority class.
        return

    sorted_columns = np.sort(np.array(self.features).transpose())
    distinct_per_column = [np.unique(column) for column in sorted_columns]

    gains = []
    for dim, distinct_values in enumerate(distinct_per_column):
        branches = []
        for candidate in distinct_values:
            per_class = {label: 0 for label in sorted(np.unique(self.labels))}
            for position, sample in enumerate(self.features):
                if sample[dim] == candidate:
                    per_class[self.labels[position]] += 1
            branches.append(list(per_class.values()))
        gains.append(Util.Information_Gain(self.entropy_root(), branches))

    top_gain = max(gains)
    tied_dims = [dim for dim, gain in enumerate(gains) if gain == top_gain]

    columns = np.array(self.features).transpose()
    if len(tied_dims) == 1:
        self.dim_split = tied_dims[0]
    elif len(tied_dims) > 1:
        # max() returns the first maximal element, so the lowest index wins
        # among columns with equally many distinct values.
        self.dim_split = max(
            tied_dims, key=lambda dim: len(np.unique(columns[dim])))

    ordered_values = sorted(np.unique(columns[self.dim_split]))
    for position, branch_value in enumerate(ordered_values):
        branch_features = []
        branch_labels = []
        for row_number, sample in enumerate(self.features):
            if sample[self.dim_split] != branch_value:
                continue
            trimmed = list(sample)
            trimmed.pop(self.dim_split)
            branch_features.append(trimmed)
            branch_labels.append(self.labels[row_number])

        child_path = self.index.copy()
        child_path.append(position)
        child = TreeNode(branch_features, branch_labels,
                         len(np.unique(branch_labels)), child_path)
        child.split()
        self.children.append(child)
        self.feature_uniq_split.append(branch_value)
def split(self):
    """Split on the highest-gain attribute, recurse, and return this node.

    ``self.features`` is converted to a NumPy array in place.  A node with a
    single class or no feature data becomes a leaf.  Otherwise each column is
    scored with ``Util.Information_Gain``; the winner has the highest gain,
    then the most distinct values, then the lowest index, and is stored in
    ``self.dim_split``.  If every gain is zero the node becomes a leaf.
    Otherwise one child ``TreeNode`` per distinct value of the chosen column
    is created from the matching rows (split column removed), its
    ``feature_uniq_split`` is set to that value, and the result of the
    child's own ``split()`` is appended to ``self.children``.

    Returns:
        This node, so that a parent can append the result of ``child.split()``.
    """
    # NOTE(review): "lables" is misspelled and is never read in this method;
    # it is kept only because other code may rely on the attribute.
    self.lables = np.array(self.labels)
    self.features = np.array(self.features)

    if self.num_cls == 1 or self.features.size == 0:
        self.splittable = False
        return self

    features = self.features
    labels = self.labels
    label_array = np.array(labels)
    unique_labels = np.unique(labels)

    entropy = 0
    for label in unique_labels:
        fraction = labels.count(label) / len(labels)
        if fraction > 0:
            entropy += -fraction * np.log2(fraction)

    candidates = []
    for dim in range(features.shape[1]):
        column = features[:, dim]
        values = np.unique(column)
        branches = [
            [int(np.sum(label_array[column == value] == label))
             for label in unique_labels]
            for value in values
        ]
        gain = Util.Information_Gain(entropy, branches)
        candidates.append((gain, values.size, dim))

    # Highest gain, then most distinct values, then lowest column index.
    best = max(candidates, key=lambda item: (item[0], item[1], -item[2]))
    self.dim_split = best[2]

    if all(gain == 0 for gain, _, _ in candidates):
        self.splittable = False
        return self

    split_column = features[:, self.dim_split]
    for value in np.unique(split_column):
        # A boolean mask selects the rows in one pass instead of testing
        # every row index for membership in an index array.
        mask = split_column == value
        child_features = np.delete(features[mask], self.dim_split, axis=1)
        child_labels = [label for label, keep in zip(labels, mask) if keep]
        child = TreeNode(child_features, child_labels,
                         np.unique(child_labels).size)
        child.feature_uniq_split = value
        self.children.append(child.split())
    return self
# Manual smoke test: prints entropy / information gain for a hand-made
# example, then trains a decision tree on the sample data and predicts on the
# sample test set.
import data
import hw1_dt as decision_tree
import utils as Utils
from sklearn.metrics import accuracy_score
import numpy as np

# Information-gain check on a two-class parent split into two branches.
root = [8, 12]
branches = [[5, 2], [3, 10]]
# NOTE(review): despite the "IG root" label, this value comes from
# get_entropy, i.e. it is presumably the parent's entropy, not a gain.
igRoot = Utils.get_entropy(root)
print("IG root", igRoot)
print("IG branches", Utils.Information_Gain(igRoot, branches))

# Training data.
features, labels = data.sample_decision_tree_data()
print(features)
print(labels)

# Test data.
X_test, y_test = data.sample_decision_tree_test()
print(X_test)
print(y_test)

# Build the tree.
dTree = decision_tree.DecisionTree()
dTree.train(features, labels)

# Print the trained tree.
Utils.print_tree(dTree)

# Predict on the test set; the predictions are not compared with y_test here.
y_est_test = dTree.predict(X_test)
def split(self):
    """Split this node on the attribute with the highest information gain.

    Each column of ``self.features`` is scored with ``Util.Information_Gain``
    against the entropy of ``self.labels``; ties go to the column with more
    distinct values.  ``self.dim_split`` and ``self.feature_uniq_split`` (a
    NumPy array of sorted distinct values) record the choice.  One child
    ``TreeNode`` per value is appended to ``self.children`` with the split
    column removed and ``self.num_cls`` passed through unchanged; a child
    with no attributes left is marked unsplittable.  Splittable children are
    split recursively.

    A node with no samples, no attribute columns, or no selectable attribute
    is marked ``splittable = False`` and gets no children.
    """
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    unique_labels, label_counts = np.unique(self.labels, return_counts=True)
    total = np.sum([int(count) for count in label_counts])
    entropy_parent = np.sum([
        -1 * (int(count) / total) * np.log2(int(count) / total)
        for count in label_counts
    ])

    label_position = {label: pos for pos, label in enumerate(unique_labels)}
    best_gain = -1
    best_dim = None
    best_values = None
    for dim in range(len(self.features[0])):
        column = [row[dim] for row in self.features]
        values = np.unique(column)
        # Dict lookups replace list.index() calls inside the sample loop.
        value_position = {value: pos for pos, value in enumerate(values)}
        branches = np.zeros((len(values), len(unique_labels)), dtype='int')
        for feature_value, label in zip(column, self.labels):
            branches[value_position[feature_value]][
                label_position[label]] += 1

        gain = Util.Information_Gain(entropy_parent, branches)
        if gain > best_gain or (best_values is not None
                                and gain == best_gain
                                and len(values) > len(best_values)):
            best_gain = gain
            best_dim = dim
            best_values = values

    if best_dim is None:
        self.splittable = False
        return
    self.dim_split = best_dim
    self.feature_uniq_split = best_values

    for value in self.feature_uniq_split:
        child_features = []
        child_labels = []
        for row, label in zip(self.features, self.labels):
            if row[best_dim] == value:
                child_features.append(row[:best_dim] + row[best_dim + 1:])
                child_labels.append(label)
        child = TreeNode(child_features, child_labels, self.num_cls)
        # all() over an empty row is True, so a child whose rows have no
        # attributes left is never split further.
        if all(entry is None for entry in child.features[0]):
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split this node on the best attribute that is not yet masked out.

    Used attributes are not removed from the rows; their entries are replaced
    by ``None`` in the rows handed to children, and any column containing
    ``None`` is skipped here.  Each remaining column is scored with
    ``Util.Information_Gain``; ties go to the column with more distinct
    values.  ``self.dim_split`` and ``self.feature_uniq_split`` record the
    choice, one child ``TreeNode`` per distinct value is appended to
    ``self.children`` (with ``self.num_cls`` passed through), and splittable
    children are split recursively.  A child whose first row is entirely
    ``None`` is marked unsplittable.

    Labels are used directly as column indices of the branch-count matrix, so
    they are assumed to be integers in ``0..num_cls``.

    If no column is usable the node is marked ``splittable = False``.
    """
    features = np.array(self.features)
    labels = np.array(self.labels)

    # Entropy of the node's labels; identical for every attribute.
    total_entropy = 0
    for count in np.unique(labels, return_counts=True)[1]:
        fraction = float(count) / len(self.labels)
        total_entropy += fraction * np.log2(1 / fraction)

    best_gain = -9999
    best_value_count = -1
    best_dim = None
    best_values = None
    for dim in range(len(self.features[0])):
        column = features[:, dim]
        if None in column:
            # This attribute was consumed by an ancestor.
            continue
        values = np.unique(column)
        branches = np.zeros((len(values), self.num_cls + 1))
        for row, value in enumerate(values):
            for label in labels[column == value]:
                branches[row, label] += 1

        gain = Util.Information_Gain(total_entropy, branches)
        if gain > best_gain or (gain == best_gain
                                and values.shape[0] > best_value_count):
            best_gain = gain
            best_value_count = values.shape[0]
            best_dim = dim
            best_values = values

    if best_dim is None:
        self.splittable = False
        return
    self.dim_split = best_dim
    self.feature_uniq_split = best_values.tolist()

    split_column = features[:, best_dim]
    masked = np.array(self.features, dtype=object)
    masked[:, best_dim] = None
    for value in self.feature_uniq_split:
        index = np.where(split_column == value)
        child_features = masked[index].tolist()
        child_labels = labels[index].tolist()
        child = TreeNode(child_features, child_labels, self.num_cls)
        if (np.array(child_features).size == 0
                or all(entry is None for entry in child_features[0])):
            child.splittable = False
        self.children.append(child)

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split this node on the attribute with the highest information gain.

    Columns with a single distinct value are ignored.  Every other column is
    scored with ``Util.Information_Gain`` against the entropy of
    ``self.labels``; ties go to the column with more distinct values, then to
    the lower index.  ``self.dim_split`` and ``self.feature_uniq_split`` (a
    NumPy array of sorted distinct values) record the choice.  One child
    ``TreeNode`` per value is appended to ``self.children``, built from the
    matching rows with the split column removed (as a NumPy array), and
    splittable children are split recursively.

    The node is marked ``splittable = False`` when features or labels are
    missing, when there are no samples or columns, or when no column has more
    than one distinct value.
    """
    if self.features is None or self.labels is None:
        self.splittable = False
        return
    # Test for "no samples" before touching features[0].
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    labels = self.labels
    total_labels = len(labels)
    unique_labels = np.unique(np.array(labels))
    ordered_labels = sorted(unique_labels)

    entropy = 0
    for label in unique_labels:
        count = labels.count(label)
        if count != 0 and total_labels != 0:
            fraction = count / total_labels
            entropy = entropy - fraction * np.log2(fraction)

    features = np.array(self.features)
    best_gain = -np.inf
    best_dim = None
    best_values = None
    for dim in range(len(self.features[0])):
        values = np.unique(features[:, dim])
        if len(values) == 1:
            continue
        branches = []
        for value in sorted(values):
            branch_labels = [
                label for row, label in zip(self.features, labels)
                if row[dim] == value and row[dim] is not None
            ]
            branches.append(
                [branch_labels.count(label) for label in ordered_labels])

        gain = Util.Information_Gain(entropy, branches)
        # Columns are visited in ascending order, so on a full tie the
        # earlier (lower) index is kept.
        if gain > best_gain or (best_values is not None
                                and gain == best_gain
                                and len(best_values) < len(values)):
            best_gain = gain
            best_dim = dim
            best_values = values

    if best_dim is None:
        # No column can separate the samples: this node is a leaf.
        self.splittable = False
        return
    self.dim_split = best_dim
    self.feature_uniq_split = best_values

    for value in self.feature_uniq_split:
        child_rows = []
        child_labels = []
        for row, label in zip(self.features, labels):
            if row[best_dim] == value:
                child_rows.append(row)
                child_labels.append(label)
        child_features = np.delete(np.array(child_rows), best_dim, 1)
        self.children.append(
            TreeNode(child_features, child_labels,
                     len(np.unique(child_labels))))

    for child in self.children:
        if child.splittable:
            child.split()
def split(self):
    """Split this node on the attribute with the highest information gain.

    Does nothing for an unsplittable node.  Each column of ``self.features``
    is scored with ``Util.Information_Gain``; ties go to the column producing
    more branches, then to the lower index.  ``self.dim_split`` stores the
    winner and ``self.feature_uniq_split`` its distinct values in order of
    first appearance.  ``self.children`` receives one ``TreeNode`` per value
    in that same order, built from the matching rows with the split column
    removed.  A child with no attributes left is marked unsplittable; any
    other child is split immediately.

    A node without attribute columns is marked ``splittable = False``.
    """
    if not self.splittable:
        return

    columns = np.array(self.features).T.tolist()
    num_samples = len(self.features)
    if len(columns) == 0:
        # No attribute to split on.
        self.splittable = False
        return

    entropy = 0
    for count in np.unique(self.labels, return_counts=True)[1]:
        possibility = count / num_samples
        if possibility != 0:
            entropy -= possibility * np.log2(possibility)

    def group_by(dim):
        """Group rows and labels by their value in column ``dim``."""
        rows_by_value = {}
        labels_by_value = {}
        for value, row, label in zip(columns[dim], self.features,
                                     self.labels):
            rows_by_value.setdefault(value, []).append(row)
            labels_by_value.setdefault(value, []).append(label)
        return rows_by_value, labels_by_value

    best_dim = None
    best_gain = None
    best_groups = None
    for dim in range(len(columns)):
        groups = group_by(dim)
        branch_counts = [
            np.unique(branch_labels, return_counts=True)[1].tolist()
            for branch_labels in groups[1].values()
        ]
        gain = Util.Information_Gain(entropy, branch_counts)
        if (best_dim is None or gain > best_gain
                or (gain == best_gain
                    and len(groups[0]) > len(best_groups[0]))):
            best_dim = dim
            best_gain = gain
            best_groups = groups

    rows_by_value, labels_by_value = best_groups
    self.dim_split = best_dim
    # Dict keys preserve first-appearance order, matching the child order.
    self.feature_uniq_split = list(rows_by_value)

    for value in self.feature_uniq_split:
        child_labels = labels_by_value[value]
        child_features = np.delete(np.array(rows_by_value[value]), best_dim,
                                   axis=1).tolist()
        # num_cls is the number of distinct classes, not the sample count.
        child = TreeNode(features=child_features, labels=child_labels,
                         num_cls=len(set(child_labels)))
        if len(child_features[0]) <= 0:
            # Attributes ran out.
            child.splittable = False
        else:
            child.split()
        self.children.append(child)
def __init__(self, features, labels, num_cls):
    """Create a tree node, choose its split attribute, and split if useful.

    Args:
        features: List[List[any]], one row of attribute values per sample.
        labels: List[int], the class label of each sample.
        num_cls: int, number of classes; used as the length of every
            per-class count list.

    The constructor stores the inputs, initialises the pruning bookkeeping
    attributes, and sets ``cls_max`` to the most frequent label.  The node is
    splittable only if it has at least two distinct labels and at least one
    sample with at least one attribute.  For such nodes every attribute is
    scored with ``Util.Information_Gain``; ties go to the attribute with more
    distinct values.  ``dim_split`` and ``feature_uniq_split`` (sorted
    distinct values) record the choice, and ``self.split()`` is called when
    the best gain is positive.
    """
    self.features = features
    self.labels = labels
    self.children = []
    self.num_cls = num_cls
    self.children_with_attributes = None
    self.dim_split = None
    # Defined up front so it also exists on nodes that return early below.
    self.feature_uniq_split = None
    self.attribute_val = None

    # Attributes for pruning.
    self.expectedLabels = []
    self.expectedLabelMap = dict()
    self.trainingLabelsCountMap = dict()
    self.currentExpectedLabel = None
    self.correct_predictions = 0
    self.parentNode = None

    # Find the most common label in the current node.
    count_max = 0
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    for label, count in zip(unique_labels, label_counts):
        if count > count_max:
            count_max = count
            self.cls_max = label

    # Not splittable when all samples belong to one class.
    self.splittable = len(unique_labels) >= 2

    # Check for "no samples" first: features[0] does not exist in that case.
    if len(self.features) == 0 or len(self.features[0]) == 0:
        self.splittable = False
        return

    indexMap = self.getIndexMap(np.unique(labels))
    label_totals = [0] * num_cls
    for label in labels:
        label_totals[indexMap.get(label)] += 1
        if self.trainingLabelsCountMap.get(label) is None:
            self.trainingLabelsCountMap[label] = 1
        else:
            self.trainingLabelsCountMap[label] += 1
    entropy = Util.entropy(len(labels), label_totals)

    max_ig = -1
    feature_index = None
    max_num_attributes = []
    for attribute in range(len(features[0])):
        # values[attribute value][label] = number of matching samples,
        # e.g. {'a': {0: 1}, 'b': {0: 1, 1: 1}, 'c': {1: 1}}
        values = dict()
        for row, label in zip(features, labels):
            label_counts_for_value = values.setdefault(row[attribute], dict())
            label_counts_for_value[label] = (
                label_counts_for_value.get(label, 0) + 1)

        num_attributes = np.sort(list(values.keys()))
        branches = []
        for label_counts_for_value in values.values():
            counts = [0] * num_cls
            for label, count in label_counts_for_value.items():
                counts[indexMap.get(label)] = count
            branches.append(counts)

        ig = Util.Information_Gain(entropy, branches)
        if ig > max_ig or (ig == max_ig
                           and len(num_attributes) > len(max_num_attributes)):
            max_ig = ig
            feature_index = attribute
            max_num_attributes = num_attributes

    self.dim_split = feature_index  # the index of the feature to be split
    self.feature_uniq_split = max_num_attributes

    if max_ig > 0 and self.splittable:
        self.split()