def build_tree_helper(used, data, varnames, l, curr_ent, max_class):
    # Everything is the same, return the class of the first data point as a leaf
    if curr_ent == 0:
        return node.Leaf(varnames, data[0][l])
    # We can assume this is non-zero since it is handled by the wrapper function
    feat, ig = split_on_variable(used, data)
    zeroes, ones = partition(feat, data)
    # There are no more valid features (all splits were bad): return the most common class
    if len(used) == l or ig == 0:
        return node.Leaf(varnames, max_class)
    # This split fully explains the classes of the data
    if ig == curr_ent:
        if data[0][feat] == data[0][l]:
            return node.Split(varnames, feat,
                              node.Leaf(varnames, 0), node.Leaf(varnames, 1))
        else:
            return node.Split(varnames, feat,
                              node.Leaf(varnames, 1), node.Leaf(varnames, 0))
    z_ent = my_entropy(l, zeroes)
    o_ent = my_entropy(l, ones)
    # True copy of the Python list
    u = used[:]
    u.append(feat)
    # Recursive case
    return node.Split(
        varnames, feat,
        build_tree_helper(u, zeroes, varnames, l, z_ent, max_class),
        build_tree_helper(u, ones, varnames, l, o_ent, max_class))

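# The helper above calls `my_entropy(l, data)`, which is not shown in this
# collection. A minimal sketch under the assumption that it computes the binary
# entropy (in bits) of the class column at index l:
import math

def my_entropy(l, data):
    if len(data) == 0:
        return 0.0
    p = sum(row[l] for row in data) / float(len(data))
    if p == 0 or p == 1:
        return 0.0
    return -p * math.log(p, 2) - (1 - p) * math.log(1 - p, 2)
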
def build_tree(data, varnames):
    py, total = collect_count(data)
    guess = float(py) / total
    if guess == 1 or (len(varnames) == 1 and guess > 0.5):
        return node.Leaf(varnames, 1)
    elif guess == 0 or (len(varnames) == 1 and guess <= 0.5):
        return node.Leaf(varnames, 0)
    gain, index = find_split(data, varnames, py, total)
    if gain == 0:
        if guess > 0.5:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)
    left = []
    right = []
    for row in data:
        if row[index] == 0:
            left.append(row)
        else:
            right.append(row)
    return node.Split(varnames, index,
                      build_tree(left, varnames),
                      build_tree(right, varnames))

def build_tree(data, varnames, visited):
    guess = get_most_label(data)  # most common label in the data
    # Base cases 1 and 2: all examples are 0, or all examples are 1
    if check_pure_example(data, 0):
        return node.Leaf(varnames, 0)
    elif check_pure_example(data, 1):
        return node.Leaf(varnames, 1)
    else:
        # Get the best attribute
        (att_index, threshold) = calculate_entropy(data, varnames, visited)
        # Stop and guess the majority label if the threshold is hit
        if threshold:
            return node.Leaf(varnames, guess)
        # Mark the attribute as used
        visited[att_index] = True
        # Divide the data on attribute values 0 and 1; copy visited so the
        # two branches do not share one mutable list
        no = get_data_subset(data, 0, att_index)
        yes = get_data_subset(data, 1, att_index)
        left = build_tree(no, varnames, visited[:])
        right = build_tree(yes, varnames, visited[:])
        # node.Split takes the attribute index, not its name
        return node.Split(varnames, att_index, left, right)

def build_tree(data, varnames):
    total = len(data)
    var_len = len(varnames) - 1
    # Base cases
    if total == 0:
        return None
    if total == 1:
        return node.Leaf(varnames, data[0][var_len])
    if var_len == 0:
        # No attributes left; predict the majority class
        pos = sum(row[-1] for row in data)
        return node.Leaf(varnames, 1 if pos * 2 >= total else 0)
    # Do the counting needed to calculate info gain
    py, pxi_list, py_pxi_list = count_collect(data, varnames)
    # Calculate all info gains
    gain_list = [0] * var_len
    for i in range(var_len):
        gain_list[i] = infogain(py_pxi_list[i], pxi_list[i], py, total)
    # Find the max info gain
    max_gain_pos = split_on_variable(var_len, gain_list)
    if gain_list[max_gain_pos] == 0:
        # No informative split remains; predict the majority class
        pos = sum(row[-1] for row in data)
        return node.Leaf(varnames, 1 if pos * 2 >= total else 0)
    # Partition and recurse. The chosen column is constant within each half,
    # so its gain is 0 from here on and it will not be chosen again.
    data_l, data_r = partition(data, max_gain_pos)
    return node.Split(varnames, max_gain_pos,
                      build_tree(data_l, varnames),
                      build_tree(data_r, varnames))

def build_tree(data, varnames):
    localdata = data[:]
    localVarNames = varnames[:]
    XPosYPos = count_var(localdata)
    totalPosY = count_Y(localdata)
    total = len(localdata)
    # Information gain per variable (super inefficient; maybe change this)
    listofVarGain = [0] * (len(localVarNames) - 1)
    for ind, (XVal, YVal) in enumerate(XPosYPos):
        # Skip the class column itself (the last entry)
        if ind != len(XPosYPos) - 1:
            listofVarGain[ind] = infogain(YVal, XVal, totalPosY, total)
    indextoremove = listofVarGain.index(max(listofVarGain))
    if max(listofVarGain) == 0:
        # No informative split; predict the majority class
        if totalPosY >= total - totalPosY:
            return node.Leaf(localVarNames, 1)
        else:
            return node.Leaf(localVarNames, 0)
    # Split the data, tracking the positive count on each side
    leftData = []
    rightData = []
    countRight = 0
    countLeft = 0
    newVarNames = localVarNames[:]
    for datarow in localdata:
        if datarow[indextoremove] == 0:
            countLeft += datarow[-1]
            leftData.append(datarow)
        else:
            countRight += datarow[-1]
            rightData.append(datarow)
    # Turn any pure side into a leaf directly; recurse on the rest
    if countRight == 0 and countLeft == len(leftData):
        return node.Split(localVarNames, indextoremove,
                          node.Leaf(localVarNames, 1), node.Leaf(localVarNames, 0))
    elif countRight == len(rightData) and countLeft == 0:
        return node.Split(localVarNames, indextoremove,
                          node.Leaf(localVarNames, 0), node.Leaf(localVarNames, 1))
    elif countRight == 0:
        return node.Split(localVarNames, indextoremove,
                          build_tree(leftData, newVarNames),
                          node.Leaf(localVarNames, 0))
    elif countRight == len(rightData):
        return node.Split(localVarNames, indextoremove,
                          build_tree(leftData, newVarNames),
                          node.Leaf(localVarNames, 1))
    elif countLeft == 0:
        return node.Split(localVarNames, indextoremove,
                          node.Leaf(localVarNames, 0),
                          build_tree(rightData, newVarNames))
    elif countLeft == len(leftData):
        return node.Split(localVarNames, indextoremove,
                          node.Leaf(localVarNames, 1),
                          build_tree(rightData, newVarNames))
    else:
        return node.Split(localVarNames, indextoremove,
                          build_tree(leftData, newVarNames),
                          build_tree(rightData, newVarNames))

def build_tree(data, varnames, used_attributes=None):
    # A mutable default argument would be shared across calls; make a fresh list
    if used_attributes is None:
        used_attributes = []
    # Every attribute has been used; predict the more common class
    if len(data[0]) == len(used_attributes):
        return node.Leaf(varnames, find_bigger(data))
    split_on = highest_info_gain(data, used_attributes)
    used_attributes.append(split_on)
    left, right = partition_data(data, split_on)
    l_pure, l_val = check_if_pure(left)
    r_pure, r_val = check_if_pure(right)
    if l_pure and r_pure:
        return node.Split(varnames, split_on,
                          node.Leaf(varnames, l_val),
                          node.Leaf(varnames, r_val))
    elif l_pure:
        return node.Split(varnames, split_on,
                          node.Leaf(varnames, l_val),
                          build_tree(right, varnames, copy.deepcopy(used_attributes)))
    elif r_pure:
        return node.Split(varnames, split_on,
                          build_tree(left, varnames, copy.deepcopy(used_attributes)),
                          node.Leaf(varnames, r_val))
    else:
        return node.Split(varnames, split_on,
                          build_tree(left, varnames, copy.deepcopy(used_attributes)),
                          build_tree(right, varnames, copy.deepcopy(used_attributes)))

def build_tree(data, varnames):
    py_pxi = 0
    pxi = 0
    py = 0
    total = len(data)
    # Count the positive examples
    for row in data:
        if row[-1] == 1:
            py += 1
    guess = py / (total * 1.0)
    if guess == 1:
        return node.Leaf(varnames, 1)
    elif guess == 0:
        return node.Leaf(varnames, 0)
    if len(varnames) == 1:
        return node.Leaf(varnames, 1 if guess > 0.5 else 0)
    # Find the attribute with the highest information gain
    gain = 0
    index = 0
    for i in range(len(varnames) - 1):
        for row in data:
            if row[i] == 1:
                pxi += 1
                if row[-1] == 1:
                    py_pxi += 1
        ig = infogain(py_pxi, pxi, py, total)
        if ig > gain:
            gain = ig
            index = i
        py_pxi = 0
        pxi = 0
    if gain == 0:
        return node.Leaf(varnames, 1 if guess > 0.5 else 0)
    # Divide the data on the chosen attribute
    data0 = []
    data1 = []
    for row in data:
        if row[index] == 0:
            data0.append(row)
        else:
            data1.append(row)
    return node.Split(varnames, index,
                      build_tree(data0, varnames),
                      build_tree(data1, varnames))

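# Several implementations here call infogain(py_pxi, pxi, py, total) without
# showing it. A sketch under the assumed contract visible at the call sites:
# py_pxi = examples with x_i = 1 and y = 1, pxi = examples with x_i = 1,
# py = examples with y = 1, total = all examples. Not the official helper.
import math

def _entropy(p):
    if p == 0 or p == 1:
        return 0.0
    return -p * math.log(p, 2) - (1 - p) * math.log(1 - p, 2)

def infogain(py_pxi, pxi, py, total):
    if total == 0:
        return 0.0
    # H(Y) minus the weighted entropy of Y within each branch of x_i
    h_y = _entropy(float(py) / total)
    h_pos = _entropy(float(py_pxi) / pxi) if pxi > 0 else 0.0
    neg = total - pxi
    h_neg = _entropy(float(py - py_pxi) / neg) if neg > 0 else 0.0
    return h_y - (float(pxi) / total) * h_pos - (float(neg) / total) * h_neg
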
def build_tree(data, varnames):
    total = len(data)
    py = getpy(data)
    # Pure node: every label is 0, or every label is 1
    if py == 0:
        return node.Leaf(varnames, 0)
    elif py == total:
        return node.Leaf(varnames, 1)
    else:
        (positivedata, negativedata) = splitdata(data, varnames)
        index = findbestindex(data, varnames)
        # No usable split; predict the majority class
        if index == -1:
            return node.Leaf(varnames, 1 if py * 2 >= total else 0)
        return node.Split(varnames, index,
                          build_tree(negativedata, varnames),
                          build_tree(positivedata, varnames))

def build_tree(data, ingredients, labels):
    # Count how many examples carry each label
    guess = [0] * len(labels)
    for item in data:
        guess[item[-1]] += 1
    # Track the most common label and how many distinct labels appear
    g = 0
    gIndex = 0
    oneLabel = 0
    for i, count in enumerate(guess):
        if count > 0:
            oneLabel += 1
        if count > g:
            g = count
            gIndex = i
    # A feature is still available if its name has not been blanked out
    moreFeatures = any(i != "" for i in ingredients)
    if oneLabel <= 1:
        # Only one label remains in the data
        return node.Leaf(ingredients, gIndex)
    elif not moreFeatures:
        return node.Leaf(ingredients, gIndex)
    else:
        # Find the best variable to split on
        (value, index) = bestVar(data, ingredients, labels)
        if value <= 0:
            return node.Leaf(ingredients, gIndex)
        (no, yes) = partition(data, index)
        # Blank out the chosen feature so it is not reused below this node
        var = list(ingredients)
        var[index] = ""
        left = build_tree(no, var, labels)
        right = build_tree(yes, var, labels)
        return node.Split(ingredients, index, left, right)

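# The function above uses `partition(data, index)` without defining it. A
# sketch of the contract inferred from the call site `(no, yes) = partition(...)`:
def partition(data, index):
    # Split the rows on the binary value at the given attribute index
    no = [row for row in data if row[index] == 0]
    yes = [row for row in data if row[index] == 1]
    return (no, yes)
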
def build_tree(data, varnames, depth):
    # A split produced an empty branch; nothing to build here
    if len(data) == 0:
        return None
    # Compute the max-gain split
    split_index = compute_max_gain(data, varnames)
    # Base case: no informative split, so return whichever class is more common
    if split_index == -1:
        pos, neg = collect_counts(data)
        if pos > neg:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)
    # Split the data at the max-gain attribute and build each subtree
    l_data, r_data = split_data(data, split_index)
    L_tree = build_tree(l_data, varnames, depth + 1)
    R_tree = build_tree(r_data, varnames, depth + 1)
    return node.Split(varnames, split_index, L_tree, R_tree)

def build_tree(data, varnames):
    # Pure-node check: counts[2] is the positive count, counts[3] the total
    counts = var_counts(data, 0)
    if counts[2] == counts[3]:
        return node.Leaf(varnames, 1)
    elif counts[2] == 0:
        return node.Leaf(varnames, 0)
    best_split = split_on(data, varnames)
    if best_split is None:
        return node.Leaf(varnames, 1)
    left_split, right_split = partition(data, varnames, best_split)
    return node.Split(varnames, best_split,
                      build_tree(left_split, varnames),
                      build_tree(right_split, varnames))

def build_tree(data, varnames):
    # Count positive labels to detect a pure node
    label = 0
    total = 0
    for item in data:
        total += 1
        if item[-1] == 1:
            label += 1
    if label == total:
        return node.Leaf(varnames, 1)
    elif label == 0:
        return node.Leaf(varnames, 0)
    # Pick the attribute with the highest information gain
    gain = 0
    best_split = None
    for i in range(len(varnames) - 1):
        counts = count(data, i)
        temp = infogain(counts[0], counts[1], counts[2], counts[3])
        if temp > gain:
            gain = temp
            best_split = i
    # Every split has zero gain; fall back to a leaf
    if best_split is None:
        return node.Leaf(varnames, 1)
    left_split, right_split = partition(data, varnames, best_split)
    return node.Split(varnames, best_split,
                      build_tree(left_split, varnames),
                      build_tree(right_split, varnames))

def build_tree(data, varnames):
    succ = findSuccess(data)  # count of positive examples
    pred = float(succ) / len(data)  # predict from the ratio of positives to total
    # If only the class attribute is left, fall back to the majority guess
    if len(varnames) == 1:
        return splitr(pred, varnames)
    # Information gain, tallied per attribute; keep the largest
    info = 0
    place = 0
    for i in range(len(varnames) - 1):
        # posx: entries with a 1 at index i
        # pos_x_y: entries with a 1 at both index i and the class index
        res = findattr(data, i)
        posx = res[0]
        pos_x_y = res[1]
        calced = infogain(pos_x_y, posx, succ, len(data))
        if calced > info:
            info = calced
            place = i
    if info == 0:
        return splitr(pred, varnames)
    # Split each example into the left or right branch
    left = []
    right = []
    for row in data:
        if row[place] == 0:
            left.append(row)
        else:
            right.append(row)
    # Recursive calls continue the tree
    return node.Split(varnames, place,
                      build_tree(left, varnames),
                      build_tree(right, varnames))

def build_tree(data, varnames):
    best_candidate = maxgain(data)
    # Base case: all the data share one classification, so a leaf node
    # classifies this root-to-leaf path
    if best_candidate == -1:
        for entry in data:
            if entry[-1] == 1:
                return node.Leaf(varnames, 1)
        return node.Leaf(varnames, 0)
    # Recursive step: split the data into positive- and negative-classification
    # lists, then build the left and right subtrees by calling build_tree again
    else:
        positive_data = []
        negative_data = []
        for entry in data:
            if entry[best_candidate] == 1:
                positive_data.append(entry)
            else:
                negative_data.append(entry)
        # Trees are by definition recursive, so after splitting the data we
        # continue building. The autograder checks left -> zero, right -> one.
        left_node = build_tree(negative_data, varnames)
        right_node = build_tree(positive_data, varnames)
        return node.Split(varnames, best_candidate, left_node, right_node)

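# A sketch of what `maxgain` is assumed to do above: return -1 when no split
# improves on the current node (which covers the pure-data base case, since a
# pure node has zero entropy), otherwise the index of the best attribute. It
# reuses the infogain sketch given earlier; both are assumptions, not the
# submission's actual helper.
def maxgain(data):
    total = len(data)
    py = sum(row[-1] for row in data)
    best_gain, best_index = 0.0, -1
    for i in range(len(data[0]) - 1):
        pxi = sum(row[i] for row in data)
        py_pxi = sum(row[-1] for row in data if row[i] == 1)
        g = infogain(py_pxi, pxi, py, total)
        if g > best_gain:
            best_gain, best_index = g, i
    return best_index
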
def build_tree(data, varnames):
    # Check if the data is empty
    if len(data) == 0:
        return node.Leaf(varnames, 0)
    # Check whether this is a leaf node
    attribute = len(data[0]) - 1
    pc, tc, tp, t = collect_counts(data, attribute)
    # All class values are 1
    if tc[0] == 0:
        return node.Leaf(varnames, 1)
    # All class values are 0
    if tc[1] == 0:
        return node.Leaf(varnames, 0)
    # Find the best feature to split the data on
    max_gain = best_split(data, varnames)
    # Split the data on the best splitter
    left, right = split(data, max_gain)
    # Rename the variable to mark the feature as already used
    varnames[max_gain] = "USED"
    # Build the left and right subtrees, then the root, and return it
    left_subtree = build_tree(left, varnames)
    right_subtree = build_tree(right, varnames)
    root = node.Split(varnames, max_gain, left_subtree, right_subtree)
    return root

def build_tree_helper(used, data, varnames, var_len, curr_ent):
    # All examples share one class; return it as a leaf
    if curr_ent == 0:
        return node.Leaf(varnames, data[0][var_len])
    feat, ig = split_on_variable(used, data)
    zeros, ones = partition(feat, data)
    # No features left, or no informative split: return the majority class
    if len(used) == var_len or ig == 0:
        aff = count_class(var_len, data)
        neg = len(data) - aff
        if aff > neg:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)
    # This split fully explains the classes of the data
    if ig == curr_ent:
        if len(zeros) == 0:
            return node.Leaf(varnames, ones[0][var_len])
        elif len(ones) == 0:
            return node.Leaf(varnames, zeros[0][var_len])
        else:
            if data[0][feat] == data[0][var_len]:
                return node.Split(varnames, feat,
                                  node.Leaf(varnames, 0), node.Leaf(varnames, 1))
            else:
                return node.Split(varnames, feat,
                                  node.Leaf(varnames, 1), node.Leaf(varnames, 0))
    # Recursive case: recompute the entropy on each side and recurse
    # (assumes an entropy helper with this signature)
    z_ent = entropy(var_len, zeros)
    o_ent = entropy(var_len, ones)
    u = used[:]
    u.append(feat)
    return node.Split(
        varnames, feat,
        build_tree_helper(u, zeros, varnames, var_len, z_ent),
        build_tree_helper(u, ones, varnames, var_len, o_ent))

def build_tree(data, varnames):
    # Get the class column and count its values
    class_column = get_column(data, len(varnames) - 1)
    class_pos_neg = count_values(class_column)
    # Make a leaf if the column is all 0's
    if class_pos_neg[0] == len(class_column):
        return node.Leaf(varnames, 0)
    # Make a leaf if the column is all 1's
    elif class_pos_neg[1] == len(class_column):
        return node.Leaf(varnames, 1)
    else:
        # Pair each attribute column with the class column and compute its gain
        returned_entropy = []
        for i in range(0, len(varnames) - 1):
            columns = [get_column(data, i), get_column(data, len(data[0]) - 1)]
            pos_and_neg = partition_data(columns)
            positive_values = pos_and_neg[1]
            class_values = columns[-1]
            returned_entropy.append(
                infogain(get_ones(positive_values), get_ones(columns[0]),
                         get_ones(class_values), len(columns[0])))
        # Once we have a list of gains, get the highest one
        best_value = split_data(returned_entropy)
        # Branch the data left and right on the attribute with the highest gain
        branch = branch_data(data, best_value[1])
        # If the gain is exactly 0, predict the majority class
        if best_value[0] == 0.0:
            if class_pos_neg[0] >= class_pos_neg[1]:
                return node.Leaf(varnames, 0)
            else:
                return node.Leaf(varnames, 1)
        left = None
        right = None
        # There is no information-gain progress
        if best_value[0] <= 0.0:
            # Count the values in each branch's split column
            check_branch_negatives = count_values(get_column(branch[0], best_value[1]))
            check_branch_positives = count_values(get_column(branch[1], best_value[1]))
            # Lengths of both original branches
            length_neg = len(branch[0])
            length_pos = len(branch[1])
            # Neither branch's value counts may equal the whole branch length:
            # the tree must stop branching if one of the two branches is empty
            if (check_branch_negatives[0] == length_neg
                    or check_branch_negatives[1] == length_neg):
                return node.Leaf(varnames, 1)
            elif (check_branch_negatives[0] < length_neg
                    or check_branch_negatives[1] < length_neg):
                left = build_tree(branch[0], varnames)
            if (check_branch_positives[0] == length_pos
                    or check_branch_positives[1] == length_pos):
                return node.Leaf(varnames, 1)
            elif (check_branch_positives[0] < length_pos
                    or check_branch_positives[1] < length_pos):
                right = build_tree(branch[1], varnames)
            return node.Split(varnames, best_value[1], left, right)
        # The gain is above 0, so we split and recurse through both new branches
        else:
            left = build_tree(branch[0], varnames)
            right = build_tree(branch[1], varnames)
            return node.Split(varnames, best_value[1], left, right)

def build_tree(data, varnames):
    bestChosenAttribute = -1
    alreadyConsidered = []
    leftToBeConsidered = []
    # On the first call, varnames holds the variable names (e.g. ['A', 'B',
    # 'C', 'D', class]); on recursive calls it holds the list of attribute
    # indices left to consider, so keep the global names for building nodes.
    global_varnames = globals().get('varnames')
    (x, value) = checkAllSame(data, varnames)
    if x == len(data):
        return node.Leaf(global_varnames, value)
    if len(varnames) == 0:
        return node.Leaf(global_varnames, value)
    if len(data[0]) == len(varnames):
        # First call: consider every attribute; the class column is already used
        for j in range(len(varnames)):
            leftToBeConsidered.insert(j, j)
        alreadyConsidered.append(len(varnames) - 1)
    else:
        # Recursive call: varnames is the list of indices left to consider
        leftToBeConsidered.extend(varnames)
        for j in range(0, leftToBeConsidered[len(varnames) - 1]):
            if j not in leftToBeConsidered:
                alreadyConsidered.append(j)
    if len(leftToBeConsidered) == 1:
        return node.Leaf(global_varnames, value)
    (bestChosenAttribute, maxGain) = selectMaxGainAttribute(data, varnames,
                                                            alreadyConsidered)
    if bestChosenAttribute in leftToBeConsidered:
        leftToBeConsidered.remove(bestChosenAttribute)
    if len(data) == 0:
        return node.Leaf(global_varnames, value)
    negativeDataSet = negData(data, bestChosenAttribute)
    positiveDataSet = posData(data, bestChosenAttribute)
    if maxGain == 0.0 and bestChosenAttribute >= 0:
        if data[0][-1] == 0:
            return node.Split(global_varnames, bestChosenAttribute,
                              node.Leaf(global_varnames, negativeDataSet[0][-1]),
                              build_tree(positiveDataSet, leftToBeConsidered))
        else:
            return node.Split(global_varnames, bestChosenAttribute,
                              build_tree(negativeDataSet, leftToBeConsidered),
                              node.Leaf(global_varnames, positiveDataSet[0][-1]))
    elif maxGain > 0:
        # Recurse with leftToBeConsidered standing in for varnames
        return node.Split(global_varnames, bestChosenAttribute,
                          build_tree(negativeDataSet, leftToBeConsidered),
                          build_tree(positiveDataSet, leftToBeConsidered))
    else:
        return node.Leaf(global_varnames, value)

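# A hedged end-to-end sketch of how these trees are typically driven. The data
# format (space-separated binary rows, class label last), the read_data helper,
# and the node attribute names (Split.var/.left/.right, Leaf.pred) are all
# assumptions here, not documented facts about the assignment's node module.
def read_data(path):
    with open(path) as f:
        varnames = f.readline().split()
        data = [[int(v) for v in line.split()] for line in f if line.strip()]
    return data, varnames

def classify(tree, row):
    # Walk from the root to a leaf, following the row's attribute values
    # (attribute names on the tree nodes are assumed, as noted above)
    while hasattr(tree, 'var'):
        tree = tree.right if row[tree.var] == 1 else tree.left
    return tree.pred

train, varnames = read_data('train.txt')  # hypothetical file
tree = build_tree(train, varnames)        # any of the two-argument variants above
print(classify(tree, train[0]))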