Example #1
def build_tree(data, varnames):
    # >>>> YOUR CODE GOES HERE <<<<
    total = len(data)
    var_len = len(varnames) - 1
    # base cases
    if len(data) == 0:
        # empty split: fall back to predicting 0
        return node.Leaf(varnames, 0)
    if len(data) == 1:
        return node.Leaf(varnames, data[0][var_len])
    if var_len == 0:
        # no attributes left: predict the majority label
        majority = 1 if 2 * sum(row[var_len] for row in data) >= len(data) else 0
        return node.Leaf(varnames, majority)

    # do the counting to calculate info gain
    py, pxi_list, py_pxi_list = count_collect(data, varnames)

    # calculate all info gain (checked)
    gain_list = [0] * var_len
    for i in range(var_len):
        gain_list[i] = infogain(py_pxi_list[i], pxi_list[i], py, total)
    print("gain list:", gain_list)
    # find max info gain
    max_gain_pos = split_on_variable(var_len, gain_list)
    print("max_gain_pos:", max_gain_pos)

    # partition
    data_l, data_r = partition(data, max_gain_pos)

    # do not pop from varnames here: the data columns still contain the
    # split variable, so removing the name would misalign every index

    # recursive calls on each partition
    rt = node.Split(varnames, max_gain_pos, build_tree(data_l, varnames),
                    build_tree(data_r, varnames))
    return rt
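
Note: several of these examples call an infogain(py_pxi, pxi, py, total) helper that the assignment provides elsewhere. A minimal sketch of one plausible implementation, assuming binary features and labels and that the four arguments are, in order, the counts of rows with x=1 and y=1, rows with x=1, rows with y=1, and all rows:

from math import log

def entropy(p):
    # binary entropy of a probability p, in bits
    if p <= 0 or p >= 1:
        return 0.0
    return -p * log(p, 2) - (1 - p) * log(1 - p, 2)

def infogain(py_pxi, pxi, py, total):
    # information gain of binary feature x about binary label y:
    # H(y) - p(x=1) * H(y|x=1) - p(x=0) * H(y|x=0)
    if total == 0:
        return 0.0
    nxi = total - pxi  # count of rows with x=0
    h_y = entropy(float(py) / total)
    h_y_x1 = entropy(float(py_pxi) / pxi) if pxi else 0.0
    h_y_x0 = entropy(float(py - py_pxi) / nxi) if nxi else 0.0
    return h_y - (float(pxi) / total) * h_y_x1 - (float(nxi) / total) * h_y_x0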
Example #2
def build_tree(data, varnames):
	
	py, total = collect_count(data)
	guess = float(py) / total
	if guess == 1 or (len(varnames) == 1 and guess > 0.5):
		return node.Leaf(varnames, 1)
	elif guess == 0 or (len(varnames) == 1 and guess <= 0.5):
		return node.Leaf(varnames, 0)

	gain, index = find_split(data, varnames, py, total)

	if gain == 0:
		if guess > 0.5:
			return node.Leaf(varnames, 1)
		else:
			return node.Leaf(varnames, 0)

	left = []
	right = []
	for row in data:
		if row[index] == 0:
			left.append(row)
		else:
			right.append(row)

	return node.Split(varnames, index, build_tree(left, varnames), build_tree(right, varnames))
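
Note: every example in this listing assumes a node module with Leaf and Split classes. The real module is part of the assignment scaffold and is not shown here; the following is a minimal sketch consistent with how the snippets call it (a leaf stores a prediction, a split stores a variable index and two subtrees, with the left branch taken when the variable is 0):

class Leaf:
    def __init__(self, varnames, pred):
        self.pred = pred  # the class (0 or 1) this leaf predicts

    def classify(self, row):
        return self.pred


class Split:
    def __init__(self, varnames, var, left, right):
        self.var = var            # index of the variable tested
        self.name = varnames[var]
        self.left = left          # subtree for var == 0
        self.right = right        # subtree for var == 1

    def classify(self, row):
        if row[self.var] == 0:
            return self.left.classify(row)
        return self.right.classify(row)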
Example #3
def build_tree(data, varnames, visited):
    guess = get_most_label(data)  #most common label in data
    #base cases
    #base cases 1 and 2: all examples have the same label (all 0 or all 1)
    if check_pure_example(data, 0):
        #print (data)
        #print("all zero")
        return node.Leaf(varnames, 0)
    elif check_pure_example(data, 1):
        #print("all one")
        return node.Leaf(varnames, 1)
    #base case 2: no more attributes
    #elif check_all_visited(visited):
    #	return node.Leaf(varnames, guess)
    else:
        #get best attribute
        (att_index, threshold) = calculate_entropy(data, varnames, visited)
        #check if threshold then stop
        if threshold:
            #print("threshold")
            return node.Leaf(varnames, guess)
        att_val = varnames[att_index]
        #set the attribute as used (copy first so sibling branches are unaffected)
        visited = visited[:]
        visited[att_index] = True
        #divide the data for att_val of 0 and 1
        no = get_data_subset(data, 0, att_index)
        yes = get_data_subset(data, 1, att_index)
        left = build_tree(no, varnames, visited)
        right = build_tree(yes, varnames, visited)
        return node.Split(varnames, att_val, left, right)
Example #4
def build_tree(data, varnames):
	localdata = data[:]
	localVarNames = varnames[:]
	XPosYPos = count_var(localdata)
	totalPosY = count_Y(localdata)
	total = len(localdata)
	
	listofVarGain = [0]*(len(localVarNames)-1)

	ind = 0
	for (XVal, YVal) in XPosYPos:
		# compute a gain for every variable except the final pair (the
		# class column); YVal is rebound each iteration and reused below
		if ind != len(XPosYPos) - 1:
			listofVarGain[ind] = infogain(YVal, XVal, totalPosY, total)
			ind += 1
	indextoremove = listofVarGain.index(max(listofVarGain))

	if max(listofVarGain) == 0:
		if YVal >= (totalPosY-YVal):
			return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 1), node.Leaf(localVarNames, 0))
		else:
			return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 0), node.Leaf(localVarNames, 1))
	#split data
	leftData = []
	rightData = []
	countRight = 0
	countLeft = 0
	newVarNames = localVarNames[:]
	 
	for datarow in localdata:
		if datarow[indextoremove] == 0:
			countLeft += datarow[-1]
			leftData.append(datarow)
		else:
			countRight += datarow[-1]
			rightData.append(datarow)

	if countRight == 0 and countLeft == len(leftData):
		return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 1), node.Leaf(localVarNames, 0))
	elif countRight == len(rightData) and countLeft == 0:
		return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 0), node.Leaf(localVarNames, 1))
	elif countRight == 0:
		return node.Split(localVarNames, indextoremove, (build_tree(leftData, newVarNames)), node.Leaf(localVarNames, 0))
	elif countRight == len(rightData):
		return node.Split(localVarNames, indextoremove, (build_tree(leftData, newVarNames)), node.Leaf(localVarNames, 1))
	elif countLeft == 0:
		return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 0), (build_tree(rightData, newVarNames)))
	elif countLeft == len(leftData):
		return node.Split(localVarNames, indextoremove, node.Leaf(localVarNames, 1), (build_tree(rightData, newVarNames)))
	else:	
		return node.Split(localVarNames, indextoremove, (build_tree(leftData, newVarNames)), (build_tree(rightData, newVarNames)))
Example #5
def build_tree(data, varnames):
    total = len(data)

    py = getpy(data)
    if py == 0:
        return node.Leaf(varnames, 0)
    elif py == total:
        return node.Leaf(varnames, 1)
    else:
        (positivedata, negativedata) = splitdata(data, varnames)
        index = findbestindex(data, varnames)
        if index == -1:
            return node.Leaf(varnames, 0)
        return node.Split(varnames, index, build_tree(negativedata, varnames),
                          build_tree(positivedata, varnames))
Example #6
def build_tree(data, ingredients, labels):
    # >>>> YOUR CODE GOES HERE <<<<
    # global CUR_ITER
    guess = [0] * (len(labels))
    for item in data:
        index = item[len(item) - 1]
        # print index
        # print "Old guess[index] ", guess[index]
        guess[index] = guess[index] + 1
        # print "New guess[index] ", guess[index]
    g = 0
    gIndex = 0
    oneLabel = 0

    for i, item in enumerate(guess):
        if item > 0:
            oneLabel += 1
        if item > g:
            g = item
            gIndex = i

    moreFeatures = False
    for i in ingredients:
        if i != "":
            moreFeatures = True
    if oneLabel <= 1:
        # only one label appears in the data
        return node.Leaf(ingredients, gIndex)
    elif not moreFeatures:
        return node.Leaf(ingredients, gIndex)
    # elif CUR_ITER == MAX_ITER:
    #   return node.Leaf(ingredients, g)
    else:
        # # CUR_ITER += 1
        # print "Finding best var"
        (value, index) = bestVar(data, ingredients, labels)
        if value <= 0:
            return node.Leaf(ingredients, gIndex)
        (no, yes) = partition(data, index)
        var = list(ingredients)
        var[index] = ""

        left = build_tree(no, var, labels)
        right = build_tree(yes, var, labels)
        return node.Split(ingredients, index, left, right)
Example #7
import copy

def build_tree(data, varnames, used_attributes=None):
    # avoid the mutable-default-argument pitfall: a shared default list
    # would carry used attributes over between top-level calls
    if used_attributes is None:
        used_attributes = []
    if len(data[0]) == len(used_attributes):
        return node.Leaf(varnames, find_bigger(data))

    else:
        split_on = highest_info_gain(data, used_attributes)
        used_attributes.append(split_on)
        #print "Split on(",split_on,",\t",varnames[split_on],")"
        left, right = partition_data(data, split_on)
        l_pure, l_val = check_if_pure(left)
        r_pure, r_val = check_if_pure(right)
        if l_pure and r_pure:
            return node.Split(varnames, split_on,
                              node.Leaf(varnames, l_val),
                              node.Leaf(varnames, r_val))
        elif l_pure:
            return node.Split(varnames, split_on,
                              node.Leaf(varnames, l_val),
                              build_tree(right, varnames,
                                         copy.deepcopy(used_attributes)))
        elif r_pure:
            return node.Split(varnames, split_on,
                              build_tree(left, varnames,
                                         copy.deepcopy(used_attributes)),
                              node.Leaf(varnames, r_val))
        else:
            return node.Split(varnames, split_on,
                              build_tree(left, varnames,
                                         copy.deepcopy(used_attributes)),
                              build_tree(right, varnames,
                                         copy.deepcopy(used_attributes)))
Example #8
def build_tree(data, varnames, depth):
    #print "Current Depth:", depth

    if len(data) == 0:
        print("BAD SPLIT")
        return

    # compute the max gain
    split_index = compute_max_gain(data, varnames)

    # Base cases
    if split_index == -1:
        #print "LEAF CASE"
        #print data
        #print "\n"
        # choose whichever result is more common
        pos, neg = collect_counts(data)
        #print "pos:", pos, "neg:", neg
        if pos > neg:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)

    # split the data at max_index attribute
    l_data, r_data = split_data(data, split_index)

    # make new node split
    # left child - buildtree on left split
    # right child - buildtree on right split
    var = varnames[split_index]
    #print "SPLIT CASE:", var
    #print "\n"

    #print "***Recursing L_tree***"
    #print l_data
    L_tree = build_tree(l_data, varnames, depth + 1)
    #print "***L_tree returned, depth=", depth

    #print "***Recursing R_tree***"
    #print r_data
    R_tree = build_tree(r_data, varnames, depth + 1)
    #print "***R_tree returned, depth=", depth

    return node.Split(varnames, split_index, L_tree, R_tree)
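
Note: Example #8's split_data helper is not shown. A plausible sketch, assuming the left list collects the rows where the split attribute is 0:

def split_data(data, split_index):
    # partition rows on one binary attribute: rows with the attribute
    # equal to 0 go left, the rest go right
    left = [row for row in data if row[split_index] == 0]
    right = [row for row in data if row[split_index] == 1]
    return left, right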
Example #9
def build_tree(data, varnames):
    # >>>> YOUR CODE GOES HERE <<<<
    # count occurrences per variable; note that only the counts from the
    # final loop iteration survive to the checks below
    for i in range(len(varnames) - 1):
        counts = var_counts(data, i)

    if counts[2] == counts[3]:
        return node.Leaf(varnames, 1)
    elif counts[2] == 0:
        return node.Leaf(varnames, 0)
    else:
        best_split = split_on(data, varnames)
        if best_split is None:
            return node.Leaf(varnames, 1)
        left_split, right_split = partition(data, varnames, best_split)
        best_node = node.Split(varnames, best_split,
                               build_tree(left_split, varnames),
                               build_tree(right_split, varnames))

        return best_node
Example #10
def build_tree(data, varnames):
    # >>>> YOUR CODE GOES HERE <<<<
    # count the label totals; per-variable counts are recomputed by
    # count() in the gain loop below, so only label/total are needed here
    total = len(data)
    label = sum(1 for item in data if item[-1] == 1)

    if label == total:
        return node.Leaf(varnames, 1)
    elif label == 0:
        return node.Leaf(varnames, 0)
    else:
        gain = 0
        best_gain = None

        for i in range(len(varnames) - 1):
            counts = count(data, i)
            temp = infogain(counts[0], counts[1], counts[2], counts[3])
            if temp > gain:
                gain = temp
                best_gain = i

        best_split = best_gain
        if best_split is None:
            return node.Leaf(varnames, 1)
        left_split, right_split = partition(data, varnames, best_split)
        best_node = node.Split(varnames, best_split, build_tree(left_split, varnames), build_tree(right_split, varnames))

        return best_node
Example #11
def build_tree_helper(used, data, varnames, l, curr_ent, max_class):
    # Everything is the same, return the Class of the first data point as a Leaf
    if curr_ent == 0:
        return node.Leaf(varnames, data[0][l])

    # We can assume this is non-zero since handled by wrapper function
    feat, ig = split_on_variable(used, data)
    zeroes, ones = partition(feat, data)

    # No more valid features (all splits were bad): return the most common class
    if len(used) == l or ig == 0:
        return node.Leaf(varnames, max_class)

    # This split fully explains the Classes of the data
    if ig == curr_ent:
        if data[0][feat] == data[0][l]:
            return node.Split(varnames, feat, node.Leaf(varnames, 0),
                              node.Leaf(varnames, 1))
        else:
            return node.Split(varnames, feat, node.Leaf(varnames, 1),
                              node.Leaf(varnames, 0))

    z_ent = my_entropy(l, zeroes)
    o_ent = my_entropy(l, ones)

    # True copy of python list
    u = used[:]
    u.append(feat)

    # Recursive case
    return node.Split(
        varnames, feat,
        build_tree_helper(u, zeroes, varnames, l, z_ent, max_class),
        build_tree_helper(u, ones, varnames, l, o_ent, max_class))
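
Note: Examples #11 and #15 both lean on a my_entropy(l, rows) helper that is not shown. A plausible implementation, assuming l is the index of the label column:

from math import log

def my_entropy(l, rows):
    # empirical entropy (in bits) of the label column l over rows
    if not rows:
        return 0.0
    p = sum(row[l] for row in rows) / float(len(rows))
    if p == 0 or p == 1:
        return 0.0
    return -p * log(p, 2) - (1 - p) * log(1 - p, 2)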
Example #12
def build_tree(data, varnames):
    best_candidate = maxgain(data)
    # Base Case: if all our data has the same classification then we can create a leaf node to classify our root->leaf path
    if best_candidate == -1:
        for entry in data:
            if entry[-1] == 1:
                return node.Leaf(varnames, 1)
        return node.Leaf(varnames, 0)
    # Recursive Step: split our data into two lists: (1) positive classification,
    # (2) negative classification. Then build the left and right subtrees by
    # calling 'build_tree' again.
    else:
        positive_data = []
        negative_data = []
        for entry in data:
            if entry[best_candidate] == 1:
                positive_data.append(entry)
            else:
                negative_data.append(entry)
        # Trees are by definition recursive, so after splitting the data we can continue with the process of building our tree.
        # autograder checks left -> zero and right -> one
        left_node = build_tree(negative_data, varnames)
        right_node = build_tree(positive_data, varnames)
        return node.Split(varnames, best_candidate, left_node, right_node)
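
Note: the calling convention shared by these solutions is that data is a list of rows of 0/1 integers with the class label in the last position, and varnames holds one name per column. A hypothetical driver (this toy dataset is made up for illustration, and classify assumes the Leaf/Split sketch shown after Example #2):

# toy dataset: two binary features, label y = x1 AND x2
varnames = ['x1', 'x2', 'y']
data = [
    [0, 0, 0],
    [0, 1, 0],
    [1, 0, 0],
    [1, 1, 1],
]

root = build_tree(data, varnames)
print(root.classify([1, 1, 0]))  # prints 1; the trailing label slot is ignored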
Example #13
def build_tree(data, varnames):

    # Check if data is empty
    if len(data) == 0:
        return node.Leaf(varnames, 0)

    # Check if leaf node,
    attribute = len(data[0]) - 1
    pc, tc, tp, t = collect_counts(data, attribute)
    # Check if all values are 0 or 1
    if tc[0] == 0:
        return node.Leaf(varnames, 1)
    if tc[1] == 0:
        return node.Leaf(varnames, 0)

    # Find best feature to split data on
    max_gain = best_split(data, varnames)
    #print(varnames[max_gain])

    # Split data based on best splitter
    newData = split(data, max_gain)
    left = newData[0]
    right = newData[1]

    # Rename the used feature in varnames so it will not be chosen again
    varnames[max_gain] = "USED"

    # Build left and right subtrees
    left_subtree = build_tree(left, varnames)
    right_subtree = build_tree(right, varnames)

    # Build tree
    root = node.Split(varnames, max_gain, left_subtree, right_subtree)

    # Return tree
    return root
Example #14
def build_tree(data, varnames):

    #py_pxi, pxi, py, total
    py_pxi = 0
    pxi = 0
    py = 0
    total = len(data)
    for i in data:
        if i[-1] == 1:
            py += 1

    guess = py / (total * 1.0)
    if guess == 1:
        return node.Leaf(varnames, 1)
    elif guess == 0:
        return node.Leaf(varnames, 0)

    if len(varnames) == 1:
        if guess > 0.5:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)

    gain = 0

    for i in range(len(varnames) - 1):
        for j in data:
            if j[i] == 1:
                pxi += 1
            if j[i] == 1 and j[-1] == 1:
                py_pxi += 1
        if infogain(py_pxi, pxi, py, total) > gain:
            gain = infogain(py_pxi, pxi, py, total)
            index = i
        py_pxi = 0
        pxi = 0

    if gain == 0:
        if guess > 0.5:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)

    # divide the data
    data0 = []
    data1 = []

    for row in data:
        if row[index] == 0:
            data0.append(row)
        else:
            data1.append(row)

    return node.Split(varnames, index, build_tree(data0, varnames),
                      build_tree(data1, varnames))
Example #15
def build_tree_helper(used, data, varnames, var_len, curr_ent):
    if curr_ent == 0:
        return node.Leaf(varnames, data[0][var_len])

    feat, ig = split_on_variable(used, data)

    zeros, ones = partition(feat, data)

    if len(used) == var_len or ig == 0:
        aff = count_class(var_len, data)
        neg = len(data) - aff
        if aff > neg:
            return node.Leaf(varnames, 1)
        else:
            return node.Leaf(varnames, 0)

    if ig == curr_ent:
        if len(zeros) == 0:
            return node.Leaf(varnames, ones[0][var_len])
        elif len(ones) == 0:
            return node.Leaf(varnames, zeros[0][var_len])
        elif data[0][feat] == data[0][var_len]:
            # feature value agrees with the label: 0 goes left, 1 goes right
            return node.Split(varnames, feat, node.Leaf(varnames, 0),
                              node.Leaf(varnames, 1))
        else:
            return node.Split(varnames, feat, node.Leaf(varnames, 1),
                              node.Leaf(varnames, 0))

    # recursive case (completed to mirror the sibling helper in Example #11)
    u = used[:]
    u.append(feat)
    return node.Split(varnames, feat,
                      build_tree_helper(u, zeros, varnames, var_len,
                                        my_entropy(var_len, zeros)),
                      build_tree_helper(u, ones, varnames, var_len,
                                        my_entropy(var_len, ones)))
Example #16
def build_tree(data, varnames):
    # >>>> YOUR CODE GOES HERE <<<<
    # For now, always return a leaf predicting "1":
    
  attr = []
  val = []
  # per variable: counts of [x1&y1, x1&y0, x0&y1, x0&y0]
  for entry in varnames:
      attr.append(entry)
      val.append([0, 0, 0, 0])

  for line in data:
    for index, x in enumerate(line):
      if x == 1:
        if line[len(line) - 1] == 1:
          val[index][0] += 1
        else:
          val[index][1] += 1
      else:
        if line[len(line) - 1] == 1:
          val[index][2] += 1
        else:
          val[index][3] += 1

#select new node
  current = 0.0
  check = 0.0
  use = 0
  for index, x in enumerate(attr):
    if x != '-':
      check = info_Gain(pEnt, val[index][0], val[index][1],
                        val[index][2], val[index][3])
      # compare inside the guard so a stale gain from an already-used
      # attribute is never re-applied
      if check > current:
        current = check
        use = index

#create node (pEnt, Node and recur_Tree are assumed to be defined elsewhere)
  parent = Node(attr[use])
  attr[use] = '-'
  recur_Tree(parent, attr, val)



  return node.Leaf(varnames, 1)
Example #17
def build_tree(data, varnames):

    bestChosenAttribute = -1
    alreadyConsidered = []
    leftToBeConsidered = []
    global_varnames = globals().get('varnames')

    #print("varnames")
    #print(varnames)

    #For the first time varnames is ['A', 'B', 'C', 'D', '[A_and_B]_or_[C_and_D]']
    #for the next recursions: varnames is leftToBeConsidered

    (x, value) = checkAllSame(data, varnames)
    if x == len(data):
        #print("Leaf 1")
        return node.Leaf(global_varnames, value)

    if (len(varnames) == 0):
        #print("Leaf 2")
        return node.Leaf(global_varnames, value)

    if len(data[0]) == len(varnames):
        for j in range(len(varnames)):
            leftToBeConsidered.insert(j, j)
        alreadyConsidered.append(
            len(varnames) - 1)  #For the first time: alreadyConsidered is [4]

        #print("first time")
    else:
        leftToBeConsidered.extend(varnames)
        for j in range(0, leftToBeConsidered[len(varnames) - 1]):
            if j not in leftToBeConsidered:
                alreadyConsidered.append(
                    j
                )  #For the next recursions: alreadyConsidered is [0]--[0, 2]--[0]--[0, 1]--[0, 1, 2]
        #print("I am recursing")

    #print("alreadyConsidered")
    stri = ''
    for row in alreadyConsidered:
        stri = stri + ',' + global_varnames[row]
    #print(stri)

    if len(leftToBeConsidered) == 1:
        #print("Leaf 3")
        return node.Leaf(global_varnames, value)

    (bestChosenAttribute, maxGain) = selectMaxGainAttribute(
        data, varnames,
        alreadyConsidered)  # bestChosenAttribute: 0 -- 2--3--1--2--3

    if bestChosenAttribute in leftToBeConsidered:
        leftToBeConsidered.remove(
            bestChosenAttribute
        )  #leftToBeConsidered: [1, 2, 3, 4]--[1, 3, 4]--[1, 4]--[2, 3, 4]--[3, 4]--[4]

    #print("bestChosenAttribute")
    #print(global_varnames[bestChosenAttribute])

    #print("Gain")
    #print(maxGain)

    #print("leftToBeConsidered")
    stri = ''
    for row in leftToBeConsidered:
        stri = stri + ',' + global_varnames[row]
    #print(stri)

    if len(data) == 0:
        return node.Leaf(global_varnames, value)

    negativeDataSet = negData(data, bestChosenAttribute)
    positiveDataSet = posData(data, bestChosenAttribute)

    if maxGain == 0.0 and bestChosenAttribute >= 0:
        #print("Leaf 4")
        if data[0][-1] == 0:
            return node.Split(
                globals().get('varnames'), bestChosenAttribute,
                node.Leaf(global_varnames, negativeDataSet[0][-1]),
                build_tree(positiveDataSet, leftToBeConsidered))
        else:
            return node.Split(
                globals().get('varnames'), bestChosenAttribute,
                build_tree(negativeDataSet, leftToBeConsidered),
                node.Leaf(global_varnames, positiveDataSet[0][-1]))

    elif maxGain > 0:
        return node.Split(
            globals().get('varnames'), bestChosenAttribute,
            build_tree(negativeDataSet, leftToBeConsidered),
            build_tree(positiveDataSet, leftToBeConsidered)
        )  #for the next recursions: varnames is leftToBeConsidered

    else:
        #print("Leaf 5")
        return node.Leaf(global_varnames, value)
Example #18
def splitr(pred, varnames):
    #helper function: given a prediction, creates either a 1 or 0 leaf
    if pred >= 0.5:
        return node.Leaf(varnames, 1)
    else:
        return node.Leaf(varnames, 0)
Example #19
 def p_t_ID(self, p):
     "t : ID"
     p[0] = node.Leaf('ID', p[1])
Example #20
 def p_t_NUM(self, p):
     "t : NUM"
     p[0] = node.Leaf('NUM', p[1])
Example #21
def build_tree(data, varnames):

    # Get the class column and count its values
    class_column = get_column(data, len(varnames) - 1)
    class_pos_neg = count_values(class_column)

    # Make leaf if only 0's
    if class_pos_neg[0] == len(class_column):
        return node.Leaf(varnames, 0)
    # Make leaf if only 1's
    elif class_pos_neg[1] == len(class_column):
        return node.Leaf(varnames, 1)

    else:
        # Pair each variable's column with the class column, compute its gain,
        # and collect the gains
        returned_entropy = []
        for i in range(0, len(varnames) - 1):
            columns = []
            columns.append(get_column(data, i))
            columns.append(get_column(data, len(data[0]) - 1))
            pos_and_neg = partition_data(columns)
            positive_values = pos_and_neg[1]
            class_values = columns[-1]
            returned_entropy.append(
                infogain(get_ones(positive_values), get_ones(columns[0]),
                         get_ones(class_values), len(columns[0])))

        # Once we have a list of gains, get the highest one
        best_value = split_data(returned_entropy)
        # Branch the data into left and right, depending on the highest gain value in the array
        branch = branch_data(data, best_value[1])

        # If the gain is exactly 0
        if best_value[0] == 0.0:
            if class_pos_neg[0] > class_pos_neg[1]:
                return node.Leaf(varnames, 0)
            elif class_pos_neg[0] < class_pos_neg[1]:
                return node.Leaf(varnames, 1)

        left = None
        right = None
        # There is no information gain progress
        if best_value[0] <= 0.0:

            # Get the columns of the branched data and count values in it
            check_branch_negatives = count_values(
                get_column(branch[0], best_value[1]))
            check_branch_positives = count_values(
                get_column(branch[1], best_value[1]))
            # Get length of both original branches
            length_neg = len(branch[0])
            length_pos = len(branch[1])

            # If every row in a branch takes the same value for the split variable,
            # splitting again would leave one side empty, so stop with a leaf
            if (check_branch_negatives[0] == length_neg
                    or check_branch_negatives[1] == length_neg):
                return node.Leaf(varnames, 1)
            elif (check_branch_negatives[0] < length_neg
                  or check_branch_negatives[1] < length_neg):
                left = build_tree(branch[0], varnames)

            if (check_branch_positives[0] == length_pos
                    or check_branch_positives[1] == length_pos):
                return node.Leaf(varnames, 1)
            elif (check_branch_positives[0] < length_pos
                  or check_branch_positives[1] < length_pos):
                right = build_tree(branch[1], varnames)

            return node.Split(varnames, best_value[1], left, right)

        # The gain is higher than 0, everything is good and we can keep on branching
        else:
            # We split the tree and recursively go through both new branches
            left = build_tree(branch[0], varnames)
            right = build_tree(branch[1], varnames)

            return node.Split(varnames, best_value[1], left, right)
Example #22
def build_tree(data, varnames):
    # >>>> YOUR CODE GOES HERE <<<<
    # For now, always return a leaf predicting "1":
    return node.Leaf(varnames, 1)