Example #1
def select_rewrite_expression(name, exprs):
    """
    Given an expression name and a list of expressions,
    tries to select an expression with the highest selectivity
    for use in AST re-writing.
    """
    # For equality check (=, !=, is), select the mode
    if name[1] == "equality":
        values = [e.right.value for e in exprs]
        filter_using = util.mode(values)
        for e in exprs:
            if e.right.value == filter_using:
                return e

    # For ordering checks, select the median value for static
    elif name[1] == "order":
        is_static = name[3][1] == "static"
        values = [e.right.value for e in exprs]

        # For static (numeric) compares, we use median
        # value to eliminate as many as possible.
        # For non-numeric, we use mode
        if is_static:
            filter_using = util.median(values)
        else:
            filter_using = util.mode(values)

        for e in exprs:
            if e.right.value == filter_using:
                return e

    # For anything else, just use the first expression
    else:
        return exprs[0]
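Every example on this page leans on a mode helper (util.mode, or a bare mode) that returns the most frequent element of a sequence. The helper itself is not shown; a minimal sketch of the assumed behavior, with ties broken arbitrarily:

from collections import Counter

def mode(values):
    # Most common element of a non-empty sequence; ties broken arbitrarily.
    return Counter(values).most_common(1)[0][0]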
Example #3
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N,D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label  = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1     # which feature has lowest error
            bestError   = N      # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY  = Y[X[:,d] <  0.5]
                rightY = Y[X[:,d] >= 0.5]

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = sum(leftY != util.mode(leftY)) + sum(rightY != util.mode(rightY))

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError   = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label  = util.mode(Y)

            else:
                self.isLeaf  = False
                self.feature = bestFeature

                self.left  = DT({'maxDepth': maxDepth-1})
                self.right = DT({'maxDepth': maxDepth-1})
                # recurse on our children by calling
                #   self.left.trainDT(...) 
                # and
                #   self.right.trainDT(...) 
                # with appropriate arguments
                self.left.trainDT( X[ X[:,bestFeature] <  0.5, :], Y[ X[:,bestFeature] <  0.5 ], maxDepth-1, [bestFeature] + used)
                self.right.trainDT(X[ X[:,bestFeature] >= 0.5, :], Y[ X[:,bestFeature] >= 0.5 ], maxDepth-1, [bestFeature] + used)
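A hypothetical driver for this method, assuming the surrounding DT class and binary 0/1 features, just to show the calling convention (the data here is made up):

import numpy as np

X = np.array([[1., 0.], [1., 1.], [0., 1.], [0., 0.]])
Y = np.array([1, 1, -1, -1])
tree = DT({'maxDepth': 2})
tree.trainDT(X, Y, 2, [])  # `used` starts out empty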
Example #4
    def train(self, X, Y):
        '''
        just figure out what the most frequent class is for each value of X[:,0] and store it
        '''
        negY = Y[X[:, 0] <= 0]
        posY = Y[X[:, 0] > 0]

        self.classForNeg = util.mode(negY)
        self.classForPos = util.mode(posY)
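A matching predict for this one-feature stump would just read back the stored fields; a sketch, assuming the same sign convention as train:

def predict(self, x):
    # Route on the sign of the first feature, mirroring train's split.
    return self.classForPos if x[0] > 0 else self.classForNeg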
Example #5
    def trainDT(self, X, Y, maxDepth, used):
        # size of the data set
        N, D = X.shape

        # stopping criteria
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):

                # have we used this feature yet
                if d in used:
                    continue

                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]

                leftYmode = util.mode(leftY)
                rightYmode = util.mode(rightY)

                leftYerror = (leftY != leftYmode).sum()
                rightYerror = (rightY != rightYmode).sum()

                error = leftYerror + rightYerror
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            # Error check.
            if bestFeature < 0:
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature
                # copy the used list so sibling subtrees don't see
                # each other's features
                used = used + [bestFeature]

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})

                self.left.trainDT(X[X[:, bestFeature] < 0.5, :],
                                  Y[X[:, bestFeature] < 0.5], maxDepth - 1,
                                  used)
                self.right.trainDT(X[X[:, bestFeature] >= 0.5, :],
                                   Y[X[:, bestFeature] >= 0.5], maxDepth - 1,
                                   used)
Example #6
def main(rects, img):
    transformRects(rects)
    # rects = getGoodRects(rects,40,7)
    rects = hi(rects)
    #rects = sortRects(rects)[0:1]
    # rects = getSquare(rects)#[0:1]
    if (len(rects) == 0):
        raise NoRectException
    edges = formEdges(rects)
    length = mode(
        list(map(lambda e: dist(e[kCorners][0], e[kCorners][1]), edges)))
    print(length)
    edges = formEdges(rects, length=length, threshold=10)
    if (len(edges) == 0):
        raise NoEdgeException
    rects = []
    # edges = filterEdgesByLength(edges)
    # edges = noEdge(edges)
    lRotation = getLocalRotation(edges)
    edges = filterEdgesByRotation(edges, lRotation)
    edges = adjustIntercept(edges)
    d = getLocalTranslation(edges, lRotation)
    print("lRot: ", lRotation, "lTra: ", d)
    printEdges(edges, img)
    return lRotation, d
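main depends on helpers defined elsewhere in this project (transformRects, formEdges, the kCorners key, and so on). A plausible sketch of dist, under the assumption that it is the plain Euclidean distance between two (x, y) corner points:

import math

def dist(p, q):
    # Euclidean distance between two 2-D points.
    return math.hypot(p[0] - q[0], p[1] - q[1])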
Example #7
    def train(self, X, Y):
        '''
        just figure out what the most frequent class is and store it in self.mostFrequentClass
        '''

        ### TODO: YOUR CODE HERE
        self.mostFrequentClass = util.mode(Y)
Example #8
    def train(self, X, Y):
        '''
        just figure out what the most frequent class is for each value of X[:,0] and store it
        '''

        ### TODO: YOUR CODE HERE
        greater = []
        lesser = []
        temp = X[:,0] > 0
        for i in range(temp.size):
            if temp[i]:
                greater.append(Y[i])
            else:
                lesser.append(Y[i])

        self.classForNeg = util.mode(lesser)
        self.classForPos = util.mode(greater)
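The loop above can be collapsed into numpy boolean masks, exactly as Example #4 does; a sketch assuming X is a numpy array:

self.classForNeg = util.mode(Y[X[:, 0] <= 0])
self.classForPos = util.mode(Y[X[:, 0] > 0])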
Example #9
def partitionAlignInterceptType(edges):
    L = list(map(lambda e: e[kInterceptType], edges))
    interceptType = mode(L)
    print(interceptType, L)
    if interceptType == 'x':
        partitionChangeToUseX(edges)
    else:  # y-intercept
        partitionChangeToUseY(edges)
    return interceptType
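Here mode is applied to a list of strings rather than numbers; the assumed behavior is simply the most frequent element:

>>> mode(['x', 'y', 'x'])
'x'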
Example #10
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # get the size of the data set
            N, D = X.shape

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                #put negative values on the left and positive on the right
                negInd = [i for i, x in enumerate(X) if x[d] < 0.5]  #indices
                negVal = [x for i, x in enumerate(X)
                          if x[d] < 0.5]  #entire x values
                posInd = [i for i, x in enumerate(X) if x[d] >= 0.5]  #indices
                posVal = [x for i, x in enumerate(X)
                          if x[d] >= 0.5]  #entire x values
                # negX = [X[i][d] for i in negInd]
                # posX = [X[i][d] for i in posInd]
                leftY = [Y[i] for i in negInd]
                rightY = [Y[i] for i in posInd]

                #calculating guesses
                left_guess = 0
                if len(leftY) != 0:
                    if np.mean(leftY) >= 0:
                        left_guess = 1
                    else:
                        left_guess = -1
                right_guess = 0
                if len(rightY) != 0:
                    if np.mean(rightY) >= 0:
                        right_guess = 1
                    else:
                        right_guess = -1

                # calculating error by looking at mislabeled points
                num_errors = 0.0
                for y in leftY:
                    if y != left_guess:
                        num_errors = num_errors + 1
                for y in rightY:
                    if y != right_guess:
                        num_errors = num_errors + 1
                error = num_errors / N

                # check to see if this is a better error rate
                if error <= bestError:
                    permNeg = array(negVal)
                    permPos = array(posVal)
                    permLeft = array(leftY)
                    permRight = array(rightY)
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                used.append(bestFeature)
                self.left.trainDT(permNeg, permLeft, maxDepth - 1, used)
                self.right.trainDT(permPos, permRight, maxDepth - 1, used)
Example #11
File: dt.py Project: Ruhjkg/ciml
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

            self.label = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                rightY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.feature = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.left = DT({"maxDepth": maxDepth - 1})
                self.right = DT({"maxDepth": maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                ### TODO: YOUR CODE HERE
                util.raiseNotDefined()
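One way to fill in this skeleton, sketched from the solved examples above (the same 0.5 threshold and mode-based error count; not necessarily the course's reference solution):

self.isLeaf = True
self.label = util.mode(Y)

leftY = Y[X[:, d] < 0.5]
rightY = Y[X[:, d] >= 0.5]
error = (leftY != util.mode(leftY)).sum() + (rightY != util.mode(rightY)).sum()

self.isLeaf = False
self.feature = bestFeature

self.left.trainDT(X[X[:, bestFeature] < 0.5], Y[X[:, bestFeature] < 0.5],
                  maxDepth - 1, used + [bestFeature])
self.right.trainDT(X[X[:, bestFeature] >= 0.5], Y[X[:, bestFeature] >= 0.5],
                   maxDepth - 1, used + [bestFeature])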
Example #12
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True

            self.label = util.mode(Y)  # do not have to make any decision

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]   # features below 0.5 go left
                rightY = Y[X[:, d] >= 0.5]

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                self.left.trainDT(X[X[:, self.feature] < 0.5],
                                  Y[X[:, self.feature] < 0.5],
                                  self.left.opts['maxDepth'], used + [
                                      self.feature,
                                  ])
                self.right.trainDT(X[X[:, self.feature] >= 0.5],
                                   Y[X[:, self.feature] >= 0.5],
                                   self.right.opts['maxDepth'], used + [
                                       self.feature,
                                   ])
                # For Chi-square pruning
                self.split = array(
                    [[
                        size((Y[X[:, self.feature] < 0.5] == 1).nonzero()),
                        size((Y[X[:, self.feature] < 0.5] == -1).nonzero())
                    ],
                     [
                         size((Y[X[:, self.feature] >= 0.5] == 1).nonzero()),
                         size((Y[X[:, self.feature] >= 0.5] == -1).nonzero())
                     ]])
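This example (like several below) uses numpy's size and array as bare names; the assumed module-level import is something like:

from numpy import array, size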
Example #13
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True  # this node is a leaf

            self.label = util.mode(Y)  # return the mode of the labels

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]    # labels whose feature value is below 0.5
                rightY = Y[X[:, d] >= 0.5]  # labels whose feature value is at least 0.5

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = size(nonzero([leftY != util.mode(leftY)])) + size(
                    nonzero([rightY != util.mode(rightY)])
                )  # in each branch, count the labels that differ from their mode

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False  # internal node, not a leaf
                self.feature = bestFeature  # the feature this node splits on

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments

                # split X and Y for the left and right subtrees, then recurse
                used = used + [self.feature]  # copy, so siblings don't share mutations

                leftX = X[X[:, self.feature] < 0.5]
                rightX = X[X[:, self.feature] >= 0.5]

                leftY = Y[X[:, self.feature] < 0.5]
                rightY = Y[X[:, self.feature] >= 0.5]

                self.left.trainDT(leftX, leftY, maxDepth - 1, used)
                self.right.trainDT(rightX, rightY, maxDepth - 1, used)
Example #14
	def trainDT(self, X, Y, maxDepth, used):
		"""
		recursively build the decision tree
		"""

		# get the size of the data set
		N,D = X.shape

		# check to see if we're either out of depth or no longer
		# have any decisions to make
		if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
			# we'd better end at this point.  need to figure
			# out the label to return
			self.isLeaf = True
			self.label  = util.mode(Y)


		else:
			# we need to find a feature to split on
			bestFeature = -1	 # which feature has lowest error
			bestError   = N	  # the number of errors for this feature
			for d in range(D):
				# have we used this feature yet
				if d in used:
					continue
					
				# Split X and Y according to a feature value of a feature d
				left_vx = X[X[:,d]<=0]
				left_vy = Y[X[:,d]<=0]
				right_vx = X[X[:,d]>0]
				right_vy = Y[X[:,d]>0]
				# suppose we split on this feature; what is the
				# majority label on each side?
				leftY  = util.mode( left_vy )
				rightY = util.mode( right_vy )

				# we'll classify the left points as their most
				# common class and ditto right points.  our error
				# is the how many are not their mode.
				# count errors: points that disagree with their side's majority
				error = len( left_vy[ left_vy!=leftY ] ) + len( right_vy[ right_vy!=rightY ] )


				# check to see if this is a better error rate
				if error <= bestError:
					bestFeature = d
					bestError   = error

			if bestFeature < 0:
				# this shouldn't happen, but just in case...
				self.isLeaf = True
				self.label  = util.mode(Y)

			else:
				self.isLeaf  = False
				self.feature = bestFeature

				# update used features and prepare arrays to pass child nodes.
				used.append( bestFeature )
				right_used = deepcopy( used )
				left_used  = deepcopy( used )

				# set split training data according to the feature selected.
				left_vx = X[X[:,self.feature]<=0]
				left_vy = Y[X[:,self.feature]<=0]
				right_vx = X[X[:,self.feature]>0]
				right_vy = Y[X[:,self.feature]>0]

				self.left  = DT({'maxDepth': maxDepth-1})
				self.right = DT({'maxDepth': maxDepth-1})
				# recurse on our children by calling
				#   self.left.trainDT(...) 
				# and
				#   self.right.trainDT(...) 
				# with appropriate arguments
				self.left.trainDT(left_vx, left_vy, maxDepth-1, left_used)
				self.right.trainDT(right_vx, right_vy, maxDepth-1, right_used)
Example #15
    def train(self, X, Y):
        '''
        just figure out what the most frequent class is and store it in self.mostFrequentClass
        '''
        self.mostFrequentClass = util.mode(Y)
Example #17
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        if len(X) <= 0:
            N = D = 0
        else:
            N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                counterno = util.Counter()
                counteryes = util.Counter()
                for i, x in enumerate(X):
                    if x[d] < 0.5:
                        counterno['NO' if Y[i] < 0 else 'YES'] += 1
                    else:
                        counteryes['NO' if Y[i] < 0 else 'YES'] += 1
                leftY = 1 if counterno['YES'] >= counterno['NO'] else -1
                rightY = 1 if counteryes['YES'] >= counteryes['NO'] else -1

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = counterno['YES' if counterno['YES'] < counterno['NO'] else 'NO'] +\
                        counteryes['YES' if counteryes['YES'] < counteryes['NO'] else 'NO']

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature
                new_used = used[:]
                new_used.append(bestFeature)

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                nos = [[], []]
                yess = [[], []]
                for i, x in enumerate(X):
                    if x[bestFeature] < 0.5:
                        nos[0].append(x)
                        nos[1].append(Y[i])
                    else:
                        yess[0].append(x)
                        yess[1].append(Y[i])

                self.left.trainDT(array(nos[0]), nos[1], maxDepth - 1,
                                  new_used)
                self.right.trainDT(array(yess[0]), yess[1], maxDepth - 1,
                                   new_used)
Example #18
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """
        # get the size of the data set
        N,D = X.shape
        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label  = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1     # which feature has lowest error
            bestError   = N      # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?

                leftY  = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]
                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = size((leftY!=util.mode(leftY)).nonzero()) + size((rightY!=util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError   = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label  = util.mode(Y)

            else:
                self.isLeaf  = False
                self.feature = bestFeature

                self.left  = DT({'maxDepth': maxDepth-1})
                self.right = DT({'maxDepth': maxDepth-1})
                # recurse on our children by calling
                #   self.left.trainDT(...) 
                # and
                #   self.right.trainDT(...) 
                # with appropriate arguments
                leftD = X[X[:, self.feature] < 0.5]
                rightD = X[X[:, self.feature] >= 0.5]
                # redefine labels with the best feature
                leftY = Y[X[:, self.feature] < 0.5]
                rightY = Y[X[:, self.feature] >= 0.5]
                used = used + [self.feature]
                # print "best feature found is ", self.feature
                # print "updated used:", used, " maxDepth:", self.left.opts['maxDepth']
                # print "leftY:", leftY
                # print "rightY:", rightY
#                pdb.set_trace()
                self.left.trainDT(leftD, leftY, self.left.opts['maxDepth'], used);
 
                self.right.trainDT(rightD, rightY, self.right.opts['maxDepth'], used);
Example #19
    def trainDT(self, X, Y, maxDepth, criterion, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

            self.label = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

        else:
            if criterion == 'ig':  # information gain
                # compute the entropy at this node
                ### TODO: YOUR CODE HERE
                self.entropy = util.raiseNotDefined()

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error

            # use error stats or gain stats (not both) depending on criterion

            # initialize error stats
            bestError = np.finfo('d').max

            # initialize gain stats
            bestGain = np.finfo('d').min

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                rightY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # misclassification rate
                if criterion == 'mr':
                    # we'll classify the left points as their most
                    # common class and ditto right points.  our error
                    # is the how many are not their mode.
                    error = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                    # update min, max, bestFeature
                    if error <= bestError:
                        bestFeature = d
                        bestError = error

                # information gain
                elif criterion == 'ig':
                    # now use information gain
                    gain = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                    # update min, max, bestFeature
                    if gain >= bestGain:
                        bestFeature = d
                        bestGain = gain

            self.gain = bestGain  # information gain corresponding to this split
            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.feature = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.left = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })
                self.right = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                ### TODO: YOUR CODE HERE
                util.raiseNotDefined()
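For the 'ig' branch this skeleton asks for, a sketch of binary entropy and the gain computation, assuming labels in {-1, +1}:

import numpy as np

def entropy(y):
    # Binary entropy of a label vector; 0.0 for an empty or pure split.
    n = len(y)
    if n == 0:
        return 0.0
    p = np.count_nonzero(y == 1) / n
    if p == 0.0 or p == 1.0:
        return 0.0
    return -(p * np.log2(p) + (1 - p) * np.log2(1 - p))

# gain for a split on feature d, relative to the parent's entropy:
# gain = entropy(Y) - len(leftY)/len(Y) * entropy(leftY) \
#                   - len(rightY)/len(Y) * entropy(rightY)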
Example #20
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """
        # get the size of the data set
        N, D = X.shape
        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?

                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]
                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                leftD = X[X[:, self.feature] < 0.5]
                rightD = X[X[:, self.feature] >= 0.5]
                # redefine labels with the best feature
                leftY = Y[X[:, self.feature] < 0.5]
                rightY = Y[X[:, self.feature] >= 0.5]
                used = used + [self.feature]
                # print "best feature found is ", self.feature
                # print "updated used:", used, " maxDepth:", self.left.opts['maxDepth']
                # print "leftY:", leftY
                # print "rightY:", rightY
                #                pdb.set_trace()
                self.left.trainDT(leftD, leftY, self.left.opts['maxDepth'],
                                  used)

                self.right.trainDT(rightD, rightY, self.right.opts['maxDepth'],
                                   used)
Example #21
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape
        bestLeftY = []
        bestRightY = []

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return

            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = self.sizeWithoutK(util.mode(leftY), leftY) + \
                    self.sizeWithoutK(util.mode(rightY), rightY)

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error
                    bestLeftY = leftY
                    bestRightY = rightY

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                # record the feature as used; copy the list so sibling
                # subtrees stay independent
                used = used + [self.feature]

                self.left.trainDT(X[X[:, self.feature] < 0.5, :], bestLeftY,
                                  maxDepth - 1, used)
                self.right.trainDT(X[X[:, self.feature] >= 0.5, :], bestRightY,
                                   maxDepth - 1, used)
Example #22
    def trainDT(self, X, Y, maxDepth, criterion, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape  ### N and D are number of rows and columns of X data matrix

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True  # leaf node: stop splitting

            self.label = util.mode(Y)  # the class this leaf predicts

        else:
            if criterion == 'ig':  # information gain
                # compute the entropy at this node
                ### TODO: YOUR CODE HERE
                def entropy(y):
                    P = np.count_nonzero(y == 1)
                    N = np.count_nonzero(y == -1)
                    S = N + P
                    a = -(P / S) * math.log(P / S, 2) if P > 0 else 0
                    b = -(N / S) * math.log(N / S, 2) if N > 0 else 0
                    return a + b

                self.entropy = entropy(Y)

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error -- split feature

            # use error stats or gain stats (not both) depending on criterion

            # initialize error stats
            bestError = np.finfo('d').max

            # initialize gain stats
            bestGain = np.finfo('d').min

            for d in range(D):  ### d is FEATURE and iteration variable i
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; feature values at or
                # below 0.5 go left, the rest go right
                leftY = Y[X[:, d] <= 0.5]
                rightY = Y[X[:, d] > 0.5]

                # misclassification rate
                if criterion == 'mr':
                    # classify each side as its most common class; the error
                    # is how many points are not that mode
                    count_left = util.mode(leftY)
                    count_right = util.mode(rightY)

                    error_lefttree = len(leftY[leftY != count_left])
                    error_righttree = len(rightY[rightY != count_right])
                    error = error_lefttree + error_righttree

                    # update min, max, bestFeature
                    if error <= bestError:
                        bestFeature = d
                        bestError = error

                # information gain
                elif criterion == 'ig':
                    # information gain: parent entropy minus the weighted
                    # child entropies (labels are +/-1, so count_nonzero
                    # is just the number of points)
                    Total = np.count_nonzero(Y)
                    N1 = np.count_nonzero(leftY)
                    P1 = np.count_nonzero(rightY)

                    gain = entropy(Y) - (N1 / Total) * entropy(leftY) \
                        - (P1 / Total) * entropy(rightY)
                    # update min, max, bestFeature
                    if gain >= bestGain:
                        bestFeature = d
                        bestGain = gain

            self.gain = bestGain  # information gain corresponding to this split
            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })  ## left sub tree
                self.right = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })  ## right sub tree
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                # split rows between the subtrees; copy `used` so the
                # feature doesn't get reused and siblings stay independent
                used = used + [bestFeature]
                Y_left = Y[X[:, bestFeature] <= 0.5]
                Y_right = Y[X[:, bestFeature] > 0.5]
                X_left = X[X[:, bestFeature] <= 0.5, :]
                X_right = X[X[:, bestFeature] > 0.5, :]

                self.left.trainDT(X_left, Y_left, maxDepth - 1, criterion,
                                  used)
                self.right.trainDT(X_right, Y_right, maxDepth - 1, criterion,
                                   used)
Example #23
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N,D = X.shape
          

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label  = util.mode(Y)


        else:
            # we need to find a feature to split on
            bestFeature = -1     # which feature has lowest error
            bestError   = N      # the number of errors for this feature           
            leftY = []
            rightY = []
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue
                countLeftpos = 0
                countLeftneg = 0
                countRightpos = 0
                countRightneg = 0
                errorLeft = 0
                errorRight = 0

                # suppose we split on this feature; what labels
                # would go left and right?
                            
                for i in range (N):
                    if X[i,d] < 0.5 and Y[i] == 1:
                        countLeftpos += 1
                    if X[i,d] < 0.5 and Y[i] == -1:
                        countLeftneg += 1
                    if X[i,d] >= 0.5 and Y[i] == 1:
                        countRightpos += 1
                    if X[i,d] >= 0.5 and Y[i] == -1:
                        countRightneg += 1
                        
                # the minority count on each side is that side's error
                errorLeft = min(countLeftpos, countLeftneg)
                errorRight = min(countRightpos, countRightneg)


                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = errorLeft + errorRight


                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError   = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label  = util.mode(Y)

            else:
                self.isLeaf  = False

                self.feature = bestFeature

                # copy `used` so sibling subtrees stay independent
                used = used + [bestFeature]
                
                leftX = X[X[:,bestFeature] < 0.5, :]
                rightX = X[X[:,bestFeature] >= 0.5, :]
                
                for i in range (N):
                    if X[i,bestFeature] < 0.5:
                        leftY.append(Y[i])
                    else:
                        rightY.append(Y[i])

                self.left  = DT({'maxDepth': maxDepth-1})
                self.right = DT({'maxDepth': maxDepth-1})
                self.left.trainDT(leftX, leftY, maxDepth-1, used)
                self.right.trainDT(rightX, rightY, maxDepth-1, used)
Example #24
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """
        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):  # Index
                # have we used this feature yet
                if d in used:
                    continue

                xFiltered = X[:, d]
                leftY = Y[xFiltered < 0.5]
                rightY = Y[xFiltered >= 0.5]
                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is the how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments

                xFiltered = X[:, self.feature]
                leftY = Y[xFiltered < 0.5]
                rightY = Y[xFiltered >= 0.5]

                self.left.trainDT(X[X[:, self.feature] < 0.5], leftY,
                                  (maxDepth - 1), used + [self.feature])
                self.right.trainDT(X[X[:, self.feature] >= 0.5], rightY,
                                   (maxDepth - 1), used + [self.feature])
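All of these trees predict by walking the fields set during training; a sketch of the matching traversal, hypothetical but consistent with the isLeaf/label/feature/left/right attributes used above:

def predictDT(self, x):
    # Leaves return their stored label; internal nodes route on the
    # same 0.5 threshold used during training.
    if self.isLeaf:
        return self.label
    if x[self.feature] < 0.5:
        return self.left.predictDT(x)
    return self.right.predictDT(x)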