예제 #1
0
class min_max_l2distance(learner):
    #curr_minmax=float('inf')
    #curr_winner=0

    def __init__(self, fp, leaf_size):
        """Initialise the base learner and the bookkeeping for the min-max search.

        Args:
            fp: Forwarded unchanged to ``learner.__init__``.
            leaf_size: Leaf size handed to ``BallTree`` by
                ``create_ball_tree``.
        """
        learner.__init__(self, fp)
        self.active_method = 'minmax_l2distance'
        self.leaf_size = leaf_size
        # These two are initialised here but not updated by any method in
        # the visible part of the class.
        self.max_min_dist = 0
        self.max_min_point = 0
        # Best candidate found so far by minmaxdist(): its largest distance
        # to the query set, the sample itself, and its index.
        self.curr_minmax = float('inf')
        self.curr_winner = 0
        self.curr_id = 0
        # Running total of the brute-force cost reported by
        # brute_force_l2_norm() for every leaf that was scanned.
        self.complexity = 0
        # File that write_complexity_ratio() appends to.
        # NOTE(review): self.fsave is presumably set by learner.__init__ -- confirm.
        self.fcomplexity = self.fsave + 'complex'
        #self.bound=bound
    def create_ball_tree(self):
        """Build a ball tree over ``self.Xtrain`` and store it in ``self.tree``.

        The tree receives the training matrix, ``self.leaf_size`` and one
        integer id per training row (0 .. n_rows - 1).
        """
        # np.arange produces the row ids directly; np.array(range(n)) would
        # first build an intermediate Python sequence of n ints.
        # NOTE(review): these ids are presumably what leaves later expose as
        # BT.idx (see minmaxdist) -- confirm against BallTree.
        sample_ids = np.arange(self.Xtrain.shape[0])
        self.tree = BallTree(self.Xtrain, self.leaf_size, sample_ids)

    def show_ball_tree_n_points(self):
        """Show tree depths 0-3 in a 2x2 grid of panels, one depth per panel.

        Each panel is drawn by ``self.tree.draw_circle(ax, depth=level)`` and
        titled with its depth. The figure is displayed with ``plt.show()``.
        """
        fig = plt.figure(figsize=(5, 5))
        fig.subplots_adjust(wspace=0.1,
                            hspace=0.15,
                            left=0.1,
                            right=0.9,
                            bottom=0.05,
                            top=0.9)

        for level in range(4):
            # Subplot positions are 1-based while tree depths are 0-based,
            # so depth 0 goes into panel 1; index 0 is not a valid position.
            ax = fig.add_subplot(2, 2, level + 1, xticks=[], yticks=[])
            self.tree.draw_circle(ax, depth=level)
            ax.set_title('level %i' % level)

        # suptitle() adds a title to the entire figure
        fig.suptitle('Ball-tree Example')
        plt.show()

    def load_data(self):
        """Load the data through the base class, then build the ball tree."""
        # NOTE(review): learner.load_data is presumably what populates
        # self.Xtrain, which create_ball_tree() reads -- confirm.
        learner.load_data(self)
        # The tree has to be rebuilt after loading so it indexes the data
        # that was just loaded.
        self.create_ball_tree()

    def create_query_ball(self):
        """Return the centre and radius of a ball around the rows of ``self.Q``.

        The centre is the column-wise mean of ``self.Q``. The radius is the
        largest 2-norm distance from that centre to any single row, and is
        0 when no row lies farther than 0 from the centre.

        Returns:
            tuple: ``(q_center, q_radius)``.
        """
        centroid = np.array(self.Q.mean(0))

        # The leading 0 reproduces a running maximum that starts at 0, so
        # the result is never negative and is 0 for an empty query set.
        distances = [0] + [
            LA.norm(self.Q.getrow(row).toarray() - centroid, 2)
            for row in range(self.Q.shape[0])
        ]
        return centroid, max(distances)

    def get_bounds(self, q_center, q_radius, BT):
        """Compute four distance bounds between a tree node and the query ball.

        With ``d`` the norm of ``BT.loc - q_center``:

        * ``min_dist``    = ``d - (BT.radius + q_radius)`` (may be negative)
        * ``max_dist``    = ``d + (BT.radius + q_radius)``
        * ``maxmin_dist`` = ``min_dist``
        * ``minmax_dist`` = ``max(0, d - BT.radius)``

        NOTE(review): assumes ``BT.loc`` / ``BT.radius`` are the node's centre
        and radius -- confirm against BallTree.

        Args:
            q_center: Centre of the query ball.
            q_radius: Radius of the query ball.
            BT: Tree node exposing ``loc`` and ``radius``.

        Returns:
            tuple: ``(min_dist, max_dist, maxmin_dist, minmax_dist)``.
        """
        center_gap = LA.norm(BT.loc - q_center)
        combined_radius = BT.radius + q_radius

        min_dist = center_gap - combined_radius
        max_dist = center_gap + combined_radius
        maxmin_dist = min_dist
        minmax_dist = max(0, center_gap - BT.radius)

        return min_dist, max_dist, maxmin_dist, minmax_dist

    def prune_child_level1(self, q_center, q_radius, BT):
        """Decide whether one of the two children of ``BT`` can be skipped.

        A child is flagged when its ``maxmin`` bound from ``get_bounds`` is
        strictly greater than its sibling's ``minmax`` bound. Child 1 is
        tested first, so at most one flag is ever set.

        NOTE(review): the sibling value used in the comparison is
        ``max(0, centre distance - radius)`` as computed by ``get_bounds``;
        confirm that comparing against it (rather than ``max_dist``) is the
        intended pruning rule.

        Args:
            q_center: Centre of the query ball.
            q_radius: Radius of the query ball.
            BT: Internal tree node exposing ``child1`` and ``child2``.

        Returns:
            tuple: ``(c1_prune, c2_prune)`` -- ``(1, 0)`` to skip child 1,
            ``(0, 1)`` to skip child 2, ``(0, 0)`` to keep both.
        """
        _, _, first_maxmin, first_minmax = self.get_bounds(
            q_center, q_radius, BT.child1)
        _, _, second_maxmin, second_minmax = self.get_bounds(
            q_center, q_radius, BT.child2)

        # Child 1 cannot compete with child 2.
        if first_maxmin > second_minmax:
            return 1, 0

        # Child 2 cannot compete with child 1.
        if second_maxmin > first_minmax:
            return 0, 1

        # Neither child dominates the other.
        return 0, 0

    def brute_force_l2_norm(self, X, idx):
        """Exhaustively find the row of ``X`` with the smallest worst-case distance.

        For every row ``x`` of ``X`` the largest norm of ``x - q`` over all
        rows ``q`` of ``self.Q`` is computed; the row with the smallest such
        value wins. Ties keep the earliest row.

        Args:
            X: Candidate rows; iterated in step with ``idx``. Each row must
                support ``(x - q).toarray()`` (presumably scipy sparse rows
                -- confirm against the caller).
            idx: One identifier per row of ``X``.

        Returns:
            tuple: ``(minmax_eu, minmax_x, minmax_id, complexity)`` where
            ``complexity`` is ``X.shape[0] * self.Q.shape[0]``. When no row
            qualifies (e.g. ``X`` is empty) the result is
            ``(inf, None, None, complexity)`` instead of an
            ``UnboundLocalError``.
        """
        # Walk the query matrix once up front instead of re-iterating it for
        # every candidate row. A comprehension is used on purpose: it only
        # needs iteration, not len().
        query_rows = [q for q in self.Q]

        minmax_eu = float("inf")
        minmax_x = None
        minmax_id = None

        for x, x_id in zip(X, idx):
            # Worst-case (largest) distance from this candidate to the queries.
            max_eu = float("-inf")
            for q in query_rows:
                eu = LA.norm((x - q).toarray())
                if eu > max_eu:
                    max_eu = eu

            if max_eu < minmax_eu:
                minmax_eu = max_eu
                # Record the candidate itself; a separately tracked "row of
                # the maximum" could be stale or unset when no distance
                # compared greater than -inf.
                minmax_x = x
                minmax_id = x_id

        complexity = X.shape[0] * self.Q.shape[0]
        return minmax_eu, minmax_x, minmax_id, complexity

    def minmaxdist(self, BT, q_center, q_radius, depth):
        """Recursively search the tree for the sample with the smallest worst-case distance.

        The result is not returned; it is left in ``self.curr_minmax``,
        ``self.curr_winner`` and ``self.curr_id``. The cost of every leaf
        that is scanned exhaustively is added to ``self.complexity``.
        Diagnostic lines are printed at every visited node.

        Args:
            BT: Current tree node. ``BT.child1 is None`` marks a leaf, which
                must expose ``data`` and ``idx``; any other node must expose
                ``child1`` and ``child2``.
            q_center: Centre of the query ball (see ``create_query_ball``).
            q_radius: Radius of the query ball.
            depth: Depth of ``BT``; only used in the diagnostic output.
        """
        # print(...) with a single pre-formatted string behaves the same as
        # the print statement on Python 2 and is also valid on Python 3.

        # ---- Leaf node -------------------------------------------------
        if BT.child1 is None:
            print('I am in child node at depth %d' % (depth))

            _, _, _, minmax_d = self.get_bounds(q_center, q_radius, BT)
            print('Current actual minmax = {0}, Ball minmax = {1}'.format(
                self.curr_minmax, minmax_d))

            # Only pay for the exhaustive scan when the node's bound says it
            # could still beat the best value found so far.
            if minmax_d < self.curr_minmax:
                win_dist, win_x, win_id, curr_complexity = self.brute_force_l2_norm(
                    BT.data, BT.idx)
                self.complexity += curr_complexity
                if win_dist < self.curr_minmax:
                    self.curr_minmax = win_dist
                    self.curr_winner = win_x
                    self.curr_id = win_id
            return

        # ---- Internal node ---------------------------------------------
        _, _, _, c1_minmax_d = self.get_bounds(q_center, q_radius, BT.child1)
        _, _, _, c2_minmax_d = self.get_bounds(q_center, q_radius, BT.child2)

        c1_prune, c2_prune = self.prune_child_level1(q_center, q_radius, BT)
        print('---- pruning flags after level 1 = ({0},{1})'.format(
            c1_prune, c2_prune))

        visit_child1 = (
            'Current minmax = {0} where child 1 minmax bound = {1}',
            BT.child1, c1_minmax_d)
        visit_child2 = (
            'Current minmax = {0} where child 2 minmax bound = {1}',
            BT.child2, c2_minmax_d)

        if c1_prune == 0 and c2_prune == 1:
            # Child 2 was pruned.
            visits = [visit_child1]
        elif c2_prune == 0 and c1_prune == 1:
            # Child 1 was pruned.
            visits = [visit_child2]
        elif c1_prune == 0 and c2_prune == 0:
            # Nothing pruned: try the child with the smaller bound first so
            # a good result there can still rule out its sibling.
            if c1_minmax_d < c2_minmax_d:
                visits = [visit_child1, visit_child2]
            else:
                visits = [visit_child2, visit_child1]
        else:
            # Any other flag combination descends into neither child.
            visits = []

        for message, child, bound in visits:
            # self.curr_minmax is re-read on every iteration because the
            # previous descent may have improved it.
            print(message.format(self.curr_minmax, bound))
            if bound < self.curr_minmax:
                self.minmaxdist(child, q_center, q_radius, depth + 1)

    def write_complexity_ratio(self):
        """Append a line to ``self.fcomplexity`` comparing actual and exhaustive cost.

        The exhaustive cost is the number of training rows times the number
        of query rows; the actual cost is ``self.complexity``.
        """
        exhaustive_cost = self.Xtrain.shape[0] * self.Q.shape[0]
        with open(self.fcomplexity, 'a') as report:
            pieces = [
                'search complexity ', str(self.complexity),
                ' out of total ', str(exhaustive_cost), ' nodes\n',
            ]
            report.write(''.join(pieces))

    def active_select(self):
        """Run the min-max search for the current query set and return the winner's id.

        Builds the query ball from ``self.Q``, searches ``self.tree`` with
        ``minmaxdist`` and returns ``self.curr_id``. The winning distance and
        sample remain available in ``self.curr_minmax`` and
        ``self.curr_winner``; the search cost is in ``self.complexity``.

        Returns:
            The id recorded for the winning sample (0 if the search found
            no candidate).
        """
        # Start every selection from a clean slate. Without resetting the
        # running best, a second call would be pruned against the previous
        # call's distance and could simply hand back the previous winner.
        self.complexity = 0
        self.curr_minmax = float('inf')
        self.curr_winner = 0
        self.curr_id = 0

        q_center, q_radius = self.create_query_ball()
        self.minmaxdist(self.tree, q_center, q_radius, 0)

        #self.write_complexity_ratio()
        return self.curr_id

    def check_recursion(self):
        """Count ``self.ck_rec`` down by one per recursive call, printing each value.

        ``self.ck_rec`` is not initialised by ``__init__``; the caller has to
        set it before calling. On return it has been decremented until it is
        no longer positive.
        """
        # "<= 0" rather than "== 0" so that a negative or fractional counter
        # stops instead of recursing until the interpreter's recursion limit.
        if self.ck_rec <= 0:
            print('ck rec %d ' % (self.ck_rec))
            return
        print('ck rec %d' % (self.ck_rec))
        self.ck_rec -= 1
        self.check_recursion()

    """Doubt
    Should we return only the nearest points? How should repetition be handled?
    """
    # print the ball at level 4
    # whenever getting dist for a leaf  draw the circle with red
    """