def __sample_neighbor_nodes(self, sample_size, nodeId):
    '''
    Sample a random subset of candidate neighbor nodes for ``nodeId``.

    Candidates are requested in batches of ``2 * sample_size`` from
    ``random.sample_range("neighbor sampler", self._N, ...)``. A candidate
    is rejected when it equals ``nodeId``, when the edge
    (nodeId, candidate) is in the held-out set or the test set of
    ``self._network``, or when it has already been selected.

    NOTE: the scan over a batch only stops once the remaining count has
    dropped *below* zero, so the result can hold ``sample_size + 1`` nodes
    instead of ``sample_size``. This is kept on purpose -- presumably other
    code (and the C++ replay mentioned in compatibility mode) depends on
    it; confirm before changing the comparison.

    NOTE: the outer loop repeats until enough candidates were accepted, so
    it does not terminate when too few valid candidates exist.

    Returns the selected node ids as a ``Set``; in compatibility mode the
    ids are returned as a sorted list instead.
    '''
    remaining = sample_size
    neighbor_nodes = Set()
    held_out_set = self._network.get_held_out_set()
    test_set = self._network.get_test_set()

    while remaining > 0:
        candidates = random.sample_range("neighbor sampler", self._N,
                                         sample_size * 2)
        if self._compatibility_mode:  # to be able to replay from C++
            candidates = sorted(candidates)
        for neighborId in candidates:
            # Deliberately "< 0" rather than "<= 0": see the docstring.
            if remaining < 0:
                break
            if neighborId == nodeId:
                continue
            # Edges are keyed as (smaller id, larger id).
            edge = (min(nodeId, neighborId), max(nodeId, neighborId))
            if edge in held_out_set or edge in test_set \
                    or neighborId in neighbor_nodes:
                continue
            neighbor_nodes.add(neighborId)
            remaining -= 1

    if self._compatibility_mode:  # to be able to replay from C++
        neighbor_nodes = sorted(neighbor_nodes)
    return neighbor_nodes
Beispiel #2
0
    def __sample_neighbor_nodes(self, sample_size, nodeId):
        '''
        Sample a random subset of candidate neighbor nodes for ``nodeId``.

        Candidates are requested in batches of ``2 * sample_size`` from
        ``random.sample_range("neighbor sampler", self._N, ...)``. A
        candidate is rejected when it equals ``nodeId``, when the edge
        (nodeId, candidate) is in the held-out set or the test set of
        ``self._network``, or when it has already been selected.

        NOTE: the scan over a batch only stops once the remaining count has
        dropped *below* zero, so the result can hold ``sample_size + 1``
        nodes instead of ``sample_size``. This is kept on purpose --
        presumably other code (and the C++ replay mentioned in compatibility
        mode) depends on it; confirm before changing the comparison.

        NOTE: the outer loop repeats until enough candidates were accepted,
        so it does not terminate when too few valid candidates exist.

        Returns the selected node ids as a ``Set``; in compatibility mode
        the ids are returned as a sorted list instead.
        '''
        remaining = sample_size
        neighbor_nodes = Set()
        held_out_set = self._network.get_held_out_set()
        test_set = self._network.get_test_set()

        while remaining > 0:
            candidates = random.sample_range("neighbor sampler", self._N,
                                             sample_size * 2)
            if self._compatibility_mode:  # to be able to replay from C++
                candidates = sorted(candidates)
            for neighborId in candidates:
                # Deliberately "< 0" rather than "<= 0": see the docstring.
                if remaining < 0:
                    break
                if neighborId == nodeId:
                    continue
                # Edges are keyed as (smaller id, larger id).
                edge = (min(nodeId, neighborId), max(nodeId, neighborId))
                if edge in held_out_set or edge in test_set \
                        or neighborId in neighbor_nodes:
                    continue
                neighbor_nodes.add(neighborId)
                remaining -= 1

        if self._compatibility_mode:  # to be able to replay from C++
            neighbor_nodes = sorted(neighbor_nodes)
        return neighbor_nodes
Beispiel #3
0
    def __stratified_random_node_sampling(self, num_pieces):
        """
        Draw one mini-batch of edges around a single randomly chosen node.

        Stratified sampling gives more attention to link edges (an edge
        that connects two nodes). The sampling process works like this:
        a) randomly choose one node $i$ via randint(0, N - 1);
        b) choose link edges or non-link edges with (50%, 50%) probability;
        c) for link edges:
                return every edge listed in the training link map for
                node $i$;
           for non-link edges:
                sample edges (i, j) that are not linked, not in the
                held-out map, not in the test map and not already chosen.
                The target count is int(N / self.__num_pieces), which
                approximates (number of non-link edges) / num_pieces
                because a node has far fewer links than N.

        NOTE(review): the ``num_pieces`` argument is never read; the batch
        size and the scale both use ``self.__num_pieces``. Confirm whether
        that is intended.

        NOTE: the non-link scan only stops once the remaining count has
        dropped *below* zero, so the batch can hold one edge more than the
        target count. This is kept as-is; confirm before changing it.

        Returns a tuple ``(mini_batch_set, scale)``: ``scale`` is
        ``N * self.__num_pieces`` for a non-link batch and ``N`` for a link
        batch. A summary line is printed in both cases.
        """
        # The order of the two random draws is preserved: node first, then
        # the link / non-link decision.
        nodeId = random.get("minibatch sampler").randint(0, self.__N - 1)
        # flag=0: non-link edges  flag=1: link edges
        flag = random.get("minibatch sampler").randint(0, 1)

        mini_batch_set = Set()

        if flag != 0:
            # Link edges: return all training links of the chosen node,
            # each keyed as (smaller id, larger id).
            for neighborId in self.__train_link_map[nodeId]:
                mini_batch_set.add((min(nodeId, neighborId),
                                    max(nodeId, neighborId)))

            # Single parenthesised argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("B Create mini batch size " + str(len(mini_batch_set)) +
                  " scale " + str(self.__N))
            return (mini_batch_set, self.__N)

        # Non-link edges.
        mini_batch_size = int(self.__N / self.__num_pieces)
        remaining = mini_batch_size
        while remaining > 0:
            # Because of the sparsity, a draw of mini_batch_size * 2 nodes
            # likely contains at least mini_batch_size valid ones.
            candidates = random.sample_range("minibatch sampler", self.__N,
                                             mini_batch_size * 2)
            for neighborId in candidates:
                # Deliberately "< 0" rather than "<= 0": see the docstring.
                if remaining < 0:
                    break
                if neighborId == nodeId:
                    continue
                edge = (min(nodeId, neighborId), max(nodeId, neighborId))
                if edge in self.__linked_edges or edge in self.__held_out_map \
                        or edge in self.__test_map or edge in mini_batch_set:
                    continue
                mini_batch_set.add(edge)
                remaining -= 1

        scale = self.__N * self.__num_pieces
        print("A Create mini batch size " + str(len(mini_batch_set)) +
              " scale " + str(scale))
        return (mini_batch_set, scale)