def __init_test_set(self):
    """
    Sample the test set. We draw an equal number of samples for linked
    and non-linked edges.
    """
    p = int(self.__held_out_size / 2)
    # sample p linked edges from the network
    while p > 0:
        # Because we already used some of the linked edges for the held_out
        # set, we sample twice as many links and select among them, which
        # is likely to contain p valid linked edges.
        sampled_linked_edges = random.get("graph init").sample(
            self.__linked_edges, 2 * p)
        if self.__compatibility_mode:
            sampled_linked_edges = sorted(sampled_linked_edges)
        for edge in sampled_linked_edges:
            if p <= 0:
                break
            # skip edges that are already used in the held_out or test set
            if edge in self.__held_out_map or edge in self.__test_map:
                continue
            self.__test_map[edge] = True
            self.__train_link_map[edge[0]].remove(edge[1])
            self.__train_link_map[edge[1]].remove(edge[0])
            p -= 1

    # sample p non-linked edges from the network
    p = int(self.__held_out_size / 2)
    while p > 0:
        edge = self.__sample_non_link_edge_for_test()
        self.__test_map[edge] = False
        p -= 1
def __update_phi(self, i, neighbors):
    '''
    Update phi for the current node i.
    '''
    eps_t = self.__a * ((1 + self._step_count / self.__b)**-self.__c)
    phi_i_sum = np.sum(self.__phi[i])
    grads = np.zeros(self._K)
    noise = random.get("phi update").randn(self._K)  # random noise.

    for neighbor in neighbors:
        if neighbor == i:
            continue

        y_ab = 0  # observation
        edge = (min(i, neighbor), max(i, neighbor))
        if edge in self._network.get_linked_edges():
            y_ab = 1

        probs = np.empty(self._K)
        for k in range(0, self._K):
            probs[k] = self._beta[k]**y_ab * (1 - self._beta[k])**(1 - y_ab) \
                * self._pi[i][k] * self._pi[neighbor][k]
            probs[k] += self._epsilon**y_ab * (1 - self._epsilon)**(1 - y_ab) \
                * self._pi[i][k] * (1 - self._pi[neighbor][k])

        prob_sum = np.sum(probs)
        for k in range(0, self._K):
            grads[k] += (probs[k] / prob_sum) / self.__phi[i][k] - 1.0 / phi_i_sum

    # update phi for node i
    for k in range(0, self._K):
        self.__phi[i][k] = abs(
            self.__phi[i][k] + eps_t / 2 *
            (self._alpha - self.__phi[i][k] +
             (self._N * 1.0 / self.__num_node_sample) * grads[k]) +
            eps_t**0.5 * self.__phi[i][k]**0.5 * noise[k])
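# For reference, the Langevin step performed above for each phi_ik, with the
# abs() acting as a reflection back into the positive orthant:
#
#     phi_ik <- | phi_ik + (eps_t / 2) * (alpha - phi_ik + (N / n) * g_ik)
#                 + sqrt(eps_t) * sqrt(phi_ik) * xi_ik |,   xi_ik ~ N(0, 1)
#
# where n = self.__num_node_sample, g_ik = grads[k], and
# eps_t = a * (1 + t / b)^(-c) is the decaying step size (t = step_count).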
def __init__(self, args, graph, compatibility_mode):
    # call base class initialization
    Learner.__init__(self, args, graph, compatibility_mode)

    self._strategy = args.strategy
    self._interval = args.interval
    self.__num_pieces = graph.get_num_pieces()

    # step size parameters.
    # FIXME RFHH make SURE self.__b is initialized to a float. As published,
    # it is an int = 1024, which results in integer step_count / b so
    # eps_t always equals self.__a.
    self.__b = 1.0 * args.b
    self.__c = 1.0 * args.c
    if args.a == 0.0:
        self.__a = pow(self.__b, -self.__c)
    else:
        self.__a = 1.0 * args.a

    # control parameters for learning
    # self.__num_node_sample = int(math.sqrt(self._network.get_num_nodes()))
    # TODO: automatic update.....
    # self.__num_node_sample = int(self._N/50)
    self.__num_node_sample = args.num_node_sample

    # Model parameters and re-parameterization.
    # Since the model parameters \pi and \beta must stay in the simplex,
    # the probabilities have to sum to 1. We enforce this with a
    # re-parameterization technique: we introduce another set of variables
    # (\theta and \phi), update those first, and then derive \pi and \beta
    # from them.
    # self.__theta = random.gamma(100,0.01,(self._K, 2))  # parameterization for \beta
    self.__theta = random.get("theta init").gamma(
        self._eta[0], self._eta[1], (self._K, 2))  # parameterization for \beta
    self.__phi = random.get("phi init").gamma(
        1, 1, (self._N, self._K))  # parameterization for \pi

    # temp = self.__theta/np.sum(self.__theta,1)[:,np.newaxis]
    # self._beta = temp[:,1]
    # self._pi = self.__phi/np.sum(self.__phi,1)[:,np.newaxis]
    self.update_pi_from_phi()
    self.update_beta_from_theta()
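# A minimal, self-contained sketch (reconstructed from the commented-out
# lines above; the real implementations live in the Learner base class and
# these helper names are hypothetical) of the re-parameterization that
# update_pi_from_phi() and update_beta_from_theta() perform:
def _pi_from_phi(phi):
    # each row of the (N, K) matrix phi is normalized onto the simplex
    return phi / np.sum(phi, 1)[:, np.newaxis]

def _beta_from_theta(theta):
    # beta_k = theta[k][1] / (theta[k][0] + theta[k][1]) for the (K, 2) theta
    temp = theta / np.sum(theta, 1)[:, np.newaxis]
    return temp[:, 1]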
def __sample_non_link_edge_for_test(self):
    """
    Sample one non-link edge for the test set from the network. We first
    randomly generate one edge, then check the conditions. If that edge
    passes all the conditions, return it.
    TODO: prevent the infinite loop
    """
    while True:
        firstIdx = random.get("graph init").randint(0, self.__N - 1)
        secondIdx = random.get("graph init").randint(0, self.__N - 1)

        if firstIdx == secondIdx:
            continue

        # ensure the first index is smaller than the second one.
        edge = (min(firstIdx, secondIdx), max(firstIdx, secondIdx))

        # check conditions:
        if edge in self.__linked_edges or edge in self.__held_out_map \
                or edge in self.__test_map:
            continue

        return edge
def __sample_non_link_edge_for_held_out(self):
    '''
    Sample one non-link edge for the held-out set from the network. We must
    make sure the edge has not been used already, so we check the
    conditions before adding it to the held-out set.
    TODO: add condition for checking the infinite loop
    '''
    while True:
        firstIdx = random.get("graph init").randint(0, self.__N - 1)
        secondIdx = random.get("graph init").randint(0, self.__N - 1)

        if firstIdx == secondIdx:
            continue

        # ensure the first index is smaller than the second one.
        edge = (min(firstIdx, secondIdx), max(firstIdx, secondIdx))

        # check conditions.
        if edge in self.__linked_edges or edge in self.__held_out_map:
            continue

        return edge
def __random_edge_sampling(self, mini_batch_size):
    if mini_batch_size >= self.__num_total_edges - self.__held_out_size \
            - len(self.__test_map):
        return self.__sample_full_training_set()

    mini_batch_set = Set()
    p = mini_batch_size

    while p > 0:
        sampled_linked_edges = random.get("minibatch sampler").sample(
            self.__linked_edges, mini_batch_size)
        for edge in sampled_linked_edges:
            if p <= 0:
                break
            if edge in self.__held_out_map or edge in self.__test_map \
                    or edge in mini_batch_set:
                continue
            mini_batch_set.add(edge)
            p -= 1

    return (mini_batch_set, len(self.__linked_edges) / mini_batch_size)
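# Why the scale factor: for a uniform sample B of links drawn from the full
# edge set E, the minibatch sum approximates the full-graph sum as
#
#     sum_{e in E} f(e)  ~=  (|E| / |B|) * sum_{e in B} f(e),
#
# so __update_beta() multiplies the accumulated gradients by the returned
# scale = len(self.__linked_edges) / mini_batch_size. Note this is Python 2
# integer division; a float scale would need an explicit 1.0 * numerator.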
def __update_beta(self, mini_batch, scale):
    '''
    Update beta for the mini_batch.
    '''
    grads = np.zeros((self._K, 2))  # gradients, K*2 dimension
    theta_sum = np.sum(self.__theta, 1)
    eps_t = self.__a * ((1 + self._step_count / self.__b)**-self.__c)

    for edge in mini_batch:
        y = 0
        if edge in self._network.get_linked_edges():
            y = 1
        i, j = edge

        probs = np.zeros(self._K)
        pi_sum = 0.0
        for k in range(0, self._K):
            pi_sum += self._pi[i][k] * self._pi[j][k]
            probs[k] = self._beta[k]**y * (1 - self._beta[k])**(1 - y) \
                * self._pi[i][k] * self._pi[j][k]

        prob_0 = self._epsilon**y * (1 - self._epsilon)**(1 - y) * (1 - pi_sum)
        prob_sum = np.sum(probs) + prob_0
        for k in range(0, self._K):
            grads[k][0] += (probs[k] / prob_sum) * \
                (abs(1 - y) / self.__theta[k][0] - 1 / theta_sum[k])
            grads[k][1] += (probs[k] / prob_sum) * \
                (abs(-y) / self.__theta[k][1] - 1 / theta_sum[k])

    # update theta
    noise = random.get("beta update").randn(self._K, 2)  # random noise.
    for k in range(0, self._K):
        for i in range(0, 2):
            self.__theta[k][i] = abs(
                self.__theta[k][i] + eps_t / 2.0 *
                (self._eta[i] - self.__theta[k][i] + scale * grads[k][i]) +
                eps_t**.5 * self.__theta[k][i]**.5 * noise[k][i])

    self.update_beta_from_theta()
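# The responsibilities computed above, written out: with y the observed link
# indicator for edge (i, j),
#
#     probs[k] = beta_k^y * (1 - beta_k)^(1 - y) * pi_ik * pi_jk
#     prob_0   = epsilon^y * (1 - epsilon)^(1 - y) * (1 - sum_k pi_ik pi_jk)
#
# probs[k] is the unnormalized probability that both endpoints interact
# through community k; prob_0 covers all mismatched community pairs via the
# background rate epsilon. theta then receives the same reflected Langevin
# step as phi, with `scale` undoing the minibatch subsampling.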
def __init_held_out_set(self):
    """
    Sample the held-out set. We draw an equal number of links and
    non-links from the whole graph.
    """
    p = self.__held_out_size / 2

    # Sample p linked edges from the network.
    if len(self.__linked_edges) < p:
        print "There are not enough linked edges to sample from. " \
              "Please use a smaller held-out ratio."

    sampled_linked_edges = random.get("graph init").sample(
        self.__linked_edges, p)
    for edge in sampled_linked_edges:
        self.__held_out_map[edge] = True
        self.__train_link_map[edge[0]].remove(edge[1])
        self.__train_link_map[edge[1]].remove(edge[0])

    # print sampled_linked_edges
    if False:   # debug: dump the sampled link part of the held-out set
        sys.stdout.write("Sampled part of held out set:\n")
        for m in sorted(sampled_linked_edges):
            a, b = m
            sys.stdout.write("(%d,%d) " % (a, b))
        sys.stdout.write("\n")

    # sample p non-linked edges from the network
    while p > 0:
        edge = self.__sample_non_link_edge_for_held_out()
        self.__held_out_map[edge] = False
        p -= 1

    if False:   # debug: dump the full held-out set
        sys.stdout.write("Held out set:\n")
        for m in sorted(self.__held_out_map):
            a, b = m
            sys.stdout.write("(%d,%d) " % (a, b))
        sys.stdout.write("\n")
def __stratified_random_node_sampling(self, num_pieces):
    """
    The stratified sampling approach gives more attention to link edges
    (an edge connects two nodes). The sampling process works like this:
    a) randomly choose one node $i$ from all nodes (1,....N)
    b) decide to choose link edges or non-link edges with (50%, 50%) probability.
    c) if we decide to sample link edges:
           return all the link edges for the chosen node $i$
       else:
           sample edges from all non-link edges for node $i$. The number of
           edges we sample equals (number of all non-link edges) / num_pieces.
    """
    # randomly select the node ID
    nodeId = random.get("minibatch sampler").randint(0, self.__N - 1)
    # decide to sample links or non-links
    flag = random.get("minibatch sampler").randint(0, 1)
    # flag=0: non-link edges   flag=1: link edges
    # sys.stderr.write ("Sample minibatch num_pieces %d minibatch size %d\n" % (num_pieces, (self.__N / self.__num_pieces)))

    mini_batch_set = Set()

    if flag == 0:
        # Sample non-link edges. This is an approximation, since
        # len(self.__train_link_map[nodeId]) is much smaller than N.
        mini_batch_size = int(self.__N / self.__num_pieces)
        p = mini_batch_size

        while p > 0:
            # Because of the sparsity, when we sample mini_batch_size*2
            # nodes, the list likely contains at least mini_batch_size
            # valid nodes.
            nodeList = random.sample_range("minibatch sampler", self.__N,
                                           mini_batch_size * 2)
            for neighborId in nodeList:
                if p <= 0:
                    break
                if neighborId == nodeId:
                    continue

                # check the condition, and insert into mini_batch_set if valid.
                edge = (min(nodeId, neighborId), max(nodeId, neighborId))
                if edge in self.__linked_edges or edge in self.__held_out_map or \
                        edge in self.__test_map or edge in mini_batch_set:
                    # print "Discard edge " + str(edge)
                    continue

                mini_batch_set.add(edge)
                p -= 1

        print "A Create mini batch size " + str(len(mini_batch_set)) + \
            " scale " + str(self.__N * self.__num_pieces)
        # for e in mini_batch_set:
        #     sys.stdout.write("%s " % str(e))
        # sys.stdout.write("\n")
        return (mini_batch_set, self.__N * self.__num_pieces)

    else:
        # Sample linked edges: return all of node i's training links.
        # print "train_link_map[" + str(nodeId) + "] size " + str(len(self.__train_link_map[nodeId]))
        for neighborId in self.__train_link_map[nodeId]:
            mini_batch_set.add((min(nodeId, neighborId),
                                max(nodeId, neighborId)))

        print "B Create mini batch size " + str(len(mini_batch_set)) + \
            " scale " + str(self.__N)
        return (mini_batch_set, self.__N)
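# A hedged sketch of how one training iteration might consume the sampler's
# (mini_batch, scale) pair. The actual driver loop is not part of this file,
# and __sample_neighbor_nodes() is an assumed helper name:
#
#     (mini_batch, scale) = self.__stratified_random_node_sampling(self.__num_pieces)
#     for node in set(n for edge in mini_batch for n in edge):
#         neighbors = self.__sample_neighbor_nodes(self.__num_node_sample, node)
#         self.__update_phi(node, neighbors)
#     self.update_pi_from_phi()
#     self.__update_beta(mini_batch, scale)
#     self._step_count += 1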