def __init__(self, av, gr: PermGnnGraph, num_hash_tables=10, subset_size=8):
    super(LSH, self).__init__()
    assert (subset_size <= av.HASHCODE_DIM)
    self.av = av
    self.gr = gr
    self.all_nodes = self.gr.get_num_nodes()
    self.num_hash_tables = num_hash_tables
    #No. of buckets in a hash table is 2^subset_size
    self.subset_size = subset_size
    self.powers_of_two = cudavar(
        self.av,
        torch.from_numpy(1 << np.arange(self.subset_size - 1, -1, -1)).type(
            torch.FloatTensor))
    self.hash_functions = None
    self.init_hash_functions()
    #This contains +1 or -1. Used for assigning nodes to buckets
    self.hashcode_mat = cudavar(self.av, torch.tensor([]))
    self.all_hash_tables = []
    self.candidate_set = np.zeros(
        (self.gr.get_num_nodes(), self.gr.get_num_nodes()))
def __init__(self, av, gr: PermGnnGraph):
    super(PermutationGenerator, self).__init__()
    self.av = av
    self.gr = gr
    self.features = nn.Embedding(self.gr.get_num_nodes(),
                                 self.gr.get_num_features())
    self.features.weight = nn.Parameter(cudavar(
        self.av, torch.FloatTensor(self.gr.node_features)),
                                        requires_grad=False)
    self.feature_dim = self.gr.get_num_features()
    self.adj_list = self.gr.adjacency_list
    #Max set size is one greater than max node outdegree, accounting for
    #the presence of the node itself in the set
    self.max_set_size = self.gr.get_max_node_outdegree() + 1
    #Lookup table set_size:mask. Given set_size k and max_set_size n, the
    #mask pattern sets the top-left k*k square to 1 inside an n*n array;
    #the remaining elements are 0
    self.set_size_to_mask_map = [
        torch.cat((torch.repeat_interleave(
            torch.tensor([1, 0]),
            torch.tensor([x, self.max_set_size - x])).repeat(x, 1),
                   torch.repeat_interleave(
                       torch.tensor([1, 0]),
                       torch.tensor([0, self.max_set_size
                                     ])).repeat(self.max_set_size - x, 1)))
        for x in range(1, self.max_set_size + 1)
    ]
    #List of tensors corresponding to each node. Each tensor is the input
    #sequence of neighbourhood features
    self.neighbour_features_all = [
        self.features(
            cudavar(self.av,
                    torch.LongTensor(sorted(list(self.adj_list[node])))))
        for node in range(self.gr.get_num_nodes())
    ]
    #numpy array of set sizes for all node ids. Used later for the
    #variable-length LSTM code
    self.set_size_all = np.array(
        [len(x) for x in self.neighbour_features_all])
    #Generate boolean mask for each node based on its set_size. Used for
    #masked Sinkhorn normalization
    self.sets_maskB_all = cudavar(
        self.av,
        torch.stack([
            self.set_size_to_mask_map[x - 1] == 0 for x in self.set_size_all
        ]))
    #Generates padded tensor of dim (num_nodes*max_set_size*feature_dimension)
    self.padded_neighbour_features_all = pad_sequence(
        self.neighbour_features_all, batch_first=True)
    self.latent_dim = self.av.PERM_NETWORK_LATENT_DIM
    self.output_dim = self.max_set_size
    self.linear1 = nn.Linear(self.feature_dim, self.latent_dim)
    self.relu1 = nn.ReLU()
    self.linear2 = nn.Linear(self.latent_dim, self.output_dim)
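# Illustrative sketch (not part of the pipeline): the set_size:mask lookup
# built above, shown for an assumed max_set_size n = 4 and set_size k = 2.
# Entry k-1 of the map has a k*k block of 1s in the top-left of an n*n array.
def _demo_set_size_mask():
    import torch
    n, k = 4, 2
    mask = torch.cat(
        (torch.repeat_interleave(torch.tensor([1, 0]),
                                 torch.tensor([k, n - k])).repeat(k, 1),
         torch.repeat_interleave(torch.tensor([1, 0]),
                                 torch.tensor([0, n])).repeat(n - k, 1)))
    print(mask)
    # tensor([[1, 1, 0, 0],
    #         [1, 1, 0, 0],
    #         [0, 0, 0, 0],
    #         [0, 0, 0, 0]])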
def init_non_nbr_mat(self, list_training_edges):
    for (a, b) in list_training_edges:
        self.non_nbr_mat[a][b] = 1
    z = cudavar(
        self.av, torch.zeros(self.gr.get_num_nodes(),
                             self.gr.get_num_nodes()))
    o = cudavar(
        self.av, torch.ones(self.gr.get_num_nodes(),
                            self.gr.get_num_nodes()))
    reverse = torch.where(self.non_nbr_mat == 0, o, z)
    self.non_nbr_mat = reverse
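# Minimal sketch of the inversion above, with assumed toy sizes: after the
# flip, entry (a, b) is 1 exactly when (a, b) is not a training edge.
def _demo_non_nbr_flip():
    import torch
    n = 3
    mat = torch.zeros(n, n)
    for (a, b) in [(0, 1), (2, 0)]:   # training edges
        mat[a][b] = 1
    non_nbr = torch.where(mat == 0, torch.ones(n, n), torch.zeros(n, n))
    print(non_nbr)
    # tensor([[1., 0., 1.],
    #         [1., 1., 1.],
    #         [0., 1., 1.]])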
def init_hash_functions(self):
    self.hash_functions = cudavar(self.av, torch.LongTensor([]))
    hash_code_dim = self.av.HASHCODE_DIM
    indices = list(range(hash_code_dim))
    for i in range(self.num_hash_tables):
        random.shuffle(indices)
        self.hash_functions = torch.cat(
            (self.hash_functions,
             cudavar(self.av,
                     torch.LongTensor([indices[:self.subset_size]]))),
            dim=0)
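# Sketch (assumed toy dimensions): each "hash function" is a random subset of
# subset_size bit positions out of the HASHCODE_DIM-bit code, one row per table.
def _demo_hash_function_subsets():
    import random
    import torch
    hash_code_dim, subset_size, num_tables = 8, 3, 2
    indices = list(range(hash_code_dim))
    functions = torch.LongTensor([])
    for _ in range(num_tables):
        random.shuffle(indices)
        functions = torch.cat(
            (functions, torch.LongTensor([indices[:subset_size]])), dim=0)
    print(functions.shape)  # torch.Size([2, 3]): each row indexes 3 of 8 bits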
def computeLoss(self, nodes):
    """
      :param  nodes : batch of node ids from range 0..NUM_NODES
      :return loss  : hinge ranking loss
    """
    loss = 0
    all_nodes = list(range(self.gr.get_num_nodes()))
    all_embeds = cudavar(self.av, torch.tensor([]))
    if self.av.TASK == "Multiperm":
        all_embeds_perms = []
        for rep in range(self.av.NUM_PERMS):
            temp = cudavar(self.av, torch.tensor([]))
            #batch the nodes to avoid memory-limit crashes on larger graphs
            for i in range(0, self.gr.get_num_nodes(), self.av.BATCH_SIZE):
                batch_nodes = all_nodes[i:i + self.av.BATCH_SIZE]
                temp = torch.cat((temp, self.forward(batch_nodes)), dim=0)
            all_embeds_perms.append(temp)
        all_embeds = torch.mean(torch.stack(all_embeds_perms), 0)
    else:
        #batch the nodes to avoid memory-limit crashes on larger graphs
        for i in range(0, self.gr.get_num_nodes(), self.av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + self.av.BATCH_SIZE]
            all_embeds = torch.cat((all_embeds, self.forward(batch_nodes)),
                                   dim=0)
    #Filter for query nodes
    nodes = list(set(self.gr.query_node_list).intersection(set(nodes)))
    for i in range(len(nodes)):
        selfemb = all_embeds[nodes[i]]
        nbrs = all_embeds[list(self.gr.query_node_nbr[nodes[i]])]
        nonnbrs = all_embeds[list(self.gr.query_node_non_nbr[nodes[i]])]
        #https://pytorch.org/docs/master/generated/torch.nn.CosineSimilarity.html
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        pos_scores = cos(nbrs, selfemb.unsqueeze(0))
        neg_scores = cos(nonnbrs, selfemb.unsqueeze(0))
        len_pos = pos_scores.shape[0]
        len_neg = neg_scores.shape[0]
        expanded_pos_scores = pos_scores.unsqueeze(1).expand(len_pos, len_neg)
        expanded_neg_scores = neg_scores.unsqueeze(0).expand(len_pos, len_neg)
        loss += torch.max(
            self.av.MARGIN + expanded_neg_scores - expanded_pos_scores,
            cudavar(self.av, torch.tensor([0.]))).sum()
    return loss
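# Sketch of the broadcasted hinge above, with assumed toy scores: every
# (positive, negative) pair contributes max(0, MARGIN + neg - pos), and the
# unsqueeze/expand pair replaces an explicit double loop.
def _demo_pairwise_hinge():
    import torch
    margin = 0.1
    pos_scores = torch.tensor([0.9, 0.7])       # similarities to neighbours
    neg_scores = torch.tensor([0.2, 0.8, 0.1])  # similarities to non-neighbours
    p = pos_scores.unsqueeze(1).expand(2, 3)
    n = neg_scores.unsqueeze(0).expand(2, 3)
    loss = torch.max(margin + n - p, torch.tensor([0.])).sum()
    print(loss)  # ~0.2: only the (pos=0.7, neg=0.8) pair violates the margin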
def fetch_permgnn_embeddings(av, gr: PermGnnGraph):
    avTask = av.TASK
    av.TASK = "PermGNN"
    pickle_fp = ("./data/embeddingPickles/" + av.TASK + "_" +
                 av.DATASET_NAME + "_tfrac_" + str(av.TEST_FRAC) +
                 "_vfrac_" + str(av.VAL_FRAC) + "_embedding_mat.pkl")
    if not os.path.exists(pickle_fp):
        query_nodes, \
        list_training_edges, \
        list_training_non_edges, \
        list_test_edges, \
        list_test_non_edges, \
        list_val_edges, \
        list_val_non_edges = fetch_lp_data_split(av, gr)
        prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                           list_training_non_edges, list_val_edges,
                           list_test_edges, list_val_non_edges,
                           list_test_non_edges)
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        permNet = PermutationGenerator(av, gr).to(device)
        permGNN = PermutationInvariantGNN(av, gr, permNet).to(device)
        #if VAL_FRAC is 0, fetch model weights from the last trained epoch;
        #else fetch the best-performing model on the validation dataset
        if av.VAL_FRAC == 0:
            checkpoint = load_model(av)
            logger.info("Loading latest trained model from training epoch %d",
                        checkpoint['epoch'])
        else:
            es = EarlyStoppingModule(av)
            checkpoint = es.load_best_model()
            logger.info(
                "Loading best validation result model from training epoch %d",
                checkpoint['epoch'])
        permGNN.load_state_dict(checkpoint['model_state_dict'])
        all_nodes = list(range(permGNN.gr.get_num_nodes()))
        all_embeds = cudavar(av, torch.tensor([]))
        for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
            set_size = permGNN.permNet.set_size_all[batch_nodes]
            neighbour_features = permGNN.permNet.padded_neighbour_features_all[
                batch_nodes]
            all_embeds = torch.cat(
                (all_embeds,
                 permGNN.getEmbeddingForFeatures(set_size,
                                                 neighbour_features).data),
                dim=0)
        logger.info("Creating permgnn embedding pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_embeds, f)
    else:
        logger.info("Loading permgnn embedding pickle from %s", pickle_fp)
        with open(pickle_fp, 'rb') as f:
            all_embeds = pickle.load(f)
    av.TASK = avTask
    return cudavar(av, all_embeds)
def computeLoss(self, nodes):
    """
      :param  nodes : batch of node ids from range 0..NUM_NODES
      :return       : the three hash-loss components (bit balance,
                      bit saturation, non-neighbour similarity) and
                      the batch size
    """
    loss1 = loss2 = loss3 = 0
    all_hashcodes = self.forward(nodes)
    num_nodes = len(nodes)
    for i in range(len(nodes)):
        selfcode = all_hashcodes[i]
        loss1 = loss1 + torch.abs(torch.sum(selfcode))
        loss2 = loss2 + torch.norm(torch.abs(selfcode) - 1, p=1)
    indices = cudavar(self.av, torch.tensor(nodes))
    non_nbrs = torch.index_select(
        torch.index_select(self.non_nbr_mat, 0, indices), 1, indices)
    similarity_mat = torch.mul(
        torch.abs(torch.mm(all_hashcodes,
                           torch.transpose(all_hashcodes, 0, 1))), non_nbrs)
    loss3 = torch.sum(similarity_mat) - torch.sum(
        torch.diagonal(similarity_mat))
    return loss1, loss2, loss3, num_nodes
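# Sketch of the three regularizers above on an assumed toy batch: loss1 pushes
# each code toward an equal number of +/- bits, loss2 pushes bits toward +1/-1,
# loss3 penalizes similar codes for non-neighbour pairs (diagonal excluded).
def _demo_hash_loss_terms():
    import torch
    codes = torch.tensor([[0.9, -0.8], [0.7, 0.6]])  # tanh outputs in (-1, 1)
    non_nbrs = torch.tensor([[0., 1.], [1., 0.]])    # 1 = non-neighbour pair
    loss1 = sum(torch.abs(torch.sum(c)) for c in codes)            # balance
    loss2 = sum(torch.norm(torch.abs(c) - 1, p=1) for c in codes)  # saturation
    sim = torch.mul(torch.abs(codes @ codes.t()), non_nbrs)
    loss3 = torch.sum(sim) - torch.sum(torch.diagonal(sim))
    print(loss1.item(), loss2.item(), loss3.item())  # 1.4 1.0 0.3 (approx.)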
def compute_scores(av, permGNN, query_nodes, list_test_edges,
                   list_test_non_edges):
    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    all_embeds = cudavar(av, torch.tensor([]))
    #batch the nodes to avoid memory-limit crashes on larger graphs
    for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
        batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
        all_embeds = torch.cat(
            (all_embeds, permGNN.forward(batch_nodes).data), dim=0)
    return compute_scores_from_embeds(av, all_embeds, query_nodes,
                                      list_test_edges, list_test_non_edges)
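# Sketch of the batching idiom used throughout: embed nodes in BATCH_SIZE
# chunks and concatenate, bounding peak memory. `embed_fn` is a hypothetical
# stand-in for permGNN.forward.
def _demo_batched_embedding():
    import torch
    def embed_fn(nodes):
        return torch.randn(len(nodes), 8)
    num_nodes, batch_size = 10, 4
    all_nodes = list(range(num_nodes))
    all_embeds = torch.tensor([])
    for i in range(0, num_nodes, batch_size):
        batch = all_nodes[i:i + batch_size]
        all_embeds = torch.cat((all_embeds, embed_fn(batch)), dim=0)
    print(all_embeds.shape)  # torch.Size([10, 8])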
def assign_bucket(self, function_id, node_hash_code):
    func = self.hash_functions[function_id]
    #convert sequence of -1 and 1 to binary by replacing -1s with 0
    binary_id = torch.max(
        torch.index_select(node_hash_code, dim=0, index=func),
        cudavar(self.av, torch.tensor([0.])))
    #map binary sequence to the int which is the bucket id
    bucket_id = self.powers_of_two @ binary_id
    return bucket_id.item()
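# Sketch of assign_bucket on an assumed 4-bit hash function: -1s are clamped
# to 0 and the resulting binary string is read as an integer bucket id.
def _demo_assign_bucket():
    import numpy as np
    import torch
    subset_size = 4
    powers_of_two = torch.from_numpy(
        1 << np.arange(subset_size - 1, -1, -1)).type(torch.FloatTensor)
    bits = torch.tensor([1., -1., 1., 1.])            # selected hashcode bits
    binary_id = torch.max(bits, torch.tensor([0.]))   # [1, 0, 1, 1]
    bucket_id = powers_of_two @ binary_id             # 8 + 0 + 2 + 1
    print(bucket_id.item())                           # 11.0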
def getEmbeddingForFeatures(self, set_size, neighbour_features,
                            diagnostic_mode=False):
    """
      :param  set_size           : neighbourhood set sizes for each node.
                                   Needed for variable-length LSTM code
      :param  neighbour_features : permutation of neighbour feature vectors
                                   for each node. For node_set_size k and
                                   max_set_size n, the last (n-k) rows are
                                   padded with 0
      :return node_embeddings    : embedding dim currently same as input
                                   feature dimension
    """
    #The 3 steps pack_padded_sequence -> LSTM -> pad_packed_sequence below
    #implement a variable-length LSTM, so 0-padded rows are not fed to the
    #LSTM network
    packed_neighbour_features = pack_padded_sequence(neighbour_features,
                                                     set_size,
                                                     batch_first=True,
                                                     enforce_sorted=False)
    packed_lstm_output, (ht, ct) = self.lstm(packed_neighbour_features)
    padded_lstm_output = pad_packed_sequence(packed_lstm_output,
                                             batch_first=True)
    #appends a 1 in the bias column, except for the rows which are pads
    aug_lstm_output = torch.cat(
        (padded_lstm_output[0],
         pad_sequence([
             cudavar(self.av, torch.ones([x])).unsqueeze(0).t()
             for x in padded_lstm_output[1].tolist()
         ], batch_first=True)),
        dim=2)
    node_embeddings = torch.sum(self.fully_connected_layer(aug_lstm_output),
                                dim=1)
    #diagnostic mode was added later to instrument sensitivity to
    #permutations across layers
    if diagnostic_mode:
        lstm_output_flat = padded_lstm_output[0].flatten(1)
        zero_pad = cudavar(
            self.av,
            torch.zeros(
                padded_lstm_output[0].shape[0],
                (self.permNet.max_set_size - padded_lstm_output[0].shape[1]) *
                padded_lstm_output[0].shape[2]))
        final = torch.cat((lstm_output_flat, zero_pad), 1)
        return final.data, node_embeddings.data
    else:
        return node_embeddings
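# Sketch of the pack -> LSTM -> unpack pattern above, with assumed toy shapes:
# packing ensures the zero-padded rows never reach the LSTM cells.
def _demo_variable_length_lstm():
    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
    batch, max_len, feat, hidden = 3, 5, 4, 6
    x = torch.randn(batch, max_len, feat)  # zero-padded feature sequences
    lengths = torch.tensor([5, 2, 3])      # true set sizes per node
    lstm = nn.LSTM(feat, hidden, num_layers=1, batch_first=True)
    packed = pack_padded_sequence(x, lengths, batch_first=True,
                                  enforce_sorted=False)
    out, (ht, ct) = lstm(packed)
    padded, out_lengths = pad_packed_sequence(out, batch_first=True)
    print(padded.shape, out_lengths)  # torch.Size([3, 5, 6]) tensor([5, 2, 3])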
def __init__(self, av, gr: PermGnnGraph):
    super(HashCodeGenerator, self).__init__()
    self.av = av
    self.gr = gr
    self.all_embeddings = nn.Embedding(self.gr.get_num_nodes(),
                                       self.av.EMBEDDING_DIM)
    self.non_nbr_mat = cudavar(
        self.av, torch.zeros(self.gr.get_num_nodes(),
                             self.gr.get_num_nodes()))
    #Reusing PERM_NETWORK_LATENT_DIM here for convenience
    self.latent_dim = self.av.PERM_NETWORK_LATENT_DIM
    self.hash_linear1 = nn.Linear(self.av.EMBEDDING_DIM,
                                  self.av.HASHCODE_DIM)
    self.hash_tanh1 = nn.Tanh()
    nn.init.normal_(self.hash_linear1.weight)
def run_graph_lp_hash_gaussian(av, gr: PermGnnGraph):
    pickle_fp = (av.DIR_PATH + "/data/hashcodePickles/" + av.TASK +
                 "_gaussian_" + av.DATASET_NAME + "_tfrac_" +
                 str(av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) +
                 "_hashcode_mat.pkl")
    if not os.path.exists(pickle_fp):
        #fetch permGNN embeddings
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        all_embeds = fetch_permgnn_embeddings(av, gr)
        fp = av.DIR_PATH + "/data/hashcodePickles/gauss_hplanes_D_16.pkl"
        hplanes = pickle.load(open(fp, 'rb'))
        projections = all_embeds.cpu().numpy() @ np.transpose(hplanes)
        hcode = np.sign(projections)
        all_hashcodes = cudavar(av, torch.tensor(hcode))
        logger.info("Dumping gaussian hashcode pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_hashcodes, f)
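# Sketch of the random-hyperplane (SimHash-style) coding above, with assumed
# dimensions: one bit per hyperplane, from the sign of the projection.
def _demo_gaussian_hashcodes():
    import numpy as np
    rng = np.random.default_rng(0)
    embeds = rng.standard_normal((5, 16))   # 5 embeddings of dimension 16
    hplanes = rng.standard_normal((8, 16))  # 8 gaussian hyperplanes -> 8 bits
    hcode = np.sign(embeds @ hplanes.T)     # shape (5, 8), entries in {-1, 0, 1}
    print(hcode.shape)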
def compute_loss(av, gr, all_embeds):
    loss = 0
    nodes = gr.query_node_list
    for i in range(len(nodes)):
        selfemb = all_embeds[nodes[i]]
        nbrs = all_embeds[list(gr.query_node_nbr[nodes[i]])]
        nonnbrs = all_embeds[list(gr.query_node_non_nbr[nodes[i]])]
        #https://pytorch.org/docs/master/generated/torch.nn.CosineSimilarity.html
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        pos_scores = cos(nbrs, selfemb.unsqueeze(0))
        neg_scores = cos(nonnbrs, selfemb.unsqueeze(0))
        len_pos = pos_scores.shape[0]
        len_neg = neg_scores.shape[0]
        expanded_pos_scores = pos_scores.unsqueeze(1).expand(len_pos, len_neg)
        expanded_neg_scores = neg_scores.unsqueeze(0).expand(len_pos, len_neg)
        loss += torch.max(
            av.MARGIN + expanded_neg_scores - expanded_pos_scores,
            cudavar(av, torch.tensor([0.]))).sum()
    return loss.item()
def __init__(self, av, gr: PermGnnGraph, permNet: PermutationGenerator):
    super(PermutationInvariantGNN, self).__init__()
    self.av = av
    self.gr = gr
    self.features = nn.Embedding(self.gr.get_num_nodes(),
                                 self.gr.get_num_features())
    self.features.weight = nn.Parameter(cudavar(
        self.av, torch.FloatTensor(self.gr.node_features)),
                                        requires_grad=False)
    self.adj_list = self.gr.adjacency_list
    self.lstm_input_size = self.gr.get_num_features()
    self.lstm_hidden_size = self.av.LSTM_HIDDEN_DIM
    self.fclayer_output_size = self.av.EMBEDDING_DIM
    #LSTM layer init
    self.lstm = nn.LSTM(self.lstm_input_size,
                        self.lstm_hidden_size,
                        num_layers=1,
                        batch_first=True)
    #FC layer init. Bias is folded in with the weight matrix, so
    #aug_lstm_output is generated in getEmbeddingForFeatures to compensate
    self.fully_connected_layer = nn.Linear(self.lstm_hidden_size + 1,
                                           self.fclayer_output_size,
                                           bias=False)
    self.permNet = permNet
def init_hash_code_mat(self, all_hashcodes):
    self.hashcode_mat = cudavar(self.av, torch.sign(all_hashcodes))
    #torch.sign maps exact zeros to 0; replace those bits with +1 so every
    #entry of hashcode_mat is +1 or -1
    if (self.hashcode_mat == 0).any():
        logger.info("Hashcode had 0 bits. Replacing all with 1")
        self.hashcode_mat[self.hashcode_mat == 0] = 1
def compute_scores_from_embeds(av, all_embeds, query_nodes, list_test_edges,
                               list_test_non_edges):
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    #per query node
    all_qnode_ap = []
    all_qnode_rr = []
    for qnode in query_nodes:
        qnode_edges = list(
            filter(lambda x: x[0] == qnode or x[1] == qnode,
                   list_test_edges))
        qnode_non_edges = list(
            filter(lambda x: x[0] == qnode or x[1] == qnode,
                   list_test_non_edges))
        if len(qnode_edges) == 0 or len(qnode_non_edges) == 0:
            continue
        a, b = zip(*qnode_edges)
        self_tensors = torch.index_select(all_embeds,
                                          dim=0,
                                          index=cudavar(av, torch.tensor(a)))
        nbr_tensors = torch.index_select(all_embeds,
                                         dim=0,
                                         index=cudavar(av, torch.tensor(b)))
        pos_scores = cos(self_tensors, nbr_tensors)
        a, b = zip(*qnode_non_edges)
        self_tensors = torch.index_select(all_embeds,
                                          dim=0,
                                          index=cudavar(av, torch.tensor(a)))
        nbr_tensors = torch.index_select(all_embeds,
                                         dim=0,
                                         index=cudavar(av, torch.tensor(b)))
        neg_scores = cos(self_tensors, nbr_tensors)
        if av.has_cuda and av.want_cuda:
            all_scores = torch.cat((pos_scores, neg_scores)).cpu().numpy()
        else:
            all_scores = torch.cat((pos_scores, neg_scores)).numpy()
        all_labels = np.hstack(
            [np.ones(len(pos_scores)),
             np.zeros(len(neg_scores))])
        auc_score = roc_auc_score(all_labels, all_scores)
        ap_score = average_precision_score(all_labels, all_scores)
        so = np.argsort(all_scores)[::-1]
        labels_rearranged = all_labels[so]
        rr_score = 1 / (labels_rearranged.tolist().index(1) + 1)
        all_qnode_ap.append(ap_score)
        all_qnode_rr.append(rr_score)
    #aggregated over all test pairs
    a, b = zip(*list_test_edges)
    self_tensors = torch.index_select(all_embeds,
                                      dim=0,
                                      index=cudavar(av, torch.tensor(a)))
    nbr_tensors = torch.index_select(all_embeds,
                                     dim=0,
                                     index=cudavar(av, torch.tensor(b)))
    pos_scores = cos(self_tensors, nbr_tensors)
    a, b = zip(*list_test_non_edges)
    self_tensors = torch.index_select(all_embeds,
                                      dim=0,
                                      index=cudavar(av, torch.tensor(a)))
    nbr_tensors = torch.index_select(all_embeds,
                                     dim=0,
                                     index=cudavar(av, torch.tensor(b)))
    neg_scores = cos(self_tensors, nbr_tensors)
    if av.has_cuda and av.want_cuda:
        all_scores = torch.cat((pos_scores, neg_scores)).cpu().numpy()
    else:
        all_scores = torch.cat((pos_scores, neg_scores)).numpy()
    all_labels = np.hstack(
        [np.ones(len(pos_scores)),
         np.zeros(len(neg_scores))])
    auc_score = roc_auc_score(all_labels, all_scores)
    ap_score = average_precision_score(all_labels, all_scores)
    return auc_score, ap_score, np.mean(all_qnode_ap), np.mean(all_qnode_rr)
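# Sketch of the reciprocal-rank step above, on assumed toy scores: sort
# descending and take 1 / (rank of the first true edge).
def _demo_reciprocal_rank():
    import numpy as np
    all_scores = np.array([0.2, 0.9, 0.5])
    all_labels = np.array([1., 0., 0.])      # the true edge scored lowest
    so = np.argsort(all_scores)[::-1]        # [1, 2, 0]
    labels_rearranged = all_labels[so]       # [0., 0., 1.]
    rr_score = 1 / (labels_rearranged.tolist().index(1) + 1)
    print(rr_score)  # 0.333...: the true edge sits at rank 3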
def run_graph_lp_hash(av, gr: PermGnnGraph):
    pickle_fp = (av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_" +
                 av.DATASET_NAME + "_tfrac_" + str(av.TEST_FRAC) +
                 "_vfrac_" + str(av.VAL_FRAC) + "_L1_" + str(av.LAMBDA1) +
                 "_L2_" + str(av.LAMBDA2) + "_hashcode_mat.pkl")
    if not os.path.exists(pickle_fp):
        #fetch permGNN embeddings
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        query_nodes, \
        list_training_edges, \
        list_training_non_edges, \
        list_test_edges, \
        list_test_non_edges, \
        list_val_edges, \
        list_val_non_edges = fetch_lp_data_split(av, gr)
        prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                           list_training_non_edges, list_val_edges,
                           list_test_edges, list_val_non_edges,
                           list_test_non_edges)
        all_embeds = fetch_permgnn_embeddings(av, gr)
        hashCodeGenerator = HashCodeGenerator(av, gr).to(device)
        hashCodeGenerator.init_embeddings(all_embeds)
        hashCodeGenerator.init_non_nbr_mat(list_training_edges)
        es = EarlyStoppingModule(av, 50, 0.001)
        optimizerFunc = torch.optim.SGD(hashCodeGenerator.parameters(),
                                        lr=av.LEARNING_RATE_FUNC)
        nodes = list(range(gr.get_num_nodes()))
        epoch = 0
        #if VAL_FRAC is 0, train the model for NUM_EPOCHS;
        #else train until the early-stopping criterion is met
        while av.VAL_FRAC != 0 or epoch < av.NUM_EPOCHS:
            random.shuffle(nodes)
            start_time = time.time()
            totalEpochLoss = 0
            for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
                nodes_batch = nodes[i:i + av.BATCH_SIZE]
                hashCodeGenerator.zero_grad()
                loss1, loss2, loss3, num_nodes = \
                    hashCodeGenerator.computeLoss(nodes_batch)
                totalLoss = (av.LAMBDA1 / num_nodes) * loss1 + \
                            (av.LAMBDA2 / num_nodes) * loss2 + \
                            ((1 - (av.LAMBDA1 + av.LAMBDA2)) /
                             (num_nodes**2)) * loss3
                totalLoss.backward()
                optimizerFunc.step()
                totalEpochLoss = totalEpochLoss + totalLoss.item()
            end_time = time.time()
            logger.info("Epoch: %d totalEpochLoss: %f time: %.2f", epoch,
                        totalEpochLoss, end_time - start_time)
            if av.VAL_FRAC != 0:
                if es.check([-totalEpochLoss], hashCodeGenerator, epoch):
                    break
            epoch += 1
        if av.has_cuda:
            logger.info("Max gpu memory used: %.6f",
                        torch.cuda.max_memory_allocated(device=0) / (1024**3))
        #generate and dump hashcode pickles
        all_nodes = list(range(gr.get_num_nodes()))
        all_hashcodes = cudavar(av, torch.tensor([]))
        for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
            all_hashcodes = torch.cat(
                (all_hashcodes, hashCodeGenerator.forward(batch_nodes).data),
                dim=0)
        logger.info("Dumping trained hashcode pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_hashcodes, f)
def compute_lp_scores(self, all_embeds, query_nodes, candidate_list, k,
                      use_tensor=False):
    """
      return aggregate AP/NDCG and per-query MAP/mean-NDCG for the given
      candidate_list, along with timing info
    """
    agglo_k = len(query_nodes) * k
    time_dict = {}
    time_dict['start_score_computation'] = time.time()
    if use_tensor:
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        a, b = zip(*candidate_list)
        self_tensors = torch.index_select(all_embeds,
                                          dim=0,
                                          index=cudavar(
                                              self.av, torch.tensor(a)))
        nbr_tensors = torch.index_select(all_embeds,
                                         dim=0,
                                         index=cudavar(
                                             self.av, torch.tensor(b)))
        scores = cos(self_tensors, nbr_tensors).tolist()
    else:
        cos = nn.CosineSimilarity(dim=0, eps=1e-6)
        scores = []
        for (a, b) in candidate_list:
            scores.append(cos(all_embeds[a], all_embeds[b]))
        scores = torch.stack(scores).tolist()
    time_dict['end_score_computation'] = time.time()
    time_dict['start_heap_procedure'] = time.time()
    #bounded min-heaps: one global heap of size agglo_k and one per-query
    #heap of size k
    score_heap = []
    heap_size = 0
    qnode_heap_dict = {}
    qnode_heap_size_dict = {}
    for node in query_nodes:
        qnode_heap_dict[node] = []
        qnode_heap_size_dict[node] = 0
    for i in range(len(candidate_list)):
        if heap_size < agglo_k:
            heap_size = heap_size + 1
            heapq.heappush(score_heap, (scores[i], candidate_list[i]))
        else:
            heapq.heappushpop(score_heap, (scores[i], candidate_list[i]))
        for node in candidate_list[i]:
            if node in query_nodes:
                if qnode_heap_size_dict[node] < k:
                    qnode_heap_size_dict[node] = \
                        qnode_heap_size_dict[node] + 1
                    heapq.heappush(qnode_heap_dict[node],
                                   (scores[i], candidate_list[i]))
                else:
                    heapq.heappushpop(qnode_heap_dict[node],
                                      (scores[i], candidate_list[i]))
    time_dict['end_heap_procedure'] = time.time()
    scores, predicted_edges = list(zip(*score_heap))
    all_scores = list(scores)
    all_labels = np.array(
        [self.candidate_set[a][b] for (a, b) in list(predicted_edges)])
    all_labels[all_labels == -1] = 0
    if np.all(all_labels == 1):
        ap_score = 1
    elif np.all(all_labels == 0):
        ap_score = 0
    else:
        ap_score = average_precision_score(all_labels, all_scores)
    ndcg = ndcg_score([all_labels], [all_scores])
    ap_score_agglo = ap_score
    ndcg_score_agglo = ndcg
    all_qnode_ap = []
    all_qnode_ndcg = []
    for qnode in query_nodes:
        if qnode_heap_size_dict[qnode] == 0:
            continue
        scores, predicted_edges = list(zip(*qnode_heap_dict[qnode]))
        all_scores = list(scores)
        all_labels = np.array(
            [self.candidate_set[a][b] for (a, b) in list(predicted_edges)])
        all_labels[all_labels == -1] = 0
        if np.all(all_labels == 1):
            ap_score = 1
            ndcg = 1
        elif np.all(all_labels == 0):
            ap_score = 0
            ndcg = 0
        else:
            ap_score = average_precision_score(all_labels, all_scores)
            ndcg = ndcg_score([all_labels], [all_scores])
        all_qnode_ap.append(ap_score)
        all_qnode_ndcg.append(ndcg)
    return ap_score_agglo, ndcg_score_agglo, np.mean(all_qnode_ap), \
        np.mean(all_qnode_ndcg), time_dict
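# Sketch of the bounded min-heap used above, with assumed toy pairs: once the
# heap holds k items, heappushpop evicts the current minimum so only the k
# highest-scoring candidate edges survive.
def _demo_topk_heap():
    import heapq
    k = 3
    scored_pairs = [(0.4, (0, 1)), (0.9, (0, 2)), (0.1, (1, 2)),
                    (0.7, (2, 3))]
    heap, size = [], 0
    for item in scored_pairs:
        if size < k:
            heapq.heappush(heap, item)
            size += 1
        else:
            heapq.heappushpop(heap, item)
    print(sorted(heap, reverse=True))
    # [(0.9, (0, 2)), (0.7, (2, 3)), (0.4, (0, 1))]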
def lp_permute_test_result(av, gr: PermGnnGraph):
    query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av, gr)
    prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                       list_training_non_edges, list_val_edges,
                       list_test_edges, list_val_non_edges,
                       list_test_non_edges)
    device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
    permNet = PermutationGenerator(av, gr).to(device)
    permGNN = PermutationInvariantGNN(av, gr, permNet).to(device)
    #if VAL_FRAC is 0, fetch model weights from the last trained epoch;
    #else fetch the best-performing model on the validation dataset
    if av.VAL_FRAC == 0:
        checkpoint = load_model(av)
        logger.info("Loading latest trained model from training epoch %d",
                    checkpoint['epoch'])
    else:
        es = EarlyStoppingModule(av)
        checkpoint = es.load_best_model()
        logger.info(
            "Loading best validation result model from training epoch %d",
            checkpoint['epoch'])
    permGNN.load_state_dict(checkpoint['model_state_dict'])

    logger.info("Test scores with canonical input sequence")
    start_time = time.time()
    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    all_embeds = cudavar(av, torch.tensor([]))
    #batch the nodes to avoid memory-limit crashes on larger graphs
    for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
        batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
        set_size = permGNN.permNet.set_size_all[batch_nodes]
        neighbour_features = permGNN.permNet.padded_neighbour_features_all[
            batch_nodes]
        all_embeds = torch.cat(
            (all_embeds,
             permGNN.getEmbeddingForFeatures(set_size,
                                             neighbour_features).data),
            dim=0)
    auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(
        av, all_embeds, query_nodes, list_test_edges, list_test_non_edges)
    end_time = time.time()
    logger.info(
        "auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",
        auc_score, ap_score, map_score, mrr_score, end_time - start_time)

    logger.info("Test scores with randomly permuted input sequence")
    for num_run in range(10):
        start_time = time.time()
        all_nodes = list(range(permGNN.gr.get_num_nodes()))
        all_embeds = cudavar(av, torch.tensor([]))
        #permute neighbour features
        perm_neighbour_features = pad_sequence(
            [mat[torch.randperm(int(size))] for (mat, size) in zip(
                permGNN.permNet.padded_neighbour_features_all,
                permGNN.permNet.set_size_all)],
            batch_first=True)
        #batch the nodes to avoid memory-limit crashes on larger graphs
        for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
            set_size = permGNN.permNet.set_size_all[batch_nodes]
            neighbour_features = perm_neighbour_features[batch_nodes]
            all_embeds = torch.cat(
                (all_embeds,
                 permGNN.getEmbeddingForFeatures(set_size,
                                                 neighbour_features).data),
                dim=0)
        auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(
            av, all_embeds, query_nodes, list_test_edges, list_test_non_edges)
        end_time = time.time()
        logger.info(
            "auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",
            auc_score, ap_score, map_score, mrr_score, end_time - start_time)
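# Sketch of the per-node permutation above, with assumed toy shapes: indexing
# with torch.randperm(size) shuffles only the `size` real neighbour rows and
# drops the padding rows, which pad_sequence then restores.
def _demo_row_permutation():
    import torch
    mat = torch.arange(12.).reshape(4, 3)  # padded features, max_set_size 4
    size = 2                               # true set size; rows 2-3 are pads
    permuted = mat[torch.randperm(int(size))]
    print(permuted.shape)  # torch.Size([2, 3]): the 2 real rows, reordered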
def performance_analysis(av, gr: PermGnnGraph):
    query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av, gr)
    prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                       list_training_non_edges, list_val_edges,
                       list_test_edges, list_val_non_edges,
                       list_test_non_edges)
    #num_perms = 5
    num_perms = 1
    all_info = generate_global_permutations(av, gr, num_perms)
    device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
    permNet = PermutationGenerator(av, gr).to(device)
    permGNN = PermutationInvariantGNN(av, gr, permNet).to(device)
    #if VAL_FRAC is 0, fetch model weights from the last trained epoch;
    #else fetch the best-performing model on the validation dataset
    if av.VAL_FRAC == 0:
        checkpoint = load_model(av)
        logger.info("Loading latest trained model from training epoch %d",
                    checkpoint['epoch'])
    else:
        es = EarlyStoppingModule(av)
        checkpoint = es.load_best_model()
        logger.info(
            "Loading best validation result model from training epoch %d",
            checkpoint['epoch'])
    permGNN.load_state_dict(checkpoint['model_state_dict'])
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    canonical_lstm_op = cudavar(av, torch.tensor([]))
    canonical_embeds = cudavar(av, torch.tensor([]))
    #batch the nodes to avoid memory-limit crashes on larger graphs
    for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
        batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
        set_size = permGNN.permNet.set_size_all[batch_nodes]
        neighbour_features = permGNN.permNet.padded_neighbour_features_all[
            batch_nodes]
        lstm_op, embeds = permGNN.getEmbeddingForFeatures(
            set_size, neighbour_features, True)
        canonical_lstm_op = torch.cat((canonical_lstm_op, lstm_op), dim=0)
        canonical_embeds = torch.cat((canonical_embeds, embeds), dim=0)
    canonical_inputs = \
        permGNN.permNet.padded_neighbour_features_all.flatten(1)
    canonical_tr_loss = compute_loss(av, gr, canonical_embeds)
    for sample_frac in [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45,
                        0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                        0.95, 1]:
        for perm_type in ['rand', 'rev']:
            for n_perm in range(num_perms):
                perm_info = all_info[sample_frac][perm_type][n_perm]
                all_embeds = cudavar(av, torch.tensor([]))
                all_lstm_op = cudavar(av, torch.tensor([]))
                #permute neighbour features
                perm_neighbour_features = []
                for node in range(gr.get_num_nodes()):
                    node_feats_orig = \
                        permGNN.permNet.padded_neighbour_features_all[node]
                    node_feats_perm = \
                        node_feats_orig[torch.tensor(perm_info[node])]
                    perm_neighbour_features.append(node_feats_perm)
                perm_neighbour_features = pad_sequence(
                    perm_neighbour_features, batch_first=True)
                #batch the nodes to avoid memory-limit crashes on larger graphs
                for i in range(0, permGNN.gr.get_num_nodes(), av.BATCH_SIZE):
                    batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
                    set_size = permGNN.permNet.set_size_all[batch_nodes]
                    neighbour_features = perm_neighbour_features[batch_nodes]
                    lstm_op, embeds = permGNN.getEmbeddingForFeatures(
                        set_size, neighbour_features, True)
                    all_lstm_op = torch.cat((all_lstm_op, lstm_op), dim=0)
                    all_embeds = torch.cat((all_embeds, embeds), dim=0)
                perm_info['inputs_sens_score_list'] = cos(
                    canonical_inputs, perm_neighbour_features.flatten(1))
                perm_info['lstm_op_sens_score_list'] = cos(
                    canonical_lstm_op, all_lstm_op)
                perm_info['embeds_sens_score_list'] = cos(
                    canonical_embeds, all_embeds)
                perm_tr_loss = compute_loss(av, gr, all_embeds)
                #relative change in training loss under the permutation
                perm_info['loss_var'] = \
                    abs(perm_tr_loss - canonical_tr_loss) / canonical_tr_loss
    fname = (av.DIR_PATH + "/data/KTAU_var_data/" + "Ktau_variation_data" +
             "_" + av.TASK + "_" + av.DATASET_NAME + "_tfrac_" +
             str(av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_data.pkl")
    pickle.dump(all_info, open(fname, "wb"))
def forward(self, nodes):
    node_embeddings = self.all_embeddings(
        cudavar(self.av, torch.LongTensor(nodes)))
    node_hashcodes = self.hash_tanh1(self.hash_linear1(node_embeddings))
    return node_hashcodes