def __call__(self, data):
    # find idx of landmark
    _, node_idx = knn(data.pos, data[self.key].view(-1, 3), k=1)

    # extract patch around vertex
    mask = self._mask_geodesic_patch(node_idx, data)
    vert_indices = torch.nonzero(mask).view(-1)

    # update data structure
    data.pos = data.pos[mask]
    data.edge_index, data.edge_attr = subgraph(
        vert_indices, data.edge_index, data.edge_attr, relabel_nodes=True)
    if hasattr(data, 'x'):
        data.x = data.x[mask]
    if hasattr(data, 'face'):
        # only keep faces whose 3 vertices are all in the mask
        data.face = data.face[:, (
            data.face[..., None] == vert_indices.view(-1)).any(-1).all(0)]
        # remap faces to match the new vertex indices
        index_mapping = torch.zeros(mask.shape, dtype=torch.long)
        index_mapping[mask] = torch.arange(data.pos.shape[0])
        data.face = index_mapping[data.face]
    return data
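# A minimal toy illustration (not from the original code) of the face-remapping
# trick above: index_mapping scatters new consecutive ids onto the surviving
# vertices so old vertex ids can be looked up directly.
import torch

mask = torch.tensor([False, True, True, False, True])
index_mapping = torch.zeros(mask.shape, dtype=torch.long)
index_mapping[mask] = torch.arange(int(mask.sum()))
face = torch.tensor([[1, 2, 4]]).t()   # one face whose vertices all survive the mask
print(index_mapping[face].t())         # tensor([[0, 1, 2]])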
def subgraph(self, subset: Tensor):
    r"""Returns the induced subgraph given by the node indices
    :obj:`subset`.

    Args:
        subset (LongTensor or BoolTensor): The nodes to keep.
    """
    out = subgraph(subset, self.edge_index, relabel_nodes=True,
                   num_nodes=self.num_nodes, return_edge_mask=True)
    edge_index, _, edge_mask = out

    if subset.dtype == torch.bool:
        num_nodes = int(subset.sum())
    else:
        num_nodes = subset.size(0)

    data = copy.copy(self)

    for key, value in data:
        if key == 'edge_index':
            data.edge_index = edge_index
        elif key == 'num_nodes':
            data.num_nodes = num_nodes
        elif isinstance(value, Tensor):
            if self.is_node_attr(key):
                data[key] = value[subset]
            elif self.is_edge_attr(key):
                data[key] = value[edge_mask]

    return data
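# Minimal usage sketch of the Data.subgraph() method defined above, assuming a
# standard torch_geometric.data.Data object; tensor values are illustrative.
import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
data = Data(x=torch.randn(3, 4), edge_index=edge_index)

keep = torch.tensor([True, True, False])   # boolean node mask
sub = data.subgraph(keep)                  # keeps nodes 0 and 1, relabels edges
print(sub.num_nodes, sub.edge_index.tolist())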
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = (split_idx["train"], split_idx["valid"],
                                          split_idx["test"])
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data['train_edges_false'] = induced_edges_train, neg_edges_train
        data['val_edges'], data['val_edges_false'] = induced_edges_valid, neg_edges_valid
        data['test_edges'], data['test_edges_false'] = induced_edges_test, neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath, args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
                test_edges, test_edges_false = mask_edges(
                    adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data['train_edges_false'] = train_edges, train_edges_false
            data['val_edges'], data['val_edges_false'] = val_edges, val_edges_false
            data['test_edges'], data['test_edges_false'] = test_edges, test_edges_false

    data['adj_train_norm'], data['features'] = process(data['adj_train'], data['features'],
                                                       args.normalize_adj, args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
import networkx as nx

from torch_geometric.utils import from_networkx, subgraph


def test_subgraph_convert():
    G = nx.complete_graph(5)
    edge_index = from_networkx(G).edge_index

    # relabel_nodes=True is required: from_networkx() maps nodes to consecutive ids
    sub_edge_index_1, _ = subgraph([0, 1, 3, 4], edge_index, relabel_nodes=True)
    sub_edge_index_2 = from_networkx(G.subgraph([0, 1, 3, 4])).edge_index
    assert sub_edge_index_1.tolist() == sub_edge_index_2.tolist()
import torch

from torch_geometric.utils import subgraph


def test_subgraph():
    edge_index = torch.tensor([
        [0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6],
        [1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5],
    ])
    edge_attr = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

    idx = torch.tensor([3, 4, 5], dtype=torch.long)
    mask = torch.tensor([0, 0, 0, 1, 1, 1, 0], dtype=torch.bool)
    indices = [3, 4, 5]

    for subset in [idx, mask, indices]:
        out = subgraph(subset, edge_index, edge_attr)
        assert out[0].tolist() == [[3, 4, 4, 5], [4, 3, 5, 4]]
        assert out[1].tolist() == [7, 8, 9, 10]

        out = subgraph(subset, edge_index, edge_attr, relabel_nodes=True)
        assert out[0].tolist() == [[0, 1, 1, 2], [1, 0, 2, 1]]
        assert out[1].tolist() == [7, 8, 9, 10]
def _subgraph(data):
    x = data.x.clone()
    edge_index = data.edge_index.clone()
    __edge_attr = data.edge_attribute.clone()
    traj_vocabs = data.traj_vocabs.clone()
    traj_index = torch.tensor([(x == n).nonzero().squeeze()[0].item()
                               for n in traj_vocabs])

    order2index = defaultdict(list)
    for i, idx in enumerate(__edge_attr, 1):
        order2index[i] = int(idx)

    edge_attr = torch.zeros(edge_index.size(1), dtype=torch.long)
    edge_attr[__edge_attr] = torch.arange(1, __edge_attr.size(0) + 1)

    mask = torch.zeros(x.shape[0], dtype=torch.bool)
    inds = torch.unique(traj_index)
    mask[inds] = True
    perm = torch.randperm(torch.sum(~mask.squeeze()))
    conn = torch.arange(mask.size(0))[~mask.squeeze()][perm[:max(3, len(inds) // 3)]]
    # conn = torch.tensor([], dtype=torch.long)
    nodes = torch.cat((inds, conn), dim=0)
    edge_ind, edge_att = subgraph(nodes, edge_index, edge_attr, num_nodes=len(x))

    # edge_attr matching between origin and subgraph
    edge_attr = torch.argsort(edge_attr, descending=False)[-edge_attr.nonzero().size(0):]
    edge_att = torch.argsort(edge_att, descending=False)[-edge_att.nonzero().size(0):]
    origin_sub = {int(p): int(c) for p, c in zip(edge_attr, edge_att)}
    edge_att = torch.tensor([origin_sub[index] for order, index in order2index.items()],
                            dtype=torch.long)

    tm_index = torch.cat((torch.tensor(traj_index, dtype=torch.long),
                          conn.to(torch.long)), dim=0)

    data.edge_index = edge_ind.to(torch.long)
    data.edge_attribute = edge_att
    data.edge_attribute_len = torch.tensor(len(edge_att), dtype=torch.long).unsqueeze(-1)
    data.tm_index = tm_index
    data.tm_len = torch.tensor(len(tm_index), dtype=torch.long).unsqueeze(-1)
    return data, order2index
def to_inductive(data):
    mask = data.train_mask | data.val_mask
    data.x = data.x[mask]
    data.y = data.y[mask]
    data.train_mask = data.train_mask[mask]
    data.val_mask = data.val_mask[mask]
    data.test_mask = None
    data.edge_index, _ = subgraph(mask, data.edge_index, None,
                                  relabel_nodes=True,
                                  num_nodes=data.num_nodes)
    data.num_nodes = mask.sum().item()
    return data
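# Hedged usage sketch of to_inductive() on a tiny hand-built graph (the Data
# fields below are illustrative, not taken from the original project).
import torch
from torch_geometric.data import Data

toy = Data(
    x=torch.randn(4, 3),
    y=torch.tensor([0, 1, 0, 1]),
    edge_index=torch.tensor([[0, 1, 2, 3], [1, 0, 3, 2]]),
    train_mask=torch.tensor([True, True, False, False]),
    val_mask=torch.tensor([False, False, True, False]),
    test_mask=torch.tensor([False, False, False, True]),
)
toy = to_inductive(toy)   # keeps nodes 0-2 and only the edges among them
print(toy.num_nodes, toy.edge_index.tolist())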
def test_from_networkx_subgraph_convert():
    import networkx as nx

    G = nx.complete_graph(5)
    edge_index = from_networkx(G).edge_index

    sub_edge_index_1, _ = subgraph([0, 1, 3, 4], edge_index, relabel_nodes=True)
    sub_edge_index_2 = from_networkx(G.subgraph([0, 1, 3, 4])).edge_index
    assert sub_edge_index_1.tolist() == sub_edge_index_2.tolist()
def process_cluster_data(self, data):
    """
    Data processing for ClusterSelfGNN.

    First, the data object is clustered into the number of partitions specified
    by this class. Then, we randomly sample a number of clusters and merge them
    together. Finally, data augmentation is applied to each of the final
    clusters. This is a simple strategy motivated by ClusterGCN and employed to
    improve the scalability of SelfGNN.

    :param data: A PyTorch Geometric Data object
    :return: a list of Data objects depending on the final number of clusters.
    """
    data_list = []
    clusters = []
    num_parts, cluster_size = self.num_parts, self.num_parts // self.final_parts

    # Cluster the data
    cd = ClusterData(data, num_parts=num_parts)
    for i in range(1, cd.partptr.shape[0]):
        cls_nodes = cd.perm[cd.partptr[i - 1]:cd.partptr[i]]
        clusters.append(cls_nodes)

    # Randomly merge clusters and apply transformation
    np.random.shuffle(clusters)
    for i in tqdm(range(0, len(clusters), cluster_size), "Processing clusters"):
        end = i + cluster_size if len(clusters) - i > cluster_size else len(clusters)
        cls_nodes = torch.cat(clusters[i:end]).unique()
        x = data.x[cls_nodes]
        y = data.y[cls_nodes]
        train_mask = data.train_mask[cls_nodes]
        dev_mask = data.val_mask[cls_nodes]
        test_mask = data.test_mask[cls_nodes]
        edge_index, edge_attr = subgraph(cls_nodes, data.edge_index,
                                         relabel_nodes=True)

        view1data = Data(edge_index=edge_index, x=x, edge_attr=edge_attr,
                         num_nodes=cls_nodes.shape[0])
        view2data = view1data if self.augumentation is None else self.augumentation(view1data)
        if not hasattr(view2data, "edge_attr") or view2data.edge_attr is None:
            view2data.edge_attr = torch.ones(view2data.edge_index.shape[1])

        # Pad the narrower view so both views share a feature dimension
        diff = abs(view2data.x.shape[1] - view1data.x.shape[1])
        if diff > 0:
            smaller_data = view1data if view1data.x.shape[1] < view2data.x.shape[1] else view2data
            smaller_data.x = F.pad(smaller_data.x, pad=(0, diff))
        view1data.x = F.normalize(view1data.x)
        view2data.x = F.normalize(view2data.x)

        new_data = Data(y=y, x=view1data.x, x2=view2data.x,
                        edge_index=view1data.edge_index,
                        edge_index2=view2data.edge_index,
                        edge_attr=view1data.edge_attr,
                        edge_attr2=view2data.edge_attr,
                        train_mask=train_mask, dev_mask=dev_mask,
                        test_mask=test_mask, num_nodes=cls_nodes.shape[0],
                        nodes=cls_nodes)
        data_list.append(new_data)
    print()
    return data_list
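# Hedged standalone sketch (toy shapes, not project code) of the padding step
# above: right-pad the narrower view's feature matrix with zeros so both views
# share the same feature dimension before normalisation.
import torch
import torch.nn.functional as F

x1 = torch.randn(5, 16)
x2 = torch.randn(5, 12)
diff = abs(x1.shape[1] - x2.shape[1])
x2 = F.pad(x2, pad=(0, diff))   # pad the last dim on the right: (5, 12) -> (5, 16)
assert x1.shape == x2.shape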
def negative_sampling(self, batch, num_negative_samples):
    # mask = torch.tensor([False] * len(self.data.x))
    # mask[batch] = True
    # _, a = self.edge_index_to_train(mask)
    a, _ = subgraph(batch, self.data.edge_index)
    Adj = self.adj_list(a)
    g = dict()
    batch = batch.tolist()
    for node in batch:
        g[node] = batch
    for node, neighbors in Adj.items():
        # all elements that are not neighbours of the node but are still in the batch
        g[node] = list(set(batch) - set(neighbors))
    for node, neg_elem in g.items():
        # if a specific number of negative samples is requested, either truncate or duplicate
        g[node] = self.not_less_than(num_negative_samples, g[node])
    return self.torch_list(g)
def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    x_posr = x
    batch_size = int(batch.size()[0] / self.node_per_graph)

    # if self.training and random.random() > 0.8:
    #     rotateAngleA = random.random() * pi
    #     rotateAngleB = random.random() * pi
    #     rotateAngleC = random.random() * pi
    #     sinA, cosA = math.sin(rotateAngleA), math.cos(rotateAngleA)
    #     sinB, cosB = math.sin(rotateAngleB), math.cos(rotateAngleB)
    #     sinC, cosC = math.sin(rotateAngleC), math.cos(rotateAngleC)
    #     matrix = [[cosC*cosB, -sinC*cosA+cosC*sinB*sinA, sinC*sinA+cosC*sinB*cosA],
    #               [sinC*cosB, cosC*cosA+sinC*sinB*sinA, -cosC*sinA+sinC*sinB*cosA],
    #               [-sinB, cosB*sinA, cosB*cosA]]
    #     x_xyz = x[:, 0:3]
    #     x_xyz = torch.matmul(x_xyz, torch.tensor(matrix).to(x_xyz.dtype).to(x_xyz.device))
    #     # x_xyz = LinearTransformation(torch.tensor(matrix))(x_xyz)
    #     x_r = x[:, 3]
    #     x_r = x_r.reshape((x_r.shape[0], 1))
    #     x = torch.cat((x_xyz, x_r), dim=1)

    # add_self_loops returns a new edge_index; assign it, otherwise the call is a no-op
    edge_index, _ = add_self_loops(edge_index)

    if self.training:
        mask, torchmask = random_drop_node(self.node_per_graph, batch_size, 0.50, 0.50)
        x = x[mask]
        x_posr = x_posr[mask]
        batch = batch[mask]
        edge_index, _ = subgraph(torchmask, edge_index, relabel_nodes=True)

    x0 = self.linprev(x, edge_index)
    x1 = self.conv1(x0, edge_index) + self.lin1(x0)
    x1n = F.relu(x1)
    x2 = self.conv2(x1n, edge_index) + self.lin2(x1n)
    x2n = F.relu(x2)
    x3 = self.conv3(x2n, edge_index) + self.lin3(x2n)
    x3n = F.relu(x3)
    x4 = self.conv4(x3n, edge_index) + self.lin4(x3n)

    x = torch.cat((x1, x2, x3, x4), dim=1)
    x = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
    x = self.mlp(x)
    x = F.log_softmax(x, dim=-1)
    return x
def __call__(self, data):
    y = torch.nn.functional.one_hot(data.y)
    c = y.sum(dim=0).sort(descending=True)
    y = y[:, c.indices[:self.num_classes]]
    idx = y.sum(dim=1).bool()

    data.x = data.x[idx]
    data.y = y[idx].argmax(dim=1)
    data.num_nodes = data.y.size(0)

    if 'adj_t' in data:
        data.adj_t = data.adj_t[idx, idx]
    elif 'edge_index' in data:
        data.edge_index, data.edge_attr = subgraph(idx, data.edge_index,
                                                   data.edge_attr,
                                                   relabel_nodes=True)
    if 'train_mask' in data:
        data.train_mask = data.train_mask[idx]
        data.val_mask = data.val_mask[idx]
        data.test_mask = data.test_mask[idx]
    return data
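# Hedged toy illustration of the class filtering above (values are
# illustrative): keep only nodes whose label is among the two most frequent
# classes and relabel them to consecutive ids.
import torch

y = torch.tensor([0, 0, 1, 2, 2, 2])
one_hot = torch.nn.functional.one_hot(y)
counts = one_hot.sum(dim=0).sort(descending=True)
top = one_hot[:, counts.indices[:2]]   # columns of the 2 largest classes
keep = top.sum(dim=1).bool()           # nodes belonging to those classes
new_y = top[keep].argmax(dim=1)        # new labels in 0..1
print(keep.tolist(), new_y.tolist())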
def predict(self, data):
    """End-to-end prediction for INVASE with an input batched graph."""
    x, edge_index, batch = data.x, data.edge_index, data.batch

    # pass through selector
    node_prob, fea_prob = self(x, edge_index, batch, component="actor")

    # Sample the features based on the selection probability
    node_selection_mask = torch.bernoulli(node_prob)
    node_selection = torch.squeeze(
        torch.nonzero(node_selection_mask, as_tuple=False))
    fea_selection_mask = torch.bernoulli(fea_prob)

    # make subgraph:
    # mask out features but keep all the nodes
    subgraph_x = x * fea_selection_mask[batch]
    # keep only the edges between selected nodes
    subgraph_edge_index, _ = subgraph(node_selection, edge_index)

    # Prediction
    y_hat = self.critic([subgraph_x, node_selection], subgraph_edge_index, batch)
    return y_hat
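# Hedged toy sketch (not project code) of the sampling step used in predict():
# draw a Bernoulli node mask from the selection probabilities and keep only the
# edges whose endpoints were both selected.
import torch
from torch_geometric.utils import subgraph

node_prob = torch.tensor([0.9, 0.1, 0.8, 0.7])
node_selection_mask = torch.bernoulli(node_prob)
node_selection = torch.nonzero(node_selection_mask, as_tuple=False).squeeze(-1)

edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 0]])
sub_edge_index, _ = subgraph(node_selection, edge_index)   # node ids are kept as-is
print(node_selection.tolist(), sub_edge_index.tolist())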
def pos_sample(self, batch, **kwargs):
    pos_batch = []

    if self.loss["C"] == "Adj" and self.loss["Name"] == "LINE":
        name = 'pos_samples_LINE_' + self.datasetname + '.pickle'
        if os.path.exists(name):
            with open(name, 'rb') as f:
                pos_batch = pickle.load(f)
        else:
            A = self.edge_index_to_adj_train(batch)
            pos_batch = self.convert_to_samples(batch, A)
            with open(name, 'wb') as f:
                pickle.dump(pos_batch, f)

    elif self.loss["C"] == "Adj" and self.loss["Name"] == "VERSE_Adj":
        name = 'pos_samples_VERSEAdj_' + self.datasetname + '.pickle'
        if os.path.exists(name):
            with open(name, 'rb') as f:
                pos_batch = pickle.load(f)
        else:
            Adj = self.edge_index_to_adj_train(batch).type(torch.FloatTensor)
            A = (Adj / sum(Adj)).t()
            A[torch.isinf(A)] = 0
            A[torch.isnan(A)] = 0
            pos_batch = self.convert_to_samples(batch, A)
            with open(name, 'wb') as f:
                pickle.dump(pos_batch, f)

    elif self.loss["C"] == "SR":
        SimRankName = 'SimRank' + self.datasetname + '.pickle'
        if os.path.exists(SimRankName):
            with open(SimRankName, 'rb') as f:
                A = pickle.load(f)
        else:
            Adj, _ = subgraph(batch, self.data.edge_index)
            row, col = Adj
            row = row.to(self.device)
            col = col.to(self.device)
            ASparse = SparseTensor(row=row, col=col,
                                   sparse_sizes=(len(batch), len(batch)))
            r = 200
            length = list(map(lambda x: x * int(r / 100),
                              [22, 17, 14, 10, 8, 6, 5, 4, 3, 11]))
            mask = []
            for i, l in enumerate(length):
                mask1 = torch.zeros([l, 10])
                mask1.t()[:(i + 1)] = 1
                mask.append(mask1)
            mask = torch.cat(mask)
            mask_new = 1 - mask
            A = self.find_sim_rank_for_batch_torch(batch, ASparse, self.device,
                                                   mask, mask_new, r)
            with open(SimRankName, 'wb') as f:
                pickle.dump(A, f)

        samples_name = 'samples_simrank_' + self.datasetname + '.pickle'
        if os.path.exists(samples_name):
            with open(samples_name, 'rb') as f:
                pos_batch = pickle.load(f)
        else:
            pos_batch = self.convert_to_samples(batch, A)
            with open(samples_name, 'wb') as f:
                pickle.dump(pos_batch, f)

    elif self.loss["C"] == "PPR":
        alpha = self.alpha
        name_of_file = ('pos_samples_VERSEPPR_' + str(alpha) + '_' +
                        self.datasetname + '.pickle')
        if os.path.exists(name_of_file):
            with open(name_of_file, 'rb') as f:
                pos_batch = pickle.load(f)
        else:
            Adg = self.edge_index_to_adj_train(batch).type(torch.FloatTensor)
            invD = torch.diag(1 / sum(Adg.t()))
            invD[torch.isinf(invD)] = 0
            A = (1 - alpha) * torch.inverse(
                torch.diag(torch.ones(len(Adg))) - alpha * torch.matmul(invD, Adg))
            pos_batch = self.convert_to_samples(batch, A)
            with open(name_of_file, 'wb') as f:
                pickle.dump(pos_batch, f)

    return pos_batch
def get_subgraph_scores(loader, device, model, list_win_sizes, softmax=False):
    """
    Get position-wise scores by scoring subgraphs, and normalize the class-1
    softmax probabilities to the range -1 .. 1. Only works with batch size == 1.
    """
    # List of lists to store the probability list for each site.
    pr_ll = []
    model.eval()
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            l_x = len(data.x)
            # Make graph index list.
            idx_list = list(range(l_x))
            sm = torch.nn.Softmax(dim=1)
            pr_win_list = []

            for win_size in list_win_sizes:
                win_extlr = int(win_size / 2)
                pr_list = []
                for i in range(l_x):
                    s = max(i - win_extlr, 0)
                    e = min(i + win_extlr + 1, l_x)
                    subset = idx_list[s:e]
                    sub_edge_index = subgraph(subset, data.edge_index)
                    output = model(data.x, sub_edge_index[0], data.batch)
                    if softmax:
                        probs = sm(output)
                        # class_0_prob = float(probs[0][0].cpu().detach().numpy())
                        class_1_prob = float(probs[0][1].cpu().detach().numpy())
                        pr_list.append(class_1_prob)
                    else:
                        output = torch.exp(output)
                        output = output.cpu().detach().numpy()[:, 1]
                        class_1_prob = float(output[0])
                        pr_list.append(class_1_prob)

                for i, pr in enumerate(pr_list):
                    pr_list[i] = min_max_normalize_probs(pr, 1, 0, borders=[-1, 1])

                # Deal with scores at the ends.
                start_idx = idx_list[:win_extlr]
                end_idx = idx_list[-win_extlr:]
                for i in start_idx:
                    pr_list[i] = pr_list[win_extlr]
                for i in end_idx:
                    pr_list[i] = pr_list[l_x - win_extlr - 1]

                pr_win_list.append(pr_list)

            # Calculate mean list scores.
            mean_pr_list = list(np.mean(pr_win_list, axis=0))
            # Add mean scores list to the existing list of lists.
            pr_ll.append(mean_pr_list)

    assert pr_ll, "pr_ll empty"
    return pr_ll
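# Hedged sketch of the [-1, 1] rescaling used above. The real
# min_max_normalize_probs helper is defined elsewhere in the project; this is
# only an assumption about its behaviour, shown for clarity.
def min_max_normalize_probs(x, x_max, x_min, borders=(-1, 1)):
    """Linearly map x from [x_min, x_max] to the given borders."""
    lo, hi = borders
    return lo + (x - x_min) * (hi - lo) / (x_max - x_min)

# e.g. min_max_normalize_probs(0.75, 1, 0, borders=[-1, 1]) -> 0.5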
def prep_data(data, max_num_nodes, aggr='sum', device='cpu', NB201=False, NB101=False):
    device = torch.device(device)
    data_list = []
    for graph in tqdm(data):
        node_atts = graph.node_atts.numpy()
        node_atts_reverse = np.flip(graph.node_atts.numpy(), 0)
        num_nodes = node_atts.size
        L_list = list(range(num_nodes - 1, -1, -1))
        L = {i: L_list[i] for i in range(0, len(L_list))}
        edge_list = sort_edge_index(graph.edge_index, num_nodes)
        edge_index_reverse = torch.flip(edge_list, [0, 1])
        edge_list_reverse = torch.LongTensor(
            np.stack(([L[x] for x in edge_index_reverse[0].numpy()],
                      [L[x] for x in edge_index_reverse[1].numpy()])))
        edge_list_reverse = sort_edge_index(edge_list_reverse, num_nodes)
        nodes = np.zeros(max_num_nodes - 1, dtype=int)
        nodes[:num_nodes - 1] = 1
        acc = graph.acc.numpy().item()

        if NB201:
            test_acc = graph.test_acc.numpy().item()
            acc_avg = graph.acc_avg.numpy().item()
            test_acc_avg = graph.test_acc_avg.numpy().item()
            training_time = graph.training_time.numpy().item()
            data = Data(edge_index=edge_list.to(device),
                        num_nodes=num_nodes,
                        node_atts=torch.LongTensor(node_atts).to(device),
                        acc=torch.tensor([acc]).to(device),
                        test_acc=torch.tensor([test_acc]).to(device),
                        acc_avg=torch.tensor([acc_avg]).to(device),
                        test_acc_avg=torch.tensor([test_acc_avg]).to(device),
                        training_time=torch.tensor([training_time]).to(device),
                        nodes=torch.tensor(nodes).unsqueeze(0).to(device))
        elif NB101:
            training_time = graph.training_time.numpy().item()
            test_acc = graph.test_acc.numpy().item()
            data = Data(edge_index=edge_list.to(device),
                        num_nodes=num_nodes,
                        node_atts=torch.LongTensor(node_atts).to(device),
                        acc=torch.tensor([acc]).to(device),
                        test_acc=torch.tensor([test_acc]).to(device),
                        nodes=torch.tensor(nodes).unsqueeze(0).to(device),
                        training_time=torch.tensor([training_time]).to(device))
        else:
            data = Data(edge_index=edge_list.to(device),
                        num_nodes=num_nodes,
                        node_atts=torch.LongTensor(node_atts).to(device),
                        acc=torch.tensor([acc]).to(device),
                        nodes=torch.tensor(nodes).unsqueeze(0).to(device))

        data_full = [data]

        # Forward direction: growing prefixes of the graph
        for idx in range(max_num_nodes - 1):
            num_nodes = idx + 2
            if num_nodes > node_atts.size:
                data = Data(edge_index=subgraph(list(range(2)), edge_list)[0].to(device),
                            num_nodes=num_nodes,
                            node_atts=torch.LongTensor([node_atts[0]]).to(device),
                            edges=torch.zeros(idx + 1).unsqueeze(0).to(device))
            else:
                data = Data(edge_index=subgraph(list(range(num_nodes)), edge_list)[0].to(device),
                            num_nodes=num_nodes,
                            node_atts=torch.LongTensor([node_atts[idx + 1]]).to(device),
                            edges=to_dense_adj(edge_list)[0][:, idx + 1][:idx + 1].unsqueeze(0).to(device))
            data_full.append(data)

        # Reverse direction: growing prefixes of the reversed graph
        for idx in range(max_num_nodes - 1):
            num_nodes = idx + 2
            if num_nodes > node_atts_reverse.size:
                data = Data(edge_index=subgraph(list(range(2)), edge_list_reverse)[0].to(device),
                            num_nodes=num_nodes,
                            node_atts=torch.LongTensor([node_atts_reverse[0]]).to(device),
                            edges=torch.zeros(idx + 1).unsqueeze(0).to(device))
            else:
                data = Data(edge_index=subgraph(list(range(num_nodes)), edge_list_reverse)[0].to(device),
                            num_nodes=num_nodes,
                            node_atts=torch.LongTensor([node_atts_reverse[idx + 1]]).to(device),
                            edges=to_dense_adj(edge_list_reverse)[0][:, idx + 1][:idx + 1].unsqueeze(0).to(device))
            data_full.append(data)

        data_list.append(tuple(data_full))
    return data_list
source_nodes = torch.cat([
    torch.where(rel_data.node_year_dict['paper'] == year)[0]
    for year in source_years
])
target_nodes = torch.cat([
    torch.where(rel_data.node_year_dict['paper'] == year)[0]
    for year in target_years
])
source_nodes, _ = source_nodes.sort()
target_nodes, _ = target_nodes.sort()

source_edge_index, _ = subgraph(source_nodes, data.edge_index, relabel_nodes=True)
target_edge_index, _ = subgraph(target_nodes, data.edge_index, relabel_nodes=True)

source_data = Data(x=rel_data.x_dict['paper'][source_nodes],
                   edge_index=source_edge_index,
                   y=rel_data.y_dict['paper'][source_nodes])
target_data = Data(x=rel_data.x_dict['paper'][target_nodes],
                   edge_index=target_edge_index,
                   y=rel_data.y_dict['paper'][target_nodes])

data = target_data.to(device)  # Train on the target split
def process_cluster_data(self, data):
    """
    Augmented view data generation based on clustering.

    :param data: A PyTorch Geometric Data object
    :return: a list of Data objects, one per merged cluster
    """
    data_list = []
    clusters = []
    num_parts, cluster_size = self.num_parts, self.num_parts // self.final_parts

    # Cluster the data
    cd = ClusterData(data, num_parts=num_parts)
    for i in range(1, cd.partptr.shape[0]):
        cls_nodes = cd.perm[cd.partptr[i - 1]:cd.partptr[i]]
        clusters.append(cls_nodes)

    # Randomly merge clusters and apply transformation
    np.random.shuffle(clusters)
    for i in range(0, len(clusters), cluster_size):
        end = i + cluster_size if len(clusters) - i > cluster_size else len(clusters)
        cls_nodes = torch.cat(clusters[i:end]).unique()
        sys.stdout.write(
            f'\rProcessing cluster {i + 1}/{len(clusters)} with {self.final_parts} nodes')
        sys.stdout.flush()

        x = data.x[cls_nodes]
        y = data.y[cls_nodes]
        train_mask = data.train_mask[cls_nodes]
        dev_mask = data.val_mask[cls_nodes]
        test_mask = data.test_mask[cls_nodes]
        edge_index, edge_attr = subgraph(cls_nodes, data.edge_index,
                                         relabel_nodes=True)

        # Build the merged-cluster graph in a separate object so the full
        # `data` graph is not overwritten across loop iterations.
        cluster_data = Data(edge_index=edge_index, x=x, edge_attr=edge_attr,
                            num_nodes=cls_nodes.shape[0])
        view1data, view2data = self.augumentation(cluster_data)
        if not hasattr(view1data, "edge_attr") or view1data.edge_attr is None:
            view1data.edge_attr = torch.ones(view1data.edge_index.shape[1])
        if not hasattr(view2data, "edge_attr") or view2data.edge_attr is None:
            view2data.edge_attr = torch.ones(view2data.edge_index.shape[1])

        # Pad the narrower view so both views share a feature dimension
        diff = abs(view2data.x.shape[1] - view1data.x.shape[1])
        if diff > 0:
            smaller_data = view1data if view1data.x.shape[1] < view2data.x.shape[1] else view2data
            smaller_data.x = F.pad(smaller_data.x, pad=(0, diff))
        view1data.x = F.normalize(view1data.x)
        view2data.x = F.normalize(view2data.x)

        new_data = Data(y=y, x1=view1data.x, x2=view2data.x,
                        edge_index1=view1data.edge_index,
                        edge_index2=view2data.edge_index,
                        edge_attr1=view1data.edge_attr,
                        edge_attr2=view2data.edge_attr,
                        train_mask=train_mask, dev_mask=dev_mask,
                        test_mask=test_mask, num_nodes=cls_nodes.shape[0],
                        nodes=cls_nodes)
        data_list.append(new_data)
    print()
    return data_list
def evaluate(self, generator, criterion, optimizer, device, task="train"):
    """Evaluate the model.

    Params:
    - generator: graph dataloader
    - criterion: baseline loss function
    - optimizer: optimiser linked to the model parameters
    - device: cuda or cpu
    - task: train, val or test
    """
    actor_loss_meter = AverageMeter()
    baseline_acc_meter = AverageMeter()
    critic_acc_meter = AverageMeter()
    prop_of_nodes = AverageMeter()
    prop_of_feas = AverageMeter()

    if task == "test":
        self.eval()
        x_test = []
        selected_features = []
        selected_nodes = []
        y_trues = []
        y_preds = []
    elif task == "val":
        self.eval()
    elif task == "train":
        self.train()
    else:
        raise NameError("Only train, val or test is allowed as task")

    with trange(len(generator)) as t:
        for data in generator:
            # these are batched graphs
            orig = data.clone()
            x, edge_index, batch, y_true = data.x, data.edge_index, data.batch, data.y
            x, edge_index, batch, y_true = (x.to(device), edge_index.to(device),
                                            batch.to(device), y_true.to(device))

            # prediction on the full graph
            baseline_logits = self(x, edge_index, batch, component="baseline")
            baseline_loss = criterion(baseline_logits, y_true)

            # pass through selector
            node_prob, fea_prob = self(x, edge_index, batch, component="actor")

            # Sample the features based on the selection probability
            node_selection_mask = torch.bernoulli(node_prob)
            node_selection = torch.squeeze(
                torch.nonzero(node_selection_mask, as_tuple=False))
            fea_selection_mask = torch.bernoulli(fea_prob)

            # make subgraph:
            # mask out features but keep all the nodes
            subgraph_x = x * fea_selection_mask[batch]
            # keep only the edges between selected nodes
            subgraph_edge_index, _ = subgraph(node_selection, edge_index)

            critic_logits = self([subgraph_x, node_selection],
                                 subgraph_edge_index, batch, component="critic")
            critic_loss = criterion(critic_logits, y_true)

            actor_loss = self.actor_loss(
                node_selection_mask.clone().detach(),
                fea_selection_mask.clone().detach(),
                batch.clone().detach(),
                self.softmax(critic_logits).clone().detach(),
                self.softmax(baseline_logits).clone().detach(),
                y_true.float(), node_prob, fea_prob)

            actor_loss_meter.update(actor_loss.data.cpu().item(), y_true.size(0))

            critic_preds = torch.argmax(critic_logits, dim=1)
            critic_acc = torch.sum(critic_preds == y_true).float() / y_true.size(0)
            critic_acc_meter.update(critic_acc)

            baseline_preds = torch.argmax(baseline_logits, dim=1)
            baseline_acc = torch.sum(baseline_preds == y_true).float() / y_true.size(0)
            baseline_acc_meter.update(baseline_acc)

            prop_of_feas.update(torch.mean(torch.mean(fea_selection_mask, dim=-1)),
                                y_true.size(0))
            prop_of_nodes.update(torch.mean(node_selection_mask), y_true.size(0))

            if task == "test":
                # collect and analyse results
                x_test += orig.to_data_list()
                selected_features.append(fea_prob.detach().cpu().numpy())
                node_prob = node_prob.detach().cpu().numpy()
                # get graph-wise node selection
                selected_nodes += [[x for j, x in enumerate(node_prob) if batch[j] == i]
                                   for i in range(len(y_true))]
                y_trues.append(y_true.detach().cpu().numpy())
                y_preds.append(critic_preds.detach().cpu().numpy())
            elif task == "train":
                # compute gradient and do an SGD step
                optimizer.zero_grad()
                total_loss = actor_loss + critic_loss + baseline_loss
                total_loss.backward()
                optimizer.step()

            t.update()

    # TODO: explanation accuracy
    if task == "test":
        return (critic_acc_meter.avg, baseline_acc_meter.avg, x_test,
                np.concatenate(selected_features, axis=0), selected_nodes,
                np.concatenate(y_trues), np.concatenate(y_preds))
    else:
        return (actor_loss_meter.avg, critic_acc_meter.avg,
                baseline_acc_meter.avg, prop_of_feas.avg, prop_of_nodes.avg)
def load_data(name, seed, transform=None):
    '''
    Load data from files and return a PyTorch Geometric `Data` object.
    '''
    random.seed(seed)  # make sure that the split of the data is always the same
    ROOT = osp.dirname(osp.abspath(__file__)) + '/..'

    if name in ['cora', 'citeseer', 'pubmed']:
        # datasets for transductive node classification
        data = Planetoid(osp.join(ROOT, 'data'), name, transform=transform)[0]
        data.task = 'semi'  # semi-supervised
        data.setting = 'transductive'
        return data
    elif name in ['wikics']:
        dataset = WikiCS(osp.join(ROOT, 'data', 'wikics'), transform=transform)
        data = dataset[0]
        data.task = 'semi'
        data.setting = 'transductive'
        data.train_mask = data.train_mask[:, 0]
        data.val_mask = data.val_mask[:, 0]
        data.stopping_mask = data.stopping_mask[:, 0]
        return data
    elif name in ['ppi']:
        # datasets for inductive node classification
        train_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='train', transform=transform)
        val_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='val', transform=transform)
        test_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='test', transform=transform)
        return (train_dataset, val_dataset, test_dataset)
    elif name in ['usa-airports']:
        try:
            data = pickle.load(open(osp.join(ROOT, 'data', name, 'data.pkl'), 'rb'))
            return data
        except FileNotFoundError:
            print('Data not found. Re-generating...')
            nx_graph = nx.read_edgelist(osp.join(ROOT, 'data', name, 'edges.txt'))
            # oid stands for "original id"
            nx_graph = nx.convert_node_labels_to_integers(nx_graph, label_attribute='id2oid')
            oid2id = {int(v): k for k, v in nx.get_node_attributes(nx_graph, 'id2oid').items()}
            id2label = {}
            for line in open(osp.join(ROOT, 'data', name, 'labels.txt')):
                linesplit = line.strip().split()
                oid = int(linesplit[0])
                label = int(linesplit[1])
                # we assume that label ids start from 0 and the labeling is consistent
                id2label[oid2id[oid]] = {'y': label}
            nx.set_node_attributes(nx_graph, id2label)
            data = from_networkx(nx_graph)
            num_nodes = len(nx_graph.nodes)
            node_idxs = list(range(num_nodes))
            random.shuffle(node_idxs)
            # split data, train:val:test = 80%:10%:10%
            train_idxs = node_idxs[:int(0.8 * num_nodes)]
            val_idxs = node_idxs[int(0.8 * num_nodes):int(0.9 * num_nodes)]
            test_idxs = node_idxs[int(0.9 * num_nodes):]
            data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.train_mask[train_idxs] = True
            data.val_mask[val_idxs] = True
            data.test_mask[test_idxs] = True
            if data.x is not None and transform:
                data.x = transform(data.x)
            data.num_nodes = num_nodes
            data.task = 'sup'  # supervised
            data.setting = 'transductive'
            pickle.dump(data, open(osp.join(ROOT, 'data', name, 'data.pkl'), 'wb'))
            return data
    elif name in ['ogbn-arxiv']:
        dataset = PygNodePropPredDataset(name, root=osp.join(ROOT, 'data'), transform=transform)
        split_idx = dataset.get_idx_split()
        data = dataset[0]
        split_idx['val'] = split_idx.pop('valid')
        for key, idx in split_idx.items():
            mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            mask[idx] = True
            data[f'{key}_mask'] = mask
        data.task = 'sup'  # supervised
        data.setting = 'transductive'
        return data
    elif name in ['photo']:
        dataset = Amazon('data/photo', 'photo', transform=transform)
        data = dataset[0]
        data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.train_mask[:-1000] = True
        data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.val_mask[-1000:-500] = True
        data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.test_mask[-500:] = True
        data.train_edge_index, _ = subgraph(data.train_mask, data.edge_index, relabel_nodes=True)
        data.val_edge_index, _ = subgraph(data.val_mask, data.edge_index, relabel_nodes=True)
        data.test_edge_index, _ = subgraph(data.test_mask, data.edge_index, relabel_nodes=True)
        data.train_x = data.x[data.train_mask]
        data.train_y = data.y[data.train_mask]
        data.val_x = data.x[data.val_mask]
        data.val_y = data.y[data.val_mask]
        data.test_x = data.x[data.test_mask]
        data.test_y = data.y[data.test_mask]
        data.num_train_nodes = data.train_x.shape[0]
        data.task = 'sup'  # supervised
        data.setting = 'inductive'
        return data
    else:
        raise NotImplementedError('Not supported dataset.')
def neg_sample(self, batch):
    len_batch = len(batch)
    a, _ = subgraph(batch.tolist(), self.data.edge_index)
    neg_batch = self.NS.negative_sampling(batch,
                                          num_negative_samples=self.num_negative_samples)
    return neg_batch  # % len_batch
else:
    graph = real_data[0]
    # graph = five_data
    graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)
    graph.edge_index = add_self_loops(graph.edge_index, num_nodes=graph.num_nodes)[0]
    graph.coalesce()

    temp = NeighborSampler(edge_index=graph.edge_index, sizes=[-1])
    batches = temp
    egoNets = [0] * graph.num_nodes
    adjMats = [0] * graph.num_nodes
    plot = 331
    curPlot = 0
    norm_degrees = []

    for batch_size, n_id, adj in batches:
        curData = subgraph(n_id, graph.edge_index)
        updated_e_index = to_undirected(curData[0], n_id.shape[0])
        subgraph_size = torch.numel(n_id)
        cur_n_id = torch.sort(n_id)[0].tolist()
        cur_e_id = adj.e_id.tolist()
        subgraph2 = Data(edge_index=updated_e_index, edge_attr=curData[1],
                         num_nodes=subgraph_size, n_id=cur_n_id, e_id=cur_e_id,
                         degree=len(cur_n_id) - 1,
                         adj=get_adj(updated_e_index, graph.edge_index, curPlot, cur_e_id))
        subgraph2.coalesce()
        ######################
        ego_degrees = {}
def inference(self, c):
    batch_size = c.size(0)

    # Forward generation pass
    h = self.generator.nodeInit(
        'start', torch.ones(batch_size, dtype=torch.long).to(c.device), c).unsqueeze(1)
    h, node_atts, edges, non_zeros = self.generator.inference(h, c, None)
    graph = node_atts.clone()
    node_atts = torch.cat([edges, node_atts], 1)
    num_zeros = (non_zeros == 0).sum().item()
    while num_zeros < batch_size:
        edge_index = edges2index(edges)
        h, node_atts_new, edges_new, non_zero = self.generator.inference(h, c, edge_index)
        graph = torch.cat([graph, node_atts_new, edges_new], 1)
        node_atts = torch.cat([node_atts, node_atts_new], 1)
        edges = torch.cat([edges, edges_new], 1)
        non_zeros = torch.mul(non_zeros, non_zero)
        num_zeros = (non_zeros == 0).sum().item()

    # Backward generation pass
    h_rev = self.generator.nodeInit(
        'end', torch.zeros(batch_size, dtype=torch.long).to(c.device), c).unsqueeze(1)
    h_rev, node_atts_rev, edges_rev, non_ones = self.generator.inference(
        h_rev, c, None, backwards=True)
    graph_rev = node_atts_rev.clone()
    node_atts_rev = torch.cat([edges_rev - edges_rev, node_atts_rev], 1)
    num_ones = (non_ones == 0).sum().item()
    while num_ones < batch_size:
        edge_index_rev = edges2index(edges_rev)
        h_rev, node_atts_new_rev, edges_new_rev, non_ones = self.generator.inference(
            h_rev, c, edge_index_rev, backwards=True)
        graph_rev = torch.cat([graph_rev, node_atts_new_rev, edges_new_rev], 1)
        node_atts_rev = torch.cat([node_atts_rev, node_atts_new_rev], 1)
        edges_rev = torch.cat([edges_rev, edges_new_rev], 1)
        non_ones = torch.mul(non_ones, non_ones)
        num_ones = (non_ones == 0).sum().item()

    # Merge forward and backward graphs
    gf = batch2graph(graph)
    gb = batch2graph(graph_rev, backward=True)
    graph_out = list()
    for i in range(batch_size):
        ef = gf[i][1]
        eb_rev = gb[i][1]
        num_nodes = ef[1][-1].item() + 1
        L_list = list(range(num_nodes - 1, -1, -1))
        L = {i: L_list[i] for i in range(0, len(L_list))}
        if eb_rev[1][-1].item() > ef[1][-1].item():
            subset = list(range(num_nodes))
            eb_rev = subgraph(subset, eb_rev)[0]
        eb = torch.flip(
            torch.stack((torch.LongTensor([L[x.item()] for x in eb_rev[0]]),
                         torch.LongTensor([L[x.item()] for x in eb_rev[1]]))), [0, 1])
        for j in torch.transpose(eb, 1, 0):
            if j in torch.transpose(ef, 1, 0):
                continue
            else:
                ef = torch.cat([ef, j.unsqueeze(1)], 1)
        graph_out.append((gf[i][0].to(c.device), ef.to(c.device)))

    return graph_out, node_atts.view(batch_size, -1), edges2index(edges, finish=True)
def denoise_graph(data, weighted_edge_mask, node_explanations, neighbours,
                  node_idx, feat=None, label=None, threshold_num=10):
    """Cleaning a graph by thresholding its node values.

    Args:
        - weighted_edge_mask: edge mask, with an importance score for each edge
        - node_explanations: Shapley values of the neighbours
        - neighbours: indices of the neighbouring nodes
        - node_idx: index of the node to highlight
        - feat: an array of node features
        - label: a list of node labels
        - threshold_num: the maximum number of nodes to keep after thresholding
    """
    # Subgraph with only the relevant nodes - pytorch
    s = subgraph(torch.cat((torch.tensor([node_idx]), neighbours)),
                 data.edge_index)[0]

    # Disregard the sign of the explanations
    node_explanations = np.abs(node_explanations)

    # Create graph of the neighbourhood of the node of interest
    G = nx.DiGraph()
    G.add_nodes_from(neighbours.detach().numpy())
    G.add_node(node_idx)
    G.nodes[node_idx]["self"] = 1
    if feat is not None:
        for node in G.nodes():
            G.nodes[node]["feat"] = feat[node].detach().numpy()
    if label is not None:
        for node in G.nodes():
            G.nodes[node]["label"] = label[node].item()

    # Find the importance threshold required to retrieve the most important neighbours
    threshold_num = min(len(neighbours), threshold_num)
    threshold = np.sort(node_explanations)[-threshold_num]

    # Keep edges that satisfy the threshold
    node_expl_dico = {}
    for i, imp in enumerate(node_explanations):
        node_expl_dico[neighbours[i].item()] = imp
    node_expl_dico[node_idx] = torch.tensor(0)
    weighted_edge_list = [
        (el1.item(), el2.item(), node_expl_dico[el1.item()].item())
        for el1, el2 in zip(s[0], s[1])
    ]
    # Remove edges from the node of interest to its neighbours
    weighted_edge_list = [item for item in weighted_edge_list if item[0] != 0]
    G.add_weighted_edges_from(weighted_edge_list)

    # Keep nodes that satisfy the threshold
    del_nodes = []
    for i, node in enumerate(G.nodes()):
        if node != node_idx:
            if node_explanations[i] < threshold:
                del_nodes.append(node)
    G.remove_nodes_from(del_nodes)

    return G