def init_graph(self):
    r"""Get the initial attention matrix through the collaborative knowledge graph

    Returns:
        torch.sparse.FloatTensor: Sparse tensor of the attention matrix
    """
    import dgl

    normalized_adjs = []
    # Build one row-normalized adjacency matrix per relation type
    # (relation id 0 is skipped, consistent with the original range).
    for rel in range(1, self.n_relations):
        eids = self.ckg.filter_edges(
            lambda edge: edge.data['relation_id'] == rel)
        rel_graph = dgl.edge_subgraph(self.ckg, eids, preserve_nodes=True)
        adj = rel_graph.adjacency_matrix(
            transpose=False, scipy_fmt='coo').astype('float')
        # Row-normalize: D^-1 A, with zero-degree rows mapped to 0.
        degree = np.array(adj.sum(1))
        inv_degree = np.power(degree, -1).flatten()
        inv_degree[np.isinf(inv_degree)] = 0.
        normalized_adjs.append(sp.diags(inv_degree).dot(adj).tocoo())

    # Sum the per-relation matrices into a single attention matrix.
    merged = sum(normalized_adjs).tocoo()
    row_col = torch.LongTensor([merged.row, merged.col])
    data = torch.FloatTensor(merged.data)
    attention = torch.sparse.FloatTensor(row_col, data, self.matrix_size)
    return attention.to(self.device)
def train(args):
    """Train the link-prediction model on a knowledge-graph dataset.

    :param args: namespace with dataset, model and optimization hyper-parameters
    """
    data = load_kg_dataset(args.dataset)
    g = data[0]
    train_idx = g.edata['train_mask'].nonzero(as_tuple=False).squeeze()
    # val/test indices are computed but not used in this routine
    val_idx = g.edata['val_mask'].nonzero(as_tuple=False).squeeze()
    test_idx = g.edata['test_mask'].nonzero(as_tuple=False).squeeze()

    train_g = dgl.edge_subgraph(g, train_idx, preserve_nodes=True)
    # (src, dst) endpoints from the full graph plus the train-graph edge types
    train_triplets = g.find_edges(train_idx) + (train_g.edata['etype'],)

    model = LinkPrediction(
        data.num_nodes, args.num_hidden, data.num_rels * 2,
        args.num_layers, args.regularizer, args.num_bases, args.dropout)
    optimizer = optim.Adam(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    neg_sampler = Uniform(args.num_neg_samples)

    num_pos = train_g.num_edges()
    # one positive label per train edge followed by all negative labels
    labels = torch.cat([
        torch.ones(num_pos),
        torch.zeros(num_pos * args.num_neg_samples),
    ])

    for epoch in range(args.epochs):
        model.train()
        embed = model(train_g, train_g.edata['etype'])
        # negatives reuse the positive edge types, repeated per sample
        neg_triplets = neg_sampler(train_g, torch.arange(train_g.num_edges())) + (
            train_g.edata['etype'].repeat_interleave(args.num_neg_samples),)
        pos_score = model.calc_score(embed, train_triplets)
        neg_score = model.calc_score(embed, neg_triplets)
        scores = torch.cat([pos_score, neg_score])
        loss = F.binary_cross_entropy_with_logits(scores, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # TODO compute MRR
        # backward pass seems slow on FB-15k and FB15k-237?
        print('Epoch {:04d} | Loss {:.4f}'.format(epoch, loss.item()))
def sampler_frontier(self, block_id, g, seed_nodes, timestamp):
    """Build the temporal neighborhood frontier of ``seed_nodes``.

    Only edges observed before ``timestamp`` (plus the freshly added
    self-loop edges) are eligible before delegating to ``self.sampler``.
    """
    # All in-edges of the seeds, plus one self-loop edge per seed so a
    # seed always has at least one incident edge.
    full_neighbor_subgraph = dgl.in_subgraph(g, seed_nodes)
    full_neighbor_subgraph = dgl.add_edges(full_neighbor_subgraph,
                                           seed_nodes, seed_nodes)

    # Boolean `+` acts as logical OR here: keep edges strictly earlier
    # than the query timestamp, or with timestamp <= 0 (which also keeps
    # the self-loops just added, whose timestamp defaults to 0).
    temporal_edge_mask = (full_neighbor_subgraph.edata['timestamp'] < timestamp) + (
        full_neighbor_subgraph.edata['timestamp'] <= 0)

    temporal_subgraph = dgl.edge_subgraph(
        full_neighbor_subgraph, temporal_edge_mask)

    # Map temporal-subgraph node ids back to their original ids.
    temp2origin = temporal_subgraph.ndata[dgl.NID]

    # The added new edges are preserved, hence seeds are guaranteed to be
    # present and can be remapped through this dict.
    root2sub_dict = dict(
        zip(temp2origin.tolist(), temporal_subgraph.nodes().tolist()))
    temporal_subgraph.ndata[dgl.NID] = g.ndata[dgl.NID][temp2origin]
    seed_nodes = [root2sub_dict[int(n)] for n in seed_nodes]
    final_subgraph = self.sampler(g=temporal_subgraph, nodes=seed_nodes)
    # NOTE(review): the return value of remove_self_loop() is discarded;
    # if this API returns a new graph rather than mutating in place, the
    # self-loops are never actually removed — confirm intended behavior.
    final_subgraph.remove_self_loop()
    return final_subgraph
def track_time(graph_name, format, seed_egdes_num):
    """Benchmark dgl.edge_subgraph over ``seed_egdes_num`` random edge ids.

    Returns the mean wall-clock time of three timed runs, after three
    untimed warm-up runs.
    """
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format).to(device)
    seed_edges = np.random.randint(0, graph.num_edges(), seed_egdes_num)

    # warm-up runs (not timed)
    for _ in range(3):
        dgl.edge_subgraph(graph, seed_edges)

    # timed runs
    with utils.Timer() as timer:
        for _ in range(3):
            dgl.edge_subgraph(graph, seed_edges)

    return timer.elapsed_secs / 3
def test_set_batch_info(idtype):
    """Check that batch info restored via _get_subgraph_batch_info makes
    node/edge subgraphs of a batched graph unbatchable into the right parts."""
    ctx = F.ctx()
    g1 = dgl.rand_graph(30, 100).astype(idtype).to(F.ctx())
    g2 = dgl.rand_graph(40, 200).astype(idtype).to(F.ctx())
    bg = dgl.batch([g1, g2])
    batch_num_nodes = F.astype(bg.batch_num_nodes(), idtype)
    batch_num_edges = F.astype(bg.batch_num_edges(), idtype)

    def _restore_batch_info(subg):
        # Recompute per-component node/edge counts from the induced IDs
        # so the subgraph can be unbatched.
        n_nodes = _get_subgraph_batch_info(
            bg.ntypes, [subg.ndata['_ID']], batch_num_nodes)
        n_edges = _get_subgraph_batch_info(
            bg.canonical_etypes, [subg.edata['_ID']], batch_num_edges)
        subg.set_batch_num_nodes(n_nodes)
        subg.set_batch_num_edges(n_edges)

    # test homogeneous node subgraph
    sg_n = dgl.node_subgraph(bg, list(range(10, 20)) + list(range(50, 60)))
    _restore_batch_info(sg_n)
    part1, part2 = dgl.unbatch(sg_n)
    ref1 = dgl.node_subgraph(g1, list(range(10, 20)))
    ref2 = dgl.node_subgraph(g2, list(range(20, 30)))
    assert part1.num_edges() == ref1.num_edges()
    assert part2.num_edges() == ref2.num_edges()

    # test homogeneous edge subgraph
    sg_e = dgl.edge_subgraph(bg, list(range(40, 70)) + list(range(150, 200)),
                             preserve_nodes=True)
    _restore_batch_info(sg_e)
    part1, part2 = dgl.unbatch(sg_e)
    ref1 = dgl.edge_subgraph(g1, list(range(40, 70)), preserve_nodes=True)
    ref2 = dgl.edge_subgraph(g2, list(range(50, 100)), preserve_nodes=True)
    assert part1.num_nodes() == ref1.num_nodes()
    assert part2.num_nodes() == ref2.num_nodes()
def drop_edge(graph, drop_prob):
    """Randomly drop each edge of ``graph`` independently with probability
    ``drop_prob`` and return the resulting edge subgraph (nodes kept as-is)."""
    num_edges = graph.num_edges()
    # Per-edge keep probability, sampled with a Bernoulli draw.
    keep_prob = 1 - th.FloatTensor(np.ones(num_edges) * drop_prob)
    keep_mask = th.bernoulli(keep_prob)
    kept_eids = keep_mask.nonzero().squeeze(1)
    return dgl.edge_subgraph(graph, kept_eids, relabel_nodes=False)
def sample_fraud_event(self, g, bs, current_ts):
    """Sample an edge subgraph of at most ``bs`` fraud events occurring
    strictly before ``current_ts``.

    :param g: graph whose edges are the events
    :param bs: maximum number of fraud events to keep
    :param current_ts: timestamp cutoff; only earlier events are eligible
    :return: DGLGraph induced by the sampled fraud edge ids
    """
    idx = (self.ts < current_ts)
    num_fraud = idx.sum().item()
    if num_fraud > bs:
        # Sample only `bs` of the eligible fraud events. The original code
        # cleared mask positions drawn from range(num_fraud) directly, which
        # only targets the eligible (True) entries if self.ts happens to be
        # sorted ascending; indexing through the actual True positions is
        # correct regardless of ordering.
        eligible = idx.nonzero(as_tuple=True)[0]
        dropped = random.sample(range(num_fraud), num_fraud - bs)
        idx[eligible[dropped]] = False
    fraud_eid = self.fraud_eid[idx]
    fraud_graph = dgl.edge_subgraph(g, fraud_eid)
    return fraud_graph
def _collate(self, items):
    """Sample a subgraph from the given edge ids.

    :param items: tensor(B) edge ids
    :return: tensor(N_src), DGLGraph, List[DGLBlock] — input node ids for
        the knowledge graph, the edge-induced subgraph of the user-item
        graph, and the multi-layer MFGs sampled from the knowledge graph
        starting from the item ids associated with the edges
    """
    eids = prepare_tensor(self.g_sampling, items, 'items')
    pair_graph = dgl.edge_subgraph(self.g, eids)
    seeds = pair_graph.ndata[dgl.NID]
    blocks = self.block_sampler.sample_blocks(self.g_sampling, seeds['item'])
    return blocks[0].srcdata[dgl.NID], pair_graph, blocks
def _create_subgraph(self, node_idx):
    """Get all nodes that contribute to the computation of a node's embedding.

    :param node_idx: target node id, or None for graph classification
        (then the whole graph is returned as a deep copy)
    :return: DGLGraph whose ndata[ExplainerTags.ORIGINAL_ID] maps each
        subgraph node back to its id in the original graph
    """
    if node_idx is None:  # graph classification: explain the whole graph
        sub_g = copy.deepcopy(self.g)
        # torch.arange replaces the deprecated, end-inclusive torch.range;
        # arange(n) produces the same 0..n-1 ids.
        sub_g.ndata[ExplainerTags.ORIGINAL_ID] = torch.arange(
            self.g.num_nodes(), dtype=torch.int)
    else:
        nodes = torch.tensor([node_idx])
        eid_list = []
        # Walk num_hops levels of in-edges to collect the receptive field.
        for _ in range(self.num_hops):
            predecessors, _, eid = self.g.in_edges(nodes, form='all')
            eid_list.extend(eid)
            predecessors = torch.flatten(predecessors).unique()
            nodes = torch.cat([nodes, predecessors])
            nodes = torch.unique(nodes)
        # Deduplicate collected edge ids before inducing the subgraph.
        eid_list = list(np.unique(np.array([eid_list])))
        sub_g = dgl.edge_subgraph(self.g, eid_list)
        # TODO - handle heterogeneous graphs
        sub_g.ndata[ExplainerTags.ORIGINAL_ID] = sub_g.ndata[dgl.NID]
    return sub_g
def sample_frontier(self, block_id, g, seed_nodes, *args, **kwargs):
    """Per edge type, keep for every seed node the fraction
    ``self.p[block_id][etype]`` of its in-edges with the smallest values in
    ``self.dists[block_id][etype]``, and return the induced edge subgraph
    (node ids preserved)."""
    with g.local_scope():
        masks = {}
        for etype in g.canonical_etypes:
            keep = th.zeros(g.number_of_edges(etype))
            # seed_nodes is iterated node by node (single node type)
            for node in seed_nodes:
                in_eids = g.in_edges(node, form='eid', etype=etype)
                budget = th.ceil(g.in_degrees(node, etype=etype)
                                 * self.p[block_id][etype]).int().item()
                dists = self.dists[block_id][etype][in_eids]
                if dists.shape[0] > budget:
                    # indices of the `budget` closest neighbors
                    chosen = np.argpartition(
                        dists.cpu().detach(), budget)[:budget]
                else:
                    chosen = np.arange(budget)
                keep[in_eids[chosen]] = 1
            masks[etype] = keep.bool()
        return dgl.edge_subgraph(g, masks, relabel_nodes=False)
def _collate_with_negative_sampling(self, items):
    """Sample a subgraph from the given edge ids, with negative sampling.

    :param items: tensor(B) edge ids
    :return: tensor(N_src), DGLGraph, DGLGraph, List[DGLBlock] — input node
        ids for the knowledge graph, the edge-induced subgraph of the
        user-item graph, the negative-sample graph, and the multi-layer
        MFGs sampled from the knowledge graph starting from the item ids
        associated with the edges
    """
    items = prepare_tensor(self.g_sampling, items, 'items')

    # Induce the positive pair graph without relabeling nodes; remember the
    # original edge ids so they can be restored after compaction.
    pair_graph = dgl.edge_subgraph(self.g, items, relabel_nodes=False)
    induced_edges = pair_graph.edata[dgl.EID]

    # Build the negative-sample graph over the same canonical edge type.
    neg_srcdst = self.negative_sampler(self.g, items)
    neg_pair_graph = dgl.heterograph(
        {self.g.canonical_etypes[0]: neg_srcdst})

    # Compact both graphs onto a shared, minimal node set.
    pair_graph, neg_pair_graph = dgl.compact_graphs(
        [pair_graph, neg_pair_graph])
    pair_graph.edata[dgl.EID] = induced_edges

    seed_nodes = pair_graph.ndata[dgl.NID]
    blocks = self.block_sampler.sample_blocks(
        self.g_sampling, seed_nodes['item'])
    input_nodes = blocks[0].srcdata[dgl.NID]
    return input_nodes, pair_graph, neg_pair_graph, blocks
def dgl_main():
    """Train a VGAE link-prediction model on a DGL citation dataset and
    report validation/test ROC-AUC and average precision."""
    # Load from DGL dataset
    if args.dataset == 'cora':
        dataset = CoraGraphDataset(reverse_edge=False)
    elif args.dataset == 'citeseer':
        dataset = CiteseerGraphDataset(reverse_edge=False)
    elif args.dataset == 'pubmed':
        dataset = PubmedGraphDataset(reverse_edge=False)
    else:
        raise NotImplementedError
    graph = dataset[0]

    # Extract node features
    feats = graph.ndata.pop('feat').to(device)
    in_dim = feats.shape[-1]

    # Dense adjacency of the full graph, used to split edges below
    adj_orig = graph.adjacency_matrix().to_dense()

    # build test set with 10% positive links
    train_edge_idx, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges_dgl(
        graph, adj_orig)

    graph = graph.to(device)

    # create train graph (node ids preserved so indices stay aligned)
    train_edge_idx = torch.tensor(train_edge_idx).to(device)
    train_graph = dgl.edge_subgraph(graph, train_edge_idx, relabel_nodes=False)
    train_graph = train_graph.to(device)
    adj = train_graph.adjacency_matrix().to_dense().to(device)

    # compute loss parameters (class re-weighting and normalization)
    weight_tensor, norm = compute_loss_para(adj)

    # create model
    vgae_model = model.VGAEModel(in_dim, args.hidden1, args.hidden2)
    vgae_model = vgae_model.to(device)

    # create training component
    optimizer = torch.optim.Adam(vgae_model.parameters(),
                                 lr=args.learning_rate)
    print('Total Parameters:',
          sum([p.nelement() for p in vgae_model.parameters()]))

    # create training epoch
    for epoch in range(args.epochs):
        t = time.time()

        # Training and validation using a full graph
        # NOTE(review): the forward pass runs on the full `graph` while the
        # reconstruction target `adj` comes from the train subgraph —
        # confirm this train/eval split is intended.
        vgae_model.train()
        logits = vgae_model.forward(graph, feats)

        # compute loss: weighted reconstruction BCE minus the KL term
        # (the model is assumed to cache `mean` and `log_std` from the
        # most recent forward pass — verify against VGAEModel)
        loss = norm * F.binary_cross_entropy(
            logits.view(-1), adj.view(-1), weight=weight_tensor)
        kl_divergence = 0.5 / logits.size(0) * (
            1 + 2 * vgae_model.log_std - vgae_model.mean**2 -
            torch.exp(vgae_model.log_std)**2).sum(1).mean()
        loss -= kl_divergence

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc = get_acc(logits, adj)
        val_roc, val_ap = get_scores(val_edges, val_edges_false, logits)

        # Print out performance
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(loss.item()), "train_acc=",
              "{:.5f}".format(train_acc), "val_roc=",
              "{:.5f}".format(val_roc), "val_ap=",
              "{:.5f}".format(val_ap), "time=",
              "{:.5f}".format(time.time() - t))

    # Final evaluation uses the logits from the last training epoch.
    test_roc, test_ap = get_scores(test_edges, test_edges_false, logits)
    # roc_means.append(test_roc)
    # ap_means.append(test_ap)
    print("End of training!", "test_roc=", "{:.5f}".format(test_roc),
          "test_ap=", "{:.5f}".format(test_ap))