def sample_blocks(self, seed_edges):
    n_edges = len(seed_edges)
    seed_edges = th.LongTensor(np.asarray(seed_edges))
    heads, tails = self.g.find_edges(seed_edges)
    if self.neg_share and n_edges % self.num_negs == 0:
        neg_tails = self.neg_sampler(n_edges)
        neg_tails = (neg_tails.view(-1, 1, self.num_negs).expand(
            n_edges // self.num_negs, self.num_negs, self.num_negs).flatten())
        neg_heads = (heads.view(-1, 1).expand(n_edges, self.num_negs).flatten())
    else:
        neg_tails = self.neg_sampler(self.num_negs * n_edges)
        neg_heads = (heads.view(-1, 1).expand(n_edges, self.num_negs).flatten())

    # Maintain the correspondence between heads, tails and negative tails as
    # two graphs.
    # pos_graph contains the correspondence between each head and its positive tail.
    # neg_graph contains the correspondence between each head and its negative tails.
    # Both pos_graph and neg_graph are first constructed with the same node space as
    # the original graph. Then they are compacted together with dgl.compact_graphs.
    pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes())
    neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes())
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    # Obtain the node IDs being used in either pos_graph or neg_graph. Since they
    # are compacted together, pos_graph and neg_graph share the same compacted node
    # space.
    seeds = pos_graph.ndata[dgl.NID]

    blocks = []
    for fanout in self.fanouts:
        # For each seed node, sample ``fanout`` neighbors.
        frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=True)
        # Remove all edges between heads and tails, as well as heads and neg_tails.
        _, _, edge_ids = frontier.edge_ids(
            th.cat([heads, tails, neg_heads, neg_tails]),
            th.cat([tails, heads, neg_tails, neg_heads]),
            return_uv=True,
        )
        frontier = dgl.remove_edges(frontier, edge_ids)
        # Then we compact the frontier into a bipartite graph for message passing.
        block = dgl.to_block(frontier, seeds)
        # Pre-generate the CSR format so it can be used in training directly.
        block.in_degree(0)
        # Obtain the seed nodes for the next layer.
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return pos_graph, neg_graph, blocks

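A minimal, self-contained sketch of the compaction step above in isolation (the toy edge tensors are illustrative, not from the original sampler): pos_graph and neg_graph are built over the full node space, and dgl.compact_graphs relabels them into one shared, gap-free node space while dgl.NID retains the mapping back to the original IDs.

import dgl
import torch as th

num_nodes = 10
heads, tails = th.tensor([0, 2]), th.tensor([5, 7])
neg_tails = th.tensor([9, 4])

pos_graph = dgl.graph((heads, tails), num_nodes=num_nodes)
neg_graph = dgl.graph((heads, neg_tails), num_nodes=num_nodes)
pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])

# Only the 6 nodes touched by either graph survive, and both graphs share
# the same compacted node space, so their dgl.NID mappings are identical.
print(pos_graph.num_nodes())
assert th.equal(pos_graph.ndata[dgl.NID], neg_graph.ndata[dgl.NID])
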
def sample_from_item_pairs(self, heads, tails, neg_tails):
    # Create a graph with positive connections only and another graph with
    # negative connections only.
    pos_graph = dgl.graph((heads, tails),
                          num_nodes=self.g.number_of_nodes(self.item_type))
    neg_graph = dgl.graph((heads, neg_tails),
                          num_nodes=self.g.number_of_nodes(self.item_type))
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    seeds = pos_graph.ndata[dgl.NID]

    blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
    return pos_graph, neg_graph, blocks

def sample_from_item_pairs(self, heads, tails, neg_tails):
    # Create a graph with positive connections only and another graph with
    # negative connections only.
    pos_graph = self.build_hetero_graph(heads, tails)
    neg_graph = self.build_hetero_graph(heads, neg_tails)

    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    pos_nodes = pos_graph.ndata[dgl.NID]
    seed_nodes = pos_nodes  # same as neg_nodes from neg_graph

    blocks = self.sampler.sample_blocks(
        self.hg, seed_nodes, exclude_eids=None)
    return pos_graph, neg_graph, blocks

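A hedged sketch of one plausible choice for self.sampler above (an assumption; the snippet does not say which sampler is used): under the DGL 0.5-0.7 dataloading API, a MultiLayerNeighborSampler's sample_blocks(g, seeds, exclude_eids) returned the list of MFGs directly. Newer DGL releases changed this signature, so treat this as version-specific.

import dgl
import torch

g = dgl.rand_graph(100, 1000)  # toy homogeneous graph
sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 5])
# DGL 0.5-0.7 behavior: returns the blocks list, outermost layer first.
blocks = sampler.sample_blocks(g, torch.tensor([0, 1]), exclude_eids=None)
print([(b.num_src_nodes(), b.num_dst_nodes()) for b in blocks])
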
def sample_from_item_pairs(self, heads, tails, neg_tails):
    # Create a graph with positive connections only and another graph with
    # negative connections only.
    pos_graph = dgl.graph((heads, tails),
                          num_nodes=self.g.number_of_nodes(self.item_type))
    neg_graph = dgl.graph((heads, neg_tails),
                          num_nodes=self.g.number_of_nodes(self.item_type))
    # Eliminate isolated nodes (in-degree and out-degree both 0) from all the
    # graphs. After compaction the nodes are re-indexed from 0, and dgl.NID
    # maps each compacted ID back to its original ID in the full graph.
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    # seeds = heads + tails + neg_tails
    seeds = pos_graph.ndata[dgl.NID]

    blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
    return pos_graph, neg_graph, blocks

def sample_from_item_pairs(self, heads, tails, neg_tails):
    # Create a graph with positive connections only and another graph with
    # negative connections only.
    pos_graph = dgl.graph(
        (heads, tails), num_nodes=self.g.number_of_nodes(self.item_type))
    neg_graph = dgl.graph(
        (heads, neg_tails), num_nodes=self.g.number_of_nodes(self.item_type))
    # Remove isolated nodes and re-index all nodes and edges.
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    seeds = pos_graph.ndata[dgl.NID]  # node IDs mapping back to the global graph g

    # Extract the 2-hop-neighbor MFG structure for message passing.
    blocks, context_dicts = self.sample_blocks(seeds, heads, tails, neg_tails)
    return pos_graph, neg_graph, blocks, context_dicts

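All the variants above read their seeds from pos_graph.ndata[dgl.NID]. A small runnable demo (toy numbers, not from any of the snippets) of how that mapping recovers the original item IDs from the edges of the compacted graph:

import dgl
import torch

heads, tails = torch.tensor([1, 5]), torch.tensor([5, 7])
pair = dgl.compact_graphs(dgl.graph((heads, tails), num_nodes=8))

u, v = pair.edges()            # endpoints in compacted IDs
orig = pair.ndata[dgl.NID]     # compacted ID -> original ID
assert torch.equal(orig[u], heads) and torch.equal(orig[v], tails)
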
def get_graph(name, format=None):
    # global GRAPH_CACHE
    # if name in GRAPH_CACHE:
    #     return GRAPH_CACHE[name].to(format)
    if isinstance(format, str):
        format = [format]
    if format is None:  # didn't specify a format
        format = ['csc', 'csr', 'coo']
    g = None
    if name == 'cora':
        g = dgl.data.CoraGraphDataset(verbose=False)[0]
    elif name == 'pubmed':
        g = dgl.data.PubmedGraphDataset(verbose=False)[0]
    elif name == 'livejournal':
        # Note: ``format`` is a list at this point, so its repr is embedded
        # in the cache filename.
        bin_path = "/tmp/dataset/livejournal/livejournal_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            g = get_livejournal().formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name == "friendster":
        bin_path = "/tmp/dataset/friendster/friendster_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            # The original node IDs of friendster are not consecutive,
            # so we compact it.
            g = dgl.compact_graphs(get_friendster()).formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name == "reddit":
        bin_path = "/tmp/dataset/reddit/reddit_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name.startswith("ogb"):
        g = get_ogb_graph(name)
    else:
        raise Exception("Unknown dataset")
    # GRAPH_CACHE[name] = g
    g = g.formats(format)
    return g

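A hypothetical usage of get_graph (the small dgl.data datasets download on first use, so no /tmp/dataset cache directory is needed for them; the large datasets assume those directories exist):

g = get_graph('cora', 'csc')   # restrict the graph to the CSC sparse format
print(g.formats())             # reports which sparse formats are allowed/created
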
def sample(self, batch):
    users, items, ratings = zip(*batch)
    users = torch.stack(users)
    items = torch.stack(items)
    ratings = torch.stack(ratings)
    # 1. Build the bipartite user-item pair graph.
    pair_graph = dgl.heterograph(
        {('user', 'watched', 'item'): (users, items)},
        num_nodes_dict={
            'user': self.graph.num_nodes('user'),
            'item': self.graph.num_nodes('item')
        })
    u = users.tolist()
    i = items.tolist()
    real_data = torch.tensor(list(zip(u, i)), dtype=torch.int)
    pair_graph.edata['real_data'] = real_data
    # 2. Compact the bipartite graph.
    pair_graph = dgl.compact_graphs(pair_graph)
    pair_graph.edata['rating'] = ratings
    # 3. Build the blocks.
    seeds = {
        'user': pair_graph.nodes['user'].data[dgl.NID],
        'item': pair_graph.nodes['item'].data[dgl.NID]
    }
    blocks = self.construct_blocks(seeds, (users, items))

    # Copy the node features over as well.
    # Note that only the source-side nodes need handling here.
    for feature_name in self.graph.nodes['user'].data.keys():
        blocks[0].srcnodes['user'].data[feature_name] = \
            self.graph.nodes['user'].data[feature_name][blocks[0].srcnodes['user'].data[dgl.NID]]
    for feature_name in self.graph.nodes['item'].data.keys():
        blocks[0].srcnodes['item'].data[feature_name] = \
            self.graph.nodes['item'].data[feature_name][blocks[0].srcnodes['item'].data[dgl.NID]]
    return pair_graph, blocks

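construct_blocks is referenced above but not shown. A hypothetical sketch of what it might look like (an assumption, not the author's implementation; self.num_layers is assumed, and real implementations usually also exclude the training edges passed in via user_item_pairs):

import dgl

def construct_blocks(self, seeds, user_item_pairs):
    # Build one MFG per layer from the full in-coming neighborhood of the
    # current seeds, innermost layer last.
    blocks = []
    for _ in range(self.num_layers):
        frontier = dgl.in_subgraph(self.graph, seeds)
        block = dgl.to_block(frontier, seeds)
        seeds = {ntype: block.srcnodes[ntype].data[dgl.NID]
                 for ntype in block.srctypes}
        blocks.insert(0, block)
    return blocks
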
def sample(self, labels: Tensor) -> Batch:
    """Sample blocks and a decode graph from labels.

    :param labels: tensor of dim Nx3, (src_nodes, dst_nodes, labels)
    :return: blocks for calculating node representations, a decode graph,
        and edge labels
    """
    labels = torch.cat([label[0].unsqueeze(0) for label in labels], dim=0)
    if self._negative_sampling is True:
        negative_labels = self._sample_negative_labels(labels)
        labels = torch.cat([labels, negative_labels], dim=0)
    decode_graph = dgl.graph(data=(labels[:, 0], labels[:, 1]),
                             num_nodes=self._g.number_of_nodes())
    decode_graph = dgl.compact_graphs(decode_graph)
    seed_nodes = decode_graph.ndata[dgl.NID]

    blocks = list()
    for fanout in self._fanouts:
        sub_graph = dgl.sampling.sample_neighbors(g=self._g,
                                                  nodes=seed_nodes,
                                                  fanout=fanout)
        block = dgl.to_block(sub_graph, seed_nodes)
        blocks.insert(0, block)
        seed_nodes = block.srcdata[dgl.NID]

    input_features = self._g.ndata["feat"][blocks[0].srcdata[dgl.NID]]
    return Batch(blocks, decode_graph, input_features, labels[:, 2])

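_sample_negative_labels is not shown either; a hypothetical sketch of its contract (an assumption, not the original): corrupt each positive pair's destination uniformly and give the corrupted row label 0.

import torch

def _sample_negative_labels(self, labels: torch.Tensor) -> torch.Tensor:
    # Hypothetical: one uniform negative per positive (src, dst, label) row.
    num = labels.shape[0]
    neg_dst = torch.randint(0, self._g.number_of_nodes(), (num,))
    zeros = torch.zeros(num, dtype=labels.dtype)
    return torch.stack([labels[:, 0], neg_dst, zeros], dim=1)
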
def obtain_Bs(self, ed_ids):
    n_edges = len(ed_ids)
    ed_ids = torch.LongTensor(np.asarray(ed_ids))
    heads, tails = self.g.find_edges(ed_ids)
    neg_tails = self.weights.multinomial(self.num_negs * n_edges, replacement=True)
    neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten()
    pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes())
    neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes())
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    ids = pos_graph.ndata[dgl.NID]
    B = []
    for s in self.fanout:
        # Returns the sampled graph: the nodes are unchanged and only the
        # sampled edges are kept.
        nf = sample_neighbors(self.g, nodes=ids, fanout=s, replace=True)
        _, _, edge_ids = nf.edge_ids(
            torch.cat([heads, tails, neg_heads, neg_tails]),
            torch.cat([tails, heads, neg_tails, neg_heads]),
            return_uv=True)
        # Remove the edges used for the loss function; forward propagation
        # uses the remaining edges.
        nf = dgl.remove_edges(nf, edge_ids)
        # Convert to a bipartite block so the src and dst nodes are easy to
        # read, with the next layer's nodes as dst.
        b = dgl.to_block(nf, ids)
        ids = b.srcdata[dgl.NID]  # the block's src nodes become the previous layer's ids
        B.insert(0, b)  # insert at the front of the list
    return pos_graph, neg_graph, B

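A standalone sketch of the exclusion step used above (toy graph and edge tensors, not from the original class): the supervision edges and their reverses are looked up in the sampled frontier with return_uv=True, which reports only the (u, v) pairs that actually exist in the frontier, and those edges are then removed so message passing cannot leak the edges being predicted.

import dgl
import torch

g = dgl.rand_graph(50, 400)
seeds = torch.arange(5)
frontier = dgl.sampling.sample_neighbors(g, seeds, fanout=10)

heads, tails = torch.tensor([0, 1]), torch.tensor([2, 3])
# Look up both directions; missing pairs are silently skipped.
_, _, eids = frontier.edge_ids(torch.cat([heads, tails]),
                               torch.cat([tails, heads]),
                               return_uv=True)
frontier = dgl.remove_edges(frontier, eids)
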
def _collate_with_negative_sampling(self, items):
    """Sample a subgraph from the given edge IDs and perform negative sampling.

    :param items: tensor(B), edge IDs
    :return: tensor(N_src), DGLGraph, DGLGraph, List[DGLBlock]
        input node IDs of the knowledge graph, the edge subgraph of the
        user-item graph, the negative-sample graph, and the multi-layer MFGs
        sampled from the knowledge graph starting from the item IDs
        associated with the edge IDs
    """
    items = prepare_tensor(self.g_sampling, items, 'items')
    pair_graph = dgl.edge_subgraph(self.g, items, relabel_nodes=False)
    induced_edges = pair_graph.edata[dgl.EID]

    neg_srcdst = self.negative_sampler(self.g, items)
    neg_pair_graph = dgl.heterograph(
        {self.g.canonical_etypes[0]: neg_srcdst})

    pair_graph, neg_pair_graph = dgl.compact_graphs(
        [pair_graph, neg_pair_graph])
    pair_graph.edata[dgl.EID] = induced_edges
    seed_nodes = pair_graph.ndata[dgl.NID]

    blocks = self.block_sampler.sample_blocks(self.g_sampling, seed_nodes['item'])
    input_nodes = blocks[0].srcdata[dgl.NID]
    return input_nodes, pair_graph, neg_pair_graph, blocks

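The negative_sampler above matches the call signature of DGL's built-in uniform negative sampler; a minimal demo on a toy graph (not the original setup):

import dgl
import torch

g = dgl.rand_graph(20, 60)
neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)
# Draws 5 corrupted destinations per seed edge: 10 edges -> 50 pairs.
src, neg_dst = neg_sampler(g, torch.arange(10))
print(src.shape, neg_dst.shape)
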
def test_compact(index_dtype):
    g1 = dgl.heterograph({
        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
        {'user': 20, 'game': 10}, index_dtype=index_dtype)

    g2 = dgl.heterograph({
        ('game', 'clicked-by', 'user'): [(3, 1)],
        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
        {'user': 20, 'game': 10}, index_dtype=index_dtype)

    g3 = dgl.graph([(0, 1), (1, 2)], num_nodes=10, ntype='user',
                   index_dtype=index_dtype)
    g4 = dgl.graph([(1, 3), (3, 5)], num_nodes=10, ntype='user',
                   index_dtype=index_dtype)

    def _check(g, new_g, induced_nodes):
        assert g.ntypes == new_g.ntypes
        assert g.canonical_etypes == new_g.canonical_etypes

        for ntype in g.ntypes:
            assert -1 not in induced_nodes[ntype]

        for etype in g.canonical_etypes:
            g_src, g_dst = g.all_edges(order='eid', etype=etype)
            g_src = F.asnumpy(g_src)
            g_dst = F.asnumpy(g_dst)
            new_g_src, new_g_dst = new_g.all_edges(order='eid', etype=etype)
            new_g_src_mapped = induced_nodes[etype[0]][F.asnumpy(new_g_src)]
            new_g_dst_mapped = induced_nodes[etype[2]][F.asnumpy(new_g_dst)]
            assert (g_src == new_g_src_mapped).all()
            assert (g_dst == new_g_dst_mapped).all()

    # Test default
    new_g1 = dgl.compact_graphs(g1)
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID]
                     for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
    assert set(induced_nodes['game']) == set([4, 5, 6])
    _check(g1, new_g1, induced_nodes)

    # Test with always_preserve given a dict
    new_g1 = dgl.compact_graphs(
        g1, always_preserve={'game': F.tensor([4, 7], dtype=getattr(F, index_dtype))})
    assert new_g1._idtype_str == index_dtype
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID]
                     for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
    assert set(induced_nodes['game']) == set([4, 5, 6, 7])
    _check(g1, new_g1, induced_nodes)

    # Test with always_preserve given a tensor
    new_g3 = dgl.compact_graphs(
        g3, always_preserve=F.tensor([1, 7], dtype=getattr(F, index_dtype)))
    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID]
                     for ntype in new_g3.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g3._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([0, 1, 2, 7])
    _check(g3, new_g3, induced_nodes)

    # Test multiple graphs
    new_g1, new_g2 = dgl.compact_graphs([g1, g2])
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID]
                     for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert new_g2._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
    assert set(induced_nodes['game']) == set([3, 4, 5, 6])
    _check(g1, new_g1, induced_nodes)
    _check(g2, new_g2, induced_nodes)

    # Test multiple graphs with always_preserve given a dict
    new_g1, new_g2 = dgl.compact_graphs(
        [g1, g2],
        always_preserve={'game': F.tensor([4, 7], dtype=getattr(F, index_dtype))})
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID]
                     for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert new_g2._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
    assert set(induced_nodes['game']) == set([3, 4, 5, 6, 7])
    _check(g1, new_g1, induced_nodes)
    _check(g2, new_g2, induced_nodes)

    # Test multiple graphs with always_preserve given a tensor
    new_g3, new_g4 = dgl.compact_graphs(
        [g3, g4], always_preserve=F.tensor([1, 7], dtype=getattr(F, index_dtype)))
    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID]
                     for ntype in new_g3.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g3._idtype_str == index_dtype
    assert new_g4._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([0, 1, 2, 3, 5, 7])
    _check(g3, new_g3, induced_nodes)
    _check(g4, new_g4, induced_nodes)

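A quick runnable illustration of the always_preserve semantics the test exercises (toy graph, using the current dgl.graph tuple API rather than the legacy index_dtype one): preserved node IDs survive compaction even when no edge touches them.

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])), num_nodes=10)
cg = dgl.compact_graphs(g, always_preserve=torch.tensor([7]))
print(cg.ndata[dgl.NID])   # contains 0, 1, 2 plus the preserved 7
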
def sample_blocks(self, seeds):
    """Sample subgraphs from the entire graph.

    The input ``seeds`` represents the edges to compute prediction for. The
    sampling algorithm works as follows:

    1. Get the head and tail nodes of the provided seed edges.
    2. For each head and tail node, extract the entire in-coming neighborhood.
    3. Copy the node features/embeddings from the full graph to the sampled
       subgraphs.
    """
    dataset = self.dataset
    enc_graph = self.enc_graph
    dec_graph = self.dec_graph
    edge_ids = th.stack(seeds)

    # Generate frontiers for user and item.
    possible_rating_values = dataset.possible_rating_values
    true_relation_ratings = self.truths[edge_ids]
    true_relation_labels = None if self.labels is None else self.labels[edge_ids]

    # 1. Get the head and tail nodes from both the decoder and encoder graphs.
    head_id, tail_id = dec_graph.find_edges(edge_ids)
    utype, _, vtype = enc_graph.canonical_etypes[0]
    subg = []
    true_rel_ratings = []
    true_rel_labels = []
    for possible_rating_value in possible_rating_values:
        idx_loc = (true_relation_ratings == possible_rating_value)
        head = head_id[idx_loc]
        tail = tail_id[idx_loc]
        true_rel_ratings.append(true_relation_ratings[idx_loc])
        if self.labels is not None:
            true_rel_labels.append(true_relation_labels[idx_loc])
        subg.append(
            dgl.bipartite((head, tail),
                          utype=utype,
                          etype=str(possible_rating_value),
                          vtype=vtype,
                          num_nodes=(enc_graph.number_of_nodes(utype),
                                     enc_graph.number_of_nodes(vtype))))
    # Convert the encoder subgraph to a more compact one by removing nodes
    # not covered by the seed edges.
    g = dgl.hetero_from_relations(subg)
    g = dgl.compact_graphs(g)

    # 2. For each head and tail node, extract the entire in-coming neighborhood.
    seed_nodes = {}
    for ntype in g.ntypes:
        seed_nodes[ntype] = g.nodes[ntype].data[dgl.NID]
    frontier = dgl.in_subgraph(enc_graph, seed_nodes)
    frontier = dgl.to_block(frontier, seed_nodes)

    # 3. Copy the node features/embeddings from the full graph to the sampled
    #    subgraphs.
    frontier.dstnodes['user'].data['ci'] = \
        enc_graph.nodes['user'].data['ci'][frontier.dstnodes['user'].data[dgl.NID]]
    frontier.srcnodes['movie'].data['cj'] = \
        enc_graph.nodes['movie'].data['cj'][frontier.srcnodes['movie'].data[dgl.NID]]
    frontier.srcnodes['user'].data['cj'] = \
        enc_graph.nodes['user'].data['cj'][frontier.srcnodes['user'].data[dgl.NID]]
    frontier.dstnodes['movie'].data['ci'] = \
        enc_graph.nodes['movie'].data['ci'][frontier.dstnodes['movie'].data[dgl.NID]]

    # Handle features.
    head_feat = frontier.srcnodes['user'].data[dgl.NID].long() \
        if dataset.user_feature is None else \
        dataset.user_feature[frontier.srcnodes['user'].data[dgl.NID]]
    tail_feat = frontier.srcnodes['movie'].data[dgl.NID].long() \
        if dataset.movie_feature is None else \
        dataset.movie_feature[frontier.srcnodes['movie'].data[dgl.NID]]

    true_rel_labels = None if self.labels is None else th.cat(true_rel_labels, dim=0)
    true_rel_ratings = th.cat(true_rel_ratings, dim=0)
    return (g, frontier, head_feat, tail_feat, true_rel_labels, true_rel_ratings)

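Note that dgl.bipartite and dgl.hetero_from_relations are legacy constructors removed around DGL 0.5. A rough modern equivalent of the per-rating relation construction (toy tensors and counts, not the GCMC data) builds all rating-typed relations in one dgl.heterograph call:

import dgl
import torch

rel = {('user', '1', 'movie'): (torch.tensor([0]), torch.tensor([1])),
       ('user', '2', 'movie'): (torch.tensor([2]), torch.tensor([3]))}
g = dgl.heterograph(rel, num_nodes_dict={'user': 5, 'movie': 6})
g = dgl.compact_graphs(g)
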
g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])))
len_event = src.shape[0]
g.edata['label'] = label.repeat(2).squeeze()
g.edata['timestamp'] = timestamp.repeat(2).squeeze()
g.edata['feat'] = edge_feat.repeat(2, 1).squeeze()
print(g)
save_graphs(f"./data/{args.data}.bin", g)

if args.new_node_count:
    origin_num_edges = g.num_edges() // 2
    train_eid = torch.arange(0, int(0.7 * origin_num_edges))
    un_train_eid = torch.arange(int(0.7 * origin_num_edges), origin_num_edges)
    train_g = dgl.graph(g.find_edges(train_eid))
    val_n_test_g = dgl.compact_graphs(dgl.graph(g.find_edges(un_train_eid)))
    print(f'total nodes: {g.num_nodes()}, '
          f'training nodes: {train_g.num_nodes()}, '
          f'val_n_test nodes: {val_n_test_g.num_nodes()}')
    # New nodes are those that never appear in the training split; old nodes
    # are the remaining val/test nodes the model has already seen.
    old_nodes = val_n_test_g.num_nodes() - g.num_nodes() + train_g.num_nodes()
    print(f'old nodes in val_n_test: {old_nodes} '
          f'({round(old_nodes * 100 / val_n_test_g.num_nodes(), 4)}%)')
    new_nodes = g.num_nodes() - train_g.num_nodes()
    print(f'new nodes in val_n_test: {new_nodes} '
          f'({round(new_nodes * 100 / val_n_test_g.num_nodes(), 4)}%)')
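Here dgl.compact_graphs is only being used to count the distinct endpoints of the held-out edges, since compaction drops every untouched node. A toy demo of that trick (illustrative numbers):

import dgl
import torch

eg = dgl.graph((torch.tensor([3, 9]), torch.tensor([9, 4])), num_nodes=100)
print(dgl.compact_graphs(eg).num_nodes())   # 3 distinct endpoints: {3, 4, 9}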