    def sample_blocks(self, seed_edges):
        n_edges = len(seed_edges)
        seed_edges = th.LongTensor(np.asarray(seed_edges))
        heads, tails = self.g.find_edges(seed_edges)
        if self.neg_share and n_edges % self.num_negs == 0:
            neg_tails = self.neg_sampler(n_edges)
            neg_tails = (neg_tails.view(-1, 1, self.num_negs).expand(
                n_edges // self.num_negs, self.num_negs,
                self.num_negs).flatten())
            neg_heads = (heads.view(-1, 1).expand(n_edges,
                                                  self.num_negs).flatten())
        else:
            neg_tails = self.neg_sampler(self.num_negs * n_edges)
            neg_heads = (heads.view(-1, 1).expand(n_edges,
                                                  self.num_negs).flatten())

        # Maintain the correspondence between heads, tails and negative tails as two
        # graphs.
        # pos_graph contains the correspondence between each head and its positive tail.
        # neg_graph contains the correspondence between each head and its negative tails.
        # Both pos_graph and neg_graph are first constructed with the same node space as
        # the original graph.  Then they are compacted together with dgl.compact_graphs.
        pos_graph = dgl.graph((heads, tails),
                              num_nodes=self.g.number_of_nodes())
        neg_graph = dgl.graph((neg_heads, neg_tails),
                              num_nodes=self.g.number_of_nodes())
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])

        # Obtain the node IDs being used in either pos_graph or neg_graph.  Since they
        # are compacted together, pos_graph and neg_graph share the same compacted node
        # space.
        seeds = pos_graph.ndata[dgl.NID]
        blocks = []
        for fanout in self.fanouts:
            # For each seed node, sample ``fanout`` neighbors.
            frontier = dgl.sampling.sample_neighbors(self.g,
                                                     seeds,
                                                     fanout,
                                                     replace=True)
            # Remove all edges between heads and tails, as well as heads and neg_tails.
            _, _, edge_ids = frontier.edge_ids(
                th.cat([heads, tails, neg_heads, neg_tails]),
                th.cat([tails, heads, neg_tails, neg_heads]),
                return_uv=True,
            )
            frontier = dgl.remove_edges(frontier, edge_ids)
            # Then we compact the frontier into a bipartite graph for message passing.
            block = dgl.to_block(frontier, seeds)

            # Pre-generate the CSR format so that it can be used directly in training.
            block.in_degree(0)
            # Obtain the seed nodes for next layer.
            seeds = block.srcdata[dgl.NID]

            blocks.insert(0, block)

        return pos_graph, neg_graph, blocks
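The key trick above is that dgl.compact_graphs relabels both graphs into one shared, 0-based node space. A minimal standalone sketch of that behavior (a recent DGL with the PyTorch backend is assumed; the toy IDs are made up):

import dgl
import torch as th

g_pos = dgl.graph((th.tensor([1, 3]), th.tensor([5, 7])), num_nodes=100)
g_neg = dgl.graph((th.tensor([1, 3]), th.tensor([9, 9])), num_nodes=100)

# Compacting together keeps only the nodes with at least one edge in either
# graph and relabels them 0..4 consistently across both graphs.
g_pos_c, g_neg_c = dgl.compact_graphs([g_pos, g_neg])
print(g_pos_c.num_nodes())     # 5
print(g_pos_c.ndata[dgl.NID])  # original IDs of the kept nodes
assert th.equal(g_pos_c.ndata[dgl.NID], g_neg_c.ndata[dgl.NID])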
Example #2
    def sample_from_item_pairs(self, heads, tails, neg_tails):
        pos_graph = dgl.graph((heads, tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph((heads, neg_tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        seeds = pos_graph.ndata[dgl.NID]
        blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks
Example #3
    def sample_from_item_pairs(self, heads, tails, neg_tails):
        # Create a graph with positive connections only and another graph with negative
        # connections only.
        pos_graph = dgl.graph((heads, tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph((heads, neg_tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        seeds = pos_graph.ndata[dgl.NID]

        blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks
Example #4
    def sample_from_item_pairs(self, heads, tails, neg_tails):
        # Create a graph with positive connections only and another graph with negative
        # connections only.
        pos_graph = self.build_hetero_graph(heads, tails)
        neg_graph = self.build_hetero_graph(heads, neg_tails)

        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        pos_nodes = pos_graph.ndata[dgl.NID]
        seed_nodes = pos_nodes  # same as neg_graph's nodes, since both graphs were compacted together

        blocks = self.sampler.sample_blocks(
            self.hg, seed_nodes, exclude_eids=None)
        return pos_graph, neg_graph, blocks
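Here self.sampler is presumably a dgl.dataloading block sampler; build_hetero_graph and self.hg come from the surrounding class. A hedged sketch of how such a sampler might be constructed under the DGL 0.6-era API, where BlockSampler.sample_blocks takes (g, seed_nodes, exclude_eids):

import dgl

# Assumption: a two-layer GNN, sampling 10 then 25 neighbors per seed node.
sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])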
Example #5
    def sample_from_item_pairs(self, heads, tails, neg_tails):
        # Create a graph with positive connections only and another graph with negative
        # connections only.
        pos_graph = dgl.graph((heads, tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph((heads, neg_tails),
                              num_nodes=self.g.number_of_nodes(self.item_type))
        # Eliminate isolated nodes (in-degree and out-degree == 0) from both graphs.
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        # After compaction the nodes are re-indexed from 0; dgl.NID maps each
        # compacted node back to its original ID in the full graph.
        # seeds = heads + tails + neg_tails
        seeds = pos_graph.ndata[dgl.NID]

        blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks
Example #6
    def sample_from_item_pairs(self, heads, tails, neg_tails):
        # Create a graph with positive connections only and another graph with negative
        # connections only.
        pos_graph = dgl.graph(
            (heads, tails),
            num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph(
            (heads, neg_tails),
            num_nodes=self.g.number_of_nodes(self.item_type))

        # remove isolated nodes and re-index all nodes and edges
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        seeds = pos_graph.ndata[dgl.NID]  # original node IDs in the global graph g

        # extract the 2-hop neighborhood as MFGs (blocks) for message passing
        blocks, context_dicts = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks, context_dicts
Example #7
def get_graph(name, format=None):
    # global GRAPH_CACHE
    # if name in GRAPH_CACHE:
    #     return GRAPH_CACHE[name].to(format)
    if isinstance(format, str):
        format = [format]  # a single format was given; wrap it in a list
    if format is None:
        format = ['csc', 'csr', 'coo']
    g = None
    if name == 'cora':
        g = dgl.data.CoraGraphDataset(verbose=False)[0]
    elif name == 'pubmed':
        g = dgl.data.PubmedGraphDataset(verbose=False)[0]
    elif name == 'livejournal':
        bin_path = "/tmp/dataset/livejournal/livejournal_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            g = get_livejournal().formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name == "friendster":
        bin_path = "/tmp/dataset/friendster/friendster_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            # the original node IDs of friendster are not consecutive, so we compact it
            g = dgl.compact_graphs(get_friendster()).formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name == "reddit":
        bin_path = "/tmp/dataset/reddit/reddit_{}.bin".format(format)
        if os.path.exists(bin_path):
            g_list, _ = dgl.load_graphs(bin_path)
            g = g_list[0]
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats(format)
            dgl.save_graphs(bin_path, [g])
    elif name.startswith("ogb"):
        g = get_ogb_graph(name)
    else:
        raise ValueError("Unknown dataset: {}".format(name))
    # GRAPH_CACHE[name] = g
    g = g.formats(format)
    return g
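A hypothetical call for reference: 'cora' is fetched through dgl.data on first use, while the larger datasets are cached as .bin files under /tmp/dataset/.

g = get_graph('cora', 'csr')  # materializes only the CSR format
print(g)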
Example #8
    def sample(self, batch):
        users, items, ratings = zip(*batch)
        users = torch.stack(users)
        items = torch.stack(items)
        ratings = torch.stack(ratings)
        # 1. Build the bipartite user-item pair graph
        pair_graph = dgl.heterograph(
            {('user', 'watched', 'item'): (users, items)},
            num_nodes_dict={
                'user': self.graph.num_nodes('user'),
                'item': self.graph.num_nodes('item')
            })

        u = users.tolist()
        i = items.tolist()
        real_data = torch.tensor(list(zip(u, i)), dtype=torch.int)
        pair_graph.edata['real_data'] = real_data

        # 2. Compact the pair graph
        pair_graph = dgl.compact_graphs(pair_graph)
        pair_graph.edata['rating'] = ratings

        # 3. Build the message-passing blocks
        seeds = {
            'user': pair_graph.nodes['user'].data[dgl.NID],
            'item': pair_graph.nodes['item'].data[dgl.NID]
        }
        blocks = self.construct_blocks(seeds, (users, items))

        # Copy the node features over as well.
        # Note that only the source-side nodes need features here.
        for feature_name in self.graph.nodes['user'].data.keys():
            blocks[0].srcnodes['user'].data[feature_name] = \
                self.graph.nodes['user'].data[feature_name][blocks[0].srcnodes['user'].data[dgl.NID]]
        for feature_name in self.graph.nodes['item'].data.keys():
            blocks[0].srcnodes['item'].data[feature_name] = \
                self.graph.nodes['item'].data[feature_name][blocks[0].srcnodes['item'].data[dgl.NID]]

        return pair_graph, blocks
Example #9
    def sample(self, labels: Tensor) -> Batch:
        """sample blocks and decode graph from labels

        :param labels: tensor of dim Nx3, (src_nodes, dst_nodes, labels)
        :return: blocks for calculating node representations, a decode graph, and edge labels
        """
        # Stack the incoming rows into an (N, 3) tensor of (src, dst, label).
        labels = torch.cat([label[0].unsqueeze(0) for label in labels], dim=0)
        if self._negative_sampling:
            negative_labels = self._sample_negative_labels(labels)
            labels = torch.cat([labels, negative_labels], dim=0)

        decode_graph = dgl.graph(data=(labels[:, 0], labels[:, 1]), num_nodes=self._g.number_of_nodes())
        decode_graph = dgl.compact_graphs(decode_graph)
        seed_nodes = decode_graph.ndata[dgl.NID]

        blocks = list()
        for fanout in self._fanouts:
            sub_graph = dgl.sampling.sample_neighbors(g=self._g, nodes=seed_nodes, fanout=fanout)
            block = dgl.to_block(sub_graph, seed_nodes)
            blocks.insert(0, block)
            seed_nodes = block.srcdata[dgl.NID]
        input_features = self._g.ndata["feat"][blocks[0].srcdata[dgl.NID]]
        return Batch(blocks, decode_graph, input_features, labels[:, 2])
Example #10
    def obtain_Bs(self, ed_ids):
        n_edges = len(ed_ids)
        ed_ids = torch.LongTensor(np.asarray(ed_ids))
        heads, tails = self.g.find_edges(ed_ids)
        neg_tails = self.weights.multinomial(self.num_negs * n_edges, replacement=True)
        neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten()
        pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes())
        neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes())
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])

        ids = pos_graph.ndata[dgl.NID]
        B = []
        for s in self.fanout:
            nf = sample_neighbors(self.g, nodes=ids, fanout=s, replace=True)      # sampled graph: same nodes, but only the sampled edges are kept
            _, _, edge_ids = nf.edge_ids(
                torch.cat([heads, tails, neg_heads, neg_tails]),
                torch.cat([tails, heads, neg_tails, neg_heads]),
                return_uv=True)
            nf = dgl.remove_edges(nf, edge_ids)          # drop the edges used by the loss; message passing uses the rest
            b = dgl.to_block(nf, ids)       # convert to a bipartite block with the current seeds as dst nodes
            ids = b.srcdata[dgl.NID]        # the block's src nodes become the seeds for the previous layer
            B.insert(0, b)                  # prepend so blocks run from input layer to output layer
        return pos_graph, neg_graph, B
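Like Example #1, this sampler excludes the supervision edges from the sampled frontier so the model cannot see the very edges it must predict. A minimal standalone sketch of that exclusion step on a toy graph (recent DGL assumed):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])))
# Query both directions; with return_uv=True, pairs that have no edge are
# silently skipped instead of raising an error.
_, _, eids = g.edge_ids(torch.tensor([0, 1, 1, 2]),
                        torch.tensor([1, 0, 2, 1]),
                        return_uv=True)
g2 = dgl.remove_edges(g, eids)
print(g2.num_edges())  # 1 -- only the 2->3 edge survives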
Example #11
    def _collate_with_negative_sampling(self, items):
        """根据边id采样子图,并进行负采样

        :param items: tensor(B) 边id
        :return: tensor(N_src), DGLGraph, DGLGraph, List[DGLBlock] 知识图谱的输入顶点id,用户-物品图的边子图,负样本图,
        知识图谱根据边id关联的物品id采样的多层MFG
        """
        items = prepare_tensor(self.g_sampling, items, 'items')
        pair_graph = dgl.edge_subgraph(self.g, items, relabel_nodes=False)
        induced_edges = pair_graph.edata[dgl.EID]

        neg_srcdst = self.negative_sampler(self.g, items)
        neg_pair_graph = dgl.heterograph(
            {self.g.canonical_etypes[0]: neg_srcdst})

        pair_graph, neg_pair_graph = dgl.compact_graphs(
            [pair_graph, neg_pair_graph])
        pair_graph.edata[dgl.EID] = induced_edges
        seed_nodes = pair_graph.ndata[dgl.NID]

        blocks = self.block_sampler.sample_blocks(self.g_sampling,
                                                  seed_nodes['item'])
        input_nodes = blocks[0].srcdata[dgl.NID]
        return input_nodes, pair_graph, neg_pair_graph, blocks
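For reference, the negative_sampler used above follows DGL's built-in convention: called with (g, edge_ids), it returns a (neg_src, neg_dst) pair. A small sketch with the stock uniform sampler, assuming it matches what this collator expects:

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])))
neg_sampler = dgl.dataloading.negative_sampler.Uniform(2)  # 2 negatives per edge
neg_src, neg_dst = neg_sampler(g, torch.tensor([0, 1]))
print(neg_src.shape)  # torch.Size([4]) -- 2 edges x 2 negatives each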
Example #12
def test_compact(index_dtype):
    # Note: this test targets an older DGL API; edge-list heterograph construction
    # and the index_dtype argument were removed in DGL 0.5+.
    g1 = dgl.heterograph({
        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
        {'user': 20, 'game': 10}, index_dtype=index_dtype)

    g2 = dgl.heterograph({
        ('game', 'clicked-by', 'user'): [(3, 1)],
        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
        {'user': 20, 'game': 10}, index_dtype=index_dtype)

    g3 = dgl.graph([(0, 1), (1, 2)], num_nodes=10, ntype='user', index_dtype=index_dtype)
    g4 = dgl.graph([(1, 3), (3, 5)], num_nodes=10, ntype='user', index_dtype=index_dtype)

    def _check(g, new_g, induced_nodes):
        assert g.ntypes == new_g.ntypes
        assert g.canonical_etypes == new_g.canonical_etypes

        for ntype in g.ntypes:
            assert -1 not in induced_nodes[ntype]

        for etype in g.canonical_etypes:
            g_src, g_dst = g.all_edges(order='eid', etype=etype)
            g_src = F.asnumpy(g_src)
            g_dst = F.asnumpy(g_dst)
            new_g_src, new_g_dst = new_g.all_edges(order='eid', etype=etype)
            new_g_src_mapped = induced_nodes[etype[0]][F.asnumpy(new_g_src)]
            new_g_dst_mapped = induced_nodes[etype[2]][F.asnumpy(new_g_dst)]
            assert (g_src == new_g_src_mapped).all()
            assert (g_dst == new_g_dst_mapped).all()

    # Test default
    new_g1 = dgl.compact_graphs(g1)
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
    assert set(induced_nodes['game']) == set([4, 5, 6])
    _check(g1, new_g1, induced_nodes)

    # Test with always_preserve given a dict
    new_g1 = dgl.compact_graphs(
        g1, always_preserve={'game': F.tensor([4, 7], dtype=getattr(F, index_dtype))})
    assert new_g1._idtype_str == index_dtype
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
    assert set(induced_nodes['game']) == set([4, 5, 6, 7])
    _check(g1, new_g1, induced_nodes)

    # Test with always_preserve given a tensor
    new_g3 = dgl.compact_graphs(
        g3, always_preserve=F.tensor([1, 7], dtype=getattr(F, index_dtype)))
    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}

    assert new_g3._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([0, 1, 2, 7])
    _check(g3, new_g3, induced_nodes)

    # Test multiple graphs
    new_g1, new_g2 = dgl.compact_graphs([g1, g2])
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert new_g2._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
    assert set(induced_nodes['game']) == set([3, 4, 5, 6])
    _check(g1, new_g1, induced_nodes)
    _check(g2, new_g2, induced_nodes)

    # Test multiple graphs with always_preserve given a dict
    new_g1, new_g2 = dgl.compact_graphs(
        [g1, g2], always_preserve={'game': F.tensor([4, 7], dtype=getattr(F, index_dtype))})
    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
    assert new_g1._idtype_str == index_dtype
    assert new_g2._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
    assert set(induced_nodes['game']) == set([3, 4, 5, 6, 7])
    _check(g1, new_g1, induced_nodes)
    _check(g2, new_g2, induced_nodes)

    # Test multiple graphs with always_preserve given a tensor
    new_g3, new_g4 = dgl.compact_graphs(
        [g3, g4], always_preserve=F.tensor([1, 7], dtype=getattr(F, index_dtype)))
    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes}
    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}

    assert new_g3._idtype_str == index_dtype
    assert new_g4._idtype_str == index_dtype
    assert set(induced_nodes['user']) == set([0, 1, 2, 3, 5, 7])
    _check(g3, new_g3, induced_nodes)
    _check(g4, new_g4, induced_nodes)
Example #13
    def sample_blocks(self, seeds):
        """Sample subgraphs from the entire graph.

        The input ``seeds`` represents the edges to compute prediction for. The sampling
        algorithm works as follows:

          1. Get the head and tail nodes of the provided seed edges.
          2. For each head and tail node, extract the entire in-coming neighborhood.
          3. Copy the node features/embeddings from the full graph to the sampled subgraphs.
        """
        dataset = self.dataset
        enc_graph = self.enc_graph
        dec_graph = self.dec_graph
        edge_ids = th.stack(seeds)
        # generate frontiers for user and item
        possible_rating_values = dataset.possible_rating_values
        true_relation_ratings = self.truths[edge_ids]
        true_relation_labels = None if self.labels is None else self.labels[
            edge_ids]

        # 1. Get the head and tail nodes from both the decoder and encoder graphs.
        head_id, tail_id = dec_graph.find_edges(edge_ids)
        utype, _, vtype = enc_graph.canonical_etypes[0]
        subg = []
        true_rel_ratings = []
        true_rel_labels = []
        for possible_rating_value in possible_rating_values:
            idx_loc = (true_relation_ratings == possible_rating_value)
            head = head_id[idx_loc]
            tail = tail_id[idx_loc]
            true_rel_ratings.append(true_relation_ratings[idx_loc])
            if self.labels is not None:
                true_rel_labels.append(true_relation_labels[idx_loc])
            subg.append(
                dgl.bipartite((head, tail),
                              utype=utype,
                              etype=str(possible_rating_value),
                              vtype=vtype,
                              num_nodes=(enc_graph.number_of_nodes(utype),
                                         enc_graph.number_of_nodes(vtype))))
        # Convert the encoder subgraph to a more compact one by removing nodes that are
        # not covered by the seed edges.
        g = dgl.hetero_from_relations(subg)
        g = dgl.compact_graphs(g)

        # 2. For each head and tail node, extract the entire in-coming neighborhood.
        seed_nodes = {}
        for ntype in g.ntypes:
            seed_nodes[ntype] = g.nodes[ntype].data[dgl.NID]
        frontier = dgl.in_subgraph(enc_graph, seed_nodes)
        frontier = dgl.to_block(frontier, seed_nodes)

        # 3. Copy the node features/embeddings from the full graph to the sampled subgraphs.
        frontier.dstnodes['user'].data['ci'] = \
            enc_graph.nodes['user'].data['ci'][frontier.dstnodes['user'].data[dgl.NID]]
        frontier.srcnodes['movie'].data['cj'] = \
            enc_graph.nodes['movie'].data['cj'][frontier.srcnodes['movie'].data[dgl.NID]]
        frontier.srcnodes['user'].data['cj'] = \
            enc_graph.nodes['user'].data['cj'][frontier.srcnodes['user'].data[dgl.NID]]
        frontier.dstnodes['movie'].data['ci'] = \
            enc_graph.nodes['movie'].data['ci'][frontier.dstnodes['movie'].data[dgl.NID]]

        # handle features
        head_feat = frontier.srcnodes['user'].data[dgl.NID].long() \
                    if dataset.user_feature is None else \
                       dataset.user_feature[frontier.srcnodes['user'].data[dgl.NID]]
        tail_feat = frontier.srcnodes['movie'].data[dgl.NID].long()\
                    if dataset.movie_feature is None else \
                       dataset.movie_feature[frontier.srcnodes['movie'].data[dgl.NID]]

        true_rel_labels = None if self.labels is None else th.cat(
            true_rel_labels, dim=0)
        true_rel_ratings = th.cat(true_rel_ratings, dim=0)
        return (g, frontier, head_feat, tail_feat, true_rel_labels,
                true_rel_ratings)
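Note that dgl.bipartite and dgl.hetero_from_relations were removed in DGL 0.5; a rough modern equivalent of the per-rating subgraph construction above (same variable names assumed; an untested sketch, not the original author's code) would be:

data_dict = {
    (utype, str(v), vtype): (head_id[true_relation_ratings == v],
                             tail_id[true_relation_ratings == v])
    for v in possible_rating_values
}
g = dgl.heterograph(
    data_dict,
    num_nodes_dict={utype: enc_graph.number_of_nodes(utype),
                    vtype: enc_graph.number_of_nodes(vtype)})
g = dgl.compact_graphs(g)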
Example #14
g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])))
len_event = src.shape[0]

g.edata['label'] = label.repeat(2).squeeze()
g.edata['timestamp'] = timestamp.repeat(2).squeeze()
g.edata['feat'] = edge_feat.repeat(2, 1).squeeze()

print(g)
save_graphs(f"./data/{args.data}.bin", g)

if args.new_node_count:
    origin_num_edges = g.num_edges() // 2
    train_eid = torch.arange(0, int(0.7 * origin_num_edges))
    un_train_eid = torch.arange(int(0.7 * origin_num_edges), origin_num_edges)

    train_g = dgl.graph(g.find_edges(train_eid))
    val_n_test_g = dgl.compact_graphs(dgl.graph(g.find_edges(un_train_eid)))

    print(
        f'total nodes: {g.num_nodes()}, training nodes: {train_g.num_nodes()}, val_n_test nodes: {val_n_test_g.num_nodes()}'
    )
    old_nodes = val_n_test_g.num_nodes() - g.num_nodes() + train_g.num_nodes()
    print(
        f'old nodes in val_n_test: {old_nodes} ({round((old_nodes)*100/val_n_test_g.num_nodes(),4)}%)'
    )
    new_nodes = g.num_nodes() - train_g.num_nodes()
    print(
        f'new nodes in val_n_test: {new_nodes} ({round((new_nodes)*100/val_n_test_g.num_nodes(),4)}%)'
    )