Example #1
    def init_graph(self):
        r"""Get the initial attention matrix through the collaborative knowledge graph

        Returns:
            torch.sparse.FloatTensor: Sparse tensor of the attention matrix
        """
        import dgl
        adj_list = []
        for rel_type in range(1, self.n_relations, 1):
            edge_idxs = self.ckg.filter_edges(
                lambda edge: edge.data['relation_id'] == rel_type)
            sub_graph = dgl.edge_subgraph(self.ckg, edge_idxs, preserve_nodes=True). \
                adjacency_matrix(transpose=False, scipy_fmt='coo').astype('float')
            rowsum = np.array(sub_graph.sum(1))
            d_inv = np.power(rowsum, -1).flatten()
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)
            norm_adj = d_mat_inv.dot(sub_graph).tocoo()
            adj_list.append(norm_adj)

        final_adj_matrix = sum(adj_list).tocoo()
        indices = torch.LongTensor(
            [final_adj_matrix.row, final_adj_matrix.col])
        values = torch.FloatTensor(final_adj_matrix.data)
        adj_matrix_tensor = torch.sparse.FloatTensor(indices, values,
                                                     self.matrix_size)
        return adj_matrix_tensor.to(self.device)
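For context, the loop above builds one row-normalized adjacency matrix D^-1 * A per relation type and sums them into the final attention matrix. Below is a self-contained sketch of that normalization step on an illustrative 3x3 matrix (the values are made up):

import numpy as np
import scipy.sparse as sp
import torch

# Toy adjacency; the last row has no out-edges on purpose.
adj = sp.coo_matrix(np.array([[0., 1., 1.],
                              [1., 0., 0.],
                              [0., 0., 0.]]))
rowsum = np.array(adj.sum(1))
d_inv = np.power(rowsum, -1).flatten()
d_inv[np.isinf(d_inv)] = 0.                # zero-degree rows stay all-zero
norm_adj = sp.diags(d_inv).dot(adj).tocoo()

indices = torch.LongTensor([norm_adj.row, norm_adj.col])
values = torch.FloatTensor(norm_adj.data)
sparse_adj = torch.sparse_coo_tensor(indices, values, adj.shape)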
Example #2
def train(args):
    data = load_kg_dataset(args.dataset)
    g = data[0]
    train_idx = g.edata['train_mask'].nonzero(as_tuple=False).squeeze()
    val_idx = g.edata['val_mask'].nonzero(as_tuple=False).squeeze()
    test_idx = g.edata['test_mask'].nonzero(as_tuple=False).squeeze()

    train_g = dgl.edge_subgraph(g, train_idx, preserve_nodes=True)
    train_triplets = g.find_edges(train_idx) + (train_g.edata['etype'],)
    model = LinkPrediction(
        data.num_nodes, args.num_hidden, data.num_rels * 2, args.num_layers,
        args.regularizer, args.num_bases, args.dropout
    )
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    neg_sampler = Uniform(args.num_neg_samples)
    labels = torch.cat([torch.ones(train_g.num_edges()),
                        torch.zeros(train_g.num_edges() * args.num_neg_samples)])
    for epoch in range(args.epochs):
        model.train()
        embed = model(train_g, train_g.edata['etype'])

        neg_triplets = neg_sampler(train_g, torch.arange(train_g.num_edges())) \
            + (train_g.edata['etype'].repeat_interleave(args.num_neg_samples),)
        pos_score = model.calc_score(embed, train_triplets)
        neg_score = model.calc_score(embed, neg_triplets)
        loss = F.binary_cross_entropy_with_logits(torch.cat([pos_score, neg_score]), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # TODO: compute MRR
        # Backpropagation on FB-15k and FB15k-237 is very slow?
        print('Epoch {:04d} | Loss {:.4f}'.format(epoch, loss.item()))
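The Uniform sampler used above draws num_neg_samples negatives per positive edge by corrupting the destination node uniformly at random. A minimal sketch of its call convention on a toy graph (the assertion only illustrates the output shape):

import dgl
import torch
from dgl.dataloading.negative_sampler import Uniform

g = dgl.rand_graph(10, 30)                     # 10 nodes, 30 random edges
neg_sampler = Uniform(5)                       # 5 negatives per positive edge
neg_src, neg_dst = neg_sampler(g, torch.arange(g.num_edges()))
assert neg_src.shape[0] == g.num_edges() * 5   # 150 corrupted pairs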
Example #3
    def sampler_frontier(self,
                         block_id,
                         g,
                         seed_nodes,
                         timestamp):
        full_neighbor_subgraph = dgl.in_subgraph(g, seed_nodes)
        full_neighbor_subgraph = dgl.add_edges(full_neighbor_subgraph,
                                               seed_nodes, seed_nodes)

        temporal_edge_mask = (full_neighbor_subgraph.edata['timestamp'] < timestamp) | (
            full_neighbor_subgraph.edata['timestamp'] <= 0)
        temporal_subgraph = dgl.edge_subgraph(
            full_neighbor_subgraph, temporal_edge_mask)

        # Map subgraph node IDs back to the original IDs
        temp2origin = temporal_subgraph.ndata[dgl.NID]

        # The newly added self-loop edges (timestamp 0) are preserved by the mask above
        root2sub_dict = dict(
            zip(temp2origin.tolist(), temporal_subgraph.nodes().tolist()))
        temporal_subgraph.ndata[dgl.NID] = g.ndata[dgl.NID][temp2origin]
        seed_nodes = [root2sub_dict[int(n)] for n in seed_nodes]
        final_subgraph = self.sampler(g=temporal_subgraph, nodes=seed_nodes)
        final_subgraph = dgl.remove_self_loop(final_subgraph)
        return final_subgraph
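For reference, dgl.edge_subgraph also accepts a boolean mask over edges, which is how the temporal filter above keeps only edges observed before the query timestamp. A standalone sketch with made-up timestamps:

import dgl
import torch

g = dgl.rand_graph(5, 8)
g.edata['timestamp'] = torch.arange(1., 9.)   # illustrative timestamps 1..8
mask = g.edata['timestamp'] < 5.              # keep edges seen before t=5
sub = dgl.edge_subgraph(g, mask)              # relabels nodes by default
orig_nodes = sub.ndata[dgl.NID]               # map back to the parent graph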
Example #4
def track_time(graph_name, format, seed_edges_num):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format)
    graph = graph.to(device)

    seed_edges = np.random.randint(0, graph.num_edges(), seed_edges_num)

    # dry run
    for i in range(3):
        dgl.edge_subgraph(graph, seed_edges)

    # timing
    with utils.Timer() as t:
        for i in range(3):
            dgl.edge_subgraph(graph, seed_edges)

    return t.elapsed_secs / 3
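utils.get_bench_device, utils.get_graph, and utils.Timer belong to the surrounding benchmark harness; a dependency-free sketch of the same dry-run-then-average pattern using time.perf_counter:

import time
import dgl
import torch

g = dgl.rand_graph(1000, 10000)
seed_edges = torch.randint(0, g.num_edges(), (500,))

for _ in range(3):        # dry run to warm up caches and kernels
    dgl.edge_subgraph(g, seed_edges)

start = time.perf_counter()
for _ in range(3):
    dgl.edge_subgraph(g, seed_edges)
elapsed = (time.perf_counter() - start) / 3
print('{:.6f} s per call'.format(elapsed))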
Example #5
def test_set_batch_info(idtype):
    ctx = F.ctx()

    g1 = dgl.rand_graph(30, 100).astype(idtype).to(ctx)
    g2 = dgl.rand_graph(40, 200).astype(idtype).to(ctx)
    bg = dgl.batch([g1, g2])
    batch_num_nodes = F.astype(bg.batch_num_nodes(), idtype)
    batch_num_edges = F.astype(bg.batch_num_edges(), idtype)

    # test homogeneous node subgraph
    sg_n = dgl.node_subgraph(bg, list(range(10, 20)) + list(range(50, 60)))
    induced_nodes = sg_n.ndata['_ID']
    induced_edges = sg_n.edata['_ID']
    new_batch_num_nodes = _get_subgraph_batch_info(bg.ntypes, [induced_nodes],
                                                   batch_num_nodes)
    new_batch_num_edges = _get_subgraph_batch_info(bg.canonical_etypes,
                                                   [induced_edges],
                                                   batch_num_edges)
    sg_n.set_batch_num_nodes(new_batch_num_nodes)
    sg_n.set_batch_num_edges(new_batch_num_edges)
    subg_n1, subg_n2 = dgl.unbatch(sg_n)
    subg1 = dgl.node_subgraph(g1, list(range(10, 20)))
    subg2 = dgl.node_subgraph(g2, list(range(20, 30)))
    assert subg_n1.num_edges() == subg1.num_edges()
    assert subg_n2.num_edges() == subg2.num_edges()

    # test homogeneous edge subgraph
    sg_e = dgl.edge_subgraph(bg,
                             list(range(40, 70)) + list(range(150, 200)),
                             preserve_nodes=True)
    induced_nodes = sg_e.ndata['_ID']
    induced_edges = sg_e.edata['_ID']
    new_batch_num_nodes = _get_subgraph_batch_info(bg.ntypes, [induced_nodes],
                                                   batch_num_nodes)
    new_batch_num_edges = _get_subgraph_batch_info(bg.canonical_etypes,
                                                   [induced_edges],
                                                   batch_num_edges)
    sg_e.set_batch_num_nodes(new_batch_num_nodes)
    sg_e.set_batch_num_edges(new_batch_num_edges)
    subg_e1, subg_e2 = dgl.unbatch(sg_e)
    subg1 = dgl.edge_subgraph(g1, list(range(40, 70)), preserve_nodes=True)
    subg2 = dgl.edge_subgraph(g2, list(range(50, 100)), preserve_nodes=True)
    assert subg_e1.num_nodes() == subg1.num_nodes()
    assert subg_e2.num_nodes() == subg2.num_nodes()
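_get_subgraph_batch_info is a DGL-internal helper; the point of the test is that a subgraph of a batched graph loses its batch partition until set_batch_num_nodes / set_batch_num_edges restore it, after which dgl.unbatch works again. A sketch that computes the per-graph counts by hand instead of via the internal helper:

import dgl
import torch

g1 = dgl.rand_graph(30, 100)
g2 = dgl.rand_graph(40, 200)
bg = dgl.batch([g1, g2])

sg = dgl.node_subgraph(bg, list(range(10, 20)) + list(range(50, 60)))
# Nodes 10-19 come from g1 (IDs < 30), nodes 50-59 from g2.
sg.set_batch_num_nodes(torch.tensor([10, 10]))
# Induced edges whose original ID is >= g1.num_edges() came from g2.
n_e2 = int((sg.edata[dgl.EID] >= g1.num_edges()).sum())
sg.set_batch_num_edges(torch.tensor([sg.num_edges() - n_e2, n_e2]))
sub1, sub2 = dgl.unbatch(sg)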
Example #6
def drop_edge(graph, drop_prob):
    """Randomly drop edges: keep each edge independently with probability 1 - drop_prob."""
    E = graph.num_edges()

    mask_rates = th.FloatTensor(np.ones(E) * drop_prob)
    masks = th.bernoulli(1 - mask_rates)        # 1 = keep, 0 = drop
    edge_idx = masks.nonzero().squeeze(1)       # IDs of the surviving edges

    sg = dgl.edge_subgraph(graph, edge_idx, relabel_nodes=False)

    return sg
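A possible usage sketch for the drop_edge above: two independently perturbed views of the same graph, as in contrastive augmentation pipelines (the 0.2 drop probability is an arbitrary illustration):

import dgl

g = dgl.rand_graph(100, 500)
view1 = drop_edge(g, drop_prob=0.2)
view2 = drop_edge(g, drop_prob=0.2)
print(view1.num_edges(), view2.num_edges())   # roughly 400 edges each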
Example #7
    def sample_fraud_event(self, g, bs, current_ts):
        idx = (self.ts < current_ts)
        num_fraud = idx.sum().item()

        if num_fraud > bs:
            # Sample only a subset of the fraud events: randomly drop
            # (num_fraud - bs) of the currently selected event positions.
            true_pos = idx.nonzero(as_tuple=True)[0]
            drop = true_pos[random.sample(range(num_fraud), num_fraud - bs)]
            idx[drop] = False

        fraud_eid = self.fraud_eid[idx]

        fraud_graph = dgl.edge_subgraph(g, fraud_eid)
        return fraud_graph
Example #8
    def _collate(self, items):
        """根据边id采样子图

        :param items: tensor(B) 边id
        :return: tensor(N_src), DGLGraph, List[DGLBlock] 知识图谱的输入顶点id,用户-物品图的边子图,
        知识图谱根据边id关联的物品id采样的多层MFG
        """
        items = prepare_tensor(self.g_sampling, items, 'items')
        pair_graph = dgl.edge_subgraph(self.g, items)
        seed_nodes = pair_graph.ndata[dgl.NID]
        blocks = self.block_sampler.sample_blocks(self.g_sampling,
                                                  seed_nodes['item'])
        input_nodes = blocks[0].srcdata[dgl.NID]
        return input_nodes, pair_graph, blocks
Example #9
    def _create_subgraph(self, node_idx):
        """Get all nodes that contribute to the computation of a node's embedding."""
        if node_idx is None:  # graph classification: explain the whole graph
            sub_g = copy.deepcopy(self.g)
            sub_g.ndata[ExplainerTags.ORIGINAL_ID] = torch.arange(
                self.g.num_nodes(), dtype=torch.int)
        else:
            nodes = torch.tensor([node_idx])
            eid_list = []
            for _ in range(self.num_hops):
                predecessors, _, eid = self.g.in_edges(nodes, form='all')
                eid_list.append(eid)
                predecessors = torch.flatten(predecessors).unique()
                nodes = torch.unique(torch.cat([nodes, predecessors]))
            eid_list = torch.unique(torch.cat(eid_list))
            sub_g = dgl.edge_subgraph(self.g, eid_list)  # TODO - handle heterogeneous graphs
            sub_g.ndata[ExplainerTags.ORIGINAL_ID] = sub_g.ndata[dgl.NID]
        return sub_g
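In DGL >= 0.9 the hop-by-hop loop above can be collapsed into a single call, assuming a homogeneous graph; a sketch of the equivalent extraction:

import dgl
import torch

g = dgl.rand_graph(50, 200)
# Receptive field of a 2-layer message-passing model around node 3.
sub_g, inverse_indices = dgl.khop_in_subgraph(g, 3, k=2)
original_ids = sub_g.ndata[dgl.NID]   # same mapping the explainer stores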
Example #10
    def sample_frontier(self, block_id, g, seed_nodes, *args, **kwargs):
        with g.local_scope():
            new_edges_masks = {}
            for etype in g.canonical_etypes:
                edge_mask = th.zeros(g.number_of_edges(etype))
                # seed_nodes holds plain node IDs here because there is a single node type
                for node in seed_nodes:
                    edges = g.in_edges(node, form='eid', etype=etype)
                    num_neigh = th.ceil(g.in_degrees(node, etype=etype) * self.p[block_id][etype]).int().item()
                    neigh_dist = self.dists[block_id][etype][edges]
                    if neigh_dist.shape[0] > num_neigh:
                        neigh_index = np.argpartition(neigh_dist.cpu().detach(), num_neigh)[:num_neigh]
                    else:
                        neigh_index = np.arange(neigh_dist.shape[0])  # keep all available edges
                    edge_mask[edges[neigh_index]] = 1
                new_edges_masks[etype] = edge_mask.bool()

            return dgl.edge_subgraph(g, new_edges_masks, relabel_nodes=False)
Example #11
    def _collate_with_negative_sampling(self, items):
        """根据边id采样子图,并进行负采样

        :param items: tensor(B) 边id
        :return: tensor(N_src), DGLGraph, DGLGraph, List[DGLBlock] 知识图谱的输入顶点id,用户-物品图的边子图,负样本图,
        知识图谱根据边id关联的物品id采样的多层MFG
        """
        items = prepare_tensor(self.g_sampling, items, 'items')
        pair_graph = dgl.edge_subgraph(self.g, items, relabel_nodes=False)
        induced_edges = pair_graph.edata[dgl.EID]

        neg_srcdst = self.negative_sampler(self.g, items)
        neg_pair_graph = dgl.heterograph(
            {self.g.canonical_etypes[0]: neg_srcdst})

        pair_graph, neg_pair_graph = dgl.compact_graphs(
            [pair_graph, neg_pair_graph])
        pair_graph.edata[dgl.EID] = induced_edges
        seed_nodes = pair_graph.ndata[dgl.NID]

        blocks = self.block_sampler.sample_blocks(self.g_sampling,
                                                  seed_nodes['item'])
        input_nodes = blocks[0].srcdata[dgl.NID]
        return input_nodes, pair_graph, neg_pair_graph, blocks
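dgl.compact_graphs above drops nodes that have no edge in either the positive or the negative pair graph while keeping the node numbering of the two graphs aligned. A toy illustration:

import dgl
import torch

g_pos = dgl.graph((torch.tensor([0, 2]), torch.tensor([1, 3])), num_nodes=10)
g_neg = dgl.graph((torch.tensor([0, 2]), torch.tensor([5, 6])), num_nodes=10)
cg_pos, cg_neg = dgl.compact_graphs([g_pos, g_neg])
# Both share the compacted node set {0, 1, 2, 3, 5, 6}.
assert cg_pos.num_nodes() == cg_neg.num_nodes() == 6
print(cg_pos.ndata[dgl.NID])   # original node IDs, identical for both graphs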
Example #12
File: train.py  Project: yuk12/dgl
def dgl_main():
    # Load from DGL dataset
    if args.dataset == 'cora':
        dataset = CoraGraphDataset(reverse_edge=False)
    elif args.dataset == 'citeseer':
        dataset = CiteseerGraphDataset(reverse_edge=False)
    elif args.dataset == 'pubmed':
        dataset = PubmedGraphDataset(reverse_edge=False)
    else:
        raise NotImplementedError
    graph = dataset[0]

    # Extract node features
    feats = graph.ndata.pop('feat').to(device)
    in_dim = feats.shape[-1]

    # generate input
    adj_orig = graph.adjacency_matrix().to_dense()

    # build test set with 10% positive links
    train_edge_idx, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges_dgl(
        graph, adj_orig)

    graph = graph.to(device)

    # create train graph
    train_edge_idx = torch.tensor(train_edge_idx).to(device)
    train_graph = dgl.edge_subgraph(graph, train_edge_idx, relabel_nodes=False)
    train_graph = train_graph.to(device)
    adj = train_graph.adjacency_matrix().to_dense().to(device)

    # compute loss parameters
    weight_tensor, norm = compute_loss_para(adj)

    # create model
    vgae_model = model.VGAEModel(in_dim, args.hidden1, args.hidden2)
    vgae_model = vgae_model.to(device)

    # create training component
    optimizer = torch.optim.Adam(vgae_model.parameters(),
                                 lr=args.learning_rate)
    print('Total Parameters:',
          sum([p.nelement() for p in vgae_model.parameters()]))

    # create training epoch
    for epoch in range(args.epochs):
        t = time.time()

        # Training and validation using a full graph
        vgae_model.train()

        logits = vgae_model(graph, feats)

        # compute loss
        loss = norm * F.binary_cross_entropy(
            logits.view(-1), adj.view(-1), weight=weight_tensor)
        kl_divergence = 0.5 / logits.size(0) * (
            1 + 2 * vgae_model.log_std - vgae_model.mean**2 -
            torch.exp(vgae_model.log_std)**2).sum(1).mean()
        loss -= kl_divergence

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc = get_acc(logits, adj)

        val_roc, val_ap = get_scores(val_edges, val_edges_false, logits)

        # Print out performance
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(loss.item()), "train_acc=",
              "{:.5f}".format(train_acc), "val_roc=", "{:.5f}".format(val_roc),
              "val_ap=", "{:.5f}".format(val_ap), "time=",
              "{:.5f}".format(time.time() - t))

    test_roc, test_ap = get_scores(test_edges, test_edges_false, logits)
    # roc_means.append(test_roc)
    # ap_means.append(test_ap)
    print("End of training!", "test_roc=", "{:.5f}".format(test_roc),
          "test_ap=", "{:.5f}".format(test_ap))