Example #1
def test_partial_edge_softmax():
    g = dgl.DGLGraph()
    g.add_nodes(30)
    # build a complete graph
    for i in range(30):
        for j in range(30):
            g.add_edge(i, j)

    score = F.randn((300, 1))
    score.requires_grad_()
    grad = F.randn((300, 1))
    import numpy as np
    eids = np.random.choice(900, 300, replace=False).astype('int64')
    eids = F.zerocopy_from_numpy(eids)
    # compute partial edge softmax
    y_1 = nn.edge_softmax(g, score, eids)
    y_1.backward(grad)
    grad_1 = score.grad.clone()  # clone: score.grad is zeroed below
    score.grad.zero_()
    # compute edge softmax on edge subgraph
    subg = g.edge_subgraph(eids)
    y_2 = nn.edge_softmax(subg, score)
    y_2.backward(grad)
    grad_2 = score.grad.clone()
    score.grad.zero_()

    assert F.allclose(y_1, y_2)
    assert F.allclose(grad_1, grad_2)
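For reference, a minimal hedged sketch of what the partial variant is expected to compute: a softmax of the selected scores, grouped by each chosen edge's destination node (the helper below is ours, not DGL's):

import torch

def manual_partial_edge_softmax(g, score, eids):
    # softmax of `score` over the chosen edges, grouped by destination node
    _, dst = g.find_edges(eids)
    s = score.detach()
    out = torch.empty_like(s)
    for v in dst.unique():
        mask = (dst == v)
        out[mask] = torch.softmax(s[mask], dim=0)
    return out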
Example #2
File: test_nn.py Project: zwcdp/dgl
def test_edge_softmax():
    # Basic
    g = dgl.DGLGraph(nx.path_graph(3))
    edata = F.ones((g.number_of_edges(), 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = F.ones((g.number_of_edges(), 3, 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test both forward and backward with PyTorch built-in softmax.
    g = dgl.DGLGraph()
    g.add_nodes(30)
    # build a complete graph
    for i in range(30):
        for j in range(30):
            g.add_edge(i, j)

    score = F.randn((900, 1))
    score.requires_grad_()
    grad = F.randn((900, 1))
    y = F.softmax(score.view(30, 30), dim=0).view(-1, 1)
    y.backward(grad)
    grad_score = score.grad.clone()  # clone before zeroing so the later check is meaningful
    score.grad.zero_()
    y_dgl = nn.edge_softmax(g, score)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    # check forward
    assert F.allclose(y_dgl, y)
    y_dgl.backward(grad)
    # checkout gradient
    assert F.allclose(score.grad, grad_score)
    print(score.grad[:10], grad_score[:10])
    
    # Test 2
    def generate_rand_graph(n):
        arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
        return dgl.DGLGraph(arr, readonly=True)
    
    g = generate_rand_graph(50)
    a1 = F.randn((g.number_of_edges(), 1)).requires_grad_()
    a2 = a1.clone().detach().requires_grad_()
    g.edata['s'] = a1
    g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)})
    g.edata['ss'].sum().backward()
    
    builtin_sm = nn.edge_softmax(g, a2)
    builtin_sm.sum().backward()
    print(a1.grad - a2.grad)
    assert len(g.ndata) == 0
    assert len(g.edata) == 2
    assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend
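The `uniform_attention` helper these asserts compare against is not part of the excerpt; a minimal sketch of what it presumably returns (each edge weighted by 1 / in-degree of its destination node, broadcast to the requested shape):

import torch as th

def uniform_attention(g, shape):
    # one weight per edge: 1 / in-degree of the edge's destination node
    a = th.ones(shape)
    target_shape = (g.number_of_edges(),) + (1,) * (len(shape) - 1)
    return a / g.in_degrees(g.edges()[1]).view(target_shape).float()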
Example #3
def test_edge_softmax():
    # Basic
    g = dgl.DGLGraph(nx.path_graph(3))
    edata = th.ones(g.number_of_edges(), 1)
    a = nn.edge_softmax(g, edata)
    assert th.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = th.ones(g.number_of_edges(), 3, 1)
    a = nn.edge_softmax(g, edata)
    assert th.allclose(a, uniform_attention(g, a.shape))
Example #4
    def forward(self, graph, feat):
        graph = graph.local_var()
        h = self.feat_drop(feat)
        feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
        el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.ndata.update({"ft": feat, "el": el, "er": er})
        # compute edge attention
        graph.apply_edges(fn.u_add_v("el", "er", "e"))
        # apply leaky relu
        graph.apply_edges(self.relu_udf)

        # compute softmax/sparsemax
        if self.sparsemax:
            graph.apply_edges(self.sparsemax_udf)
        else:
            graph.edata["a"] = edge_softmax(graph, graph.edata.pop("e"))

        # attention dropout
        graph.apply_edges(self.attn_drop_udf)

        # message passing
        graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
        rst = graph.ndata["ft"]
        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
            rst = rst + resval
        # activation
        if self.activation:
            rst = self.activation(rst)
        return rst
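The `relu_udf`, `sparsemax_udf` and `attn_drop_udf` callbacks are bound methods that the excerpt does not show. In DGL, an edge UDF receives a batch of edges and returns a dict of new edge fields; hedged sketches of the two simpler ones, written as free functions with an assumed dropout rate:

import torch.nn.functional as F

def relu_udf(edges):
    # leaky relu on the raw attention logits stored in edge field 'e'
    return {"e": F.leaky_relu(edges.data["e"], negative_slope=0.2)}

def attn_drop_udf(edges, p=0.1):
    # dropout on the normalized attention weights 'a' (rate p is an assumption)
    return {"a": F.dropout(edges.data["a"], p=p)}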
Example #5
 def forward(self, graph, feat):
     graph = graph.local_var()
     feat_c = feat.clone().detach().requires_grad_(False)
     q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
     q = q.view(-1, self._num_heads, self._out_feats)
     k = k.view(-1, self._num_heads, self._out_feats)
     v = v.view(-1, self._num_heads, self._out_feats)
     graph.ndata.update({
         "ft": v,
         "el": k,
         "er": q
     })  # k,q instead of q,k, the edge_softmax is applied on incoming edges
     # compute edge attention
     graph.apply_edges(fn_u_dot_v("el", "er", "e"))
     e = graph.edata.pop("e") / math.sqrt(self._out_feats * self._num_heads)
     graph.edata["a"] = edge_softmax(graph, e).unsqueeze(-1)
     # message passing
     graph.update_all(fn_u_mul_e("ft", "a", "m"), fn_sum("m", "ft2"))
     rst = graph.ndata["ft2"]
     # residual
     rst = rst.view(feat.shape) + feat
     if self._trans:
         rst = self.ln1(rst)
         rst = self.ln1(rst + self.FFN(rst))
         # use the same layer norm
     return rst
Example #6
    def forward(self, g, features):
        h_pre = features
        g = g.local_var()
        g.ndata['h'] = features

        g.ndata['norm_h'] = F.normalize(features, p=2, dim=-1)
        g.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        cos = g.edata.pop('cos')
        e = self.beta * cos
        if self.graph_cut > 0:
            k = int(e.size()[0] * self.graph_cut)
            _, indices = e.topk(k, largest=False, sorted=False)
            e[indices] = 0

        g.edata['p'] = edge_softmax(g, e)

        g.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
        h = g.ndata['h']
        if self.project:
            h = self.linear(h)
        if self.activation:
            h = self.activation(h)
        if self.residual:
            h = h + self.res_fc(h_pre)
        h = self.dropout(h)
        return h
Example #7
 def forward(self, graph, feat):
     graph = graph.local_var()
     feat_c = feat.clone().detach().requires_grad_(False)
     q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
     q = q.view(-1, self._num_heads, self._out_feats)
     k = k.view(-1, self._num_heads, self._out_feats)
     v = v.view(-1, self._num_heads, self._out_feats)
     graph.ndata.update({
         'ft': v,
         'el': k,
         'er': q
     })  # k,q instead of q,k, the edge_softmax is applied on incoming edges
     # compute edge attention
     graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
     e = graph.edata.pop('e') / math.sqrt(self._out_feats * self._num_heads)
     graph.edata['a'] = edge_softmax(graph, e).unsqueeze(-1)
     # message passing
     graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft2'))
     rst = graph.ndata['ft2']
     # residual
     rst = rst.view(feat.shape) + feat
     if self._trans:
         rst = self.ln1(rst)
         rst = self.ln1(rst + self.FFN(rst))
         # use the same layer norm, see the author's code
     return rst
Example #8
 def forward(self, graph, feat, device):
     graph = graph.to(device).local_var()
     feat_c = feat.clone().detach().requires_grad_(False)
     q, k, v = self.query_proj(feat), self.key_proj(
         feat_c), self.value_proj(feat_c)
     q = q.view(-1, self.num_heads, self.embedding_size // self.num_heads)
     k = k.view(-1, self.num_heads, self.embedding_size // self.num_heads)
     v = v.view(-1, self.num_heads, self.embedding_size // self.num_heads)
     graph.ndata.update({
         'ft': v,
         'el': k,
         'er': q
     })  # k,q instead of q,k, the edge_softmax is applied on incoming edges
     # compute edge attention
     graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
     e = graph.edata.pop('e') / math.sqrt(self.embedding_size)
     graph.edata['a'] = edge_softmax(graph, e)
     # message passing
     graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft2'))
     rst = graph.ndata['ft2']
     # residual
     rst = rst.view(feat.shape) + feat
     rst = self.ln1(rst)
     rst = self.ln1(rst + self.out_proj(rst))
     return rst
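Examples 5, 7 and 8 are variants of the same pattern: scaled dot-product attention restricted to the edges of the graph. A condensed, hedged sketch of the core computation using DGL built-ins (the edge_softmax import path differs across DGL versions):

import math
import dgl.function as fn
from dgl.nn.functional import edge_softmax  # older DGL: from dgl.nn.pytorch import edge_softmax

def sparse_dot_attention(g, q, k, v):
    # q, k, v: (N, num_heads, head_feats); scores are computed only on existing edges
    with g.local_scope():
        g.ndata.update({"q": q, "k": k, "v": v})
        g.apply_edges(fn.u_dot_v("k", "q", "e"))        # (E, num_heads, 1)
        e = g.edata.pop("e") / math.sqrt(q.shape[-1])
        g.edata["a"] = edge_softmax(g, e)               # normalized over each node's in-edges
        g.update_all(fn.u_mul_e("v", "a", "m"), fn.sum("m", "o"))
        return g.ndata["o"]                             # (N, num_heads, head_feats)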
Example #9
    def forward(self,
                g,
                h,
                logits,
                old_z,
                attn_l,
                attn_r,
                *,
                shared_tau=True,
                tau1=None,
                tau2=None):
        with g.local_scope():
            h = self.dropout(h)

            if self.fc is not None:
                feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
            else:
                feat = h
            g.ndata["h"] = feat  # (n_node, n_feat)
            g.ndata["logits"] = logits

            degs = g.in_degrees().float().clamp(min=1)
            norm = torch.pow(degs, -0.5).to(feat.device).unsqueeze(1)
            g.ndata["degree"] = degs

            el = (feat * attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat * attn_r).sum(dim=-1).unsqueeze(-1)
            g.ndata.update({"ft": feat, "el": el, "er": er})
            # compute edge attention
            g.apply_edges(fn.u_add_v("el", "er", "e"))
            e = self.leaky_relu(g.edata.pop("e"))
            # compute softmax
            g.edata["a"] = self.dropout(edge_softmax(g, e))

            g.update_all(
                message_func=adaptive_attn_message_func,
                reduce_func=adaptive_attn_reduce_func,
            )
            f1 = g.ndata.pop("f1")
            f2 = g.ndata.pop("f2")
            norm_f1 = self.ln1(f1)
            norm_f2 = self.ln2(f2)
            if shared_tau:
                z = torch.sigmoid((-1) * (norm_f1 - tau1)) * torch.sigmoid(
                    (-1) * (norm_f2 - tau2))
            else:
                # tau for each layer
                z = torch.sigmoid(
                    (-1) * (norm_f1 - self.tau1)) * torch.sigmoid(
                        (-1) * (norm_f2 - self.tau2))

            gate = torch.min(old_z, z)

            agg = g.ndata.pop("agg")
            normagg = agg * norm.unsqueeze(1)  # normalization by tgt degree

            if self.activation:
                normagg = self.activation(normagg)
            new_h = feat + gate.unsqueeze(2) * normagg
            return new_h, z
Example #10
    def forward(self, graph):
        node_num = graph.ndata['h'].size(0)

        Q = self.query(graph.ndata['h'])
        K = self.key(graph.ndata['h'])
        V = self.value(graph.ndata['h'])

        Q = self.transpose_for_scores(Q)
        K = self.transpose_for_scores(K)
        V = self.transpose_for_scores(V)

        graph.ndata['Q'] = Q
        graph.ndata['K'] = K
        graph.ndata['V'] = V

        graph.apply_edges(fn.u_mul_v('K', 'Q', 'attn_probs'))
        graph.edata['attn_probs'] = graph.edata['attn_probs'].sum(-1,
                                                                  keepdim=True)
        graph.edata['attn_probs'] = edge_softmax(graph,
                                                 graph.edata['attn_probs'])
        graph.edata['attn_probs'] = self.dropout(graph.edata['attn_probs'])
        graph.apply_edges(fn.u_mul_e('V', 'attn_probs', 'attn_values'))

        graph.register_message_func(fn.copy_e('attn_values', 'm'))
        graph.register_reduce_func(fn.sum('m', 'h'))
        graph.update_all()
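        # note: register_message_func/register_reduce_func + update_all() is the older
        # DGL API; newer DGL passes both functions directly to update_all().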
        graph.ndata['h'] = graph.ndata['h'].view([node_num, -1])

        return graph
Example #11
    def forward(self, g, edge_logits, node_feats):
        """Update node representations.

        Parameters
        ----------
        g : DGLGraph
            DGLGraph for a batch of graphs
        edge_logits : float32 tensor of shape (E, 1)
            The edge logits based on which softmax will be performed for weighting
            edges within 1-hop neighborhoods. E represents the number of edges.
        node_feats : float32 tensor of shape (V, node_feat_size)
            Previous node features. V represents the number of nodes.

        Returns
        -------
        float32 tensor of shape (V, node_feat_size)
            Updated node features.
        """
        g = g.local_var()
        g.edata['a'] = edge_softmax(g, edge_logits)
        g.ndata['hv'] = self.project_node(node_feats)

        g.update_all(fn.src_mul_edge('hv', 'a', 'm'), fn.sum('m', 'c'))
        context = F.elu(g.ndata['c'])
        return F.relu(self.gru(context, node_feats))
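A hypothetical usage sketch for the layer above, only to make the expected shapes concrete; the constructor name `AttentiveLayer` and its signature are assumptions, not the project's API:

import dgl
import torch

g = dgl.rand_graph(6, 20)                    # 6 nodes, 20 edges
node_feats = torch.randn(6, 32)              # (V, node_feat_size)
edge_logits = torch.randn(20, 1)             # (E, 1)
layer = AttentiveLayer(node_feat_size=32)    # hypothetical constructor
new_node_feats = layer(g, edge_logits, node_feats)   # -> (6, 32)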
Example #12
    def forward(self, g, h):
        g = g.local_var()
        if not self.use_pp or not self.training:
            norm = self.get_norm(g)

            # g.ndata['h'] = h
            # g.update_all(fn.copy_src(src='h', out='m'),
            #              fn.sum(msg='m', out='h'))
            # ah = g.ndata.pop('h')

            if self._aggre_type == 'mean':
                g.ndata['h'] = h
                g.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'h'))
                ah = g.ndata.pop('h')
            elif self._aggre_type == 'gcn':
                g.ndata['h'] = h
                g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
                # divide in_degrees
                # degs = graph.in_degrees().float()
                # degs = degs.to(feat.device)
                # h_neigh = (graph.ndata['neigh'] + graph.ndata['h']) / (degs.unsqueeze(-1) + 1)
                ah = g.ndata.pop('h')
                ah = ah * norm
            elif self._aggre_type == 'pool':
                g.ndata['h'] = F.relu(self.fc_pool(h))
                g.update_all(fn.copy_src('h', 'm'), fn.max('m', 'h'))
                ah = g.ndata['h']
            elif self._aggre_type == 'lstm':
                g.ndata['h'] = h
                g.update_all(fn.copy_src('h', 'm'), self._lstm_reducer)
                ah = g.ndata['h']
            elif self._aggre_type == 'attn':
                feat = self.fc_attn(h).view(-1, self.num_heads, self._in_feats)
                el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
                er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
                g.ndata.update({'ft': feat, 'el': el, 'er': er})
                g.apply_edges(fn.u_add_v('el', 'er', 'e'))
                e = self.leaky_relu(g.edata.pop('e'))
                g.edata['a'] = edge_softmax(g, e)
                g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
                ah = g.ndata['ft']
                ah = ah.squeeze(1)
            else:
                raise KeyError('Aggregator type {} not recognized.'.format(
                    self._aggre_type))

            h = self.concat(h, ah, norm)
        if self.dropout:
            h = self.dropout(h)
        # GraphSAGE GCN does not require fc_self.
        # if self._aggre_type == 'gcn':
        #     rst = self.fc_neigh(ah)
        # else:
        #     rst = self.fc_self(h) + self.fc_neigh(ah)
        h = self.linear(h)
        h = self.lynorm(h)
        if self.activation:
            h = self.activation(h)
        return h
Example #13
    def forward(self,
                g,
                h,
                logits,
                old_z,
                attn_l,
                attn_r,
                shared_tau=True,
                tau_1=None,
                tau_2=None):
        g = g.local_var()
        if self.feat_drop:
            h = self.feat_drop(h)

        if hasattr(self, 'fc'):
            feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
        else:
            feat = h
        g.ndata['h'] = feat  # (n_node, n_feat)
        g.ndata['logits'] = logits

        el = (feat * attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat * attn_r).sum(dim=-1).unsqueeze(-1)
        g.ndata.update({'ft': feat, 'el': el, 'er': er})
        # compute edge attention
        g.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(g.edata.pop('e'))
        # compute softmax
        g.edata['a'] = self.attn_drop(edge_softmax(g, e))

        g.update_all(message_func=adaptive_attn_message_func,
                     reduce_func=adaptive_attn_reduce_func)
        f1 = g.ndata.pop('f1')
        f2 = g.ndata.pop('f2')
        norm_f1 = self.ln_1(f1)
        norm_f2 = self.ln_2(f2)
        if shared_tau:
            z = F.sigmoid((-1) * (norm_f1 - tau_1)) * F.sigmoid(
                (-1) * (norm_f2 - tau_2))
        else:
            # tau for each layer
            z = F.sigmoid((-1) * (norm_f1 - self.tau_1)) * F.sigmoid(
                (-1) * (norm_f2 - self.tau_2))

        gate = torch.min(old_z, z)

        agg = g.ndata.pop('agg')
        normagg = agg * g.ndata['norm'].unsqueeze(
            1)  # normalization by tgt degree

        if self.activation:
            normagg = self.activation(normagg)
        new_h = feat + gate.unsqueeze(2) * normagg
        return new_h, z
Example #14
    def propagate_attention(self, g):
        '''Copied from gqp.'''

        g.apply_edges(fn.u_mul_v('q', 'k', 'e'))
        e = (g.edata['e'].sum(dim=-1, keepdim=True)) / (self.dk**0.5)

        g.edata['e'] = self.attn_drop(edge_softmax(g, e))

        g.update_all(fn.u_mul_e('v', 'e', 'e'), fn.sum('e', 'v'))
Example #15
 def forward(self, g):
     alpha_prime = self.leaky_relu(self.attn(g.edata[self.attn_key]))
     # Magic part is multiplying attention weights with the edge embedding
     g.edata['a'] = dglnn.edge_softmax(
         g, alpha_prime) * g.edata['emb'].view(g.edata['emb'].shape[0],
                                               self.n_heads, -1)
     attn_emb = g.ndata[self.msg_key]
     if attn_emb.ndimension() == 2:
         g.ndata[self.msg_key] = attn_emb.view(g.number_of_nodes(),
                                               self.n_heads, -1)
     g.update_all(fn.src_mul_edge(self.msg_key, 'a', 'm'),
                  fn.sum('m', 'emb'))
     return GraphLambda(lambda x: x.view(x.shape[0], -1))(g)
Example #16
    def forward(self, graph, features):
        g = graph.local_var()

        if self.attention:
            g.ndata['h'] = features
            g.apply_edges(fn.u_sub_v('h', 'h', 'l1'))
            l1 = g.edata.pop('l1')
            # l1 = -th.norm(l1, p=1, dim=1)
            # g.edata['att'] = edge_softmax(g, l1)
            l1 = 1 / (th.norm(l1, p=2, dim=1) + 1e-7)
            g.edata['att'] = edge_softmax(g, l1)
        else:
            if self.graph_norm:
                degs = g.in_degrees().float()
                norm = th.pow(degs + 1, -0.5)
                norm = norm.to(features.device).unsqueeze(1)
            g.edata['att'] = th.ones(g.number_of_edges(),
                                     1).to(features.device)

        h_last = features
        h = self.dropout(features)
        h = self.linear(h)

        h_pre = h
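        # note: `norm` below is only computed on the non-attention path above
        # (self.attention is False and self.graph_norm is True)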
        ri = h * norm * norm

        for _ in range(self.k):

            if self.attention is False:
                if self.graph_norm:
                    h = h * norm

            g.ndata['h'] = h

            g.update_all(fn.u_mul_e('h', 'att', 'm'), fn.sum('m', 'h'))

            h = g.ndata.pop('h')

            if self.attention is False:
                if self.graph_norm:
                    h = h * norm

            h = self.alpha * h + self.alpha * ri + (1 - self.alpha) * h_pre
            h_pre = h

        if self.activation is not None:
            h = self.activation(h)
        if self.residual:
            h = h + self.res_fc(h_last)

        return h
Example #17
def test_edge_softmax(idtype):
    # Basic
    g = dgl.graph(nx.path_graph(3))
    g = g.astype(idtype).to(F.ctx())
    edata = F.ones((g.number_of_edges(), 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = F.ones((g.number_of_edges(), 3, 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test both forward and backward with PyTorch built-in softmax.
    g = dgl.rand_graph(30, 900)
    g = g.astype(idtype).to(F.ctx())

    score = F.randn((900, 1))
    score.requires_grad_()
    grad = F.randn((900, 1))
    y = F.softmax(score.view(30, 30), dim=0).view(-1, 1)
    y.backward(grad)
    grad_score = score.grad.clone()  # clone before zeroing so the later check is meaningful
    score.grad.zero_()
    y_dgl = nn.edge_softmax(g, score)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    # check forward
    assert F.allclose(y_dgl, y)
    y_dgl.backward(grad)
    # checkout gradient
    assert F.allclose(score.grad, grad_score)
    print(score.grad[:10], grad_score[:10])
Example #18
    def forward(self, graph, feat, get_attention=False):
            # Check in degree and generate error
            if (graph.in_degrees()==0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                                   'output for those nodes will be invalid. '
                                   'This is harmful for some applications, '
                                   'causing silent performance regression. '
                                   'Adding self-loop on the input graph by '
                                   'calling `g = dgl.add_self_loop(g)` will resolve '
                                   'the issue. Setting ``allow_zero_in_degree`` '
                                   'to be `True` when constructing this module will '
                                   'suppress the check and let the code run.')
            # projection process to get importance vector y
            graph.ndata['y'] = torch.abs(torch.matmul(self.p,feat.T).view(-1))/torch.norm(self.p,p=2)
            # Use edge message passing function to get the weight from src node
            graph.apply_edges(fn.copy_u('y','y'))
            # Select Top k neighbors
            subgraph = select_topk(graph,self.k,'y')
            # Sigmoid as information threshold
            subgraph.ndata['y'] = torch.sigmoid(subgraph.ndata['y'])
            # Using vector matrix elementwise mul for acceleration
            feat = subgraph.ndata['y'].view(-1,1)*feat
            feat = self.feat_drop(feat)
            h = self.fc(feat).view(-1, self.num_heads, self.out_feats)
            el = (h * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (h * self.attn_r).sum(dim=-1).unsqueeze(-1)
            # Assign the value on the subgraph
            subgraph.srcdata.update({'ft': h, 'el': el})
            subgraph.dstdata.update({'er': er})
            # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
            subgraph.apply_edges(fn.u_add_v('el', 'er', 'e'))
            e = self.leaky_relu(subgraph.edata.pop('e'))
            # compute softmax
            subgraph.edata['a'] = self.attn_drop(edge_softmax(subgraph, e))
            # message passing
            subgraph.update_all(fn.u_mul_e('ft', 'a', 'm'),
                             fn.sum('m', 'ft'))
            rst = subgraph.dstdata['ft']
            # activation
            if self.activation:
                rst = self.activation(rst)
            # Residual
            if self.residual:
                rst = rst + self.residual_module(feat).view(feat.shape[0],-1,self.out_feats)

            if get_attention:
                return rst, subgraph.edata['a']
            else:
                return rst
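The `select_topk` call presumably refers to DGL's neighbor-selection routine; assumed import and behavior:

# Assumed import for the call above:
from dgl.sampling import select_topk
# select_topk(graph, k, 'y') keeps, for each node, the k incoming edges with the
# largest value of edge feature 'y' and returns the induced subgraph.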
Example #19
def test_partial_edge_softmax(idtype):
    g = dgl.rand_graph(30, 900)
    g = g.astype(idtype).to(F.ctx())

    score = F.randn((300, 1))
    score.requires_grad_()
    grad = F.randn((300, 1))
    import numpy as np
    eids = np.random.choice(900, 300, replace=False)
    eids = F.tensor(eids, dtype=g.idtype)
    # compute partial edge softmax
    y_1 = nn.edge_softmax(g, score, eids)
    y_1.backward(grad)
    grad_1 = score.grad.clone()  # clone: score.grad is zeroed below
    score.grad.zero_()
    # compute edge softmax on edge subgraph
    subg = g.edge_subgraph(eids, preserve_nodes=True)
    y_2 = nn.edge_softmax(subg, score)
    y_2.backward(grad)
    grad_2 = score.grad.clone()
    score.grad.zero_()

    assert F.allclose(y_1, y_2)
    assert F.allclose(grad_1, grad_2)
Example #20
File: lessr.py Project: lessr/lessr
 def forward(self, sg, feat):
     with sg.local_scope():
         if self.batch_norm is not None:
             feat = self.batch_norm(feat)
         q = self.fc_q(feat)
         k = self.fc_k(feat)
         v = self.fc_v(feat)
         sg.ndata.update({'q': q, 'k': k, 'v': v})
         sg.apply_edges(fn.u_add_v('q', 'k', 'e'))
         e = self.attn_e(th.sigmoid(sg.edata['e']))
         sg.edata['a'] = edge_softmax(sg, e)
         sg.update_all(fn.u_mul_e('v', 'a', 'm'), fn.sum('m', 'ft'))
         rst = sg.ndata['ft']
         if self.activation is not None:
             rst = self.activation(rst)
         return rst
Example #21
 def forward(self, g, ft_q, ft_k, ft_e=None, return_ev=False):
     if self.batch_norm_q is not None:
         ft_q = self.batch_norm_q(ft_q)
         ft_k = self.batch_norm_k(ft_k)
     q = self.fc_q(self.feat_drop(ft_q))
     k = self.fc_k(self.feat_drop(ft_k))
     v = self.fc_v(self.feat_drop(ft_q)).view(-1, self.num_heads,
                                              self.head_feats)
     e = F.u_add_v(g, q, k)
     if ft_e is not None:
         e = e + ft_e
     e = (self.attn_e * th.sigmoid(e)).view(
         -1, self.num_heads, self.head_feats).sum(-1, keepdim=True)
     if return_ev:
         return e, v
     a = self.attn_drop(edge_softmax(g, e))
     rst = F.u_mul_e_sum(g, v, a).view(-1, self.val_feats)
     if self.activation is not None:
         rst = self.activation(rst)
     return rst
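Note that in this snippet `F` is evidently not `torch.nn.functional`: `u_add_v` and `u_mul_e_sum` are DGL's fused sparse operators. Presumed imports (an assumption inferred from the names used):

import torch as th
import dgl.ops as F                         # provides F.u_add_v, F.u_mul_e_sum
from dgl.nn.functional import edge_softmax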
Example #22
def test_edge_softmax2(idtype, g):
    g = g.astype(idtype).to(F.ctx())
    g = g.local_var()
    g.srcdata.clear()
    g.dstdata.clear()
    g.edata.clear()
    a1 = F.randn((g.number_of_edges(), 1)).requires_grad_()
    a2 = a1.clone().detach().requires_grad_()
    g.edata['s'] = a1
    g.group_apply_edges('dst', lambda edges: {'ss':F.softmax(edges.data['s'], 1)})
    g.edata['ss'].sum().backward()
    
    builtin_sm = nn.edge_softmax(g, a2)
    builtin_sm.sum().backward()
    #print(a1.grad - a2.grad)
    assert len(g.srcdata) == 0
    assert len(g.dstdata) == 0
    assert len(g.edata) == 2
    assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4) # Follow tolerance in unittest backend
    """
Example #23
    def forward(self, graph, feat):
        r"""Compute graph attention network layer.
        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : torch.Tensor
            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
            is size of input feature, :math:`N` is the number of nodes.
        Returns
        -------
        torch.Tensor
            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
            is the number of heads, and :math:`D_{out}` is size of output feature.
        """
        elist = []
        graph = graph.local_var()

        h = self.feat_drop(feat)
        feat = self.fc(h).view(-1, self._num_heads, self._out_feats)

        el = (feat * self.attn_l1).sum(dim=-1).unsqueeze(-1)
        er = (feat * self.attn_r1).sum(dim=-1).unsqueeze(-1)
        graph.ndata.update({'ft': feat, 'el': el, 'er': er})
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        e_soft = edge_softmax(graph, e)

        elist.append(e_soft)
        graph.edata['a'] = self.attn_drop(e_soft)
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.ndata['ft']
        if self.activation:
            rst = self.activation(rst)

        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
            rst = rst + resval

        return rst, elist
Example #24
 def forward(self, graph, feat, ntypes, etypes):
     graph = graph.local_var()
     h = self.feat_drop(feat)
     feat = self.fc(h, ntypes).view(-1, self.num_heads, self.out_feats)
     graph.ndata.update({'ft': feat})
     graph.edata['type'] = etypes
     graph.apply_edges(self.message_func)
     e = self.activation(graph.edata.pop('msg'))
     graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
     # message passing
     graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
                      fn.sum('m', 'ft'))
     rst = graph.ndata['ft']
     # residual
     if self.res_fc is not None:
         resval = self.res_fc(h, ntypes).view(-1, self.num_heads, self.out_feats)
         rst = rst + resval
     # activation
     if self.activation:
         rst = self.activation(rst)
     return rst
Example #25
    def forward(self, graph, feat):
        """Compute graph attention network layer.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : torch.Tensor
            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
            is size of input feature, :math:`N` is the number of nodes.

        Returns
        -------
        torch.Tensor
            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
            is the number of heads, and :math:`D_{out}` is size of output feature.
        """
        graph = graph.local_var()
        #feat_c = feat.clone().detach().requires_grad_(False)
        feat_c = feat
        q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
        q = q.view(-1, self._num_heads, self._out_feats)
        k = k.view(-1, self._num_heads, self._out_feats)
        v = v.view(-1, self._num_heads, self._out_feats)
        graph.ndata.update({'ft': v, 'el': k, 'er': q})
        # compute edge attention
        graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
        e = graph.edata.pop('e') / math.sqrt(self._out_feats)
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e)).unsqueeze(-1)
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.ndata['ft']
        # residual
        rst = rst.view(feat.shape) + feat
        if self._trans:
            rst = self.ln(rst)
            rst = self.ln(rst + self.FFN(rst))
        return rst
Example #26
    def forward(self,
                graph,
                feat,
                cluster_id=None,
                cluster_centroid=None,
                stat=None):
        r"""Compute graph attention network layer.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : torch.Tensor
            The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
            is size of input feature, :math:`N` is the number of nodes.

        Returns
        -------
        torch.Tensor
            The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
            is the number of heads, and :math:`D_{out}` is size of output feature.
        """
        graph = graph.local_var()  # local copy: changes to the graph stay within this function's scope
        h = self.feat_drop(feat)
        feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
        if cluster_id is not None:
            # start = time.time()
            cluster_centroid = cluster_centroid.view(
                -1, self._num_heads, self._out_feats)  # [6*8*8] * [1,8,8]
            # el = cluster_centroid * self.attn_l[cluster_id].sum(-1).unsqueeze(-1)
            el = (cluster_centroid * self.attn_l)
            er = (cluster_centroid * self.attn_r)
            # er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
            # print(f'cluster el/er calculation time: {time.time() - start:.7f}')
            er = er[cluster_id].sum(dim=-1).unsqueeze(-1)
            el = el[cluster_id].sum(dim=-1).unsqueeze(-1)
            # el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
            # TODO test
            # [3708*8*8] * [1,8,8]
            # el += (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
            # er += (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
        else:
            # start = time.time()
            # [3708*8*8] * [1,8,8]
            el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
            # print(f'el/er calculation time: {time.time() - start:.7f}')
        graph.ndata.update({'ft': feat, 'el': el, 'er': er})
        # compute edge attention
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(
            graph, e))  # scale after softmax
        if stat is not None:
            stat.append(graph.edata['a'].detach().cpu().numpy())
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        embedding = graph.ndata['ft']
        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
            embedding = embedding + resval
        # activation
        if self.activation:
            return self.activation(embedding), self.activation(embedding)
            # return self.activation(embedding), embedding
            # return rst, embedding
        elif stat is not None:
            return embedding, stat
        else:
            return embedding  # output logits
Example #27
    def forward(self, g, node_feats, edge_feats, node_only=False):
        r"""Update node and edge representations.
        Parameters
        ----------
        g : DGLGraph
            DGLGraph for a batch of graphs
        node_feats : float32 tensor of shape (V, node_in_feats)
            Input node features. V for the number of nodes in the batch of graphs.
        edge_feats : float32 tensor of shape (E, edge_in_feats)
            Input edge features. E for the number of edges in the batch of graphs.
        node_only : bool
            Whether to update node representations only. If False, edge representations
            will be updated as well. Default to False.
        Returns
        -------
        new_node_feats : float32 tensor of shape (V, node_out_feats)
            Updated node representations.
        new_edge_feats : float32 tensor of shape (E, edge_out_feats)
            Updated edge representations.
        """
        g = g.local_var()

        #######
        g.srcdata['h'] = node_feats
        g.ndata['feat'] = node_feats
        g.apply_edges(
            lambda edges: {
                'e':
                torch.sum(
                    (torch.mul(edges.src['h'], torch.tanh(edges.dst['h']))), 1)
            })
        e = self.leaky_relu(g.edata.pop('e'))
        e_soft = edge_softmax(g, e)
        g.ndata.pop('feat')
        g.srcdata.pop('h')
        #print(e_soft.shape)
        #######

        # Update node features
        node_node_feats = self.activation(
            self.node_to_node(node_feats))  # torch.Size([596, 50])
        g.edata['e2n'] = self.activation(self.edge_to_node(edge_feats))
        g.update_all(fn.copy_edge('e2n', 'm'), fn.sum('m', 'e2n'))
        edge_node_feats = g.ndata.pop('e2n')  # torch.Size([596, 50])
        new_node_feats = self.activation(
            self.update_node(
                torch.cat([node_node_feats, edge_node_feats],
                          dim=1)))  # torch.Size([596, 50])

        if node_only:
            return new_node_feats

        # Update edge features
        g.ndata['left_hv'] = self.left_node_to_edge(node_feats)
        g.ndata['right_hv'] = self.right_node_to_edge(node_feats)
        g.apply_edges(fn.u_add_v('left_hv', 'right_hv', 'first'))
        g.apply_edges(fn.u_add_v('right_hv', 'left_hv', 'second'))
        first_edge_feats = self.activation(g.edata.pop('first'))
        second_edge_feats = self.activation(g.edata.pop('second'))
        third_edge_feats = self.activation(self.edge_to_edge(edge_feats))
        new_edge_feats = self.activation(
            self.update_edge(
                torch.cat(
                    [first_edge_feats, second_edge_feats, third_edge_feats],
                    dim=1)))

        return new_node_feats, new_edge_feats, e_soft
Example #28
    def forward(self, graph, feat, weight=None):
        r"""Compute graph convolution.

        Notes
        -----
        * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of additional
          dimensions, :math:`N` is the number of nodes.
        * Output shape: :math:`(N, *, \text{out_feats})` where all but the last dimension are
          the same shape as the input.
        * Weight shape: :math:`(\text{in_feats}, \text{out_feats})`.

        Parameters
        ----------
        graph : DGLGraph
            The graph.
        feat : torch.Tensor
            The input feature
        weight : torch.Tensor, optional
            Optional external weight tensor.

        Returns
        -------
        torch.Tensor
            The output feature
        """
        graph = graph.local_var()

        if self._norm == 'both':
            degs = graph.out_degrees().to(feat.device).float().clamp(min=1)
            norm = th.pow(degs, -0.5)
            shp = norm.shape + (1, ) * (feat.dim() - 1)
            norm = th.reshape(norm, shp)
            feat = feat * norm

        if weight is not None:
            if self.weight is not None:
                raise DGLError(
                    'External weight is provided while at the same time the'
                    ' module has defined its own weight parameter. Please'
                    ' create the module with flag weight=False.')
        else:
            weight = self.weight

        # print(self._in_feats, self._out_feats)
        if self._in_feats > self._out_feats:
            # mult W first to reduce the feature size for aggregation.
            if weight is not None:
                feat = th.matmul(feat, weight)
            graph.srcdata['h'] = feat

            #######
            graph.ndata['feat'] = feat
            graph.apply_edges(lambda edges: {
                'e':
                th.sum((th.mul(edges.src['h'], th.tanh(edges.dst['h']))), 1)
            })
            e = self.leaky_relu(graph.edata.pop('e'))
            e_soft = edge_softmax(graph, e)
            graph.ndata.pop('feat')
            #######

            graph.update_all(fn.copy_src(src='h', out='m'),
                             fn.sum(msg='m', out='h'))
            rst = graph.dstdata['h']
        else:
            # aggregate first then mult W
            graph.srcdata['h'] = feat

            #######
            graph.ndata['feat'] = feat
            graph.apply_edges(lambda edges: {
                'e':
                th.sum((th.mul(edges.src['h'], th.tanh(edges.dst['h']))), 1)
            })
            e = self.leaky_relu(graph.edata.pop('e'))
            e_soft = edge_softmax(graph, e)
            graph.ndata.pop('feat')
            #######

            graph.update_all(fn.copy_src(src='h', out='m'),
                             fn.sum(msg='m', out='h'))
            rst = graph.dstdata['h']
            if weight is not None:
                rst = th.matmul(rst, weight)

        if self._norm != 'none':
            degs = graph.in_degrees().to(feat.device).float().clamp(min=1)
            if self._norm == 'both':
                norm = th.pow(degs, -0.5)
            else:
                norm = 1.0 / degs
            shp = norm.shape + (1, ) * (feat.dim() - 1)
            norm = th.reshape(norm, shp)
            rst = rst * norm

        if self.bias is not None:
            rst = rst + self.bias

        if self._activation is not None:
            rst = self._activation(rst)

        return rst, e_soft
Example #29
 def forward(self, graph, feat):
     r"""Compute graph attention network layer.
     Parameters
     ----------
     graph : DGLGraph
         The graph.
     feat : torch.Tensor or pair of torch.Tensor
         If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where
         :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
         If a pair of torch.Tensor is given, the pair must contain two tensors of shape
         :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.
     Returns
     -------
     torch.Tensor
         The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
         is the number of heads, and :math:`D_{out}` is size of output feature.
     """
     with graph.local_scope():
         if isinstance(feat, tuple):
             h_src = self.feat_drop(feat[0])
             h_dst = self.feat_drop(feat[1])
             feat_src = self.fc_src(h_src).view(-1, self._num_heads,
                                                self._out_feats)
             feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads,
                                                self._out_feats)
         else:
             h_src = h_dst = self.feat_drop(feat)
             feat_src = feat_dst = self.fc(h_src).view(
                 -1, self._num_heads, self._out_feats)
         # NOTE: GAT paper uses "first concatenation then linear projection"
         # to compute attention scores, while ours is "first projection then
         # addition", the two approaches are mathematically equivalent:
         # We decompose the weight vector a mentioned in the paper into
         # [a_l || a_r], then
         # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
          # Our implementation is much more efficient because we do not need to
         # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus,
         # addition could be optimized with DGL's built-in function u_add_v,
         # which further speeds up computation and saves memory footprint.
         el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
         er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
         graph.srcdata.update({'ft': feat_src, 'el': el})
         graph.dstdata.update({'er': er})
         # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
         graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
         e = self.leaky_relu(graph.edata.pop('e'))
         # compute softmax
         esoft = edge_softmax(graph, e)
         graph.edata['a'] = self.attn_drop(esoft)
         # message passing
         graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
         rst = graph.dstdata['ft']
         # residual
         if self.res_fc is not None:
             resval = self.res_fc(h_dst).view(h_dst.shape[0], -1,
                                              self._out_feats)
             rst = rst + resval
         # activation
         if self.activation:
             rst = self.activation(rst)
         return rst, esoft
Example #30
File: conv.py Project: jkx19/cogdl
    def forward(self, graph, feat, e_feat, res_attn=None):
        with graph.local_scope():
            if not self._allow_zero_in_degree:
                if (graph.in_degrees() == 0).any():
                    raise DGLError(
                        "There are 0-in-degree nodes in the graph, "
                        "output for those nodes will be invalid. "
                        "This is harmful for some applications, "
                        "causing silent performance regression. "
                        "Adding self-loop on the input graph by "
                        "calling `g = dgl.add_self_loop(g)` will resolve "
                        "the issue. Setting ``allow_zero_in_degree`` "
                        "to be `True` when constructing this module will "
                        "suppress the check and let the code run.")

            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0])
                h_dst = self.feat_drop(feat[1])
                if not hasattr(self, "fc_src"):
                    self.fc_src, self.fc_dst = self.fc, self.fc
                feat_src = self.fc_src(h_src).view(-1, self._num_heads,
                                                   self._out_feats)
                feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads,
                                                   self._out_feats)
            else:
                h_src = h_dst = self.feat_drop(feat)
                feat_src = feat_dst = self.fc(h_src).view(
                    -1, self._num_heads, self._out_feats)
                if graph.is_block:
                    feat_dst = feat_src[:graph.number_of_dst_nodes()]
            e_feat = self.edge_emb(e_feat)
            e_feat = self.fc_e(e_feat).view(-1, self._num_heads,
                                            self._edge_feats)
            ee = (e_feat * self.attn_e).sum(dim=-1).unsqueeze(-1)
            el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
            graph.srcdata.update({"ft": feat_src, "el": el})
            graph.dstdata.update({"er": er})
            graph.edata.update({"ee": ee})
            graph.apply_edges(fn.u_add_v("el", "er", "e"))
            e = self.leaky_relu(graph.edata.pop("e") + graph.edata.pop("ee"))
            # compute softmax
            graph.edata["a"] = self.attn_drop(edge_softmax(graph, e))
            if res_attn is not None:
                graph.edata["a"] = graph.edata["a"] * (
                    1 - self.alpha) + res_attn * self.alpha
            # message passing
            graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
            rst = graph.dstdata["ft"]
            # residual
            if self.res_fc is not None:
                resval = self.res_fc(h_dst).view(h_dst.shape[0], -1,
                                                 self._out_feats)
                rst = rst + resval
            # bias
            if self.bias:
                rst = rst + self.bias_param
            # activation
            if self.activation:
                rst = self.activation(rst)
            return rst, graph.edata.pop("a").detach()
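The layer returns both the node features and the detached attention weights, so a caller can feed them back through `res_attn` in the next layer. A hypothetical two-layer usage sketch (the constructor name `EdgeGATConv` and its arguments are assumptions):

import dgl
import torch

g = dgl.rand_graph(10, 40)
feat = torch.randn(10, 16)
e_feat = torch.randint(0, 5, (40,))                       # categorical edge types for edge_emb
layer1 = EdgeGATConv(16, 8, num_heads=4, num_etypes=5)    # hypothetical constructor
layer2 = EdgeGATConv(4 * 8, 8, num_heads=4, num_etypes=5)
h1, attn1 = layer1(g, feat, e_feat)                       # h1: (10, 4, 8), attn1: (40, 4, 1)
h2, attn2 = layer2(g, h1.flatten(1), e_feat, res_attn=attn1)   # blend with previous attention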