def test_partial_edge_softmax():
    g = dgl.DGLGraph()
    g.add_nodes(30)
    # build a complete graph
    for i in range(30):
        for j in range(30):
            g.add_edge(i, j)

    score = F.randn((300, 1))
    score.requires_grad_()
    grad = F.randn((300, 1))
    import numpy as np
    eids = np.random.choice(900, 300, replace=False).astype('int64')
    eids = F.zerocopy_from_numpy(eids)

    # compute partial edge softmax
    y_1 = nn.edge_softmax(g, score, eids)
    y_1.backward(grad)
    grad_1 = score.grad
    score.grad.zero_()
    # compute edge softmax on edge subgraph
    subg = g.edge_subgraph(eids)
    y_2 = nn.edge_softmax(subg, score)
    y_2.backward(grad)
    grad_2 = score.grad
    score.grad.zero_()

    assert F.allclose(y_1, y_2)
    assert F.allclose(grad_1, grad_2)
def test_edge_softmax():
    # Basic
    g = dgl.DGLGraph(nx.path_graph(3))
    edata = F.ones((g.number_of_edges(), 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = F.ones((g.number_of_edges(), 3, 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test both forward and backward with PyTorch built-in softmax.
    g = dgl.DGLGraph()
    g.add_nodes(30)
    # build a complete graph
    for i in range(30):
        for j in range(30):
            g.add_edge(i, j)

    score = F.randn((900, 1))
    score.requires_grad_()
    grad = F.randn((900, 1))
    y = F.softmax(score.view(30, 30), dim=0).view(-1, 1)
    y.backward(grad)
    grad_score = score.grad
    score.grad.zero_()
    y_dgl = nn.edge_softmax(g, score)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    # check forward
    assert F.allclose(y_dgl, y)
    y_dgl.backward(grad)
    # check gradient
    assert F.allclose(score.grad, grad_score)
    print(score.grad[:10], grad_score[:10])

    # Test 2
    def generate_rand_graph(n):
        arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
        return dgl.DGLGraph(arr, readonly=True)

    g = generate_rand_graph(50)
    a1 = F.randn((g.number_of_edges(), 1)).requires_grad_()
    a2 = a1.clone().detach().requires_grad_()
    g.edata['s'] = a1
    g.group_apply_edges('dst', lambda edges: {'ss': F.softmax(edges.data['s'], 1)})
    g.edata['ss'].sum().backward()

    builtin_sm = nn.edge_softmax(g, a2)
    builtin_sm.sum().backward()
    print(a1.grad - a2.grad)
    assert len(g.ndata) == 0
    assert len(g.edata) == 2
    assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4)  # Follow tolerance in unittest backend
def test_edge_softmax():
    # Basic
    g = dgl.DGLGraph(nx.path_graph(3))
    edata = th.ones(g.number_of_edges(), 1)
    a = nn.edge_softmax(g, edata)
    assert th.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = th.ones(g.number_of_edges(), 3, 1)
    a = nn.edge_softmax(g, edata)
    assert th.allclose(a, uniform_attention(g, a.shape))
def forward(self, graph, feat):
    graph = graph.local_var()
    h = self.feat_drop(feat)
    feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
    el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
    er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
    graph.ndata.update({"ft": feat, "el": el, "er": er})
    # compute edge attention
    graph.apply_edges(fn.u_add_v("el", "er", "e"))
    # apply leaky relu
    graph.apply_edges(self.relu_udf)
    # compute softmax/sparsemax
    if self.sparsemax:
        graph.apply_edges(self.sparsemax_udf)
    else:
        graph.edata["a"] = edge_softmax(graph, graph.edata.pop("e"))
    # attention dropout
    graph.apply_edges(self.attn_drop_udf)
    # message passing
    graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
    rst = graph.ndata["ft"]
    # residual
    if self.res_fc is not None:
        resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
        rst = rst + resval
    # activation
    if self.activation:
        rst = self.activation(rst)
    return rst
def forward(self, graph, feat):
    graph = graph.local_var()
    feat_c = feat.clone().detach().requires_grad_(False)
    q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
    q = q.view(-1, self._num_heads, self._out_feats)
    k = k.view(-1, self._num_heads, self._out_feats)
    v = v.view(-1, self._num_heads, self._out_feats)
    graph.ndata.update({
        "ft": v,
        "el": k,
        "er": q
    })  # k, q instead of q, k; the edge_softmax is applied on incoming edges
    # compute edge attention
    graph.apply_edges(fn_u_dot_v("el", "er", "e"))
    e = graph.edata.pop("e") / math.sqrt(self._out_feats * self._num_heads)
    graph.edata["a"] = edge_softmax(graph, e).unsqueeze(-1)
    # message passing
    graph.update_all(fn_u_mul_e("ft", "a", "m"), fn_sum("m", "ft2"))
    rst = graph.ndata["ft2"]
    # residual
    rst = rst.view(feat.shape) + feat
    if self._trans:
        rst = self.ln1(rst)
        rst = self.ln1(rst + self.FFN(rst))  # use the same layer norm
    return rst
def forward(self, g, features):
    h_pre = features
    g = g.local_var()
    g.ndata['h'] = features
    g.ndata['norm_h'] = F.normalize(features, p=2, dim=-1)
    g.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
    cos = g.edata.pop('cos')
    e = self.beta * cos
    if self.graph_cut > 0:
        k = int(e.size()[0] * self.graph_cut)
        _, indices = e.topk(k, largest=False, sorted=False)
        e[indices] = 0
    g.edata['p'] = edge_softmax(g, e)
    g.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
    h = g.ndata['h']
    if self.project:
        h = self.linear(h)
    if self.activation:
        h = self.activation(h)
    if self.residual:
        h = h + self.res_fc(h_pre)
    h = self.dropout(h)
    return h
def forward(self, graph, feat):
    graph = graph.local_var()
    feat_c = feat.clone().detach().requires_grad_(False)
    q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
    q = q.view(-1, self._num_heads, self._out_feats)
    k = k.view(-1, self._num_heads, self._out_feats)
    v = v.view(-1, self._num_heads, self._out_feats)
    graph.ndata.update({
        'ft': v,
        'el': k,
        'er': q
    })  # k, q instead of q, k; the edge_softmax is applied on incoming edges
    # compute edge attention
    graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
    e = graph.edata.pop('e') / math.sqrt(self._out_feats * self._num_heads)
    graph.edata['a'] = edge_softmax(graph, e).unsqueeze(-1)
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft2'))
    rst = graph.ndata['ft2']
    # residual
    rst = rst.view(feat.shape) + feat
    if self._trans:
        rst = self.ln1(rst)
        rst = self.ln1(rst + self.FFN(rst))  # use the same layer norm, see the author's code
    return rst
def forward(self, graph, feat, device):
    graph = graph.to(device).local_var()
    feat_c = feat.clone().detach().requires_grad_(False)
    q, k, v = self.query_proj(feat), self.key_proj(feat_c), self.value_proj(feat_c)
    q = q.view(-1, self.num_heads, self.embedding_size // self.num_heads)
    k = k.view(-1, self.num_heads, self.embedding_size // self.num_heads)
    v = v.view(-1, self.num_heads, self.embedding_size // self.num_heads)
    graph.ndata.update({
        'ft': v,
        'el': k,
        'er': q
    })  # k, q instead of q, k; the edge_softmax is applied on incoming edges
    # compute edge attention
    graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
    e = graph.edata.pop('e') / math.sqrt(self.embedding_size)
    graph.edata['a'] = edge_softmax(graph, e)
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft2'))
    rst = graph.ndata['ft2']
    # residual
    rst = rst.view(feat.shape) + feat
    rst = self.ln1(rst)
    rst = self.ln1(rst + self.out_proj(rst))
    return rst
def forward(self, g, h, logits, old_z, attn_l, attn_r, *,
            shared_tau=True, tau1=None, tau2=None):
    with g.local_scope():
        h = self.dropout(h)
        if self.fc is not None:
            feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
        else:
            feat = h
        g.ndata["h"] = feat  # (n_node, n_feat)
        g.ndata["logits"] = logits
        degs = g.in_degrees().float().clamp(min=1)
        norm = torch.pow(degs, -0.5).to(feat.device).unsqueeze(1)
        g.ndata["degree"] = degs
        el = (feat * attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat * attn_r).sum(dim=-1).unsqueeze(-1)
        g.ndata.update({"ft": feat, "el": el, "er": er})
        # compute edge attention
        g.apply_edges(fn.u_add_v("el", "er", "e"))
        e = self.leaky_relu(g.edata.pop("e"))
        # compute softmax
        g.edata["a"] = self.dropout(edge_softmax(g, e))
        g.update_all(
            message_func=adaptive_attn_message_func,
            reduce_func=adaptive_attn_reduce_func,
        )
        f1 = g.ndata.pop("f1")
        f2 = g.ndata.pop("f2")
        norm_f1 = self.ln1(f1)
        norm_f2 = self.ln2(f2)
        if shared_tau:
            z = torch.sigmoid((-1) * (norm_f1 - tau1)) * torch.sigmoid((-1) * (norm_f2 - tau2))
        else:
            # tau for each layer
            z = torch.sigmoid((-1) * (norm_f1 - self.tau1)) * torch.sigmoid((-1) * (norm_f2 - self.tau2))
        gate = torch.min(old_z, z)
        agg = g.ndata.pop("agg")
        normagg = agg * norm.unsqueeze(1)  # normalization by tgt degree
        if self.activation:
            normagg = self.activation(normagg)
        new_h = feat + gate.unsqueeze(2) * normagg
        return new_h, z
def forward(self, graph):
    node_num = graph.ndata['h'].size(0)
    Q = self.query(graph.ndata['h'])
    K = self.key(graph.ndata['h'])
    V = self.value(graph.ndata['h'])
    Q = self.transpose_for_scores(Q)
    K = self.transpose_for_scores(K)
    V = self.transpose_for_scores(V)
    graph.ndata['Q'] = Q
    graph.ndata['K'] = K
    graph.ndata['V'] = V
    graph.apply_edges(fn.u_mul_v('K', 'Q', 'attn_probs'))
    graph.edata['attn_probs'] = graph.edata['attn_probs'].sum(-1, keepdim=True)
    graph.edata['attn_probs'] = edge_softmax(graph, graph.edata['attn_probs'])
    graph.edata['attn_probs'] = self.dropout(graph.edata['attn_probs'])
    graph.apply_edges(fn.u_mul_e('V', 'attn_probs', 'attn_values'))
    graph.register_message_func(fn.copy_e('attn_values', 'm'))
    graph.register_reduce_func(fn.sum('m', 'h'))
    graph.update_all()
    graph.ndata['h'] = graph.ndata['h'].view([node_num, -1])
    return graph
def forward(self, g, edge_logits, node_feats):
    """Update node representations.

    Parameters
    ----------
    g : DGLGraph
        DGLGraph for a batch of graphs
    edge_logits : float32 tensor of shape (E, 1)
        The edge logits based on which softmax will be performed for weighting
        edges within 1-hop neighborhoods. E represents the number of edges.
    node_feats : float32 tensor of shape (V, node_feat_size)
        Previous node features. V represents the number of nodes.

    Returns
    -------
    float32 tensor of shape (V, node_feat_size)
        Updated node features.
    """
    g = g.local_var()
    g.edata['a'] = edge_softmax(g, edge_logits)
    g.ndata['hv'] = self.project_node(node_feats)
    g.update_all(fn.src_mul_edge('hv', 'a', 'm'), fn.sum('m', 'c'))
    context = F.elu(g.ndata['c'])
    return F.relu(self.gru(context, node_feats))
def forward(self, g, h):
    g = g.local_var()
    if not self.use_pp or not self.training:
        norm = self.get_norm(g)
        # g.ndata['h'] = h
        # g.update_all(fn.copy_src(src='h', out='m'),
        #              fn.sum(msg='m', out='h'))
        # ah = g.ndata.pop('h')
        if self._aggre_type == 'mean':
            g.ndata['h'] = h
            g.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'h'))
            ah = g.ndata.pop('h')
        elif self._aggre_type == 'gcn':
            g.ndata['h'] = h
            g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
            # divide in_degrees
            # degs = graph.in_degrees().float()
            # degs = degs.to(feat.device)
            # h_neigh = (graph.ndata['neigh'] + graph.ndata['h']) / (degs.unsqueeze(-1) + 1)
            ah = g.ndata.pop('h')
            ah = ah * norm
        elif self._aggre_type == 'pool':
            g.ndata['h'] = F.relu(self.fc_pool(h))
            g.update_all(fn.copy_src('h', 'm'), fn.max('m', 'h'))
            ah = g.ndata['h']
        elif self._aggre_type == 'lstm':
            g.ndata['h'] = h
            g.update_all(fn.copy_src('h', 'm'), self._lstm_reducer)
            ah = g.ndata['h']
        elif self._aggre_type == 'attn':
            feat = self.fc_attn(h).view(-1, self.num_heads, self._in_feats)
            el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
            g.ndata.update({'ft': feat, 'el': el, 'er': er})
            g.apply_edges(fn.u_add_v('el', 'er', 'e'))
            e = self.leaky_relu(g.edata.pop('e'))
            g.edata['a'] = edge_softmax(g, e)
            g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
            ah = g.ndata['ft']
            ah = ah.squeeze(1)
        else:
            raise KeyError('Aggregator type {} not recognized.'.format(self._aggre_type))
        h = self.concat(h, ah, norm)
    if self.dropout:
        h = self.dropout(h)
    # GraphSAGE GCN does not require fc_self.
    # if self._aggre_type == 'gcn':
    #     rst = self.fc_neigh(ah)
    # else:
    #     rst = self.fc_self(h) + self.fc_neigh(ah)
    h = self.linear(h)
    h = self.lynorm(h)
    if self.activation:
        h = self.activation(h)
    return h
def forward(self, g, h, logits, old_z, attn_l, attn_r,
            shared_tau=True, tau_1=None, tau_2=None):
    g = g.local_var()
    if self.feat_drop:
        h = self.feat_drop(h)
    if hasattr(self, 'fc'):
        feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
    else:
        feat = h
    g.ndata['h'] = feat  # (n_node, n_feat)
    g.ndata['logits'] = logits
    el = (feat * attn_l).sum(dim=-1).unsqueeze(-1)
    er = (feat * attn_r).sum(dim=-1).unsqueeze(-1)
    g.ndata.update({'ft': feat, 'el': el, 'er': er})
    # compute edge attention
    g.apply_edges(fn.u_add_v('el', 'er', 'e'))
    e = self.leaky_relu(g.edata.pop('e'))
    # compute softmax
    g.edata['a'] = self.attn_drop(edge_softmax(g, e))
    g.update_all(message_func=adaptive_attn_message_func,
                 reduce_func=adaptive_attn_reduce_func)
    f1 = g.ndata.pop('f1')
    f2 = g.ndata.pop('f2')
    norm_f1 = self.ln_1(f1)
    norm_f2 = self.ln_2(f2)
    if shared_tau:
        z = F.sigmoid((-1) * (norm_f1 - tau_1)) * F.sigmoid((-1) * (norm_f2 - tau_2))
    else:
        # tau for each layer
        z = F.sigmoid((-1) * (norm_f1 - self.tau_1)) * F.sigmoid((-1) * (norm_f2 - self.tau_2))
    gate = torch.min(old_z, z)
    agg = g.ndata.pop('agg')
    normagg = agg * g.ndata['norm'].unsqueeze(1)  # normalization by tgt degree
    if self.activation:
        normagg = self.activation(normagg)
    new_h = feat + gate.unsqueeze(2) * normagg
    return new_h, z
def propagate_attention(self, g):
    ''' copied from gqp '''
    g.apply_edges(fn.u_mul_v('q', 'k', 'e'))
    e = (g.edata['e'].sum(dim=-1, keepdim=True)) / (self.dk ** 0.5)
    g.edata['e'] = self.attn_drop(edge_softmax(g, e))
    g.update_all(fn.u_mul_e('v', 'e', 'e'), fn.sum('e', 'v'))
def forward(self, g):
    alpha_prime = self.leaky_relu(self.attn(g.edata[self.attn_key]))
    # Magic part is multiplying attention weights with the edge embedding
    g.edata['a'] = dglnn.edge_softmax(g, alpha_prime) * g.edata['emb'].view(
        g.edata['emb'].shape[0], self.n_heads, -1)
    attn_emb = g.ndata[self.msg_key]
    if attn_emb.ndimension() == 2:
        g.ndata[self.msg_key] = attn_emb.view(g.number_of_nodes(), self.n_heads, -1)
    g.update_all(fn.src_mul_edge(self.msg_key, 'a', 'm'), fn.sum('m', 'emb'))
    return GraphLambda(lambda x: x.view(x.shape[0], -1))(g)
def forward(self, graph, features):
    g = graph.local_var()
    if self.attention:
        g.ndata['h'] = features
        g.apply_edges(fn.u_sub_v('h', 'h', 'l1'))
        l1 = g.edata.pop('l1')
        # l1 = -th.norm(l1, p=1, dim=1)
        # g.edata['att'] = edge_softmax(g, l1)
        l1 = 1 / (th.norm(l1, p=2, dim=1) + 1e-7)
        g.edata['att'] = edge_softmax(g, l1)
    else:
        if self.graph_norm:
            degs = g.in_degrees().float()
            norm = th.pow(degs + 1, -0.5)
            norm = norm.to(features.device).unsqueeze(1)
        g.edata['att'] = th.ones(g.number_of_edges(), 1).to(features.device)
    h_last = features
    h = self.dropout(features)
    h = self.linear(h)
    h_pre = h
    ri = h * norm * norm
    for _ in range(self.k):
        if self.attention is False:
            if self.graph_norm:
                h = h * norm
        g.ndata['h'] = h
        g.update_all(fn.u_mul_e('h', 'att', 'm'), fn.sum('m', 'h'))
        h = g.ndata.pop('h')
        if self.attention is False:
            if self.graph_norm:
                h = h * norm
        h = self.alpha * h + self.alpha * ri + (1 - self.alpha) * h_pre
        h_pre = h
    if self.activation is not None:
        h = self.activation(h)
    if self.residual:
        h = h + self.res_fc(h_last)
    return h
def test_edge_softmax(idtype):
    # Basic
    g = dgl.graph(nx.path_graph(3))
    g = g.astype(idtype).to(F.ctx())
    edata = F.ones((g.number_of_edges(), 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test higher dimension case
    edata = F.ones((g.number_of_edges(), 3, 1))
    a = nn.edge_softmax(g, edata)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    assert F.allclose(a, uniform_attention(g, a.shape))

    # Test both forward and backward with PyTorch built-in softmax.
    g = dgl.rand_graph(30, 900)
    g = g.astype(idtype).to(F.ctx())
    score = F.randn((900, 1))
    score.requires_grad_()
    grad = F.randn((900, 1))
    y = F.softmax(score.view(30, 30), dim=0).view(-1, 1)
    y.backward(grad)
    grad_score = score.grad
    score.grad.zero_()
    y_dgl = nn.edge_softmax(g, score)
    assert len(g.ndata) == 0
    assert len(g.edata) == 0
    # check forward
    assert F.allclose(y_dgl, y)
    y_dgl.backward(grad)
    # check gradient
    assert F.allclose(score.grad, grad_score)
    print(score.grad[:10], grad_score[:10])
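# A minimal standalone sketch of what the tests above check: edge_softmax
# normalizes per-edge scores over the incoming edges of each destination node.
# This assumes a recent DGL release that exposes edge_softmax under
# dgl.nn.functional; the tiny graph and tensor names are illustrative only.
def example_edge_softmax_sketch():
    import dgl
    import torch
    from dgl.nn.functional import edge_softmax

    g = dgl.graph(([0, 1, 2, 0], [1, 1, 1, 2]))  # node 1 has three incoming edges
    logits = torch.randn(g.num_edges(), 1)       # one score per edge
    attn = edge_softmax(g, logits)
    # weights of the three edges entering node 1 sum to 1;
    # the single edge entering node 2 gets weight 1
    print(attn)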
def forward(self, graph, feat, get_attention=False):
    # Check in degree and generate error
    if (graph.in_degrees() == 0).any():
        raise DGLError('There are 0-in-degree nodes in the graph, '
                       'output for those nodes will be invalid. '
                       'This is harmful for some applications, '
                       'causing silent performance regression. '
                       'Adding self-loop on the input graph by '
                       'calling `g = dgl.add_self_loop(g)` will resolve '
                       'the issue. Setting ``allow_zero_in_degree`` '
                       'to be `True` when constructing this module will '
                       'suppress the check and let the code run.')
    # projection process to get importance vector y
    graph.ndata['y'] = torch.abs(torch.matmul(self.p, feat.T).view(-1)) / torch.norm(self.p, p=2)
    # Use edge message passing function to get the weight from src node
    graph.apply_edges(fn.copy_u('y', 'y'))
    # Select Top k neighbors
    subgraph = select_topk(graph, self.k, 'y')
    # Sigmoid as information threshold
    subgraph.ndata['y'] = torch.sigmoid(subgraph.ndata['y'])
    # Using vector matrix elementwise mul for acceleration
    feat = subgraph.ndata['y'].view(-1, 1) * feat
    feat = self.feat_drop(feat)
    h = self.fc(feat).view(-1, self.num_heads, self.out_feats)
    el = (h * self.attn_l).sum(dim=-1).unsqueeze(-1)
    er = (h * self.attn_r).sum(dim=-1).unsqueeze(-1)
    # Assign the value on the subgraph
    subgraph.srcdata.update({'ft': h, 'el': el})
    subgraph.dstdata.update({'er': er})
    # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
    subgraph.apply_edges(fn.u_add_v('el', 'er', 'e'))
    e = self.leaky_relu(subgraph.edata.pop('e'))
    # compute softmax
    subgraph.edata['a'] = self.attn_drop(edge_softmax(subgraph, e))
    # message passing
    subgraph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    rst = subgraph.dstdata['ft']
    # activation
    if self.activation:
        rst = self.activation(rst)
    # Residual
    if self.residual:
        rst = rst + self.residual_module(feat).view(feat.shape[0], -1, self.out_feats)
    if get_attention:
        return rst, subgraph.edata['a']
    else:
        return rst
def test_partial_edge_softmax(idtype):
    g = dgl.rand_graph(30, 900)
    g = g.astype(idtype).to(F.ctx())

    score = F.randn((300, 1))
    score.requires_grad_()
    grad = F.randn((300, 1))
    import numpy as np
    eids = np.random.choice(900, 300, replace=False)
    eids = F.tensor(eids, dtype=g.idtype)

    # compute partial edge softmax
    y_1 = nn.edge_softmax(g, score, eids)
    y_1.backward(grad)
    grad_1 = score.grad
    score.grad.zero_()
    # compute edge softmax on edge subgraph
    subg = g.edge_subgraph(eids, preserve_nodes=True)
    y_2 = nn.edge_softmax(subg, score)
    y_2.backward(grad)
    grad_2 = score.grad
    score.grad.zero_()

    assert F.allclose(y_1, y_2)
    assert F.allclose(grad_1, grad_2)
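# Hedged sketch of the partial form exercised by the test above: passing eids
# restricts the softmax to that subset of edges, which the test checks is
# equivalent to running edge_softmax on the corresponding edge subgraph.
# Assumes dgl.nn.functional.edge_softmax; sizes here are illustrative only.
def example_partial_edge_softmax_sketch():
    import dgl
    import torch
    from dgl.nn.functional import edge_softmax

    g = dgl.rand_graph(10, 40)
    eids = torch.arange(0, 20)               # first half of the edge IDs
    scores = torch.randn(len(eids), 1)       # one score per selected edge
    partial = edge_softmax(g, scores, eids)  # softmax over the selected edges only
    print(partial.shape)                     # (20, 1)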
def forward(self, sg, feat):
    with sg.local_scope():
        if self.batch_norm is not None:
            feat = self.batch_norm(feat)
        q = self.fc_q(feat)
        k = self.fc_k(feat)
        v = self.fc_v(feat)
        sg.ndata.update({'q': q, 'k': k, 'v': v})
        sg.apply_edges(fn.u_add_v('q', 'k', 'e'))
        e = self.attn_e(th.sigmoid(sg.edata['e']))
        sg.edata['a'] = edge_softmax(sg, e)
        sg.update_all(fn.u_mul_e('v', 'a', 'm'), fn.sum('m', 'ft'))
        rst = sg.ndata['ft']
        if self.activation is not None:
            rst = self.activation(rst)
        return rst
def forward(self, g, ft_q, ft_k, ft_e=None, return_ev=False):
    if self.batch_norm_q is not None:
        ft_q = self.batch_norm_q(ft_q)
        ft_k = self.batch_norm_k(ft_k)
    q = self.fc_q(self.feat_drop(ft_q))
    k = self.fc_k(self.feat_drop(ft_k))
    v = self.fc_v(self.feat_drop(ft_q)).view(-1, self.num_heads, self.head_feats)
    e = F.u_add_v(g, q, k)
    if ft_e is not None:
        e = e + ft_e
    e = (self.attn_e * th.sigmoid(e)).view(
        -1, self.num_heads, self.head_feats).sum(-1, keepdim=True)
    if return_ev:
        return e, v
    a = self.attn_drop(edge_softmax(g, e))
    rst = F.u_mul_e_sum(g, v, a).view(-1, self.val_feats)
    if self.activation is not None:
        rst = self.activation(rst)
    return rst
def test_edge_softmax2(idtype, g):
    g = g.astype(idtype).to(F.ctx())
    g = g.local_var()
    g.srcdata.clear()
    g.dstdata.clear()
    g.edata.clear()
    a1 = F.randn((g.number_of_edges(), 1)).requires_grad_()
    a2 = a1.clone().detach().requires_grad_()
    g.edata['s'] = a1
    g.group_apply_edges('dst', lambda edges: {'ss': F.softmax(edges.data['s'], 1)})
    g.edata['ss'].sum().backward()

    builtin_sm = nn.edge_softmax(g, a2)
    builtin_sm.sum().backward()
    # print(a1.grad - a2.grad)
    assert len(g.srcdata) == 0
    assert len(g.dstdata) == 0
    assert len(g.edata) == 2
    assert F.allclose(a1.grad, a2.grad, rtol=1e-4, atol=1e-4)  # Follow tolerance in unittest backend
def forward(self, graph, feat):
    r"""Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.
    """
    elist = []
    graph = graph.local_var()
    h = self.feat_drop(feat)
    feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
    el = (feat * self.attn_l1).sum(dim=-1).unsqueeze(-1)
    er = (feat * self.attn_r1).sum(dim=-1).unsqueeze(-1)
    graph.ndata.update({'ft': feat, 'el': el, 'er': er})
    graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
    e = self.leaky_relu(graph.edata.pop('e'))
    e_soft = edge_softmax(graph, e)
    elist.append(e_soft)
    graph.edata['a'] = self.attn_drop(e_soft)
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    rst = graph.ndata['ft']
    if self.activation:
        rst = self.activation(rst)
    # residual
    if self.res_fc is not None:
        resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
        rst = rst + resval
    return rst, elist
def forward(self, graph, feat, ntypes, etypes):
    graph = graph.local_var()
    h = self.feat_drop(feat)
    feat = self.fc(h, ntypes).view(-1, self.num_heads, self.out_feats)
    graph.ndata.update({'ft': feat})
    graph.edata['type'] = etypes
    graph.apply_edges(self.message_func)
    e = self.activation(graph.edata.pop('msg'))
    graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    rst = graph.ndata['ft']
    # residual
    if self.res_fc is not None:
        resval = self.res_fc(h, ntypes).view(-1, self.num_heads, self.out_feats)
        rst = rst + resval
    # activation
    if self.activation:
        rst = self.activation(rst)
    return rst
def forward(self, graph, feat):
    """Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.
    """
    graph = graph.local_var()
    # feat_c = feat.clone().detach().requires_grad_(False)
    feat_c = feat
    q, k, v = self.q_proj(feat), self.k_proj(feat_c), self.v_proj(feat_c)
    q = q.view(-1, self._num_heads, self._out_feats)
    k = k.view(-1, self._num_heads, self._out_feats)
    v = v.view(-1, self._num_heads, self._out_feats)
    graph.ndata.update({'ft': v, 'el': k, 'er': q})
    # compute edge attention
    graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
    e = graph.edata.pop('e') / math.sqrt(self._out_feats)
    # compute softmax
    graph.edata['a'] = self.attn_drop(edge_softmax(graph, e)).unsqueeze(-1)
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    rst = graph.ndata['ft']
    # residual
    rst = rst.view(feat.shape) + feat
    if self._trans:
        rst = self.ln(rst)
        rst = self.ln(rst + self.FFN(rst))
    return rst
def forward(self, graph, feat, cluster_id=None, cluster_centroid=None, stat=None):
    r"""Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.
    """
    graph = graph.local_var()  # returned graph can be used in function scope
    h = self.feat_drop(feat)
    feat = self.fc(h).view(-1, self._num_heads, self._out_feats)
    if cluster_id is not None:
        # start = time.time()
        cluster_centroid = cluster_centroid.view(-1, self._num_heads, self._out_feats)
        # [6*8*8] * [1,8,8]
        # el = cluster_centroid * self.attn_l[cluster_id].sum(-1).unsqueeze(-1)
        el = (cluster_centroid * self.attn_l)
        er = (cluster_centroid * self.attn_r)
        # er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
        # print(f'cluster el/er calculation time: {time.time() - start:.7f}')
        er = er[cluster_id].sum(dim=-1).unsqueeze(-1)
        el = el[cluster_id].sum(dim=-1).unsqueeze(-1)
        # el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)  # TODO test
        # [3708*8*8] * [1,8,8]
        # el += (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
        # er += (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
    else:
        # start = time.time()
        # [3708*8*8] * [1,8,8]
        el = (feat * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat * self.attn_r).sum(dim=-1).unsqueeze(-1)
        # print(f'el/er calculation time: {time.time() - start:.7f}')
    graph.ndata.update({'ft': feat, 'el': el, 'er': er})
    # compute edge attention
    graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
    e = self.leaky_relu(graph.edata.pop('e'))
    # compute softmax
    graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))  # scale after softmax
    if stat is not None:
        stat.append(graph.edata['a'].detach().cpu().numpy())
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    embedding = graph.ndata['ft']
    # residual
    if self.res_fc is not None:
        resval = self.res_fc(h).view(h.shape[0], -1, self._out_feats)
        embedding = embedding + resval
    # activation
    if self.activation:
        return self.activation(embedding), self.activation(embedding)
        # return self.activation(embedding), embedding
        # return rst, embedding
    elif stat is not None:
        return embedding, stat
    else:
        return embedding  # output logits
def forward(self, g, node_feats, edge_feats, node_only=False):
    r"""Update node and edge representations.

    Parameters
    ----------
    g : DGLGraph
        DGLGraph for a batch of graphs
    node_feats : float32 tensor of shape (V, node_in_feats)
        Input node features. V for the number of nodes in the batch of graphs.
    edge_feats : float32 tensor of shape (E, edge_in_feats)
        Input edge features. E for the number of edges in the batch of graphs.
    node_only : bool
        Whether to update node representations only. If False, edge representations
        will be updated as well. Default to False.

    Returns
    -------
    new_node_feats : float32 tensor of shape (V, node_out_feats)
        Updated node representations.
    new_edge_feats : float32 tensor of shape (E, edge_out_feats)
        Updated edge representations.
    """
    g = g.local_var()

    #######
    g.srcdata['h'] = node_feats
    g.ndata['feat'] = node_feats
    g.apply_edges(lambda edges: {
        'e': torch.sum((torch.mul(edges.src['h'], torch.tanh(edges.dst['h']))), 1)
    })
    e = self.leaky_relu(g.edata.pop('e'))
    e_soft = edge_softmax(g, e)
    g.ndata.pop('feat')
    g.srcdata.pop('h')
    # print(e_soft.shape)
    #######

    # Update node features
    node_node_feats = self.activation(self.node_to_node(node_feats))  # torch.Size([596, 50])

    g.edata['e2n'] = self.activation(self.edge_to_node(edge_feats))
    g.update_all(fn.copy_edge('e2n', 'm'), fn.sum('m', 'e2n'))
    edge_node_feats = g.ndata.pop('e2n')  # torch.Size([596, 50])

    new_node_feats = self.activation(self.update_node(
        torch.cat([node_node_feats, edge_node_feats], dim=1)))  # torch.Size([596, 50])

    if node_only:
        return new_node_feats

    # Update edge features
    g.ndata['left_hv'] = self.left_node_to_edge(node_feats)
    g.ndata['right_hv'] = self.right_node_to_edge(node_feats)
    g.apply_edges(fn.u_add_v('left_hv', 'right_hv', 'first'))
    g.apply_edges(fn.u_add_v('right_hv', 'left_hv', 'second'))

    first_edge_feats = self.activation(g.edata.pop('first'))
    second_edge_feats = self.activation(g.edata.pop('second'))
    third_edge_feats = self.activation(self.edge_to_edge(edge_feats))

    new_edge_feats = self.activation(self.update_edge(
        torch.cat([first_edge_feats, second_edge_feats, third_edge_feats], dim=1)))

    return new_node_feats, new_edge_feats, e_soft
def forward(self, graph, feat, weight=None):
    r"""Compute graph convolution.

    Notes
    -----
    * Input shape: :math:`(N, *, \text{in_feats})` where * means any number of
      additional dimensions, :math:`N` is the number of nodes.
    * Output shape: :math:`(N, *, \text{out_feats})` where all but the last
      dimension are the same shape as the input.
    * Weight shape: :math:`(\text{in_feats}, \text{out_feats})`.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor
        The input feature
    weight : torch.Tensor, optional
        Optional external weight tensor.

    Returns
    -------
    torch.Tensor
        The output feature
    """
    graph = graph.local_var()

    if self._norm == 'both':
        degs = graph.out_degrees().to(feat.device).float().clamp(min=1)
        norm = th.pow(degs, -0.5)
        shp = norm.shape + (1,) * (feat.dim() - 1)
        norm = th.reshape(norm, shp)
        feat = feat * norm

    if weight is not None:
        if self.weight is not None:
            raise DGLError(
                'External weight is provided while at the same time the'
                ' module has defined its own weight parameter. Please'
                ' create the module with flag weight=False.')
    else:
        weight = self.weight

    # print(self._in_feats, self._out_feats)
    if self._in_feats > self._out_feats:
        # mult W first to reduce the feature size for aggregation.
        if weight is not None:
            feat = th.matmul(feat, weight)
        graph.srcdata['h'] = feat
        #######
        graph.ndata['feat'] = feat
        graph.apply_edges(lambda edges: {
            'e': th.sum((th.mul(edges.src['h'], th.tanh(edges.dst['h']))), 1)
        })
        e = self.leaky_relu(graph.edata.pop('e'))
        e_soft = edge_softmax(graph, e)
        graph.ndata.pop('feat')
        #######
        graph.update_all(fn.copy_src(src='h', out='m'),
                         fn.sum(msg='m', out='h'))
        rst = graph.dstdata['h']
    else:
        # aggregate first then mult W
        graph.srcdata['h'] = feat
        #######
        graph.ndata['feat'] = feat
        graph.apply_edges(lambda edges: {
            'e': th.sum((th.mul(edges.src['h'], th.tanh(edges.dst['h']))), 1)
        })
        e = self.leaky_relu(graph.edata.pop('e'))
        e_soft = edge_softmax(graph, e)
        graph.ndata.pop('feat')
        #######
        graph.update_all(fn.copy_src(src='h', out='m'),
                         fn.sum(msg='m', out='h'))
        rst = graph.dstdata['h']
        if weight is not None:
            rst = th.matmul(rst, weight)

    if self._norm != 'none':
        degs = graph.in_degrees().to(feat.device).float().clamp(min=1)
        if self._norm == 'both':
            norm = th.pow(degs, -0.5)
        else:
            norm = 1.0 / degs
        shp = norm.shape + (1,) * (feat.dim() - 1)
        norm = th.reshape(norm, shp)
        rst = rst * norm

    if self.bias is not None:
        rst = rst + self.bias

    if self._activation is not None:
        rst = self._activation(rst)

    return rst, e_soft
def forward(self, graph, feat):
    r"""Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor or pair of torch.Tensor
        If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})`
        where :math:`D_{in}` is size of input feature, :math:`N` is the number of
        nodes. If a pair of torch.Tensor is given, the pair must contain two tensors
        of shape :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.
    """
    with graph.local_scope():
        if isinstance(feat, tuple):
            h_src = self.feat_drop(feat[0])
            h_dst = self.feat_drop(feat[1])
            feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
            feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
        else:
            h_src = h_dst = self.feat_drop(feat)
            feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
        # NOTE: The GAT paper uses "first concatenation then linear projection"
        # to compute attention scores, while ours is "first projection then
        # addition"; the two approaches are mathematically equivalent:
        # We decompose the weight vector a mentioned in the paper into
        # [a_l || a_r], then
        # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
        # Our implementation is much more efficient because we do not need to
        # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus,
        # the addition can be optimized with DGL's built-in function u_add_v,
        # which further speeds up computation and saves memory footprint.
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        # compute softmax
        esoft = edge_softmax(graph, e)
        graph.edata['a'] = self.attn_drop(esoft)
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats)
            rst = rst + resval
        # activation
        if self.activation:
            rst = self.activation(rst)
        return rst, esoft
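# Hedged numerical check of the decomposition noted in the layer above:
# splitting the attention vector a into [a_l || a_r] makes
# a^T [Wh_i || Wh_j] equal to a_l^T Wh_i + a_r^T Wh_j, so projection-then-
# addition matches concatenation-then-projection. Tensor names and the
# feature size are illustrative only.
def example_gat_attention_decomposition_check():
    import torch

    d = 8
    a_l, a_r = torch.randn(d), torch.randn(d)
    wh_i, wh_j = torch.randn(d), torch.randn(d)

    lhs = torch.cat([a_l, a_r]) @ torch.cat([wh_i, wh_j])  # concat then project
    rhs = a_l @ wh_i + a_r @ wh_j                          # project then add
    assert torch.allclose(lhs, rhs)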
def forward(self, graph, feat, e_feat, res_attn=None):
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError(
                    "There are 0-in-degree nodes in the graph, "
                    "output for those nodes will be invalid. "
                    "This is harmful for some applications, "
                    "causing silent performance regression. "
                    "Adding self-loop on the input graph by "
                    "calling `g = dgl.add_self_loop(g)` will resolve "
                    "the issue. Setting ``allow_zero_in_degree`` "
                    "to be `True` when constructing this module will "
                    "suppress the check and let the code run.")
        if isinstance(feat, tuple):
            h_src = self.feat_drop(feat[0])
            h_dst = self.feat_drop(feat[1])
            if not hasattr(self, "fc_src"):
                self.fc_src, self.fc_dst = self.fc, self.fc
            feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
            feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
        else:
            h_src = h_dst = self.feat_drop(feat)
            feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
            if graph.is_block:
                feat_dst = feat_src[:graph.number_of_dst_nodes()]
        e_feat = self.edge_emb(e_feat)
        e_feat = self.fc_e(e_feat).view(-1, self._num_heads, self._edge_feats)
        ee = (e_feat * self.attn_e).sum(dim=-1).unsqueeze(-1)
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({"ft": feat_src, "el": el})
        graph.dstdata.update({"er": er})
        graph.edata.update({"ee": ee})
        graph.apply_edges(fn.u_add_v("el", "er", "e"))
        e = self.leaky_relu(graph.edata.pop("e") + graph.edata.pop("ee"))
        # compute softmax
        graph.edata["a"] = self.attn_drop(edge_softmax(graph, e))
        if res_attn is not None:
            graph.edata["a"] = graph.edata["a"] * (1 - self.alpha) + res_attn * self.alpha
        # message passing
        graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
        rst = graph.dstdata["ft"]
        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats)
            rst = rst + resval
        # bias
        if self.bias:
            rst = rst + self.bias_param
        # activation
        if self.activation:
            rst = self.activation(rst)
        return rst, graph.edata.pop("a").detach()