def test_remove_edges():
    def check(g1, etype, g, edges_removed):
        src, dst, eid = g.edges(etype=etype, form='all')
        src1, dst1 = g1.edges(etype=etype, order='eid')
        if etype is not None:
            eid1 = g1.edges[etype].data[dgl.EID]
        else:
            eid1 = g1.edata[dgl.EID]
        src1 = F.asnumpy(src1)
        dst1 = F.asnumpy(dst1)
        eid1 = F.asnumpy(eid1)
        src = F.asnumpy(src)
        dst = F.asnumpy(dst)
        eid = F.asnumpy(eid)
        sde_set = set(zip(src, dst, eid))
        for s, d, e in zip(src1, dst1, eid1):
            assert (s, d, e) in sde_set
        assert not np.isin(edges_removed, eid1).any()

    for fmt in ['coo', 'csr', 'csc']:
        for edges_to_remove in [[2], [2, 2], [3, 2], [1, 3, 1, 2]]:
            g = dgl.graph([(0, 1), (2, 3), (1, 2), (3, 4)], restrict_format=fmt)
            g1 = dgl.remove_edges(g, F.tensor(edges_to_remove))
            check(g1, None, g, edges_to_remove)

            g = dgl.graph(spsp.csr_matrix(
                ([1, 1, 1, 1], ([0, 2, 1, 3], [1, 3, 2, 4])), shape=(5, 5)),
                restrict_format=fmt)
            g1 = dgl.remove_edges(g, F.tensor(edges_to_remove))
            check(g1, None, g, edges_to_remove)

    g = dgl.heterograph({
        ('A', 'AA', 'A'): [(0, 1), (2, 3), (1, 2), (3, 4)],
        ('A', 'AB', 'B'): [(0, 1), (1, 3), (3, 5), (1, 6)],
        ('B', 'BA', 'A'): [(2, 3), (3, 2)]})
    g2 = dgl.remove_edges(g, {
        'AA': F.tensor([2]),
        'AB': F.tensor([3]),
        'BA': F.tensor([1])})
    check(g2, 'AA', g, [2])
    check(g2, 'AB', g, [3])
    check(g2, 'BA', g, [1])

    g3 = dgl.remove_edges(g, {
        'AA': F.tensor([]),
        'AB': F.tensor([3]),
        'BA': F.tensor([1])})
    check(g3, 'AA', g, [])
    check(g3, 'AB', g, [3])
    check(g3, 'BA', g, [1])

    g4 = dgl.remove_edges(g, {'AB': F.tensor([3, 1, 2, 0])})
    check(g4, 'AA', g, [])
    check(g4, 'AB', g, [3, 1, 2, 0])
    check(g4, 'BA', g, [])
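# A minimal usage sketch of the contract the test above checks (assumption:
# DGL >= 0.5 tensor API). ``dgl.remove_edges`` returns a modified copy of the
# graph, and the surviving edges keep their original IDs under ``dgl.EID``.
import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])))
g1 = dgl.remove_edges(g, torch.tensor([1]))   # drop edge 1 -> 2
print(g1.edges())          # (tensor([0, 2]), tensor([1, 3]))
print(g1.edata[dgl.EID])   # tensor([0, 2]) -- original IDs of the kept edges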
def construct_blocks(self, seeds, user_item_pairs_to_remove):
    blocks = []
    users, items = user_item_pairs_to_remove
    # Sampling selects the neighbors needed for each convolution layer,
    # handling both directions of the bipartite graph.
    for i in range(self.num_layers):
        sampled_graph = dgl.in_subgraph(self.graph, seeds)
        sampled_eids = sampled_graph.edges[('user', 'watched', 'item')].data[dgl.EID]
        sampled_eids_rev = sampled_graph.edges[('item', 'watchedby', 'user')].data[dgl.EID]

        # During training, remove the user-item associations being predicted.
        _, _, edges_to_remove = sampled_graph.edge_ids(
            users, items, etype=('user', 'watched', 'item'), return_uv=True)
        _, _, edges_to_remove_rev = sampled_graph.edge_ids(
            items, users, etype=('item', 'watchedby', 'user'), return_uv=True)
        # (Both edge types could also be removed in a single call by passing a
        # dict keyed by canonical edge type.)
        sampled_with_edges_removed = dgl.remove_edges(
            sampled_graph, edges_to_remove, ('user', 'watched', 'item'))
        sampled_with_edges_removed = dgl.remove_edges(
            sampled_with_edges_removed, edges_to_remove_rev, ('item', 'watchedby', 'user'))
        sampled_eids = sampled_eids[
            sampled_with_edges_removed.edges[('user', 'watched', 'item')].data[dgl.EID]]
        sampled_eids_rev = sampled_eids_rev[
            sampled_with_edges_removed.edges[('item', 'watchedby', 'user')].data[dgl.EID]]

        # Create the block for this layer.
        block = dgl.to_block(sampled_with_edges_removed, seeds)
        blocks.insert(0, block)
        seeds = {
            'user': block.srcnodes['user'].data[dgl.NID],
            'item': block.srcnodes['item'].data[dgl.NID]
        }

        # Copy the ratings over.
        block.edges[('user', 'watched', 'item')].data['rating'] = \
            self.graph.edges[('user', 'watched', 'item')].data['rating'][sampled_eids]
        block.edges[('item', 'watchedby', 'user')].data['rating'] = \
            self.graph.edges[('item', 'watchedby', 'user')].data['rating'][sampled_eids_rev]
    return blocks
def decode(g, tau, threshold, use_gt, ids=None, global_edges=None,
           global_num_nodes=None, global_peaks=None):
    # Edge filtering with tau and density
    den_key = 'density' if use_gt else 'pred_den'
    g = g.local_var()
    g.edata['edge_dist'] = get_edge_dist(g, threshold)
    # Keep an edge only if its source is denser than its destination and its
    # distance is below 1 - tau.
    g.apply_edges(lambda edges: {'keep': (edges.src[den_key] > edges.dst[den_key]).long() * \
                                         (edges.data['edge_dist'] < 1 - tau).long()})
    eids = torch.where(g.edata['keep'] == 0)[0]
    ng = dgl.remove_edges(g, eids)

    # Tree generation
    ng.edata[dgl.EID] = torch.arange(ng.number_of_edges())
    treeg = tree_generation(ng)

    # Label propagation
    peaks, pred_labels = peak_propogation(treeg)
    if ids is None:
        return pred_labels, peaks

    # Merge with previous layers
    src, dst = treeg.edges()
    new_global_edges = (global_edges[0] + ids[src.numpy()].tolist(),
                        global_edges[1] + ids[dst.numpy()].tolist())
    global_treeg = dgl.graph(new_global_edges, num_nodes=global_num_nodes)
    global_peaks, global_pred_labels = peak_propogation(global_treeg)
    return pred_labels, peaks, new_global_edges, global_pred_labels, global_peaks
def split_data(g):
    # Split edge set for training and testing
    u, v = g.edges()
    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    train_size = g.number_of_edges() - test_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Find all negative edges and split them for training and testing
    # (the explicit shape keeps adj aligned with the identity matrix below)
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())),
                        shape=(g.number_of_nodes(), g.number_of_nodes()))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)
    neg_eids = np.random.choice(len(neg_u), g.number_of_edges() // 2)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

    train_g = dgl.remove_edges(g, eids[:test_size])
    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
def sample_blocks(self, seed_edges):
    n_edges = len(seed_edges)
    seed_edges = th.LongTensor(np.asarray(seed_edges))
    heads, tails = self.g.find_edges(seed_edges)
    if self.neg_share and n_edges % self.num_negs == 0:
        neg_tails = self.neg_sampler(n_edges)
        neg_tails = (neg_tails.view(-1, 1, self.num_negs)
                     .expand(n_edges // self.num_negs, self.num_negs, self.num_negs)
                     .flatten())
        neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten()
    else:
        neg_tails = self.neg_sampler(self.num_negs * n_edges)
        neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten()

    # Maintain the correspondence between heads, tails and negative tails as two
    # graphs.
    # pos_graph contains the correspondence between each head and its positive tail.
    # neg_graph contains the correspondence between each head and its negative tails.
    # Both pos_graph and neg_graph are first constructed with the same node space as
    # the original graph. Then they are compacted together with dgl.compact_graphs.
    pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes())
    neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes())
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])

    # Obtain the node IDs being used in either pos_graph or neg_graph. Since they
    # are compacted together, pos_graph and neg_graph share the same compacted node
    # space.
    seeds = pos_graph.ndata[dgl.NID]
    blocks = []
    for fanout in self.fanouts:
        # For each seed node, sample ``fanout`` neighbors.
        frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=True)
        # Remove all edges between heads and tails, as well as heads and neg_tails.
        _, _, edge_ids = frontier.edge_ids(
            th.cat([heads, tails, neg_heads, neg_tails]),
            th.cat([tails, heads, neg_tails, neg_heads]),
            return_uv=True,
        )
        frontier = dgl.remove_edges(frontier, edge_ids)
        # Then we compact the frontier into a bipartite graph for message passing.
        block = dgl.to_block(frontier, seeds)
        # Pre-generate the CSR format so it can be used in training directly.
        block.in_degree(0)
        # Obtain the seed nodes for the next layer.
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return pos_graph, neg_graph, blocks
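# A minimal sketch of the ``dgl.compact_graphs`` step used above (assumption:
# DGL >= 0.5). Graphs sharing one node space are compacted together: nodes
# isolated in every input graph are dropped, and the outputs share a common
# relabeled node space with original IDs stored under ``dgl.NID``.
import dgl
import torch as th

g1 = dgl.graph((th.tensor([0]), th.tensor([3])), num_nodes=6)
g2 = dgl.graph((th.tensor([0]), th.tensor([5])), num_nodes=6)
c1, c2 = dgl.compact_graphs([g1, g2])
print(c1.ndata[dgl.NID])   # tensor([0, 3, 5]) (order may vary); same for c2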
def __split_edges_train_val_test(
        cls, g: dgl.DGLGraph, train_ratio: float, val_ratio: float
) -> _typing.Tuple[dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph,
                   dgl.DGLGraph, dgl.DGLGraph, dgl.DGLGraph]:
    u, v = g.edges()
    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    valid_size = int(len(eids) * val_ratio)
    test_size = int(len(eids) * (1 - train_ratio - val_ratio))
    train_size = g.number_of_edges() - test_size - valid_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    valid_pos_u, valid_pos_v = (u[eids[test_size:test_size + valid_size]],
                                v[eids[test_size:test_size + valid_size]])
    train_pos_u, train_pos_v = (u[eids[test_size + valid_size:]],
                                v[eids[test_size + valid_size:]])

    # Find all negative edges and split them for training and testing
    # (the explicit shape keeps adj aligned with the identity matrix below)
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())),
                        shape=(g.number_of_nodes(), g.number_of_nodes()))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)
    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    valid_neg_u, valid_neg_v = (neg_u[neg_eids[test_size:test_size + valid_size]],
                                neg_v[neg_eids[test_size:test_size + valid_size]])
    train_neg_u, train_neg_v = (neg_u[neg_eids[test_size + valid_size:]],
                                neg_v[neg_eids[test_size + valid_size:]])

    train_g = dgl.remove_edges(g, eids[:test_size + valid_size])
    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
    valid_pos_g = dgl.graph((valid_pos_u, valid_pos_v), num_nodes=g.number_of_nodes())
    valid_neg_g = dgl.graph((valid_neg_u, valid_neg_v), num_nodes=g.number_of_nodes())
    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
    return (train_g, train_pos_g, train_neg_g,
            valid_pos_g, valid_neg_g, test_pos_g, test_neg_g)
def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
    blocks = []
    for sampler in self.samplers:
        frontier = sampler(seeds)
        if heads is not None:
            eids = frontier.edge_ids(torch.cat([heads, heads]),
                                     torch.cat([tails, neg_tails]),
                                     return_uv=True)[2]
            if len(eids) > 0:
                old_frontier = frontier
                frontier = dgl.remove_edges(old_frontier, eids)
                frontier.edata['weights'] = \
                    old_frontier.edata['weights'][frontier.edata[dgl.EID]]
        block = compact_and_copy(frontier, seeds)
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return blocks
def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
    blocks = []
    context_dicts = []
    for sampler in self.samplers:
        frontier, context_dict = sampler(seeds)
        if heads is not None:
            # IDs of edges directly connecting the head-tail (and
            # head-negative-tail) pairs being scored
            eids = frontier.edge_ids(torch.cat([heads, heads]),
                                     torch.cat([tails, neg_tails]),
                                     return_uv=True)[2]
            if len(eids) > 0:
                # Remove those edges so the model cannot see the pairs it
                # is asked to predict
                frontier = dgl.remove_edges(frontier, eids)
        block = compact_and_copy(frontier, seeds)
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
        context_dicts.insert(0, context_dict)
    return blocks, context_dicts
def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
    blocks = []
    for sampler in self.samplers:
        frontier = sampler(seeds)
        # If sampling for pairs, remove any direct edges between the pairs
        if heads is not None:
            eids = frontier.edge_ids(
                torch.cat([heads, heads]),
                torch.cat([tails, neg_tails]),
                return_uv=True,
            )[2]
            if len(eids) > 0:
                old_frontier = frontier
                frontier = dgl.remove_edges(old_frontier, eids)
        block = compact_and_copy(frontier, seeds)
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return blocks
def extract_graph(self, G, u_id, v_id):
    v_id += self.num_user
    static_u = torch.zeros(len(self.class_values))
    static_v = torch.zeros(len(self.class_values))
    # One-hop neighbors of the target user and target item
    u_nodes, v, e_ids = G.out_edges(u_id, "all")
    u, v_nodes, e_ids = G.in_edges(v_id, "all")
    nodes = torch.cat([u, v])
    if self.testing:
        nodes = torch.cat([nodes, torch.tensor([u_id, v_id])])
    subg = G.subgraph(nodes)
    # Mark the target user/item and their neighbors with one-hot labels
    subg.ndata['node_label'] = torch.zeros([nodes.shape[0], 4])
    pid = subg.ndata[dgl.NID]
    for i in range(pid.shape[0]):
        if pid[i] == u_id:
            e_u = i
            subg.ndata['node_label'][i, 0] = 1
        elif pid[i] == v_id:
            e_v = i
            subg.ndata['node_label'][i, 1] = 1
        elif pid[i] in u:
            subg.ndata['node_label'][i, 2] = 1
        elif pid[i] in v:
            subg.ndata['node_label'][i, 3] = 1
    # During training, remove the target edge (in both directions) so the
    # model cannot see the rating it is asked to predict
    if not self.testing:
        e_ids = subg.edge_ids([e_u, e_v], [e_v, e_u])
        subg = dgl.remove_edges(subg, e_ids)
    return subg
def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
    blocks = []
    for sampler in self.samplers:
        # The frontier contains ALL nodes of the original graph
        frontier = sampler(seeds)
        if heads is not None:
            eids = frontier.edge_ids(torch.cat([heads, heads]),
                                     torch.cat([tails, neg_tails]),
                                     return_uv=True)[2]
            if len(eids) > 0:
                old_frontier = frontier
                frontier = dgl.remove_edges(old_frontier, eids)
                # frontier.edata['weights'] = \
                #     old_frontier.edata['weights'][frontier.edata[dgl.EID]]
        # The block contains only the seeds and their sampled neighbors
        block = compact_and_copy(frontier, seeds)
        # srcdata also contains the seeds' neighbors (to update the seeds,
        # the previous layer must first update those neighbors)
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return blocks
def extract_graph_new(self, G, u_id, v_id):
    v_id += self.num_user
    static_u = torch.zeros(len(self.class_values))
    static_v = torch.zeros(len(self.class_values))
    u_nodes, v, e_ids_1 = G.in_edges(v_id, "all")
    u, v_nodes, e_ids_2 = G.out_edges(u_id, "all")
    # Collect the parent-graph IDs of the target edges (u_id <-> v_id)
    e_ids = []
    nodes = torch.cat([u_nodes, v_nodes])
    for i in range(u_nodes.shape[0]):
        if u_nodes[i] == u_id:
            e_ids.append(e_ids_1[i])
    for i in range(v_nodes.shape[0]):
        if v_nodes[i] == v_id:
            e_ids.append(e_ids_2[i])
    subg = dgl.node_subgraph(G, nodes)
    subg.ndata['node_label'] = torch.zeros([subg.num_nodes(), 4])
    pid = subg.ndata[dgl.NID]
    for i in range(pid.shape[0]):
        if pid[i] == u_id:
            e_u = i
            subg.ndata['node_label'][i, 0] = 1
        elif pid[i] == v_id:
            e_v = i
            subg.ndata['node_label'][i, 1] = 1
        elif pid[i] in u_nodes:  # fixed: u/v here hold only the repeated target IDs
            subg.ndata['node_label'][i, 2] = 1
        elif pid[i] in v_nodes:
            subg.ndata['node_label'][i, 3] = 1
    # Map parent-graph edge IDs to subgraph edge IDs before removing them:
    # node_subgraph relabels edges and keeps the originals in subg.edata[dgl.EID]
    e_ids = torch.stack(e_ids)
    sub_eids = torch.where(torch.isin(subg.edata[dgl.EID], e_ids))[0]
    subg = dgl.remove_edges(subg, sub_eids)
    return subg
def obtain_Bs(self, ed_ids):
    n_edges = len(ed_ids)
    ed_ids = torch.LongTensor(np.asarray(ed_ids))
    heads, tails = self.g.find_edges(ed_ids)
    neg_tails = self.weights.multinomial(self.num_negs * n_edges, replacement=True)
    neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten()
    pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes())
    neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes())
    pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
    ids = pos_graph.ndata[dgl.NID]
    B = []
    for s in self.fanout:
        # Returns the sampled graph: nodes unchanged, only sampled edges kept
        nf = sample_neighbors(self.g, nodes=ids, fanout=s, replace=True)
        _, _, edge_ids = nf.edge_ids(
            torch.cat([heads, tails, neg_heads, neg_tails]),
            torch.cat([tails, heads, neg_tails, neg_heads]),
            return_uv=True)
        # Remove the edges used by the loss; forward propagation uses the rest
        nf = dgl.remove_edges(nf, edge_ids)
        # Convert to a bipartite block for easy access to src and dst nodes,
        # with the next layer's nodes as dst
        b = dgl.to_block(nf, ids)
        # The block's source nodes become the IDs for the previous layer
        ids = b.srcdata[dgl.NID]
        # Insert at the front of the list
        B.insert(0, b)
    return pos_graph, neg_graph, B
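# A minimal sketch of the ``dgl.to_block`` conversion used above (assumption:
# DGL >= 0.5). The frontier becomes a bipartite block whose dst nodes are the
# seeds; src nodes list the seeds first, then their sampled neighbors.
import dgl
import torch

frontier = dgl.graph((torch.tensor([1, 2]), torch.tensor([0, 0])), num_nodes=3)
block = dgl.to_block(frontier, dst_nodes=torch.tensor([0]))
print(block.srcdata[dgl.NID])   # tensor([0, 1, 2]) -- seeds first
print(block.dstdata[dgl.NID])   # tensor([0])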
def split_train_test(g, train, val):
    u, v = g.edges()
    eids = np.arange(g.number_of_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * (1 - train - val))
    val_size = int(len(eids) * val)
    train_size = g.number_of_edges() - test_size - val_size
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    val_pos_u, val_pos_v = (u[eids[test_size:test_size + val_size]],
                            v[eids[test_size:test_size + val_size]])
    train_pos_u, train_pos_v = u[eids[test_size + val_size:]], v[eids[test_size + val_size:]]

    # Find all negative edges and split them for training and testing
    # (the explicit shape keeps adj aligned with the identity matrix below)
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())),
                        shape=(g.number_of_nodes(), g.number_of_nodes()))
    adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)
    neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    val_neg_u, val_neg_v = (neg_u[neg_eids[test_size:test_size + val_size]],
                            neg_v[neg_eids[test_size:test_size + val_size]])
    train_neg_u, train_neg_v = (neg_u[neg_eids[test_size + val_size:]],
                                neg_v[neg_eids[test_size + val_size:]])

    train_g = dgl.add_self_loop(dgl.remove_edges(g, eids[:test_size + val_size]))
    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
    val_pos_g = dgl.graph((val_pos_u, val_pos_v), num_nodes=g.number_of_nodes())
    val_neg_g = dgl.graph((val_neg_u, val_neg_v), num_nodes=g.number_of_nodes())
    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
    return train_g, train_pos_g, train_neg_g, val_pos_g, val_neg_g, test_pos_g, test_neg_g
def test_hetero_conv(agg, idtype):
    g = dgl.heterograph({
        ('user', 'follows', 'user'): ([0, 0, 2, 1], [1, 2, 1, 3]),
        ('user', 'plays', 'game'): ([0, 0, 0, 1, 2], [0, 2, 3, 0, 2]),
        ('store', 'sells', 'game'): ([0, 0, 1, 1], [0, 3, 1, 2])},
        idtype=idtype, device=F.ctx())
    conv = nn.HeteroGraphConv({
        'follows': nn.GraphConv(2, 3, allow_zero_in_degree=True),
        'plays': nn.GraphConv(2, 4, allow_zero_in_degree=True),
        'sells': nn.GraphConv(3, 4, allow_zero_in_degree=True)},
        agg)
    conv = conv.to(F.ctx())

    # test pickle
    th.save(conv, tmp_buffer)

    uf = F.randn((4, 2))
    gf = F.randn((4, 4))
    sf = F.randn((2, 3))

    h = conv(g, {'user': uf, 'game': gf, 'store': sf})
    assert set(h.keys()) == {'user', 'game'}
    if agg != 'stack':
        assert h['user'].shape == (4, 3)
        assert h['game'].shape == (4, 4)
    else:
        assert h['user'].shape == (4, 1, 3)
        assert h['game'].shape == (4, 2, 4)

    block = dgl.to_block(g.to(F.cpu()),
                         {'user': [0, 1, 2, 3], 'game': [0, 1, 2, 3], 'store': []}).to(F.ctx())
    h = conv(block, ({'user': uf, 'game': gf, 'store': sf},
                     {'user': uf, 'game': gf, 'store': sf[0:0]}))
    assert set(h.keys()) == {'user', 'game'}
    if agg != 'stack':
        assert h['user'].shape == (4, 3)
        assert h['game'].shape == (4, 4)
    else:
        assert h['user'].shape == (4, 1, 3)
        assert h['game'].shape == (4, 2, 4)

    h = conv(block, {'user': uf, 'game': gf, 'store': sf})
    assert set(h.keys()) == {'user', 'game'}
    if agg != 'stack':
        assert h['user'].shape == (4, 3)
        assert h['game'].shape == (4, 4)
    else:
        assert h['user'].shape == (4, 1, 3)
        assert h['game'].shape == (4, 2, 4)

    # test with mod args
    class MyMod(th.nn.Module):
        def __init__(self, s1, s2):
            super(MyMod, self).__init__()
            self.carg1 = 0
            self.carg2 = 0
            self.s1 = s1
            self.s2 = s2

        def forward(self, g, h, arg1=None, *, arg2=None):
            if arg1 is not None:
                self.carg1 += 1
            if arg2 is not None:
                self.carg2 += 1
            return th.zeros((g.number_of_dst_nodes(), self.s2))

    mod1 = MyMod(2, 3)
    mod2 = MyMod(2, 4)
    mod3 = MyMod(3, 4)
    conv = nn.HeteroGraphConv({'follows': mod1, 'plays': mod2, 'sells': mod3}, agg)
    conv = conv.to(F.ctx())
    mod_args = {'follows': (1,), 'plays': (1,)}
    mod_kwargs = {'sells': {'arg2': 'abc'}}
    h = conv(g, {'user': uf, 'game': gf, 'store': sf},
             mod_args=mod_args, mod_kwargs=mod_kwargs)
    assert mod1.carg1 == 1
    assert mod1.carg2 == 0
    assert mod2.carg1 == 1
    assert mod2.carg2 == 0
    assert mod3.carg1 == 0
    assert mod3.carg2 == 1

    # conv on a graph without any edges
    for etype in g.etypes:
        g = dgl.remove_edges(g, g.edges(form='eid', etype=etype), etype=etype)
    assert g.num_edges() == 0
    h = conv(g, {'user': uf, 'game': gf, 'store': sf})
    assert set(h.keys()) == {'user', 'game'}

    block = dgl.to_block(g.to(F.cpu()),
                         {'user': [0, 1, 2, 3], 'game': [0, 1, 2, 3], 'store': []}).to(F.ctx())
    h = conv(block, ({'user': uf, 'game': gf, 'store': sf},
                     {'user': uf, 'game': gf, 'store': sf[0:0]}))
    assert set(h.keys()) == {'user', 'game'}
train_neg_u, train_neg_v = neg_u[neg_eids[train_size:]], neg_v[neg_eids[train_size:]]

######################################################################
# When training, you will need to remove the edges in the test set from
# the original graph. You can do this via ``dgl.remove_edges``.
#
# .. note::
#
#    ``dgl.remove_edges`` works by creating a subgraph from the
#    original graph, resulting in a copy, and could therefore be slow for
#    large graphs. If so, you could save the training and test graphs to
#    disk, as you would do for preprocessing.
#

train_g = dgl.remove_edges(g, eids[:test_size])

######################################################################
# Define a GraphSAGE model
# ------------------------
#
# This tutorial builds a model consisting of two
# `GraphSAGE <https://arxiv.org/abs/1706.02216>`__ layers, each of which
# computes new node representations by averaging neighbor information.
# DGL provides ``dgl.nn.SAGEConv``, which conveniently creates a
# GraphSAGE layer.
#

from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
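######################################################################
# As a sketch of what such a model can look like (a minimal two-layer
# example; the class name and layer sizes are illustrative, not taken
# from the surrounding code):
#

import torch.nn as nn
import torch.nn.functional as Fn

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')  # mean neighbor aggregation
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = Fn.relu(self.conv1(g, in_feat))
        return self.conv2(g, h)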
def dglMainMovielens(layers, batch_size, epochs, hidden_dims, topK):
    # Load the data
    train_data, test_data, user_data, item_data = get_ml_100k()
    # Convert the user and item ID columns to categoricals
    train_data = train_data.astype({'user_id': 'category', 'item_id': 'category'})
    test_data = test_data.astype({'user_id': 'category', 'item_id': 'category'})
    # Keep the train and test sets consistent
    # test_data['user_id'].cat.set_categories(train_data['user_id'].cat.categories, inplace=True)
    # test_data['item_id'].cat.set_categories(train_data['item_id'].cat.categories, inplace=True)

    # Map the original IDs to 0-based relative IDs
    train_user_ids = torch.LongTensor(train_data['user_id'].cat.codes.values)
    train_item_ids = torch.LongTensor(train_data['item_id'].cat.codes.values)
    train_ratings = torch.LongTensor(train_data['rating'].values)

    # Build the fully connected graph: collect the user IDs and item IDs,
    # keeping only items seen in training
    all_user_ids = list(set(train_user_ids.tolist()))
    all_item_ids = list(set(train_item_ids.tolist()))
    all_user_ids = [val for val in all_user_ids for i in range(len(all_item_ids))]
    all_item_ids = all_item_ids * len(set(all_user_ids))

    # Dictionaries mapping the relative IDs back to the original IDs
    user_dict = dict(zip(train_user_ids.tolist(), train_data['user_id'].values.tolist()))
    item_dict = dict(zip(train_item_ids.tolist(), train_data['item_id'].values.tolist()))

    # test_user_ids = torch.LongTensor(test_data['user_id'].cat.codes.values)
    # test_item_ids = torch.LongTensor(test_data['item_id'].cat.codes.values)
    # test_ratings = torch.LongTensor(test_data['rating'].values)

    # Build the heterogeneous graph
    graph = dgl.heterograph({
        ('user', 'watched', 'item'): (train_user_ids, train_item_ids),
        ('item', 'watchedby', 'user'): (train_item_ids, train_user_ids)
    })

    # Align the user and item tables with the training data
    user_data[0] = user_data[0].astype('category')
    user_data[0] = user_data[0].cat.set_categories(train_data['user_id'].cat.categories)
    user_data = user_data.dropna(subset=[0])
    user_data[0] = user_data[0].cat.codes
    user_data = user_data.sort_values(0)

    item_data[0] = item_data[0].astype('category')
    item_data[0] = item_data[0].cat.set_categories(train_data['item_id'].cat.categories)
    item_data = item_data.dropna(subset=[0])
    item_data[0] = item_data[0].cat.codes
    item_data = item_data.sort_values(0)

    # Process user age/gender/occupation and the item genre one-hot vectors
    user_data[2] = user_data[2].astype('category')
    user_data[3] = user_data[3].astype('category')
    # user_data[4] = user_data[4].astype('category')
    # user_age = user_data[1].values // 10
    # num_user_age_bins = user_age.max() + 1
    user_gender = user_data[2].cat.codes.values
    num_user_genders = len(user_data[2].cat.categories)
    # user_occupation = user_data[3].cat.codes.values
    # num_user_occupations = len(user_data[3].cat.categories)
    item_genres = item_data[range(5, 24)].values
    num_item_genres = item_genres.shape[1]

    # Attach these features to the graph's nodes
    # graph.nodes['user'].data['age'] = torch.LongTensor(user_age)
    graph.nodes['user'].data['gender'] = torch.LongTensor(user_gender)
    # graph.nodes['user'].data['occupation'] = torch.LongTensor(user_occupation)
    graph.nodes['item'].data['genres'] = torch.FloatTensor(item_genres)

    # Using just the edge-type name should work, but it raises an error,
    # so the full canonical edge type is used instead
    graph.edges[('item', 'watchedby', 'user')].data['rating'] = torch.LongTensor(train_ratings)
    graph.edges[('user', 'watched', 'item')].data['rating'] = torch.LongTensor(train_ratings)
    # target_graph.edges[('item', 'watchedby', 'user')].data['rating'] = torch.LongTensor(train_ratings)
    # target_graph.edges[('user', 'watched', 'item')].data['rating'] = torch.LongTensor(train_ratings)

    # Build the fully connected user-item graph
    all_graph = dgl.heterograph({
        ('user', 'watched', 'item'): (all_user_ids, all_item_ids)
    })
    # real_data is kept so the original IDs can be restored later
    real_data = torch.tensor(list(zip(all_user_ids, all_item_ids)), dtype=torch.int)
    all_graph.edata['real_data'] = real_data

    # ---------------- Remove historical interactions from the fully connected graph ----------------
    # During training, the known user-item associations must be removed
    seeds = {
        'user': list(set(train_user_ids.tolist())),
        'item': list(set(train_item_ids.tolist()))
    }
    sampled_graph = all_graph.in_subgraph(seeds)
    _, _, edges_to_remove = sampled_graph.edge_ids(
        train_user_ids, train_item_ids,
        etype=('user', 'watched', 'item'), return_uv=True)
    # _, _, edges_to_remove_rev = graph.edge_ids(
    #     train_item_ids, train_user_ids, etype=('item', 'watchedby', 'user'), return_uv=True)
    # sampled_with_edges_removed = dgl.remove_edges(
    #     sampled_graph,
    #     {('user', 'watched', 'item'): edges_to_remove,
    #      ('item', 'watchedby', 'user'): edges_to_remove_rev})
    target_graph = dgl.remove_edges(sampled_graph, edges_to_remove,
                                    ('user', 'watched', 'item'))
    # target_graph = dgl.remove_edges(target_graph, edges_to_remove_rev,
    #                                 ('item', 'watchedby', 'user'))
    # ---------------- Remove historical interactions from the fully connected graph ----------------
    # target_graph.nodes['user'].data['gender'] = torch.LongTensor(user_gender)
    # target_graph.nodes['item'].data['genres'] = torch.FloatTensor(item_genres)

    # Build the dataset
    train_dataset = TensorDataset(train_user_ids, train_item_ids, train_ratings)
    # test_dataset = TensorDataset(test_user_ids, test_item_ids, test_ratings)

    # Training configuration
    NUM_LAYERS = layers
    BATCH_SIZE = batch_size
    NUM_EPOCHS = epochs
    HIDDEN_DIMS = hidden_dims
    sampler = MinibatchSampler(graph, NUM_LAYERS)

    # Prepare the data loader
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  collate_fn=sampler.sample, shuffle=True)
    # test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
    #                              collate_fn=sampler.sample, shuffle=False)

    # Build the model
    model = GCMCRating2(graph.number_of_nodes('user'),
                        graph.number_of_nodes('item'),
                        HIDDEN_DIMS, 5, NUM_LAYERS,
                        num_user_genders, num_item_genres)
    # Use the Adam optimizer
    opt = torch.optim.Adam(model.parameters())

    # Start training
    epoch = 0
    for _ in range(NUM_EPOCHS):
        model.train()
        # Use a progress bar for visibility
        with tqdm.tqdm(train_dataloader) as t:
            # with torch.no_grad():
            predictions = []
            ratings = []
            # real_data = []
            for pair_graph, blocks in t:
                # real_data.append(pair_graph.edata['real_data'])
                user_emb, item_emb = model(blocks)
                prediction = model.compute_score(pair_graph, user_emb, item_emb)
                loss = ((prediction - pair_graph.edata['rating']) ** 2).mean()
                opt.zero_grad()
                loss.backward()
                opt.step()
                t.set_postfix({'loss': '%.4f' % loss.item()}, refresh=False)
                ratings.append(pair_graph.edata['rating'])
                predictions.append(prediction)
            # predictions = torch.cat(predictions, 0)
            # ratings = torch.cat(ratings, 0)
            # real_data = torch.cat(real_data, 0)

        model.eval()
        epoch += 1
        if epoch % 10 == 0:
            # ---------------- Convert the full graph into a block ----------------
            train_blocks = []
            block = dgl.to_block(graph)
            # Copy the ratings over
            # block.edges[('user', 'watched', 'item')].data['rating'] = \
            #     graph.edges[('user', 'watched', 'item')].data['rating']
            # block.edges[('item', 'watchedby', 'user')].data['rating'] = \
            #     graph.edges[('item', 'watchedby', 'user')].data['rating']
            train_blocks.insert(0, block)
            # ---------------- Convert the full graph into a block ----------------
            # Feed train_blocks into the model
            user_emb, item_emb = model(train_blocks)
            # Compute the predicted ratings on target_graph
            prediction = model.compute_score(target_graph, user_emb, item_emb)
            # ---------------- Restore the original user and item IDs ----------------
            real_data = target_graph.edata['real_data']
            real_data = pd.DataFrame(real_data.tolist())
            real_data.columns = ['user_id', 'item_id']
            real_user = real_data['user_id'].values.tolist()
            real_uid = [user_dict[k] for k in real_user]
            real_data['user_id'] = real_uid
            real_item = real_data['item_id'].values.tolist()
            real_uid = [item_dict[k] for k in real_item]
            real_data['item_id'] = real_uid
            # ---------------- Restore the original user and item IDs ----------------

            # Put the endpoint nodes of each edge into result
            result = real_data
            # Flatten the predictions, dropping the per-element brackets
            predictions = np.ravel(prediction.tolist())
            # predictions = sum(prediction.tolist(), [])
            # Attach the predicted edge values to result['rating']
            result['rating'] = predictions
            # Group by user_id and sort by rating
            result = result.groupby('user_id').apply(
                lambda x: x.sort_values(by="rating", ascending=False)).reset_index(drop=True)
            result.to_csv('file_saved/ml-DGLresult.csv', index=None)
            # -----------------------------------------------------------------
            print(result)
            result.to_csv('new_saved/dgl/ml-DGLresult-epoch{}.csv'.format(epoch),
                          index=None)
            m1 = pd.DataFrame(model.W.weight.tolist())
            m2 = pd.DataFrame(model.V.weight.tolist())
            m1.to_csv('new_saved/dgl/ml-GCMC-W-epoch{}.csv'.format(epoch),
                      index=None, header=None)
            m2.to_csv('new_saved/dgl/ml-GCMC-V-epoch{}.csv'.format(epoch),
                      index=None, header=None)
            for i in range(layers):
                l1 = pd.DataFrame(
                    model.layers[i].heteroconv.mods['watchedby'].W_r.flatten(1).tolist())
                l2 = pd.DataFrame(
                    model.layers[i].heteroconv.mods['watchedby'].W.weight.tolist())
                l1.to_csv('new_saved/dgl/ml-GCMCConv-W_r-epoch{}-layer{}.csv'.format(epoch, i),
                          index=None, header=None)
                l2.to_csv('new_saved/dgl/ml-GCMCConv-W-epoch{}-layer{}.csv'.format(epoch, i),
                          index=None, header=None)

    recommend(item_data_for_recommend, topK, FSLflag)
def dglMainFSL(layers, batch_size, epochs, hidden_dims, topK):
    # Load the data from the initial input
    train_data, user_data, item_data, classify_data = get_bigraph()
    train_data = train_data.astype({'user_id': 'str', 'item_id': 'str'})
    item_data_for_recommend = item_data.copy()
    rating_count = train_data['rating'].value_counts().values.tolist()
    NUM_RATINGS = len(rating_count)
    classify_id = classify_data['classify_id'].values.tolist()
    classify_num = len(set(classify_id))

    # Convert the user and item ID columns to categoricals
    train_data = train_data.astype({'user_id': 'category', 'item_id': 'category'})
    # Keep the train and test data consistent.
    # These two steps extract 0-based relative IDs from the data, used later
    # to build the dataset and the heterogeneous graph
    train_user_ids = torch.LongTensor(train_data['user_id'].cat.codes.values)
    train_item_ids = torch.LongTensor(train_data['item_id'].cat.codes.values)
    train_data['rating'] = train_data['rating'].astype(float)
    train_ratings = torch.LongTensor(train_data['rating'].values)

    # Build the fully connected graph: collect the user IDs and item IDs,
    # keeping only items seen in training
    all_user_ids = list(set(train_user_ids.tolist()))
    all_item_ids = list(set(train_item_ids.tolist()))
    all_user_ids = [val for val in all_user_ids for i in range(len(all_item_ids))]
    all_item_ids = all_item_ids * len(set(all_user_ids))

    # Dictionaries mapping the relative IDs back to the original IDs
    user_dict = dict(zip(train_user_ids.tolist(), train_data['user_id'].values.tolist()))
    item_dict = dict(zip(train_item_ids.tolist(), train_data['item_id'].values.tolist()))

    # Build the heterogeneous graph
    graph = dgl.heterograph({
        ('user', 'watched', 'item'): (train_user_ids, train_item_ids),
        ('item', 'watchedby', 'user'): (train_item_ids, train_user_ids)
    })

    # Align the user and item tables with the training data
    user_data[0] = user_data[0].astype('category')
    user_data[0] = user_data[0].cat.set_categories(
        train_data['user_id'].astype('category').cat.categories)
    # After resetting the categories, the training set may cover fewer
    # categories than the full user table, producing NaNs;
    # drop the rows whose column 0 is NaN
    user_data = user_data.dropna(subset=[0])
    # cat.codes maps the long original user IDs to 0-based positional integers
    user_data[0] = user_data[0].cat.codes
    user_data = user_data.sort_values(0)

    # The same shrinking step for items, so the index values in column 0
    # match the actual number of items
    item_data[0] = item_data[0].astype('category')
    item_data[0] = item_data[0].cat.set_categories(train_data['item_id'].cat.categories)
    item_data = item_data.dropna(subset=[0])
    item_data[0] = item_data[0].cat.codes
    item_data = item_data.sort_values(0)

    # Process the user and item one-hot feature vectors
    user_data[1] = user_data[1].astype('category')
    user_gender = user_data[1].cat.codes.values
    num_user_genders = len(user_data[1].cat.categories)
    item_data[1] = item_data[1].astype('category')
    item_genres = item_data[1].cat.codes.values
    num_item_genres = len(item_data[1].cat.categories)

    # Attach these features to the graph's nodes
    graph.nodes['user'].data['gender'] = torch.LongTensor(user_gender)
    graph.nodes['item'].data['genres'] = torch.LongTensor(item_genres)

    # Using just the edge-type name should work, but it raises an error,
    # so the full canonical edge type is used instead
    graph.edges[('item', 'watchedby', 'user')].data['rating'] = torch.LongTensor(train_ratings)
    graph.edges[('user', 'watched', 'item')].data['rating'] = torch.LongTensor(train_ratings)

    # -------------------------------------------------------------------------
    # Build the fully connected user-item graph
    all_graph = dgl.heterograph({
        ('user', 'watched', 'item'): (all_user_ids, all_item_ids)
    })
    # real_data is kept so the original IDs can be restored later
    real_data = torch.tensor(list(zip(all_user_ids, all_item_ids)), dtype=torch.int)
    all_graph.edata['real_data'] = real_data

    # ---------------- Remove historical interactions from the fully connected graph ----------------
    # During training, the known user-item associations must be removed
    seeds = {
        'user': list(set(train_user_ids.tolist())),
        'item': list(set(train_item_ids.tolist()))
    }
    sampled_graph = all_graph.in_subgraph(seeds)
    _, _, edges_to_remove = sampled_graph.edge_ids(
        train_user_ids, train_item_ids,
        etype=('user', 'watched', 'item'), return_uv=True)
    # _, _, edges_to_remove_rev = graph.edge_ids(
    #     train_item_ids, train_user_ids, etype=('item', 'watchedby', 'user'), return_uv=True)
    # sampled_with_edges_removed = dgl.remove_edges(
    #     sampled_graph,
    #     {('user', 'watched', 'item'): edges_to_remove,
    #      ('item', 'watchedby', 'user'): edges_to_remove_rev})
    target_graph = dgl.remove_edges(sampled_graph, edges_to_remove,
                                    ('user', 'watched', 'item'))
    # target_graph = dgl.remove_edges(target_graph, edges_to_remove_rev,
    #                                 ('item', 'watchedby', 'user'))
    # ---------------- Remove historical interactions from the fully connected graph ----------------
    # target_graph.nodes['user'].data['gender'] = torch.LongTensor(user_gender)
    # target_graph.nodes['item'].data['genres'] = torch.FloatTensor(item_genres)
    # -------------------------------------------------------------------------

    # Build the dataset
    train_dataset = TensorDataset(train_user_ids, train_item_ids, train_ratings)

    # Training configuration
    NUM_LAYERS = layers
    BATCH_SIZE = batch_size
    NUM_EPOCHS = epochs
    HIDDEN_DIMS = hidden_dims
    sampler = MinibatchSampler(graph, NUM_LAYERS)

    # Prepare the data loader
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  collate_fn=sampler.sample, shuffle=True)

    # Build the model
    model = GCMCRating1(graph.number_of_nodes('user'),
                        graph.number_of_nodes('item'),
                        HIDDEN_DIMS, NUM_RATINGS, NUM_LAYERS,
                        num_user_genders, num_item_genres)
    # Use the Adam optimizer
    opt = torch.optim.Adam(model.parameters())

    epoch = 0
    # Start training
    for _ in range(NUM_EPOCHS):
        model.train()
        # Use a progress bar for visibility
        with tqdm.tqdm(train_dataloader) as t:
            predictions = []
            # ratings = []
            # real_data = []
            for pair_graph, blocks in t:
                # real_data.append(pair_graph.edata['real_data'])
                user_emb, item_emb = model(blocks)
                prediction = model.compute_score(pair_graph, user_emb, item_emb)
                loss = ((prediction - pair_graph.edata['rating']) ** 2).mean()
                opt.zero_grad()
                loss.backward()
                opt.step()
                t.set_postfix({'loss': '%.4f' % loss.item()}, refresh=False)
                # ratings.append(pair_graph.edata['rating'])
                predictions.append(prediction)
            predictions = torch.cat(predictions, 0)
            # ratings = torch.cat(ratings, 0)
            # real_data = torch.cat(real_data, 0)

        model.eval()
        epoch += 1
        if epoch % 10 == 0:
            # ---------------- Convert the full graph into a block ----------------
            train_blocks = []
            block = dgl.to_block(graph)
            # Copy the ratings over
            # block.edges[('user', 'watched', 'item')].data['rating'] = \
            #     graph.edges[('user', 'watched', 'item')].data['rating']
            # block.edges[('item', 'watchedby', 'user')].data['rating'] = \
            #     graph.edges[('item', 'watchedby', 'user')].data['rating']
            train_blocks.insert(0, block)
            # ---------------- Convert the full graph into a block ----------------
            # Feed train_blocks into the model
            user_emb, item_emb = model(train_blocks)
            # Compute the predicted ratings on target_graph
            prediction = model.compute_score(target_graph, user_emb, item_emb)

            # ---------------- Restore the original user and item IDs ----------------
            real_data = target_graph.edata['real_data']
            real_data = pd.DataFrame(real_data.tolist())
            real_data.columns = ['user_id', 'item_id']
            real_user = real_data['user_id'].values.tolist()
            real_uid = [user_dict[k] for k in real_user]
            real_data['user_id'] = real_uid
            real_item = real_data['item_id'].values.tolist()
            real_uid = [item_dict[k] for k in real_item]
            real_data['item_id'] = real_uid
            # ---------------- Restore the original user and item IDs ----------------

            # Put the endpoint nodes of each edge into result
            result = real_data
            # Flatten the predictions, dropping the per-element brackets
            predictions = np.ravel(prediction.tolist())
            # predictions = sum(prediction.tolist(), [])
            # Attach the predicted edge values to result['rating']
            result['rating'] = predictions
            # Group by user_id and sort by rating
            result = result.groupby('user_id').apply(
                lambda x: x.sort_values(by="rating", ascending=False)).reset_index(drop=True)
            result.to_csv('file_saved/fsl-DGLresult.csv', index=None)
            # -----------------------------------------------------------------
            print(result)
            result.to_csv('new_saved/dgl/fsl-DGLresult-epoch{}.csv'.format(epoch),
                          index=None)
            m1 = pd.DataFrame(model.W.weight.tolist())
            m2 = pd.DataFrame(model.V.weight.tolist())
            m1.to_csv('new_saved/dgl/fsl-GCMC-W-epoch{}.csv'.format(epoch), index=None)
            m2.to_csv('new_saved/dgl/fsl-GCMC-V-epoch{}.csv'.format(epoch), index=None)
            for i in range(layers):
                l1 = pd.DataFrame(
                    model.layers[i].heteroconv.mods['watchedby'].W_r.flatten(1).tolist())
                l2 = pd.DataFrame(
                    model.layers[i].heteroconv.mods['watchedby'].W.weight.tolist())
                l1.to_csv('new_saved/dgl/fsl-GCMCConv-W_r-epoch{}-layer{}.csv'.format(epoch, i),
                          index=None)
                l2.to_csv('new_saved/dgl/fsl-GCMCConv-W-epoch{}-layer{}.csv'.format(epoch, i),
                          index=None)

    recommend(item_data_for_recommend, topK, FSLflag, classify_num=classify_num)