def create_test_heterograph():
    """Build the small heterograph shared by the tests in this file.

    The graph has 3 users, 2 games and 2 developers, connected by four
    relations::

        ('user', 'follows', 'user')
        ('user', 'plays', 'game')
        ('user', 'wishes', 'game')
        ('developer', 'develops', 'game')

    The relations are created from an edge list, a SciPy COO matrix and a
    NetworkX graph before being merged with ``dgl.hetero_from_relations``.
    """
    # user -> user, from an edge list
    follows_rel = dgl.graph([(0, 1), (1, 2)], 'user', 'follows')

    # user -> game, from a SciPy COO matrix (rows are users, columns are games)
    user_rows = [0, 1, 2, 1]
    game_cols = [0, 0, 1, 1]
    plays_rel = dgl.bipartite(
        ssp.coo_matrix(([1, 1, 1, 1], (user_rows, game_cols))),
        'user', 'plays', 'game')

    # user -> game, from a NetworkX graph whose nodes carry a ``bipartite`` flag
    wish_graph = nx.DiGraph()
    wish_graph.add_nodes_from(['u0', 'u1', 'u2'], bipartite=0)
    wish_graph.add_nodes_from(['g0', 'g1'], bipartite=1)
    wish_graph.add_edges_from([('u0', 'g1', {'id': 0}), ('u2', 'g0', {'id': 1})])
    wishes_rel = dgl.bipartite(wish_graph, 'user', 'wishes', 'game')

    # developer -> game, from an edge list
    develops_rel = dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game')

    return dgl.hetero_from_relations(
        [follows_rel, plays_rel, wishes_rel, develops_rel])
def create_heterographs2(index_dtype):
    """Create three relation graphs with features and their merged heterograph.

    Parameters
    ----------
    index_dtype
        Forwarded as ``index_dtype`` to every graph constructor.

    Returns
    -------
    list
        ``[g, g_x, g_y, g_z]`` where ``g`` is the heterograph built from the
        ``follows`` graph ``g_x``, the user-user ``knows`` graph ``g_y`` and the
        user-knowledge ``knows`` graph ``g_z``.
    """
    follows = dgl.graph(
        ([0, 1, 2], [1, 2, 3]), 'user', 'follows',
        index_dtype=index_dtype, restrict_format='any')
    follows.nodes['user'].data['h'] = F.randn((4, 3))
    follows.edges['follows'].data['w'] = F.randn((3, 2))

    knows = dgl.graph(
        ([0, 2], [2, 3]), 'user', 'knows',
        index_dtype=index_dtype, restrict_format='csr')
    knows.nodes['user'].data['hh'] = F.ones((4, 5))
    knows.edges['knows'].data['ww'] = F.randn((2, 10))

    # This relation carries no features.
    knows_knowledge = dgl.bipartite(
        ([0, 1, 3], [2, 3, 4]), 'user', 'knows', 'knowledge',
        index_dtype=index_dtype)

    merged = dgl.hetero_from_relations([follows, knows, knows_knowledge])
    return [merged, follows, knows, knows_knowledge]
def test_in_subgraph(index_dtype):
    """Check ``dgl.in_subgraph`` on a heterograph with four relations.

    The seeds are users 0 and 1 and game 0. For every relation the subgraph
    must contain exactly the edges pointing at a seed node, and its ``dgl.EID``
    edge data must match the edge IDs of those edges in the parent graph.
    No ``coin`` node is seeded, so ``flips`` must come back empty.
    """
    relations = [
        dgl.graph([(1,0),(2,0),(3,0),(0,1),(2,1),(3,1),(0,2)],
                  'user', 'follow', index_dtype=index_dtype),
        dgl.bipartite([(0,0),(0,1),(1,2),(3,2)],
                      'user', 'play', 'game', index_dtype=index_dtype),
        dgl.bipartite([(2,0),(2,1),(2,2),(1,0),(1,3),(0,0)],
                      'game', 'liked-by', 'user', index_dtype=index_dtype),
        dgl.bipartite([(0,0),(1,0),(2,0),(3,0)],
                      'user', 'flips', 'coin', index_dtype=index_dtype),
    ]
    hg = dgl.hetero_from_relations(relations)
    subg = dgl.in_subgraph(hg, {'user' : [0,1], 'game' : 0})

    assert subg._idtype_str == index_dtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    # (relation name, edges expected to survive in the in-subgraph)
    expected_in_edges = (
        ('follow', {(1,0),(2,0),(3,0),(0,1),(2,1),(3,1)}),
        ('play', {(0,0)}),
        ('liked-by', {(2,0),(2,1),(1,0),(0,0)}),
    )
    for etype, expected in expected_in_edges:
        src, dst = subg[etype].edges()
        found = set(zip(list(F.asnumpy(src)), list(F.asnumpy(dst))))
        assert F.array_equal(hg[etype].edge_ids(src, dst), subg[etype].edata[dgl.EID])
        assert found == expected

    assert subg['flips'].number_of_edges() == 0
def test_metapath_random_walk(idtype):
    """Check that metapath random walks only follow existing edges.

    Three walks are sampled per seed along the metapath ``ab, ba`` repeated
    four times. With the seed prepended, every consecutive ``a -> b`` step must
    be an edge of the ``ab`` graph and every ``b -> a`` step an edge of the
    ``ba`` graph.
    """
    ab_graph = dgl.bipartite(([0, 1, 2, 3], [0, 1, 2, 3]), 'a', 'ab', 'b', idtype=idtype)
    ba_graph = dgl.bipartite(
        ([0, 0, 1, 1, 2, 2, 3, 3], [1, 3, 2, 0, 3, 1, 0, 2]), 'b', 'ba', 'a', idtype=idtype)
    G = dgl.hetero_from_relations([ab_graph, ba_graph])

    seeds = [0, 1]
    num_walks = 3
    metapath = ['ab', 'ba'] * 4
    traces = dgl.contrib.sampling.metapath_random_walk(G, metapath, seeds, num_walks)

    for seed, walks in zip(seeds, traces):
        assert len(walks) == num_walks
        for walk in walks:
            assert len(walk) == len(metapath)
            # Prepend the seed so the path alternates a, b, a, b, ..., a.
            path = np.insert(F.asnumpy(walk), 0, seed)
            for a_node, b_node, next_a in zip(path[0::2], path[1::2], path[2::2]):
                assert ab_graph.has_edge_between(a_node, b_node)
                assert ba_graph.has_edge_between(b_node, next_a)
def _gen_neighbor_sampling_test_graph(hypersparse, reverse):
    """Build the homogeneous and heterogeneous graphs used by sampling tests.

    Parameters
    ----------
    hypersparse : bool
        If true, every graph is declared with ``1 << 50`` nodes per node type
        (it should crash if a CSR were allocated for it); otherwise the number
        of nodes is left for DGL to infer.
    reverse : bool
        If true, every edge is flipped and the source/destination node types of
        the bipartite relations are swapped.

    Returns
    -------
    tuple
        ``(g, hg)``: the ``user``-``follow`` graph and the heterograph made of
        ``follow``, ``play``, ``liked-by`` and ``flips``. All relations except
        ``flips`` carry a float32 ``'prob'`` edge feature.
    """
    huge = 1 << 50
    homo_size = huge if hypersparse else None
    bipartite_size = (huge, huge) if hypersparse else None

    def oriented(pairs):
        # Edge lists below are written in the forward direction.
        return [(dst, src) for src, dst in pairs] if reverse else pairs

    def relation(pairs, utype, etype, vtype):
        if reverse:
            utype, vtype = vtype, utype
        return dgl.bipartite(oriented(pairs), utype, etype, vtype, num_nodes=bipartite_size)

    def probs(values):
        return F.tensor(values, dtype=F.float32)

    g = dgl.graph(
        oriented([(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)]),
        'user', 'follow', num_nodes=homo_size)
    g.edata['prob'] = probs([.5, .5, 0., .5, .5, 0., 1.])

    play = relation([(0, 0), (0, 1), (1, 2), (3, 2)], 'user', 'play', 'game')
    play.edata['prob'] = probs([.8, .5, .5, .5])

    liked_by = relation(
        [(2, 0), (2, 1), (2, 2), (1, 0), (1, 3), (0, 0)], 'game', 'liked-by', 'user')
    liked_by.edata['prob'] = probs([.3, .5, .2, .5, .1, .1])

    # 'flips' intentionally has no 'prob' feature.
    flips = relation([(0, 0), (1, 0), (2, 0), (3, 0)], 'user', 'flips', 'coin')

    hg = dgl.hetero_from_relations([g, play, liked_by, flips])
    return g, hg
def sample_blocks(self, seeds):
    """Sample the seed-edge graph and the encoder block for one batch.

    ``seeds`` are the decoder-graph edges to compute predictions for. The
    procedure is:

    1. Look up the head and tail nodes of the seed edges in the decoder graph
       and regroup the edges into one bipartite relation per possible rating
       value; the result is compacted so only touched nodes remain.
    2. Extract the entire in-coming neighborhood of those nodes from the
       encoder graph and turn it into a block.
    3. Copy the ``ci``/``cj`` node data from the full encoder graph onto the
       block and gather the input features of the block's source nodes.

    Returns
    -------
    tuple
        ``(g, frontier, head_feat, tail_feat, true_rel_labels, true_rel_ratings)``.
        ``head_feat``/``tail_feat`` are the source node IDs (as long) when the
        dataset has no user/movie feature, otherwise the matching feature rows.
        Labels and ratings are concatenated in the order of
        ``dataset.possible_rating_values``; ``true_rel_labels`` is ``None`` when
        ``self.labels`` is ``None``.
    """
    dataset = self.dataset
    enc_graph = self.enc_graph
    edge_ids = th.stack(seeds)

    batch_ratings = self.truths[edge_ids]
    has_labels = self.labels is not None
    batch_labels = self.labels[edge_ids] if has_labels else None

    # 1. Head and tail nodes of the seed edges, grouped by rating value.
    heads, tails = self.dec_graph.find_edges(edge_ids)
    utype, _, vtype = enc_graph.canonical_etypes[0]
    node_counts = (enc_graph.number_of_nodes(utype),
                   enc_graph.number_of_nodes(vtype))

    relations = []
    ratings_by_value = []
    labels_by_value = []
    for value in dataset.possible_rating_values:
        selected = (batch_ratings == value)
        ratings_by_value.append(batch_ratings[selected])
        if has_labels:
            labels_by_value.append(batch_labels[selected])
        relations.append(dgl.bipartite(
            (heads[selected], tails[selected]),
            utype=utype,
            etype=str(value),
            vtype=vtype,
            num_nodes=node_counts))

    # Keep only the nodes that are covered by the seed edges.
    g = dgl.compact_graphs(dgl.hetero_from_relations(relations))

    # 2. Full in-coming neighborhood of every node that survived compaction.
    seed_nodes = {ntype: g.nodes[ntype].data[dgl.NID] for ntype in g.ntypes}
    frontier = dgl.to_block(dgl.in_subgraph(enc_graph, seed_nodes), seed_nodes)

    # 3. Copy 'ci' onto destination nodes and 'cj' onto source nodes.
    for ntype in ('user', 'movie'):
        full_data = enc_graph.nodes[ntype].data
        dst_data = frontier.dstnodes[ntype].data
        src_data = frontier.srcnodes[ntype].data
        dst_data['ci'] = full_data['ci'][dst_data[dgl.NID]]
        src_data['cj'] = full_data['cj'][src_data[dgl.NID]]

    # Input features: fall back to the node IDs when the dataset has none.
    user_nids = frontier.srcnodes['user'].data[dgl.NID]
    if dataset.user_feature is None:
        head_feat = user_nids.long()
    else:
        head_feat = dataset.user_feature[user_nids]

    movie_nids = frontier.srcnodes['movie'].data[dgl.NID]
    if dataset.movie_feature is None:
        tail_feat = movie_nids.long()
    else:
        tail_feat = dataset.movie_feature[movie_nids]

    true_rel_labels = th.cat(labels_by_value, dim=0) if has_labels else None
    true_rel_ratings = th.cat(ratings_by_value, dim=0)
    return (g, frontier, head_feat, tail_feat, true_rel_labels, true_rel_ratings)
def construct_graph():
    """Build the paper/author/conference heterograph from the DBLP text files.

    All files are read from the module-level ``path`` directory:

    * ``id_author.txt``, ``id_conf.txt``, ``paper.txt`` (ISO-8859-1): one
      ``<id> <name...>`` record per line, whitespace separated.
    * ``paper_author.txt``, ``paper_conf.txt``: one tab-separated
      ``<paper id>\t<author/conf id>`` pair per line.

    Raw IDs are remapped to consecutive node IDs in order of appearance.

    Returns
    -------
    tuple
        ``(hg, author_names, conf_names, paper_names)`` where ``hg`` holds the
        relations ``pa``, ``ap``, ``pc`` and ``cp``. A paper name is ``'p'``
        followed by all title tokens joined without separators.

    Raises
    ------
    IndexError, ValueError
        If a line does not contain the expected fields.
    KeyError
        If an edge references an ID missing from the ID files.
    """
    def read_id_table(filename, make_name):
        # Parse one "<id> <name...>" file into parallel id / name lists.
        ids, names = [], []
        # The context manager closes the file even when a row fails to parse.
        with open(os.path.join(path, filename), encoding="ISO-8859-1") as f:
            for line in f:
                fields = line.strip().split()
                ids.append(int(fields[0]))
                names.append(make_name(fields))
        return ids, names

    def read_edges(filename, src_index, dst_index):
        # Parse one tab-separated edge file into remapped src / dst lists.
        srcs, dsts = [], []
        with open(os.path.join(path, filename), "r") as f:
            for line in f:
                fields = line.split('\t')
                srcs.append(src_index[int(fields[0])])
                dsts.append(dst_index[int(fields[1].strip('\n'))])
        return srcs, dsts

    author_ids, author_names = read_id_table("id_author.txt", lambda f: f[1])
    conf_ids, conf_names = read_id_table("id_conf.txt", lambda f: f[1])
    paper_ids, paper_names = read_id_table(
        "paper.txt", lambda f: 'p' + ''.join(f[1:]))

    # Raw id -> consecutive node id, in file order.
    author_ids_invmap = {x: i for i, x in enumerate(author_ids)}
    conf_ids_invmap = {x: i for i, x in enumerate(conf_ids)}
    paper_ids_invmap = {x: i for i, x in enumerate(paper_ids)}

    paper_author_src, paper_author_dst = read_edges(
        "paper_author.txt", paper_ids_invmap, author_ids_invmap)
    paper_conf_src, paper_conf_dst = read_edges(
        "paper_conf.txt", paper_ids_invmap, conf_ids_invmap)

    # Every relation is added in both directions.
    pa = dgl.bipartite((paper_author_src, paper_author_dst), 'paper', 'pa', 'author')
    ap = dgl.bipartite((paper_author_dst, paper_author_src), 'author', 'ap', 'paper')
    pc = dgl.bipartite((paper_conf_src, paper_conf_dst), 'paper', 'pc', 'conf')
    cp = dgl.bipartite((paper_conf_dst, paper_conf_src), 'conf', 'cp', 'paper')
    hg = dgl.hetero_from_relations([pa, ap, pc, cp])
    return hg, author_names, conf_names, paper_names
def test_flatten():
    """Check wildcard slicing of heterographs (``g[srctype, :, dsttype]``).

    Slices that match several relations must be flattened into one relation
    whose ``dgl.ETYPE``/``dgl.EID``/``dgl.NTYPE``/``dgl.NID`` data map back to
    the parent graph; slices that match a single relation must return that
    relation unchanged.
    """
    def check_mapping(g, fg):
        # Every edge of the flattened graph must point back to an edge of the
        # parent with the same endpoints and endpoint types.
        SRC = fg.ntypes[0]
        DST = SRC if len(fg.ntypes) == 1 else fg.ntypes[1]

        parent_etypes = F.asnumpy(fg.edata[dgl.ETYPE]).tolist()
        parent_eids = F.asnumpy(fg.edata[dgl.EID]).tolist()

        for i, (etype, eid) in enumerate(zip(parent_etypes, parent_eids)):
            canonical = g.canonical_etypes[etype]
            src_g, dst_g = g.find_edges([eid], canonical)
            src_fg, dst_fg = fg.find_edges([i])
            # TODO(gq): I feel this code is quite redundant; can we just add new members (like
            # "induced_srcid") to returned heterograph object and not store them as features?
            endpoints = ((SRC, 0, src_g, src_fg), (DST, 2, dst_g, dst_fg))
            for ntype, pos, nid_g, nid_fg in endpoints:
                ndata = fg.nodes[ntype].data
                assert nid_g == ndata[dgl.NID][nid_fg]
                tid = F.asnumpy(ndata[dgl.NTYPE][nid_fg])[0]
                assert canonical[pos] == g.ntypes[tid]

    def check_same_edges(g, fg, etype):
        # A single-relation slice must keep the parent's edges in EID order.
        u1, v1 = g.edges(etype=etype, order='eid')
        u2, v2 = fg.edges(etype=etype, order='eid')
        assert F.array_equal(u1, u2)
        assert F.array_equal(v1, v2)

    # check for wildcard slices
    g = create_test_heterograph()
    g.nodes['user'].data['h'] = F.ones((3, 5))
    g.nodes['game'].data['i'] = F.ones((2, 5))
    g.edges['plays'].data['e'] = F.ones((4, 4))
    g.edges['wishes'].data['e'] = F.ones((2, 4))
    g.edges['wishes'].data['f'] = F.ones((2, 4))

    # user--plays->game and user--wishes->game
    fg = g['user', :, 'game']

    assert len(fg.ntypes) == 2
    assert fg.ntypes == ['user', 'game']
    assert fg.etypes == ['plays+wishes']

    assert F.array_equal(fg.nodes['user'].data['h'], F.ones((3, 5)))
    assert F.array_equal(fg.nodes['game'].data['i'], F.ones((2, 5)))
    assert F.array_equal(fg.edata['e'], F.ones((6, 4)))
    # 'f' exists on 'wishes' only, so it must not survive the merge.
    assert 'f' not in fg.edata

    etypes = F.asnumpy(fg.edata[dgl.ETYPE]).tolist()
    eids = F.asnumpy(fg.edata[dgl.EID]).tolist()
    assert set(zip(etypes, eids)) == {(1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1)}

    check_mapping(g, fg)

    fg = g['user', :, 'user']
    # NOTE(gq): The node/edge types from the parent graph is returned if there is only one
    # node/edge type. This differs from the behavior above.
    assert fg.ntypes == ['user']
    assert fg.etypes == ['follows']
    check_same_edges(g, fg, 'follows')

    fg = g['developer', :, 'game']
    assert fg.ntypes == ['developer', 'game']
    assert fg.etypes == ['develops']
    check_same_edges(g, fg, 'develops')

    fg = g[:, :, :]
    assert fg.ntypes == ['developer+user', 'game+user']
    assert fg.etypes == ['develops+follows+plays+wishes']
    check_mapping(g, fg)

    # Test another heterograph
    g_x = dgl.graph(([0, 1, 2], [1, 2, 3]), 'user', 'follows')
    g_y = dgl.graph(([0, 2], [2, 3]), 'user', 'knows')
    g_x.nodes['user'].data['h'] = F.randn((4, 3))
    g_x.edges['follows'].data['w'] = F.randn((3, 2))
    g_y.nodes['user'].data['hh'] = F.randn((4, 5))
    g_y.edges['knows'].data['ww'] = F.randn((2, 10))
    g = dgl.hetero_from_relations([g_x, g_y])

    # Features of the relation graphs must be carried over to the heterograph.
    assert F.array_equal(g.ndata['h'], g_x.ndata['h'])
    assert F.array_equal(g.ndata['hh'], g_y.ndata['hh'])
    assert F.array_equal(g.edges['follows'].data['w'], g_x.edata['w'])
    assert F.array_equal(g.edges['knows'].data['ww'], g_y.edata['ww'])

    # Both slices select the same two user-user relations.
    for fg in (g['user', :, 'user'], g['user', :, :]):
        assert fg.ntypes == ['user']
        assert fg.etypes == ['follows+knows']
        check_mapping(g, fg)
import dgl


def _main():
    """Round-trip a tiny heterograph through its homogeneous form and print both."""
    relations = [
        dgl.graph([(0, 1), (1, 2)], 'user', 'follows'),
        dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game'),
    ]
    original = dgl.hetero_from_relations(relations)

    # Convert to a homogeneous graph and back, reusing the original type names.
    rebuilt = dgl.to_hetero(dgl.to_homo(original), original.ntypes, original.etypes)

    print(original)
    print(rebuilt)
    print("here")


if __name__ == "__main__":
    _main()
def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
    """Build the encoder heterograph with one relation pair per rating value.

    For every value in ``self.possible_rating_values`` two bipartite relations
    are created: ``user -> movie`` named ``str(rating)`` and the reversed
    ``movie -> user`` relation named ``'rev-<rating>'``.

    Parameters
    ----------
    rating_pairs : tuple
        ``(rows, cols)`` with the user and movie index of every rating. Both
        are indexed with the result of ``np.where`` and are therefore expected
        to be NumPy arrays.
    rating_values : numpy.ndarray
        Rating of every pair, compared element-wise against each possible
        rating value.
    add_support : bool
        If true, attach the ``'ci'`` and ``'cj'`` node data described below to
        the ``user`` and ``movie`` nodes.

    Returns
    -------
    The heterograph returned by ``dgl.hetero_from_relations``.

    Notes
    -----
    ``ci`` is ``1 / sqrt(in-degree)`` summed over all rating relations, with a
    zero degree mapped to ``0``. ``cj`` is the same quantity for out-degrees
    when ``self._symm`` is true and a vector of ones otherwise.

    The previous implementation also filled a dense ``num_user x num_movie``
    matrix that was never read; it has been removed because it only cost
    memory.
    """
    rating_row, rating_col = rating_pairs
    user_movie_shape = (self._num_user, self._num_movie)
    movie_user_shape = (self._num_movie, self._num_user)

    rating_graphs = []
    for rating in self.possible_rating_values:
        ridx = np.where(rating_values == rating)
        rrow = rating_row[ridx]
        rcol = rating_col[ridx]
        etype = str(rating)
        rating_graphs.append(dgl.bipartite(
            (rrow, rcol), 'user', etype, 'movie', num_nodes=user_movie_shape))
        rating_graphs.append(dgl.bipartite(
            (rcol, rrow), 'movie', 'rev-%s' % etype, 'user', num_nodes=movie_user_shape))
    graph = dgl.hetero_from_relations(rating_graphs)

    # sanity check: every rating appears once forward and once reversed
    assert len(rating_pairs[0]) == sum(
        graph.number_of_edges(et) for et in graph.etypes) // 2

    if add_support:
        def _calc_norm(x):
            x = x.asnumpy().astype('float32')
            # An infinite degree makes 1 / sqrt(x) evaluate to 0 for isolated nodes.
            x[x == 0.] = np.inf
            x = mx.nd.array(1. / np.sqrt(x))
            return x.as_in_context(self._ctx).expand_dims(1)

        etypes = [str(r) for r in self.possible_rating_values]
        # 'rev-*' relations point at users, the forward relations at movies.
        user_ci = _calc_norm(mx.nd.add_n(
            *[graph['rev-%s' % r].in_degrees() for r in etypes]))
        movie_ci = _calc_norm(mx.nd.add_n(
            *[graph[r].in_degrees() for r in etypes]))
        if self._symm:
            user_cj = _calc_norm(mx.nd.add_n(
                *[graph[r].out_degrees() for r in etypes]))
            movie_cj = _calc_norm(mx.nd.add_n(
                *[graph['rev-%s' % r].out_degrees() for r in etypes]))
        else:
            user_cj = mx.nd.ones((self.num_user, ), ctx=self._ctx)
            movie_cj = mx.nd.ones((self.num_movie, ), ctx=self._ctx)
        graph.nodes['user'].data.update({'ci': user_ci, 'cj': user_cj})
        graph.nodes['movie'].data.update({'ci': movie_ci, 'cj': movie_cj})

    return graph
mask[indices] = 1 return mask.byte() with open('../dataset/DBLP/DBLP.pickle', 'rb') as f: a_list, p_list, c_list = pickle.load(f) pa_list, pc_list = pickle.load(f) author_features = pickle.load(f) labels = pickle.load(f) # 构造异构网络 pa = dgl.bipartite(pa_list, 'paper', 'pa', 'author') ap = dgl.bipartite(transpose(pa_list), 'author', 'ap', 'paper') pc = dgl.bipartite(pc_list, 'paper', 'pc', 'conf') cp = dgl.bipartite(transpose(pc_list), 'conf', 'cp', 'paper') hg = dgl.hetero_from_relations([pa, ap, pc, cp]) features = torch.FloatTensor(author_features) labels = torch.LongTensor(labels) print(features.shape) print(labels.shape) num_class = 4 alls = [i for i in range(len(a_list))] train_idx, x, _, _ = train_test_split(alls, labels, test_size=0.2, random_state=52) eval_idx, test_idx, _, _ = train_test_split(x,
def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
    """Build the encoder heterograph with one relation pair per rating value.

    For every value in ``self.possible_rating_values`` two bipartite relations
    are created: ``user -> movie`` named after the rating with ``'.'`` replaced
    by ``'_'``, and the reversed ``movie -> user`` relation named
    ``'rev-<name>'``.

    Parameters
    ----------
    rating_pairs : tuple
        ``(rows, cols)`` with the user and movie index of every rating. Both
        are indexed with the result of ``np.where`` and are therefore expected
        to be NumPy arrays.
    rating_values : numpy.ndarray
        Rating of every pair, compared element-wise against each possible
        rating value.
    add_support : bool
        If true, attach the ``'ci'`` and ``'cj'`` node data described below to
        the ``user`` and ``movie`` nodes.

    Returns
    -------
    The heterograph returned by ``dgl.hetero_from_relations``.

    Notes
    -----
    ``ci`` is ``1 / sqrt(in-degree)`` summed over all rating relations, with a
    zero degree mapped to ``0``. ``cj`` is the same quantity for out-degrees
    when ``self._symm`` is true and a vector of ones otherwise.

    The previous implementation also filled a dense ``num_user x num_movie``
    matrix that was never read; it has been removed because it only cost
    memory.
    """
    def _etype_name(rating):
        # Relation names must not contain '.', e.g. 3.5 -> '3_5'.
        return str(rating).replace('.', '_')

    rating_row, rating_col = rating_pairs
    user_movie_shape = (self._num_user, self._num_movie)
    movie_user_shape = (self._num_movie, self._num_user)

    rating_graphs = []
    for rating in self.possible_rating_values:
        ridx = np.where(rating_values == rating)
        rrow = rating_row[ridx]
        rcol = rating_col[ridx]
        etype = _etype_name(rating)
        rating_graphs.append(dgl.bipartite(
            (rrow, rcol), 'user', etype, 'movie', num_nodes=user_movie_shape))
        rating_graphs.append(dgl.bipartite(
            (rcol, rrow), 'movie', 'rev-%s' % etype, 'user', num_nodes=movie_user_shape))
    graph = dgl.hetero_from_relations(rating_graphs)

    # sanity check: every rating appears once forward and once reversed
    assert len(rating_pairs[0]) == sum(
        graph.number_of_edges(et) for et in graph.etypes) // 2

    if add_support:
        def _calc_norm(x):
            x = x.numpy().astype('float32')
            # An infinite degree makes 1 / sqrt(x) evaluate to 0 for isolated nodes.
            x[x == 0.] = np.inf
            x = th.FloatTensor(1. / np.sqrt(x))
            return x.to(self._device).unsqueeze(1)

        etypes = [_etype_name(r) for r in self.possible_rating_values]
        # 'rev-*' relations point at users, the forward relations at movies.
        user_ci = _calc_norm(sum([graph['rev-%s' % r].in_degrees() for r in etypes]))
        movie_ci = _calc_norm(sum([graph[r].in_degrees() for r in etypes]))
        if self._symm:
            user_cj = _calc_norm(sum([graph[r].out_degrees() for r in etypes]))
            movie_cj = _calc_norm(
                sum([graph['rev-%s' % r].out_degrees() for r in etypes]))
        else:
            user_cj = th.ones(self.num_user, ).to(self._device)
            movie_cj = th.ones(self.num_movie, ).to(self._device)
        graph.nodes['user'].data.update({'ci': user_ci, 'cj': user_cj})
        graph.nodes['movie'].data.update({'ci': movie_ci, 'cj': movie_cj})

    return graph
def ACNN_graph_construction_and_featurization(ligand_mol,
                                              protein_mol,
                                              ligand_coordinates,
                                              protein_coordinates,
                                              max_num_ligand_atoms=None,
                                              max_num_protein_atoms=None,
                                              neighbor_cutoff=12.,
                                              max_num_neighbors=12,
                                              strip_hydrogens=False):
    """Graph construction and featurization for `Atomic Convolutional Networks for
    Predicting Protein-Ligand Binding Affinity <https://arxiv.org/abs/1703.10603>`__.

    Parameters
    ----------
    ligand_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    protein_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    ligand_coordinates : Float Tensor of shape (V1, 3)
        Atom coordinates in a ligand.
    protein_coordinates : Float Tensor of shape (V2, 3)
        Atom coordinates in a protein.
    max_num_ligand_atoms : int or None
        Maximum number of atoms in ligands for zero padding, which should be no smaller than
        ligand_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    max_num_protein_atoms : int or None
        Maximum number of atoms in proteins for zero padding, which should be no smaller than
        protein_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    neighbor_cutoff : float
        Distance cutoff to define 'neighboring'. Default to 12.
    max_num_neighbors : int
        Maximum number of neighbors allowed for each atom. Default to 12.
    strip_hydrogens : bool
        Whether to exclude hydrogen atoms. Default to False.

    Returns
    -------
    The heterograph produced by ``hetero_from_relations`` with node types
    ``'ligand_atom'`` and ``'protein_atom'`` and six relations: ``'protein'``
    and ``'ligand'`` (neighbors within each molecule) plus four ``'complex'``
    relations (neighbors computed on the joint coordinates, split by the node
    types of their endpoints). Every relation has a float32 ``'distance'``
    edge feature of shape (E, 1); every node type has float32
    ``'atomic_number'`` and ``'mask'`` features of shape (N, 1), where padded
    nodes have atomic number 0 and mask 0.

    Notes
    -----
    Neighbor indices of the joint ligand+protein coordinates are split at the
    number of ligand coordinate rows actually concatenated, not at the padded
    ``max_num_ligand_atoms``. Splitting at the padded size would classify
    protein atoms as ligand atoms whenever the ligand is padded.
    """
    assert ligand_coordinates is not None, 'Expect ligand_coordinates to be provided.'
    assert protein_coordinates is not None, 'Expect protein_coordinates to be provided.'
    if max_num_ligand_atoms is not None:
        assert max_num_ligand_atoms >= ligand_mol.GetNumAtoms(), \
            'Expect max_num_ligand_atoms to be no smaller than ligand_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_ligand_atoms, ligand_mol.GetNumAtoms())
    if max_num_protein_atoms is not None:
        assert max_num_protein_atoms >= protein_mol.GetNumAtoms(), \
            'Expect max_num_protein_atoms to be no smaller than protein_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_protein_atoms, protein_mol.GetNumAtoms())

    def _as_column(values):
        # Convert a sequence of floats to a float32 backend tensor of shape (-1, 1).
        return F.reshape(F.zerocopy_from_numpy(
            np.asarray(values).astype(np.float32)), (-1, 1))

    if strip_hydrogens:
        # Remove hydrogen atoms and their corresponding coordinates
        ligand_atom_indices_left = filter_out_hydrogens(ligand_mol)
        protein_atom_indices_left = filter_out_hydrogens(protein_mol)
        ligand_coordinates = ligand_coordinates.take(ligand_atom_indices_left, axis=0)
        protein_coordinates = protein_coordinates.take(protein_atom_indices_left, axis=0)
    else:
        ligand_atom_indices_left = list(range(ligand_mol.GetNumAtoms()))
        protein_atom_indices_left = list(range(protein_mol.GetNumAtoms()))

    # Compute number of nodes for each type (including zero padding, if any)
    if max_num_ligand_atoms is None:
        num_ligand_atoms = len(ligand_atom_indices_left)
    else:
        num_ligand_atoms = max_num_ligand_atoms

    if max_num_protein_atoms is None:
        num_protein_atoms = len(protein_atom_indices_left)
    else:
        num_protein_atoms = max_num_protein_atoms

    # Construct graph for atoms in the ligand
    ligand_srcs, ligand_dsts, ligand_dists = k_nearest_neighbors(
        ligand_coordinates, neighbor_cutoff, max_num_neighbors)
    ligand_graph = graph((ligand_srcs, ligand_dsts),
                         'ligand_atom', 'ligand', num_ligand_atoms)
    ligand_graph.edata['distance'] = _as_column(ligand_dists)

    # Construct graph for atoms in the protein
    protein_srcs, protein_dsts, protein_dists = k_nearest_neighbors(
        protein_coordinates, neighbor_cutoff, max_num_neighbors)
    protein_graph = graph((protein_srcs, protein_dsts),
                          'protein_atom', 'protein', num_protein_atoms)
    protein_graph.edata['distance'] = _as_column(protein_dists)

    # Construct 4 graphs for complex representation, including the connection within
    # protein atoms, the connection within ligand atoms and the connection between
    # protein and ligand atoms.
    complex_srcs, complex_dsts, complex_dists = k_nearest_neighbors(
        np.concatenate([ligand_coordinates, protein_coordinates]),
        neighbor_cutoff, max_num_neighbors)
    complex_srcs = np.array(complex_srcs)
    complex_dsts = np.array(complex_dsts)
    complex_dists = np.array(complex_dists)
    # Rows [0, offset) of the concatenated coordinates are ligand atoms and the
    # rest are protein atoms. This must be the real row count, not the padded
    # node count, or protein atoms get misclassified when the ligand is padded.
    offset = len(ligand_coordinates)

    src_is_ligand = (complex_srcs < offset).nonzero()[0]
    src_is_protein = (complex_srcs >= offset).nonzero()[0]
    dst_is_ligand = (complex_dsts < offset).nonzero()[0]
    dst_is_protein = (complex_dsts >= offset).nonzero()[0]

    # ('ligand_atom', 'complex', 'ligand_atom')
    inter_ligand_indices = np.intersect1d(
        src_is_ligand, dst_is_ligand, assume_unique=True)
    inter_ligand_graph = graph(
        (complex_srcs[inter_ligand_indices].tolist(),
         complex_dsts[inter_ligand_indices].tolist()),
        'ligand_atom', 'complex', num_ligand_atoms)
    inter_ligand_graph.edata['distance'] = _as_column(
        complex_dists[inter_ligand_indices])

    # ('protein_atom', 'complex', 'protein_atom')
    inter_protein_indices = np.intersect1d(
        src_is_protein, dst_is_protein, assume_unique=True)
    inter_protein_graph = graph(
        ((complex_srcs[inter_protein_indices] - offset).tolist(),
         (complex_dsts[inter_protein_indices] - offset).tolist()),
        'protein_atom', 'complex', num_protein_atoms)
    inter_protein_graph.edata['distance'] = _as_column(
        complex_dists[inter_protein_indices])

    # ('ligand_atom', 'complex', 'protein_atom')
    ligand_protein_indices = np.intersect1d(
        src_is_ligand, dst_is_protein, assume_unique=True)
    ligand_protein_graph = bipartite(
        (complex_srcs[ligand_protein_indices].tolist(),
         (complex_dsts[ligand_protein_indices] - offset).tolist()),
        'ligand_atom', 'complex', 'protein_atom',
        (num_ligand_atoms, num_protein_atoms))
    ligand_protein_graph.edata['distance'] = _as_column(
        complex_dists[ligand_protein_indices])

    # ('protein_atom', 'complex', 'ligand_atom')
    protein_ligand_indices = np.intersect1d(
        src_is_protein, dst_is_ligand, assume_unique=True)
    protein_ligand_graph = bipartite(
        ((complex_srcs[protein_ligand_indices] - offset).tolist(),
         complex_dsts[protein_ligand_indices].tolist()),
        'protein_atom', 'complex', 'ligand_atom',
        (num_protein_atoms, num_ligand_atoms))
    protein_ligand_graph.edata['distance'] = _as_column(
        complex_dists[protein_ligand_indices])

    # Merge the graphs
    g = hetero_from_relations(
        [protein_graph,
         ligand_graph,
         inter_ligand_graph,
         inter_protein_graph,
         ligand_protein_graph,
         protein_ligand_graph]
    )

    # Get atomic numbers for all atoms left and set node features
    ligand_atomic_numbers = np.array(get_atomic_numbers(ligand_mol, ligand_atom_indices_left))
    # zero padding
    ligand_atomic_numbers = np.concatenate([
        ligand_atomic_numbers, np.zeros(num_ligand_atoms - len(ligand_atom_indices_left))])
    protein_atomic_numbers = np.array(get_atomic_numbers(protein_mol, protein_atom_indices_left))
    # zero padding
    protein_atomic_numbers = np.concatenate([
        protein_atomic_numbers, np.zeros(num_protein_atoms - len(protein_atom_indices_left))])

    g.nodes['ligand_atom'].data['atomic_number'] = _as_column(ligand_atomic_numbers)
    g.nodes['protein_atom'].data['atomic_number'] = _as_column(protein_atomic_numbers)

    # Prepare mask indicating the existence of nodes (1 for real atoms, 0 for padding)
    ligand_masks = np.zeros((num_ligand_atoms, 1))
    ligand_masks[:len(ligand_atom_indices_left), :] = 1
    g.nodes['ligand_atom'].data['mask'] = F.zerocopy_from_numpy(
        ligand_masks.astype(np.float32))
    protein_masks = np.zeros((num_protein_atoms, 1))
    protein_masks[:len(protein_atom_indices_left), :] = 1
    g.nodes['protein_atom'].data['mask'] = F.zerocopy_from_numpy(
        protein_masks.astype(np.float32))

    return g
def encode_data(self):
    """Encode the nodes and edges of ``self.json_data`` and build ``self.hetero_g``.

    Steps, in order:

    1. Make sure the vocabulary directory and the node/edge vocabulary files
       exist, creating them with ``create_dict_node`` / ``create_dict_edge``
       when missing.
    2. Load both vocabularies (space-separated tokens) into
       ``word_dict_node`` / ``word_dict_edge`` and build the token -> index
       maps ``word_to_ix_node`` / ``word_to_ix_edge``. A vocabulary that
       already existed on disk is additionally passed through
       ``append_dict_node`` / ``append_dict_edge``.
    3. Create ``self.embedding`` with one 1-dimensional entry per node or edge
       token.
    4. Call ``encode_node`` for every entry under ``'nodes'`` and
       ``encode_edge`` for every entry under any other key, printing progress
       every 1000 items and at the end of each group.
    5. Build five bipartite relations from the ``proc``/``file``/``reg``/``api``
       endpoint collections stored on ``self`` and merge them into
       ``self.hetero_g``.

    Returns nothing; all results are stored on ``self``.
    """
    # first create dictionary
    # These flags record whether a vocabulary file was created by this call.
    is_new_dict_node = False
    is_new_dict_edge = False
    if not os.path.isdir(self.vocab_path):
        os.makedirs(self.vocab_path)
    if not os.path.exists(self.vocab_path_node):
        self.create_dict_node()
        is_new_dict_node = True
    if not os.path.exists(self.vocab_path_edge):
        self.create_dict_edge()
        is_new_dict_edge = True

    # read from dict node
    with open(self.vocab_path_node, 'r') as f:
        vocab = f.read().strip()
        self.word_dict_node = vocab.split(' ')
        # Only a pre-existing vocabulary is extended.
        # NOTE(review): presumably append_dict_node adds tokens of the current data
        # that are missing from word_dict_node - confirm.
        if is_new_dict_node is False:
            self.append_dict_node()
        self.word_to_ix_node = {word: i for i, word in enumerate(self.word_dict_node)}

    # read from dict edge
    with open(self.vocab_path_edge, 'r') as f:
        vocab = f.read().strip()
        self.word_dict_edge = vocab.split(' ')
        # Same handling as for the node vocabulary above.
        if is_new_dict_edge is False:
            self.append_dict_edge()
        self.word_to_ix_edge = {word: i for i, word in enumerate(self.word_dict_edge)}

    # A single embedding table sized for node and edge tokens together.
    num_token = len(self.word_dict_node) + len(self.word_dict_edge)
    self.embedding = nn.Embedding(num_token, 1)

    if 'nodes' in self.json_data.keys():
        n_num = 0
        n_tot = len(self.json_data['nodes'])
        print('\nencode_node')
        for node in self.json_data['nodes']:
            self.encode_node(node)
            n_num += 1
            # Report progress every 1000 nodes and once more at the end.
            if n_num % 1000 == 0 or n_num == n_tot:
                print('{}/{}'.format(n_num, n_tot))

    # Every key other than 'nodes' is treated as a group of edges.
    for key in self.json_data:
        if key != 'nodes':
            p_num = 0
            p_tot = len(self.json_data[key])
            print('\nencode_edge type '+key)
            for path in self.json_data[key]:
                self.encode_edge(path)
                p_num += 1
                # Report progress every 1000 edges and once more at the end.
                if p_num % 1000 == 0 or p_num == p_tot:
                    print('{}/{}'.format(p_num, p_tot))

    # NOTE(review): the endpoint collections (self.proc_call_api, ...) are presumably
    # filled by encode_edge - confirm. They are passed to dgl.bipartite as a
    # two-element list rather than a (src, dst) tuple; verify that the installed
    # DGL version interprets this form as intended.
    g_proc_call_api = dgl.bipartite([self.proc_call_api['proc'], self.proc_call_api['api']], 'proc', 'call', 'api')
    g_file_affect_api = dgl.bipartite([self.file_affect_api['file'], self.file_affect_api['api']], 'file', 'affect', 'api')
    g_api_modify_file = dgl.bipartite([self.api_modify_file['api'], self.api_modify_file['file']], 'api', 'modify', 'file')
    g_reg_affect_api = dgl.bipartite([self.reg_affect_api['reg'], self.reg_affect_api['api']], 'reg', 'affect', 'api')
    g_api_modify_reg = dgl.bipartite([self.api_modify_reg['api'], self.api_modify_reg['reg']], 'api', 'modify', 'reg')

    self.hetero_g = dgl.hetero_from_relations([g_proc_call_api, g_file_affect_api, g_api_modify_file, g_reg_affect_api, g_api_modify_reg])
def construct_graph():
    """Build the app/API heterograph from the ``*_320.txt`` text files.

    All files are read from the module-level ``path`` directory:

    * ``id_api_320.txt``, ``id_app_320.txt`` (UTF-8): one ``<id> <name>``
      record per line, whitespace separated.
    * ``same_block_api_320.txt`` (B matrix), ``api_app_320.txt`` (A matrix),
      ``same_package_api_320.txt`` (P matrix): one whitespace-separated ID
      pair per line.

    Raw IDs are remapped to consecutive node IDs in order of appearance. The
    API nodes are exposed under three node types (``api1``, ``api2``,
    ``api3``) so that the relations chain as
    ``app -> api1 -> api2 -> api3 -> api1 -> app``.

    Returns
    -------
    tuple
        ``(hg, api_names, app_names)``.

    Raises
    ------
    IndexError, ValueError
        If a line does not contain the expected fields.
    KeyError
        If an edge references an ID missing from the ID files.
    """
    def read_id_table(filename):
        # Parse one "<id> <name>" file into parallel id / name lists.
        ids, names = [], []
        # The context manager closes the file even when a row fails to parse.
        with open(os.path.join(path, filename), encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split()
                ids.append(int(fields[0]))
                names.append(fields[1])
        return ids, names

    def read_edges(filename, src_index, dst_index):
        # Parse one edge file into remapped src / dst lists.
        srcs, dsts = [], []
        with open(os.path.join(path, filename), "r") as f:
            for line in f:
                fields = line.split()
                srcs.append(src_index[int(fields[0])])
                dsts.append(dst_index[int(fields[1].strip('\n'))])
        return srcs, dsts

    api_ids, api_names = read_id_table("id_api_320.txt")
    app_ids, app_names = read_id_table("id_app_320.txt")

    # Raw id -> consecutive node id, in file order.
    api_ids_invmap = {x: i for i, x in enumerate(api_ids)}
    app_ids_invmap = {x: i for i, x in enumerate(app_ids)}

    # B matrix
    api_api_B_src, api_api_B_dst = read_edges(
        "same_block_api_320.txt", api_ids_invmap, api_ids_invmap)
    # A matrix
    api_app_src, api_app_dst = read_edges(
        "api_app_320.txt", api_ids_invmap, app_ids_invmap)
    # P matrix
    api_api_P_src, api_api_P_dst = read_edges(
        "same_package_api_320.txt", api_ids_invmap, api_ids_invmap)

    app_api = dgl.bipartite((api_app_dst, api_app_src), 'app', 'app_api', 'api1')
    api_api_B = dgl.bipartite((api_api_B_src, api_api_B_dst), 'api1', 'api_api_B', 'api2')
    api_api_P = dgl.bipartite((api_api_P_src, api_api_P_dst), 'api2', 'api_api_P', 'api3')
    # B transpose
    api_api_B_T = dgl.bipartite((api_api_B_dst, api_api_B_src), 'api3', 'api_api_B_T', 'api1')
    # A transpose
    api_app = dgl.bipartite((api_app_src, api_app_dst), 'api1', 'api_app', 'app')

    hg = dgl.hetero_from_relations(
        [app_api, api_api_B, api_api_P, api_api_B_T, api_app])
    return hg, api_names, app_names
for item in np.unique(items): iid2vid[item] = inc vid2iid[inc] = item inc += 1 assert((len(iid2vid)+len(uid2vid)) == total_vertices) src = list(map(lambda x: uid2vid[x], users)) dst = list(map(lambda x: iid2vid[x], items)) click_graph = dgl.bipartite(list(zip(src,dst)), 'user', 'ui', 'item') click_graph.edges['ui'].data['timestamp']=timestamps click_graph.edges['ui'].data['rating']=torch.ones(click_graph.number_of_edges()) clicked_graph = dgl.bipartite(list(zip(dst,src)), 'item', 'iu', 'user') clicked_graph.edges['iu'].data['timestamp']=timestamps clicked_graph.edges['iu'].data['rating']=torch.ones(clicked_graph.number_of_edges()) g = dgl.hetero_from_relations({click_graph, clicked_graph}) with open(directory+"/underexpose_train/user_generate_feat.txt", "r")as f: lines = f.readlines() usr_feat = np.zeros((len(lines), 5)) for i in range(len(lines)): if lines[i].split(",")[2] == "0": usr_feat[i][3] = 1 if lines[i].split(",")[2] == "1": usr_feat[i][4] = 1 del lines fn = directory+"/underexpose_train/user_generate_feat.txt" usr_data = genfromtxt(fn, delimiter=',', dtype=np.int16) usr_feat[:, 0:2] = usr_data[:, 0:2] usr_feat[:, 2] = usr_data[:, 3] uid2feat = {}