def subgraph_gen(hg, label_idx, neighbours=(10, 10)):
    """Sample a two-hop sub-graph around all drug nodes with graphsage_sampling.

    Drug nodes are expanded through 'dti' edges (first hop) and the sampled
    first-hop nodes through 'ppi' edges (second hop). The selected nodes are
    renumbered to positions ``0..len(sub_nodes)-1`` and a new ``pgl.HeterGraph``
    is built on that numbering.

    Args:
        hg: heterogeneous graph exposing ``node_types``, ``hg['dds'].edges``
            and ``hg['dds'].node_feat['features']``.
        label_idx: mapping ``(src, dst) -> label`` keyed by the original node
            ids of every 'dds' edge.
        neighbours: two-item sequence, the ``num_neighbours`` passed to the
            first and second sampling hop. The default is a tuple so that no
            mutable object is shared between calls; only indexing is used, so
            lists are still accepted.

    Returns:
        ``{'sub_graph': (sub_graph, num_sub_nodes, sub_eids, sub_nodes_feat,
        label_mat)}`` where ``label_mat`` is a float32 square matrix holding
        the label of each 'dds' edge at its re-indexed ``[src, dst]`` position.
    """
    drugs_idx = np.where(hg.node_types == 'drug')[0]
    layer1_neighs, layer1_eids = graphsage_sampling(
        hg, drugs_idx, num_neighbours=neighbours[0], etype='dti')
    layer2_neighs, layer2_eids = graphsage_sampling(
        hg, layer1_neighs, num_neighbours=neighbours[1], etype='ppi')

    sub_nodes = drugs_idx.tolist() + layer1_neighs + layer2_neighs
    num_sub_nodes = len(sub_nodes)
    # If a node id occurs more than once, the last position wins (dict semantics).
    sub_nodes_reidx = dict(zip(sub_nodes, range(num_sub_nodes)))

    def _reindex_both_ways(eids):
        # Re-indexed forward edges followed by the same edges reversed.
        forward = [(sub_nodes_reidx[src], sub_nodes_reidx[dst])
                   for (src, dst) in eids]
        return forward + [(dst, src) for (src, dst) in forward]

    # Materialise the 'dds' edge list once and fill labels and re-indexed
    # edges in a single pass.
    label_mat = np.zeros((num_sub_nodes, num_sub_nodes), dtype='float32')
    dds_eids = []
    for src, dst in hg['dds'].edges.tolist():
        new_src, new_dst = sub_nodes_reidx[src], sub_nodes_reidx[dst]
        label_mat[new_src, new_dst] = label_idx[(src, dst)]
        dds_eids.append((new_src, new_dst))

    sub_eids = {
        'dds': dds_eids,
        'dti': _reindex_both_ways(layer1_eids),
        'ppi': _reindex_both_ways(layer2_eids),
    }

    sub_nodes_feat = hg['dds'].node_feat['features'][sub_nodes, :]
    sub_graph = pgl.HeterGraph(
        edges=sub_eids,
        num_nodes=num_sub_nodes,
        node_feat={'features': sub_nodes_feat})

    return {
        'sub_graph':
        (sub_graph, num_sub_nodes, sub_eids, sub_nodes_feat, label_mat)
    }
def test_tensor(self):
    """Check HeterGraph.tensor() / HeterGraph.numpy() with and without inplace.

    With ``inplace=False`` the returned graph must carry the converted
    features while the source graph keeps its previous feature type; with
    ``inplace=True`` the source graph itself must be converted.
    """
    np.random.seed(1)
    dim = 4
    num_nodes = 15

    # for test no successor
    c2p_pairs = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
                 (3, 7), (3, 4), (3, 8)]
    p2a_pairs = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12),
                 (6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
    edges = {
        'c2p': c2p_pairs,
        'p2c': [(dst, src) for src, dst in c2p_pairs],
        'p2a': p2a_pairs,
        'a2p': [(dst, src) for src, dst in p2a_pairs],
    }

    # 4 'c' nodes, then 6 'p' nodes, then 5 'a' nodes, as (node_id, type).
    node_types = list(enumerate(['c'] * 4 + ['p'] * 6 + ['a'] * 5))

    # Random draws happen in the same order as the edge types were inserted.
    nfeat = {'nfeat': np.random.randn(num_nodes, dim)}
    efeat = {
        etype: {'efeat': np.random.randn(len(pairs), dim)}
        for etype, pairs in edges.items()
    }

    hg = pgl.HeterGraph(
        edges=edges,
        node_types=node_types,
        node_feat=nfeat,
        edge_feat=efeat)

    # tensor(inplace=False): only the returned graph holds paddle tensors.
    tensor_hg = hg.tensor(inplace=False)
    self.assertNotIsInstance(hg.node_feat['nfeat'], paddle.Tensor)
    self.assertNotIsInstance(hg.edge_feat['a2p']['efeat'], paddle.Tensor)
    self.assertIsInstance(tensor_hg.node_feat['nfeat'], paddle.Tensor)
    self.assertIsInstance(tensor_hg.edge_feat['a2p']['efeat'],
                          paddle.Tensor)
    self.assertIsInstance(tensor_hg.num_nodes, paddle.Tensor)

    # tensor(inplace=True): the graph itself is converted.
    hg.tensor(inplace=True)
    self.assertIsInstance(hg.node_feat['nfeat'], paddle.Tensor)
    self.assertIsInstance(hg.edge_feat['a2p']['efeat'], paddle.Tensor)

    # numpy(inplace=False): the copy is numpy, the source stays tensor.
    numpy_hg = hg.numpy(inplace=False)
    self.assertIsInstance(numpy_hg.node_feat['nfeat'], np.ndarray)
    self.assertIsInstance(numpy_hg.edge_feat['a2p']['efeat'], np.ndarray)
    self.assertIsInstance(hg.node_feat['nfeat'], paddle.Tensor)
    self.assertIsInstance(hg.edge_feat['a2p']['efeat'], paddle.Tensor)

    # numpy(inplace=True): the graph itself goes back to numpy.
    hg.numpy(inplace=True)
    self.assertIsInstance(hg.node_feat['nfeat'], np.ndarray)
    self.assertIsInstance(hg.edge_feat['a2p']['efeat'], np.ndarray)
def build_heter_graph(data_path, num_nodes):
    """Build a HeterGraph with one edge type per readable file in data_path.

    Every entry directly under ``data_path`` is parsed as a header-less,
    tab-separated table. Each table that parses becomes the edge array of an
    edge type named ``'etype0'``, ``'etype1'``, ... in processing order.
    Entries that cannot be parsed are logged and skipped (best effort).

    Args:
        data_path: directory containing the edge files.
        num_nodes: number of nodes; every node ``0..num_nodes-1`` gets the
            single node type ``"n"``.

    Returns:
        The constructed ``pgl.HeterGraph``.
    """
    edges = {}
    # glob returns paths in arbitrary order; sorting pins which file becomes
    # which 'etype<k>' so the mapping is the same on every run.
    for filename in sorted(glob.glob(os.path.join(data_path, '*'))):
        try:
            pairs = pd.read_csv(filename, header=None, sep="\t").values
        except Exception as err:
            # Deliberate best effort: report the problem and move on.
            log.info(err)
            continue
        # The counter only advances for files that were read successfully.
        edges['etype%s' % len(edges)] = pairs

    node_types = [(i, "n") for i in range(num_nodes)]
    hg = pgl.HeterGraph(edges=edges, node_types=node_types)
    return hg
def test_build_hetergraph(self):
    """Build a numpy-backed HeterGraph and check degree / neighbour queries."""
    np.random.seed(1)
    dim = 4
    num_nodes = 15

    # for test no successor
    c2p_pairs = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
                 (3, 7), (3, 4), (3, 8)]
    p2a_pairs = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12),
                 (6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
    edges = {
        'c2p': c2p_pairs,
        'p2c': [(dst, src) for src, dst in c2p_pairs],
        'p2a': p2a_pairs,
        'a2p': [(dst, src) for src, dst in p2a_pairs],
    }

    # 4 'c' nodes, then 6 'p' nodes, then 5 'a' nodes, as (node_id, type).
    node_types = list(enumerate(['c'] * 4 + ['p'] * 6 + ['a'] * 5))

    # Random draws happen in the same order as the edge types were inserted.
    nfeat = {'nfeat': np.random.randn(num_nodes, dim)}
    efeat = {
        etype: {'efeat': np.random.randn(len(pairs), dim)}
        for etype, pairs in edges.items()
    }

    hg = pgl.HeterGraph(
        edges=edges,
        node_types=node_types,
        node_feat=nfeat,
        edge_feat=efeat)

    self.assertFalse(hg.is_tensor())

    # Degrees over all edge types, then restricted to 'c2p'.
    self.assertEqual(hg.indegree(5), [2])
    self.assertEqual(hg.outdegree(4), [7])
    self.assertEqual(hg.outdegree(4, 'c2p'), [0])

    # Node 4 is never a 'c2p' source, so its successor list is empty.
    self.assertEqual(hg.successor('c2p', [4]).tolist(), [[]])
    self.assertEqual(
        hg.predecessor('a2p', [4]).tolist(), [[10, 11, 12, 14, 13]])

    print()

    # Pull just the first batch to exercise the node batch iterator.
    for _first_batch in hg.node_batch_iter(3):
        break
def test_build_tensor_hetergraph(self):
    """Build a HeterGraph from paddle tensors and check degree queries."""
    np.random.seed(1)
    dim = 4
    num_nodes = paddle.to_tensor(15)

    # for test no successor
    c2p = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6), (3, 7),
           (3, 4), (3, 8)]
    p2a = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12), (6, 11),
           (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
    raw_edges = {
        'c2p': c2p,
        'p2c': [(dst, src) for src, dst in c2p],
        'p2a': p2a,
        'a2p': [(dst, src) for src, dst in p2a],
    }
    # Each edge list goes in as a paddle tensor.
    edges = {
        etype: paddle.to_tensor(np.array(pairs))
        for etype, pairs in raw_edges.items()
    }

    # 4 'c' nodes, then 6 'p' nodes, then 5 'a' nodes, as (node_id, type).
    node_types = list(enumerate(['c'] * 4 + ['p'] * 6 + ['a'] * 5))

    hg = pgl.HeterGraph(edges=edges, node_types=node_types)

    self.assertTrue(hg.is_tensor())
    print()

    query_5 = paddle.to_tensor(5)
    self.assertEqual(hg.indegree(query_5).numpy(), np.array([2]))
    query_5 = paddle.to_tensor(5)
    self.assertEqual(hg.indegree(query_5, 'c2p').numpy(), np.array([2]))

    query_4 = paddle.to_tensor(4)
    self.assertEqual(hg.outdegree(query_4).numpy(), np.array([7]))
    query_4 = paddle.to_tensor(4)
    self.assertEqual(hg.outdegree(query_4, 'p2a').numpy(), np.array([5]))

    # Pull just the first batch of 'c' nodes to exercise the iterator.
    for _first_batch in hg.node_batch_iter(3, n_type='c'):
        break
def test_dump_and_load(self):
    """Dump a HeterGraph to disk, load it back and compare the node counts.

    The dump directory is removed in a ``finally`` clause, so a failing dump,
    load or assertion no longer leaves ``./tmp`` behind for later runs.
    """
    np.random.seed(1)
    dim = 4
    num_nodes = 15

    edges = {}
    # for test no successor
    edges['c2p'] = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
                    (3, 7), (3, 4), (3, 8)]
    edges['p2c'] = [(v, u) for u, v in edges['c2p']]
    edges['p2a'] = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12),
                    (6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
    edges['a2p'] = [(v, u) for u, v in edges['p2a']]

    # 4 'c' nodes, then 6 'p' nodes, then 5 'a' nodes, as (node_id, type).
    node_types = ['c' for _ in range(4)] + \
                 ['p' for _ in range(6)] + \
                 ['a' for _ in range(5)]
    node_types = [(i, t) for i, t in enumerate(node_types)]

    nfeat = {'nfeat': np.random.randn(num_nodes, dim)}
    efeat = {}
    for etype, _edges in edges.items():
        efeat[etype] = {'efeat': np.random.randn(len(_edges), dim)}

    hg = pgl.HeterGraph(
        edges=edges,
        node_types=node_types,
        node_feat=nfeat,
        edge_feat=efeat)

    path = "./tmp"
    hg2 = None
    try:
        hg.dump(path, indegree=True)
        hg2 = pgl.HeterGraph.load(path)
        self.assertEqual(hg.num_nodes, hg2.num_nodes)
    finally:
        # Drop both graphs before deleting the directory, as the original
        # test did (presumably so no handle on the dumped files is still
        # held -- TODO confirm against HeterGraph.load).
        del hg
        del hg2
        # ignore_errors: the directory may not exist if dump() failed early,
        # and a cleanup error must not mask the real test failure.
        shutil.rmtree(path, ignore_errors=True)
def collate_fn(self, ddi_data, dti_data, ppi_data, features):
    """Aggregate all needed nodes into a heterogeneous graph.

    Args:
        ddi_data: iterable of records with ``'pair'`` (two drug ids) and
            ``'label'``; a pair is used only when both drugs have features.
        dti_data: iterable of records with ``'pair'`` (drug id, target id);
            used only when the drug has features and occurs in ``ddi_data``.
        ppi_data: iterable of records with ``'pair'`` (two protein ids); all
            pairs are used.
        features: path (or buffer) of a CSV whose first column is the drug id
            and whose remaining columns are the raw drug features.

    Returns:
        ``{'rt': (hg, nodes_dict, label, label_idx)}`` where ``hg`` is the
        ``pgl.HeterGraph``, ``nodes_dict`` maps original node id -> integer
        id, ``label`` maps original ``(src, dst)`` -> label and ``label_idx``
        is the same mapping keyed by integer ids.
    """
    # Load the features, keep the first row per drug id, neutralise NaN/inf.
    drug_feat = pd.read_csv(features, index_col=0)
    drug_feat = drug_feat[~drug_feat.index.duplicated()]
    drug_feat = drug_feat.fillna(0).replace([np.inf, -np.inf], 0)

    # Standardise; the scaler output is refilled with 0 wherever it is NaN.
    scaled_feat = pd.DataFrame(StandardScaler().fit_transform(drug_feat))
    scaled_feat = scaled_feat.fillna(0)
    scaled_feat.index = drug_feat.index

    # Only drugs that occur in the DDI data AND have features are usable.
    _, ddi_nodes = num_nodes_stat(ddi_data)
    selected_drugs_feat = scaled_feat[scaled_feat.index.isin(ddi_nodes)]
    ddi_nodes = set(selected_drugs_feat.index)

    edges = {'dds': [], 'dti': [], 'ppi': []}
    total_nodes = set()
    label = {}

    def _add_undirected(edge_type, src, dst):
        # Store the edge in both directions and register both endpoints.
        edges[edge_type].append((src, dst))
        edges[edge_type].append((dst, src))
        total_nodes.add(src)
        total_nodes.add(dst)

    for d in ddi_data:
        src, dst = d['pair'][0], d['pair'][1]
        if src in ddi_nodes and dst in ddi_nodes:
            _add_undirected('dds', src, dst)
            label[src, dst] = d['label']
            label[dst, src] = d['label']

    for d in dti_data:
        src, dst = d['pair'][0], d['pair'][1]
        if src in ddi_nodes:
            _add_undirected('dti', src, dst)

    for d in ppi_data:
        _add_undirected('ppi', d['pair'][0], d['pair'][1])

    num_nodes = len(total_nodes)
    # Set iteration order can change between interpreter runs; sorting makes
    # the id assignment reproducible.
    nodes_dict = {node: i for i, node in enumerate(sorted(total_nodes))}

    # Feature width follows the input table instead of a hard-coded 2325.
    feat_dim = selected_drugs_feat.shape[1]
    node_feat = np.zeros((num_nodes, feat_dim), dtype='float32')
    for drug, row in zip(selected_drugs_feat.index,
                         selected_drugs_feat.values):
        node_id = nodes_dict.get(drug)
        # A drug with features may have no retained edge at all; it is then
        # not a graph node and is skipped (this used to raise KeyError).
        if node_id is not None:
            node_feat[node_id, :] = row
    node_feats = {'features': node_feat}

    # Translate every edge to integer node ids.
    ek = {
        edge_type: [(nodes_dict[src], nodes_dict[dst]) for src, dst in pairs]
        for edge_type, pairs in edges.items()
    }

    # Ids starting with 'CID' are drugs; everything else is a protein.
    node_types = [(node_id, 'drug' if name.startswith('CID') else 'protein')
                  for name, node_id in nodes_dict.items()]

    hg = pgl.HeterGraph(
        num_nodes=num_nodes,
        edges=ek,
        node_types=node_types,
        node_feat=node_feats)

    label_idx = {(nodes_dict[src], nodes_dict[dst]): value
                 for (src, dst), value in label.items()}

    return {'rt': (hg, nodes_dict, label, label_idx)}