def check_score_func(func_name):
    """Check that the two negative-scoring paths of ``BaseKEModel`` agree.

    A random graph is generated for ``func_name``, edges are drawn with the
    DGL ``EdgeSampler`` in ``'PBG-head'`` negative mode, and for every batch
    the scores from ``model.predict_score`` on the negative subgraph are
    compared against ``model.predict_neg_score`` on the (positive, negative)
    pair.

    Parameters
    ----------
    func_name : str
        Key into ``ke_score_funcs`` selecting the score function to test.
        It is also forwarded to ``generate_rand_graph``.

    Raises
    ------
    AssertionError
        If the two score tensors differ beyond ``rtol=1e-5, atol=1e-5``.
    """
    batch_size = 10
    neg_sample_size = 10

    g, entity_emb, rel_emb = generate_rand_graph(100, func_name)
    # Not used below; the read is kept so the embedding is still required to
    # have a second dimension, exactly as before.
    hidden_dim = entity_emb.shape[1]

    model = BaseKEModel(ke_score_funcs[func_name], entity_emb, rel_emb)

    sampler_cls = getattr(dgl.contrib.sampling, 'EdgeSampler')
    sampler_options = dict(
        batch_size=batch_size,
        neg_sample_size=neg_sample_size,
        negative_mode='PBG-head',
        num_workers=1,
        shuffle=False,
        exclude_positive=False,
        return_false_neg=False,
    )

    for pos_graph, neg_graph in sampler_cls(g, **sampler_options):
        neg_graph = create_neg_subgraph(pos_graph, neg_graph, True, True,
                                        g.number_of_nodes())
        pos_graph.copy_from_parent()
        neg_graph.copy_from_parent()

        # Path 1: score every edge of the negative subgraph directly.
        per_edge = F.reshape(model.predict_score(neg_graph), (batch_size, -1))
        # Path 2: dedicated negative scoring over the (pos, neg) pair.
        batched = F.reshape(model.predict_neg_score(pos_graph, neg_graph),
                            (batch_size, -1))

        np.testing.assert_allclose(F.asnumpy(per_edge), F.asnumpy(batched),
                                   rtol=1e-5, atol=1e-5)
def knn_graphE(x, k, istrain=False):
    """Transforms the given point set to a directed graph, whose coordinates
    are given as a matrix. The predecessors of each point are its k-nearest
    neighbors.

    If a 3D tensor is given instead, then each row would be transformed into
    a separate graph.  The graphs will be unioned.

    Parameters
    ----------
    x : Tensor
        The input tensor.

        If 2D, each row of ``x`` corresponds to a node.

        If 3D, a k-NN graph would be constructed for each row.  Then
        the graphs are unioned.
    k : int
        The number of neighbors
    istrain : bool, optional
        When true, each call takes a randomized branch whenever
        ``np.random.rand() > 0.5``: the ``round(1.5 * k)`` nearest candidates
        are retrieved and ``k`` of them are kept -- candidate 0 (the closest
        one) plus ``k - 1`` others drawn at random without replacement.
        Otherwise the plain ``k`` nearest candidates are used.
        Default: False.

    Returns
    -------
    DGLGraph
        The graph.  The node IDs are in the same order as ``x``.
    """
    if F.ndim(x) == 2:
        x = F.unsqueeze(x, 0)
    n_samples, n_points, _ = F.shape(x)

    dist = pairwise_squared_distance(x)
    if istrain and np.random.rand() > 0.5:
        n_candidates = round(1.5 * k)
        k_indices = F.argtopk(dist, n_candidates, 2, descending=False)
        # Pick k - 1 random columns among candidates 1..n_candidates-1 ...
        rand_k = np.random.permutation(n_candidates - 1)[0:k - 1] + 1
        # ... and always keep candidate 0, the closest one.
        rand_k = np.append(rand_k, 0)
        k_indices = k_indices[:, :, rand_k]
    else:
        k_indices = F.argtopk(dist, k, 2, descending=False)

    dst = F.copy_to(k_indices, F.cpu())
    src = F.zeros_like(dst) + F.reshape(F.arange(0, n_points), (1, -1, 1))

    # Shift the node IDs of each sample so the per-sample graphs do not collide.
    per_sample_offset = F.reshape(
        F.arange(0, n_samples) * n_points, (-1, 1, 1))
    dst += per_sample_offset
    src += per_sample_offset
    dst = F.reshape(dst, (-1, ))
    src = F.reshape(src, (-1, ))

    # Pass the shape explicitly, as segmented_knn_graph does.  Without it
    # scipy infers the dimensions from the largest index present, so the
    # matrix can come out non-square when the highest-numbered point is never
    # selected as a neighbor (e.g. duplicated points tying at distance 0).
    n_total_points = n_samples * n_points
    adj = sparse.csr_matrix(
        (F.asnumpy(F.zeros_like(dst) + 1), (F.asnumpy(dst), F.asnumpy(src))),
        shape=(n_total_points, n_total_points))

    g = DGLGraph(adj, readonly=True)
    return g
def __call__(self, edges):
    """Combine source-node and edge features with ``self.mul_op``.

    Backends differ in their broadcasting semantics, so both operands are
    first reshaped to the same rank by appending trailing dimensions of
    length 1 to whichever one has the lower rank.

    Parameters
    ----------
    edges : edge batch
        Object exposing ``src`` and ``data`` mappings; ``self.src_field`` and
        ``self.edge_field`` are looked up in them respectively.

    Returns
    -------
    dict
        ``{self.out_field: result}``.
    """
    src_feat = edges.src[self.src_field]
    edge_feat = edges.data[self.edge_field]

    src_rank = F.ndim(src_feat)
    edge_rank = F.ndim(edge_feat)
    target_rank = max(src_rank, edge_rank)

    # Pad the shapes on the right with 1s up to the common rank.
    src_shape = F.shape(src_feat) + (1, ) * (target_rank - src_rank)
    edge_shape = F.shape(edge_feat) + (1, ) * (target_rank - edge_rank)
    src_feat = F.reshape(src_feat, src_shape)
    edge_feat = F.reshape(edge_feat, edge_shape)

    return {self.out_field: self.mul_op(src_feat, edge_feat)}
def segmented_knn_graph(x, k, segs):
    """Build the union of per-segment k-nearest-neighbor graphs.

    ``x`` holds several point sets concatenated along the first axis and
    ``segs`` gives the size of each of them.  A k-NN graph is computed
    independently inside every segment; the predecessors of each point are
    its k-nearest neighbors within its own segment.

    Parameters
    ----------
    x : Tensor
        The input tensor.
    k : int
        The number of neighbors
    segs : iterable of int
        Number of points of each point set.
        Must sum up to the number of rows in ``x``.

    Returns
    -------
    DGLGraph
        The graph.  The node IDs are in the same order as ``x``.
    """
    n_total_points, _ = F.shape(x)

    # Start row of every segment: [0, segs[0], segs[0] + segs[1], ...].
    segment_starts = np.insert(np.cumsum(segs), 0, 0)

    neighbor_ids = []
    for seg_id, seg_points in enumerate(F.split(x, segs, 0)):
        local_ids = F.argtopk(pairwise_squared_distance(seg_points), k, 1,
                              descending=False)
        # Convert segment-local neighbor IDs to global node IDs.
        neighbor_ids.append(local_ids + segment_starts[seg_id])

    dst = F.reshape(F.cat(neighbor_ids, 0), (-1, ))
    src = F.arange(0, n_total_points).unsqueeze(1).expand(n_total_points, k)
    src = F.reshape(src, (-1, ))

    edge_values = F.asnumpy(F.zeros_like(dst) + 1)
    edge_coords = (F.asnumpy(dst), F.asnumpy(src))
    # The shape is given explicitly so the adjacency matrix is always square.
    adj = sparse.csr_matrix(edge_values and (edge_values, edge_coords)
                            if False else (edge_values, edge_coords),
                            shape=(n_total_points, n_total_points))

    return DGLGraph(adj, readonly=True)
def __call__(self, edges):
    """Combine source-node features with edge features via ``self.mul_op``.

    A 1-D edge feature (one scalar per edge) is reshaped to
    ``(num_edges, 1, ..., 1)`` with the same rank as the source feature
    before the operation is applied.  Edge features of any other rank are
    used unchanged.

    Parameters
    ----------
    edges : edge batch
        Object exposing ``src`` and ``data`` mappings; ``self.src_field`` and
        ``self.edge_field`` are looked up in them respectively.

    Returns
    -------
    dict
        ``{self.out_field: result}``.
    """
    src_feat = edges.src[self.src_field]
    edge_feat = edges.data[self.edge_field]

    if F.ndim(edge_feat) == 1:
        # One scalar per edge: append length-1 dims to match the source rank.
        trailing_ones = (1, ) * (F.ndim(src_feat) - 1)
        edge_feat = F.reshape(edge_feat,
                              (F.shape(edge_feat)[0], ) + trailing_ones)

    return {self.out_field: self.mul_op(src_feat, edge_feat)}
def score(self, head, rel, tail, triplet_wise=False):
    """Score (head, rel, tail) combinations in chunks of ``self.batch_size``.

    Parameters
    ----------
    head, rel, tail : Tensor
        Inputs passed to ``self.entity_emb`` (head, tail) and
        ``self.relation_emb`` (rel) to obtain the embeddings.
    triplet_wise : bool, optional
        If True, the i-th head, relation and tail are scored together through
        ``self.score_func.edge_func``; all three are chunked with the bounds
        of ``head``.  If False, every head chunk is scored against every tail
        chunk (with all relations) through ``self.score_func.infer``.
        Default: False.

    Returns
    -------
    Tensor
        Scores copied to ``F.cpu()``.  For ``triplet_wise`` the per-chunk
        results concatenated along dim 0; otherwise a 1-D tensor of length
        ``num_head * num_rel * num_tail``.
    """
    head_emb = self.entity_emb(head)
    rel_emb = self.relation_emb(rel)
    tail_emb = self.entity_emb(tail)

    num_head = F.shape(head)[0]
    num_rel = F.shape(rel)[0]
    num_tail = F.shape(tail)[0]

    batch_size = self.batch_size
    n_head_chunks = (num_head + batch_size - 1) // batch_size
    score = []

    if triplet_wise:
        class FakeEdge(object):
            """Minimal stand-in exposing ``src``/``dst``/``data`` mappings."""

            def __init__(self, head_emb, rel_emb, tail_emb):
                self._hobj = {'emb': head_emb}
                self._robj = {'emb': rel_emb}
                self._tobj = {'emb': tail_emb}

            @property
            def src(self):
                return self._hobj

            @property
            def dst(self):
                return self._tobj

            @property
            def data(self):
                return self._robj

        for chunk in range(n_head_chunks):
            lo = chunk * batch_size
            hi = min(lo + batch_size, num_head)
            edata = FakeEdge(head_emb[lo:hi], rel_emb[lo:hi], tail_emb[lo:hi])
            chunk_score = self.score_func.edge_func(edata)['score']
            score.append(F.copy_to(chunk_score, F.cpu()))

        return F.cat(score, dim=0)

    n_tail_chunks = (num_tail + batch_size - 1) // batch_size
    for h_chunk in range(n_head_chunks):
        h_lo = h_chunk * batch_size
        h_hi = min(h_lo + batch_size, num_head)
        sh_emb = head_emb[h_lo:h_hi]

        row_scores = []
        for t_chunk in range(n_tail_chunks):
            t_lo = t_chunk * batch_size
            t_hi = min(t_lo + batch_size, num_tail)
            st_emb = tail_emb[t_lo:t_hi]
            block = self.score_func.infer(sh_emb, rel_emb, st_emb)
            row_scores.append(F.copy_to(block, F.cpu()))

        # Tail chunks are stitched along dim 2, head chunks along dim 0.
        score.append(F.cat(row_scores, dim=2))

    score = F.cat(score, dim=0)
    return F.reshape(score, (num_head * num_rel * num_tail, ))
def topK(self, head=None, tail=None, bcast=False, pair_ws=False, k=10):
    """Return the highest-scoring (head, tail) pairs under ``self.sim_func``.

    Embeddings are read from ``self.emb`` and scored in chunks of
    ``self.batch_size`` on ``self.device``; scores are moved to ``F.cpu()``
    and ranked in descending order.

    Parameters
    ----------
    head : array-like or None
        Head indices.  ``None`` selects every row of ``self.emb``.
    tail : array-like or None
        Tail indices.  ``None`` selects every row of ``self.emb``.
    bcast : bool, optional
        Only read when ``pair_ws`` is not ``True``.  If ``False`` a single
        global top-k over all head x tail combinations is returned;
        otherwise a separate top-k over the tails is returned for every
        head.  Default: False.
    pair_ws : bool, optional
        If ``True``, ``head[i]`` is scored only against ``tail[i]``
        (``sim_func`` is called with ``pw=True`` and both are chunked with
        the bounds of ``head``).  Default: False.
    k : int, optional
        Number of results to keep.  Default: 10.

    Returns
    -------
    list of tuple
        Tuples ``(head_ids, tail_ids, scores)`` of numpy arrays.  One tuple
        for the pairwise and global modes, one tuple per head when
        broadcasting.  The three arrays of a tuple have the same length,
        which is smaller than ``k`` when fewer candidates exist.
    """
    if head is None:
        head = F.arange(0, self.emb.shape[0])
    else:
        head = F.tensor(head)
    if tail is None:
        tail = F.arange(0, self.emb.shape[0])
    else:
        tail = F.tensor(tail)

    head_emb = self.emb[head]
    tail_emb = self.emb[tail]

    num_head = head.shape[0]
    num_tail = tail.shape[0]
    batch_size = self.batch_size
    n_head_chunks = (num_head + batch_size - 1) // batch_size

    result = []
    score = []

    if pair_ws is True:
        # Pairwise: head and tail are aligned, so both use the head bounds.
        for i in range(n_head_chunks):
            lo = i * batch_size
            hi = min(lo + batch_size, num_head)
            sh_emb = F.copy_to(head_emb[lo:hi], self.device)
            st_emb = F.copy_to(tail_emb[lo:hi], self.device)
            score.append(
                F.copy_to(self.sim_func(sh_emb, st_emb, pw=True), F.cpu()))
        score = F.cat(score, dim=0)

        sidx = F.argsort(score, dim=0, descending=True)
        sidx = sidx[:k]
        score = score[sidx]
        result.append((F.asnumpy(head[sidx]),
                       F.asnumpy(tail[sidx]),
                       F.asnumpy(score)))
        return result

    # All-pairs: build the (num_head, num_tail) score matrix block by block.
    n_tail_chunks = (num_tail + batch_size - 1) // batch_size
    for i in range(n_head_chunks):
        h_lo = i * batch_size
        h_hi = min(h_lo + batch_size, num_head)
        sh_emb = F.copy_to(head_emb[h_lo:h_hi], self.device)

        s_score = []
        for j in range(n_tail_chunks):
            t_lo = j * batch_size
            t_hi = min(t_lo + batch_size, num_tail)
            st_emb = F.copy_to(tail_emb[t_lo:t_hi], self.device)
            s_score.append(F.copy_to(self.sim_func(sh_emb, st_emb), F.cpu()))
        score.append(F.cat(s_score, dim=1))
    score = F.cat(score, dim=0)

    if bcast is False:
        # Global top-k over the flattened score matrix.
        idx = F.arange(0, num_head * num_tail)
        score = F.reshape(score, (num_head * num_tail, ))

        sidx = F.argsort(score, dim=0, descending=True)
        sidx = sidx[:k]
        score = score[sidx]
        idx = idx[sidx]

        # Recover (row, column) = (head, tail) from the flat position.
        tail_idx = idx % num_tail
        idx = floor_divide(idx, num_tail)
        head_idx = idx % num_head

        result.append((F.asnumpy(head[head_idx]),
                       F.asnumpy(tail[tail_idx]),
                       F.asnumpy(score)))
    else:
        # Broadcast at head: an independent top-k over the tails per head.
        for i in range(num_head):
            i_score = score[i]

            sidx = F.argsort(i_score, dim=0, descending=True)
            idx = F.arange(0, num_tail)
            i_idx = sidx[:k]
            i_score = i_score[i_idx]
            idx = idx[i_idx]

            # Size the head column by the number of hits actually kept, so it
            # stays aligned with the tail and score columns when k > num_tail.
            n_hits = i_idx.shape[0]
            result.append((np.full((n_hits,), F.asnumpy(head[i])),
                           F.asnumpy(tail[idx]),
                           F.asnumpy(i_score)))

    return result
def ACNN_graph_construction_and_featurization(ligand_mol,
                                              protein_mol,
                                              ligand_coordinates,
                                              protein_coordinates,
                                              max_num_ligand_atoms=None,
                                              max_num_protein_atoms=None,
                                              neighbor_cutoff=12.,
                                              max_num_neighbors=12,
                                              strip_hydrogens=False):
    """Graph construction and featurization for `Atomic Convolutional Networks for
    Predicting Protein-Ligand Binding Affinity <https://arxiv.org/abs/1703.10603>`__.

    Parameters
    ----------
    ligand_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    protein_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    ligand_coordinates : Float Tensor of shape (V1, 3)
        Atom coordinates in a ligand.
    protein_coordinates : Float Tensor of shape (V2, 3)
        Atom coordinates in a protein.
    max_num_ligand_atoms : int or None
        Maximum number of atoms in ligands for zero padding, which should be no smaller than
        ligand_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    max_num_protein_atoms : int or None
        Maximum number of atoms in proteins for zero padding, which should be no smaller than
        protein_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    neighbor_cutoff : float
        Distance cutoff to define 'neighboring'. Default to 12.
    max_num_neighbors : int
        Maximum number of neighbors allowed for each atom. Default to 12.
    strip_hydrogens : bool
        Whether to exclude hydrogen atoms. Default to False.

    Returns
    -------
    Heterogeneous graph built by ``heterograph`` with node types
    ``'ligand_atom'`` and ``'protein_atom'`` and six edge types: ``'ligand'``
    and ``'protein'`` (neighbors computed within each molecule alone) plus the
    four ``'complex'`` combinations (neighbors computed on the concatenated
    coordinates).  Every edge type carries a float32 ``'distance'`` feature
    reshaped to ``(-1, 1)``; every node type carries float32
    ``'atomic_number'`` and ``'mask'`` features with one column, where padded
    nodes have atomic number 0 and mask 0.
    """
    assert ligand_coordinates is not None, 'Expect ligand_coordinates to be provided.'
    assert protein_coordinates is not None, 'Expect protein_coordinates to be provided.'
    if max_num_ligand_atoms is not None:
        assert max_num_ligand_atoms >= ligand_mol.GetNumAtoms(), \
            'Expect max_num_ligand_atoms to be no smaller than ligand_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_ligand_atoms, ligand_mol.GetNumAtoms())
    if max_num_protein_atoms is not None:
        assert max_num_protein_atoms >= protein_mol.GetNumAtoms(), \
            'Expect max_num_protein_atoms to be no smaller than protein_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_protein_atoms, protein_mol.GetNumAtoms())

    if strip_hydrogens:
        # Remove hydrogen atoms and their corresponding coordinates
        ligand_atom_indices_left = filter_out_hydrogens(ligand_mol)
        protein_atom_indices_left = filter_out_hydrogens(protein_mol)
        ligand_coordinates = ligand_coordinates.take(ligand_atom_indices_left, axis=0)
        protein_coordinates = protein_coordinates.take(protein_atom_indices_left, axis=0)
    else:
        ligand_atom_indices_left = list(range(ligand_mol.GetNumAtoms()))
        protein_atom_indices_left = list(range(protein_mol.GetNumAtoms()))

    # Compute number of nodes for each type (padded size when requested)
    if max_num_ligand_atoms is None:
        num_ligand_atoms = len(ligand_atom_indices_left)
    else:
        num_ligand_atoms = max_num_ligand_atoms

    if max_num_protein_atoms is None:
        num_protein_atoms = len(protein_atom_indices_left)
    else:
        num_protein_atoms = max_num_protein_atoms

    data_dict = dict()
    num_nodes_dict = dict()

    # graph data for atoms in the ligand
    ligand_srcs, ligand_dsts, ligand_dists = k_nearest_neighbors(
        ligand_coordinates, neighbor_cutoff, max_num_neighbors)
    data_dict[('ligand_atom', 'ligand', 'ligand_atom')] = (ligand_srcs, ligand_dsts)
    num_nodes_dict['ligand_atom'] = num_ligand_atoms

    # graph data for atoms in the protein
    protein_srcs, protein_dsts, protein_dists = k_nearest_neighbors(
        protein_coordinates, neighbor_cutoff, max_num_neighbors)
    data_dict[('protein_atom', 'protein', 'protein_atom')] = (protein_srcs, protein_dsts)
    num_nodes_dict['protein_atom'] = num_protein_atoms

    # 4 graphs for complex representation, including the connection within
    # protein atoms, the connection within ligand atoms and the connection between
    # protein and ligand atoms.
    complex_srcs, complex_dsts, complex_dists = k_nearest_neighbors(
        np.concatenate([ligand_coordinates, protein_coordinates]),
        neighbor_cutoff, max_num_neighbors)
    complex_srcs = np.array(complex_srcs)
    complex_dsts = np.array(complex_dsts)
    complex_dists = np.array(complex_dists)

    # In the concatenated coordinates the protein atoms start right after the
    # real ligand atoms.  The boundary must therefore be the number of ligand
    # coordinate rows, not the zero-padded node count: using the padded count
    # would label protein atoms as ligand atoms and shift the protein IDs
    # whenever max_num_ligand_atoms exceeds the real ligand size.
    offset = len(ligand_coordinates)

    src_is_ligand = complex_srcs < offset
    dst_is_ligand = complex_dsts < offset
    src_is_protein = complex_srcs >= offset
    dst_is_protein = complex_dsts >= offset

    # ('ligand_atom', 'complex', 'ligand_atom')
    inter_ligand_indices = (src_is_ligand & dst_is_ligand).nonzero()[0]
    data_dict[('ligand_atom', 'complex', 'ligand_atom')] = \
        (complex_srcs[inter_ligand_indices].tolist(),
         complex_dsts[inter_ligand_indices].tolist())

    # ('protein_atom', 'complex', 'protein_atom')
    inter_protein_indices = (src_is_protein & dst_is_protein).nonzero()[0]
    data_dict[('protein_atom', 'complex', 'protein_atom')] = \
        ((complex_srcs[inter_protein_indices] - offset).tolist(),
         (complex_dsts[inter_protein_indices] - offset).tolist())

    # ('ligand_atom', 'complex', 'protein_atom')
    ligand_protein_indices = (src_is_ligand & dst_is_protein).nonzero()[0]
    data_dict[('ligand_atom', 'complex', 'protein_atom')] = \
        (complex_srcs[ligand_protein_indices].tolist(),
         (complex_dsts[ligand_protein_indices] - offset).tolist())

    # ('protein_atom', 'complex', 'ligand_atom')
    protein_ligand_indices = (src_is_protein & dst_is_ligand).nonzero()[0]
    data_dict[('protein_atom', 'complex', 'ligand_atom')] = \
        ((complex_srcs[protein_ligand_indices] - offset).tolist(),
         complex_dsts[protein_ligand_indices].tolist())

    g = heterograph(data_dict, num_nodes_dict=num_nodes_dict)

    # Edge features: neighbor distances as float32 column vectors
    g.edges['ligand'].data['distance'] = F.reshape(F.zerocopy_from_numpy(
        np.array(ligand_dists).astype(np.float32)), (-1, 1))
    g.edges['protein'].data['distance'] = F.reshape(F.zerocopy_from_numpy(
        np.array(protein_dists).astype(np.float32)), (-1, 1))
    g.edges[('ligand_atom', 'complex', 'ligand_atom')].data['distance'] = \
        F.reshape(F.zerocopy_from_numpy(
            complex_dists[inter_ligand_indices].astype(np.float32)), (-1, 1))
    g.edges[('protein_atom', 'complex', 'protein_atom')].data['distance'] = \
        F.reshape(F.zerocopy_from_numpy(
            complex_dists[inter_protein_indices].astype(np.float32)), (-1, 1))
    g.edges[('ligand_atom', 'complex', 'protein_atom')].data['distance'] = \
        F.reshape(F.zerocopy_from_numpy(
            complex_dists[ligand_protein_indices].astype(np.float32)), (-1, 1))
    g.edges[('protein_atom', 'complex', 'ligand_atom')].data['distance'] = \
        F.reshape(F.zerocopy_from_numpy(
            complex_dists[protein_ligand_indices].astype(np.float32)), (-1, 1))

    # Get atomic numbers for all atoms left and set node features
    ligand_atomic_numbers = np.array(get_atomic_numbers(ligand_mol, ligand_atom_indices_left))
    # zero padding
    ligand_atomic_numbers = np.concatenate([
        ligand_atomic_numbers, np.zeros(num_ligand_atoms - len(ligand_atom_indices_left))])
    protein_atomic_numbers = np.array(get_atomic_numbers(protein_mol, protein_atom_indices_left))
    # zero padding
    protein_atomic_numbers = np.concatenate([
        protein_atomic_numbers, np.zeros(num_protein_atoms - len(protein_atom_indices_left))])

    g.nodes['ligand_atom'].data['atomic_number'] = F.reshape(F.zerocopy_from_numpy(
        ligand_atomic_numbers.astype(np.float32)), (-1, 1))
    g.nodes['protein_atom'].data['atomic_number'] = F.reshape(F.zerocopy_from_numpy(
        protein_atomic_numbers.astype(np.float32)), (-1, 1))

    # Prepare mask indicating the existence of nodes
    ligand_masks = np.zeros((num_ligand_atoms, 1))
    ligand_masks[:len(ligand_atom_indices_left), :] = 1
    g.nodes['ligand_atom'].data['mask'] = F.zerocopy_from_numpy(
        ligand_masks.astype(np.float32))
    protein_masks = np.zeros((num_protein_atoms, 1))
    protein_masks[:len(protein_atom_indices_left), :] = 1
    g.nodes['protein_atom'].data['mask'] = F.zerocopy_from_numpy(
        protein_masks.astype(np.float32))

    return g
def XYZ_graph_construction_and_featurization(protein_mol,
                                             protein_coordinates,
                                             max_num_protein_atoms=None,
                                             neighbor_cutoff=12.,
                                             max_num_neighbors=12,
                                             strip_hydrogens=False):
    """Build and featurize a single neighbor graph over the atoms of one molecule.

    This is the single-molecule counterpart of the graph construction used for
    `Atomic Convolutional Networks for Predicting Protein-Ligand Binding
    Affinity <https://arxiv.org/abs/1703.10603>`__: only the ``'protein'``
    graph is built, there is no ligand and no complex graph.

    Parameters
    ----------
    protein_mol : molecule object
        Molecule passed to ``filter_out_hydrogens`` and ``get_atomic_numbers``.
        Its ``n_atoms`` attribute is read when ``strip_hydrogens`` is False.
        NOTE(review): this was previously documented as
        ``rdkit.Chem.rdchem.Mol``, which does not match the ``n_atoms``
        access -- confirm the expected type against the callers.
    protein_coordinates : Float Tensor of shape (V2, 3)
        Atom coordinates in a protein.
    max_num_protein_atoms : int or None
        Maximum number of atoms in proteins for zero padding, which should be
        no smaller than the number of atoms kept if not None. If None, no zero
        padding will be performed. Default to None.
    neighbor_cutoff : float
        Distance cutoff to define 'neighboring'. Default to 12.
    max_num_neighbors : int
        Maximum number of neighbors allowed for each atom. Default to 12.
    strip_hydrogens : bool
        Whether to exclude hydrogen atoms. Default to False.

    Returns
    -------
    Graph built by ``graph`` with node type ``'protein_atom'`` and edge type
    ``'protein'``.  Edges carry a float32 ``'distance'`` feature reshaped to
    ``(-1, 1)``; nodes carry float32 ``'atomic_number'`` and ``'mask'``
    features with one column, where padded nodes have atomic number 0 and
    mask 0.

    Raises
    ------
    ValueError
        If ``max_num_protein_atoms`` is smaller than the number of atoms kept.
    """
    assert protein_coordinates is not None, 'Expect protein_coordinates to be provided.'

    if strip_hydrogens:
        # Remove hydrogen atoms and their corresponding coordinates
        protein_atom_indices_left = filter_out_hydrogens(protein_mol)
        protein_coordinates = protein_coordinates.take(protein_atom_indices_left, axis=0)
    else:
        protein_atom_indices_left = list(range(protein_mol.n_atoms))

    # Compute the number of nodes (padded size when requested)
    num_atoms_left = len(protein_atom_indices_left)
    if max_num_protein_atoms is None:
        num_protein_atoms = num_atoms_left
    else:
        # Fail early with a clear message; otherwise the zero padding below
        # is asked for a negative number of entries.
        if max_num_protein_atoms < num_atoms_left:
            raise ValueError(
                'Expect max_num_protein_atoms to be no smaller than the number of '
                'atoms kept, got {:d} and {:d}'.format(
                    max_num_protein_atoms, num_atoms_left))
        num_protein_atoms = max_num_protein_atoms

    # Construct graph for atoms in the protein
    protein_srcs, protein_dsts, protein_dists = k_nearest_neighbors(
        protein_coordinates, neighbor_cutoff, max_num_neighbors)
    protein_graph = graph((protein_srcs, protein_dsts),
                          'protein_atom', 'protein', num_protein_atoms)
    protein_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        np.array(protein_dists).astype(np.float32)), (-1, 1))

    g = protein_graph

    # Get atomic numbers for all atoms left and set node features
    protein_atomic_numbers = np.array(get_atomic_numbers(protein_mol, protein_atom_indices_left))
    # zero padding
    protein_atomic_numbers = np.concatenate([
        protein_atomic_numbers, np.zeros(num_protein_atoms - num_atoms_left)])

    g.nodes['protein_atom'].data['atomic_number'] = F.reshape(F.zerocopy_from_numpy(
        protein_atomic_numbers.astype(np.float32)), (-1, 1))

    # Prepare mask indicating the existence of nodes
    protein_masks = np.zeros((num_protein_atoms, 1))
    protein_masks[:num_atoms_left, :] = 1
    g.nodes['protein_atom'].data['mask'] = F.zerocopy_from_numpy(
        protein_masks.astype(np.float32))

    return g