def featurize_binding_pocket_sybyl(protein_xyz,
                                   protein,
                                   ligand_xyz,
                                   ligand,
                                   pairwise_distances=None,
                                   cutoff=7.0):
    """Build Sybyl dicts for the protein binding pocket and for the ligand.

    Protein atoms are restricted to those lying closer than `cutoff` to at
    least one ligand atom; the ligand is always processed in full.

    Parameters
    ----------
    protein_xyz: np.ndarray
        Of shape (N_protein_atoms, 3)
    protein: Rdkit Molecule
        Contains more metadata.
    ligand_xyz: np.ndarray
        Of shape (N_ligand_atoms, 3)
    ligand: Rdkit Molecule
        Contains more metadata
    pairwise_distances: np.ndarray
        Array of pairwise protein-ligand distances (Angstroms). Computed from
        `protein_xyz` and `ligand_xyz` when left as None.
    cutoff: float
        Cutoff distance for contact consideration.

    Returns
    -------
    tuple
        `(protein_sybyl_dict, ligand_sybyl_dict)` as produced by
        `compute_all_sybyl`.
    """
    if pairwise_distances is None:
        pairwise_distances = compute_pairwise_distances(protein_xyz,
                                                        ligand_xyz)

    # The first index array returned by np.nonzero addresses the protein axis
    # of the distance matrix.
    pocket_rows = np.nonzero(pairwise_distances < cutoff)[0]
    pocket_atoms = {int(row) for row in pocket_rows.tolist()}

    pocket_sybyl = compute_all_sybyl(protein, indices=pocket_atoms)
    ligand_sybyl = compute_all_sybyl(ligand)
    return (pocket_sybyl, ligand_sybyl)
def _featurize(self, complex: Tuple[str, str]):
    """
    Compute featurization for a molecular complex

    Every unordered pair of fragments in the complex contributes one hashed
    ECFP vector per dictionary returned by `featurize_contacts_ecfp`; all
    vectors are concatenated into a single array.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    The concatenated fingerprint array, or None when the complex cannot be
    loaded.
    """
    try:
        fragments = load_complex(complex, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    fingerprints = []
    for first, second in itertools.combinations(fragments, 2):
        # Element 0 of a fragment holds its coordinates.
        contact_distances = compute_pairwise_distances(first[0], second[0])
        ecfp_dicts = featurize_contacts_ecfp(first,
                                             second,
                                             contact_distances,
                                             cutoff=self.cutoff,
                                             ecfp_degree=self.radius)
        for ecfp_dict in ecfp_dicts:
            fingerprints.append(
                vectorize(hash_ecfp, feature_dict=ecfp_dict, size=self.size))
    return np.concatenate(fingerprints)
def _featurize(self, mol_pdb: str, complex_pdb: str):
    """
    Compute featurization for a molecular complex

    Every unordered pair of fragments contributes one hashed vector per
    dictionary returned by `featurize_splif`; all vectors are concatenated
    into a single array.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    complex_pdb: str
        Filename for protein molecule

    Returns
    -------
    The concatenated fingerprint array, or None when the complex cannot be
    loaded.
    """
    try:
        fragments = load_complex((mol_pdb, complex_pdb), add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    fingerprints = []
    for first, second in itertools.combinations(fragments, 2):
        # Element 0 of a fragment holds its coordinates.
        contact_distances = compute_pairwise_distances(first[0], second[0])
        splif_dicts = featurize_splif(first, second, self.contact_bins,
                                      contact_distances, self.radius)
        for splif_dict in splif_dicts:
            fingerprints.append(
                vectorize(hash_ecfp_pair,
                          feature_dict=splif_dict,
                          size=self.size))
    return np.concatenate(fingerprints)
def test_compute_pairwise_distances(self):
    """Check output shape, value range and metric of `compute_pairwise_distances`."""
    n1 = 10
    n2 = 50
    coords1 = np.random.rand(n1, 3)
    coords2 = np.random.rand(n2, 3)

    distance = compute_pairwise_distances(coords1, coords2)
    self.assertEqual(distance.shape, (n1, n2))
    self.assertTrue((distance >= 0).all())
    # Random coords lie in the 3-D unit cube, so the largest possible
    # distance is the cube diagonal sqrt(3). Using sqrt(2), the diagonal of
    # a unit *square*, made this check fail intermittently.
    self.assertTrue((distance <= 3.0**0.5).all())

    # check if correct distance metric was used
    coords1 = np.array([[0, 0, 0], [1, 0, 0]])
    coords2 = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]])

    distance = compute_pairwise_distances(coords1, coords2)
    self.assertTrue((distance == [[1, 2, 3], [0, 1, 2]]).all())
def _featurize(self, datapoint, **kwargs):  # -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    For every unordered pair of fragments, each hydrogen-bond list returned
    by `compute_hydrogen_bonds` is voxelized over both fragments' centred
    coordinates, and the resulting grids are concatenated on the last axis.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated keyword ``complex`` is inspected. When present it
        replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    The concatenated voxel features, or None when the complex cannot be
    loaded.
    """
    if 'complex' in kwargs:
        # Imported locally so this method does not depend on a module-level
        # import of `warnings`.
        import warnings
        datapoint = kwargs.get("complex")
        # Emit a warning instead of raising: raising made the assignment
        # above dead code and turned a deprecation notice into a hard error.
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    try:
        fragments = rdkit_utils.load_complex(datapoint, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # The centroid is taken from the full complex, before any reduction to
    # contact atoms.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(
            fragments, self.cutoff)

    # We compute pairwise contact fingerprints
    for (frag1_ind,
         frag2_ind) in itertools.combinations(range(len(fragments)), 2):
        frag1, frag2 = fragments[frag1_ind], fragments[frag2_ind]
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        frag1_xyz = subtract_centroid(frag1[0], centroid)
        frag2_xyz = subtract_centroid(frag2[0], centroid)
        xyzs = [frag1_xyz, frag2_xyz]
        pairwise_features.append(
            np.concatenate([
                sum([
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=None,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             coordinates=xyz,
                             feature_list=hbond_list,
                             nb_channel=1) for xyz in xyzs
                ]) for hbond_list in compute_hydrogen_bonds(
                    frag1, frag2, distances, self.distance_bins,
                    self.angle_cutoffs)
            ],
                           axis=-1))
    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, mol_pdb: str, protein_pdb: str):
    """
    Compute featurization for a molecular complex

    For every unordered pair of fragments the ECFP contact dictionaries are
    voxelized on the fragments' centred coordinates and summed into one
    grid. The per-pair grids are then either flattened and joined
    (`self.flatten`) or concatenated on the last axis.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    protein_pdb: str
        Filename for protein molecule

    Returns
    -------
    The combined voxel features, or None when the complex cannot be loaded.
    """
    try:
        fragments = load_complex((mol_pdb, protein_pdb), add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    grids: List[np.ndarray] = []
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    for first, second in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(first[0], second[0])
        centred = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]
        ecfp_dicts = featurize_contacts_ecfp(first,
                                             second,
                                             distances,
                                             cutoff=self.cutoff,
                                             ecfp_degree=self.radius)
        # One voxel grid per (coordinates, ECFP dict) pairing.
        per_fragment = [
            voxelize(convert_atom_to_voxel,
                     xyz,
                     self.box_width,
                     self.voxel_width,
                     hash_function=hash_ecfp,
                     feature_dict=ecfp_dict,
                     nb_channel=self.size)
            for xyz, ecfp_dict in zip(centred, ecfp_dicts)
        ]
        grids.append(sum(per_fragment))

    if not self.flatten:
        # Features are of shape (voxels_per_edge, voxels_per_edge,
        # voxels_per_edge, num_feat) so we should concatenate on the last
        # axis.
        return np.concatenate(grids, axis=-1)
    return np.concatenate([grid.flatten() for grid in grids])
def _featurize(self, mol_pdb: str, protein_pdb: str) -> np.ndarray:
    """
    Compute featurization for a single mol/protein complex

    For every unordered pair of fragments, each hydrogen-bond list returned
    by `compute_hydrogen_bonds` is voxelized over both fragments' centred
    coordinates; the grids are concatenated on the last axis.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    protein_pdb: str
        Filename for protein molecule

    Returns
    -------
    The concatenated voxel features. None is returned when the complex
    cannot be loaded.
    """
    try:
        fragments = rdkit_utils.load_complex((mol_pdb, protein_pdb),
                                             add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning("This molecule cannot be loaded by Rdkit. Returning None")
        return None

    # The centroid is taken from the full complex, before any reduction to
    # contact atoms.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(fragments, self.cutoff)

    grids = []
    pair_indices = itertools.combinations(range(len(fragments)), 2)
    for first_ind, second_ind in pair_indices:
        first = fragments[first_ind]
        second = fragments[second_ind]
        distances = compute_pairwise_distances(first[0], second[0])
        centred = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]

        per_bin = []
        for hbond_list in compute_hydrogen_bonds(first, second, distances,
                                                 self.distance_bins,
                                                 self.angle_cutoffs):
            # Sum the grids obtained from each fragment's coordinates.
            per_bin.append(
                sum([
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=None,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             coordinates=xyz,
                             feature_list=hbond_list,
                             nb_channel=1) for xyz in centred
                ]))
        grids.append(np.concatenate(per_bin, axis=-1))

    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(grids, axis=-1)
def _featurize(self, complex: Tuple[str, str]) -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    For every unordered pair of fragments the salt bridges reported by
    `compute_salt_bridges` are voxelized over both fragments' centred
    coordinates and summed; the per-pair grids are concatenated on the last
    axis.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    Optional[np.ndarray]
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    try:
        fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # The centroid is taken from the full complex, before any reduction to
    # contact atoms.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(
            fragments, self.cutoff)

    # We compute pairwise contact fingerprints
    for frag1, frag2 in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        xyzs = [
            subtract_centroid(frag1[0], centroid),
            subtract_centroid(frag2[0], centroid),
        ]
        # The salt-bridge list does not depend on which coordinate set is
        # being voxelized, so compute it once per fragment pair instead of
        # once per coordinate set.
        # NOTE(review): assumes `voxelize` does not mutate `feature_list`.
        salt_bridges = compute_salt_bridges(frag1[1],
                                            frag2[1],
                                            distances,
                                            cutoff=self.cutoff)
        pairwise_features.append(
            sum([
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=None,
                         coordinates=xyz,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         feature_list=salt_bridges,
                         nb_channel=1) for xyz in xyzs
            ]))
    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, datapoint, **kwargs):
    """
    Compute featurization for a molecular complex

    For every unordered pair of fragments, each dictionary returned by
    `featurize_splif` is voxelized over the pair's centred coordinates; the
    grids are concatenated on the last axis.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated keyword ``complex`` is inspected. When present it
        replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    The concatenated voxel features, or None when the complex cannot be
    loaded.
    """
    if 'complex' in kwargs:
        # Imported locally so this method does not depend on a module-level
        # import of `warnings`.
        import warnings
        datapoint = kwargs.get("complex")
        # Emit a warning instead of raising: raising made the assignment
        # above dead code and turned a deprecation notice into a hard error.
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    try:
        fragments = load_complex(datapoint, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    for (frag1, frag2) in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        frag1_xyz = subtract_centroid(frag1[0], centroid)
        frag2_xyz = subtract_centroid(frag2[0], centroid)
        # Both coordinate sets are handed to voxelize together.
        xyzs = [frag1_xyz, frag2_xyz]
        pairwise_features.append(
            np.concatenate([
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=hash_ecfp_pair,
                         coordinates=xyzs,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         feature_dict=splif_dict,
                         nb_channel=self.size)
                for splif_dict in featurize_splif(
                    frag1, frag2, self.contact_bins, distances, self.radius)
            ],
                           axis=-1))
    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def get_contact_atom_indices(fragments: List[Tuple[np.ndarray, RDKitMol]],
                             cutoff: float = 4.5) -> List[List[int]]:
    """Compute the atoms close to the contact region.

    Molecular complexes can get very large, which makes it unwieldy to
    compute functions on them. Trimming atoms that are far from any contact
    region reduces memory usage. This function computes pairwise distances
    between every pair of molecules in the complex; an atom closer than
    `cutoff` to any atom of another molecule counts as a contact atom, and
    all other atoms are left out of the result.

    Parameters
    ----------
    fragments: List[Tuple[np.ndarray, RDKit Mol]]
        As returned by `rdkit_utils.load_complex`, a list of tuples of
        `(coords, mol)` where `coords` is a `(N_atoms, 3)` array and `mol` is
        the rdkit molecule object.
    cutoff: float, optional (default 4.5)
        The cutoff distance in angstroms.

    Returns
    -------
    List[List[int]]
        A list of length `len(fragments)`. Each entry is the sorted list of
        atom indices of that molecule which should be kept.
    """
    # One accumulator set per fragment; each is a distinct object.
    contact_sets: List[Set[int]] = [set() for _ in fragments]

    indexed = enumerate(fragments)
    for (i, frag_i), (j, frag_j) in itertools.combinations(indexed, 2):
        dists = compute_pairwise_distances(frag_i[0], frag_j[0])
        # np.nonzero yields (row_indices, col_indices): rows address atoms
        # of frag_i, columns address atoms of frag_j.
        close = np.nonzero(dists < cutoff)
        contact_sets[i] |= {int(row) for row in close[0].tolist()}
        contact_sets[j] |= {int(col) for col in close[1].tolist()}

    return [sorted(atoms) for atoms in contact_sets]
def featurize_contacts_ecfp(
        frag1: Tuple,
        frag2: Tuple,
        pairwise_distances: np.ndarray = None,
        cutoff: float = 4.5,
        ecfp_degree: int = 2) -> Tuple[Dict[int, str], Dict[int, str]]:
    """Computes ECFP dicts for the contact atoms of two molecular fragments.

    Only atoms that lie closer than `cutoff` to some atom of the other
    fragment are passed to `compute_all_ecfp`.

    Parameters
    ----------
    frag1: Tuple
        A tuple of (coords, mol) returned by `load_molecule`.
    frag2: Tuple
        A tuple of (coords, mol) returned by `load_molecule`.
    pairwise_distances: np.ndarray
        Array of pairwise fragment-fragment distances (Angstroms). Computed
        from the fragment coordinates when left as None.
    cutoff: float
        Cutoff distance for contact consideration
    ecfp_degree: int
        ECFP radius

    Returns
    -------
    Tuple of dictionaries of ECFP contact fragments
    """
    if pairwise_distances is None:
        pairwise_distances = compute_pairwise_distances(frag1[0], frag2[0])

    # np.nonzero yields (row_indices, col_indices): rows address frag1 atoms
    # and columns address frag2 atoms.
    in_contact = np.nonzero(pairwise_distances < cutoff)
    first_atoms = {int(row) for row in in_contact[0].tolist()}
    second_atoms = {int(col) for col in in_contact[1].tolist()}

    first_ecfp = compute_all_ecfp(frag1[1],
                                  indices=first_atoms,
                                  degree=ecfp_degree)
    second_ecfp = compute_all_ecfp(frag2[1],
                                   indices=second_atoms,
                                   degree=ecfp_degree)
    return (first_ecfp, second_ecfp)
def _featurize(self, mol_pdb: str, complex_pdb: str):
    """
    Compute featurization for a molecular complex

    For every unordered pair of fragments, each dictionary returned by
    `featurize_splif` is voxelized over the pair's centred coordinates; the
    grids are concatenated on the last axis.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    complex_pdb: str
        Filename for protein molecule

    Returns
    -------
    The concatenated voxel features, or None when the complex cannot be
    loaded.
    """
    try:
        fragments = load_complex((mol_pdb, complex_pdb), add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    grids = []
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    for first, second in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(first[0], second[0])
        # Both coordinate sets are handed to voxelize together.
        centred = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]

        per_bin = []
        for splif_dict in featurize_splif(first, second, self.contact_bins,
                                          distances, self.radius):
            per_bin.append(
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=hash_ecfp_pair,
                         coordinates=centred,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         feature_dict=splif_dict,
                         nb_channel=self.size))
        grids.append(np.concatenate(per_bin, axis=-1))

    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(grids, axis=-1)
def _featurize(self, complex: Tuple[str, str]) -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    For every unordered pair of fragments, the length of each hydrogen-bond
    list returned by `compute_hydrogen_bonds` is recorded; all counts are
    concatenated into one array.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    Optional[np.ndarray]
        The concatenated counts, or None when the complex cannot be loaded.
    """
    try:
        fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(
            fragments, self.cutoff)

    counts_per_pair = []
    pair_indices = itertools.combinations(range(len(fragments)), 2)
    for first_ind, second_ind in pair_indices:
        first = fragments[first_ind]
        second = fragments[second_ind]
        distances = compute_pairwise_distances(first[0], second[0])

        # Each list contributes a one-element array holding its length.
        bin_counts = []
        for hbond_list in compute_hydrogen_bonds(first, second, distances,
                                                 self.distance_bins,
                                                 self.angle_cutoffs):
            bin_counts.append(np.array([len(hbond_list)]))
        counts_per_pair.append(np.concatenate(bin_counts, axis=-1))

    # Join the per-pair count vectors along the last axis.
    return np.concatenate(counts_per_pair, axis=-1)
def _featurize(self, datapoint, **kwargs):
    """
    Compute featurization for a molecular complex

    Every unordered pair of fragments in the complex contributes one hashed
    ECFP vector per dictionary returned by `featurize_contacts_ecfp`; all
    vectors are concatenated into a single array.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated keyword ``complex`` is inspected. When present it
        replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    The concatenated fingerprint array, or None when the complex cannot be
    loaded.
    """
    if 'complex' in kwargs:
        # Imported locally so this method does not depend on a module-level
        # import of `warnings`.
        import warnings
        datapoint = kwargs.get("complex")
        # Emit a warning instead of raising: raising made the assignment
        # above dead code and turned a deprecation notice into a hard error.
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    try:
        fragments = load_complex(datapoint, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # We compute pairwise contact fingerprints
    for (frag1, frag2) in itertools.combinations(fragments, 2):
        # Get coordinates
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        # extend() adds one hashed vector per ECFP dictionary.
        pairwise_features.extend(
            vectorize(hash_ecfp, feature_dict=ecfp_dict, size=self.size)
            for ecfp_dict in featurize_contacts_ecfp(frag1,
                                                     frag2,
                                                     distances,
                                                     cutoff=self.cutoff,
                                                     ecfp_degree=self.radius))
    return np.concatenate(pairwise_features)
def _featurize(self, complex):
    """Computes grid featurization of protein/ligand complex.

    Takes as input the pdb filenames of the ligand and of the protein. The
    centroid of the ligand is computed and subtracted from the atomic
    coordinates of both protein and ligand. The centred system, plus
    `self.nb_rotations` rotated copies of it, is then featurized with every
    feature type listed in `self.feature_types`.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    The squeezed array of per-system features, or None when either molecule
    cannot be loaded.
    """
    try:
        mol_pdb_file, protein_pdb_file = complex
        time1 = time.time()

        protein_xyz, protein_rdk = load_molecule(protein_pdb_file,
                                                 calc_charges=True,
                                                 sanitize=self.sanitize)
        time2 = time.time()
        # Pass values as lazy %-args. The previous calls pre-formatted the
        # message and also passed `self.verbose` positionally, which made
        # `logging` fail its own `msg % args` step when the record was
        # emitted.
        logger.info("TIMING: Loading protein coordinates took %0.3f s",
                    time2 - time1)
        time1 = time.time()
        ligand_xyz, ligand_rdk = load_molecule(mol_pdb_file,
                                               calc_charges=True,
                                               sanitize=self.sanitize)
        time2 = time.time()
        logger.info("TIMING: Loading ligand coordinates took %0.3f s",
                    time2 - time1)
    except MoleculeLoadException:
        logger.warning(
            "Some molecules cannot be loaded by Rdkit. Skipping")
        return None

    time1 = time.time()
    centroid = compute_centroid(ligand_xyz)
    ligand_xyz = subtract_centroid(ligand_xyz, centroid)
    protein_xyz = subtract_centroid(protein_xyz, centroid)
    time2 = time.time()
    logger.info("TIMING: Centroid processing took %0.3f s", time2 - time1)

    # Computed once on the unrotated system and reused for every rotated
    # copy below.
    pairwise_distances = compute_pairwise_distances(protein_xyz, ligand_xyz)

    transformed_systems = {}
    transformed_systems[(0, 0)] = [protein_xyz, ligand_xyz]

    for i in range(self.nb_rotations):
        rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
        transformed_systems[(i + 1, 0)] = rotated_system

    features_dict = {}
    # Distinct loop names avoid shadowing the unrotated coordinates above.
    for system_id, (system_protein_xyz,
                    system_ligand_xyz) in transformed_systems.items():
        feature_arrays = []
        # The first element of each entry (a flatness flag) is not used here.
        for _is_flat, function_name in self.feature_types:
            result = self._compute_feature(
                function_name,
                system_protein_xyz,
                protein_rdk,
                system_ligand_xyz,
                ligand_rdk,
                pairwise_distances,
            )
            feature_arrays += result

        if self.flatten:
            features_dict[system_id] = np.concatenate([
                feature_array.flatten() for feature_array in feature_arrays
            ])
        else:
            features_dict[system_id] = np.concatenate(feature_arrays,
                                                      axis=-1)

    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features
def _featurize(self, complex) -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    For every unordered pair of fragments, the parallel and T-shaped
    pi-stacking dictionaries returned by `compute_pi_stack` are voxelized on
    the matching fragment's centred coordinates. Each stacking type yields
    one grid (the sum over both fragments) and the two grids are
    concatenated on the last axis.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    Optional[np.ndarray]
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    try:
        fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    for frag1, frag2 in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        frag1_xyz = subtract_centroid(frag1[0], centroid)
        frag2_xyz = subtract_centroid(frag2[0], centroid)
        xyzs = [frag1_xyz, frag2_xyz]

        # NOTE(review): assumes compute_pi_stack returns the dictionaries of
        # its first molecule argument (frag1) first, as the protein_*/ligand_*
        # unpacking order suggests - confirm against its definition.
        protein_pi_t, protein_pi_parallel, ligand_pi_t, ligand_pi_parallel = (
            compute_pi_stack(frag1[1],
                             frag2[1],
                             distances,
                             dist_cutoff=self.cutoff,
                             angle_cutoff=self.angle_cutoff))

        # Each dictionary is voxelized on the coordinates of the fragment it
        # was computed for: frag1 <-> protein_*, frag2 <-> ligand_*.
        pi_parallel_tensor = sum([
            voxelize(convert_atom_to_voxel,
                     hash_function=None,
                     box_width=self.box_width,
                     voxel_width=self.voxel_width,
                     coordinates=xyz,
                     feature_dict=feature_dict,
                     nb_channel=1) for xyz, feature_dict in zip(
                         xyzs, [protein_pi_parallel, ligand_pi_parallel])
        ])

        # Previously this comprehension ignored its loop variables and
        # voxelized (frag1_xyz, protein_pi_t) twice, so ligand_pi_t never
        # contributed to the features.
        pi_t_tensor = sum([
            voxelize(convert_atom_to_voxel,
                     hash_function=None,
                     box_width=self.box_width,
                     voxel_width=self.voxel_width,
                     coordinates=xyz,
                     feature_dict=feature_dict,
                     nb_channel=1)
            for xyz, feature_dict in zip(xyzs, [protein_pi_t, ligand_pi_t])
        ])

        pairwise_features.append(
            np.concatenate([pi_parallel_tensor, pi_t_tensor], axis=-1))
    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 2) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)