def _featurize(self, datapoint, **kwargs):  # -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    Every pair of fragments in the loaded complex is passed to
    `compute_hydrogen_bonds`; each returned list is voxelized once per
    fragment of the pair and the two grids are summed. The resulting grids
    are concatenated on the last axis, first per pair and then across pairs.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated `complex` keyword is inspected. When present it
        replaces `datapoint` and a `DeprecationWarning` is issued.

    Returns
    -------
    np.ndarray or None
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    if 'complex' in kwargs:
        # Warn rather than raise: raising here made the fallback assignment
        # below unreachable in practice, so the deprecated keyword never worked.
        import warnings
        datapoint = kwargs.get("complex")
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning)

    try:
        fragments = rdkit_utils.load_complex(datapoint, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # We compute pairwise contact fingerprints
    # The centroid is taken from the full complex, before any reduction.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(
            fragments, self.cutoff)

    for frag1, frag2 in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        xyzs = [
            subtract_centroid(frag1[0], centroid),
            subtract_centroid(frag2[0], centroid),
        ]
        pairwise_features.append(
            np.concatenate([
                sum([
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=None,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             coordinates=xyz,
                             feature_list=hbond_list,
                             nb_channel=1) for xyz in xyzs
                ]) for hbond_list in compute_hydrogen_bonds(
                    frag1, frag2, distances, self.distance_bins,
                    self.angle_cutoffs)
            ],
                           axis=-1))
    # Features are of shape (voxels_per_edge, voxels_per_edge,
    # voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, mol_pdb: str, protein_pdb: str):
    """
    Compute featurization for a molecular complex

    Each pair of fragments is passed to `featurize_contacts_ecfp`; the
    returned dictionaries are voxelized against the centroid-shifted
    coordinates of the matching fragment and the grids of a pair are summed.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    protein_pdb: str
        Filename for protein molecule

    Returns
    -------
    np.ndarray or None
        None when the complex cannot be loaded. Otherwise the per-pair grids,
        flattened and joined when `self.flatten` is set, or concatenated on
        the last axis when it is not.
    """
    try:
        fragments = load_complex((mol_pdb, protein_pdb), add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    pairwise_features: List[np.ndarray] = []
    for first, second in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(first[0], second[0])
        centered = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]
        contact_dicts = featurize_contacts_ecfp(first,
                                                second,
                                                distances,
                                                cutoff=self.cutoff,
                                                ecfp_degree=self.radius)
        grids = []
        for xyz, ecfp_dict in zip(centered, contact_dicts):
            grids.append(
                voxelize(convert_atom_to_voxel,
                         xyz,
                         self.box_width,
                         self.voxel_width,
                         hash_function=hash_ecfp,
                         feature_dict=ecfp_dict,
                         nb_channel=self.size))
        pairwise_features.append(sum(grids))

    if not self.flatten:
        # Features are of shape (voxels_per_edge, voxels_per_edge,
        # voxels_per_edge, num_feat) so we should concatenate on the last
        # axis.
        return np.concatenate(pairwise_features, axis=-1)
    return np.concatenate([grid.flatten() for grid in pairwise_features])
def _featurize(self, mol_pdb: str, protein_pdb: str) -> np.ndarray:
    """
    Compute featurization for a single mol/protein complex

    For every pair of fragments, each list produced by
    `compute_hydrogen_bonds` is voxelized once per fragment and the two
    grids are summed; the per-list grids are then concatenated on the last
    axis, and finally all pairs are concatenated on the last axis.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    protein_pdb: str
        Filename for protein molecule

    Returns
    -------
    np.ndarray
        The concatenated voxel features. Note that None is returned instead
        when the complex cannot be loaded.
    """
    try:
        fragments = rdkit_utils.load_complex((mol_pdb, protein_pdb),
                                             add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning("This molecule cannot be loaded by Rdkit. Returning None")
        return None

    # We compute pairwise contact fingerprints
    # The centroid is computed before the optional reduction to contacts.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
        fragments = reduce_molecular_complex_to_contacts(fragments, self.cutoff)

    pairwise_features = []
    for first, second in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(first[0], second[0])
        centered = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]
        channel_grids = []
        for hbond_list in compute_hydrogen_bonds(first, second, distances,
                                                 self.distance_bins,
                                                 self.angle_cutoffs):
            per_fragment = [
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=None,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         coordinates=xyz,
                         feature_list=hbond_list,
                         nb_channel=1) for xyz in centered
            ]
            channel_grids.append(sum(per_fragment))
        pairwise_features.append(np.concatenate(channel_grids, axis=-1))

    # Features are of shape (voxels_per_edge, voxels_per_edge,
    # voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, complex: Tuple[str, str]) -> Optional[np.ndarray]:
    """
    Compute featurization for a single mol/protein complex

    For each pair of fragments, the dictionaries returned by
    `compute_binding_pocket_cation_pi` are voxelized against the
    centroid-shifted coordinates of the matching fragment, and the grids of
    a pair are summed. All pairs are concatenated on the last axis.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    Optional[np.ndarray]
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    try:
        fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    pairwise_features = []
    for first, second in itertools.combinations(fragments, 2):
        centered = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]
        cation_pi_dicts = compute_binding_pocket_cation_pi(
            first[1],
            second[1],
            dist_cutoff=self.cutoff,
            angle_cutoff=self.angle_cutoff,
        )
        grids = []
        for xyz, cation_pi_dict in zip(centered, cation_pi_dicts):
            grids.append(
                voxelize(convert_atom_to_voxel,
                         hash_function=None,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         coordinates=xyz,
                         feature_dict=cation_pi_dict,
                         nb_channel=1))
        pairwise_features.append(sum(grids))

    # Features are of shape (voxels_per_edge, voxels_per_edge,
    # voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, datapoint, **kwargs):
    """
    Compute featurization for a molecular complex

    For each pair of fragments, every dictionary returned by
    `featurize_splif` is voxelized against the centroid-shifted coordinates
    of both fragments. The grids are concatenated on the last axis, first
    per pair and then across pairs.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated `complex` keyword is inspected. When present it
        replaces `datapoint` and a `DeprecationWarning` is issued.

    Returns
    -------
    np.ndarray or None
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    if 'complex' in kwargs:
        # Warn rather than raise: raising here made the fallback assignment
        # below unreachable in practice, so the deprecated keyword never worked.
        import warnings
        datapoint = kwargs.get("complex")
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning)

    try:
        fragments = load_complex(datapoint, add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    pairwise_features = []
    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    for frag1, frag2 in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(frag1[0], frag2[0])
        xyzs = [
            subtract_centroid(frag1[0], centroid),
            subtract_centroid(frag2[0], centroid),
        ]
        pairwise_features.append(
            np.concatenate([
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=hash_ecfp_pair,
                         coordinates=xyzs,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         feature_dict=splif_dict,
                         nb_channel=self.size)
                for splif_dict in featurize_splif(
                    frag1, frag2, self.contact_bins, distances, self.radius)
            ],
                           axis=-1))
    # Features are of shape (voxels_per_edge, voxels_per_edge,
    # voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def test_subract_centroid(self):
    """Check that subtracting the centroid keeps the shape and recenters at zero."""
    n_points = 10
    points = np.random.rand(n_points, 3)
    original_centroid = geometry_utils.compute_centroid(points)
    centered = geometry_utils.subtract_centroid(points, original_centroid)
    assert centered.shape == (n_points, 3)

    # The centroid of the shifted coordinates should be the origin.
    recomputed = geometry_utils.compute_centroid(centered)
    assert recomputed.shape == (3,)
    np.testing.assert_almost_equal(recomputed, np.zeros_like(recomputed))
def _featurize(self, mol_pdb: str, complex_pdb: str):
    """
    Compute featurization for a molecular complex

    For each pair of fragments, every dictionary returned by
    `featurize_splif` is voxelized against the centroid-shifted coordinates
    of both fragments; the grids are concatenated on the last axis per pair
    and then across all pairs.

    Parameters
    ----------
    mol_pdb: str
        Filename for ligand molecule
    complex_pdb: str
        Filename for protein molecule

    Returns
    -------
    np.ndarray or None
        The concatenated voxel features, or None when the complex cannot be
        loaded.
    """
    try:
        fragments = load_complex((mol_pdb, complex_pdb), add_hydrogens=False)
    except MoleculeLoadException:
        logger.warning(
            "This molecule cannot be loaded by Rdkit. Returning None")
        return None

    # We compute pairwise contact fingerprints
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    pairwise_features = []
    for first, second in itertools.combinations(fragments, 2):
        distances = compute_pairwise_distances(first[0], second[0])
        centered_pair = [
            subtract_centroid(first[0], centroid),
            subtract_centroid(second[0], centroid),
        ]
        splif_grids = []
        for splif_dict in featurize_splif(first, second, self.contact_bins,
                                          distances, self.radius):
            splif_grids.append(
                voxelize(convert_atom_pair_to_voxel,
                         hash_function=hash_ecfp_pair,
                         coordinates=centered_pair,
                         box_width=self.box_width,
                         voxel_width=self.voxel_width,
                         feature_dict=splif_dict,
                         nb_channel=self.size))
        pairwise_features.append(np.concatenate(splif_grids, axis=-1))

    # Features are of shape (voxels_per_edge, voxels_per_edge,
    # voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
def _featurize(self, complex):
    """Computes grid featurization of protein/ligand complex.

    Loads the protein and the ligand, subtracts the ligand centroid from the
    coordinates of both, and computes the protein/ligand pairwise distances
    once on the centered coordinates. The centered system plus
    `self.nb_rotations` outputs of `rotate_molecules` are then each
    featurized with every entry of `self.feature_types`, and the per-system
    features are stacked and squeezed.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    np.ndarray or None
        The squeezed array of per-system features, or None when either
        molecule cannot be loaded.
    """
    try:
        mol_pdb_file, protein_pdb_file = complex
        time1 = time.time()
        protein_xyz, protein_rdk = load_molecule(protein_pdb_file,
                                                 calc_charges=True,
                                                 sanitize=self.sanitize)
        time2 = time.time()
        # Lazy %-args: the duration is the only format argument. A stray
        # extra argument would make logging fail to format the record.
        logger.info("TIMING: Loading protein coordinates took %0.3f s",
                    time2 - time1)

        time1 = time.time()
        ligand_xyz, ligand_rdk = load_molecule(mol_pdb_file,
                                               calc_charges=True,
                                               sanitize=self.sanitize)
        time2 = time.time()
        logger.info("TIMING: Loading ligand coordinates took %0.3f s",
                    time2 - time1)
    except MoleculeLoadException:
        logger.warning("Some molecules cannot be loaded by Rdkit. Skipping")
        return None

    time1 = time.time()
    # Both molecules are shifted by the ligand centroid.
    centroid = compute_centroid(ligand_xyz)
    ligand_xyz = subtract_centroid(ligand_xyz, centroid)
    protein_xyz = subtract_centroid(protein_xyz, centroid)
    time2 = time.time()
    logger.info("TIMING: Centroid processing took %0.3f s", time2 - time1)

    pairwise_distances = compute_pairwise_distances(protein_xyz, ligand_xyz)

    # Key (0, 0) is the centered, unrotated system; (i + 1, 0) the i-th
    # result of rotate_molecules.
    transformed_systems = {(0, 0): [protein_xyz, ligand_xyz]}
    for i in range(self.nb_rotations):
        transformed_systems[(i + 1, 0)] = rotate_molecules(
            [protein_xyz, ligand_xyz])

    features_dict = {}
    for system_id, (system_protein_xyz,
                    system_ligand_xyz) in transformed_systems.items():
        feature_arrays = []
        for _is_flat, function_name in self.feature_types:
            # The same pairwise distances are passed for every system.
            result = self._compute_feature(
                function_name,
                system_protein_xyz,
                protein_rdk,
                system_ligand_xyz,
                ligand_rdk,
                pairwise_distances,
            )
            feature_arrays.extend(result)

        if self.flatten:
            features_dict[system_id] = np.concatenate(
                [feature_array.flatten() for feature_array in feature_arrays])
        else:
            features_dict[system_id] = np.concatenate(feature_arrays,
                                                      axis=-1)

    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features