def voxel_generator(self, batch_size=32, center=None, boxsize=24, resolution=1., n_jobs=1): """ Batch voxel generator.abs Parameters ---------- batch_size: int The size to yield each batch. center: Either a list of centers or a np.array of shape (3, ) contanining a single one. By default it chooses its molecule's geometrical center. boxsize: int Resulting size of voxelized array. resolution: float Resolution in Amstrong of the resulting array. n_jobs: int Number of threads to use during voxelization. """ # Cache the box centers if (boxsize, resolution) not in SmallMol.array_cache: bbm = (np.zeros(3) - float(boxsize * resolution / 2)) SmallMol.array_cache[(boxsize, resolution)] = \ _getGridCenters(bbm, [boxsize]*3, 1.).reshape(boxsize ** 3, 3) num_batches = math.ceil(self.__len__() / batch_size) # Setup voxelization def get_vox(mol, xcenter=None): if mol is None: return None return SmallMol.get_voxels(mol, center=xcenter, size=boxsize, resolution=resolution) SmallMolStack.vox_fun = get_vox # Generate batches of data: if n_jobs == 1: for batch in range(num_batches): idx_mols = enumerate(self._mols[batch * batch_size: (batch + 1) * batch_size]) yield [get_vox(mol, center[i+(batch_size*batch)] if isinstance(center, list) else center) for i, mol in idx_mols] elif n_jobs > 1 and n_jobs <= multiprocessing.cpu_count(): pool = multiprocessing.Pool(n_jobs) for batch in range(num_batches): idx_mols = enumerate(self._mols[batch * batch_size: (batch + 1) * batch_size]) yield pool.map(unwrap_self, [[mol, center[i+(batch_size*batch)] if isinstance(center, list) else center] for i, mol in idx_mols]) pool.close() else: raise ValueError("n_jobs needs to be a positive integer!")
def get_prop(mol, left_most_point): """ Returns atom occupancies """ n = [24, 24, 24] # Voxel size # Get the channels channels = vd._getAtomtypePropertiesPDBQT(mol) sigmas = vd._getRadii(mol) channels = sigmas[:, np.newaxis] * channels.astype(float) # Choose the grid centers centers = vd._getGridCenters(llc=left_most_point, N=n, resolution=1) centers = centers.reshape(np.prod(n), 3) # Extract the features and return features = vd._getOccupancyC(mol.coords[:, :, mol.frame], centers, channels) return features.reshape(*n, -1)
class SmallMol: """ SmallMol class using RDkit for featurization. """ array_cache = {(16, 1.): _getGridCenters(np.array([-8] * 3), [16, 16, 16], 1.).reshape(16**3, 3), (24, 1.): _getGridCenters(np.array([-12] * 3), [24, 24, 24], 1.).reshape(24**3, 3)} fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef') factory = ChemicalFeatures.BuildFeatureFactory(fdefName) def __init__(self, mol, ignore_errors=False, addHs=True): """ Initializes small molecule object Parameters ---------- mol: rdkit Molecule object or string (i) Rdkit molecule or (ii) Location of molecule file (".pdb"/".mol2") or (iii) a smile string. ignore_errors: bool If True errors will not be raised. """ # Determine how to load molecule # Process as Rdkit molecule if isinstance(mol, Chem.Mol): self._mol = mol elif mol is None and not ignore_errors: self._mol = mol # Process as string elif isinstance(mol, str): name_sufix = os.path.splitext(mol)[-1] if name_sufix == ".mol2": self._mol = Chem.MolFromMol2File(mol) elif name_sufix == ".pdb": self._mol = Chem.MolFromPDBFile(mol) # We assume any string is a valid smile # TODO: validate the strings else: self._mol = Chem.MolFromSmiles(mol) # Don't feed garbage! else: raise ValueError("Unkown file type: '{}'.".format(type(mol))) # Add hydrogens if addHs: self._mol = Chem.RemoveHs(self._mol) self._mol = Chem.AddHs(self._mol, addCoords=True) def get_coords(self): """ Returns molecule coordinates. """ n_atoms = self._mol.GetNumAtoms() conformer = self._mol.GetConformer() coords = [[corobj.x, corobj.y, corobj.z] for corobj in [conformer.GetAtomPosition(i) for i in range(n_atoms)]] return np.array(coords, dtype=np.float32) def get_elements(self): """ Returns molecule elements. """ return np.array([atom.GetSymbol() for atom in self._mol.GetAtoms()]) def _get_atom_types(self): """ Returns ndarray of shape (n_atoms x n_properties) molecule atom types, according to the following definitions and order: 0. Hydrophibic 1. Aromatic 2. Acceptor 3. Donor 4. - Ionizable 5. + Ionizable 6. Metal (empty) 7. Occupancy (No hydrogens) """ n_atoms = self._mol.GetNumAtoms() feats = SmallMol.factory.GetFeaturesForMol(self._mol) properties = np.zeros((n_atoms, 8), dtype=bool) for feat in feats: fam = feat.GetFamily() if fam not in atom_mapping: # Non relevant property continue properties[feat.GetAtomIds(), atom_mapping[fam]] = 1 # Occupancy, ignoring hydrogens. properties[:, 7] = self.get_elements() != 'H' return properties def _get_channel_radii(self): """ Multiplies atom types by each atom vdW radius. """ from htmd.molecule.vdw import radiidict radii = np.vectorize(radiidict.__getitem__)(self.get_elements()) * self._get_atom_types().T return radii.T.copy() def get_center(self, coords=None): """ Returns geometrical center of molecule. """ if coords is None: coords = self.get_coords() return coords.mean(axis=0).astype(np.float32) def generate_conformers(self, savefolder, savename="molecule_conformers", filetype="pdb", savefolder_exist_ok=False, num_confs=400): """ Generates ligand conformer and saves the results to a folder. Parameters ---------- savefolder: str Path to directory where the results will be saved savename: str Name of the generated files. example filename: <savename>_1.pdb filetype: str must be 'pdb' or 'mol2' savefolder_exist_ok: bool if false returns an error if savefolder already exsits Nconformers: int Number of conforer to generate. """ from rdkit.Chem import AllChem os.makedirs(savefolder, exist_ok=savefolder_exist_ok) mol = deepcopy(self._mol) mol = Chem.AddHs(mol) ids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, pruneRmsThresh=1., maxAttempts=10000) for id in ids: AllChem.UFFOptimizeMolecule(mol, confId=id) for index, id in enumerate(ids): if filetype == "pdb": chemwrite = Chem.PDBWriter elif filetype == "sdf": chemwrite = Chem.SDWriter else: raise ValueError("Unknown file format. Cannot save to format '{}'".format(filetype)) writer = chemwrite(os.path.join(savefolder, '{}_{}.{}'.format(savename, index + 1, filetype))) writer.write(mol, confId=id) def get_voxels(self, center=None, size=24, resolution=1., rotation=None, displacement=None, dtype=np.float32): """ Computes molecule voxelization. Parameters ---------- center: array-like Geometrical coordinates where descriptors will be computed. size: int Size of resulting descriptor array. resolution: float Grid resolution of resulting array. rotation : array-like of shape (3,) Prior to voxelization rotates the molecule around its center give the rotation angles in radians. displacement: array-like of shape (3,) Prior to voxelization displaces the molecule by provided (X, Y, Z) distance before returning the voxelized representation. dtype : numpy datatype returns array of the specified type. Returns ------- voxels: array-like Computed descriptors. """ coords = self.get_coords() lig_center = self.get_center(coords=coords) if center is None: center = lig_center if rotation is not None: rotation = list(rotation) matx = get_rotationMatrix([1, 0, 0], rotation[0]) maty = get_rotationMatrix([0, 1, 0], rotation[1]) matz = get_rotationMatrix([0, 0, 1], rotation[2]) coords = rotate(coords, matx, center=lig_center) coords = rotate(coords, maty, center=lig_center) coords = rotate(coords, matz, center=lig_center) if displacement is not None: coords += np.asarray(displacement) multisigmas = self._get_channel_radii() if (size, resolution) not in SmallMol.array_cache: N = [size, size, size] bbm = (np.zeros(3) - float(size * resolution / 2)) centers = _getGridCenters(bbm, N, resolution) # Cache the array SmallMol.array_cache[(size, resolution)] = centers.reshape(size**3, 3) centers2D = centers + center else: centers2D = SmallMol.array_cache[(size, resolution)] + center voxels = _getOccupancyC(coords.astype(np.float32), centers2D, multisigmas).reshape(size, size, size, 8).astype(dtype) return voxels def get_name(self): return self._mol.GetProp('_Name') def get_natoms(self): return self._mol.GetNumAtoms() def to_molecule(self): from htmd.molecule.molecule import Molecule coords = self.get_coords() elements = self.get_elements() mol = Molecule() mol.empty(self.get_natoms()) mol.resname[:] = self.get_name()[:3] mol.resid[:] = 1 mol.name[:] = elements mol.element[:] = elements mol.charge[:] = self.get_charges() mol.coords[:, :, 0] = coords mol.viewname = self.get_name() mol.bonds, mol.bondtype = self.get_bonds() return mol def get_bonds(self): from rdkit.Chem import rdchem bonds = [] bondtypes = [] for bo in self._mol.GetBonds(): bonds.append([bo.GetBeginAtomIdx(), bo.GetEndAtomIdx()]) if bo.GetBondType() == rdchem.BondType.SINGLE: bondtypes.append('1') elif bo.GetBondType() == rdchem.BondType.DOUBLE: bondtypes.append('2') elif bo.GetBondType() == rdchem.BondType.TRIPLE: bondtypes.append('3') elif bo.GetBondType() == rdchem.BondType.AROMATIC: bondtypes.append('ar') return np.vstack(bonds), np.array(bondtypes) def get_charges(self): charges = [] for a in self._mol.GetAtoms(): charges.append(a.GetFormalCharge()) return np.array(charges)
def get_voxels(self, center=None, size=24, resolution=1., rotation=None, displacement=None, dtype=np.float32): """ Computes molecule voxelization. Parameters ---------- center: array-like Geometrical coordinates where descriptors will be computed. size: int Size of resulting descriptor array. resolution: float Grid resolution of resulting array. rotation : array-like of shape (3,) Prior to voxelization rotates the molecule around its center give the rotation angles in radians. displacement: array-like of shape (3,) Prior to voxelization displaces the molecule by provided (X, Y, Z) distance before returning the voxelized representation. dtype : numpy datatype returns array of the specified type. Returns ------- voxels: array-like Computed descriptors. """ coords = self.get_coords() lig_center = self.get_center(coords=coords) if center is None: center = lig_center if rotation is not None: rotation = list(rotation) matx = get_rotationMatrix([1, 0, 0], rotation[0]) maty = get_rotationMatrix([0, 1, 0], rotation[1]) matz = get_rotationMatrix([0, 0, 1], rotation[2]) coords = rotate(coords, matx, center=lig_center) coords = rotate(coords, maty, center=lig_center) coords = rotate(coords, matz, center=lig_center) if displacement is not None: coords += np.asarray(displacement) multisigmas = self._get_channel_radii() if (size, resolution) not in SmallMol.array_cache: N = [size, size, size] bbm = (np.zeros(3) - float(size * resolution / 2)) centers = _getGridCenters(bbm, N, resolution) # Cache the array SmallMol.array_cache[(size, resolution)] = centers.reshape(size**3, 3) centers2D = centers + center else: centers2D = SmallMol.array_cache[(size, resolution)] + center voxels = _getOccupancyC(coords.astype(np.float32), centers2D, multisigmas).reshape(size, size, size, 8).astype(dtype) return voxels
import os import math import numpy as np from htmd.molecule.voxeldescriptors import _getGridCenters from rdkit.Chem import ChemicalFeatures from rdkit import RDConfig _highlight_colors = [ (1.00, 0.50, 0.00), (0.00, 0.50, 1.00), (0.00, 1.00, 0.50), (1.00, 0.00, 0.50), (0.50, 0.00, 1.00), (0.50, 1.00, 0.00), (1.00, 0.00, 0.25), (0.00, 0.25, 1.00), (0.25, 1.00, 0.00) ] array_cache = { (16, 1.): _getGridCenters(np.array([-8] * 3), [16, 16, 16], 1.).reshape(16**3, 3), (24, 1.): _getGridCenters(np.array([-12] * 3), [24, 24, 24], 1.).reshape(24**3, 3) } fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef') factory = ChemicalFeatures.BuildFeatureFactory(fdefName) def calculateAngle(atomcentercoords, atom1coords, atom2coords, deg=False): r23 = np.zeros(3) r21 = np.zeros(3) norm23 = 0 norm21 = 0 dotprod = 0
# All Rights Reserved # Distributed under HTMD Software License Agreement # No redistribution in whole or part # import os import math import numpy as np from htmd.molecule.voxeldescriptors import _getGridCenters from rdkit.Chem import ChemicalFeatures from rdkit import RDConfig _highlight_colors = [(1.00, 0.50, 0.00), (0.00, 0.50, 1.00), (0.00, 1.00, 0.50), (1.00, 0.00, 0.50), (0.50, 0.00, 1.00), (0.50, 1.00, 0.00), (1.00, 0.00, 0.25), (0.00, 0.25, 1.00), (0.25, 1.00, 0.00)] array_cache = {(16, 1.): _getGridCenters(np.array([-8] * 3), [16, 16, 16], 1.).reshape(16**3, 3), (24, 1.): _getGridCenters(np.array([-12] * 3), [24, 24, 24], 1.).reshape(24**3, 3) } fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef') factory = ChemicalFeatures.BuildFeatureFactory(fdefName) def calculateAngle(atomcentercoords, atom1coords, atom2coords, deg=False): r23 = np.zeros(3) r21 = np.zeros(3) norm23 = 0 norm21 = 0 dotprod = 0
"#", "=", "-", "(", ")" # Misc ] vocab_i2c_v1 = {i: x for i, x in enumerate(vocab_list)} vocab_c2i_v1 = {vocab_i2c_v1[i]: i for i in vocab_i2c_v1} resolution = 1. size = 24 N = [size, size, size] bbm = (np.zeros(3) - float(size * 1. / 2)) global_centers = _getGridCenters(bbm, N, resolution) def string_gen_V1(in_string): out = in_string.replace("Cl", "X").replace("[nH]", "Y").replace("Br", "Z") return out def tokenize_v1(in_string, return_torch=True): caption = [] caption.append(0) caption.extend([vocab_c2i_v1[x] for x in in_string]) caption.append(1) if return_torch: return torch.Tensor(caption) return caption