def build_benchmark_check_rdkmols_catalog(mmapdir,
                                          molit=read_labelled_only_smiles,
                                          checks=False,
                                          overwrite=False):
    """Builds a memmapped catalog {molid->rdkbytes} from a (molid, smiles) iterator.

    When `checks` is true, the catalog is also benchmarked (reading it in the
    order given by `molids()`, reading it in set-iteration order, and parsing
    the original smiles) and every molecule is compared against a fresh
    parse of its smiles.

    Parameters:
      - mmapdir: directory passed to MemMappedMols to hold the catalog.
      - molit: zero-argument callable returning an iterator of
               (molid, smiles) pairs; it is called anew for each pass.
      - checks: if False, return right after (possibly) building the catalog.
      - overwrite: if True, rebuild the catalog even if one already exists.

    Returns None. Problems found by the verification pass are reported via
    `warning`, never raised.
    """

    # --- Build the catalog
    info('Building %s catalog...' % mmapdir)
    start = time()
    mmm = MemMappedMols(mmapdir)
    if not overwrite and mmm.has_catalog():
        info('Already computed, skipping.')
    else:
        mmm.save_from_smiles_iterator(molit())
        # Only report a build time when something was actually built
        info('Time taken to build the memmapped file: %.2f seconds' % (time() - start))

    if not checks:
        return

    # --- Load the catalog
    mmms = MemMappedMols(mmapdir)

    # --- Lame benchmark - memmapped, in the order given by the catalog
    info('Benchmarking contiguous memmap reading')
    start = time()
    molcount = 0
    for molid in mmms.molids():
        mmms.mol(molid)
        molcount += 1
    info('Time taken to read the memmapped %d mols (contiguous): %.2f seconds' % (molcount, time() - start))

    # --- Lame benchmark - memmapped, "random" order
    # Iterating a set visits the ids in hash order instead of catalog order;
    # it is used here as a cheap stand-in for random access.
    info('Benchmarking random memmap reading')
    start = time()
    molcount = 0
    for molid in set(mmms.molids()):
        mmms.mol(molid)
        molcount += 1
    info('Time taken to read the memmapped %d mols (random): %.2f seconds' % (molcount, time() - start))

    # --- Lame benchmark - from smiles
    info('Benchmarking reading from the original file')
    start = time()
    molcount = 0
    for _, smiles in molit():
        Chem.MolFromSmiles(smiles)
        molcount += 1
    info('Time taken to read the smiled %d mols: %.2f seconds' % (molcount, time() - start))

    # --- Exhaustive linear test that all mols are correctly stored
    info('Making sure that all is OKish')
    num_problems = 0
    for molid, smiles in molit():
        emol = Chem.MolFromSmiles(smiles)
        stored = mmms.mol(molid)  # fetched once, reused for check and message

        if emol is None:
            # Unparseable smiles must not yield a molecule from the store
            if stored is not None:
                warning('Molecule %s with original smiles %s should not be parsed from the binary store' %
                        (molid, smiles))
                num_problems += 1
            continue

        expected_smiles = Chem.MolToSmiles(emol)
        # A missing stored molecule is reported as a mismatch (shown as None)
        # instead of letting MolToSmiles(None) abort the whole verification.
        stored_smiles = Chem.MolToSmiles(stored) if stored is not None else None
        if expected_smiles != stored_smiles:
            warning('Molecule %s with original smiles %s do not reconstruct properly: \n\t(%s != %s)' %
                    (molid, smiles, expected_smiles, stored_smiles))
            num_problems += 1

    # Do not claim success if any of the checks above complained
    if num_problems:
        warning('Found %d problematic molecules in catalog %s' % (num_problems, mmapdir))
    else:
        info('All is OKish')
def to_rdkit_mol(smiles, molid=None, sanitize=True, to2D=False, to3D=False, toPropertyMol=False):
    """Parses a smiles string with RDKit and optionally postprocesses the molecule.

    Parameters:
      - smiles: the smiles string handed to Chem.MolFromSmiles.
      - molid: optional identifier, only used to make the warning more informative.
      - sanitize: forwarded to Chem.MolFromSmiles.
      - to2D: if True (and to3D is False), call AllChem.Compute2DCoords on the molecule.
      - to3D: if True, call AllChem.EmbedMolecule and then AllChem.UFFOptimizeMolecule;
              takes precedence over to2D.
      - toPropertyMol: if True, return the molecule wrapped in a PropertyMol.

    Returns the molecule, or None (after emitting a warning) if RDKit could
    not parse the smiles.

    NOTE(review): another definition of `to_rdkit_mol` follows this one in the
    visible source and would rebind the name - confirm which one is meant to be used.
    NOTE(review): the value returned by EmbedMolecule is not checked before
    optimizing - confirm how embedding failures should be handled.
    """
    parsed = Chem.MolFromSmiles(smiles, sanitize=sanitize)

    # Parsing failed: tell the user which molecule it was and give up
    if parsed is None:
        message = ('RDKit cannot create a molecule from smiles %s' % smiles
                   if molid is None else
                   'RDKit cannot create molecule %s from smiles %s' % (molid, smiles))
        warning(message)
        return None

    # Coordinates: 3D wins over 2D if both are requested
    if to3D:
        AllChem.EmbedMolecule(parsed)
        AllChem.UFFOptimizeMolecule(parsed)
    elif to2D:
        AllChem.Compute2DCoords(parsed)

    return PropertyMol(parsed) if toPropertyMol else parsed
def to_rdkit_mol(mol_repr, molid=None, instantiator=Chem.MolFromSmiles, to2D=False, to3D=False, toPropertyMol=False):
    """
    Turns a molecular representation (e.g. a smiles string) into an RDKit molecule,
    optionally applying common postprocessing steps to it.

    Parameters:
      - mol_repr: the representation to convert; if it already is a Chem.Mol it is
                  used as is, so any postprocessing is applied to that same object.
      - molid: optional identifier, only used to make the warning more informative.
      - instantiator: callable used to build the molecule from mol_repr
                      (Chem.MolFromSmiles by default).
      - to2D: if True (and to3D is False), call AllChem.Compute2DCoords on the molecule.
      - to3D: if True, call AllChem.EmbedMolecule and then AllChem.UFFOptimizeMolecule;
              takes precedence over to2D.
      - toPropertyMol: if True, return the molecule wrapped in a PropertyMol.

    Returns the molecule, or None (after emitting a warning) if no molecule
    could be obtained.

    NOTE(review): the value returned by EmbedMolecule is not checked before
    optimizing - confirm how embedding failures should be handled.
    """
    # Only instantiate when we were not already given a molecule
    molecule = mol_repr if isinstance(mol_repr, Chem.Mol) else instantiator(mol_repr)

    # Nothing to work with: warn (mentioning the id if we know it) and give up
    if molecule is None:
        message = ('RDKit cannot create a molecule from %r' % mol_repr
                   if molid is None else
                   'RDKit cannot create molecule %s from %r' % (molid, mol_repr))
        warning(message)
        return None

    # Coordinates: 3D wins over 2D if both are requested
    if to3D:
        AllChem.EmbedMolecule(molecule)
        AllChem.UFFOptimizeMolecule(molecule)
    elif to2D:
        AllChem.Compute2DCoords(molecule)

    return PropertyMol(molecule) if toPropertyMol else molecule