def mol_with_smiles_is_in_contnr(self, smiles):
    """Reports whether a given smiles string is already represented in this
    container.

    The input string is turned into a MyMol.MyMol object, and the value of
    its smiles() method is compared against the smiles() value of every
    molecule currently stored in self.mols.

    :param smiles: The smiles string to look for.
    :type smiles: str
    :return: True if a molecule with the same smiles() value is already
       present. Otherwise, the newly created MyMol.MyMol object
       corresponding to that smiles.
    :rtype: bool or MyMol.MyMol
    """

    # Collect the smiles() value of everything already in the container.
    # TODO: Probably shouldn't be generating this on the fly every time
    # you use it!
    known_smiles = {existing.smiles() for existing in self.mols}

    # Build the candidate molecule. It is handed back to the caller when it
    # turns out to be new.
    candidate = MyMol.MyMol(smiles)
    if candidate.smiles() not in known_smiles:
        return candidate

    return True
def update_orig_smi(self, orig_smi):
    """Updates the orig_smi string. Used by desalter (to replace with
    largest fragment).

    Every container-level value that the constructor derives from the
    input molecule is recomputed here from the replacement smiles string,
    and the list of derived molecules (self.mols) is emptied.

    :param orig_smi: The replacement smiles string.
    :type orig_smi: str
    """

    # Update the MolContainer object
    self.orig_smi = orig_smi
    self.orig_smi_deslt = orig_smi
    self.mol_orig_frm_inp_smi = MyMol.MyMol(self.orig_smi, self.name)

    # The constructor tags the parent molecule with the container index.
    # The replacement molecule needs the same tag, otherwise it is left
    # without one.
    self.mol_orig_frm_inp_smi.contnr_idx = self.contnr_idx

    # Reset the cache.
    self.frgs = ""

    self.orig_smi_canonical = self.mol_orig_frm_inp_smi.smiles()
    self.num_nonaro_rngs = len(
        self.mol_orig_frm_inp_smi.get_idxs_of_nonaro_rng_atms()
    )
    self.num_specif_chiral_cntrs = len(
        self.mol_orig_frm_inp_smi.chiral_cntrs_only_asignd()
    )
    self.num_unspecif_chiral_cntrs = len(
        self.mol_orig_frm_inp_smi.chiral_cntrs_w_unasignd()
    )

    # The carbon-hydrogen footprint is also derived from the parent
    # molecule in the constructor, so recompute it here. Otherwise it would
    # still describe the molecule that was just replaced.
    self.carbon_hydrogen_count = self.mol_orig_frm_inp_smi.count_hyd_bnd_to_carb()

    # None of the mols derived to date, if present, are accurate.
    self.mols = []
def parallel_add_H(contnr, protonation_settings):
    """Creates alternate ionization variants for a given molecule container.
    This is the function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param protonation_settings: Protonation settings to pass to
       Dimorphite-DL. Note that its "smiles" entry is overwritten in place
       with the canonical SMILES of the container.
    :type protonation_settings: dict
    :return: The protonated variants as MyMol.MyMol objects. Variants that
       RDKit could not parse, or that remove_bizarre_substruc() did not
       report as False, are left out.
    :rtype: list
    """

    # Make sure the canonical SMILES is actually a string.
    canonical_smi = contnr.orig_smi_canonical
    if not isinstance(canonical_smi, str):
        # The value is by definition not a string in this branch, so it
        # must be converted before being concatenated. Otherwise the
        # diagnostic itself fails with a TypeError and nothing is reported.
        Utils.log("container.orig_smi_canonical: " + str(canonical_smi))
        Utils.log(
            "type container.orig_smi_canonical: " + str(type(canonical_smi))
        )
        Utils.exception("container.orig_smi_canonical: " + str(canonical_smi))

    # Add the SMILES string to the protonation parameters.
    protonation_settings["smiles"] = canonical_smi

    # Protonate the SMILES string. This is Dimorphite-DL.
    smis = Protonate(protonation_settings)

    # Convert the protonated SMILES strings into a list of rdkit molecule
    # objects.
    rdkit_mols = [Chem.MolFromSmiles(smi.strip()) for smi in smis]

    # Convert from rdkit mols to MyMol.MyMol. Entries that are None are
    # dropped here.
    addH_mols = [MyMol.MyMol(mol) for mol in rdkit_mols if mol is not None]

    # Remove MyMols with odd substructures.
    addH_mols = [
        mol for mol in addH_mols if mol.remove_bizarre_substruc() is False
    ]

    # Recreate the list, moving the new MyMol.MyMol objects into the
    # return_values list once they carry the container properties and the
    # name/genealogy of the parent molecule.
    return_values = []
    orig_mol = contnr.mol_orig_frm_inp_smi

    for Hm in addH_mols:
        Hm.inherit_contnr_props(contnr)
        # Copy the list so that the parent genealogy is not modified.
        Hm.genealogy = orig_mol.genealogy[:]
        Hm.name = orig_mol.name

        # Only record a genealogy step if protonation changed the molecule.
        if Hm.smiles() != orig_mol.smiles():
            Hm.genealogy.append(Hm.smiles(True) + " (protonated)")

        return_values.append(Hm)

    return return_values
def __init__(self, smiles, name, index, properties):
    """The constructor.

    Stores the input data on the container, builds the parent MyMol.MyMol
    object from the input SMILES string, and caches a few values derived
    from that parent molecule.

    :param smiles: The SMILES string of the input molecule.
    :type smiles: str
    :param name: The name of the molecule.
    :type name: str
    :param index: The index of this MolContainer in the main MolContainer
       list.
    :type index: int
    :param properties: A dictionary of properties from the sdf.
    :type properties: dict
    """

    # Values that live on the container level (not the MyMol level).
    self.name = name
    self.properties = properties
    self.mols = []
    self.frgs = ""  # For caching.

    # The index is kept twice. Under some circumstances (mpi) contnr_idx
    # might be reset, but it is good to have the original for filename
    # output.
    self.contnr_idx = index
    self.contnr_idx_orig = index

    # Initially assume that the desalted SMILES is the same as the input.
    self.orig_smi = smiles
    self.orig_smi_deslt = smiles

    # Build the parent molecule and tag it with the container index.
    parent_mol = MyMol.MyMol(smiles, name)
    parent_mol.contnr_idx = index
    self.mol_orig_frm_inp_smi = parent_mol

    # Save the original canonical smiles
    self.orig_smi_canonical = parent_mol.smiles()

    # Number of nonaromatic rings.
    nonaro_rng_atm_idxs = parent_mol.get_idxs_of_nonaro_rng_atms()
    self.num_nonaro_rngs = len(nonaro_rng_atm_idxs)

    # Number of chiral centers, assigned.
    assigned_cntrs = parent_mol.chiral_cntrs_only_asignd()
    self.num_specif_chiral_cntrs = len(assigned_cntrs)

    # Number of chiral centers, unassigned.
    unassigned_cntrs = parent_mol.chiral_cntrs_w_unasignd()
    self.num_unspecif_chiral_cntrs = len(unassigned_cntrs)

    # Get the non-acidic carbon-hydrogen footprint.
    self.carbon_hydrogen_count = parent_mol.count_hyd_bnd_to_carb()
def desalter(contnr):
    """Desalts molecules in a molecule container.

    If the original SMILES of the container consists of a single fragment,
    the existing parent molecule is returned unchanged. Otherwise a new
    molecule is built from the fragment with the most heavy atoms. If
    several fragments share that maximum, the last one of them is used.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :return: A molecule object.
    :rtype: MyMol.MyMol
    :raises ValueError: If the container reports no fragments at all.
    """

    # Split it into fragments
    frags = contnr.get_frags_of_orig_smi()

    if len(frags) == 1:
        # It's only got one fragment, so default assumption that
        # orig_smi = orig_smi_deslt is correct.
        return contnr.mol_orig_frm_inp_smi

    Utils.log(
        "\tMultiple fragments found in "
        + contnr.orig_smi
        + " ("
        + contnr.name
        + ")"
    )

    # Find the biggest fragment. Scanning from the end keeps the historic
    # tie-breaking rule (the last fragment with the maximum count wins).
    sized_frags = [(frag.GetNumHeavyAtoms(), frag) for frag in frags]
    max_num = max(num for num, _ in sized_frags)
    biggest_frag = next(
        frag for num, frag in reversed(sized_frags) if num == max_num
    )

    # Return info about that biggest fragment.
    new_mol = MyMol.MyMol(biggest_frag)
    new_mol.contnr_idx = contnr.contnr_idx
    new_mol.name = contnr.name

    # Copy the genealogy instead of sharing the list object. Otherwise a
    # later append on the desalted molecule would also change the genealogy
    # of the parent molecule.
    new_mol.genealogy = contnr.mol_orig_frm_inp_smi.genealogy[:]

    new_mol.make_mol_frm_smiles_sanitze()  # Need to update the mol.
    return new_mol
def parallel_get_chiral(mol, max_variants_per_compound, thoroughness):
    """A parallelizable function for enumerating chiralities.

    :param mol: The input molecule.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial
       explosion, only this number of variants (molecules) will be advanced
       to the next step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
       (molecule) retained, for evaluation. For example, perhaps you want
       to advance five molecules (max_variants_per_compound = 5). You could
       just generate five and advance them all. Or you could generate ten
       and advance the best five (so thoroughness = 2). Using
       thoroughness > 1 increases the computational expense, but it also
       increases the chances of finding good molecules.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects. If the molecule has no
       unassigned chiral centers, the list holds only the input molecule.
    :rtype: list
    """

    # Atom indexes of the chiral centers that are marked "?", i.e. not
    # assigned explicitly in the input molecule.
    open_centers = [
        center[0]
        for center in mol.chiral_cntrs_w_unasignd()
        if center[1] == "?"
    ]
    num_open = len(open_centers)

    # Nothing to enumerate, so just keep the existing molecule.
    if num_open == 0:
        return [mol]

    variant_cap = thoroughness * max_variants_per_compound

    if num_open == 1:
        # There's only one chiral center.
        options = ["R", "S"]
    else:
        # There are multiple chiral centers. Enumerating all of them lends
        # itself to a combinatorial explosion, so stop adding centers as
        # soon as the number of combinations exceeds the cap. The centers
        # beyond that point will unfortunately never be varied.
        num_varied = 1
        while num_varied < num_open and 2 ** num_varied <= variant_cap:
            num_varied += 1

        options = [
            list(labels)
            for labels in itertools.product(["R", "S"], repeat=num_varied)
        ]

    # Let the user know the number of chiral centers.
    Utils.log(
        "\t"
        + mol.smiles(True)
        + " ("
        + mol.name
        + ") has "
        + str(2 ** num_open)
        + " enantiomers when chiral centers with "
        + "no specified chirality are systematically varied."
    )

    # Randomly select a few of the chiral combinations to examine. This is
    # to reduce the potential combinatorial explosion.
    options = Utils.random_sample(options, variant_cap, "")

    # Chiral tag that is written onto the atom for each label.
    tag_for_label = {
        "R": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        "S": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    # Go through the chirality combinations and make a molecule with that
    # chirality.
    results = []
    for option in options:
        # Work on a copy of the initial rdkit molecule.
        variant_rd_mol = copy.copy(mol.rdkit_mol)

        # Set its chirality. zip() stops at the shorter sequence, so only
        # the centers covered by this option are touched.
        for atom_idx, label in zip(open_centers, option):
            if label in tag_for_label:
                variant_rd_mol.GetAtomWithIdx(atom_idx).SetChiralTag(
                    tag_for_label[label]
                )

        # Make a new MyMol.MyMol object from that rdkit molecule.
        new_mol = MyMol.MyMol(variant_rd_mol)

        # Skip it if it has a bizarre substructure.
        if new_mol.remove_bizarre_substruc():
            continue

        new_mol.contnr_idx = mol.contnr_idx
        new_mol.name = mol.name
        new_mol.genealogy = mol.genealogy[:]
        new_mol.genealogy.append(new_mol.smiles(True) + " (chirality)")
        results.append(new_mol)

    # Return the results.
    return results
def parallel_get_double_bonded(mol, max_variants_per_compound, thoroughness):
    """A parallelizable function for enumerating double bonds.

    Note that mol.rdkit_mol is replaced by a version with explicit
    hydrogens as a side effect.

    :param mol: The molecule with a potentially unspecified double bond.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial
       explosion, only this number of variants (molecules) will be advanced
       to the next step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
       (molecule) retained, for evaluation. For example, perhaps you want
       to advance five molecules (max_variants_per_compound = 5). You could
       just generate five and advance them all. Or you could generate ten
       and advance the best five (so thoroughness = 2). Using
       thoroughness > 1 increases the computational expense, but it also
       increases the chances of finding good molecules.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects, one per retained cis/trans
       variant. If no double bond could be enumerated, the list holds only
       the input molecule.
    :rtype: list
    """

    # For this to work, you need to have explicit hydrogens in place.
    mol.rdkit_mol = Chem.AddHs(mol.rdkit_mol)

    # Get all double bonds that don't have defined stereochemistry. Note
    # that these are the bond indexes, not the atom indexes.
    unasignd_dbl_bnd_idxs = mol.get_double_bonds_without_stereochemistry()

    if len(unasignd_dbl_bnd_idxs) == 0:
        # There are no unassigned double bonds, so move on.
        return [mol]

    # Throw out any bond that is in a small ring (3 to 7 members).
    small_ring_sizes = (3, 4, 5, 6, 7)
    unasignd_dbl_bnd_idxs = [
        i
        for i in unasignd_dbl_bnd_idxs
        if not any(
            mol.rdkit_mol.GetBondWithIdx(i).IsInRingSize(size)
            for size in small_ring_sizes
        )
    ]

    # Previously, I fully enumerated all double bonds. When there are many
    # such bonds, that leads to a combinatorial explosion that causes
    # problems in terms of speed and memory. Now, enumerate only enough
    # bonds to make sure you generate at least thoroughness *
    # max_variants_per_compound.
    unasignd_dbl_bnd_idxs_orig_count = len(unasignd_dbl_bnd_idxs)
    num_bonds_to_keep = int(
        math.ceil(math.log(thoroughness * max_variants_per_compound, 2))
    )
    random.shuffle(unasignd_dbl_bnd_idxs)
    unasignd_dbl_bnd_idxs = sorted(unasignd_dbl_bnd_idxs[:num_bonds_to_keep])

    # Get a list of all the single bonds that come off each double-bond
    # atom.
    all_sngl_bnd_idxs = set()
    dbl_bnd_count = 0
    for dbl_bnd_idx in unasignd_dbl_bnd_idxs:
        bond = mol.rdkit_mol.GetBondWithIdx(dbl_bnd_idx)

        atom1_bonds = bond.GetBeginAtom().GetBonds()
        if len(atom1_bonds) == 1:
            # The only bond is the one you already know about. So don't
            # save.
            continue

        atom2_bonds = bond.GetEndAtom().GetBonds()
        if len(atom2_bonds) == 1:
            # The only bond is the one you already know about. So don't
            # save.
            continue

        dbl_bnd_count = dbl_bnd_count + 1

        # Suffice it to say, RDKit does not deal with cis-trans
        # isomerization in an intuitive way...
        idxs_of_other_bnds_frm_atm1 = [b.GetIdx() for b in atom1_bonds]
        idxs_of_other_bnds_frm_atm1.remove(dbl_bnd_idx)

        idxs_of_other_bnds_frm_atm2 = [b.GetIdx() for b in atom2_bonds]
        idxs_of_other_bnds_frm_atm2.remove(dbl_bnd_idx)

        all_sngl_bnd_idxs |= set(idxs_of_other_bnds_frm_atm1)
        all_sngl_bnd_idxs |= set(idxs_of_other_bnds_frm_atm2)

    # Now come up with all possible up/down combinations for those bonds.
    # The combinations are consumed one at a time below, so there is no
    # need to hold all of them in memory at once.
    all_sngl_bnd_idxs = list(all_sngl_bnd_idxs)
    all_atom_config_options = itertools.product(
        [True, False], repeat=len(all_sngl_bnd_idxs)
    )

    # Let the user know.
    if dbl_bnd_count > 0:
        Utils.log(
            "\t"
            + mol.smiles(True)
            + " has "
            + str(
                # Not exactly right, I think, because should be
                # dbl_bnd_count, but ok.
                unasignd_dbl_bnd_idxs_orig_count
            )
            + " double bond(s) with unspecified stereochemistry."
        )

    # Go through and consider each of the retained combinations.
    smiles_to_consider = set()
    for atom_config_options in all_atom_config_options:
        # Make a copy of the original RDKit molecule.
        a_rd_mol = copy.copy(mol.rdkit_mol)

        for bond_idx, direc in zip(all_sngl_bnd_idxs, atom_config_options):
            # Always done with reference to the atom in the double bond.
            if direc:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(
                    Chem.BondDir.ENDUPRIGHT
                )
            else:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(
                    Chem.BondDir.ENDDOWNRIGHT
                )

        # Assign the StereoChemistry. Required to actually set it.
        a_rd_mol.ClearComputedProps()
        Chem.AssignStereochemistry(a_rd_mol, force=True)

        # Add to list of ones to consider
        try:
            smiles_to_consider.add(
                Chem.MolToSmiles(a_rd_mol, isomericSmiles=True, canonical=True)
            )
        except Exception:
            # Some molecules still give troubles. Unfortunate, but these
            # are rare cases. Let's just skip these. Example:
            # CN1C2=C(C=CC=C2)C(C)(C)[C]1=[C]=[CH]C3=CC(=C(O)C(=C3)I)I
            #
            # Catching Exception rather than everything means that
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            continue

    # Remove ones that don't have "/" or "\". These are not real enumerated
    # ones.
    smiles_to_consider = [
        s for s in smiles_to_consider if "/" in s or "\\" in s
    ]

    if len(smiles_to_consider) == 0:
        # There are no appropriate double bonds. Move on...
        return [mol]

    # Get the maximum number of / + \ in any string.
    cnts = [s.count("/") + s.count("\\") for s in smiles_to_consider]
    max_cnts = max(cnts)

    # Only keep those with that same max count. The others have double
    # bonds that remain unspecified.
    smiles_to_consider = [
        s for s, cnt in zip(smiles_to_consider, cnts) if cnt == max_cnts
    ]

    results = []
    for smile_to_consider in smiles_to_consider:
        # Make a new MyMol.MyMol object with the specified smiles.
        new_mol = MyMol.MyMol(smile_to_consider)

        # Sometimes you get an error if there's a bad structure otherwise.
        if new_mol.can_smi in (False, None):
            continue

        # Add the new molecule to the list of results, if it does not have
        # a bizarre substructure.
        if not new_mol.remove_bizarre_substruc():
            new_mol.contnr_idx = mol.contnr_idx
            new_mol.name = mol.name
            new_mol.genealogy = mol.genealogy[:]
            new_mol.genealogy.append(
                new_mol.smiles(True) + " (cis-trans isomerization)"
            )
            results.append(new_mol)

    # Return the results.
    return results
def parallel_make_taut(contnr, mol_index, max_variants_per_compound):
    """Makes alternate tautomers for a given molecule container. This is
    the function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param mol_index: The molecule index.
    :type mol_index: int
    :param max_variants_per_compound: To control the combinatorial
       explosion, only this number of variants (molecules) will be advanced
       to the next step.
    :type max_variants_per_compound: int
    :return: A list of MyMol.MyMol objects, containing the alternate
       tautomeric forms. None is returned instead if no usable RDKit
       molecule could be prepared for the enumeration.
    :rtype: list or None
    """

    # Get the MyMol.MyMol within the molecule container corresponding to
    # the given molecule index.
    mol = contnr.mols[mol_index]

    # Create a temporary RDKit mol object, since that's what MolVS works
    # with.
    # TODO: There should be a copy function
    m = MyMol.MyMol(mol.smiles()).rdkit_mol

    failure_msg = (
        "\tCould not generate tautomers for "
        + contnr.orig_smi
        + ". I'm deleting it."
    )

    # Make sure it's not None. This has to happen before the molecule is
    # handed to RDKit, because Chem.RemoveHs() cannot work on None.
    if m is None:
        Utils.log(failure_msg)
        return None

    # For tautomers to work, you need to not have any explicit hydrogens.
    m = Chem.RemoveHs(m)

    # Keep the original check on the result as well, to be safe.
    if m is None:
        Utils.log(failure_msg)
        return None

    # Molecules should be kekulized already, but let's double check that.
    # Because MolVS requires kekulized input.
    Chem.Kekulize(m)
    m = MOH.check_sanitization(m)
    if m is None:
        return None

    # Limit to max_variants_per_compound tauts. Note that another batch
    # could add more, so you'll need to once again trim to this number
    # later. But this could at least help prevent the combinatorial
    # explosion at this stage.
    enum = tautomer.TautomerEnumerator(max_tautomers=max_variants_per_compound)
    tauts_rdkit_mols = enum.enumerate(m)

    # Make all those tautomers into MyMol objects. The loop variable has
    # its own name so that it cannot clobber m.
    tauts_mols = [MyMol.MyMol(taut_rd_mol) for taut_rd_mol in tauts_rdkit_mols]

    # Keep only those that have reasonable substructures.
    tauts_mols = [
        t for t in tauts_mols if t.remove_bizarre_substruc() == False
    ]

    # If there's more than one, let the user know that.
    if len(tauts_mols) > 1:
        Utils.log("\t" + mol.smiles(True) + " has tautomers.")

    # Now collect the final results.
    results = []
    for tm in tauts_mols:
        tm.inherit_contnr_props(contnr)
        # Copy the list so that the genealogy of the source molecule is
        # not modified.
        tm.genealogy = mol.genealogy[:]
        tm.name = mol.name

        # Only record a genealogy step if the tautomer differs from the
        # source molecule.
        if tm.smiles() != mol.smiles():
            tm.genealogy.append(tm.smiles(True) + " (tautomer)")

        results.append(tm)

    return results