def durrant_lab_filters(contnrs, num_procs, job_manager, parallelizer_obj):
    """Removes any molecules that contain prohibited substructures, per the
    durrant-lab filters.

    Nothing is returned; the containers are updated through
    ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run the
        filter serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Applying Durrant-lab filters to all molecules...")

    # Get the substructures you won't permit.
    prohibited_smi_substrs = [
        "C=[N-]",
        "[N-]C=[N+]",
        "[nH+]c[n-]",
        "[#7+]~[#7+]",
        "[#7-]~[#7-]",
        "[!#7]~[#7+]~[#7-]~[!#7]",  # Doesn't hit azide.
    ]
    prohibited_substructs = [
        Chem.MolFromSmarts(s) for s in prohibited_smi_substrs
    ]

    # Get the parameters to pass to the parallelizer object. Each entry is
    # [container, prohibited substructures].
    params = [[c, prohibited_substructs] for c in contnrs]

    # Run the filter through the parallel object.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(
            params, parallel_durrant_lab_filter, num_procs, job_manager
        )
    else:
        # Serial fallback. Unpack each parameter entry so the worker receives
        # the container itself rather than the [container, substructs] list
        # that wraps it.
        tmp = [
            parallel_durrant_lab_filter(contnr, substructs)
            for contnr, substructs in params
        ]

    # Note that results is a list of containers.
    # Stripping out None values (failed).
    results = Parallelizer.strip_none(tmp)

    # You need to get the molecules as a flat array so you can run it through
    # bst_for_each_contnr_no_opt
    mols = []
    for contnr in results:
        mols.extend(contnr.mols)

    # Using this function just to make the changes.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        mols,
        1000,  # max_variants_per_compound
        1000,  # thoroughness
    )
def convert_2d_to_3d(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Converts the molecules in each container into 3D small-molecule
    models.

    Nothing is returned; the containers are updated through
    ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
        only this number of variants (molecules) will be advanced to the next
        step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
        (molecule) retained, for evaluation. For example, perhaps you want to
        advance five molecules (max_variants_per_compound = 5). You could
        just generate five and advance them all. Or you could generate ten
        and advance the best five (so thoroughness = 2). Using thoroughness >
        1 increases the computational expense, but it also increases the
        chances of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run the
        conversion serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Converting all molecules to 3D structures.")

    # Make the inputs to pass to the parallelizer: one single-element tuple
    # per molecule.
    params = tuple((mol,) for contnr in contnrs for mol in contnr.mols)

    # Run the parallelizer.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(
            params, parallel_make_3d, num_procs, job_manager
        )
    else:
        tmp = [parallel_make_3d(args[0]) for args in params]

    # Remove any Nones from the output, which represent failed molecules.
    clear = Parallelizer.strip_none(tmp)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, clear, max_variants_per_compound, thoroughness, False
    )
def durrant_lab_filters(contnrs, num_procs, job_manager, parallelizer_obj):
    """Removes any molecules that contain prohibited substructures, per the
    durrant-lab filters.

    The prohibited SMARTS patterns are read from the module-level
    ``prohibited_smi_substrs_for_substruc``. Nothing is returned; the
    containers are updated through ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run the
        filter serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Applying Durrant-lab filters to all molecules...")

    prohibited_substructs = [
        Chem.MolFromSmarts(s) for s in prohibited_smi_substrs_for_substruc
    ]

    # Get the parameters to pass to the parallelizer object. Each entry is
    # [container, prohibited substructures].
    params = [[c, prohibited_substructs] for c in contnrs]

    # Run the filter through the parallel object.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(
            params, parallel_durrant_lab_filter, num_procs, job_manager
        )
    else:
        # Serial fallback. Unpack each parameter entry so the worker receives
        # the container itself rather than the [container, substructs] list
        # that wraps it.
        tmp = [
            parallel_durrant_lab_filter(contnr, substructs)
            for contnr, substructs in params
        ]

    # Note that results is a list of containers.
    # Stripping out None values (failed).
    results = Parallelizer.strip_none(tmp)

    # You need to get the molecules as a flat array so you can run it through
    # bst_for_each_contnr_no_opt
    mols = []
    for contnr in results:
        mols.extend(contnr.mols)

    # Using this function just to make the changes. Doesn't do energy
    # minimization or anything (as it does later) because max variants
    # and thoroughness maxed out.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        mols,
        1000,  # max_variants_per_compound
        1000,  # thoroughness
    )
def enumerate_chiral_molecules(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Enumerates all possible enantiomers of a molecule. If the chirality of
    an atom is given, that chiral center is not varied. Only the chirality of
    unspecified chiral centers is varied.

    Nothing is returned; the containers are updated through
    ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
        only this number of variants (molecules) will be advanced to the next
        step. If 0, the function returns immediately.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
        (molecule) retained, for evaluation. For example, perhaps you want to
        advance five molecules (max_variants_per_compound = 5). You could
        just generate five and advance them all. Or you could generate ten
        and advance the best five (so thoroughness = 2). Using thoroughness >
        1 increases the computational expense, but it also increases the
        chances of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multiprocess mode.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run the
        enumeration serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No point in continuing if none requested.
    if max_variants_per_compound == 0:
        return

    Utils.log("Enumerating all possible enantiomers for all molecules...")

    # Group the molecules so you can feed them to parallelizer.
    params = tuple(
        (mol, thoroughness, max_variants_per_compound)
        for contnr in contnrs
        for mol in contnr.mols
    )

    # Run it through the parallelizer.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(
            params, parallel_get_chiral, num_procs, job_manager
        )
    else:
        tmp = [parallel_get_chiral(p[0], p[1], p[2]) for p in params]

    # Remove Nones (failed molecules)
    clean = Parallelizer.strip_none(tmp)

    # Flatten the data into a single list.
    flat = Parallelizer.flatten_list(clean)

    # Get the indexes of the ones that failed to generate.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, flat)

    # Go through the missing ones and throw a message.
    for miss_indx in contnr_idxs_of_failed:
        failed_contnr = contnrs[miss_indx]
        Utils.log(
            "\tCould not generate valid enantiomers for "
            + failed_contnr.orig_smi
            + " ("
            + failed_contnr.name
            + "), so using existing "
            + "(unprocessed) structures."
        )
        for mol in failed_contnr.mols:
            mol.genealogy.append("(WARNING: Unable to generate enantiomers)")
            # Add to the flattened list, which is what gets passed on below.
            # Appending to the pre-flattened list at this point would have no
            # effect on the molecules that are advanced.
            flat.append(mol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, flat, max_variants_per_compound, thoroughness
    )
def remove_identical_mols_from_contnr(self):
    """Removes identical molecules from this container.

    The de-duplication is delegated to ChemUtils.uniq_mols_in_list(); its
    result replaces self.mols.

    Historical note (kept from earlier debugging of this method): canonical
    SMILES generated directly from the stored RDKit mol, i.e.
    Chem.MolToSmiles(m.rdkit_mol, isomericSmiles=True, canonical=True),
    were observed not to be unique for identical structures. Regenerating
    the RDKit mol from the stored SMILES string (m.smiles()) and
    canonicalizing that did expose the duplicates. The reason for the
    discrepancy was never determined.
    """

    unique_mols = ChemUtils.uniq_mols_in_list(self.mols)
    self.mols = unique_mols
def make_tauts(contnrs, max_variants_per_compound, thoroughness, num_procs,
               job_manager, let_tautomers_change_chirality, parallelizer_obj):
    """Generates tautomers of the molecules. Note that some of the generated
    tautomers are not realistic. If you find a certain improbable
    substructure keeps popping up, add it to the list in the
    `prohibited_substructures` definition found with MyMol.py, in the function
    remove_bizarre_substruc().

    Nothing is returned; the containers are updated through
    ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
        only this number of variants (molecules) will be advanced to the next
        step. If 0, the function returns immediately.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
        (molecule) retained, for evaluation. For example, perhaps you want to
        advance five molecules (max_variants_per_compound = 5). You could
        just generate five and advance them all. Or you could generate ten
        and advance the best five (so thoroughness = 2). Using thoroughness >
        1 increases the computational expense, but it also increases the
        chances of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param let_tautomers_change_chirality: Whether to allow tautomers that
        change the total number of chiral centers.
    :type let_tautomers_change_chirality: bool
    :param parallelizer_obj: The Parallelizer object, or None to generate the
        tautomers serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No need to proceed if there are no max variants.
    if max_variants_per_compound == 0:
        return

    Utils.log("Generating tautomers for all molecules...")

    # Create the parameters to feed into the parallelizer object. The worker
    # is given the container plus the index of the molecule within it, not
    # the molecule itself.
    params = tuple(
        (contnr, mol_index, max_variants_per_compound)
        for contnr in contnrs
        for mol_index, _ in enumerate(contnr.mols)
    )

    # Run the tautomizer through the parallel object.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(
            params, parallel_make_taut, num_procs, job_manager
        )
    else:
        tmp = [parallel_make_taut(p[0], p[1], p[2]) for p in params]

    # Flatten the resulting list of lists.
    taut_data = Parallelizer.flatten_list(tmp)

    # Remove bad tautomers.
    taut_data = tauts_no_break_arom_rngs(
        contnrs, taut_data, num_procs, job_manager, parallelizer_obj
    )

    if not let_tautomers_change_chirality:
        taut_data = tauts_no_elim_chiral(
            contnrs, taut_data, num_procs, job_manager, parallelizer_obj
        )

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, taut_data, max_variants_per_compound, thoroughness
    )
def add_hydrogens(contnrs, min_pH, max_pH, st_dev, max_variants_per_compound,
                  thoroughness, num_procs, job_manager, parallelizer_obj):
    """Adds hydrogen atoms to molecule containers, as appropriate for a given
    pH.

    Nothing is returned; the containers are updated through
    ChemUtils.bst_for_each_contnr_no_opt().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param min_pH: The minimum pH to consider.
    :type min_pH: float
    :param max_pH: The maximum pH to consider.
    :type max_pH: float
    :param st_dev: The standard deviation. See Dimorphite-DL paper.
    :type st_dev: float
    :param max_variants_per_compound: To control the combinatorial explosion,
        only this number of variants (molecules) will be advanced to the next
        step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant
        (molecule) retained, for evaluation. For example, perhaps you want to
        advance five molecules (max_variants_per_compound = 5). You could
        just generate five and advance them all. Or you could generate ten
        and advance the best five (so thoroughness = 2). Using thoroughness >
        1 increases the computational expense, but it also increases the
        chances of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run the
        ionization serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Ionizing all molecules...")

    # Make a simple dictionary with the ionization parameters.
    protonation_settings = {
        "min_ph": min_pH,
        "max_ph": max_pH,
        "pka_precision": st_dev,
        "max_variants": thoroughness * max_variants_per_compound,
    }

    # Format the inputs for use in the parallelizer. Containers whose
    # canonical SMILES is not a string are skipped here; they are picked up
    # below as "not represented" and fall back to the original SMILES.
    inputs = tuple(
        (cont, protonation_settings)
        for cont in contnrs
        if isinstance(cont.orig_smi_canonical, str)
    )

    # Run the parallelizer and collect the results.
    if parallelizer_obj is not None:
        results = parallelizer_obj.run(
            inputs, parallel_add_H, num_procs, job_manager
        )
    else:
        results = [parallel_add_H(args[0], args[1]) for args in inputs]

    results = Parallelizer.flatten_list(results)

    # Dimorphite-DL might not have generated ionization states for some
    # molecules. Identify those that are missing.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, results)

    # For those molecules, just use the original SMILES string, with hydrogen
    # atoms added using RDKit.
    for miss_indx in contnr_idxs_of_failed:
        failed_contnr = contnrs[miss_indx]
        Utils.log(
            "\tWARNING: Gypsum-DL produced no valid ionization states for "
            + failed_contnr.orig_smi
            + " ("
            + failed_contnr.name
            + "), so using the original "
            + "smiles."
        )

        amol = failed_contnr.mol_orig_frm_inp_smi
        amol.contnr_idx = miss_indx

        # Save this failure to the genealogy record.
        amol.genealogy = [
            amol.orig_smi + " (source)",
            amol.orig_smi_deslt + " (desalted)",
            "(WARNING: Gypsum-DL could not assign ionization states)",
        ]

        # Save this one to the results too, even though not processed
        # properly.
        results.append(amol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, results, max_variants_per_compound, thoroughness
    )