Exemple #1
0
def durrant_lab_filters(contnrs, num_procs, job_manager, parallelizer_obj):
    """Removes any molecules that contain prohibited substructures, per the
    durrant-lab filters.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Applying Durrant-lab filters to all molecules...")

    # Get the substructures you won't permit.
    prohibited_smi_substrs = [
        "C=[N-]",
        "[N-]C=[N+]",
        "[nH+]c[n-]",
        "[#7+]~[#7+]",
        "[#7-]~[#7-]",
        "[!#7]~[#7+]~[#7-]~[!#7]"  # Doesn't hit azide.
    ]
    prohibited_substructs = [
        Chem.MolFromSmarts(s) for s in prohibited_smi_substrs
    ]

    # Get the parameters to pass to the parallelizer object.
    params = [[c, prohibited_substructs] for c in contnrs]

    # Run the tautomizer through the parallel object.
    tmp = []
    if parallelizer_obj != None:
        tmp = parallelizer_obj.run(params, parallel_durrant_lab_filter,
                                   num_procs, job_manager)
    else:
        for c in params:
            tmp.append(parallel_durrant_lab_filter(c, prohibited_substructs))

    # Note that results is a list of containers.

    # Stripping out None values (failed).
    results = Parallelizer.strip_none(tmp)

    # You need to get the molecules as a flat array so you can run it through
    # bst_for_each_contnr_no_opt
    mols = []
    for contnr in results:
        mols.extend(contnr.mols)
        # contnr.mols = []  # Necessary because ones are being removed...

    # Using this function just to make the changes.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        mols,
        1000,
        1000  # max_variants_per_compound, thoroughness
    )
def convert_2d_to_3d(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Converts the 1D smiles strings into 3D small-molecule models.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Converting all molecules to 3D structures.")

    # Make the inputs to pass to the parallelizer.
    params = []
    for contnr in contnrs:
        for mol in contnr.mols:
            params.append(tuple([mol]))
    params = tuple(params)

    # Run the parallelizer
    tmp = []
    if parallelizer_obj != None:
        tmp = parallelizer_obj.run(params, parallel_make_3d, num_procs,
                                   job_manager)
    else:
        for i in params:
            tmp.append(parallel_make_3d(i[0]))

    # Remove and Nones from the output, which represent failed molecules.
    clear = Parallelizer.strip_none(tmp)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(contnrs, clear,
                                         max_variants_per_compound,
                                         thoroughness, False)
Exemple #3
0
def durrant_lab_filters(contnrs, num_procs, job_manager, parallelizer_obj):
    """Removes any molecules that contain prohibited substructures, per the
    durrant-lab filters.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Applying Durrant-lab filters to all molecules...")

    prohibited_substructs = [
        Chem.MolFromSmarts(s) for s in prohibited_smi_substrs_for_substruc
    ]

    # Get the parameters to pass to the parallelizer object.
    params = [[c, prohibited_substructs] for c in contnrs]

    # Run the tautomizer through the parallel object.
    tmp = []
    if parallelizer_obj != None:
        tmp = parallelizer_obj.run(params, parallel_durrant_lab_filter,
                                   num_procs, job_manager)
    else:
        for c in params:
            tmp.append(parallel_durrant_lab_filter(c, prohibited_substructs))

    # Note that results is a list of containers.

    # Stripping out None values (failed).
    results = Parallelizer.strip_none(tmp)

    # You need to get the molecules as a flat array so you can run it through
    # bst_for_each_contnr_no_opt
    mols = []
    for contnr in results:
        mols.extend(contnr.mols)
        # contnr.mols = []  # Necessary because ones are being removed...

    # contnrs = results

    # print([c.orig_smi for c in results])
    # import pdb; pdb.set_trace()

    # Using this function just to make the changes. Doesn't do energy
    # minimization or anything (as it does later) because max variants
    # and thoroughness maxed out.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        mols,
        1000,
        1000  # max_variants_per_compound, thoroughness
    )
def enumerate_chiral_molecules(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Enumerates all possible enantiomers of a molecule. If the chirality of
       an atom is given, that chiral center is not varied. Only the chirality
       of unspecified chiral centers is varied.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multiprocess mode.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No point in continuing none requested.
    if max_variants_per_compound == 0:
        return

    Utils.log("Enumerating all possible enantiomers for all molecules...")

    # Group the molecules so you can feed them to parallelizer.
    params = []
    for contnr in contnrs:
        for mol in contnr.mols:
            params.append(tuple([mol, thoroughness, max_variants_per_compound]))
    params = tuple(params)

    # Run it through the parallelizer.
    tmp = []
    if parallelizer_obj != None:
        tmp = parallelizer_obj.run(params, parallel_get_chiral, num_procs, job_manager)
    else:
        for i in params:
            tmp.append(parallel_get_chiral(i[0], i[1], i[2]))

    # Remove Nones (failed molecules)
    clean = Parallelizer.strip_none(tmp)

    # Flatten the data into a single list.
    flat = Parallelizer.flatten_list(clean)

    # Get the indexes of the ones that failed to generate.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, flat)

    # Go through the missing ones and throw a message.
    for miss_indx in contnr_idxs_of_failed:
        Utils.log(
            "\tCould not generate valid enantiomers for "
            + contnrs[miss_indx].orig_smi
            + " ("
            + contnrs[miss_indx].name
            + "), so using existing "
            + "(unprocessed) structures."
        )
        for mol in contnrs[miss_indx].mols:
            mol.genealogy.append("(WARNING: Unable to generate enantiomers)")
            clean.append(mol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, flat, max_variants_per_compound, thoroughness
    )
Exemple #5
0
    def remove_identical_mols_from_contnr(self):
        """Removes itentical molecules from this container."""

        # For reasons I don't understand, the following doesn't give unique
        # canonical smiles:

        # Chem.MolToSmiles(self.mols[0].rdkit_mol, isomericSmiles=True,
        # canonical=True)

        # # This block for debugging. JDD: Needs attention?
        # all_can_noh_smiles = [m.smiles() for m in self.mols]  # Get all the smiles as stored.

        # wrong_cannonical_smiles = [
        #     Chem.MolToSmiles(
        #         m.rdkit_mol,  # Using the RdKit mol stored in MyMol
        #         isomericSmiles=True,
        #         canonical=True
        #     ) for m in self.mols
        # ]

        # right_cannonical_smiles = [
        #     Chem.MolToSmiles(
        #         Chem.MolFromSmiles(  # Regenerating the RdKit mol from the smiles string stored in MyMol
        #             m.smiles()
        #         ),
        #         isomericSmiles=True,
        #         canonical=True
        #     ) for m in self.mols]

        # if len(set(wrong_cannonical_smiles)) != len(set(right_cannonical_smiles)):
        #     Utils.log("ERROR!")
        #     Utils.log("Stored smiles string in this container:")
        #     Utils.log("\n".join(all_can_noh_smiles))
        #     Utils.log("")
        #     Utils.log("""Supposedly cannonical smiles strings generated from stored
        #         RDKit Mols in this container:""")
        #     Utils.log("\n".join(wrong_cannonical_smiles))
        #     Utils.log("""But if you plop these into chemdraw, you'll see some of them
        #         represent identical structures.""")
        #     Utils.log("")
        #     Utils.log("""Cannonical smiles strings generated from RDKit mols that
        #         were generated from the stored smiles string in this container:""")
        #     Utils.log("\n".join(right_cannonical_smiles))
        #     Utils.log("""Now you see the identical molecules. But why didn't the previous
        #         method catch them?""")
        #     Utils.log("")

        #     Utils.log("""Note that the third method identifies duplicates that the second
        #         method doesn't.""")
        #     Utils.log("")
        #     Utils.log("=" * 20)

        # # You need to make new molecules to get it to work.
        # new_smiles = [m.smiles() for m in self.mols]
        # new_mols = [Chem.MolFromSmiles(smi) for smi in new_smiles]
        # new_can_smiles = [Chem.MolToSmiles(new_mol, isomericSmiles=True, canonical=True) for new_mol in new_mols]

        # can_smiles_already_set = set([])
        # for i, new_can_smile in enumerate(new_can_smiles):
        #     if not new_can_smile in can_smiles_already_set:
        #         # Never seen before
        #         can_smiles_already_set.add(new_can_smile)
        #     else:
        #         # Seen before. Delete!
        #         self.mols[i] = None

        # while None in self.mols:
        #     self.mols.remove(None)

        self.mols = ChemUtils.uniq_mols_in_list(self.mols)
Exemple #6
0
def make_tauts(contnrs, max_variants_per_compound, thoroughness, num_procs,
               job_manager, let_tautomers_change_chirality, parallelizer_obj):
    """Generates tautomers of the molecules. Note that some of the generated
    tautomers are not realistic. If you find a certain improbable
    substructure keeps popping up, add it to the list in the
    `prohibited_substructures` definition found with MyMol.py, in the function
    remove_bizarre_substruc().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param let_tautomers_change_chirality: Whether to allow tautomers that
      change the total number of chiral centers.
    :type let_tautomers_change_chirality: bool
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No need to proceed if there are no max variants.
    if max_variants_per_compound == 0:
        return

    Utils.log("Generating tautomers for all molecules...")

    # Create the parameters to feed into the parallelizer object.
    params = []
    for contnr in contnrs:
        for mol_index, mol in enumerate(contnr.mols):
            params.append(tuple([contnr, mol_index,
                                 max_variants_per_compound]))
    params = tuple(params)

    # Run the tautomizer through the parallel object.
    tmp = []
    if parallelizer_obj != None:
        tmp = parallelizer_obj.run(params, parallel_make_taut, num_procs,
                                   job_manager)
    else:
        for i in params:
            tmp.append(parallel_make_taut(i[0], i[1], i[2]))

    # Flatten the resulting list of lists.
    none_data = tmp
    taut_data = Parallelizer.flatten_list(none_data)

    # Remove bad tautomers.
    taut_data = tauts_no_break_arom_rngs(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    if not let_tautomers_change_chirality:
        taut_data = tauts_no_elim_chiral(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    # taut_data = tauts_no_change_hs_to_cs_unless_alpha_to_carbnyl(
    #    contnrs, taut_data, num_procs, job_manager, parallelizer_obj
    # )

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(contnrs, taut_data,
                                         max_variants_per_compound,
                                         thoroughness)
Exemple #7
0
def add_hydrogens(contnrs, min_pH, max_pH, st_dev, max_variants_per_compound,
                  thoroughness, num_procs, job_manager,
                  parallelizer_obj):
    """Adds hydrogen atoms to molecule containers, as appropriate for a given
       pH.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param min_pH: The minimum pH to consider.
    :type min_pH: float
    :param max_pH: The maximum pH to consider.
    :type max_pH: float
    :param st_dev: The standard deviation. See Dimorphite-DL paper.
    :type st_dev: float
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Ionizing all molecules...")

    # Make a simple directory with the ionization parameters.
    protonation_settings = {"min_ph": min_pH,
                            "max_ph": max_pH,
                            "pka_precision": st_dev,
                            "max_variants": thoroughness * max_variants_per_compound}

    # Format the inputs for use in the parallelizer.
    inputs = tuple([tuple([cont, protonation_settings]) for cont in contnrs if type(cont.orig_smi_canonical)==str])

    # Run the parallelizer and collect the results.
    results = []
    if parallelizer_obj !=  None:
        results = parallelizer_obj.run(inputs, parallel_add_H, num_procs, job_manager)
    else:
        for i in inputs:
            results.append(parallel_add_H(i[0],i[1]))

    results = Parallelizer.flatten_list(results)

    # Dimorphite-DL might not have generated ionization states for some
    # molecules. Identify those that are missing.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, results)

    # For those molecules, just use the original SMILES string, with hydrogen
    # atoms added using RDKit.
    for miss_indx in contnr_idxs_of_failed:
        Utils.log(
            "\tWARNING: Gypsum-DL produced no valid ionization states for " +
            contnrs[miss_indx].orig_smi + " (" +
            contnrs[miss_indx].name + "), so using the original " +
            "smiles."
        )

        amol = contnrs[miss_indx].mol_orig_frm_inp_smi
        amol.contnr_idx = miss_indx

        # Save this failure to the genealogy record.
        amol.genealogy = [
            amol.orig_smi + " (source)",
            amol.orig_smi_deslt + " (desalted)",
            "(WARNING: Gypsum-DL could not assign ionization states)"
        ]

        # Save this one to the results too, even though not processed
        # properly.
        results.append(amol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, results, max_variants_per_compound, thoroughness
    )