Ejemplo n.º 1
0
    def mol_with_smiles_is_in_contnr(self, smiles):
        """Tests whether a molecule matching the given smiles string is
           already stored in this container.

        :param smiles: The smiles string to look for.
        :type smiles: str
        :return: True if a matching molecule is present. Otherwise, the newly
           built MyMol.MyMol object corresponding to that smiles string.
        :rtype: bool or MyMol.MyMol
        """

        # Collect the smiles() output of every molecule currently held.
        # TODO: This set is rebuilt on every call; caching it would avoid
        # the repeated work.
        known_smiles = {held.smiles() for held in self.mols}

        # Build the candidate molecule so that the comparison below uses the
        # same smiles() representation as the stored molecules.
        candidate = MyMol.MyMol(smiles)

        # A hit is reported as True; a miss hands back the candidate itself.
        return True if candidate.smiles() in known_smiles else candidate
Ejemplo n.º 2
0
    def update_orig_smi(self, orig_smi):
        """Updates the orig_smi string and every container-level value that
           is derived from it. Used by desalter (to replace with largest
           fragment).

        The reference molecule is rebuilt from the new smiles string, the
        fragment cache is cleared, the per-molecule statistics are recomputed
        and the list of derived molecules is emptied.

        :param orig_smi: The replacement smiles string.
        :type orig_smi: str
        """

        # Update the MolContainer object
        self.orig_smi = orig_smi
        self.orig_smi_deslt = orig_smi
        self.mol_orig_frm_inp_smi = MyMol.MyMol(self.orig_smi, self.name)

        # The constructor tags the reference molecule with the container
        # index. The rebuilt molecule needs the same tag, otherwise it is lost
        # as soon as the smiles string is replaced.
        self.mol_orig_frm_inp_smi.contnr_idx = self.contnr_idx

        # Reset the fragment cache; it described the old smiles string.
        self.frgs = ""

        self.orig_smi_canonical = self.mol_orig_frm_inp_smi.smiles()
        self.num_nonaro_rngs = len(
            self.mol_orig_frm_inp_smi.get_idxs_of_nonaro_rng_atms()
        )
        self.num_specif_chiral_cntrs = len(
            self.mol_orig_frm_inp_smi.chiral_cntrs_only_asignd()
        )
        self.num_unspecif_chiral_cntrs = len(
            self.mol_orig_frm_inp_smi.chiral_cntrs_w_unasignd()
        )

        # Keep the carbon-hydrogen footprint in sync with the new molecule,
        # exactly as the constructor computes it. Without this the value
        # would still describe the molecule that was just replaced.
        self.carbon_hydrogen_count = self.mol_orig_frm_inp_smi.count_hyd_bnd_to_carb()

        # None of the mols derived to date, if present, are accurate.
        self.mols = []
Ejemplo n.º 3
0
def parallel_add_H(contnr, protonation_settings):
    """Creates alternate ionization variants for a given molecule container.
       This is the function that gets fed into the parallelizer.

    Note that protonation_settings is modified in place: its "smiles" entry
    is overwritten with the canonical SMILES string of the container.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param protonation_settings: Protonation settings to pass to Dimorphite-DL.
    :type protonation_settings: dict
    :return: A list of MyMol.MyMol objects, one for each protonated SMILES
       string that RDKit could parse and that was not rejected by
       remove_bizarre_substruc().
    :rtype: list
    """

    # Make sure the canonical SMILES is actually a string.
    if not isinstance(contnr.orig_smi_canonical, str):
        # The value is, by definition, not a string in this branch, so it
        # must be converted before being concatenated into the messages.
        # Otherwise the concatenation itself raises a TypeError and the
        # intended diagnostics are never emitted.
        bad_value = str(contnr.orig_smi_canonical)
        Utils.log("container.orig_smi_canonical: " + bad_value)
        Utils.log("type container.orig_smi_canonical: " +
                  str(type(contnr.orig_smi_canonical)))
        Utils.exception("container.orig_smi_canonical: " + bad_value)

    # Add the SMILES string to the protonation parameters.
    protonation_settings["smiles"] = contnr.orig_smi_canonical

    # Protonate the SMILES string. This is Dimorphite-DL.
    smis = Protonate(protonation_settings)

    # Convert the protonated SMILES strings into a list of rdkit molecule
    # objects. Entries are None where the SMILES string could not be parsed.
    rdkit_mols = [Chem.MolFromSmiles(smi.strip()) for smi in smis]

    # Convert from rdkit mols to MyMol.MyMol, skipping the unparsable ones.
    addH_mols = [MyMol.MyMol(mol) for mol in rdkit_mols if mol is not None]

    # Remove MyMols with odd substructures.
    addH_mols = [
        mol for mol in addH_mols if mol.remove_bizarre_substruc() is False
    ]

    # I once saw it add a "C+" here. So do a secondary check at this point to
    # make sure it's valid. Recreate the list, moving new MyMol.MyMol objects
    # into the return_values list.

    return_values = []

    orig_mol = contnr.mol_orig_frm_inp_smi
    for Hm in addH_mols:
        Hm.inherit_contnr_props(contnr)

        # Copy the genealogy so each variant gets its own independent list.
        Hm.genealogy = orig_mol.genealogy[:]
        Hm.name = orig_mol.name

        # Only record a genealogy step if protonation changed the molecule.
        if Hm.smiles() != orig_mol.smiles():
            Hm.genealogy.append(Hm.smiles(True) + " (protonated)")

        return_values.append(Hm)

    return return_values
Ejemplo n.º 4
0
    def __init__(self, smiles, name, index, properties):
        """Builds a container around a single input molecule.

        :param smiles: The SMILES string of the input molecule. It is passed
           unchanged to MyMol.MyMol.
        :type smiles: str
        :param name: The name of the molecule.
        :type name: str
        :param index: The index of this MolContainer in the main MolContainer
           list.
        :type index: int
        :param properties: A dictionary of properties from the sdf.
        :type properties: dict
        """

        # Values that live on the container rather than on any single MyMol.
        # The index is stored twice: contnr_idx might be reset in some
        # circumstances (mpi), while contnr_idx_orig keeps the original value
        # for filename output.
        self.contnr_idx = index
        self.contnr_idx_orig = index
        self.orig_smi = smiles
        self.orig_smi_deslt = smiles  # Initial assumption.
        self.mols = []
        self.name = name
        self.properties = properties

        # The reference molecule built straight from the input smiles. It
        # carries the container index too.
        src_mol = MyMol.MyMol(smiles, name)
        src_mol.contnr_idx = self.contnr_idx
        self.mol_orig_frm_inp_smi = src_mol

        # Used for caching.
        self.frgs = ""

        # The smiles string as reported by the reference molecule.
        self.orig_smi_canonical = src_mol.smiles()

        # Number of entries returned for non-aromatic ring atoms.
        nonaro_rng_atm_idxs = src_mol.get_idxs_of_nonaro_rng_atms()
        self.num_nonaro_rngs = len(nonaro_rng_atm_idxs)

        # Number of chiral centers that are assigned.
        assigned_cntrs = src_mol.chiral_cntrs_only_asignd()
        self.num_specif_chiral_cntrs = len(assigned_cntrs)

        # Number of entries reported by chiral_cntrs_w_unasignd().
        cntrs_w_unassigned = src_mol.chiral_cntrs_w_unasignd()
        self.num_unspecif_chiral_cntrs = len(cntrs_w_unassigned)

        # The non-acidic carbon-hydrogen footprint.
        self.carbon_hydrogen_count = src_mol.count_hyd_bnd_to_carb()
Ejemplo n.º 5
0
def desalter(contnr):
    """Desalts molecules in a molecule container.

    If the original smiles consists of a single fragment, the container's
    own reference molecule is returned unchanged. Otherwise a new molecule is
    built from the fragment with the most heavy atoms.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :return: A molecule object.
    :rtype: MyMol.MyMol
    """

    # Split it into fragments
    frags = contnr.get_frags_of_orig_smi()

    if len(frags) == 1:
        # It's only got one fragment, so default assumption that
        # orig_smi = orig_smi_deslt is correct.
        return contnr.mol_orig_frm_inp_smi

    Utils.log(
        "\tMultiple fragments found in "
        + contnr.orig_smi
        + " ("
        + contnr.name
        + ")"
    )

    # Find the biggest fragment. When several fragments share the largest
    # heavy-atom count, the last one of them is used.
    heavy_atom_counts = [frag.GetNumHeavyAtoms() for frag in frags]
    max_num = max(heavy_atom_counts)
    biggest_frag = [
        frag for frag, cnt in zip(frags, heavy_atom_counts) if cnt == max_num
    ][-1]

    # Return info about that biggest fragment.
    new_mol = MyMol.MyMol(biggest_frag)
    new_mol.contnr_idx = contnr.contnr_idx
    new_mol.name = contnr.name

    # Copy the genealogy rather than sharing the list object, so that later
    # additions to the desalted molecule's history do not also modify the
    # history of the container's reference molecule.
    new_mol.genealogy = contnr.mol_orig_frm_inp_smi.genealogy[:]

    new_mol.make_mol_frm_smiles_sanitze()  # Need to update the mol.
    return new_mol
Ejemplo n.º 6
0
def parallel_get_chiral(mol, max_variants_per_compound, thoroughness):
    """Enumerates chiralities for one molecule. Suitable for use with the
       parallelizer.

    :param mol: The input molecule.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects. If the molecule has no chiral
       centers marked "?", the list contains only the input molecule.
    :rtype: list
    """

    # First element of every chiral-center entry whose second element is
    # "?", i.e., the centers not assigned explicitly in the input molecule.
    open_centers = [
        center[0] for center in mol.chiral_cntrs_w_unasignd() if center[1] == "?"
    ]
    num_open = len(open_centers)

    # Nothing to enumerate, so just keep the existing molecule.
    if num_open == 0:
        return [mol]

    # Upper bound on how many combinations are worth carrying along.
    sample_size = thoroughness * max_variants_per_compound

    if num_open == 1:
        # A single center: the two options are the labels themselves.
        options = ["R", "S"]
    else:
        # Several centers: grow the list of label combinations one center at
        # a time. This lends itself to a combinatorial explosion, so stop
        # early once there are already more than enough combinations. When
        # that happens, the combinations are shorter than the list of centers
        # and the remaining centers are never varied.
        labels = ("R", "S")
        options = [[label] for label in labels]
        for _ in range(num_open - 1):
            if len(options) > sample_size:
                break
            options = [combo + [label] for combo in options for label in labels]

    # Let the user know the number of chiral centers.
    message = (
        "\t"
        + mol.smiles(True)
        + " ("
        + mol.name
        + ") has "
        + str(2 ** num_open)
        + " enantiomers when chiral centers with "
        + "no specified chirality are systematically varied."
    )
    Utils.log(message)

    # Randomly keep only a few of the combinations, again to limit the
    # potential combinatorial explosion.
    options = Utils.random_sample(options, sample_size, "")

    # RDKit chiral tag applied for each label.
    chiral_tags = {
        "R": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        "S": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    results = []
    for option in options:
        # Work on a copy of the initial rdkit molecule.
        rd_copy = copy.copy(mol.rdkit_mol)

        # Apply this combination's labels to the open centers.
        for atom_idx, label in zip(open_centers, option):
            if label in chiral_tags:
                rd_copy.GetAtomWithIdx(atom_idx).SetChiralTag(chiral_tags[label])

        # Wrap the modified rdkit molecule in a new MyMol.MyMol object.
        candidate = MyMol.MyMol(rd_copy)

        # Skip candidates flagged as having a bizarre substructure.
        if candidate.remove_bizarre_substruc():
            continue

        candidate.contnr_idx = mol.contnr_idx
        candidate.name = mol.name
        candidate.genealogy = mol.genealogy[:]
        candidate.genealogy.append(candidate.smiles(True) + " (chirality)")
        results.append(candidate)

    return results
Ejemplo n.º 7
0
def parallel_get_double_bonded(mol, max_variants_per_compound, thoroughness):
    """A parallelizable function for enumerating double bonds.

    Note that the input molecule is modified: explicit hydrogens are added to
    its rdkit_mol.

    :param mol: The molecule with a potentially unspecified double bond.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects, one per enumerated cis-trans
       variant. If there is nothing to enumerate, the list contains only the
       input molecule.
    :rtype: list
    """

    # For this to work, you need to have explicit hydrogens in place.
    mol.rdkit_mol = Chem.AddHs(mol.rdkit_mol)

    # Get all double bonds that don't have defined stereochemistry. Note that
    # these are the bond indexes, not the atom indexes.
    unasignd_dbl_bnd_idxs = mol.get_double_bonds_without_stereochemistry()

    if len(unasignd_dbl_bnd_idxs) == 0:
        # There are no unassigned double bonds, so move on.
        return [mol]

    # Throw out any bond that is in a small ring (3 to 7 members).
    small_ring_sizes = (3, 4, 5, 6, 7)
    unasignd_dbl_bnd_idxs = [
        i
        for i in unasignd_dbl_bnd_idxs
        if not any(
            mol.rdkit_mol.GetBondWithIdx(i).IsInRingSize(size)
            for size in small_ring_sizes
        )
    ]

    # Previously, I fully enumerated all double bonds. When there are many
    # such bonds, that leads to a combinatorial explosion that causes problems
    # in terms of speed and memory. Now, enumerate only enough bonds to make
    # sure you generate at least thoroughness * max_variants_per_compound.
    unasignd_dbl_bnd_idxs_orig_count = len(unasignd_dbl_bnd_idxs)
    num_bonds_to_keep = int(
        math.ceil(math.log(thoroughness * max_variants_per_compound, 2))
    )
    random.shuffle(unasignd_dbl_bnd_idxs)
    unasignd_dbl_bnd_idxs = sorted(unasignd_dbl_bnd_idxs[:num_bonds_to_keep])

    # Get a list of all the single bonds that come off each double-bond atom.
    all_sngl_bnd_idxs = set()
    dbl_bnd_count = 0
    for dbl_bnd_idx in unasignd_dbl_bnd_idxs:
        bond = mol.rdkit_mol.GetBondWithIdx(dbl_bnd_idx)

        atom1 = bond.GetBeginAtom()
        if len(atom1.GetBonds()) == 1:
            # The only bond is the one you already know about. So don't save.
            continue

        atom2 = bond.GetEndAtom()
        if len(atom2.GetBonds()) == 1:
            # The only bond is the one you already know about. So don't save.
            continue

        dbl_bnd_count += 1

        # Suffice it to say, RDKit does not deal with cis-trans isomerization
        # in an intuitive way...
        idxs_of_other_bnds_frm_atm1 = [b.GetIdx() for b in atom1.GetBonds()]
        idxs_of_other_bnds_frm_atm1.remove(dbl_bnd_idx)

        idxs_of_other_bnds_frm_atm2 = [b.GetIdx() for b in atom2.GetBonds()]
        idxs_of_other_bnds_frm_atm2.remove(dbl_bnd_idx)

        all_sngl_bnd_idxs |= set(idxs_of_other_bnds_frm_atm1)
        all_sngl_bnd_idxs |= set(idxs_of_other_bnds_frm_atm2)

    # Now come up with all possible up/down combinations for those bonds.
    all_sngl_bnd_idxs = list(all_sngl_bnd_idxs)
    all_atom_config_options = list(
        itertools.product([True, False], repeat=len(all_sngl_bnd_idxs))
    )

    # Let the user know.
    if dbl_bnd_count > 0:
        Utils.log(
            "\t"
            + mol.smiles(True)
            + " has "
            + str(
                # Not exactly right, I think, because should be dbl_bnd_count, but ok.
                unasignd_dbl_bnd_idxs_orig_count
            )
            + " double bond(s) with unspecified stereochemistry."
        )

    # Go through and consider each of the retained combinations.
    smiles_to_consider = set()
    for atom_config_options in all_atom_config_options:
        # Make a copy of the original RDKit molecule.
        a_rd_mol = copy.copy(mol.rdkit_mol)

        for bond_idx, direc in zip(all_sngl_bnd_idxs, atom_config_options):
            # Always done with reference to the atom in the double bond.
            if direc:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(Chem.BondDir.ENDUPRIGHT)
            else:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(Chem.BondDir.ENDDOWNRIGHT)

        # Assign the StereoChemistry. Required to actually set it.
        a_rd_mol.ClearComputedProps()
        Chem.AssignStereochemistry(a_rd_mol, force=True)

        # Add to list of ones to consider
        try:
            smiles_to_consider.add(
                Chem.MolToSmiles(a_rd_mol, isomericSmiles=True, canonical=True)
            )
        except Exception:
            # Some molecules still give troubles. Unfortunate, but these are
            # rare cases. Let's just skip these. Example:
            # CN1C2=C(C=CC=C2)C(C)(C)[C]1=[C]=[CH]C3=CC(=C(O)C(=C3)I)I
            #
            # Only Exception is caught (not a bare except), so that
            # KeyboardInterrupt and SystemExit still stop the run.
            continue

    # Remove ones that don't have "/" or "\". These are not real enumerated ones.
    smiles_to_consider = [s for s in smiles_to_consider if "/" in s or "\\" in s]

    # Get the number of / + \ in each string.
    cnts = [s.count("/") + s.count("\\") for s in smiles_to_consider]

    if len(cnts) == 0:
        # There are no appropriate double bonds. Move on...
        return [mol]

    max_cnts = max(cnts)

    # Only keep those with that same max count. The others have double bonds
    # that remain unspecified.
    smiles_to_consider = [
        smi for smi, cnt in zip(smiles_to_consider, cnts) if cnt == max_cnts
    ]

    results = []
    for smile_to_consider in smiles_to_consider:
        # Make a new MyMol.MyMol object with the specified smiles.
        new_mol = MyMol.MyMol(smile_to_consider)

        # Sometimes you get an error if there's a bad structure otherwise.
        # The membership test compares by equality, like the != checks it
        # replaces.
        if new_mol.can_smi in (False, None):
            continue

        # Add the new molecule to the list of results, if it does not have
        # a bizarre substructure.
        if not new_mol.remove_bizarre_substruc():
            new_mol.contnr_idx = mol.contnr_idx
            new_mol.name = mol.name
            new_mol.genealogy = mol.genealogy[:]
            new_mol.genealogy.append(
                new_mol.smiles(True) + " (cis-trans isomerization)"
            )
            results.append(new_mol)

    # Return the results.
    return results
Ejemplo n.º 8
0
def parallel_make_taut(contnr, mol_index, max_variants_per_compound):
    """Makes alternate tautomers for a given molecule container. This is the
       function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param mol_index: The molecule index.
    :type mol_index: int
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :return: A list of MyMol.MyMol objects, containing the alternate
        tautomeric forms. None if no usable RDKit molecule could be prepared
        for the enumeration.
    :rtype: list or None
    """

    # Get the MyMol.MyMol within the molecule container corresponding to the
    # given molecule index.
    mol = contnr.mols[mol_index]

    # Create a temporary RDKit mol object, since that's what MolVS works with.
    # TODO: There should be a copy function
    m = MyMol.MyMol(mol.smiles()).rdkit_mol

    # For tautomers to work, you need to not have any explicit hydrogens.
    # The hydrogens are only removed when there is a molecule to work on, so
    # that a missing molecule reaches the check below instead of being passed
    # to RDKit.
    if m is not None:
        m = Chem.RemoveHs(m)

    # Make sure it's not None.
    if m is None:
        Utils.log("\tCould not generate tautomers for " + contnr.orig_smi +
                  ". I'm deleting it.")
        return None

    # Molecules should be kekulized already, but let's double check that.
    # Because MolVS requires kekulized input.
    Chem.Kekulize(m)
    m = MOH.check_sanitization(m)
    if m is None:
        return None

    # Limit to max_variants_per_compound tauts. Note that another batch could
    # add more, so you'll need to once again trim to this number later. But
    # this could at least help prevent the combinatorial explosion at this
    # stage.
    enum = tautomer.TautomerEnumerator(max_tautomers=max_variants_per_compound)
    tauts_rdkit_mols = enum.enumerate(m)

    # Make all those tautomers into MyMol objects. The loop variable gets its
    # own name so it does not shadow the molecule prepared above.
    tauts_mols = [MyMol.MyMol(taut_rd_mol) for taut_rd_mol in tauts_rdkit_mols]

    # Keep only those that have reasonable substructures.
    tauts_mols = [
        t for t in tauts_mols if t.remove_bizarre_substruc() == False
    ]

    # If there's more than one, let the user know that.
    if len(tauts_mols) > 1:
        Utils.log("\t" + mol.smiles(True) + " has tautomers.")

    # Now collect the final results.
    results = []

    for tm in tauts_mols:
        tm.inherit_contnr_props(contnr)

        # Copy the genealogy so each tautomer gets its own independent list.
        tm.genealogy = mol.genealogy[:]
        tm.name = mol.name

        # Only record a genealogy step if the tautomer differs from the
        # molecule it was generated from.
        if tm.smiles() != mol.smiles():
            tm.genealogy.append(tm.smiles(True) + " (tautomer)")

        results.append(tm)

    return results