Exemple #1
0
    def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
        """Standardize a molecule and return it with its canonical SMILES.

        The molecule is round-tripped through its canonical SMILES string and
        then, depending on the instance flags `add_Hs` and `kekulize`, given
        explicit hydrogens and/or kekulized.

        This method should be called before `get_input_feats`.

        Params:
        -------
        mol: rdkit.Chem.rdchem.Mol
            Molecule of interest.

        Returns:
        --------
        canonical_smiles: str
            Canonical SMILES representation of the input molecule (generated
            before any hydrogens are added).

        mol: rdkit.Chem.rdchem.Mol
            Molecule rebuilt from the canonical SMILES, with Hs added and
            kekulization applied, if specified.
        """
        smiles = rdmolfiles.MolToSmiles(mol, canonical=True)
        standardized = rdmolfiles.MolFromSmiles(smiles)

        # Hydrogens are added first so that kekulization (when requested)
        # operates on the final atom set.
        if self.add_Hs:
            standardized = rdmolops.AddHs(standardized)
        if self.kekulize:
            # Called for its effect on `standardized`; the return value is unused.
            rdmolops.Kekulize(standardized)
        return smiles, standardized
Exemple #2
0
    def process_molecule(self, pdb_file, use_esp=False):
        """
        Reads a molecule from the passed PDB file and extracts its centered
        atom coordinates and van der Waals radii, optionally together with
        Gasteiger partial charges.
        :param pdb_file: path to the PDB file to process the molecule from.
        :param use_esp: when true, missing hydrogens are added (with
            coordinates) and Gasteiger charges are computed and returned
            under the "charges" key.
        :return: a dict with the keys "coords" (atom positions relative to the
            centroid) and "vdwradii", plus "charges" when `use_esp` is set;
            None if the file cannot be read/parsed or charge evaluation fails.
        """

        # NOTE: Gasteiger is an inappropriate algorithm for ESP calculation of proteins!
        try:
            mol = Chem.MolFromPDBFile(molFileName=pdb_file,
                                      removeHs=False,
                                      sanitize=True)
        except IOError:
            log.warning("Could not read PDB file.")
            return None

        if mol is None:
            log.warning("Bad pdb file found.")
            return None

        if use_esp:
            try:
                # Hydrogens must be present before partial charges are assigned.
                mol = rdMO.AddHs(mol, addCoords=True)
                rdPC.ComputeGasteigerCharges(mol, throwOnParamFailure=True)
            except ValueError:
                log.warning("Bad Gasteiger charge evaluation.")
                return None

        conformer = mol.GetConformer()

        # The centroid (hydrogens included) becomes the coordinate origin.
        centroid = rdMT.ComputeCentroid(conformer, ignoreHs=False)
        origin = np.asarray([centroid.x, centroid.y, centroid.z])

        positions = []
        for atom_index in range(0, mol.GetNumAtoms()):
            point = conformer.GetAtomPosition(atom_index)
            positions.append(np.asarray([point.x, point.y, point.z]))

        atoms = mol.GetAtoms()
        radii = [
            self.periodic_table.GetRvdw(atom.GetAtomicNum()) for atom in atoms
        ]

        res = {
            "coords": np.asarray(positions) - origin,
            "vdwradii": np.asarray(radii),
        }
        if use_esp:
            res['charges'] = np.asarray(
                [float(atom.GetProp("_GasteigerCharge")) for atom in atoms])
        return res
 def get_all_interactions(self, pdbfile, sdffile):
     """Compute the interactions of every compound against the residues.

     Residue names and atoms are obtained from `get_residues_and_atoms(pdbfile)`
     and the compounds from `get_compounds(sdffile)`. Explicit hydrogens are
     added to each compound before it is passed to `self.get_interactions`.

     Returns a dict mapping each compound's position in the compound sequence
     to the result of `self.get_interactions` for that compound.
     """
     residue_names, residue_atoms = get_residues_and_atoms(pdbfile)
     return {
         position: self.get_interactions(rdmolops.AddHs(ligand),
                                         residue_atoms, residue_names)
         for position, ligand in enumerate(get_compounds(sdffile))
     }
def test_mols():
    """Build a list of embedded sample molecules tagged with a random fitness.

    Each SMILES is parsed, given explicit hydrogens, embedded with ETKDG,
    stripped of hydrogens again and assigned a 'Fitness' string property
    drawn from `np.random.rand`.
    """
    smiles_pool = (
        'CN=C=O',
        'Cc1ccccc1',
        'CC1=CC2CC(CC1)O2',
        'CCCCCCCCCCCCCCCC',
    )

    def build(smiles):
        # Hydrogens are present for the embedding step and removed afterwards.
        hydrogenated = rdmolops.AddHs(rdmolfiles.MolFromSmiles(smiles),
                                      addCoords=True)
        rdDistGeom.EmbedMolecule(hydrogenated, rdDistGeom.ETKDG())
        stripped = rdmolops.RemoveHs(hydrogenated)
        stripped.SetProp('Fitness', str(np.random.rand(1)[0]))
        return stripped

    return [build(smiles) for smiles in smiles_pool]
Exemple #5
0
    def apply_retrorules(self, smile, rxns, explicit_hydrogens=False):
        '''Apply every reaction in `rxns` to the substrate given as a SMILES.

        :param smile: SMILES string of the substrate.
        :param rxns: dict mapping reaction names to reaction objects exposing
            `RunReactants`.
        :param explicit_hydrogens: when true, hydrogens are added to the
            substrate before running the reactions and removed from each
            product afterwards.
        :return: dict of rxn_name -> list of product sets, each product set
            being a list of product SMILES that passed
            `self._check_valid_smile`. Reactions without any valid product are
            omitted. An empty dict is returned when the substrate SMILES
            cannot be parsed.
        '''
        try:
            substrate_molecule = AllChem.MolFromSmiles(smile)
        except Exception:
            return {}

        # RDKit reports an unparsable SMILES by returning None rather than by
        # raising, so the failure has to be checked for explicitly.
        if substrate_molecule is None:
            return {}

        if explicit_hydrogens:
            substrate_molecule = rdmolops.AddHs(substrate_molecule)

        rxn_product_dict = {}
        for rxn_name, rxn in rxns.items():
            try:
                products = rxn.RunReactants((substrate_molecule, ))
            except Exception:
                products = []
                print('Error running reactants for: ' + str(smile))

            smiles_products = []
            for product in products:
                sub_list = []
                for product_mol in product:
                    # Fallback used when the product cannot be split into
                    # fragments: the product as produced by the reaction.
                    fragments = [product_mol]

                    if explicit_hydrogens:
                        product_mol = rdmolops.RemoveHs(product_mol)

                    try:
                        fragments = rdmolops.GetMolFrags(product_mol,
                                                         asMols=True)
                    except Exception:
                        # Best effort: keep the unsplit product.
                        pass

                    for fragment in fragments:
                        try:
                            p_smile = AllChem.MolToSmiles(fragment)
                            p_smile = rdkit_smile(p_smile)
                            # Equality kept on purpose: the return type of
                            # _check_valid_smile is not visible from here.
                            if self._check_valid_smile(
                                    p_smile, rxn_name=rxn_name) == True:
                                sub_list.append(p_smile)
                        except Exception:
                            # Best effort: a fragment that cannot be converted
                            # or validated is skipped.
                            continue

                # Keep each distinct, non-empty product set once.
                if sub_list and sub_list not in smiles_products:
                    smiles_products.append(sub_list)

            if smiles_products:
                rxn_product_dict[rxn_name] = smiles_products

        return rxn_product_dict
Exemple #6
0
def opt_geometry(mol, max_iter, mmffvariant, seed, max_attempts):
    """Embed a molecule in 3D and optimize its geometry with MMFF.

    Explicit hydrogens are added, a conformer is generated with
    `AllChem.EmbedMolecule` (random starting coordinates, existing conformers
    cleared) and then optimized with `AllChem.MMFFOptimizeMolecule`.

    :param mol: molecule to embed and optimize.
    :param max_iter: forwarded as `maxIters` to the MMFF optimizer.
    :param mmffvariant: forwarded as `mmffVariant` to the MMFF optimizer.
    :param seed: forwarded as `randomSeed` to the embedding.
    :param max_attempts: forwarded as `maxAttempts` to the embedding.
    :return: tuple `(mol, err)`. `err` is 1 when the embedding fails or a
        ValueError/TypeError is raised along the way, else 0. `mol` is the
        molecule in whatever state was reached (the input molecule itself if
        adding hydrogens already failed).
    """
    try:
        mol = rdmolops.AddHs(mol)
        conf_id = AllChem.EmbedMolecule(mol,
                                        useRandomCoords=True,
                                        useBasicKnowledge=True,
                                        randomSeed=seed,
                                        clearConfs=True,
                                        maxAttempts=max_attempts)
        if conf_id == -1:
            # Embedding produced no conformer: report the failure explicitly
            # instead of relying on the optimizer to raise later.
            return mol, 1

        AllChem.MMFFOptimizeMolecule(mol,
                                     maxIters=max_iter,
                                     mmffVariant=mmffvariant)
    except (ValueError, TypeError):
        return mol, 1

    return mol, 0
Exemple #7
0
def construct_pos_matrix(mol: rdchem.Mol,
                         out_size: Optional[int] = -1) -> np.ndarray:
    """Construct relative positions from each atom within the molecule.

    If the molecule has no conformer, one is generated first: hydrogens are
    added, the molecule is embedded with ETKDG and the hydrogens are removed
    again.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    out_size: int, optional, default=-1
        The size of the returned array. If this option is negative, it
        does not take any effect. Otherwise, it must be larger than or
        equal to the number of atoms in the input molecule. If so, the
        end of the array is padded with zeros.

    Returns:
    --------
    pos_matrix: np.ndarray, shape=(n,n,3)
        Relative position (XYZ) coordinates from one atom to the others in
        the mol: `pos_matrix[i, j]` is the position of atom `i` minus the
        position of atom `j`.

    Raises:
    -------
    ValueError
        If `out_size` is non-negative and smaller than the number of atoms.

    Examples:
    ---------
    ```python
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem
    >>> smiles = ('N[C@@]([H])([C@]([H])(O2)C)C(=O)N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])([C@]([H])'
    ...           '(O)C)C(=O)N[C@@]([H])(Cc1ccc(O)cc1)C(=O)2')
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> mol = Chem.AddHs(mol, addCoords=True)
    >>> AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    >>> mol = Chem.RemoveHs(mol)
    >>> pos_matrix = construct_pos_matrix(mol, out_size=-1)
    >>> pos_matrix.shape
    (34, 34, 3)

    >>> pos_matrix = construct_pos_matrix(mol, out_size=49)
    >>> pos_matrix.shape
    (49, 49, 3)
    ```
    """
    # Obtain initial distance geometry between atoms, if unavailable.
    if mol.GetNumConformers() == 0:
        mol = rdmolops.AddHs(mol, addCoords=True)
        rdDistGeom.EmbedMolecule(mol, rdDistGeom.ETKDG())
        mol = rdmolops.RemoveHs(mol)
    coords = np.asarray(mol.GetConformer().GetPositions())  # shape=(N,3)
    N = mol.GetNumAtoms()

    # Determine the output size so that feature matrices of different
    # molecules can share one shape.
    if out_size < 0:
        size = N
    elif out_size >= N:
        size = out_size
    else:
        raise ValueError(
            '`out_size` (N={}) is smaller than number of atoms in mol (N={})'.
            format(out_size, N))

    # `np.float` was removed from NumPy; `np.float64` is the same dtype.
    pos_matrix = np.zeros(shape=(size, size, 3), dtype=np.float64)
    # Broadcasting yields entry [i, j] = coords[i] - coords[j]; rows and
    # columns beyond N keep their zero padding.
    pos_matrix[:N, :N] = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
    return pos_matrix
Exemple #8
0
def sample_mol():
    """Return the molecule for SMILES 'CN=C=O' after an ETKDG embedding.

    Hydrogens are added (with coordinates) for the embedding step and removed
    again from the returned molecule.
    """
    with_hydrogens = rdmolops.AddHs(rdmolfiles.MolFromSmiles('CN=C=O'),
                                    addCoords=True)
    rdDistGeom.EmbedMolecule(with_hydrogens, rdDistGeom.ETKDG())
    heavy_only = rdmolops.RemoveHs(with_hydrogens)
    return heavy_only
Exemple #9
0
def _collect_cut_bonds(mol, atom, members, skip_hydrogens, flag_aromatic):
    """Return (bond indices, dummy labels) for the bonds leaving a group at `atom`.

    A bond is collected for every neighbour of `atom` that is not in
    `members`. Hydrogen neighbours are left out when `skip_hydrogens` is set,
    which keeps them attached to the group. The label is (1, 1) for an
    aromatic neighbour when `flag_aromatic` is set and (0, 0) otherwise.
    """
    bond_ids = []
    bond_labels = []
    idx = atom.GetIdx()
    for nbr in atom.GetNeighbors():
        jdx = nbr.GetIdx()
        if jdx in members:
            continue
        if skip_hydrogens and nbr.GetAtomicNum() == 1:
            continue
        bond_ids.append(mol.GetBondBetweenAtoms(idx, jdx).GetIdx())
        if flag_aromatic and nbr.GetIsAromatic():
            bond_labels.append((1, 1))
        else:
            bond_labels.append((0, 0))
    return bond_ids, bond_labels


def identify_functional_groups(smi):
    """Return the functional-group environments of a molecule as SMILES.

    The molecule is built from `smi` with explicit hydrogens. Atoms matching
    any pattern in `PATT_TUPLE` are marked, connected marked atoms are merged
    into groups via `merge`, purely aromatic groups are discarded, and the
    molecule is fragmented on the bonds that leave each group. The fragment
    containing the group is written out as a SMILES with explicit hydrogens.

    :param smi: SMILES string of the molecule.
    :return: list of distinct fragment SMILES, in order of first appearance.
        If anything fails during the analysis, a one-element list holding the
        SMILES of the whole molecule (explicit hydrogens) is returned.
    """
    ## We decided to start from a SMILES and add explicit hydrogens inside the function
    mol = Chem.MolFromSmiles(smi)
    mol = rdmolops.AddHs(mol)
    try:
        marked = set()
        ## Since heteroatoms are included in PATT_TUPLE, we remove the first part of the original function
        for patt in PATT_TUPLE:
            for path in mol.GetSubstructMatches(patt):
                marked.update(path)

        # Merge all connected marked atoms into a single FG.
        groups = []
        while marked:
            grp = set([marked.pop()])
            merge(mol, marked, grp)
            groups.append(list(grp))

        ## It seems that the initial filtering of heteroatoms was not enough, so we add this to remove groups with only aromatic atoms
        # A new list is built: removing from `groups` while iterating over it
        # would skip the element following each removal.
        groups = [
            g for g in groups
            if {mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in g} != {True}
        ]

        ## Identify bonds to break and hydrogens to keep for every FG
        bonds = []
        labels = []
        for g in groups:
            members = set(g)
            single_atom_group = len(g) == 1
            group_bonds = []
            group_labels = []
            for idx in g:
                atom = mol.GetAtomWithIdx(idx)
                atomic_num = atom.GetAtomicNum()
                heavy_neighbors = [
                    x.GetAtomicNum() for x in atom.GetNeighbors()
                    if x.GetAtomicNum() != 1
                ]

                # Default: cut every bond leaving the group, hydrogens
                # included, with plain (0, 0) labels.
                skip_hydrogens = False
                flag_aromatic = False

                if atomic_num == 6:
                    ## Carbonyl groups to discriminate between aldehydes and ketones
                    # Computed per atom, so no value can leak over from a
                    # previously visited carbon.
                    skip_hydrogens = any(
                        nbr.GetAtomicNum() == 8 and str(
                            mol.GetBondBetweenAtoms(
                                idx, nbr.GetIdx()).GetBondType()) == "DOUBLE"
                        for nbr in atom.GetNeighbors())
                elif atomic_num == 7:
                    ## To discriminate between anilines and amines (primary, secondary, etc)
                    skip_hydrogens = single_atom_group
                    flag_aromatic = (single_atom_group
                                     and heavy_neighbors.count(6) == 1)
                elif atomic_num == 8:
                    ## To discriminate between alcohols from phenols and esters from carboxylic acids
                    skip_hydrogens = True
                    flag_aromatic = (single_atom_group
                                     and len(heavy_neighbors) == 1
                                     and heavy_neighbors.count(6) == 1)
                elif atomic_num == 16:
                    skip_hydrogens = single_atom_group

                cut_bonds, cut_labels = _collect_cut_bonds(
                    mol, atom, members, skip_hydrogens, flag_aromatic)
                group_bonds.extend(cut_bonds)
                group_labels.extend(cut_labels)
            labels.append(group_labels)
            bonds.append(group_bonds)

        ## Build final fragments
        fragment_smiles = []
        for g, g_bonds, g_labels in zip(groups, bonds, labels):
            Frag = Chem.FragmentOnBonds(mol, g_bonds, dummyLabels=g_labels)
            for frag_atoms in rdmolops.GetMolFrags(Frag):
                if g[0] in frag_atoms:
                    fragment_smiles.append(
                        Chem.MolFragmentToSmiles(Frag,
                                                 frag_atoms,
                                                 canonical=True,
                                                 allHsExplicit=True))

        # De-duplicate while keeping a deterministic (first-seen) order.
        fragment_smiles = list(dict.fromkeys(fragment_smiles))

        # Drop environments whose non-dummy, non-hydrogen atoms are all
        # aromatic. A new list is built instead of removing while iterating.
        FGS_ENVS = []
        for env in fragment_smiles:
            FG = Chem.MolFromSmiles(env)
            if FG is None:
                FG = Chem.MolFromSmarts(env)
            aromatic_flags = {
                atom.GetIsAromatic()
                for atom in FG.GetAtoms()
                if atom.GetSymbol() not in ["*", "H"]
            }
            if aromatic_flags != {True}:
                FGS_ENVS.append(env)
        return FGS_ENVS

    except Exception:
        ## When the molecule is as small as a single FG
        FGS_ENVS = [Chem.MolToSmiles(mol, canonical=True, allHsExplicit=True)]
        return FGS_ENVS
Exemple #10
0
    def process_molecule(self, pdb_file):
        """
        Splits the molecules into separate channels.

        The PDB file is read with RDKit, hydrogens are added (with
        coordinates) and the result is written next to the input file as
        "<name>_hydrogenized<ext>". That file is then parsed with
        `pd.parsePDB` and its atoms are split into channels.
        :param pdb_file: the pdb file to be processed
        :return: a dictionary of the coordinates and vdwradii for each channel
            ("coords"/"vdwradii" for all atoms, plus "_<RES>" entries for the
            20 standard amino acids, "_backbone", "_heavy" and "_hydro"), with
            coordinates relative to the centroid of the hydrogenized
            molecule; None if the file cannot be read or parsed. A channel
            without atoms yields empty arrays.
        """
        # splitext keeps the derived name distinct from the input even when
        # the basename has no extension (a '.'-join would reproduce the input
        # path in that case and overwrite the source file).
        base_name, extension = os.path.splitext(os.path.basename(pdb_file))
        hydrogenized_pdb_file = os.path.join(
            os.path.dirname(pdb_file),
            base_name + '_hydrogenized' + extension)
        try:
            mol_rdkit = Chem.MolFromPDBFile(molFileName=pdb_file,
                                            removeHs=False, sanitize=True)
            if mol_rdkit is None:
                raise ValueError
            mol_rdkit = rdMO.AddHs(mol_rdkit, addCoords=True)
            # get the conformation of the molecule
            conformer = mol_rdkit.GetConformer()
            # calculate the center of the molecule
            center = rdMT.ComputeCentroid(conformer, ignoreHs=False)
            mol_center = np.asarray([center.x, center.y, center.z])

            pdbw = Chem.rdmolfiles.PDBWriter(fileName=hydrogenized_pdb_file)
            try:
                pdbw.write(mol_rdkit)
                pdbw.flush()
            finally:
                # Close the writer even if writing fails.
                pdbw.close()
            del mol_rdkit, pdbw
        except (IOError, ValueError):
            log.warning("Bad PDB file.")
            return None

        try:
            mol = pd.parsePDB(hydrogenized_pdb_file)
        except IOError:
            log.warning("Could not read PDB file.")
            return None

        if mol is None:
            log.warning("Bad pdb file found.")
            return None

        std_amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS',
                           'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
                           'LEU', 'LYS', 'MET', 'PHE', 'PRO',
                           'SER', 'THR', 'TRP', 'TYR', 'VAL']

        def canonical_notation(symbol):
            # "CL" -> "Cl"; one-character (or empty) symbols pass through.
            if len(symbol) > 1:
                return symbol[0].upper() + symbol[1:].lower()
            return symbol

        def indices_of(selection):
            # A selection that matches no atoms is None; map it to an empty
            # index array so the channel becomes empty instead of raising.
            if selection is None:
                return np.array([], dtype=np.int32)
            return selection.getIndices()

        res = {'coords': mol.getCoords() - mol_center,
               'vdwradii': np.asarray([
                   self.periodic_table.GetRvdw(
                       self.periodic_table.GetAtomicNumber(
                           canonical_notation(element)))
                   for element in mol.getElements()])}

        def add_channel(suffix, selection):
            mask = indices_of(selection)
            res['coords_' + suffix] = res['coords'][mask, :]
            res['vdwradii_' + suffix] = res['vdwradii'][mask]

        # find the data for all the 20 amino acids
        for aa in std_amino_acids:
            add_channel(aa, mol.select('resname ' + aa))

        # find the data for the backbones
        add_channel('backbone', mol.backbone)

        # find the data for the heavy atoms (i.e. no H atoms)
        add_channel('heavy', mol.heavy)

        # find the data for the hydrogen atoms
        add_channel('hydro', mol.hydrogen)

        return res