Example #1
0
def __get_context_env(mol, radius):
    """
    Trim a context molecule down to the neighbourhood of its attachment points.

    INPUT:
        mol - Mol object containing chain(s) of molecular context
        radius - integer, number of bonds to cut context
    OUTPUT:
        Mol containing only atoms within the specified radius from the attachment point(s).
        Hs are removed with Chem.RemoveHs before the environment is computed.
        Atoms cut away that were bonded to a kept atom are replaced by dummy
        atoms (atomic number 0) carrying the same bond types.
    """
    # Work on an editable copy of the hydrogen-stripped molecule.
    m = Chem.RWMol(Chem.RemoveHs(mol))

    # Gather the environment bonds around every attachment point ("*").
    # If the requested radius yields nothing, shrink it until something is
    # found or radius 0 is reached.
    bond_ids = set()
    for atom in m.GetAtoms():
        if atom.GetSymbol() != "*":
            continue
        root = atom.GetIdx()
        r = radius
        env = Chem.FindAtomEnvironmentOfRadiusN(m, r, root)
        while not env and r > 0:
            r -= 1
            env = Chem.FindAtomEnvironmentOfRadiusN(m, r, root)
        bond_ids.update(env)

    atom_ids = set(__bonds_to_atoms(m, bond_ids))

    # For every atom outside the kept set that touches it, record which kept
    # atoms it is bonded to and with what bond type.
    cut_points = []
    for atom in m.GetAtoms():
        outer = atom.GetIdx()
        if outer in atom_ids:
            continue
        kept_neighbours = set(n.GetIdx() for n in atom.GetNeighbors()) & atom_ids
        if kept_neighbours:
            cut_points.append([
                (inner, m.GetBondBetweenAtoms(outer, inner).GetBondType())
                for inner in kept_neighbours
            ])

    # Add one dummy atom per recorded cut point and keep it in the result.
    for bonds in cut_points:
        dummy_id = m.AddAtom(Chem.Atom(0))
        for inner, bond_type in bonds:
            m.AddBond(dummy_id, inner, bond_type)
        atom_ids.add(dummy_id)

    return __get_submol(m, atom_ids)
Example #2
0
    def bit2atom_mapping(self, mol_obj) -> Dict[int, List[AtomEnvironment]]:
        """Map each fingerprint bit of ``mol_obj`` to the atom environments behind it.

        ``self.explain_rdmol`` supplies, per hash value, the matches as
        ``(central_atom, radius)`` pairs; hashes are translated to bit
        positions through ``self.bit_mapping``.

        Returns a plain dict ``{bit: [AtomEnvironment, ...]}``.
        """
        hash_matches = self.explain_rdmol(mol_obj)
        bit_matches = {}
        for hash_val, atom_env in hash_matches.items():
            bit_matches[self.bit_mapping[hash_val]] = atom_env

        environments = defaultdict(list)
        for bit, matches in bit_matches.items():
            for central_atom, radius in matches:
                if radius == 0:
                    # A radius-0 environment is just the central atom.
                    member_atoms = {central_atom}
                else:
                    bond_path = Chem.FindAtomEnvironmentOfRadiusN(
                        mol_obj, radius, central_atom)
                    atom_map = {}
                    # Only the atom map filled in by PathToSubmol is needed.
                    Chem.PathToSubmol(mol_obj, bond_path, atomMap=atom_map)
                    assert central_atom in atom_map
                    member_atoms = set(atom_map)
                environments[bit].append(
                    AtomEnvironment(central_atom, radius, member_atoms))

        # Hand back a regular dict rather than the defaultdict.
        return dict(environments)
Example #3
0
def getMorganEnvironment(mol, bitInfo, fp=None, minRad=0):
    """Collect the bond environments behind each Morgan fingerprint bit.

    For every ``(atomID, radius)`` entry of ``bitInfo`` whose radius is at
    least ``minRad``, the bond indices returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` are appended (as a list) under that
    bit.  Entries with a smaller radius are skipped; when ``fp`` is given the
    corresponding position ``fp[bit]`` is set to 0 in place.

    Returns a ``defaultdict(list)`` mapping bit -> list of bond-index lists.

    >>> m = Chem.MolFromSmiles('CC(O)C')
    >>> bi = {}
    >>> fp = AllChem.GetMorganFingerprintAsBitVect(m,2,2048,bitInfo=bi)
    >>> getMorganEnvironment(m,bi)
    defaultdict(<class 'list'>, {1057: [[], []], 227: [[1]], 709: [[0, 1, 2]], 1: [[]], 283: [[0], [2]], 807: [[]]})
    >>> getMorganEnvironment(m,bi,minRad=1)
    defaultdict(<class 'list'>, {283: [[0], [2]], 227: [[1]], 709: [[0, 1, 2]]})
    >>> list(fp.GetOnBits())
    [1, 227, 283, 709, 807, 1057]
    >>> getMorganEnvironment(m,bi,minRad=1,fp=fp)
    defaultdict(<class 'list'>, {283: [[0], [2]], 227: [[1]], 709: [[0, 1, 2]]})
    >>> list(fp.GetOnBits())
    [227, 283, 709]

    """
    bitPaths = defaultdict(list)
    for bit, info in bitInfo.items():
        for atomID, radius in info:
            if radius < minRad:
                # Identity test on purpose: `fp != None` is evaluated
                # element-wise by array-like fingerprints and then fails
                # (or misbehaves) inside the `if`.
                if fp is not None:
                    fp[bit] = 0
                continue
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID)
            bitPaths[bit].append(list(env))
    return bitPaths
Example #4
0
    def find_feature_fragments(self, feature_num, mols, radius=3, nBits=1024):
        """Collect the distinct fragments that switch on Morgan bit ``feature_num``.

        Each molecule in ``mols`` is fingerprinted with
        ``AllChem.GetMorganFingerprintAsBitVect`` (``radius``, ``nBits``) and,
        for every atom environment recorded for ``feature_num``, the
        corresponding submolecule is extracted.

        Returns
        -------
        tuple
            ``(smiles, submols)``: two parallel lists holding the unique,
            non-empty fragment SMILES in first-seen order and the matching
            submolecules. Both are empty when no molecule sets the bit.

        Raises
        ------
        ValueError
            If a submolecule cannot be extracted for an environment of
            ``feature_num`` (the original error is chained as the cause).
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        smiles_found = []
        submols_found = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        for mol in mols:
            bit_info = {}
            AllChem.GetMorganFingerprintAsBitVect(mol,
                                                  radius=radius,
                                                  nBits=nBits,
                                                  bitInfo=bit_info)
            # Only the requested bit is ever returned, so skip all others.
            for atm_idx, rad in bit_info.get(feature_num, ()):
                env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, atm_idx)
                try:
                    submol = Chem.PathToSubmol(mol, env)
                except Exception as err:
                    raise ValueError(
                        'feature does not turn on any bits') from err
                smi = Chem.MolToSmiles(submol)
                if smi != '' and smi not in seen:
                    seen.add(smi)
                    smiles_found.append(smi)
                    submols_found.append(submol)

        return smiles_found, submols_found
Example #5
0
  def _featurize(self, mol):
    """
    Compute the Morgan (circular) fingerprint of a molecule.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    The folded bit vector when ``self.sparse`` is false. Otherwise a dict of
    the non-zero fingerprint elements: ``{fragment_id: count}``, or, when
    ``self.smiles`` is set, ``{fragment_id: {'smiles': ..., 'count': ...}}``.
    """
    if not self.sparse:
      return rdMolDescriptors.GetMorganFingerprintAsBitVect(
          mol, self.radius, nBits=self.size, useChirality=self.chiral,
          useBondTypes=self.bonds, useFeatures=self.features)

    bit_info = {}
    sparse_fp = rdMolDescriptors.GetMorganFingerprint(
        mol, self.radius, useChirality=self.chiral,
        useBondTypes=self.bonds, useFeatures=self.features,
        bitInfo=bit_info)
    counts = sparse_fp.GetNonzeroElements()  # plain dict of id -> count
    if not self.smiles:
      return counts

    # Attach a SMILES string to each fragment, built from the first
    # (root atom, radius) pair recorded for that fragment id.
    annotated = {}
    for fragment_id, count in counts.items():
      root, radius = bit_info[fragment_id][0]
      bond_path = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
      fragment = Chem.PathToSubmol(mol, bond_path)
      annotated[fragment_id] = {
          'smiles': Chem.MolToSmiles(fragment),
          'count': count,
      }
    return annotated
Example #6
0
def getSubstructDepiction(mol, atomID, radius, molSize=(450, 200)):
    """
    Depict ``mol`` with the atom environment of ``atomID`` highlighted; the
    central atom itself gets the colour (0.3, 0.3, 1).

    :param mol: molecule to draw
    :param atomID: index of the central atom
    :param radius: environment radius; for radius <= 0 only the central atom is highlighted
    :param molSize: size passed through to ``moltosvg``
    :return: whatever ``moltosvg`` returns
    """
    if radius > 0:
        # Every atom touched by a bond of the environment is highlighted.
        touched = set()
        for bond_idx in Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID):
            bond = mol.GetBondWithIdx(bond_idx)
            touched.add(bond.GetBeginAtomIdx())
            touched.add(bond.GetEndAtomIdx())
        highlighted = list(touched)
    else:
        highlighted = [atomID]
    return moltosvg(mol,
                    molSize=molSize,
                    highlightAtoms=highlighted,
                    highlightAtomColors={atomID: (0.3, 0.3, 1)})
Example #7
0
def depict_atoms(mol,
                 atom_ids,
                 radii,
                 molSize=(300, 300),
                 atm_color=(0, 1, 0),
                 oth_color=(0.8, 1, 0)):
    """Get a depiction of molecular substructure. Useful for depicting bits in fingerprints.

    Inspired by: http://rdkit.blogspot.ch/2016/02/morgan-fingerprint-bit-statistics.html

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    atom_ids : list
        Central atoms to depict
    radii : list
        One radius per entry of ``atom_ids``; the environment of that radius
        around the atom is highlighted (radius 0 highlights the atom only)
    molSize : tuple
    atm_color, oth_color : tuple
        Colors of central atoms and of surrounding atoms and bonds

    Returns
    -------
    The result of ``mol_to_svg``.
    """
    atoms_to_use = []
    bonds = []
    for atom_id, radius in zip(atom_ids, radii):
        if radius > 0:
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atom_id)
            # Keep each environment bond once, in first-seen order.
            bonds.extend([b for b in env if b not in bonds])
            for b in env:
                bond = mol.GetBondWithIdx(b)
                atoms_to_use.append(bond.GetBeginAtomIdx())
                atoms_to_use.append(bond.GetEndAtomIdx())
            atoms_to_use = list(set(atoms_to_use))
        else:
            atoms_to_use.append(atom_id)

    colors = {x: atm_color for x in atom_ids}
    if sum(radii) == 0:
        # Only single atoms requested: no bond highlighting at all.
        return mol_to_svg(mol,
                          molSize=molSize,
                          highlightBonds=False,
                          highlightAtoms=atoms_to_use,
                          highlightAtomColors=colors)

    # Atoms that are part of an environment but not a centre get oth_color.
    for x in atoms_to_use:
        if x not in atom_ids:
            colors[x] = oth_color
    bond_colors = dict.fromkeys(bonds, oth_color)
    return mol_to_svg(mol,
                      molSize=molSize,
                      highlightAtoms=atoms_to_use,
                      highlightAtomColors=colors,
                      highlightBonds=bonds,
                      highlightBondColors=bond_colors)
Example #8
0
def create_histogram():
    """Plot how often each enumerated substructure occurs in a reference structure.

    Substructures are enumerated as atom environments (every radius / root
    atom combination) of all molecules in a built-in SMILES list. Each unique
    substructure is then matched against the first molecule of that list and
    the match counts are written as a bar chart to a PNG file.
    """
    first_smiles_list = ['C=C(C)C1CCC(C)=CCCc2coc(c2)CC2(C)OC2C1',
                         'c1nccc2n1ccc2', 'OCC=CC(=O)O', 'OC1C2C1CC2']
    structure = Chem.MolFromSmiles(first_smiles_list[0])  # Select structure

    # Unique substructure SMILES, accumulated over ALL input molecules.
    # (Previously this list was re-created for every molecule, so only the
    # substructures of the last molecule reached the histogram.)
    smile_list = []
    seen = set()
    for smile in first_smiles_list:
        m = Chem.MolFromSmiles(smile)
        nr_of_atoms = m.GetNumAtoms()

        # Every (radius, root atom) combination of this molecule.
        for radius in range(nr_of_atoms):
            for atom_idx in range(nr_of_atoms):
                env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, atom_idx)
                submol = Chem.PathToSubmol(m, env)
                sub_smiles = Chem.MolToSmiles(submol, canonical=True)
                if sub_smiles != '' and sub_smiles not in seen:
                    seen.add(sub_smiles)
                    smile_list.append(sub_smiles)

    # Keep only the substructure SMILES that parse back into a molecule.
    sub_list = []
    for smile in smile_list:
        x = Chem.MolFromSmiles(smile)
        if x is not None:
            sub_list.append(x)

    # Count the matches of each substructure in the reference structure.
    sub_dict = {}
    for substructure in sub_list:
        match = structure.GetSubstructMatches(substructure)
        sub_dict[Chem.MolToSmiles(substructure)] = len(match)

    # Create and save histogram
    fig, ax = plt.subplots(figsize=(10, 5))
    plt.bar(list(sub_dict.keys()), list(sub_dict.values()), color='b')
    plt.xticks(fontsize=7, rotation=90)
    xlabel = plt.xlabel('Substructure smile')
    plt.ylabel('Substructure frequency')
    plt.title("Substructure frequency for structure XXX")
    fig.savefig('/path/to/histogram.png',
                bbox_extra_artists=[xlabel],
                bbox_inches='tight')
Example #9
0
def get_substruct(mol, atom_idx, radius=1):
    """Return a submolecule built from the atom environment around ``atom_idx``.

    Radii ``radius - 1`` down to ``0`` are tried in that order; the first
    environment whose SMILES is non-empty is returned. If none is non-empty,
    the submolecule of the last radius tried is returned.

    NOTE(review): for ``radius <= 0`` the loop body never runs and the final
    ``return`` raises UnboundLocalError - confirm whether callers rely on
    ``radius >= 1``.
    """
    for r in reversed(range(radius)):
        bond_path = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom_idx)
        submol = Chem.PathToSubmol(mol, bond_path, atomMap={})
        if Chem.MolToSmiles(submol) != "":
            break
    return submol
Example #10
0
  def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
      RDKit Mol object
    **kwargs
      Only the deprecated ``mol`` keyword is recognised; when present it
      replaces ``datapoint`` and a DeprecationWarning is emitted.

    Returns
    -------
    np.ndarray
      A numpy array of circular fingerprint. When ``self.sparse`` is set a
      dict is returned instead: ``{fragment_id: count}`` or, with
      ``self.smiles``, ``{fragment_id: {'smiles': ..., 'count': ...}}``.

    Raises
    ------
    ImportError
      If RDKit is not installed.
    """
    import warnings
    try:
      from rdkit import Chem
      from rdkit.Chem import rdMolDescriptors
    except ModuleNotFoundError as err:
      raise ImportError("This class requires RDKit to be installed.") from err
    if 'mol' in kwargs:
      datapoint = kwargs.get("mol")
      # Warn instead of raising: raising here made the reassignment above
      # dead code and turned a deprecation notice into a hard failure.
      warnings.warn(
          'Mol is being phased out as a parameter, please pass "datapoint" instead.',
          DeprecationWarning,
          stacklevel=2)
    if self.sparse:
      info: Dict = {}
      fp = rdMolDescriptors.GetMorganFingerprint(
          datapoint,
          self.radius,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features,
          bitInfo=info)
      fp = fp.GetNonzeroElements()  # convert to a dict

      # generate SMILES for fragments
      if self.smiles:
        fp_smiles = {}
        for fragment_id, count in fp.items():
          # Use the first (root atom, radius) pair recorded for the fragment.
          root, radius = info[fragment_id][0]
          env = Chem.FindAtomEnvironmentOfRadiusN(datapoint, radius, root)
          frag = Chem.PathToSubmol(datapoint, env)
          smiles = Chem.MolToSmiles(frag)
          fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
        fp = fp_smiles
    else:
      fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
          datapoint,
          self.radius,
          nBits=self.size,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features)
      fp = np.asarray(fp, dtype=float)
    return fp
Example #11
0
def use_rdkit3():
    """Print a short tour of RDKit: fingerprint similarities, Morgan bit
    explanation, two descriptors and one reaction product."""
    # Index pairs compared for each fingerprint type.
    compared = ((0, 1), (0, 0), (2, 1))

    # Fingerprinting and Molecular Similarity, default; Tanimoto similarity
    m_list2 = [Chem.MolFromSmiles(s) for s in ('CCOC', 'CCO', 'COC')]
    fps = [FingerprintMols.FingerprintMol(x) for x in m_list2]
    for a, b in compared:
        print('Fingerprint Similarity -->',
              DataStructs.FingerprintSimilarity(fps[a], fps[b]))

    # MACCS keys
    fps = [MACCSkeys.GenMACCSKeys(x) for x in m_list2]
    for a, b in compared:
        print('Fingerprint MACCS keys -->',
              DataStructs.FingerprintSimilarity(fps[a], fps[b]))

    # Morgan/ circular fingerprints: count-based, 1024-bit and feature-based
    m7 = Chem.MolFromSmiles('CCOC')
    m8 = Chem.MolFromSmiles('CCO')
    fp1 = AllChem.GetMorganFingerprint(m7, 2)
    fp2 = AllChem.GetMorganFingerprint(m8, 2)
    print('Fingerprint Morgan similarity -->',
          DataStructs.DiceSimilarity(fp1, fp2))
    fp1 = AllChem.GetMorganFingerprintAsBitVect(m7, 2, nBits=1024)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(m8, 2, nBits=1024)
    print(DataStructs.DiceSimilarity(fp1, fp2))

    ffp1 = AllChem.GetMorganFingerprint(m7, 2, useFeatures=True)
    ffp2 = AllChem.GetMorganFingerprint(m8, 2, useFeatures=True)
    print(DataStructs.DiceSimilarity(ffp1, ffp2))

    # explaining bit (Morgan)
    m9 = Chem.MolFromSmiles('c1cccnc1C')
    info = {}
    AllChem.GetMorganFingerprint(m9, 2, bitInfo=info)
    print(info)
    for bit in (98513984, 4048591891):
        print(info[bit])

    # Radius-2 environment around atom 5, turned into a submolecule.
    env = Chem.FindAtomEnvironmentOfRadiusN(m9, 2, 5)
    amap = {}
    submol = Chem.PathToSubmol(m9, env, atomMap=amap)
    print(submol.GetNumAtoms())
    print(amap)
    # bit to smile
    print(Chem.MolToSmiles(submol))

    # Descriptor Calculation; used in papers or coding languages
    m6 = Chem.MolFromSmiles('c1ccccc1O')
    print('Descriptor TPSA -->', Descriptors.TPSA(m6))
    print('Descriptor MolLogP -->', Descriptors.MolLogP(m6))

    # Chemical reactions
    # (alternative reaction kept for reference:
    #  '[C:1]=[C:2].[C:3]=[*:4][*:5]=[C:6]>>[C:1]1[C:2][C:3][*:4]=[*:5][C:6]1')
    rxn = AllChem.ReactionFromSmarts(
        '[C:1](=[O:2])-[OD1].[N!H0:3]>>[C:1](=[O:2])[N:3]')
    reactants = (Chem.MolFromSmiles('CC(=O)O'), Chem.MolFromSmiles('NC=C'))
    ps = rxn.RunReactants(reactants)
    print('Reaction product -->', Chem.MolToSmiles(ps[0][0]))
Example #12
0
def find_center_Environment(centers, radius):
    """Return the '.'-joined SMILES of the environments around each centre atom.

    For every index in ``centers.atom_ids`` the atom environment of the given
    radius in ``centers.mol`` is converted to SMILES. Empty SMILES at the
    start of the sequence are dropped; once a non-empty SMILES has been seen,
    every later one (empty or not) is appended after a '.'. The string "NA"
    is returned when nothing non-empty was produced.
    """
    fragments = []
    for atom_idx in centers.atom_ids:
        bond_path = Chem.FindAtomEnvironmentOfRadiusN(centers.mol, radius,
                                                      atom_idx)
        submol = Chem.PathToSubmol(centers.mol, bond_path, atomMap={})
        fragments.append(Chem.MolToSmiles(submol))

    # Skip the leading run of empty SMILES so the result never starts with '.'.
    start = 0
    while start < len(fragments) and fragments[start] == "":
        start += 1

    joined = ".".join(fragments[start:])
    return joined if joined != "" else "NA"
Example #13
0
def __get_context_env(mol, radius):
    """
    Cut a context molecule down to the environment of its attachment points.

    INPUT:
        mol - Mol object containing chain(s) of molecular context
        radius - integer, number of bonds to cut context
    OUTPUT:
        Mol built from the bonds within the specified radius from the attachment point(s).
        The explicit-H count of every atom in the result is reset to 0.
    """
    # mol is context consisting of one or more groups with single attachment point
    attachment_points = [
        atom.GetIdx() for atom in mol.GetAtoms() if atom.GetSymbol() == "*"
    ]

    bond_ids = set()
    for root in attachment_points:
        # Shrink the radius until an environment is found (or radius 0).
        r = radius
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, root)
        while not env and r > 0:
            r -= 1
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, root)
        bond_ids.update(env)

    m = Chem.PathToSubmol(mol, list(bond_ids))
    # remove Hs, otherwise terminal atoms will produce smiles with H ([CH2]C[*:1])
    for atom in m.GetAtoms():
        atom.SetNumExplicitHs(0)
    return m
Example #14
0
def lads_score_v2(actives, decoys):
    """Score decoys by how much their Morgan features overlap with the actives.

    Similar to DEKOIS (v2).
    Lower is better (less like actives), higher is worse (more like actives).

    ``actives`` and ``decoys`` are iterables of SMILES. Each feature is
    weighted by the fraction of actives containing it times the number of
    atoms in its environment; a decoy's score is the mean weight over its
    non-zero features. Returns one score per decoy, in input order.
    """
    active_fps = []
    active_info = {}
    info = {}
    atoms_per_bit = defaultdict(int)
    for smi in actives:
        m = Chem.MolFromSmiles(smi)
        active_fps.append(
            AllChem.GetMorganFingerprint(m, 3, useFeatures=True, bitInfo=info))
        for key, environments in info.items():
            if key in active_info:
                continue
            # First time this feature is seen: record its size from the first
            # (atom, radius) pair reported for it.
            active_info[key] = environments
            root, rad = environments[0]
            env = Chem.FindAtomEnvironmentOfRadiusN(m, rad, root)
            submol = Chem.PathToSubmol(m, env, atomMap={})
            atoms_per_bit[key] = 1 if rad == 0 else submol.GetNumHeavyAtoms()

    # Roughly FCFP_6
    decoys_fps = []
    for smi in decoys:
        decoys_fps.append(
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi),
                                         3,
                                         useFeatures=True))

    # Number of actives in which each feature occurs.
    master_active_fp_freq = defaultdict(int)
    for fp in active_fps:
        for k in fp.GetNonzeroElements():
            master_active_fp_freq[k] += 1

    # Reweight
    n_actives = len(active_fps)
    for k in master_active_fp_freq:
        # Normalise
        master_active_fp_freq[k] /= n_actives
        # Weight by size of bit
        master_active_fp_freq[k] *= atoms_per_bit[k]

    decoys_lads_avoid_scores = []
    for decoy_fp in decoys_fps:
        on_bits = decoy_fp.GetNonzeroElements()
        total = sum([master_active_fp_freq[k] for k in on_bits])
        decoys_lads_avoid_scores.append(total / len(on_bits))

    return decoys_lads_avoid_scores
Example #15
0
def GetMorganFingerprint(mol, atomId=-1, radius=2, fpType='bv', nBits=2048, useFeatures=False):
  """
  Calculates the Morgan fingerprint with the counts of atomId removed.

  The full fingerprint and a per-atom bit map are computed once and cached on
  the molecule as ``mol._fpInfo``; later calls reuse that cache.
  NOTE(review): the cache is not keyed on radius / fpType / nBits /
  useFeatures, so a later call with different settings reuses the first
  result - confirm this is intended.

  Parameters:
    mol -- the molecule of interest
    radius -- the maximum radius
    fpType -- the type of Morgan fingerprint: 'count' or 'bv'
    atomId -- the atom to remove the counts for (if -1, no count is removed)
    nBits -- the size of the bit vector (only for fpType = 'bv')
    useFeatures -- if false: ConnectivityMorgan, if true: FeatureMorgan

  Raises ValueError for an unknown fpType, an atomId beyond the number of
  atoms, or a malformed ``_fpInfo`` cache.
  """
  if fpType not in ['bv', 'count']:
    raise ValueError("Unknown Morgan fingerprint type")
  isBitVect = fpType == 'bv'

  if not hasattr(mol, '_fpInfo'):
    info = {}
    # get the fingerprint and an empty per-atom bit map of the matching kind
    if isBitVect:
      molFp = rdMD.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits,
                                                 useFeatures=useFeatures, bitInfo=info)
      bitmap = [DataStructs.ExplicitBitVect(nBits) for _ in range(mol.GetNumAtoms())]
    else:
      molFp = rdMD.GetMorganFingerprint(mol, radius, useFeatures=useFeatures, bitInfo=info)
      bitmap = [[] for _ in range(mol.GetNumAtoms())]

    # construct the bit map: record each bit on every atom of its environment.
    # dict.items() works on Python 2 and 3 (iteritems() is Python 2 only).
    for bit, es in info.items():
      for at1, rad in es:
        if rad == 0:  # for radius 0 only the central atom is involved
          atoms = [at1]
        else:  # for radii > 0 take all atoms of the environment
          env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, at1)
          amap = {}
          Chem.PathToSubmol(mol, env, atomMap=amap)
          atoms = amap.keys()
        for at2 in atoms:
          if isBitVect:
            bitmap[at2][bit] = 1
          else:
            bitmap[at2].append(bit)
    mol._fpInfo = (molFp, bitmap)

  if atomId < 0:
    return mol._fpInfo[0]

  # remove the bits of atomId
  if atomId >= mol.GetNumAtoms():
    raise ValueError("atom index greater than number of atoms")
  if len(mol._fpInfo) != 2:
    raise ValueError("_fpInfo not set")
  if isBitVect:
    return mol._fpInfo[0] ^ mol._fpInfo[1][atomId]  # xor
  # count: work on a copy so the cached fingerprint stays intact
  molFp = copy.deepcopy(mol._fpInfo[0])
  for bit in mol._fpInfo[1][atomId]:
    molFp[bit] -= 1
  return molFp
Example #16
0
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      A numpy array of circular fingerprint. When ``self.sparse`` is set a
      dict is returned instead: ``{fragment_id: count}`` or, with
      ``self.smiles``, ``{fragment_id: {'smiles': ..., 'count': ...}}``.
    """
    from rdkit import Chem
    from rdkit.Chem import rdMolDescriptors

    if self.sparse:
      info: Dict = {}
      fp = rdMolDescriptors.GetMorganFingerprint(
          mol,
          self.radius,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features,
          bitInfo=info)
      fp = fp.GetNonzeroElements()  # convert to a dict

      # generate SMILES for fragments
      if self.smiles:
        fp_smiles = {}
        for fragment_id, count in fp.items():
          # Use the first (root atom, radius) pair recorded for the fragment.
          root, radius = info[fragment_id][0]
          env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
          frag = Chem.PathToSubmol(mol, env)
          smiles = Chem.MolToSmiles(frag)
          fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
        fp = fp_smiles
    else:
      fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
          mol,
          self.radius,
          nBits=self.size,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features)
      # Builtin float: the `np.float` alias was removed in NumPy 1.24.
      fp = np.asarray(fp, dtype=float)
    return fp
Example #17
0
def get_substruct(mol, atom_idx_list, radius=3):
    """Extract environment submolecules around several atoms.

    Radii ``radius - 1`` down to ``1`` are visited in that order and, for each
    radius, every atom in ``atom_idx_list``. Each non-empty environment SMILES
    is stored with the keys of the atom map filled in by ``PathToSubmol``
    (presumably indices into ``mol`` - verify against the RDKit docs).

    Returns ``(subsmiDic, orimol_atomI)``:
        subsmiDic    - {substructure SMILES: list of atom-map keys}; a SMILES
                       seen again overwrites the earlier entry
        orimol_atomI - (mol, list of atom-map keys) for the last non-empty
                       environment found, or () if there was none
    """
    subsmiDic = {}
    orimol_atomI = ()
    for r in reversed(range(1, radius)):
        for atom_idx in atom_idx_list:
            # Submolecule made of the bonds within r bonds of atom_idx.
            bond_path = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom_idx)
            amap = {}
            submol = Chem.PathToSubmol(mol, bond_path, atomMap=amap)
            subsmi = Chem.MolToSmiles(submol)
            if subsmi == "":
                continue  # no environment at this radius

            atom_indices = list(amap)
            subsmiDic[subsmi] = atom_indices
            orimol_atomI = (mol, atom_indices)
    return subsmiDic, orimol_atomI
def compute_all_ecfp(mol, indices=None, degree=2):
    """
  For each atom:
    Obtain molecular fragment for all atoms emanating outward to given degree.
    For each fragment, compute its SMILES string.
    Return a dictionary mapping atom index to the string
    "<atomic number>,<fragment SMILES>".

  mol: molecule whose atoms are processed
  indices: optional container of atom indices; when given, only those atoms
    are processed
  degree: radius (in bonds) of the environment around each atom
  """

    ecfp_dict = {}
    for i in range(mol.GetNumAtoms()):
        if indices is not None and i not in indices:
            continue
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True)
        submol = Chem.PathToSubmol(mol, env)
        smile = Chem.MolToSmiles(submol)
        # Direct lookup by index instead of building and indexing the whole
        # atom sequence on every iteration.
        ecfp_dict[i] = "%s,%s" % (mol.GetAtomWithIdx(i).GetAtomicNum(), smile)

    return ecfp_dict
Example #19
0
def gradient2atom(smi, gradient, pos_cut=3, neg_cut=-3, nBits=2048):
    """
    Map the gradient of Morgan fingerprint bits onto the atoms of a molecule.

    Input:
        smi - the smiles of the molecule (a string)
        gradient - per-bit coefficients, indexed by fingerprint bit id
        pos_cut - atoms whose integrated weight is above this are reported as positive
        neg_cut - atoms whose integrated weight is below this are reported as negative
        nBits - length of the radius-2 Morgan bit vector
    Output:
        (mol, positive atom ids, negative atom ids, per-atom weight array of
        shape (number of atoms, 1))
    """
    mol = Chem.MolFromSmiles(smi)

    # Fingerprint plus, per on-bit, the (atom, radius) pairs that set it.
    bi = {}
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,
                                                        radius=2,
                                                        bitInfo=bi,
                                                        nBits=nBits)

    # Integrated weight per atom: each on-bit adds its gradient to every atom
    # of the first environment recorded for that bit.
    atomsToUse = np.zeros((len(mol.GetAtoms()), 1))
    for bitId in list(fp.GetOnBits()):
        atomID, radius = bi[bitId][0]
        members = []
        if radius > 0:
            for b in Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID):
                bond = mol.GetBondWithIdx(b)
                members.append(bond.GetBeginAtomIdx())
                members.append(bond.GetEndAtomIdx())
        else:
            members.append(atomID)
        unique_members = list(set(members))
        atomsToUse[unique_members] += gradient[bitId]

    # Split atoms by the two cut-offs; atoms in between are not reported.
    highlit_pos = []
    highlit_neg = []
    for i, weight in enumerate(atomsToUse):
        if weight > pos_cut:
            highlit_pos.append(i)
        elif weight < neg_cut:
            highlit_neg.append(i)
    return mol, highlit_pos, highlit_neg, atomsToUse
def get_environment_smarts(carbon, mol):
    """Return a SMARTS representation of the radius-1 environment of an atom.

    The SMARTS is produced by ``bond_list_to_smarts`` from the environment
    bonds (hydrogens included); ' | (Ring)' is appended when the atom is in a
    ring.

    carbon: rdkit.Chem.Atom
        The desired carbon atom
    mol: rdkit.Chem.Mol
        The molecule the atom is present in

    """
    env = Chem.FindAtomEnvironmentOfRadiusN(mol, 1, carbon.GetIdx(),
                                            useHs=True)
    bond_smarts = bond_list_to_smarts(mol, list(env))

    if not carbon.IsInRing():
        return bond_smarts
    return bond_smarts + ' | (Ring)'
Example #21
0
def count_substructures(radius, molecule):
    """Count the atom-environment substructures of a molecule.

    Helper for building the molecular signature of a metabolite: for every
    atom, the environment of the given radius is written as a canonical
    fragment SMILES and the occurrences of each SMILES are counted.

    Parameters
    ----------
    radius : int
        the radius is bond-distance that defines how many neighbor atoms should
        be considered in a reaction center.
    molecule : Molecule
        a molecule object created by RDKit (e.g. Chem.MolFromInchi(inchi_code)
        or Chem.MolFromSmiles(smiles_code))

    Returns
    -------
    dict
        dictionary of molecular signature for a molecule,
        {smiles: number of atoms whose environment has that smiles}
    """
    smi_count = dict()
    for i, _atom in enumerate(molecule.GetAtoms()):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, i)

        atoms = set()
        for bidx in env:
            bond = molecule.GetBondWithIdx(bidx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # only one atom is in this environment, such as O in H2O
        if not atoms:
            atoms = {i}

        smi = Chem.MolFragmentToSmiles(molecule,
                                       atomsToUse=list(atoms),
                                       bondsToUse=env,
                                       canonical=True)
        smi_count[smi] = smi_count.get(smi, 0) + 1
    return smi_count
Example #22
0
def getSubstructSmi(mol, atomID, radius):
    """Return two SMILES for the environment of ``atomID``.

    The fragment SMILES is rooted at ``atomID`` and written with all Hs and
    all bonds explicit. For ``radius <= 0`` the fragment is the single atom.

    Returns ``(smi, smi2)`` where ``smi2`` is ``smi`` post-processed by
    ``writePropsToSmiles`` using the atom output order of the SMILES.
    """
    if radius > 0:
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID)
        touched = set()
        for b in env:
            bond = mol.GetBondWithIdx(b)
            touched.add(bond.GetBeginAtomIdx())
            touched.add(bond.GetEndAtomIdx())
        atomsToUse = list(touched)
    else:
        env = None
        atomsToUse = [atomID]

    smi = Chem.MolFragmentToSmiles(mol,
                                   atomsToUse,
                                   bondsToUse=env,
                                   allHsExplicit=True,
                                   allBondsExplicit=True,
                                   rootedAtAtom=atomID)
    # NOTE(review): eval() on a molecule property - safe only as long as the
    # property is always the one written by RDKit itself; consider
    # ast.literal_eval if molecules can carry externally supplied properties.
    order = eval(mol.GetProp('_smilesAtomOutputOrder'))
    return smi, writePropsToSmiles(mol, smi, order)
Example #23
0
def select_atoms(mol, selected_bits, vis_dir=None):
    """Return the atoms covered by the selected fingerprint bits of ``mol``.

    The bit info comes from ``get_fingerprint(mol)``. For every on-bit that is
    in ``selected_bits``, the atoms of each (center, radius) environment are
    added to the result set. When ``vis_dir`` is given, one PNG per
    environment is written there with those atoms highlighted.

    NOTE(review): atoms are taken from the PathToSubmol atom map, so an
    environment without bonds (e.g. radius 0) presumably contributes no
    atoms - confirm this is intended.
    """
    _, info = get_fingerprint(mol)

    selected_atoms = set()
    for onbit, subgraphs in info.items():
        if onbit not in selected_bits:
            continue
        for center, radius in subgraphs:
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius=radius, rootedAtAtom=center)
            amap = {}
            # Only the atom map filled in by PathToSubmol is used.
            Chem.PathToSubmol(mol, env, atomMap=amap)
            atoms = list(amap)
            selected_atoms.update(atoms)

            if vis_dir is None:
                continue
            png_f = f'bit{onbit}_center{center}_radius{radius}.png'
            Draw.MolToFile(mol, filename=os.path.join(vis_dir, png_f), highlightAtoms=atoms)

    return selected_atoms
Example #24
0
        def eliminate(mol):
            """Return a copy of ``mol`` reduced to the surroundings of its attachment atoms.

            Atoms whose 'molAtomRadius' property equals '0' are treated as
            attachment points; every atom touched by a bond of their radius-3
            environment is kept and all other atoms are removed.
            """
            # collect atoms lying in a radius-3 environment of an attachment
            keep = set()
            for atom in mol.GetAtoms():
                if atom.GetProp('molAtomRadius') != '0':
                    continue
                for idx in Chem.FindAtomEnvironmentOfRadiusN(
                        mol, 3, atom.GetIdx()):
                    envBond = mol.GetBondWithIdx(idx)
                    keep.add(envBond.GetBeginAtom().GetIdx())
                    keep.add(envBond.GetEndAtom().GetIdx())

            # remove everything else, highest index first so that the
            # remaining indices stay valid while atoms are deleted
            toRemove = set(range(mol.GetNumAtoms())) - keep
            frag = Chem.EditableMol(mol)
            for atom_idx in sorted(toRemove, reverse=True):
                frag.RemoveAtom(atom_idx)
            return frag.GetMol()
Example #25
0
def explain_fingerprint_bit(mol, vis_dir=None):
    """Print every on-bit of a molecule's fingerprint and the atoms behind it.

    For each (center, radius) occurrence of each on-bit the bit, center and
    radius are printed. For occurrences with a positive radius the indices of
    the atoms in that environment are printed as well and, when ``vis_dir``
    is given, a PNG of ``mol`` with those atoms highlighted is written into
    that directory.
    """
    _, info = get_fingerprint(mol)
    print('on bits:', info)

    occurrences = (
        (onbit, center, radius)
        for onbit, subgraphs in info.items()
        for center, radius in subgraphs
    )
    for onbit, center, radius in occurrences:
        print(f'on bit = {onbit}, center = {center}, radius = {radius}')

        # Radius-0 occurrences are only reported, never expanded or drawn.
        if radius <= 0:
            continue

        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius=radius, rootedAtAtom=center)
        amap = {}
        # Called to populate ``amap`` with the parent atom indices.
        Chem.PathToSubmol(mol, env, atomMap=amap)
        atoms = list(amap)
        print(atoms)

        if vis_dir is None:
            continue
        png_f = f'bit{onbit}_center{center}_radius{radius}.png'
        Draw.MolToFile(mol, filename=os.path.join(vis_dir, png_f), highlightAtoms=atoms)
def explain_circular_substructure(mol,
                                  center,
                                  radius,
                                  use_hs=False,
                                  canonical=True,
                                  isomeric=False,
                                  kekule=False,
                                  all_bonds_explicit=False):
    """Describe a circular environment of ``mol`` as a fragment SMILES.

    The environment is the set of bonds RDKit reports for ``radius`` around
    atom ``center`` (``use_hs`` is forwarded as ``useHs``). The fragment is
    made of those bonds plus their endpoint atoms and the center itself, and
    the SMILES is rooted at ``center``. The remaining keyword arguments are
    forwarded to ``Chem.MolFragmentToSmiles``.
    """
    bond_ids = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, center, useHs=use_hs)

    # Seed with the center so it is part of the fragment even when the
    # environment contains no bonds.
    fragment_atoms = {center}
    for bond in map(mol.GetBondWithIdx, bond_ids):
        fragment_atoms.add(bond.GetBeginAtomIdx())
        fragment_atoms.add(bond.GetEndAtomIdx())

    smiles_options = {
        'rootedAtAtom': center,
        'isomericSmiles': isomeric,
        'kekuleSmiles': kekule,
        'canonical': canonical,
        'allBondsExplicit': all_bonds_explicit,
    }
    return Chem.MolFragmentToSmiles(mol,
                                    atomsToUse=list(fragment_atoms),
                                    bondsToUse=bond_ids,
                                    **smiles_options)
Example #27
0
def compute_all_ecfp(mol: RDKitMol,
                     indices: Optional[Set[int]] = None,
                     degree: int = 2) -> Dict[int, str]:
    """Describe the fragment around each atom, out to a given bond radius.

    For every considered atom the bond environment of radius ``degree``
    (hydrogens included) is extracted as a sub-molecule and written as
    SMILES. No hashing is performed: each value is the plain string
    ``"<atomic number>,<fragment SMILES>"``.

    Parameters
    ----------
    mol: rdkit Molecule
        Molecule to compute the fragments on.
    indices: Optional[Set[int]]
        Atom indices to process. ``None`` (the default) processes every
        atom; otherwise atoms whose index is not contained are skipped.
    degree: int
        Radius, in bonds, of the environment taken around each atom.

    Returns
    -------
    dict
        Mapping of atom index to ``"<atomic number>,<fragment SMILES>"``.
        Entries are inserted in increasing atom-index order.
    """
    from rdkit import Chem

    ecfp_dict = {}
    for i in range(mol.GetNumAtoms()):
        if indices is not None and i not in indices:
            continue
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True)
        submol = Chem.PathToSubmol(mol, env)
        smile = Chem.MolToSmiles(submol)
        # Direct lookup instead of building and indexing the whole atom
        # sequence on every iteration.
        atomic_num = mol.GetAtomWithIdx(i).GetAtomicNum()
        ecfp_dict[i] = "%s,%s" % (atomic_num, smile)

    return ecfp_dict
Example #28
0
def FingerprintToSmiles(m, s):
    """Pair every Morgan fingerprint bit of ``m`` with a SMILES-like label.

    ``s`` is passed to ``AllChem.GetMorganFingerprint`` as the radius. Only
    the first (atom, radius) occurrence recorded for each bit is used. For a
    positive radius the label is the SMILES of that atom environment; for
    radius 0 it is the atom symbol, lower-cased when the atom is aromatic.

    Returns a list of ``(bit, label)`` tuples in the iteration order of the
    bit-info dictionary.
    """
    bit_info = {}
    # Called only to fill ``bit_info``; the fingerprint itself is not needed.
    AllChem.GetMorganFingerprint(m, s, bitInfo=bit_info)

    def label_for(atom_idx, radius):
        if radius > 0:
            bonds = Chem.FindAtomEnvironmentOfRadiusN(m, radius, atom_idx)
            return Chem.MolToSmiles(Chem.PathToSubmol(m, bonds))
        atom = m.GetAtomWithIdx(atom_idx)
        symbol = atom.GetSymbol()
        return symbol.lower() if atom.GetIsAromatic() else symbol

    labelled_bits = []
    for bit, occurrences in bit_info.items():
        first = occurrences[0]
        labelled_bits.append((bit, label_for(first[0], first[1])))
    return labelled_bits
Example #29
0
def generate_substructures(
        db_path="/path/to/SQLiteDatabase/Natural_Product_Structure.sqlite",
        output_path="/path/to/store/substructures_from_class_XXX.txt",
        structure_class="XXX"):
    """Enumerate substructures of one structure class and write the matches.

    Reads every row of the ``structure`` table whose ``Class`` column equals
    ``structure_class``, parses the SMILES found in column index 3, and
    enumerates the atom environments of every radius around every atom.
    The distinct, non-empty substructure SMILES are then matched against
    all parsed structures, and each (structure, substructure) hit is written
    to ``output_path`` as a tab-separated line below a header line.

    Parameters
    ----------
    db_path
        Path of the SQLite database holding the ``structure`` table.
    output_path
        Path of the text file to (over)write.
    structure_class
        Value of the ``Class`` column selecting the rows to process.

    Rows whose SMILES cannot be parsed, and substructure SMILES that cannot
    be parsed back into a molecule, are skipped.
    """
    # Fetch all rows first so the connection is released, even on error,
    # before the expensive enumeration starts. The class is bound as a
    # parameter rather than formatted into the SQL text.
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT * FROM structure WHERE Class = ?;",
            (structure_class,)).fetchall()
    finally:
        conn.close()

    str_mol_list = []
    sub_smiles_list = []  # distinct substructure SMILES, first-seen order
    seen_smiles = set()   # O(1) membership companion of sub_smiles_list
    for row in rows:
        m = Chem.MolFromSmiles(row[3])
        if m is None:
            # Unparseable SMILES: nothing to enumerate or match against.
            continue
        str_mol_list.append(m)
        nr_of_atoms = m.GetNumAtoms()

        for radius in range(nr_of_atoms):
            for atom_idx in range(nr_of_atoms):
                env = Chem.FindAtomEnvironmentOfRadiusN(m, radius, atom_idx)
                submol = Chem.PathToSubmol(m, env)
                p = Chem.MolToSmiles(submol, canonical=True)
                # Empty environments give an empty SMILES; duplicates are
                # dropped both within and across structures.
                if p != '' and p not in seen_smiles:
                    seen_smiles.add(p)
                    sub_smiles_list.append(p)

    # Parse each substructure once and cache the SMILES that gets written.
    sub_mol_list = []
    for sub_struc in sub_smiles_list:
        sm = Chem.MolFromSmiles(sub_struc)
        if sm is not None:
            sub_mol_list.append((sm, Chem.MolToSmiles(sm)))

    with open(output_path, 'w') as db_file:
        db_file.write("Structure has Substructure" + '\n\n')
        for structure in str_mol_list:
            struc = Chem.MolToSmiles(structure)
            for substructure, substruc in sub_mol_list:
                if structure.HasSubstructMatch(substructure):
                    db_file.write(struc + '\t' + substruc + '\n')
Example #30
0
    def calculate_p_values(self,
                           mols,
                           substructure_dictionary,
                           bioactivities,
                           mols_ids,
                           threshold_frequency,
                           threshold_nb_substructures=5,
                           threshold_pvalue=0.05,
                           active_label=1,
                           inactive_label=0,
                           Bonferroni=True):
        """Find Morgan substructures enriched among active or inactive compounds.

        For every molecule in `mols` the Morgan fingerprint (radius
        `self.max_radius`) is computed. Each substructure id that is a key of
        `substructure_dictionary` and has not been discarded is tested, once
        for the active and once for the inactive label, with a sum of
        binomial probability terms. Hits with a p-value below
        `threshold_pvalue` are appended as rows to `self.output`.

        Parameters (as used by the code below):
            mols: molecules passed to AllChem.GetMorganFingerprint. If it is
                not a list it is rebuilt from `ext.mols` (see note below).
            substructure_dictionary: maps a substructure id to the ids of the
                molecules containing it; those ids are compared with
                `mols_ids`.
            bioactivities: compared element-wise with the labels and summed
                with np.sum -- presumably a NumPy array aligned with
                `mols_ids`; TODO confirm.
            mols_ids: ids of the molecules, indexed by position in `mols`.
            threshold_frequency: the fraction of compounds carrying the
                substructure that have the tested label must exceed this.
            threshold_nb_substructures: the number of compounds listed for a
                substructure must exceed this, otherwise it is discarded.
            threshold_pvalue: p-value cut-off for recording a hit.
            active_label, inactive_label: values identifying the two classes
                in `bioactivities`.
            Bonferroni: stored on `self.Bonferroni`; when true the recorded
                p-values are multiplied by the number of rows in
                `self.output` and rows are kept only below 0.05.

        Side effects: sets `self.Bonferroni`, replaces `self.output`, and
        prints two summary lines.
        """
        self.Bonferroni = Bonferroni

        # n: number of distinct molecule ids over all substructure lists
        nb_mols = float(
            len(
                set([
                    item for sublist in substructure_dictionary.values()
                    for item in sublist
                ])))
        # m
        nb_active_mols = float(np.sum(bioactivities == active_label))
        # (m - n)
        nb_inactive_mols = float(np.sum(bioactivities == inactive_label))

        nb_substructures_processed = 0
        # NOTE(review): `ext` is not defined in this method; presumably a
        # module-level object holding the molecules -- confirm.
        if type(mols) != list:
            mols = [ext.mols[i]
                    for i in np.arange(0, len(mols))]  #[x for x in mols]

        subs_discarded = [
        ]  # substructure that have been identified in other molecules.
        for m, mol in enumerate(mols):  #np.arange(0,len(mols)):
            #mol=mols[m]
            root_atoms_discarded = []  # center (or root) atoms discarded..
            info = {}
            fp = AllChem.GetMorganFingerprint(mol,
                                              self.max_radius,
                                              bitInfo=info)
            # sort info to make sure the substructures are read from the smallest to the biggest.
            # In case a substructure with low radius is removed, we make sure all containing it will not be considered either in the following steps)
            # get keys sorted
            # NOTE(review): the sort key is the bitInfo value, i.e. the tuple
            # of (atom index, radius) occurrences, so ordering is driven by
            # the first occurrence's atom index before its radius -- confirm
            # this gives the smallest-to-biggest order described above.
            # `iteritems` is the Python 2 dict API.
            ff = sorted(info.iteritems(), key=operator.itemgetter(1))
            substructure_ids = [ff[x][0] for x in range(0, len(info))]
            substructures_sub_dict = substructure_dictionary.keys()

            for substructure_id in substructure_ids:
                # occurrences of this bit; [0][0] is the root atom and
                # [0][1] the radius of the first occurrence
                atom_radius = info[substructure_id]
                nb_substructures_processed += 1
                # check is the substructure is in the database (i.e. training data)
                if substructure_id in substructures_sub_dict and substructure_id not in subs_discarded and atom_radius[
                        0][0] not in root_atoms_discarded:
                    mols_with_current_substructure = substructure_dictionary[
                        substructure_id]
                    nb_comp_with_substructure = float(
                        len(mols_with_current_substructure))
                    active_comp = (bioactivities == active_label)
                    # boolean mask over mols_ids: which compounds carry the
                    # substructure
                    comp_with_substructure = np.in1d(
                        np.asarray(mols_ids),
                        np.asarray(mols_with_current_substructure))
                    nb_comp_with_substructure_active = np.sum(
                        active_comp * comp_with_substructure)  #i.e. m_{S act}
                    inactive_comp = (bioactivities == inactive_label)
                    #comp_with_substructure = np.in1d(np.asarray(mols_ids) , np.asarray(mols_with_current_substructure))
                    nb_comp_with_substructure_inactive = np.sum(
                        inactive_comp * comp_with_substructure)

                    ## ACTIVE
                    #########
                    #filter threshold of compounds with the substructure
                    filter_a = nb_comp_with_substructure > threshold_nb_substructures
                    if filter_a:
                        # filter threshold
                        filter_b = (float(nb_comp_with_substructure_active) /
                                    float(np.sum(comp_with_substructure))
                                    ) > threshold_frequency
                        if filter_b:
                            # Sum of binomial terms
                            # C(n, k) * p**k * (1 - p)**(n - k) with
                            # n = nb_comp_with_substructure and
                            # p = nb_active_mols / nb_mols.
                            # NOTE(review): np.arange excludes its upper
                            # bound, so the k = n term is not summed --
                            # confirm this is intended.
                            p_value = 0
                            for count in np.arange(
                                    nb_comp_with_substructure_active,
                                    nb_comp_with_substructure):
                                numerator = Decimal(
                                    sc.math.factorial(
                                        float(nb_comp_with_substructure)))
                                denominatorA = Decimal(
                                    sc.math.factorial(float(count))) * Decimal(
                                        sc.math.factorial(
                                            float(nb_comp_with_substructure -
                                                  count)))
                                denominatorB = (nb_active_mols /
                                                nb_mols)**float(count)
                                denominatorC = (1.0 -
                                                (nb_active_mols / nb_mols))**(
                                                    nb_comp_with_substructure -
                                                    count)
                                out = float(
                                    numerator /
                                    denominatorA) * denominatorB * denominatorC
                                p_value += out

                            if p_value < threshold_pvalue:
                                #self.p_values_dictionary[substructure_id] = p_value

                                # Drawing
                                env = Chem.FindAtomEnvironmentOfRadiusN(
                                    mol, atom_radius[0][1], atom_radius[0][0])
                                amap = {}
                                submol = Chem.PathToSubmol(mol,
                                                           env,
                                                           atomMap=amap)
                                # m1 is the same object as mol; the match
                                # result below is not stored
                                m1 = mol
                                m1.GetSubstructMatch(submol)
                                #mm = Draw.MolToImage( mol,wedgeBonds=True,kekulize=True,highlightAtoms=amap.keys(),colour='green')
                                self.output = self.output.append(
                                    {
                                        'Compound ID':
                                        mols_ids[m],
                                        'Compounds with substr.':
                                        nb_comp_with_substructure,
                                        #'Compounds with substr. and activity' : nb_comp_with_substructure_active,
                                        'p_value':
                                        p_value,
                                        'Activity label':
                                        active_label,
                                        'Substructure in Molecule':
                                        m1,
                                        'Substructure':
                                        submol,
                                        'Comp. with substr. active':
                                        nb_comp_with_substructure_active,
                                        'Comp. with substr. inactive':
                                        nb_comp_with_substructure_inactive
                                        #'Smiles': Chem.MolToSmiles(mol)
                                    },
                                    ignore_index=True)
                                # a recorded hit blocks this root atom for
                                # the rest of the molecule and this
                                # substructure id for all later molecules
                                root_atoms_discarded.append(atom_radius[0][0])
                                subs_discarded.append(substructure_id)

                        ## INACTIVE
                        #########
                        #filter threshold of compounds with the substructure
                        # filter threshold
                        filter_b = (float(nb_comp_with_substructure_inactive) /
                                    float(np.sum(comp_with_substructure))
                                    ) > threshold_frequency
                        if filter_b:
                            # same binomial sum as above, using the inactive
                            # counts and p = nb_inactive_mols / nb_mols
                            p_value = 0
                            for count in np.arange(
                                    nb_comp_with_substructure_inactive,
                                    nb_comp_with_substructure):
                                numerator = Decimal(
                                    sc.math.factorial(
                                        float(nb_comp_with_substructure)))
                                denominatorA = Decimal(
                                    sc.math.factorial(float(count))) * Decimal(
                                        sc.math.factorial(
                                            float(nb_comp_with_substructure -
                                                  count)))
                                denominatorB = (nb_inactive_mols /
                                                nb_mols)**float(count)
                                denominatorC = (
                                    1.0 - (nb_inactive_mols / nb_mols))**(
                                        nb_comp_with_substructure - count)
                                out = float(
                                    numerator /
                                    denominatorA) * denominatorB * denominatorC
                                p_value += out

                            if p_value < threshold_pvalue:
                                #self.p_values_dictionary[substructure_id] = p_value

                                # Drawing
                                env = Chem.FindAtomEnvironmentOfRadiusN(
                                    mol, atom_radius[0][1], atom_radius[0][0])
                                amap = {}
                                submol = Chem.PathToSubmol(mol,
                                                           env,
                                                           atomMap=amap)
                                m1 = mol
                                m1.GetSubstructMatch(submol)
                                #mm = Draw.MolToImage(mol,wedgeBonds=True,kekulize=True,highlightAtoms=amap.keys(),colour='red')
                                self.output = self.output.append(
                                    {
                                        'Compound ID':
                                        mols_ids[m],
                                        'Compounds with substr.':
                                        nb_comp_with_substructure,
                                        #'Compounds with substr. and activity' : nb_comp_with_substructure_active,
                                        'p_value':
                                        p_value,
                                        'Activity label':
                                        inactive_label,
                                        'Substructure in Molecule':
                                        m1,
                                        'Substructure':
                                        submol,
                                        'Comp. with substr. active':
                                        nb_comp_with_substructure_active,
                                        'Comp. with substr. inactive':
                                        nb_comp_with_substructure_inactive
                                        #'Smiles': Chem.MolToSmiles(mol)
                                    },
                                    ignore_index=True)
                                root_atoms_discarded.append(atom_radius[0][0])
                                subs_discarded.append(substructure_id)
                    else:
                        # filter_a failed: too few compounds carry this
                        # substructure, so drop it and its root atom
                        subs_discarded.append(substructure_id)
                        root_atoms_discarded.append(atom_radius[0][0])

        # NOTE(review): the correction multiplies by the number of rows
        # already in self.output, and the cut-off is the literal 0.05 rather
        # than threshold_pvalue -- confirm both are intended.
        if self.Bonferroni == True:
            self.output[
                'p_value'] = self.output['p_value'] * self.output.shape[0]
            self.output = self.output[self.output.p_value < 0.05]
        print 'Number of substructures processed: ', nb_substructures_processed
        print 'Significant substructures: ', self.output.shape[
            0], 'substructures'