def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score every input molecule by its best match among the actives.

    Each line of ``fragments_file`` is parsed as a JSON object whose
    ``"smiles"`` and ``"name"`` entries are used. The atom-pair fingerprint
    of that molecule is compared (Tanimoto) with the atom-pair fingerprint
    of every SMILES in ``model_configuration["data"]["active"]``; the
    highest similarity is written to ``output_file`` as one JSON object
    per line (``{"name": ..., "score": ...}``, no trailing newline).

    ``descriptors_file`` is accepted but not read by this method.
    """
    inputoutput_utils.create_parent_directory(output_file)

    # Surrounding double quotes are stripped from each SMILES before parsing.
    reference_fingerprints = [
        Pairs.GetAtomPairFingerprint(
            Chem.MolFromSmiles(active_smiles.strip("\"")))
        for active_smiles in model_configuration["data"]["active"]
    ]

    with open(output_file, "w", encoding="utf-8") as output_stream, \
            open(fragments_file, "r", encoding="utf-8") as input_stream:
        for line_number, raw_line in enumerate(input_stream):
            record = json.loads(raw_line)
            query_molecule = Chem.MolFromSmiles(record["smiles"].strip("\""))
            query_fingerprint = Pairs.GetAtomPairFingerprint(query_molecule)
            best_similarity = max([
                DataStructs.TanimotoSimilarity(query_fingerprint, reference)
                for reference in reference_fingerprints
            ])
            result = {"name": record["name"], "score": best_similarity}
            # Separate records with a newline, but never end the file on one.
            if line_number > 0:
                output_stream.write("\n")
            json.dump(result, output_stream)
Beispiel #2
0
def caculate_similarity_atomPairs(smiles_A, smiles_B):
    """Return the Dice similarity of two molecules' atom-pair fingerprints.

    Both arguments are SMILES strings. The similarity is rounded to four
    decimal places. ``-1`` is returned as an error sentinel whenever the
    similarity cannot be computed (for example when a SMILES string cannot
    be parsed into a molecule).
    """
    try:
        m1 = Chem.MolFromSmiles(smiles_A)
        m2 = Chem.MolFromSmiles(smiles_B)
        p1 = Pairs.GetAtomPairFingerprint(m1)
        p2 = Pairs.GetAtomPairFingerprint(m2)
        similarity_p1_p2 = DataStructs.DiceSimilarity(p1, p2)
        return round(similarity_p1_p2, 4)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed and turned into -1.
        return -1
Beispiel #3
0
def sim_rdk_topo_fps(smiA, smisT):
    """Compare one SMILES against a list of SMILES with atom-pair fingerprints.

    ``smiA`` is a single SMILES string and ``smisT`` an iterable of SMILES
    strings. Returns a list holding the Dice similarity between the
    atom-pair fingerprint of ``smiA`` and that of each entry of ``smisT``,
    in the same order.
    """
    query_fp = Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiA))
    target_fps = [
        Pairs.GetAtomPairFingerprint(getMolFromSmiles(target_smiles))
        for target_smiles in smisT
    ]
    return [
        DataStructs.DiceSimilarity(query_fp, target_fp)
        for target_fp in target_fps
    ]
Beispiel #4
0
 def atom_pairs(self):
     """Build an atom-pair count matrix for the SMILES in ``self.data.SMILES``.

     Returns a list with one integer numpy vector per molecule. All vectors
     share the same columns: the sorted union of every atom-pair bit seen
     in any molecule. Each entry is the count of that bit in the molecule
     (0 when absent).
     """
     mols = [Chem.MolFromSmiles(smi) for smi in self.data.SMILES]
     # Per-molecule {bit: count} dictionaries (hydrogens removed first).
     counts_per_mol = [
         Pairs.GetAtomPairFingerprint(Chem.RemoveHs(mol)).GetNonzeroElements()
         for mol in mols
     ]
     # Column layout: every bit present in at least one molecule, sorted.
     all_bits = sorted(set().union(*counts_per_mol))
     column_of = {bit: column for column, bit in enumerate(all_bits)}

     feature_matrix = []
     for counts in counts_per_mol:
         row = np.zeros(len(all_bits), dtype=int)
         # Place each count by its bit id rather than relying on the
         # dictionary's iteration order matching the sorted column order.
         for bit, count in counts.items():
             row[column_of[bit]] = count
         feature_matrix.append(row)
     return feature_matrix
Beispiel #5
0
    def computeFP(self, typeFP):
        """Compute the requested fingerprint(s) for ``self.smiclean``.

        ``typeFP`` selects one of ``"Mol"``, ``"MACCS"``, ``"pairs"``,
        ``"Torsion"``, ``"Morgan"``, or ``"All"`` for every one of them.
        The molecule parsed from ``self.smiclean`` is stored in
        ``self.mol`` and the resulting ``{name: fingerprint}`` dictionary
        in ``self.FP``.

        Returns 0 on success, or 1 (after appending a message to
        ``self.log``) when the instance has no ``smiclean`` attribute.
        """
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit.Chem import MACCSkeys
        from rdkit.Chem.AtomPairs import Pairs, Torsions
        from rdkit.Chem import AllChem

        if "smiclean" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            return 1

        self.mol = Chem.MolFromSmiles(self.smiclean)

        # (result key, builder) pairs, evaluated in this order.
        builders = (
            ("Mol", lambda mol: FingerprintMols.FingerprintMol(mol)),
            ("MACCS", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
            ("pairs", lambda mol: Pairs.GetAtomPairFingerprint(mol)),
            ("Torsion",
             lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol)),
            ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
        )
        self.FP = {
            name: build(self.mol)
            for name, build in builders
            if typeFP == name or typeFP == "All"
        }
        return 0
Beispiel #6
0
def extract_atompair_fragments(molecule: object) -> list:
    """List the atom-pair fragments of ``molecule`` as dictionaries.

    For every non-zero element of the molecule's atom-pair fingerprint a
    dictionary with the keys ``"smiles"``, ``"index"``, ``"type"`` (always
    ``"AP"``) and ``"size"`` is produced. The dictionary is appended once
    per occurrence, i.e. as many times as the fingerprint count for that
    pair.
    """
    output = []
    pair_counts = Pairs.GetAtomPairFingerprint(molecule).GetNonzeroElements()
    for pair, count in pair_counts.items():
        # Decode the pair once instead of re-running ExplainPairScore for
        # every field that is read from it.
        explanation = Pairs.ExplainPairScore(pair)
        first_atom, path_length, second_atom = (
            explanation[0], explanation[1], explanation[2])

        smiles = first_atom[0] + second_atom[0]
        atom1_type = rdkit.Chem.AtomFromSmarts(first_atom[0]).GetAtomicNum()
        atom2_type = rdkit.Chem.AtomFromSmarts(second_atom[0]).GetAtomicNum()

        # Per-atom value: atomic number, third and second entries of the
        # decoded atom tuple packed into one integer.
        atom1_property_value = (
            64 * atom1_type + 16 * first_atom[2] + first_atom[1])
        atom2_property_value = (
            64 * atom2_type + 16 * second_atom[2] + second_atom[1])

        dist = path_length + 1
        # Order-independent key: smaller atom value in the low digits.
        atom_pair_key = min(
            atom1_property_value, atom2_property_value) + 1024 * (
                max(atom1_property_value, atom2_property_value) + 1024 * dist)

        output.extend({
            "smiles": smiles,
            "index": atom_pair_key,
            "type": "AP",
            "size": dist
        } for _ in range(count))
    return output
Beispiel #7
0
def atom_pairs():
    """Print a short demonstration of atom-pair fingerprints.

    Uses three small example molecules and prints, in order: the non-zero
    elements of the last molecule's fingerprint, the explanation of pair
    score 558115, the Dice similarity of the first two count fingerprints
    and the Dice similarity of the first two bit-vector fingerprints.
    """
    example_smiles = ('C1CCC1OCC', 'CC(C)OCC', 'CCOCC')
    molecules = [Chem.MolFromSmiles(smiles) for smiles in example_smiles]

    # Count-based atom-pair fingerprints.
    count_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in molecules]

    # Bits and their counts for the last molecule, as a dictionary.
    nonzero = count_fps[-1].GetNonzeroElements()
    print(nonzero)

    # Human-readable explanation of one pair score.
    print(Pairs.ExplainPairScore(558115))

    # Dice similarity on the count fingerprints.
    first_count_fp, second_count_fp = count_fps[0], count_fps[1]
    print(DataStructs.DiceSimilarity(first_count_fp, second_count_fp))

    # The same comparison on bit vectors, i.e. without counts.
    bit_fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]
    first_bit_fp, second_bit_fp = bit_fps[0], bit_fps[1]
    print(DataStructs.DiceSimilarity(first_bit_fp, second_bit_fp))
Beispiel #8
0
def _atomsFingerprintsClustering(rdkit_mols):
    """
        Compute an all-against-all matrix from atom-pair fingerprints.

        Parameters
        ----------
        rdkit_mols: list
            The list of rdkit.Chem.rdchem.Mol objects

        Returns
        -------
        dicematrix: np.array
            Array built from one ``DiceDistances(fp, all_fps)`` result per
            molecule (NOTE(review): presumably Dice distances; confirm
            against the definition of ``DiceDistances``)
        """
    from rdkit.Chem.AtomPairs import Pairs  # Atom pairs

    fingerprints = [
        Pairs.GetAtomPairFingerprint(mol) for mol in tqdm(rdkit_mols)
    ]

    # One delayed job per fingerprint, each compared against the full list.
    executor = ParallelExecutor(n_jobs=-1)
    run_jobs = executor(total=len(fingerprints),
                        desc='AtomsFingerprints Distance')
    rows = run_jobs(
        delayed(DiceDistances)(fingerprint, fingerprints)
        for fingerprint in fingerprints)

    return np.array(rows)
Beispiel #9
0
    def findCluster(self, smiles):
        """Assign a SMILES string to a scaffold cluster.

        The Murcko scaffold of the molecule is computed and its
        non-isomeric SMILES is used as the cluster name. The scaffold's
        atom-pair fingerprint is compared (Tanimoto) with the fingerprints
        returned by ``self.getFingerprints()``.

        Returns a tuple ``(cluster, fingerprint, is_new)``:

        * ``("", "", False)`` when the SMILES cannot be parsed or no
          scaffold can be derived;
        * ``(cluster, fp, False)`` when the scaffold is already known;
        * ``(existing_cluster, fp, False)`` when the most similar known
          cluster reaches ``self.minsimilarity``;
        * ``(cluster, fp, True)`` otherwise, i.e. the scaffold starts a
          new cluster.
        """
        failure = ("", "", False)

        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return failure
        try:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are not silently converted into a failure result.
            return failure
        if not scaffold:
            return failure
        cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)

        fp = Pairs.GetAtomPairFingerprint(scaffold)
        known = self.getFingerprints()
        if cluster in known:
            return cluster, fp, False

        sims = DataStructs.BulkTanimotoSimilarity(fp, list(known.values()))
        if len(sims) == 0:
            return cluster, fp, True

        closest = np.argmax(sims)
        if sims[closest] >= self.minsimilarity:
            return list(known.keys())[closest], fp, False
        return cluster, fp, True
Beispiel #10
0
def atom_pairs_similarity(active_molecules1, test_molecules):
    """Return, per test molecule, its best Dice similarity to any active.

    Both arguments are iterables of molecules. Atom-pair fingerprints are
    computed for all of them; for each test molecule the largest Dice
    similarity to an active molecule is collected. The running best starts
    at ``0``, so ``0`` is reported when there are no actives or no
    similarity exceeds zero.
    """
    active_fps = [
        Pairs.GetAtomPairFingerprint(mol) for mol in active_molecules1
    ]
    test_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in test_molecules]

    best_scores = []
    for test_fp in test_fps:
        best = 0
        for active_fp in active_fps:
            score = DataStructs.DiceSimilarity(test_fp, active_fp)
            if score > best:
                best = score
        best_scores.append(best)
    return best_scores
Beispiel #11
0
 def testPairsRegression(self):
   """Check atom-pair fingerprints against the pickled reference values.

   Each molecule's fingerprint must equal the stored one at the same
   index and differ from the stored one at the preceding index.
   """
   # Use a context manager so the gzip handle is closed after loading.
   with gzip.open(os.path.join(self.testDataPath, 'mols1000.aps.pkl.gz'), 'rb') as inF:
     atomPairs = cPickle.load(inF, encoding='bytes')
   for i, m in enumerate(self.mols):
     ap = Pairs.GetAtomPairFingerprint(m)
     if ap != atomPairs[i]:  # pragma: nocover
       debugFingerprint(m, ap, atomPairs[i])
     self.assertEqual(ap, atomPairs[i])
     self.assertNotEqual(ap, atomPairs[i - 1])
Beispiel #12
0
def orng_sim_rdk_atompair_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and the SMILES of a data instance.

    ``smile_active`` is a SMILES string; ``train_instance`` is a data
    instance whose SMILES attribute is located with ``getSMILESAttr``.
    Returns the Dice similarity of the two atom-pair fingerprints, or
    ``None`` when no SMILES attribute is found or either SMILES does not
    yield a molecule.
    """
    smiles_attribute = getSMILESAttr(train_instance)
    if not smiles_attribute:
        return None

    train_smiles = str(train_instance[smiles_attribute].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    if not active_mol or not train_mol:
        return None

    active_fp = Pairs.GetAtomPairFingerprint(active_mol)
    train_fp = Pairs.GetAtomPairFingerprint(train_mol)
    return DataStructs.DiceSimilarity(active_fp, train_fp)
def get_similarity():
    """Compare the first molecule of the compound group with all others.

    Works on the module-level ``mols``, ``compounds``, ``radius`` and
    ``bit_size``. The first entry of ``mols``/``compounds`` is the
    reference and is removed from both lists as a side effect.

    Returns one text line per remaining compound: the reference, the
    compound (right-stripped) and the Dice similarities for the atom-pair,
    Morgan bit-vector, Morgan count and feature-Morgan bit-vector
    fingerprints, separated by single spaces.
    """
    # Fingerprints of the reference compound.
    reference_mol = mols[0]
    ref_morgan2 = AllChem.GetMorganFingerprintAsBitVect(reference_mol, radius, bit_size)
    ref_cmorgan2 = AllChem.GetMorganFingerprint(reference_mol, radius)
    ref_fmorgan2 = AllChem.GetMorganFingerprintAsBitVect(reference_mol, radius, bit_size, useFeatures=True)
    ref_ap = Pairs.GetAtomPairFingerprint(reference_mol)

    # Drop the reference from the global lists so only test compounds remain.
    reference = compounds[0]
    del compounds[0]
    del mols[0]

    # Fingerprints of every test molecule, as (ap, morgan, cmorgan, fmorgan).
    # Bit information is requested from RDKit but not used afterwards.
    test_fps = []
    for mol in mols:
        morgan2 = AllChem.GetMorganFingerprintAsBitVect(mol, radius, bit_size, bitInfo={})
        cmorgan2 = AllChem.GetMorganFingerprint(mol, radius, bitInfo={})
        fmorgan2 = AllChem.GetMorganFingerprintAsBitVect(mol, radius, bit_size, useFeatures=True, bitInfo={})
        ap = Pairs.GetAtomPairFingerprint(mol)
        test_fps.append((ap, morgan2, cmorgan2, fmorgan2))

    # One output line per test compound.
    lines = []
    for i, (ap, morgan2, cmorgan2, fmorgan2) in enumerate(test_fps):
        ap_simil = DataStructs.DiceSimilarity(ref_ap, ap)
        morgan2_simil = DataStructs.DiceSimilarity(ref_morgan2, morgan2)
        cmorgan2_simil = DataStructs.DiceSimilarity(ref_cmorgan2, cmorgan2)
        fmorgan2_simil = DataStructs.DiceSimilarity(ref_fmorgan2, fmorgan2)
        fields = [
            str(reference),
            str(compounds[i].rstrip()),
            str(ap_simil),
            str(morgan2_simil),
            str(cmorgan2_simil),
            str(fmorgan2_simil),
        ]
        lines.append(' '.join(fields) + '\n')
    return ''.join(lines)
def getCountInfo(m, fpType):
    """Return the non-zero fingerprint counts of molecule ``m``.

    ``fpType`` selects the fingerprint:

    * ``'AtomPair'`` (exact) or ``'atom'`` (any case): atom pairs;
    * ``'morgan'`` or ``'circular'`` (any case): Morgan, radius 2;
    * ``'Topological'`` (exact) or ``'topo'`` (any case): topological
      torsions, with the keys converted to ``int``.

    Any other value yields ``None``.
    """
    kind = fpType.lower()

    if fpType == 'AtomPair' or kind == 'atom':
        return Pairs.GetAtomPairFingerprint(m).GetNonzeroElements()

    if kind == 'morgan' or kind == 'circular':
        return AllChem.GetMorganFingerprint(m, 2).GetNonzeroElements()

    if fpType == 'Topological' or kind == 'topo':
        torsion_counts = (
            Torsions.GetTopologicalTorsionFingerprint(m).GetNonzeroElements())
        return {int(key): torsion_counts[key] for key in torsion_counts}

    return None
Beispiel #15
0
def Atompair_fp(mol, rc_names):
    """Build a dendrogram dictionary from atom-pair Dice similarities.

    ``mol`` is an iterable of molecules and ``rc_names`` the matching
    labels. A square table of pairwise Dice similarities between the
    atom-pair fingerprints is filled column by column, clustered with
    Ward linkage and converted into a nested ``dict`` (keys ``children``
    and ``name``) via ``add_node`` and ``label_tree``.
    """
    fp = [Pairs.GetAtomPairFingerprint(x) for x in mol]
    tc_df = pd.DataFrame(index=rc_names, columns=rc_names).fillna(0)

    for c1, fp_c1 in enumerate(fp):
        tc_df[rc_names[c1]] = [
            DataStructs.DiceSimilarity(fp_c1, fp_c2) for fp_c2 in fp
        ]

    # ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` returns
    # the same array and exists in old and new pandas versions.
    clusters = linkage(tc_df.values, "ward")
    clust_tree = to_tree(clusters, rd=False)
    d3Dendro = dict(children=[], name=" ")
    add_node(clust_tree, d3Dendro)
    label_tree(d3Dendro["children"][0], rc_names)

    return d3Dendro
Beispiel #16
0
    def computeFP(self, typeFP):
        """Compute the requested fingerprint(s) of ``self.mol``.

        ``typeFP`` is one of ``"Mol"``, ``"MACCS"``, ``"pairs"``,
        ``"Torsion"``, ``"Morgan"``, or ``"All"`` for every one of them.
        The ``{name: fingerprint}`` dictionary is stored in ``self.d_FP``.
        When the instance has no ``mol`` attribute, a message is appended
        to ``self.log`` and ``self.err`` is set to 1 instead.
        """
        if "mol" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            self.err = 1
            return

        def wanted(name):
            # A fingerprint is built when named explicitly or via "All".
            return typeFP == name or typeFP == "All"

        fingerprints = {}
        if wanted("Mol"):
            fingerprints["Mol"] = FingerprintMols.FingerprintMol(self.mol)
        if wanted("MACCS"):
            fingerprints["MACCS"] = MACCSkeys.GenMACCSKeys(self.mol)
        if wanted("pairs"):
            fingerprints["pairs"] = Pairs.GetAtomPairFingerprint(self.mol)
        if wanted("Torsion"):
            fingerprints["Torsion"] = Torsions.GetTopologicalTorsionFingerprint(self.mol)
        if wanted("Morgan"):
            fingerprints["Morgan"] = AllChem.GetMorganFingerprint(self.mol, 2)

        self.d_FP = fingerprints
Beispiel #17
0
def compare_structure(smiles1, smiles2, fp_type="Morgan", sim_type="Dice"):
    """
    Task: Compare structural similarity of two compounds based on fingerprints.
    Parameters:
        smiles1: str, smiles of the compound 1
        smiles2: str, smiles of the compound 2
        fp_type: str, type of fingerprints; one of "Morgan",
            "MorganWithFeature", "MACCS", "Topological", "AtomPairs"
        sim_type: str, method for calculating similarity; one of "Dice",
            "Tanimoto", "Cosine", "Sokal", "Russel"
    Returns:
        The similarity value, or -1 when it cannot be computed (unknown
        fp_type or sim_type, or any error while building or comparing the
        fingerprints).
    """
    # Builders are wrapped in lambdas so nothing is looked up or computed
    # until the selected entry is actually called.
    fingerprinters = {
        "Morgan": lambda smi: AllChem.GetMorganFingerprint(
            Chem.MolFromSmiles(smi), 2, useFeatures=False),
        "MorganWithFeature": lambda smi: AllChem.GetMorganFingerprint(
            Chem.MolFromSmiles(smi), 2, useFeatures=True),
        "MACCS": lambda smi: Chem.MACCSkeys.GenMACCSKeys(
            Chem.MolFromSmiles(smi)),
        "Topological": lambda smi: FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(smi)),
        "AtomPairs": lambda smi: Pairs.GetAtomPairFingerprint(
            Chem.MolFromSmiles(smi)),
    }
    similarities = {
        "Dice": lambda a, b: DataStructs.DiceSimilarity(a, b),
        "Tanimoto": lambda a, b: DataStructs.TanimotoSimilarity(a, b),
        "Cosine": lambda a, b: DataStructs.CosineSimilarity(a, b),
        "Sokal": lambda a, b: DataStructs.SokalSimilarity(a, b),
        "Russel": lambda a, b: DataStructs.RusselSimilarity(a, b),
    }

    # Unknown selectors use the same -1 sentinel as every other failure.
    # Previously an unknown sim_type left the result unbound and raised
    # UnboundLocalError at the return statement.
    if fp_type not in fingerprinters or sim_type not in similarities:
        return -1

    getfp = fingerprinters[fp_type]
    try:
        fp1 = getfp(smiles1)
        fp2 = getfp(smiles2)
        return similarities[sim_type](fp1, fp2)
    except Exception:
        return -1
Beispiel #18
0
 def testPairsRegression(self):
   """Check atom-pair fingerprints against the pickled reference values.

   Each molecule's fingerprint must equal the stored one at the same
   index and differ from the stored one at the preceding index.
   """
   # Use a context manager so the gzip handle is closed after loading.
   with gzip.open(os.path.join(self.testDataPath,'mols1000.aps.pkl.gz'),'rb') as inF:
     atomPairs = cPickle.load(inF, encoding='bytes')
   for i,m in enumerate(self.mols):
     ap = Pairs.GetAtomPairFingerprint(m)
     # assertEqual/assertNotEqual report both operands on failure.
     self.assertEqual(ap, atomPairs[i])
     self.assertNotEqual(ap, atomPairs[i-1])
Beispiel #19
0
def CalculateAtomPairsFingerprint(mol: Chem.Mol,
                                  rtype: str = 'countstring',
                                  bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate atom pairs fingerprints.

    :param mol: molecule to fingerprint
    :param rtype: Type of output, may either be:
                  countstring (default), returns the folded counts as a
                  ';'-separated string of integers
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict of the non-zero counts keyed by
                  'AtomPair_<bit>'
    :param bits: Number of folded bits (ignored if rtype != 'countstring')
    :return: exactly one of the three forms above, selected by ``rtype``
             (the return annotation lists the alternatives)
    """
    res = Pairs.GetAtomPairFingerprint(mol)
    if rtype == 'rdkit':
        return res
    counts = res.GetNonzeroElements()
    if rtype == 'dict':
        return {f'AtomPair_{k}': v for k, v in counts.items()}
    # Fold the sparse counts into ``bits`` integer buckets.
    folded = np.zeros(bits, dtype=int)
    for k, v in counts.items():
        folded[k % bits] += v
    # str.join needs strings; joining the raw numbers raised TypeError.
    return ';'.join(map(str, folded.tolist()))
Beispiel #20
0
def CalculateAtomPairsFingerprint(mol):
    """
    #################################################################
    Calculate the atom pairs fingerprint of a molecule.

    Usage:

        result=CalculateAtomPairsFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a 3-tuple:

        (1) the length of the fingerprint,

        (2) a dict of its non-zero elements (position -> count),

        (3) the fingerprint object itself, usable for similarity
            calculations.
    #################################################################
    """
    fingerprint = Pairs.GetAtomPairFingerprint(mol)
    length = fingerprint.GetLength()
    nonzero_elements = fingerprint.GetNonzeroElements()
    return length, nonzero_elements, fingerprint
Beispiel #21
0
def get_smiles_similarity(smiles1, smiles2, similarity="fingerprint"):
    """Return the structural similarity of two SMILES strings.

    similarity:
        "fingerprint": Tanimoto similarity of the RDKit topological
            fingerprints (``FingerprintMols.FingerprintMol``)
        "atom": Dice similarity of the atom-pair fingerprints

    Returns ``None`` when either SMILES is empty or cannot be parsed into
    a molecule.

    Raises ``ValueError`` for an unknown ``similarity`` value (this used to
    surface as an UnboundLocalError at the return statement).
    """
    if len(smiles1) == 0 or len(smiles2) == 0:
        return None
    if similarity not in ("fingerprint", "atom"):
        raise ValueError(
            "similarity must be 'fingerprint' or 'atom', got %r" %
            (similarity, ))

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem.AtomPairs import Pairs

    ms = [Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)]
    # MolFromSmiles returns None for unparseable input; report that the
    # same way as empty input instead of failing inside the fingerprinter.
    if any(m is None for m in ms):
        return None

    if similarity == "fingerprint":
        fps = [FingerprintMols.FingerprintMol(x) for x in ms]
        return DataStructs.FingerprintSimilarity(
            fps[0], fps[1], metric=DataStructs.TanimotoSimilarity)

    pairFps = [Pairs.GetAtomPairFingerprint(x) for x in ms]
    return DataStructs.DiceSimilarity(pairFps[0], pairFps[1])
            similarities_maccs[i][j] = 1
        #for other similarity metrics use for example DataStructs.FingerprintSimilarity(fps[0],fps[1], metric=DataStructs.DiceSimilarity)
    if i % 500 == 0:
        print('running:', i / len(fps_maccs) * 100, '%')

# In[ ]:

df = pd.DataFrame(similarities_maccs)
df.to_csv('similarities_maccs.csv')

# ### Atom pairs fingerprints

# In[ ]:

from rdkit.Chem.AtomPairs import Pairs
fps_pairs = [Pairs.GetAtomPairFingerprint(x) for x in molecules]
similarities_pairs = np.zeros(shape=((len(fps_pairs), len(fps_pairs))))

# In[ ]:

#compute similarities.  Comment this section if only the fingerprints are needed
for i in range(len(fps_pairs)):
    for j in range(len(fps_pairs)):
        if i > j:
            similarities_pairs[i][j] = DataStructs.DiceSimilarity(
                fps_pairs[i],
                fps_pairs[j])  #default is the Dice similarity for these fps
            similarities_pairs[j][i] = similarities_pairs[i][j]
        elif i == j:
            similarities_pairs[i][j] = 1
    if i % 500 == 0:
Beispiel #23
0
    m, 2, useFeatures=True, nBits=nbits)
# Registry of fingerprint builders: each entry maps a short name to a
# callable that takes a molecule ``m`` and returns its fingerprint.

# Feature-based Morgan bit vector, radius 3, folded to ``nbits`` bits.
FPDICT['fcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits)
# Feature-based Morgan fingerprints (not bit vectors), radius 1 to 3.
FPDICT['fcfc2'] = lambda m: Chem.GetMorganFingerprint(m, 1, useFeatures=True)
FPDICT['fcfc4'] = lambda m: Chem.GetMorganFingerprint(m, 2, useFeatures=True)
FPDICT['fcfc6'] = lambda m: Chem.GetMorganFingerprint(m, 3, useFeatures=True)
# "l" variants: Morgan bit vectors folded to ``nbits_long`` bits instead.
FPDICT['lecfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=nbits_long)
FPDICT['lecfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=nbits_long)
FPDICT['lfcfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=nbits_long)
FPDICT['lfcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits_long)
# MACCS keys, atom pairs and topological torsions.
FPDICT['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m)
FPDICT['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m)
FPDICT['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
# Hashed atom-pair / topological-torsion bit vectors of ``nbits`` bits.
FPDICT['hashap'] = lambda m: Desc.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits)
FPDICT[
    'hashtt'] = lambda m: Desc.GetHashedTopologicalTorsionFingerprintAsBitVect(
        m, nBits=nbits)
# RDKit path-based fingerprints with maximum path length 5, 6 and 7.
FPDICT['rdk5'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=5, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk6'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=6, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk7'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=7, fpSize=nbits, nBitsPerHash=2)
# Avalon fingerprints are registered only when USE_AVALON is truthy.
if USE_AVALON:
    FPDICT['avalon'] = lambda m: pyAv.GetAvalonFP(m, nbits)
    FPDICT['avalon_l'] = lambda m: pyAv.GetAvalonFP(m, nbits_long)
Beispiel #24
0
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Similarity cutoff; ``1 - cutoff`` is passed to ClusterFps
    fingerprint: Fingerprint to use:
        0 or else:  RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric for the matrix output (case-insensitive):
            Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
            Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
            An unknown name falls back to Tanimoto after printing a notice.
    outMatrix: Change output to a similarity matrix
    Return: Default output "clusters, clusterOut":
        clusters: Clusters containing molecule number.
        clusterOut: Molecular Cluster Number in List.
      With outMatrix=True: the similarity matrix as a list of row lists
      (it is also printed, one row per line).
    '''

    from rdkit import DataStructs
    from rdkit.Chem.Draw import SimilarityMaps
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        suppl = Chem.SmilesMolSupplier(filename)
        mols = [mol for mol in suppl]
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    ## MACCS Fingerprint
    if fingerprint == 1:
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    ## Atom Pair Fingerprint (AP)
    elif fingerprint == 2:
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    ## Topological Torsion Fingerprint (TT)
    elif fingerprint == 3:
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) for mol in mols]
    ## Morgan Fingerprint similar to ECFP4 Fingerprint
    elif fingerprint == 4:
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    ## Morgan Fingerprint similar to FCFP4 Fingerprint
    elif fingerprint == 5:
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) for mol in mols]
    ## RDKit Topological Fingerprint
    else:  # fingerprint == 0
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys are all lower case because the lookup uses metric.lower().
        metricsAvailable = {
            "tanimoto": DataStructs.TanimotoSimilarity,
            "dice": DataStructs.DiceSimilarity,
            "cosine": DataStructs.CosineSimilarity,
            "sokal": DataStructs.SokalSimilarity,
            "russel": DataStructs.RusselSimilarity,
            "rogotgoldberg": DataStructs.RogotGoldbergSimilarity,
            "allbit": DataStructs.AllBitSimilarity,
            "kulczynski": DataStructs.KulczynskiSimilarity,
            "mcconnaughey": DataStructs.McConnaugheySimilarity,
            "asymmetric": DataStructs.AsymmetricSimilarity,
            "braunblanquet": DataStructs.BraunBlanquetSimilarity,
        }

        if metric.lower() not in metricsAvailable:
            print("The given metric is unknown!")
            metric = 'Tanimoto'

        simMetrics = metricsAvailable[metric.lower()]

        ### Calculate Fingerprint similarity Matrix
        # Build each row separately: ``[[0.0] * n] * n`` would make every
        # row the same list object.
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                simdm[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(fps[j], fps[i], metric=simMetrics)

        for row in simdm:
            print(" ".join('%3.2f' % value for value in row))
        return simdm

    # NOTE(review): clustering always uses 'Tanimoto' regardless of
    # ``metric``; confirm whether ClusterFps supports other metrics.
    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    clusterOut = [0] * len(mols)
    # Cluster ids are 1-based, in the order ClusterFps returns them.
    for clusterID, cluster in enumerate(clusters, start=1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
Beispiel #25
0
def smiles2bob2(smiles):
    """Return the non-zero atom-pair counts of one SMILES string.

    The molecule is parsed, explicit hydrogens are added, and the non-zero
    elements of its atom-pair fingerprint are returned.
    """
    with_hydrogens = chem.AddHs(chem.MolFromSmiles(smiles))
    return Pairs.GetAtomPairFingerprint(with_hydrogens).GetNonzeroElements()
Beispiel #26
0
def eval_similarity(fp_list, dim, evaluator):
    """Return mean and standard deviation of all pairwise scores.

    ``fp_list`` is a sequence of per-item sequences; entry ``dim`` of each
    item is compared with entry ``dim`` of every later item using
    ``evaluator(a, b)``. Each unordered pair is scored exactly once.
    """
    item_count = len(fp_list)
    scores = np.array([
        evaluator(fp_list[first][dim], fp_list[second][dim])
        for first in range(item_count - 1)
        for second in range(first + 1, item_count)
    ])
    return np.mean(scores), np.std(scores)


if __name__ == '__main__':
    # The input file is the first command-line argument.
    f = sys.argv[1]

    # Each fingerprint builder is listed next to the similarity function
    # used to compare its results; position in this table is the "dim"
    # index used below.
    fingerprint_and_evaluator = [
        (lambda x: AllChem.GetMorganFingerprint(x, 2),
         lambda x, y: DataStructs.DiceSimilarity(x, y)),
        (lambda x: MACCSkeys.GenMACCSKeys(x),
         lambda x, y: DataStructs.FingerprintSimilarity(x, y)),
        (lambda x: Pairs.GetAtomPairFingerprint(x),
         lambda x, y: DataStructs.DiceSimilarity(x, y)),
        (lambda x: FingerprintMols.FingerprintMol(x),
         lambda x, y: DataStructs.FingerprintSimilarity(x, y)),
    ]
    fp_func_list = [build for build, _ in fingerprint_and_evaluator]
    evaluators = [compare for _, compare in fingerprint_and_evaluator]

    fp_list = get_fp_list(f, fp_func_list)

    # Print "1 - mean similarity" and the standard deviation per fingerprint.
    for i, evaluator in enumerate(evaluators):
        m, s = eval_similarity(fp_list, i, evaluator)
        print(1 - m, s)
Beispiel #27
0
def smiles2bob(listofsmiles):
    """Lazily yield the non-zero atom-pair counts for each SMILES string.

    Every SMILES is parsed, explicit hydrogens are added, and the non-zero
    elements of the atom-pair fingerprint are yielded, in input order.
    """
    for smiles in listofsmiles:
        with_hydrogens = chem.AddHs(chem.MolFromSmiles(smiles))
        fingerprint = Pairs.GetAtomPairFingerprint(with_hydrogens)
        yield fingerprint.GetNonzeroElements()
Beispiel #28
0
def _parse_smiles_list(smiles, list_number):
    """Parse SMILES strings; return (molecules, indices that failed to parse)."""
    bad = []
    molecules = []
    for i, smi in enumerate(smiles):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list %d could not be converted to molecule' % list_number)
            bad.append(i)
            continue
        molecules.append(m)
    return molecules, bad


def _fingerprint_families(molecules):
    """Compute the eight fingerprint lists used by the combined similarity."""
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    return {
        'topol': [FingerprintMols.FingerprintMol(x) for x in molecules],
        'maccs': [MACCSkeys.GenMACCSKeys(x) for x in molecules],
        'pairs': [Pairs.GetAtomPairFingerprint(x) for x in molecules],
        'tts': [
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
            for x in molecules
        ],
        'ecfp4': [
            AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
            for x in molecules
        ],
        'ecfp6': [
            AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
            for x in molecules
        ],
        'fcfp4': [
            AllChem.GetMorganFingerprintAsBitVect(x,
                                                  2,
                                                  nBits=1024,
                                                  useFeatures=True)
            for x in molecules
        ],
        'fcfp6': [
            AllChem.GetMorganFingerprintAsBitVect(x,
                                                  3,
                                                  nBits=1024,
                                                  useFeatures=True)
            for x in molecules
        ],
    }


def _insert_placeholders(families, bad_indices):
    """Insert a dummy entry at each failed index so list positions match input rows."""
    for j in bad_indices:
        for fingerprints in families.values():
            fingerprints.insert(j, 1)


def sim_two_serial():
    """Compute a combined similarity matrix between two SMILES lists.

    Prompts for two CSV paths, reads the ``"smiles"`` column of each and
    compares every molecule of list 1 with every molecule of list 2 using
    eight fingerprints (topological, MACCS, atom pairs, topological
    torsions, ECFP4/6 and FCFP4/6 bit vectors). The individual
    similarities are combined into one weighted score, which is divided by
    the value the weighted score takes when every individual similarity
    equals 1.

    Returns a ``pandas.DataFrame`` of shape (len(list 1), len(list 2)).
    Rows/columns belonging to SMILES that could not be parsed are set
    to 10. Progress messages are printed along the way.
    """
    #Load Data
    path1 = input("Path for list 1: ")
    path2 = input("Path for list 2: ")

    smis1 = pd.read_csv(path1)["smiles"]
    smis2 = pd.read_csv(path2)["smiles"]
    l1 = len(smis1)
    l2 = len(smis2)
    l = l1 * l2
    # Progress is reported roughly every 5%. Clamp to 1 so that small
    # inputs (fewer than ~10 pairs) do not produce ``k % 0``.
    lp = max(1, round(l / 20))

    #Get molecules from smiles
    molecules1, bad1 = _parse_smiles_list(smis1, 1)
    molecules2, bad2 = _parse_smiles_list(smis2, 2)

    #Final output matrix
    similarity = np.zeros(shape=(l1, l2), dtype=np.float32)

    print('Begining fingerprint calculation...wait')
    fps1 = _fingerprint_families(molecules1)
    print('Begining fingerprint calculation...50%')
    fps2 = _fingerprint_families(molecules2)
    print('Begining fingerprint calculation...done\n')

    # Re-align the fingerprint lists with the original row numbers.
    _insert_placeholders(fps1, bad1)
    _insert_placeholders(fps2, bad2)

    print('Begining of fingerprints similarity calculation\n')
    # Sets give O(1) membership tests inside the double loop.
    bad1_set = set(bad1)
    bad2_set = set(bad2)

    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l1):
        for j in range(l2):
            if i not in bad1_set and j not in bad2_set:
                similarities_topol = DataStructs.FingerprintSimilarity(
                    fps1['topol'][i], fps2['topol'][j])
                similarities_maccs = DataStructs.FingerprintSimilarity(
                    fps1['maccs'][i], fps2['maccs'][j])
                similarities_pairs = DataStructs.DiceSimilarity(
                    fps1['pairs'][i], fps2['pairs'][j])
                similarities_tts = DataStructs.DiceSimilarity(
                    fps1['tts'][i], fps2['tts'][j])
                similarities_ecfp4 = DataStructs.FingerprintSimilarity(
                    fps1['ecfp4'][i], fps2['ecfp4'][j])
                similarities_ecfp6 = DataStructs.FingerprintSimilarity(
                    fps1['ecfp6'][i], fps2['ecfp6'][j])
                similarities_fcfp4 = DataStructs.FingerprintSimilarity(
                    fps1['fcfp4'][i], fps2['fcfp4'][j])
                similarities_fcfp6 = DataStructs.FingerprintSimilarity(
                    fps1['fcfp6'][i], fps2['fcfp6'][j])
                similarity[i][j] = (
                    0.5 *
                    (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                    0.5 *
                    (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                    0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                    similarities_maccs / 0.85 + similarities_topol / 0.75) / 5
            k = k + 1
            if k % lp == 0:
                print('running:', (k / l) * 100, '%')

    similarity = similarity / maxs
    # Mark rows/columns of unparseable SMILES with the out-of-range value 10.
    similarity[bad1, :] = 10
    similarity[:, bad2] = 10

    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
Beispiel #29
0
def sim_one_serial():
    """Build a symmetric all-vs-all similarity matrix for one list of SMILES.

    Prompts on stdin for the path of a CSV file and reads its ``smiles``
    column. Every SMILES string is parsed with RDKit; for each pair of
    successfully parsed molecules eight fingerprint similarities are computed
    and blended into a single weighted score, which is then divided by the
    value the blend takes when every individual similarity equals 1.

    Rows and columns that belong to SMILES which could not be parsed are
    filled with the sentinel value 10.

    Returns:
        pandas.DataFrame: an ``n x n`` float32 matrix, where ``n`` is the
        number of rows in the ``smiles`` column (unparsable entries included).
    """
    # Load data --------------------------------------------------------------
    path = input("Path for list : ")
    smis = pd.read_csv(path)["smiles"]
    n = len(smis)

    # Get molecules from SMILES ----------------------------------------------
    bad = []  # positions (ascending) of SMILES that RDKit could not parse
    molecules = []
    for i, smi in enumerate(smis):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list could not be converted to molecule')
            bad.append(i)
            continue
        molecules.append(m)

    # Final output matrix ----------------------------------------------------
    similarity = np.zeros(shape=(n, n), dtype=np.float32)

    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    print('Begining fingerprint calculation...wait')
    fps_topol = [FingerprintMols.FingerprintMol(x) for x in molecules]
    fps_maccs = [MACCSkeys.GenMACCSKeys(x) for x in molecules]
    fps_pairs = [Pairs.GetAtomPairFingerprint(x) for x in molecules]
    fps_tts = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
        for x in molecules
    ]
    fps_ecfp4 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
        for x in molecules
    ]
    fps_ecfp6 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
        for x in molecules
    ]
    fps_fcfp4 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024,
                                              useFeatures=True)
        for x in molecules
    ]
    fps_fcfp6 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024,
                                              useFeatures=True)
        for x in molecules
    ]
    print('Begining fingerprint calculation...done\n')

    # Re-align the fingerprint lists with the input positions: put a
    # placeholder where a molecule failed to parse. ``bad`` is ascending, so
    # inserting at the original index restores the original layout. The
    # placeholders are never read because bad positions are skipped below.
    for fps in (fps_topol, fps_maccs, fps_pairs, fps_tts, fps_ecfp4,
                fps_ecfp6, fps_fcfp4, fps_fcfp6):
        for j in bad:
            fps.insert(j, None)

    print('Begining of fingerprints similarity calculation\n')
    # Value of the weighted blend below when every similarity is 1; dividing
    # by it rescales the blended score so that its maximum is 1.
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)

    bad_positions = set(bad)  # O(1) membership inside the quadratic loop
    # Only the lower triangle (diagonal included) is evaluated.
    total_pairs = n * (n + 1) // 2
    # Report roughly every 10 %; never 0, which would break the modulo for
    # very short inputs.
    report_every = max(1, round(total_pairs / 10))
    k = 0
    for i in range(n):
        for j in range(i + 1):
            if i not in bad_positions and j not in bad_positions:
                similarities_topol = DataStructs.FingerprintSimilarity(
                    fps_topol[i], fps_topol[j])
                similarities_maccs = DataStructs.FingerprintSimilarity(
                    fps_maccs[i], fps_maccs[j])
                similarities_pairs = DataStructs.DiceSimilarity(
                    fps_pairs[i], fps_pairs[j])
                similarities_tts = DataStructs.DiceSimilarity(
                    fps_tts[i], fps_tts[j])
                similarities_ecfp4 = DataStructs.FingerprintSimilarity(
                    fps_ecfp4[i], fps_ecfp4[j])
                similarities_ecfp6 = DataStructs.FingerprintSimilarity(
                    fps_ecfp6[i], fps_ecfp6[j])
                similarities_fcfp4 = DataStructs.FingerprintSimilarity(
                    fps_fcfp4[i], fps_fcfp4[j])
                similarities_fcfp6 = DataStructs.FingerprintSimilarity(
                    fps_fcfp6[i], fps_fcfp6[j])
                score = (
                    0.5 *
                    (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                    0.5 *
                    (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                    0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                    similarities_maccs / 0.85 + similarities_topol / 0.75) / 5
                # The matrix is symmetric: mirror into the upper triangle.
                similarity[i, j] = score
                similarity[j, i] = score
            k += 1
            if k % report_every == 0:
                print('running:', (k / total_pairs) * 100, '%')
        #for other similarity metrics use for example DataStructs.FingerprintSimilarity(fps[0],fps[1], metric=DataStructs.DiceSimilarity)

    similarity = similarity / maxs
    # Mark every row/column of an unparsable SMILES with the sentinel 10.
    similarity[bad, :] = 10
    similarity[:, bad] = 10

    print('End of fingerprints similarity calculation')

    return pd.DataFrame(similarity)
    m, 3, useFeatures=True
)
# Morgan fingerprints as bit vectors of `longbits` bits: radius 2 and 3,
# first with default atom invariants, then with useFeatures=True.
# NOTE: `longbits` (like `nbits` / `n_cas_bits` below) is looked up when the
# lambda is called, not when it is registered here.
fpdict["lecfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=longbits
)
fpdict["lecfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=longbits
)
fpdict["lfcfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=longbits
)
fpdict["lfcfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=longbits
)
# MACCS keys, atom-pair and topological-torsion fingerprints, each called
# with the molecule as the only argument.
fpdict["maccs"] = lambda m: MACCSkeys.GenMACCSKeys(m)
fpdict["ap"] = lambda m: Pairs.GetAtomPairFingerprint(m)
fpdict["tt"] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
# Hashed atom-pair fingerprint as a bit vector of `nbits` bits.
fpdict[
    "hashap"
] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits
)
# Same hashed atom-pair fingerprint, but sized by `n_cas_bits`.
fpdict[
    "hashap_cas_length"
] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=n_cas_bits
)
fpdict[
    "hashtt"
] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
    m, nBits=nbits