Python DataStructs.BulkTanimotoSimilarity Examples

Programming Language: Python

Namespace/Package Name: rdkit.Chem

Class/Type: DataStructs

Method/Function: BulkTanimotoSimilarity

Examples at hotexamples.com: 9

Python DataStructs.BulkTanimotoSimilarity - 9 examples found. These are the top-rated real-world Python examples of rdkit.Chem.DataStructs.BulkTanimotoSimilarity extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ConvertToNumpyArray(30)

TanimotoSimilarity(15)

FingerprintSimilarity(12)

BulkTanimotoSimilarity(9)

AllProbeBitsMatch(3)

DiceSimilarity(3)

ExplicitBitVect(2)

BulkCosineSimilarity(1)

BulkDiceSimilarity(1)

BulkTverskySimilarity(1)

CosineSimilarity(1)

CreateFromBinaryText(1)

FoldFingerprint(1)

OnBitsInCommon(1)

Example #1

Show file

    def __call__(self, smiles: List[str]) -> dict:
        """Score SMILES strings by their closest match in the reference set.

        Every SMILES is parsed with ``Chem.MolFromSmiles``; entries for which
        parsing returns ``None`` keep a score of 0. For the remaining ones the
        highest Tanimoto similarity against ``self.ref_fps`` is mapped through
        ``max(1 - 2 * |0.5 - s|, 0)``, so a similarity of 0.5 scores 1 and
        similarities of 0 or 1 score 0.

        Args:
            smiles: SMILES strings to score.

        Returns:
            A dict with the single key ``"total_score"`` holding a float32
            array with one entry per input SMILES, in input order.
        """
        parsed = [Chem.MolFromSmiles(smi) for smi in smiles]
        # Positions (in the input list) of the molecules that parsed.
        kept_positions = [
            pos for pos, mol in enumerate(parsed) if mol is not None
        ]

        count_fps = [
            AllChem.GetMorganFingerprint(
                parsed[pos], 3, useCounts=True, useFeatures=False)
            for pos in kept_positions
        ]

        best_similarity = np.array([
            np.max(DataStructs.BulkTanimotoSimilarity(count_fp, self.ref_fps))
            for count_fp in count_fps
        ])
        # Triangular transform peaking at 0.5, clamped at zero.
        transformed = np.maximum(1 - 2 * np.abs(0.5 - best_similarity), 0)

        # Unparsable inputs are never written to and therefore stay at 0.
        score = np.zeros(len(smiles), dtype=np.float32)
        for pos, value in zip(kept_positions, transformed):
            score[pos] = value

        return {"total_score": score.astype(np.float32)}

Example #2

Show file

def ClusterFps(fps, similaridad):
    """Cluster fingerprints with Butina, given a similarity threshold.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        similaridad: Tanimoto similarity threshold; it is converted to the
            distance cutoff ``1 - similaridad`` handed to Butina.

    Returns:
        Whatever ``Butina.ClusterData`` returns for the computed distances.
    """
    distance_cutoff = 1 - similaridad
    n_fps = len(fps)

    # Flattened lower triangle of the distance matrix: for each fingerprint,
    # the distances to all fingerprints that precede it.
    lower_triangle = []
    for row in range(1, n_fps):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
        lower_triangle += [1 - sim for sim in similarities]

    return Butina.ClusterData(
        lower_triangle, n_fps, distance_cutoff, isDistData=True)

Example #3

Show file

File: Tanimoto_Cytoscape.py Project: storari04/QSAR

def Tanimotosimilarity(threshold=0.7, out_path="hogehoge.txt"):
    """Write every pair of molecules whose Tanimoto similarity meets a threshold.

    Morgan fingerprints (radius 2) are computed for the module-level ``mols``
    and each molecule is compared against all molecules that precede it.
    For every pair at or above ``threshold`` one line is written, made of the
    two molecules' "ID" properties separated by the literal column ``sim``
    (tab-delimited).

    Args:
        threshold: Minimum Tanimoto similarity for a pair to be written.
        out_path: Path of the text file to (over)write.
    """
    fps = [AllChem.GetMorganFingerprint(m, 2) for m in mols]

    # ``with`` closes the file even if a molecule is missing its "ID" property.
    with open(out_path, "w") as out:
        # Start at 1: the first molecule has no predecessors to compare with.
        for i in range(1, len(fps)):
            tsims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
            # tsims[j] is the similarity between molecule i and molecule j,
            # so each value has to be tested individually.
            for j, sim in enumerate(tsims):
                if sim >= threshold:
                    out.write("%s\tsim\t%s\n" %
                              (mols[i].GetProp("ID"), mols[j].GetProp("ID")))

Example #4

Show file

def calcDistMatrix(df, distMeasure):
    """Build the symmetric distance matrix between all pairs of rows of ``df``.

    Only ``distMeasure == 'Tanimoto'`` is implemented; distances are taken
    from ``DataStructs.BulkTanimotoSimilarity`` with ``returnDistance=1`` on
    the fingerprints stored in ``df.FP`` (the original author's note mentions
    Morgan2 fingerprints -- confirm against the caller).

    Args:
        df: Table-like object with a ``FP`` column supporting ``.iloc``.
        distMeasure: Name of the distance measure.

    Returns:
        A ``len(df) x len(df)`` array with a zero diagonal, or ``None`` (after
        printing a message) when ``distMeasure`` is not supported.
    """
    n_rows = len(df)
    dists = np.zeros([n_rows, n_rows])

    if distMeasure != 'Tanimoto':
        print(distMeasure, 'distance metric not implemented.')
        return

    for i in range(1, n_rows):
        to_previous = DataStructs.BulkTanimotoSimilarity(
            df.FP.iloc[i], list(df.FP.iloc[:i]), returnDistance=1)
        # Fill row i and column i at once so the matrix stays symmetric.
        dists[i, :i] = to_previous
        dists[:i, i] = to_previous
    return dists

Example #5

Show file

File: herg.py Project: shunsunsun/HERG-QSAR

def ClusterFps(fps, cutoff=0.2):
    """Group fingerprints into Butina clusters based on Tanimoto distance.

    Per the original author's note, this groups molecules by their similarity
    to other compounds in the HERG set, as a way around not having a single
    series of related compounds.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        cutoff: Distance cutoff handed to ``Butina.ClusterData``.

    Returns:
        Whatever ``Butina.ClusterData`` returns for the computed distances.
    """
    n_fps = len(fps)

    # Flattened lower-triangle distance matrix (distance = 1 - similarity),
    # built row by row against the fingerprints that come earlier.
    flat_distances = [
        1 - similarity
        for row in range(1, n_fps)
        for similarity in DataStructs.BulkTanimotoSimilarity(
            fps[row], fps[:row])
    ]

    return Butina.ClusterData(flat_distances, n_fps, cutoff, isDistData=True)

Example #6

Show file

def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Group molecules with the Butina algorithm on Tanimoto distances.

    Args:
        mols: Molecules to cluster.
        cutoff: Cutoff passed to the Butina clustering. Defaults to 0.2.
        feature_fn: Callable taking a ``Chem.rdchem.Mol`` and returning the
            features compared with Tanimoto. When None, ``dm.to_fp`` with
            ``as_array=False`` is used.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization; set to None to use all available cores.

    Returns:
        A ``(cluster_indices, cluster_mols)`` pair: the clusters of indices
        produced by ``Butina.ClusterData`` and, for each cluster, the
        corresponding molecules picked from ``mols``.
    """
    featurizer = feature_fn
    if featurizer is None:
        featurizer = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    n_mols = len(mols)
    # Flattened lower triangle: distances of each molecule to its predecessors.
    flat_distances = []
    for row in range(1, n_mols):
        flat_distances.extend(
            DataStructs.BulkTanimotoSimilarity(
                features[row], features[:row], returnDistance=True))

    cluster_indices = Butina.ClusterData(
        flat_distances, n_mols, cutoff, isDistData=True)

    grouped = []
    for cluster in cluster_indices:
        members = operator.itemgetter(*cluster)(mols)
        # itemgetter yields a bare molecule for a one-index cluster; wrap it
        # so that a singleton cluster is still a list.
        if isinstance(members, Chem.rdchem.Mol):
            members = [members]
        grouped.append(members)

    return cluster_indices, grouped

Example #7

Show file

File: functions.py Project: InformaticsMatters/rdkit-compose

def tanimoto(mol, lib_in):
    """Return the bulk Tanimoto similarities of ``mol`` against ``lib_in``.

    Thin wrapper: both arguments are forwarded unchanged to
    ``DataStructs.BulkTanimotoSimilarity`` and its result is returned as is.
    """
    similarities = DataStructs.BulkTanimotoSimilarity(mol, lib_in)
    return similarities

Example #8

Show file

 def _calculate_tanimoto(self, query_fps, ref_fingerprints) -> np.ndarray:
     """Return, for each query fingerprint, its best Tanimoto match.

     Args:
         query_fps: Iterable of fingerprints to score.
         ref_fingerprints: Reference fingerprints every query is compared to.

     Returns:
         Array with one value per query: the maximum of
         ``DataStructs.BulkTanimotoSimilarity(query, ref_fingerprints)``.
     """
     best_matches = [
         np.max(DataStructs.BulkTanimotoSimilarity(fp, ref_fingerprints))
         for fp in query_fps
     ]
     return np.array(best_matches)

Example #9

Show file

        fp_bulk.append(fp1)

# For every molecule of dataset2, summarise its Tanimoto similarity to the
# fingerprints collected in fp_bulk, then save the summary table as CSV.
for idy, parms_y in dataset2.iterrows():
    # Column lookups stay outside the try block: a missing column should fail
    # loudly instead of being reported as an invalid SMILES on every row.
    smi_y = parms_y['SMILES']
    exp_solb = parms_y['measured']

    try:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        mol_y = Chem.MolFromSmiles(smi_y)
        if mol_y is not None:
            AllChem.Compute2DCoords(mol_y)
            fp2 = AllChem.GetMorganFingerprintAsBitVect(mol_y, 2, 2048)
    except Exception:
        # Best effort: any row RDKit cannot process is skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        mol_y = None

    if mol_y is None:
        # %-formatting also copes with non-string values such as NaN.
        print("%s was not valid SMILES\n" % (smi_y,))
        continue

    Tan = np.asarray(DataStructs.BulkTanimotoSimilarity(fp2, fp_bulk))
    Tan_mean = Tan.mean()
    Tan_min = Tan.min()
    Tan_max = Tan.max()
    Tan_bulk.append([smi_y, exp_solb, Tan_mean, Tan_min, Tan_max])
    print("Tan=  %s" % (Tan,))
Tan_pd = pd.DataFrame(Tan_bulk,
                      columns=[
                          'smi', 'experimental_solubilty', 'smlty_mean',
                          'smlty_min', 'smlty_max'
                      ])
pd_file = '%s_%s_smlty.csv' % (args.dataset1, args.dataset2)
Tan_pd.to_csv(pd_file, index=False)
print('mean= %s' % (Tan_pd['smlty_mean'].mean()))