Beispiel #1
0
  def _bulkTest(self,bvs):
    """Check that the bulk similarity functions agree with their single-pair versions.

    The first vector of `bvs` is used as the probe and compared against every
    vector in `bvs`. For each metric the test verifies that

      * the bulk result equals the pairwise result,
      * the pairwise function gives the same value when handed the
        ``ToBinary()`` form of the second vector,
      * the same holds when ``returnDistance=True`` is requested.

    Tversky is checked separately with weights (1, 1), where it must also
    equal Tanimoto.

    Arguments:
      - bvs: an indexable collection of bit vectors providing ``ToBinary()``
    """
    probe = bvs[0]
    for metric in 'Tanimoto','Dice','AllBit','OnBit','RogotGoldberg':
      bulk = getattr(DataStructs,f'Bulk{metric}Similarity')
      single = getattr(DataStructs,f'{metric}Similarity')

      # These checks have to live inside the metric loop; when they sit after
      # it, only the last metric in the list is ever exercised.
      sims = bulk(probe,bvs)
      for i, bv in enumerate(bvs):
        sim = single(probe,bv)
        self.assertEqual(sim,sims[i])
        self.assertEqual(sim, single(probe,bv.ToBinary()))

      dists = bulk(probe, bvs, returnDistance=True)
      for i, bv in enumerate(bvs):
        dist = single(probe, bv, returnDistance=True)
        self.assertEqual(dist, dists[i])
        self.assertEqual(dist, single(probe, bv.ToBinary(), returnDistance=True))

    # Tversky with weights (1, 1) must match both pairwise Tversky and Tanimoto.
    sims = DataStructs.BulkTverskySimilarity(probe, bvs, 1, 1)
    for i, bv in enumerate(bvs):
      sim = DataStructs.TverskySimilarity(probe, bv, 1, 1)
      self.assertEqual(sim, sims[i])
      sim = DataStructs.TanimotoSimilarity(probe, bv)
      self.assertEqual(sim, sims[i])

    # Same comparison for the distance form.
    sims = DataStructs.BulkTverskySimilarity(probe, bvs, 1, 1, returnDistance=True)
    for i, bv in enumerate(bvs):
      sim = DataStructs.TverskySimilarity(probe, bv, 1, 1, returnDistance=True)
      self.assertEqual(sim, sims[i])
      sim = DataStructs.TanimotoSimilarity(probe, bv, returnDistance=True)
      self.assertEqual(sim, sims[i])
Beispiel #2
0
  def test6BulkTversky(self):
    """Bulk Tversky similarity on sparse int vectors agrees with the pairwise functions.

    Six randomly filled vectors are built; the first one is the probe. With
    weights (0.5, 0.5) the bulk Tversky values are compared with pairwise
    Tversky and with Dice; with weights (1.0, 1.0) they are compared with
    pairwise and bulk Tanimoto.
    """
    import random
    vectorSize = 10
    entriesPerVector = 5
    numVectors = 6

    vs = []
    for _ in range(numVectors):
      vect = ds.IntSparseIntVect(vectorSize)
      for _ in range(entriesPerVector):
        # An assignment evaluates its right-hand side first, so the value is
        # drawn before the index to consume the random stream in the same order.
        value = random.randint(1, 10)
        position = random.randint(0, vectorSize - 1)
        vect[position] = value
      vs.append(vect)

    probe = vs[0]
    others = vs[1:]

    # weights (0.5, 0.5): Tversky is expected to coincide with Dice
    baseDs = [ds.TverskySimilarity(probe, other, .5, .5) for other in others]
    bulkDs = ds.BulkTverskySimilarity(probe, others, 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(probe, other) for other in others]
    for i, base in enumerate(baseDs):
      self.assertTrue(feq(base, bulkDs[i]))
      self.assertTrue(feq(base, diceDs[i]))

    # weights (1.0, 1.0): Tversky is expected to coincide with Tanimoto
    bulkDs = ds.BulkTverskySimilarity(probe, others, 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(probe, other) for other in others]
    for i, bulkVal in enumerate(bulkDs):
      self.assertTrue(feq(bulkVal, taniDs[i]))
    taniDs = ds.BulkTanimotoSimilarity(probe, others)
    for i, bulkVal in enumerate(bulkDs):
      self.assertTrue(feq(bulkVal, taniDs[i]))
Beispiel #3
0
    def test10BulkOps2(self):
        """Bulk similarity functions called on a tuple of bit vectors match the pairwise ones.

        Ten random 10000-bit vectors are stored in a tuple; the first is the
        probe. Every bulk function is compared, element by element, with the
        corresponding pairwise function. Tversky is additionally compared with
        Tanimoto for weights (1, 1) and with Dice for weights (.5, .5).
        """
        nbits = 10000
        bvs = []
        for _ in range(10):
            bv = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nbits):
                bv.SetBit(random.randrange(0, nbits))
            bvs.append(bv)
        # the point of this test: the bulk functions are handed a tuple
        bvs = tuple(bvs)
        probe = bvs[0]

        # assertTrue replaces the deprecated failUnless alias, which no longer
        # exists in recent unittest versions.
        for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
            bulk = getattr(DataStructs, 'Bulk%sSimilarity' % metric)
            single = getattr(DataStructs, '%sSimilarity' % metric)
            sims = bulk(probe, bvs)
            for i, bv in enumerate(bvs):
                sim = single(probe, bv)
                self.assertTrue(feq(sim, sims[i]))

        # Tversky with weights (1, 1) is compared against Tanimoto.
        sims = DataStructs.BulkTverskySimilarity(probe, bvs, 1, 1)
        for i, bv in enumerate(bvs):
            sim = DataStructs.TverskySimilarity(probe, bv, 1, 1)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.TanimotoSimilarity(probe, bv)
            self.assertTrue(feq(sim, sims[i]))

        # Tversky with weights (.5, .5) is compared against Dice.
        sims = DataStructs.BulkTverskySimilarity(probe, bvs, .5, .5)
        for i, bv in enumerate(bvs):
            sim = DataStructs.TverskySimilarity(probe, bv, .5, .5)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.DiceSimilarity(probe, bv)
            self.assertTrue(feq(sim, sims[i]))
Beispiel #4
0
    def test10BulkOps3(self):
        """Bulk similarity functions called on a numpy array of bit vectors match the pairwise ones.

        Ten random 10000-bit vectors are stored in a numpy array; the first is
        the probe. Every bulk function is compared, element by element, with
        the corresponding pairwise function. Tversky is additionally compared
        with Tanimoto for weights (1, 1) and with Dice for weights (.5, .5).
        """
        nbits = 10000
        numVects = 10
        bvs = numpy.empty((numVects, ), DataStructs.ExplicitBitVect)
        for slot in range(numVects):
            vect = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nbits):
                vect.SetBit(random.randrange(0, nbits))
            bvs[slot] = vect
        probe = bvs[0]

        # Metrics that take no extra parameters, checked in the original order.
        for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
            bulkFn = getattr(DataStructs, 'Bulk' + metric + 'Similarity')
            pairFn = getattr(DataStructs, metric + 'Similarity')
            bulkVals = bulkFn(probe, bvs)
            for idx, other in enumerate(bvs):
                expected = pairFn(probe, other)
                self.assertTrue(feq(expected, bulkVals[idx]))

        # Tversky with weights (1, 1) is compared against Tanimoto.
        bulkVals = DataStructs.BulkTverskySimilarity(probe, bvs, 1, 1)
        for idx, other in enumerate(bvs):
            expected = DataStructs.TverskySimilarity(probe, other, 1, 1)
            self.assertTrue(feq(expected, bulkVals[idx]))
            expected = DataStructs.TanimotoSimilarity(probe, other)
            self.assertTrue(feq(expected, bulkVals[idx]))

        # Tversky with weights (.5, .5) is compared against Dice.
        bulkVals = DataStructs.BulkTverskySimilarity(probe, bvs, .5, .5)
        for idx, other in enumerate(bvs):
            expected = DataStructs.TverskySimilarity(probe, other, .5, .5)
            self.assertTrue(feq(expected, bulkVals[idx]))
            expected = DataStructs.DiceSimilarity(probe, other)
            self.assertTrue(feq(expected, bulkVals[idx]))
Beispiel #5
0
def GetNeighborLists(probes,
                     topN,
                     pool,
                     simMetric=DataStructs.DiceSimilarity,
                     simThresh=-1.,
                     silent=False,
                     **kwargs):
    """Collect, for every probe, the pool entries that are most similar to it.

    Arguments:
      - probes: sequence of items whose element [1] is the probe fingerprint,
        or None when no fingerprint is available for that probe
      - topN: size passed to each TopNContainer when no positive threshold is set
      - pool: iterable of (name, fingerprint) pairs to search
      - simMetric: pairwise similarity function. Dice, Tanimoto and Tversky are
        dispatched to their bulk counterparts; any other callable is invoked
        as simMetric(probeFp, poolFp) for each probe
      - simThresh: only scores strictly greater than this are recorded. When
        it is positive the containers are created with size -1 instead of topN
        (presumably "unbounded" -- confirm against TopNContainer)
      - silent: suppress the progress message logged every 1000 pool rows
      - kwargs: 'tverskyA' / 'tverskyB' give the Tversky weights (default 0.5)

    Returns:
      a list with one TopNContainer per probe, in probe order. Containers of
      probes without a fingerprint stay empty.
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    # indices (into probes / nbrLists) of the probes that have a fingerprint
    validProbes = [idx for idx, pfp in enumerate(probeFps) if pfp is not None]
    validFps = [probeFps[idx] for idx in validProbes]

    containerSize = topN if simThresh <= 0 else -1
    nbrLists = [TopNContainer(containerSize) for _ in probeFps]

    nDone = 0
    for nm, fp in pool:
        nDone += 1
        if not silent and not nDone % 1000:
            logger.info('  searched %d rows' % nDone)

        if simMetric == DataStructs.DiceSimilarity:
            scores = DataStructs.BulkDiceSimilarity(fp, validFps)
        elif simMetric == DataStructs.TanimotoSimilarity:
            scores = DataStructs.BulkTanimotoSimilarity(fp, validFps)
        elif simMetric == DataStructs.TverskySimilarity:
            av = float(kwargs.get('tverskyA', 0.5))
            bv = float(kwargs.get('tverskyB', 0.5))
            scores = DataStructs.BulkTverskySimilarity(fp, validFps, av, bv)
        else:
            # No bulk variant available: score the valid probes one at a time.
            scores = (simMetric(pfp, fp) for pfp in validFps)

        # scores are aligned with validFps, so validProbes maps each one back
        # to its slot in nbrLists. The fallback path used to index validProbes
        # with a probeFps position, which filed hits under the wrong probe (or
        # raised IndexError) whenever a probe had no fingerprint.
        for probeIdx, score in zip(validProbes, scores):
            if score > simThresh:
                nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
Beispiel #6
0
    #generate fp of query_substructs
    qfp = Chem.RDKFingerprint(qmol, maxPath=5, fpSize=1024, nBitsPerHash=2)

    queries.append(qfp)
    query_info.append((info[0], info[1], info[2]))

fragments = len(query_info)

# Screen every molecule read from stdin against all query fingerprints.
# Each input line holds a SMILES and an identifier separated by one
# whitespace character or a comma.
for line in sys.stdin:

    line = line.rstrip()
    if not line:
        # tolerate blank lines (e.g. a trailing newline) instead of failing
        # on the two-value unpack below
        continue
    # raw string: "\s" in a normal string literal is an invalid escape sequence
    smi, id = re.split(r'\s|,', line)

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        sys.stderr.write("Can't generate mol for: %s\n" % (smi))
        continue

    # same fingerprint settings as used for the query fingerprints
    mfp = Chem.RDKFingerprint(mol, maxPath=5, fpSize=1024, nBitsPerHash=2)

    # Tversky with weights (0, 1); the trailing False is presumably the
    # returnDistance flag -- confirm against the RDKit signature
    res = DataStructs.BulkTverskySimilarity(mfp, queries, 0, 1, False)

    #query_frag_smiles,query_smiles,query_id,retrieved_smi,retrieved_id,tversky_sim
    for i in range(fragments):
        if (res[i] >= options.cutoff):
            print("%s,%s,%s,%s,%s,%s" % (query_info[i][2], query_info[i][0],
                                         query_info[i][1], smi, id, res[i]))
Beispiel #7
0
 def similarity_metric(x, y):
     """Return the bulk Tversky similarities of `x` against `y` with weights 1.0 and 1.5."""
     weight_a, weight_b = 1.0, 1.5
     return DataStructs.BulkTverskySimilarity(x, y, weight_a, weight_b)
Beispiel #8
0
        fp = gen_fp(mol)
        fps1.append(fp)
        smiles1.append(smi)

# Stop the clock on target fingerprinting and report the outcome.
t2 = time.time()
print("Fingerprinting targets took:", t2 - t1)
print("Number of targets:", len(fps1))

# Start timing the next phase and create the (initially empty) result lists.
t3 = time.time()
hits, best_scores, best_target = [], [], []
for mol, smi in read_file(p2, True):
    if mol:
        fp2 = gen_fp(mol)
        scores = DataStructs.BulkTverskySimilarity(fp2, fps1, alpha, 1.0 - alpha)
        best_score = 0.0
        best_target = None

        for score, target in zip(scores, smiles1):
            # print(score)
            if score > best_score:
                best_score = score
                best_target = target
        if threshold1 <= best_score <= threshold2:
            print(smi, best_score)
            print(best_target)
            hits.append(smi)

        # for score in scores:
        #     if threshold1 <= score <= threshold2: