Example #1
0
    def testNonUniqueCrash(self):
        """Run the pickers on a pool in which every bit vector occurs twice.

        The pool is built by appending each random ``ExplicitBitVect`` two
        times. The MaxMin picker is exercised through both ``LazyPick`` (with a
        Python distance callback) and ``LazyBitVectorPick``; both must return
        ``N`` picks and agree with each other. Finally the hierarchical
        cluster picker is run on the explicit distance matrix; that call only
        has to complete.
        """
        from rdkit import DataStructs
        sz = 10
        nbits = 20
        nBitsToSet = int(nbits * .3)
        N = 12
        vs = []
        for _ in range(sz):
            bv = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nBitsToSet):
                bv.SetBit(int(nbits * random.random()))
            # the same object is appended twice on purpose: duplicates are the
            # situation this test is about
            vs.append(bv)
            vs.append(bv)

        def taniFunc(i, j, bvs=vs):
            # distance = 1 - similarity
            return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])

        picker = rdSimDivPickers.MaxMinPicker()
        try:
            mm1 = picker.LazyPick(taniFunc, len(vs), N)
        except Exception as exc:
            # report the real error instead of an anonymous "False is not true"
            self.fail('LazyPick raised on duplicated input: %r' % (exc, ))
        self.assertEqual(len(mm1), N)
        picker = None

        picker = rdSimDivPickers.MaxMinPicker()
        try:
            mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
        except Exception as exc:
            self.fail('LazyBitVectorPick raised on duplicated input: %r' %
                      (exc, ))
        self.assertEqual(len(mm2), N)
        self.assertEqual(tuple(mm2), tuple(mm1))
        picker = None

        # flat list of the pairwise distances for i < j, row by row
        nvs = len(vs)
        ds = [taniFunc(i, j) for i in range(nvs) for j in range(i + 1, nvs)]
        m = numpy.array(ds)
        picker = rdSimDivPickers.HierarchicalClusterPicker(
            rdSimDivPickers.ClusterMethod.WARD)
        # the result is not inspected; the call itself is the check
        p1 = list(picker.Pick(m, nvs, N))
Example #2
0
  def test0MaxMin(self):
    """MaxMin picking from the stored distance matrix, eager and lazy.

    Checks that explicitly supplied first picks are honoured, that the lazy
    picker (with and without its cache) reproduces the eager result, that
    out-of-range first picks raise ``ValueError``, and that picking without
    first picks returns a non-empty result.
    """
    seeds = (886, 112)
    picker = rdSimDivPickers.MaxMinPicker()
    eager = picker.Pick(self.dMat, self.n, self.m, seeds)
    self.assertEqual(eager[0], 886)
    self.assertEqual(eager[1], 112)

    def lookup(i, j):
      # distance callback backed by the flat matrix; the larger index
      # selects the row, the smaller one the column
      if i == j:
        return 0.0
      row, col = (j, i) if i < j else (i, j)
      return self.dMat[row * (row - 1) // 2 + col]

    lazy = picker.LazyPick(lookup, self.n, self.m, seeds)
    self.assertEqual(list(lazy), list(eager))

    lazy = picker.LazyPick(lookup, self.n, self.m, seeds, useCache=False)
    self.assertEqual(list(lazy), list(eager))

    # first picks outside the valid range are rejected
    with self.assertRaises(ValueError):
      picker.Pick(self.dMat, self.n, self.m, (1012, ))
    with self.assertRaises(ValueError):
      picker.Pick(self.dMat, self.n, self.m, (-1, ))

    eager = picker.Pick(self.dMat, self.n, self.m)
    self.assertTrue(eager)
    lazy = picker.LazyPick(lookup, self.n, self.m)
    self.assertTrue(lazy)
Example #3
0
  def testBitVectorMaxMin(self):
    """LazyPick and LazyBitVectorPick must agree for a fixed seed.

    A pool of random bit vectors is picked from four ways (callback or bit
    vector API, cache on or off), always with ``seed=42``; every variant must
    return ``N`` picks identical to the first run.
    """
    from rdkit import DataStructs
    sz = 100
    nbits = 200
    nBitsToSet = int(nbits * .1)
    N = 10

    pool = []
    for _ in range(sz):
      vect = DataStructs.ExplicitBitVect(nbits)
      for _ in range(nBitsToSet):
        vect.SetBit(int(nbits * random.random()))
      pool.append(vect)

    def dist(i, j, bvs=pool):
      return DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)

    picker = rdSimDivPickers.MaxMinPicker()
    npool = len(pool)

    reference = picker.LazyPick(dist, npool, N, seed=42)
    self.assertEqual(len(reference), N)

    # each variant is run and checked before the next one is started
    variants = (
      lambda: picker.LazyPick(dist, npool, N, useCache=False, seed=42),
      lambda: picker.LazyBitVectorPick(pool, npool, N, seed=42),
      lambda: picker.LazyBitVectorPick(pool, npool, N, useCache=False, seed=42),
    )
    for run in variants:
      picks = run()
      self.assertEqual(len(picks), N)
      self.assertEqual(list(reference), list(picks))
Example #4
0
def pick_diverse_set(df, num_mols):
    """Pick ``num_mols`` rows of ``df`` with the MaxMin diversity picker.

    The frame's index is reset first. If ``num_mols`` is at least the number
    of rows, the whole (re-indexed) frame is returned and no distance matrix
    is computed. Otherwise the ``mbv_fp`` column is turned into a distance
    matrix with ``GetTanimotoDistMat`` and the rows chosen by
    ``MaxMinPicker.Pick`` are returned.

    Args:
        df: DataFrame with an ``mbv_fp`` column (presumably fingerprints
            accepted by ``GetTanimotoDistMat`` -- confirm against the caller).
        num_mols: number of rows requested.

    Returns:
        The re-indexed frame, or the picked subset of it.
    """
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('Resetting the index of the data')
    dfr = df.reset_index()
    n_mols = dfr.shape[0]

    print('Ther are {} molecules to calculate a distance matrix for'.format(
        n_mols))

    if num_mols >= n_mols:
        # nothing to pick from: skip the distance matrix altogether
        print('You are requesting more molecules than made it through the filters.')
        print('Returning all the molecules.')
        return dfr

    # Calculate Tanimoto Distance Matrix for the remaining molecules
    dm = GetTanimotoDistMat(dfr.mbv_fp.tolist())
    picker = rdSimDivPickers.MaxMinPicker()

    # Should probably report some statistics here
    ids = picker.Pick(dm, n_mols, num_mols)

    # after reset_index() labels and positions coincide, so positional
    # selection is equivalent to the removed ``.ix`` indexer here
    dfo = dfr.iloc[list(ids)]

    print('The diversity picker has selected {} molecules'.format(
        dfo.shape[0]))
    print('...')
    return dfo
Example #5
0
    def test0MaxMin(self):
        """MaxMin picking from the stored distance matrix, eager and lazy.

        Checks that explicitly supplied first picks are honoured, that the
        lazy picker reproduces the eager result, that out-of-range first picks
        raise ``ValueError``, and that picking without first picks returns a
        non-empty result.
        """
        pkr = rdSimDivPickers.MaxMinPicker()
        maxmin = pkr.Pick(self.dMat, self.n, self.m, (886, 112))
        self.assertEqual(maxmin[0], 886)
        self.assertEqual(maxmin[1], 112)

        def func(i, j):
            if i == j:
                return 0.0
            if i < j:
                j, i = i, j
            # floor division keeps the flat index an integer on Python 3
            return self.dMat[i * (i - 1) // 2 + j]

        lmaxmin = pkr.LazyPick(func, self.n, self.m, (886, 112))
        self.assertEqual(list(lmaxmin), list(maxmin))

        # first picks outside the valid range are rejected
        self.assertRaises(
            ValueError, lambda: pkr.Pick(self.dMat, self.n, self.m, (1012, )))
        self.assertRaises(
            ValueError, lambda: pkr.Pick(self.dMat, self.n, self.m, (-1, )))

        maxmin = pkr.Pick(self.dMat, self.n, self.m)
        self.assertTrue(maxmin)
        lmaxmin = pkr.LazyPick(func, self.n, self.m)
        self.assertTrue(lmaxmin)
Example #6
0
    def testBitVectorMaxMin4(self):
        """Threshold behaviour of ``LazyBitVectorPickWithThreshold``.

        With a threshold of -1.0 all 20 requested picks are returned together
        with the reported threshold; with 0.91 only the first 13 of those
        picks come back and the reported threshold is at least 0.91.
        """
        path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers',
                            'Wrap', 'test_data', 'chembl_cyps.head.fps')
        with open(path) as handle:
            fps = [
                DataStructs.CreateFromFPSText(row.strip()) for row in handle
            ]

        expected = [
            374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630,
            881, 516, 497, 412, 718, 869, 407
        ]
        picker = rdSimDivPickers.MaxMinPicker()

        picks, reached = picker.LazyBitVectorPickWithThreshold(
            fps, len(fps), 20, -1.0, seed=42)
        self.assertEqual(list(picks), expected)
        self.assertAlmostEqual(reached, 0.8977, 4)

        picks, reached = picker.LazyBitVectorPickWithThreshold(
            fps, len(fps), 20, 0.91, seed=42)
        # same leading picks, cut off after the 13th
        self.assertEqual(list(picks), expected[:13])
        self.assertGreaterEqual(reached, 0.91)
Example #7
0
    def testBitVectorMaxMin3(self):
        """Seeded ``LazyBitVectorPick`` on the fingerprint test file.

        The 20 picks for ``seed=42`` are compared with a stored list, and the
        same list must come back when its first five entries are passed in as
        ``firstPicks``.
        """
        path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers',
                            'Wrap', 'test_data', 'chembl_cyps.head.fps')
        with open(path) as handle:
            fps = [
                DataStructs.CreateFromFPSText(row.strip()) for row in handle
            ]

        expected = [
            374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630,
            881, 516, 497, 412, 718, 869, 407
        ]
        picker = rdSimDivPickers.MaxMinPicker()

        picks = list(picker.LazyBitVectorPick(fps, len(fps), 20, seed=42))
        self.assertEqual(picks, expected)

        picks = list(
            picker.LazyBitVectorPick(fps,
                                     len(fps),
                                     20,
                                     firstPicks=expected[:5],
                                     seed=42))
        self.assertEqual(picks, expected)
Example #8
0
    def testNonUniqueCrash(self):
        """Run the pickers on a pool in which every bit vector occurs twice.

        The pool is built by appending each random ``ExplicitBitVect`` two
        times. ``LazyPick`` and two independent ``LazyBitVectorPick`` runs
        must each return ``N`` picks, and at least one of the bit vector runs
        must differ from the ``LazyPick`` result. Finally the hierarchical
        cluster picker is run on the explicit distance matrix; that call only
        has to complete.
        """
        from rdkit import DataStructs
        sz = 300
        nbits = 40
        nBitsToSet = int(nbits * .3)
        N = 8
        vs = []
        for _ in range(sz):
            bv = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nBitsToSet):
                bv.SetBit(int(nbits * random.random()))
            # the same object is appended twice on purpose: duplicates are the
            # situation this test is about
            vs.append(bv)
            vs.append(bv)

        def taniFunc(i, j, bvs=vs):
            # distance = 1 - similarity
            return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])

        picker = rdSimDivPickers.MaxMinPicker()
        mm1 = picker.LazyPick(taniFunc, len(vs), N)
        self.assertEqual(len(mm1), N)
        picker = None

        picker = rdSimDivPickers.MaxMinPicker()
        mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm2), N)

        picker = rdSimDivPickers.MaxMinPicker()
        mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
        self.assertEqual(len(mm3), N)

        # we get the occasional dupe randomly,
        # make sure we don't get three dupes in a row
        # (the whole disjunction has to be inside the assertion, otherwise a
        # single coincidence between mm1 and mm2 already fails the test)
        self.assertTrue(
            tuple(mm2) != tuple(mm1) or tuple(mm3) != tuple(mm1))
        picker = None

        # flat list of the pairwise distances for i < j, row by row
        nvs = len(vs)
        ds = [taniFunc(i, j) for i in range(nvs) for j in range(i + 1, nvs)]
        m = numpy.array(ds)
        picker = rdSimDivPickers.HierarchicalClusterPicker(
            rdSimDivPickers.ClusterMethod.WARD)
        # the result is not inspected; the call itself is the check
        p1 = list(picker.Pick(m, nvs, N))
Example #9
0
  def testBitVectorMaxMin2(self):
    """Seeded ``LazyBitVectorPick`` on a fixed set of 20-bit fingerprints.

    The picks for ``seed=42`` are compared with a stored list, and a second
    run with the cache switched off must return the same picks.
    """
    bit_strings = [
      "11110010101000000000", "00000000000010010000", "11001010000000000001",
      "00100110101000001000", "01010110000100011001", "11000110101001000011",
      "00000000001100001111", "00011110110000001101", "00000011011110100010",
      "11000010110001000000", "00000100010000010000", "10000001000010110010",
      "00010010000000010100", "00011100100110101000", "10001001100110100000",
      "10000110100110010000", "00101110000101000000", "11011101100011100000",
      "10000110000100101000", "00101000100000010001", "01000001000010000000",
      "00101101010100000110", "10001000100110110001", "00011000010100000001",
      "00101000001000100011", "00010000100010011001", "01100001000100010001",
      "10000101000001101101", "00001000011001011000", "11110000100100100000",
      "10100110000000011010", "00110100010110010010", "00000000000001010010",
      "00100000000010100001", "11110011000010001000", "10110001010100001000",
      "00001100100110011011", "00010010100100001110", "10100101100010100010",
      "01100100010100000001", "10101110011100000000", "01011000000001000001",
      "00000011100110100010", "01100001010001001001", "00001000000001001100",
      "10011001110000000100", "10110000001001100100", "00011000000001001011",
      "11001011010001100010", "10010000000001001011", "00010000100111100000",
      "00001000001110001000", "11010000010001100110", "01101001100000111000",
      "01001000001110111000", "10000000000100010010", "11001000010010000000",
      "01010010000100110001", "00010001010100100001", "01110010000000010000",
      "10001010000011000001", "00000110000000100100", "00010000010001000000",
      "11101100011010000011", "00000010100001010001", "00010000110010000101",
      "00010001001000111001", "01000010001100100110", "00110110000000100001",
      "00100010010110110010", "01000000110011001111", "00011000001000110010",
      "01111010101000110100", "00001010000010110110", "00110011000011011010",
      "00111010111010000110", "00010011101010000011", "00000001011000010000",
      "00011011101110110000", "00010001101000000001", "00010000001010011010",
      "00000010100100100010", "00000010001011000100", "11010000000001011100",
      "00001000110101000001", "00000010000000110010", "10000000010011000001",
      "11110110100100010000", "10001111000110001001", "00100110000110000100",
      "00000100100000100100", "00110000101100010100", "00001010100000100000",
      "01011000000011000111", "00010000100001010001", "10000010100000010000",
      "00001000000000110010", "00001000101011010001", "00011110000100100000",
      "11001001010001010100"
    ]
    N = 5
    vects = []
    for text in bit_strings:
      vects.append(DataStructs.CreateFromBitString(text))

    picker = rdSimDivPickers.MaxMinPicker()
    cached = picker.LazyBitVectorPick(vects, len(vects), N, seed=42)
    self.assertEqual(len(cached), N)
    self.assertEqual(list(cached), [37, 1, 43, 38, 16])

    # same request without the distance cache
    uncached = picker.LazyBitVectorPick(vects, len(vects), N, useCache=False, seed=42)
    self.assertEqual(len(uncached), N)
    self.assertEqual(list(cached), list(uncached))
Example #10
0
    def get_divconfs_ids(self, X, S, Ngen, Nkep, atmlist=[]):
        """Return the sorted ids of ``Nkep`` conformers picked by MaxMin.

        For each of the ``Ngen`` conformers the atomic environment vectors of
        the selected atoms are concatenated into one row; squared Euclidean
        distances between the rows feed ``MaxMinPicker.Pick``, which is seeded
        with five randomly chosen conformers.

        Args:
            X: conformers handed to ``self.nc.setConformers``.
            S: per-atom symbols; also used to find the non-'H' atoms.
            Ngen: number of conformers to describe.
            Nkep: number of conformers to pick.
            atmlist: atom indices to use; when empty, every atom whose symbol
                is not 'H' is used.
        """
        if len(atmlist) > 0:
            atoms = atmlist
        else:
            atoms = [pos for pos, sym in enumerate(S) if sym != 'H']

        self.nc.setConformers(confs=X, types=list(S))
        self.nc.energy()  # this generates AEVs

        width = self.avs
        aevs = np.empty([Ngen, len(atoms) * width])
        for conf in range(Ngen):
            for slot, atom in enumerate(atoms):
                start = slot * width
                aevs[conf, start:start + width] = self.nc.atomicenvironments(atom, conf).copy()

        dm = scispc.distance.pdist(aevs, 'sqeuclidean')
        picker = rdSimDivPickers.MaxMinPicker()

        # shuffle all conformer ids and take the leading five as first picks
        order = list(range(Ngen))
        np.random.shuffle(order)
        return sorted(picker.Pick(dm, Ngen, Nkep, firstPicks=order[:5]))
Example #11
0
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom
# Pick m of n random 2D points with the MaxMin picker.
RDRandom.seed(23)

pkr = rdsimdiv.MaxMinPicker()

n = 1000
m = 80
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# (floor division: the array size and the flat indices must be integers)
# NOTE(review): entries are stored row by row over the pairs i < j; confirm
# that this is the ordering Pick expects.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        idx = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1

        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[idx] = dist

# now do the picking
res = pkr.Pick(distMat, n, m)
Example #12
0
    data = hdn.read_rcdb_coordsandnm(idir + f)
    l_dat.append(data)

    spc = data["species"]
    xyz = data["coordinates"]

    nc.setConformers(confs=xyz.reshape(1, len(spc), 3), types=list(spc))
    Ecmp = nc.energy()

    for i, a in enumerate(rcatoms):
        aevs[m,
             i * aevsize:(i + 1) * aevsize] = nc.atomicenvironments(a,
                                                                    0).copy()

# Squared Euclidean distances between the rows of aevs, then a MaxMin pick of
# Nk rows seeded with ten randomly chosen rows.
dm = scispc.distance.pdist(aevs, 'sqeuclidean')
picker = rdSimDivPickers.MaxMinPicker()
seed_list = list(range(aevs.shape[0]))
np.random.shuffle(seed_list)
print('seed:', seed_list)
ids = set(picker.Pick(dm, aevs.shape[0], Nk, firstPicks=list(seed_list[0:10])))
# entries of inclist are always kept, whether or not they were picked
ids.update(set(inclist))
ids = list(ids)
print(ids)
ids.sort()

# One kept file name per line; the context manager closes (and thereby
# flushes) the file even if the loop raises.
with open(cdir + 'kept_data.nfo', 'w') as of:
    for i in ids:
        # NOTE(review): data is not used inside this loop; the assignment is
        # kept in case later code reads it.
        data = l_dat[i]
        f = files[i]
        of.write(f + '\n')
Example #13
0
    def compute_diverse(self, xyz, spc, index, P, aevsize):
        """Split the positions ``0 .. index.size - 1`` into picked and remaining.

        When enough items are available (more than 4 to pick and more than 8
        in total), a random candidate sample is described by the atomic
        environment vectors of its non-'H' atoms and ``Nk`` candidates are
        chosen with the MaxMin picker on cosine distances. Otherwise a plain
        random split with probability ``P`` is used.

        Args:
            xyz: coordinates, indexed by position to select conformers.
            spc: per-atom symbols.
            index: array whose ``size`` gives the number of items; only its
                size is used here.
            P: fraction of items to pick (capped at 500 items).
            aevsize: length of one atomic environment vector.

        Returns:
            ``(cur_index, new_index)``: positions of the picked items and of
            all remaining items. Both are empty arrays when ``index`` is empty.
        """
        Nk = int(np.floor(P * float(index.size)))
        Nk = Nk if Nk < 500 else 500

        if Nk > 4 and index.size > 8:
            # Array of random floats from 0 to 1
            selection = np.random.uniform(low=0.0, high=1.0, size=index.size)

            # candidate fraction: everything below 1000 items, otherwise
            # about 1000 items on average
            Pt = 1.0 if index.size < 1000 else 1000 / float(index.size)

            # candidates for diversity picking / items passed over untouched
            div_idx = np.array([n for n, i in enumerate(selection) if i <= Pt])
            pas_idx = np.array([n for n, i in enumerate(selection) if i > Pt])

            Inh = [i for i, s in enumerate(spc) if s != 'H']

            Nm = div_idx.size
            Na = len(spc)

            # candidates are processed in chunks of at most 65000 atoms
            Nit = int(np.ceil(Na * Nm / 65000.0))
            Nmo = int(65000 / Na)

            aevs = np.empty([Nm, len(Inh) * aevsize])

            for j in range(0, Nit):
                # Setup indices
                i1 = j * Nmo
                i2 = min(j * Nmo + Nmo, Nm)

                self.nc.setConformers(confs=xyz[div_idx[i1:i2]],
                                      types=list(spc))
                self.nc.energy()

                # NOTE(review): m runs over candidate rows i1..i2 while only
                # i2 - i1 conformers were just set; confirm that
                # atomicenvironments expects m rather than m - i1 when more
                # than one chunk is needed.
                for i, a in enumerate(Inh):
                    for m in range(i1, i2):
                        aevs[m, i * aevsize:(i + 1) *
                             aevsize] = self.nc.atomicenvironments(a,
                                                                   m).copy()

            dm = scispc.distance.pdist(aevs, 'cosine')
            picker = rdSimDivPickers.MaxMinPicker()
            ids = list(picker.Pick(dm, aevs.shape[0], Nk))

            cur_index = np.array(div_idx[ids])
            # ids are rows of div_idx; map the rows that were not picked back
            # through div_idx so that both results refer to the same positions
            picked = set(ids)
            new_index = np.array(
                [div_idx[k] for k in range(div_idx.size) if k not in picked])

            return cur_index, np.concatenate([new_index, pas_idx])
        elif index.size > 0:
            # Array of random floats from 0 to 1
            selection = np.random.uniform(low=0.0, high=1.0, size=index.size)

            # Obtain the sample
            new_index = np.array([n for n, i in enumerate(selection) if i > P])
            cur_index = np.array(
                [n for n, i in enumerate(selection) if i <= P])
            return cur_index, new_index
        else:
            return np.array([]), np.array([])