def testNonUniqueCrash(self):
    """Check that the pickers survive a pool in which every vector is duplicated.

    Each random bit vector is appended to the pool twice, so the pool holds
    ``2 * sz`` entries of which only ``sz`` are distinct, and ``N`` picks are
    requested from it. The test requires that:

    * ``LazyPick`` and ``LazyBitVectorPick`` both return ``N`` picks without
      raising, and return the same picks as each other;
    * ``HierarchicalClusterPicker.Pick`` runs on the matching condensed
      distance list without raising.
    """
    from rdkit import DataStructs

    sz = 10
    nbits = 20
    nBitsToSet = int(nbits * .3)
    N = 12

    vs = []
    for _ in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nBitsToSet):
            bv.SetBit(int(nbits * random.random()))
        # Deliberately add the same vector twice to create non-unique entries.
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])

    picker = rdSimDivPickers.MaxMinPicker()
    try:
        mm1 = picker.LazyPick(taniFunc, len(vs), N)
    except Exception as exc:
        # Report the real cause instead of a bare "False is not true", and
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would.
        self.fail('LazyPick raised on non-unique input: %r' % (exc, ))
    self.assertEqual(len(mm1), N)

    picker = None
    picker = rdSimDivPickers.MaxMinPicker()
    try:
        mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    except Exception as exc:
        self.fail('LazyBitVectorPick raised on non-unique input: %r' % (exc, ))
    self.assertEqual(len(mm2), N)
    self.assertEqual(tuple(mm2), tuple(mm1))

    picker = None
    # Condensed list of pairwise distances for all i < j, in row order.
    nvs = len(vs)
    ds = []
    for i in range(nvs):
        for j in range(i + 1, nvs):
            ds.append(taniFunc(i, j))
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    # Only the absence of a crash is checked here; the picks are not inspected.
    list(picker.Pick(m, nvs, N))
def test0MaxMin(self):
    """Exercise MaxMinPicker.Pick and LazyPick on the fixture distance matrix.

    Verifies that the supplied first picks lead the result, that the lazy
    picker (with and without its cache) reproduces the eager result, that
    out-of-range first picks raise ValueError, and that picking without
    first picks returns a non-empty result.
    """
    seeds = (886, 112)
    picker = rdSimDivPickers.MaxMinPicker()

    eager = picker.Pick(self.dMat, self.n, self.m, seeds)
    self.assertEqual(eager[0], 886)
    self.assertEqual(eager[1], 112)

    def func(i, j):
        # Distance lookup into the condensed matrix: the larger index selects
        # the row offset, the smaller one the position within the row.
        if i == j:
            return 0.0
        hi, lo = (j, i) if i < j else (i, j)
        return self.dMat[hi * (hi - 1) // 2 + lo]

    lazy = picker.LazyPick(func, self.n, self.m, seeds)
    self.assertEqual(list(lazy), list(eager))

    lazy = picker.LazyPick(func, self.n, self.m, seeds, useCache=False)
    self.assertEqual(list(lazy), list(eager))

    # First picks outside the valid index range must be rejected.
    with self.assertRaises(ValueError):
        picker.Pick(self.dMat, self.n, self.m, (1012, ))
    with self.assertRaises(ValueError):
        picker.Pick(self.dMat, self.n, self.m, (-1, ))

    # Without first picks both entry points must still yield something.
    self.assertTrue(picker.Pick(self.dMat, self.n, self.m))
    self.assertTrue(picker.LazyPick(func, self.n, self.m))
def testBitVectorMaxMin(self):
    """Check that all seeded lazy picking variants agree on random bit vectors.

    The reference result comes from LazyPick with a Tanimoto distance
    callback; LazyPick without cache and LazyBitVectorPick with and without
    cache must return the same N picks when given the same seed.
    """
    from rdkit import DataStructs

    pool_size = 100
    nbits = 200
    bits_per_vector = int(nbits * .1)
    N = 10

    vs = []
    for _ in range(pool_size):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(bits_per_vector):
            vect.SetBit(int(nbits * random.random()))
        vs.append(vect)

    def func(i, j, bvs=vs):
        return DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)

    picker = rdSimDivPickers.MaxMinPicker()
    count = len(vs)

    reference = picker.LazyPick(func, count, N, seed=42)
    self.assertEqual(len(reference), N)

    def check_matches_reference(picks):
        self.assertEqual(len(picks), N)
        self.assertEqual(list(reference), list(picks))

    check_matches_reference(picker.LazyPick(func, count, N, useCache=False, seed=42))
    check_matches_reference(picker.LazyBitVectorPick(vs, count, N, seed=42))
    check_matches_reference(picker.LazyBitVectorPick(vs, count, N, useCache=False, seed=42))
def pick_diverse_set(df, num_mols):
    """Select a diverse subset of rows from ``df`` with the MaxMin picker.

    Args:
        df: DataFrame with an ``mbv_fp`` column; that column is passed as a
            list to ``GetTanimotoDistMat``.
        num_mols: number of rows to pick.

    Returns:
        The index-reset frame itself when ``num_mols`` is at least the number
        of rows; otherwise the rows of the index-reset frame chosen by
        ``MaxMinPicker.Pick``.
    """
    print('Resetting the index of the data')
    # reset the index of the data-frame
    dfr = df.reset_index()
    n_rows = dfr.shape[0]
    print('Ther are {} molecules to calculate a distance matrix for'.format(n_rows))

    # Decide before building the distance matrix: when everything is returned
    # anyway, the quadratic distance computation would be wasted work.
    if num_mols >= n_rows:
        print('You are requesting more molecules than made it through the filters.')
        print('Returning all the molecules.')
        return dfr

    # Calculate Tanimoto Distance Matrix for the remaining molecules
    dm = GetTanimotoDistMat(dfr.mbv_fp.tolist())
    picker = rdSimDivPickers.MaxMinPicker()
    # Should probably report some statistics here
    ids = list(picker.Pick(dm, n_rows, num_mols))
    # After reset_index() labels equal positions, so positional .iloc selects
    # the same rows the removed .ix indexer used to.
    dfo = dfr.iloc[ids]
    print('The diversity picker has selected {} molecules'.format(dfo.shape[0]))
    print('...')
    return dfo
def test0MaxMin(self):
    """Exercise MaxMinPicker.Pick and LazyPick on the fixture distance matrix.

    Verifies that the supplied first picks lead the result, that LazyPick
    with an equivalent distance callback reproduces the Pick result, that
    out-of-range first picks raise ValueError, and that picking without
    first picks returns a non-empty result.
    """
    pkr = rdSimDivPickers.MaxMinPicker()
    maxmin = pkr.Pick(self.dMat, self.n, self.m, (886, 112))
    self.assertEqual(maxmin[0], 886)
    self.assertEqual(maxmin[1], 112)

    def func(i, j):
        if i == j:
            return 0.0
        if i < j:
            j, i = i, j
        # Floor division: true division would produce a float, which is not
        # a valid index on Python 3.
        return self.dMat[i * (i - 1) // 2 + j]

    lmaxmin = pkr.LazyPick(func, self.n, self.m, (886, 112))
    self.assertEqual(list(lmaxmin), list(maxmin))

    # First picks outside the valid index range must be rejected.
    self.assertRaises(ValueError, lambda: pkr.Pick(self.dMat, self.n, self.m, (1012, )))
    self.assertRaises(ValueError, lambda: pkr.Pick(self.dMat, self.n, self.m, (-1, )))

    maxmin = pkr.Pick(self.dMat, self.n, self.m)
    self.assertTrue(maxmin)
    lmaxmin = pkr.LazyPick(func, self.n, self.m)
    self.assertTrue(lmaxmin)
def testBitVectorMaxMin4(self):
    """Threshold tests for LazyBitVectorPickWithThreshold.

    With a threshold argument of -1.0 all 20 requested picks are expected and
    the returned threshold must be 0.8977 to four places. With 0.91 only the
    first 13 picks of the same sequence are expected and the returned
    threshold must be at least 0.91.
    """
    fps_path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                            'chembl_cyps.head.fps')
    with open(fps_path) as handle:
        fps = [DataStructs.CreateFromFPSText(row.strip()) for row in handle]

    expected = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]
    picker = rdSimDivPickers.MaxMinPicker()

    picks, threshold = picker.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
    self.assertEqual(list(picks), expected)
    self.assertAlmostEqual(threshold, 0.8977, 4)

    picks, threshold = picker.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
    # The thresholded run is expected to stop after the 13th pick.
    self.assertEqual(list(picks), expected[:13])
    self.assertTrue(threshold >= 0.91)
def testBitVectorMaxMin3(self):
    """Check seeded LazyBitVectorPick results on the ChEMBL fingerprint file.

    The same 20 picks are expected whether the picker starts from scratch or
    is handed the first five of those picks through ``firstPicks``.
    """
    fps_path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                            'chembl_cyps.head.fps')
    with open(fps_path) as handle:
        fps = [DataStructs.CreateFromFPSText(row.strip()) for row in handle]

    expected = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]
    picker = rdSimDivPickers.MaxMinPicker()

    unseeded = list(picker.LazyBitVectorPick(fps, len(fps), 20, seed=42))
    self.assertEqual(unseeded, expected)

    preseeded = list(
        picker.LazyBitVectorPick(fps, len(fps), 20, firstPicks=expected[:5], seed=42))
    self.assertEqual(preseeded, expected)
def testNonUniqueCrash(self):
    """Check that the pickers survive a pool in which every vector is duplicated.

    Each random bit vector is appended to the pool twice. ``LazyPick`` and
    two independent ``LazyBitVectorPick`` runs must each return ``N`` picks,
    and at least one of the bit-vector runs must differ from the ``LazyPick``
    result. ``HierarchicalClusterPicker.Pick`` must also run on the matching
    condensed distance list without raising.
    """
    from rdkit import DataStructs

    sz = 300
    nbits = 40
    nBitsToSet = int(nbits * .3)
    N = 8

    vs = []
    for _ in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nBitsToSet):
            bv.SetBit(int(nbits * random.random()))
        # Deliberately add the same vector twice to create non-unique entries.
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])

    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyPick(taniFunc, len(vs), N)
    self.assertEqual(len(mm1), N)

    picker = None
    picker = rdSimDivPickers.MaxMinPicker()
    mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm2), N)

    picker = rdSimDivPickers.MaxMinPicker()
    mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm3), N)

    # we get the occasional dupe randomly,
    # make sure we don't get three dupes in a row.
    # Both comparisons must sit inside the assertion: previously the second
    # one was or-ed with assertTrue's return value and never checked.
    self.assertTrue((tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1)))

    picker = None
    # Condensed list of pairwise distances for all i < j, in row order.
    nvs = len(vs)
    ds = []
    for i in range(nvs):
        for j in range(i + 1, nvs):
            ds.append(taniFunc(i, j))
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(rdSimDivPickers.ClusterMethod.WARD)
    # Only the absence of a crash is checked here; the picks are not inspected.
    list(picker.Pick(m, nvs, N))
def testBitVectorMaxMin2(self):
    """Check seeded LazyBitVectorPick on a fixed set of 20-bit fingerprints.

    The cached run must return the expected five picks, and the uncached run
    must return the same picks.
    """
    bit_strings = [
        "11110010101000000000", "00000000000010010000", "11001010000000000001",
        "00100110101000001000", "01010110000100011001", "11000110101001000011",
        "00000000001100001111", "00011110110000001101", "00000011011110100010",
        "11000010110001000000", "00000100010000010000", "10000001000010110010",
        "00010010000000010100", "00011100100110101000", "10001001100110100000",
        "10000110100110010000", "00101110000101000000", "11011101100011100000",
        "10000110000100101000", "00101000100000010001", "01000001000010000000",
        "00101101010100000110", "10001000100110110001", "00011000010100000001",
        "00101000001000100011", "00010000100010011001", "01100001000100010001",
        "10000101000001101101", "00001000011001011000", "11110000100100100000",
        "10100110000000011010", "00110100010110010010", "00000000000001010010",
        "00100000000010100001", "11110011000010001000", "10110001010100001000",
        "00001100100110011011", "00010010100100001110", "10100101100010100010",
        "01100100010100000001", "10101110011100000000", "01011000000001000001",
        "00000011100110100010", "01100001010001001001", "00001000000001001100",
        "10011001110000000100", "10110000001001100100", "00011000000001001011",
        "11001011010001100010", "10010000000001001011", "00010000100111100000",
        "00001000001110001000", "11010000010001100110", "01101001100000111000",
        "01001000001110111000", "10000000000100010010", "11001000010010000000",
        "01010010000100110001", "00010001010100100001", "01110010000000010000",
        "10001010000011000001", "00000110000000100100", "00010000010001000000",
        "11101100011010000011", "00000010100001010001", "00010000110010000101",
        "00010001001000111001", "01000010001100100110", "00110110000000100001",
        "00100010010110110010", "01000000110011001111", "00011000001000110010",
        "01111010101000110100", "00001010000010110110", "00110011000011011010",
        "00111010111010000110", "00010011101010000011", "00000001011000010000",
        "00011011101110110000", "00010001101000000001", "00010000001010011010",
        "00000010100100100010", "00000010001011000100", "11010000000001011100",
        "00001000110101000001", "00000010000000110010", "10000000010011000001",
        "11110110100100010000", "10001111000110001001", "00100110000110000100",
        "00000100100000100100", "00110000101100010100", "00001010100000100000",
        "01011000000011000111", "00010000100001010001", "10000010100000010000",
        "00001000000000110010", "00001000101011010001", "00011110000100100000",
        "11001001010001010100",
    ]
    N = 5
    fingerprints = [DataStructs.CreateFromBitString(bits) for bits in bit_strings]
    total = len(fingerprints)
    picker = rdSimDivPickers.MaxMinPicker()

    cached = picker.LazyBitVectorPick(fingerprints, total, N, seed=42)
    self.assertEqual(len(cached), N)
    self.assertEqual(list(cached), [37, 1, 43, 38, 16])

    uncached = picker.LazyBitVectorPick(fingerprints, total, N, useCache=False, seed=42)
    self.assertEqual(len(uncached), N)
    self.assertEqual(list(cached), list(uncached))
def get_divconfs_ids(self, X, S, Ngen, Nkep, atmlist=None):
    """Return the sorted ids of ``Nkep`` conformers picked by MaxMin diversity.

    Args:
        X: conformer coordinates, passed to ``self.nc.setConformers`` as-is.
        S: per-atom species labels; atoms labelled ``'H'`` are skipped when
            no explicit atom list is given.
        Ngen: number of conformers to describe (rows of the AEV matrix).
        Nkep: number of conformers to pick.
        atmlist: optional atom indices whose environments are used. When it
            is ``None`` or empty, all non-``'H'`` atoms are used.

    Returns:
        Sorted list of the picked conformer ids.
    """
    # A None default avoids sharing one mutable list object between calls.
    if atmlist is not None and len(atmlist) > 0:
        al = atmlist
    else:
        al = [i for i, s in enumerate(S) if s != 'H']

    self.nc.setConformers(confs=X, types=list(S))
    self.nc.energy()  # this generates AEVs

    # One row per conformer; each selected atom owns a block of self.avs columns.
    aevs = np.empty([Ngen, len(al) * self.avs])
    for m in range(Ngen):
        for j, a in enumerate(al):
            aevs[m, j * self.avs:(j + 1) * self.avs] = self.nc.atomicenvironments(a, m).copy()

    dm = scispc.distance.pdist(aevs, 'sqeuclidean')
    picker = rdSimDivPickers.MaxMinPicker()

    # The picker is started from five randomly chosen conformers.
    seed_list = list(range(Ngen))
    np.random.shuffle(seed_list)
    return sorted(picker.Pick(dm, Ngen, Nkep, firstPicks=seed_list[0:5]))
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom

# Demo: pick m diverse points out of n random 2D points with MaxMinPicker.
RDRandom.seed(23)

pkr = rdsimdiv.MaxMinPicker()

n = 1000
m = 80
# n random points in the square [0, 10) x [0, 10)
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# Floor division keeps the size and the offsets integral on Python 3; both
# products are always even, so no precision is lost.
# NOTE(review): entries are stored row by row for i < j; confirm that this is
# the condensed layout pkr.Pick expects.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        idx = itab + j  # named idx so the builtin id() is not shadowed
        pt2 = dataPts[j]
        diff = pt2 - pt1
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[idx] = dist

# now do the picking
res = pkr.Pick(distMat, n, m)
# Fragment: builds one AEV row for a structure read from disk, then picks a
# diverse subset of all rows and writes the kept file names to kept_data.nfo.
# NOTE(review): `m` and `f` are not bound anywhere in the visible lines;
# presumably the statements up to the AEV fill run inside an enclosing loop
# over input files whose header is not shown here. Confirm before editing.
data = hdn.read_rcdb_coordsandnm(idir + f)
l_dat.append(data)
spc = data["species"]
xyz = data["coordinates"]
# A single conformer is loaded, which is why index 0 is used below.
nc.setConformers(confs=xyz.reshape(1, len(spc), 3), types=list(spc))
Ecmp = nc.energy()
# Copy the environment vector of each atom in rcatoms into its own
# aevsize-wide column block of row m.
for i, a in enumerate(rcatoms):
    aevs[m, i * aevsize:(i + 1) * aevsize] = nc.atomicenvironments(a, 0).copy()
# Pairwise squared-euclidean distances between the AEV rows.
dm = scispc.distance.pdist(aevs, 'sqeuclidean')
picker = rdSimDivPickers.MaxMinPicker()
# Shuffle the row indices; the first ten are handed to the picker as first picks.
seed_list = [i for i in range(aevs.shape[0])]
np.random.shuffle(seed_list)
print('seed:', seed_list)
ids = set(picker.Pick(dm, aevs.shape[0], Nk, firstPicks=list(seed_list[0:10])))
# Entries of inclist are always kept in addition to the picked ones.
ids.update(set(inclist))
ids = list(ids)
print(ids)
ids.sort()
# NOTE(review): no close() for this file handle is visible in this fragment.
of = open(cdir + 'kept_data.nfo', 'w')
for i in ids:
    data = l_dat[i]
    f = files[i]
    of.write(f + '\n')
    # NOTE(review): the original layout is lost; flush is assumed to be per
    # written line - confirm.
    of.flush()
def compute_diverse(self, xyz, spc, index, P, aevsize):
    """Split the positions of ``index`` into a selected set and a remainder.

    When enough entries are available (more than 4 to pick and more than 8
    in total), the selection is made by MaxMin diversity picking on
    cosine distances between atomic environment vectors of the non-``'H'``
    atoms. Otherwise each position is selected at random with probability
    ``P``.

    Args:
        xyz: conformer coordinates, indexed by position.
        spc: per-atom species labels.
        index: array whose ``size`` gives the number of candidates.
        P: fraction of candidates to select (the diversity pick is capped
            at 500).
        aevsize: number of AEV columns per atom.

    Returns:
        ``(cur_index, new_index)``: integer arrays of selected positions and
        of remaining positions. Both are empty when ``index`` is empty.
    """
    Nk = int(np.floor(P * float(index.size)))
    Nk = Nk if Nk < 500 else 500

    if Nk > 4 and index.size > 8:
        # Array of random floats from 0 to 1
        selection = np.random.uniform(low=0.0, high=1.0, size=index.size)
        # Limit the diversity pool to roughly 1000 candidates.
        Pt = 1.0 if index.size < 1000 else 1000 / float(index.size)

        # Obtain the sample: div_idx enters the diversity pool, pas_idx is
        # passed straight through to the remainder.
        div_idx = np.flatnonzero(selection <= Pt)
        pas_idx = np.flatnonzero(selection > Pt)

        Inh = [i for i, s in enumerate(spc) if s != 'H']
        Nm = div_idx.size
        Na = len(spc)

        # Process the pool in batches of at most 65000 atoms.
        Nit = int(np.ceil(Na * Nm / 65000.0))
        Nmo = int(65000 / Na)

        aevs = np.empty([Nm, len(Inh) * aevsize])
        for j in range(0, Nit):
            # Setup indices
            i1 = j * Nmo
            i2 = min(i1 + Nmo, Nm)

            self.nc.setConformers(confs=xyz[div_idx[i1:i2]], types=list(spc))
            # Result unused; presumably the call is needed before the atomic
            # environments can be read - confirm.
            self.nc.energy()

            for i, a in enumerate(Inh):
                for m in range(i1, i2):
                    # NOTE(review): m is the pool-wide row, while only the
                    # current batch was loaded above; confirm whether
                    # atomicenvironments expects m - i1 for batches after
                    # the first.
                    aevs[m, i * aevsize:(i + 1) * aevsize] = self.nc.atomicenvironments(a, m).copy()

        dm = scispc.distance.pdist(aevs, 'cosine')
        picker = rdSimDivPickers.MaxMinPicker()
        ids = list(picker.Pick(dm, aevs.shape[0], Nk))

        cur_index = np.array(div_idx[ids])
        # The remainder must hold div_idx *values*, like cur_index and
        # pas_idx do; using the row numbers within div_idx would return
        # picked entries and drop unpicked ones whenever the pool is a
        # proper subset.
        unpicked = np.ones(div_idx.size, dtype=bool)
        unpicked[ids] = False
        new_index = div_idx[unpicked]
        return cur_index, np.concatenate([new_index, pas_idx])

    elif index.size > 0:
        # Array of random floats from 0 to 1
        selection = np.random.uniform(low=0.0, high=1.0, size=index.size)

        # Obtain the sample
        new_index = np.flatnonzero(selection > P)
        cur_index = np.flatnonzero(selection <= P)
        return cur_index, new_index

    else:
        # Integer dtype so the empty results remain usable as index arrays.
        return np.array([], dtype=int), np.array([], dtype=int)