def test_InsertData(self): newTblName = 'NEW_TABLE' conn = DbConnect(self.tempDbName) try: conn.GetCursor().execute('drop table %s' % (newTblName)) except Exception: pass conn.Commit() conn.AddTable(newTblName, 'id int,val1 int, val2 int') for i in range(10): conn.InsertData(newTblName, (i, i + 1, 2 * i)) conn.Commit() d = conn.GetData(table=newTblName) assert len(d) == 10 self.assertEqual(len(conn.GetColumnNames(table=newTblName)), 3) conn.AddColumn(newTblName, 'val3', 'int') conn.Commit() self.assertEqual(len(conn.GetColumnNames(table=newTblName)), 4) d = conn.GetColumns('id,val3', table=newTblName) self.assertEqual(len(d), 10) self.assertTrue(all(r[1] is None for r in d)) for r in d: conn.InsertColumnData(newTblName, 'val3', r[0], 'id={0}'.format(r[0])) conn.Commit() d = conn.GetColumns('id,val3', table=newTblName) self.assertTrue(all(r[0] == r[1] for r in d)) d = None try: conn.GetCursor().execute('drop table %s' % (newTblName)) except Exception: assert 0, 'drop table failed'
def DBToData(dbName, tableName, user='******', password='******', dupCol=-1, what='*', where='', join='', pickleCol=-1, pickleClass=None, ensembleIds=None): """ constructs an _MLData.MLDataSet_ from a database **Arguments** - dbName: the name of the database to be opened - tableName: the table name containing the data in the database - user: the user name to be used to connect to the database - password: the password to be used to connect to the database - dupCol: if nonzero specifies which column should be used to recognize duplicates. **Returns** an _MLData.MLDataSet_ **Notes** - this uses Dbase.DataUtils functionality """ conn = DbConnect(dbName, tableName, user, password) res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol, forceList=1) nPts = len(res) vals = [None] * nPts ptNames = [None] * nPts classWorks = True for i in range(nPts): tmp = list(res[i]) ptNames[i] = tmp.pop(0) if pickleCol >= 0: if not pickleClass or not classWorks: tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol])) else: try: tmp[pickleCol] = pickleClass(str(tmp[pickleCol])) except Exception: tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol])) classWorks = False if ensembleIds: tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds) else: if ensembleIds: tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True) vals[i] = tmp varNames = conn.GetColumnNames(join=join, what=what) data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames) return data
def testAddTable(self): """ tests AddTable and GetTableNames functionalities """ newTblName = 'NEW_TABLE' conn = DbConnect(self.tempDbName) try: conn.GetCursor().execute('drop table %s' % (newTblName)) except Exception: pass conn.Commit() self.assertNotIn(newTblName, [x.strip() for x in conn.GetTableNames()]) conn.AddTable(newTblName, 'id int') self.assertIn(newTblName, [x.strip() for x in conn.GetTableNames()]) self.assertEqual(conn.GetColumnNames(table=newTblName), ['id']) conn.GetCursor().execute('drop table %s' % (newTblName))
def GetAllDescriptorNames(db, tbl1, tbl2, user='******', password='******'): """ gets possible descriptor names from a database **Arguments** - db: the name of the database to use - tbl1: the name of the table to be used for reading descriptor values - tbl2: the name of the table to be used for reading notes about the descriptors (*descriptions of the descriptors if you like*) - user: the user name for DB access - password: the password for DB access **Returns** a 2-tuple containing: 1) a list of column names 2) a list of column descriptors **Notes** - this uses _Dbase.DbInfo_ and Dfunctionality for querying the database - it is assumed that tbl2 includes 'property' and 'notes' columns """ from rdkit.Dbase.DbConnection import DbConnect conn = DbConnect(db, user=user, password=password) colNames = conn.GetColumnNames(table=tbl1) colDesc = map(lambda x: (x[0].upper(), x[1]), conn.GetColumns('property,notes', table=tbl2)) for name, desc in countOptions: colNames.append(name) colDesc.append((name, desc)) return colNames, colDesc
def _init(self, refCompos, copyBounds=0): BuildComposite._verbose = 0 conn = DbConnect(self.details.dbName, self.details.tableName) cols = [x.upper() for x in conn.GetColumnNames()] cDescs = [x.upper() for x in refCompos.GetDescriptorNames()] self.assertEqual(cols, cDescs) self.details.nModels = 10 self.details.lockRandom = 1 self.details.randomSeed = refCompos._randomSeed self.details.splitFrac = refCompos._splitFrac if self.details.splitFrac: self.details.splitRun = 1 else: self.details.splitRun = 0 if not copyBounds: self.details.qBounds = [0] * len(cols) else: self.details.qBounds = refCompos.GetQuantBounds()[0]
def RunSearch(options, queryFilename): global sigFactory if options.similarityType == 'AtomPairs': fpBuilder = FingerprintUtils.BuildAtomPairFP simMetric = DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir, options.pairDbName) fpTableName = options.pairTableName fpColName = options.pairColName elif options.similarityType == 'TopologicalTorsions': fpBuilder = FingerprintUtils.BuildTorsionsFP simMetric = DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir, options.torsionsDbName) fpTableName = options.torsionsTableName fpColName = options.torsionsColName elif options.similarityType == 'RDK': fpBuilder = FingerprintUtils.BuildRDKitFP simMetric = DataStructs.FingerprintSimilarity dbName = os.path.join(options.dbDir, options.fpDbName) fpTableName = options.fpTableName if not options.fpColName: options.fpColName = 'rdkfp' fpColName = options.fpColName elif options.similarityType == 'Pharm2D': fpBuilder = FingerprintUtils.BuildPharm2DFP simMetric = DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir, options.fpDbName) fpTableName = options.pharm2DTableName if not options.fpColName: options.fpColName = 'pharm2dfp' fpColName = options.fpColName FingerprintUtils.sigFactory = BuildSigFactory(options) elif options.similarityType == 'Gobbi2D': from rdkit.Chem.Pharm2D import Gobbi_Pharm2D fpBuilder = FingerprintUtils.BuildPharm2DFP simMetric = DataStructs.TanimotoSimilarity dbName = os.path.join(options.dbDir, options.fpDbName) fpTableName = options.gobbi2DTableName if not options.fpColName: options.fpColName = 'gobbi2dfp' fpColName = options.fpColName FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory elif options.similarityType == 'Morgan': fpBuilder = FingerprintUtils.BuildMorganFP simMetric = DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir, options.morganFpDbName) fpTableName = options.morganFpTableName fpColName = options.morganFpColName extraArgs = {} if options.similarityMetric == 'tanimoto': simMetric = DataStructs.TanimotoSimilarity elif options.similarityMetric == 'dice': simMetric = DataStructs.DiceSimilarity elif options.similarityMetric == 'tversky': simMetric = DataStructs.TverskySimilarity extraArgs['tverskyA'] = options.tverskyA extraArgs['tverskyB'] = options.tverskyB if options.smilesQuery: mol = Chem.MolFromSmiles(options.smilesQuery) if not mol: logger.error('could not build query molecule from smiles "%s"' % options.smilesQuery) sys.exit(-1) options.queryMol = mol elif options.smartsQuery: mol = Chem.MolFromSmarts(options.smartsQuery) if not mol: logger.error('could not build query molecule from smarts "%s"' % options.smartsQuery) sys.exit(-1) options.queryMol = mol if options.outF == '-': outF = sys.stdout elif options.outF == '': outF = None else: outF = open(options.outF, 'w+') molsOut = False if options.sdfOut: molsOut = True if options.sdfOut == '-': sdfOut = sys.stdout else: sdfOut = open(options.sdfOut, 'w+') else: sdfOut = None if options.smilesOut: molsOut = True if options.smilesOut == '-': smilesOut = sys.stdout else: smilesOut = open(options.smilesOut, 'w+') else: smilesOut = None if queryFilename: try: tmpF = open(queryFilename, 'r') except IOError: logger.error('could not open query file %s' % queryFilename) sys.exit(1) if options.molFormat == 'smiles': func = GetMolsFromSmilesFile elif options.molFormat == 'sdf': func = GetMolsFromSDFile if not options.silent: msg = 'Reading query molecules' if fpBuilder: msg += ' and generating fingerprints' logger.info(msg) probes = [] i = 0 nms = [] for nm, smi, mol in func(queryFilename, None, options.nameProp): i += 1 nms.append(nm) if not mol: logger.error('query molecule %d could not be built' % (i)) probes.append((None, None)) continue if fpBuilder: probes.append((mol, fpBuilder(mol))) else: probes.append((mol, None)) if not options.silent and not i % 1000: logger.info(" done %d" % i) else: probes = None conn = None idName = options.molIdName ids = None names = None molDbName = os.path.join(options.dbDir, options.molDbName) molIdName = options.molIdName mConn = DbConnect(molDbName) cns = [(x.lower(), y) for x, y in mConn.GetColumnNamesAndTypes('molecules')] idCol, idTyp = cns[0] if options.propQuery or options.queryMol: conn = DbConnect(molDbName) curs = conn.GetCursor() if options.queryMol: if not options.silent: logger.info('Doing substructure query') if options.propQuery: where = 'where %s' % options.propQuery else: where = '' if not options.silent: curs.execute('select count(*) from molecules %(where)s' % locals()) nToDo = curs.fetchone()[0] join = '' doSubstructFPs = False fpDbName = os.path.join(options.dbDir, options.fpDbName) if os.path.exists(fpDbName) and not options.negateQuery: curs.execute("attach database '%s' as fpdb" % (fpDbName)) try: curs.execute('select * from fpdb.%s limit 1' % options.layeredTableName) except: pass else: doSubstructFPs = True join = 'join fpdb.%s using (%s)' % ( options.layeredTableName, idCol) query = LayeredOptions.GetQueryText(options.queryMol) if query: if not where: where = 'where' else: where += ' and' where += ' ' + query cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s' % locals( ) curs.execute(cmd) row = curs.fetchone() nDone = 0 ids = [] while row: id, molpkl = row if not options.zipMols: m = _molFromPkl(molpkl) else: m = Chem.Mol(zlib.decompress(molpkl)) matched = m.HasSubstructMatch(options.queryMol) if options.negateQuery: matched = not matched if matched: ids.append(id) nDone += 1 if not options.silent and not nDone % 500: if not doSubstructFPs: logger.info( ' searched %d (of %d) molecules; %d hits so far' % (nDone, nToDo, len(ids))) else: logger.info( ' searched through %d molecules; %d hits so far' % (nDone, len(ids))) row = curs.fetchone() if not options.silent and doSubstructFPs and nToDo: nFiltered = nToDo - nDone logger.info( ' Fingerprint screenout rate: %d of %d (%%%.2f)' % (nFiltered, nToDo, 100. * nFiltered / nToDo)) elif options.propQuery: if not options.silent: logger.info('Doing property query') propQuery = options.propQuery.split(';')[0] curs.execute( 'select %(idCol)s from molecules where %(propQuery)s' % locals()) ids = [x[0] for x in curs.fetchall()] if not options.silent: logger.info('Found %d molecules matching the query' % (len(ids))) t1 = time.time() if probes: if not options.silent: logger.info('Finding Neighbors') conn = DbConnect(dbName) cns = conn.GetColumnNames(fpTableName) curs = conn.GetCursor() if ids: ids = [(x, ) for x in ids] curs.execute( 'create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals()) curs.executemany('insert into _tmpTbl values (?)', ids) join = 'join _tmpTbl using (%(idCol)s)' % locals() else: join = '' if cns[0].lower() != idCol.lower(): # backwards compatibility to the days when mol tables had a guid and # the fps tables did not: curs.execute("attach database '%(molDbName)s' as mols" % locals()) curs.execute(""" select %(idCol)s,%(fpColName)s from %(fpTableName)s join (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s) using (%(molIdName)s) """ % (locals())) else: curs.execute( 'select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s' % locals()) def poolFromCurs(curs, similarityMethod): row = curs.fetchone() while row: id, pkl = row fp = DepickleFP(pkl, similarityMethod) yield (id, fp) row = curs.fetchone() topNLists = GetNeighborLists(probes, options.topN, poolFromCurs(curs, options.similarityType), simMetric=simMetric, simThresh=options.simThresh, **extraArgs) uniqIds = set() nbrLists = {} for i, nm in enumerate(nms): topNLists[i].reverse() scores = topNLists[i].GetPts() nbrNames = topNLists[i].GetExtras() nbrs = [] for j, nbrGuid in enumerate(nbrNames): if nbrGuid is None: break else: uniqIds.add(nbrGuid) nbrs.append((nbrGuid, scores[j])) nbrLists[(i, nm)] = nbrs t2 = time.time() if not options.silent: logger.info('The search took %.1f seconds' % (t2 - t1)) if not options.silent: logger.info('Creating output') curs = mConn.GetCursor() ids = list(uniqIds) ids = [(x, ) for x in ids] curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals()) curs.executemany('insert into _tmpTbl values (?)', ids) curs.execute( 'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)' % locals()) nmDict = {} for guid, id in curs.fetchall(): nmDict[guid] = str(id) ks = list(nbrLists.keys()) ks.sort() if not options.transpose: for i, nm in ks: nbrs = nbrLists[(i, nm)] nbrTxt = options.outputDelim.join([nm] + [ '%s%s%.3f' % (nmDict[id], options.outputDelim, score) for id, score in nbrs ]) if outF: print(nbrTxt, file=outF) else: labels = [ '%s%sSimilarity' % (x[1], options.outputDelim) for x in ks ] if outF: print(options.outputDelim.join(labels), file=outF) for i in range(options.topN): outL = [] for idx, nm in ks: nbr = nbrLists[(idx, nm)][i] outL.append(nmDict[nbr[0]]) outL.append('%.3f' % nbr[1]) if outF: print(options.outputDelim.join(outL), file=outF) else: if not options.silent: logger.info('Creating output') curs = mConn.GetCursor() ids = [(x, ) for x in set(ids)] curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals()) curs.executemany('insert into _tmpTbl values (?)', ids) molIdName = options.molIdName curs.execute( 'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)' % locals()) nmDict = {} for guid, id in curs.fetchall(): nmDict[guid] = str(id) if outF: print('\n'.join(nmDict.values()), file=outF) if molsOut and ids: molDbName = os.path.join(options.dbDir, options.molDbName) cns = [x.lower() for x in mConn.GetColumnNames('molecules')] if cns[-1] != 'molpkl': cns.remove('molpkl') cns.append('molpkl') curs = mConn.GetCursor() #curs.execute('create temporary table _tmpTbl (guid integer)'%locals()) #curs.executemany('insert into _tmpTbl values (?)',ids) cnText = ','.join(cns) curs.execute( 'select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)' % locals()) row = curs.fetchone() molD = {} while row: row = list(row) m = _molFromPkl(row[-1]) guid = row[0] nm = nmDict[guid] if sdfOut: m.SetProp('_Name', nm) print(Chem.MolToMolBlock(m), file=sdfOut) for i in range(1, len(cns) - 1): pn = cns[i] pv = str(row[i]) print >> sdfOut, '> <%s>\n%s\n' % (pn, pv) print('$$$$', file=sdfOut) if smilesOut: smi = Chem.MolToSmiles(m, options.chiralSmiles) if smilesOut: print('%s %s' % (smi, str(row[1])), file=smilesOut) row = curs.fetchone() if not options.silent: logger.info('Done!')