Ejemplo n.º 1
0
    def test_InsertData(self):
        newTblName = 'NEW_TABLE'
        conn = DbConnect(self.tempDbName)
        try:
            conn.GetCursor().execute('drop table %s' % (newTblName))
        except Exception:
            pass
        conn.Commit()
        conn.AddTable(newTblName, 'id int,val1 int, val2 int')
        for i in range(10):
            conn.InsertData(newTblName, (i, i + 1, 2 * i))
        conn.Commit()
        d = conn.GetData(table=newTblName)
        assert len(d) == 10

        self.assertEqual(len(conn.GetColumnNames(table=newTblName)), 3)
        conn.AddColumn(newTblName, 'val3', 'int')
        conn.Commit()
        self.assertEqual(len(conn.GetColumnNames(table=newTblName)), 4)
        d = conn.GetColumns('id,val3', table=newTblName)
        self.assertEqual(len(d), 10)
        self.assertTrue(all(r[1] is None for r in d))
        for r in d:
            conn.InsertColumnData(newTblName, 'val3', r[0],
                                  'id={0}'.format(r[0]))
        conn.Commit()
        d = conn.GetColumns('id,val3', table=newTblName)
        self.assertTrue(all(r[0] == r[1] for r in d))

        d = None
        try:
            conn.GetCursor().execute('drop table %s' % (newTblName))
        except Exception:
            assert 0, 'drop table failed'
Ejemplo n.º 2
0
def DBToData(dbName, tableName, user='******', password='******', dupCol=-1, what='*', where='',
             join='', pickleCol=-1, pickleClass=None, ensembleIds=None):
  """ constructs  an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: if nonzero specifies which column should be used to recognize
        duplicates.

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

  """
  conn = DbConnect(dbName, tableName, user, password)
  res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol, forceList=1)
  nPts = len(res)
  vals = [None] * nPts
  ptNames = [None] * nPts
  classWorks = True
  for i in range(nPts):
    tmp = list(res[i])
    ptNames[i] = tmp.pop(0)
    if pickleCol >= 0:
      if not pickleClass or not classWorks:
        tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
      else:
        try:
          tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
        except Exception:
          tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
          classWorks = False
      if ensembleIds:
        tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
    else:
      if ensembleIds:
        tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
    vals[i] = tmp
  varNames = conn.GetColumnNames(join=join, what=what)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
Ejemplo n.º 3
0
    def testAddTable(self):
        """ tests AddTable and GetTableNames functionalities """
        newTblName = 'NEW_TABLE'
        conn = DbConnect(self.tempDbName)
        try:
            conn.GetCursor().execute('drop table %s' % (newTblName))
        except Exception:
            pass
        conn.Commit()

        self.assertNotIn(newTblName, [x.strip() for x in conn.GetTableNames()])
        conn.AddTable(newTblName, 'id int')
        self.assertIn(newTblName, [x.strip() for x in conn.GetTableNames()])

        self.assertEqual(conn.GetColumnNames(table=newTblName), ['id'])

        conn.GetCursor().execute('drop table %s' % (newTblName))
Ejemplo n.º 4
0
def GetAllDescriptorNames(db, tbl1, tbl2, user='******', password='******'):
    """ gets possible descriptor names from a database

    **Arguments**

      - db: the name of the database to use

      - tbl1: the name of the table to be used for reading descriptor values

      - tbl2: the name of the table to be used for reading notes about the
        descriptors (*descriptions of the descriptors if you like*)

      - user: the user name for DB access

      - password: the password for DB access

    **Returns**

      a 2-tuple containing:

        1) a list of column names

        2) a list of column descriptors

    **Notes**

      - this uses _Dbase.DbInfo_  and Dfunctionality for querying the database

      - it is assumed that tbl2 includes 'property' and 'notes' columns

  """
    from rdkit.Dbase.DbConnection import DbConnect
    conn = DbConnect(db, user=user, password=password)

    colNames = conn.GetColumnNames(table=tbl1)
    colDesc = map(lambda x: (x[0].upper(), x[1]),
                  conn.GetColumns('property,notes', table=tbl2))
    for name, desc in countOptions:
        colNames.append(name)
        colDesc.append((name, desc))
    return colNames, colDesc
Ejemplo n.º 5
0
    def _init(self, refCompos, copyBounds=0):
        BuildComposite._verbose = 0
        conn = DbConnect(self.details.dbName, self.details.tableName)
        cols = [x.upper() for x in conn.GetColumnNames()]
        cDescs = [x.upper() for x in refCompos.GetDescriptorNames()]
        self.assertEqual(cols, cDescs)

        self.details.nModels = 10
        self.details.lockRandom = 1
        self.details.randomSeed = refCompos._randomSeed
        self.details.splitFrac = refCompos._splitFrac

        if self.details.splitFrac:
            self.details.splitRun = 1
        else:
            self.details.splitRun = 0

        if not copyBounds:
            self.details.qBounds = [0] * len(cols)
        else:
            self.details.qBounds = refCompos.GetQuantBounds()[0]
Ejemplo n.º 6
0
def RunSearch(options, queryFilename):
    global sigFactory
    if options.similarityType == 'AtomPairs':
        fpBuilder = FingerprintUtils.BuildAtomPairFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.pairDbName)
        fpTableName = options.pairTableName
        fpColName = options.pairColName
    elif options.similarityType == 'TopologicalTorsions':
        fpBuilder = FingerprintUtils.BuildTorsionsFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.torsionsDbName)
        fpTableName = options.torsionsTableName
        fpColName = options.torsionsColName
    elif options.similarityType == 'RDK':
        fpBuilder = FingerprintUtils.BuildRDKitFP
        simMetric = DataStructs.FingerprintSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.fpTableName
        if not options.fpColName:
            options.fpColName = 'rdkfp'
        fpColName = options.fpColName
    elif options.similarityType == 'Pharm2D':
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.pharm2DTableName
        if not options.fpColName:
            options.fpColName = 'pharm2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = BuildSigFactory(options)
    elif options.similarityType == 'Gobbi2D':
        from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.TanimotoSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.gobbi2DTableName
        if not options.fpColName:
            options.fpColName = 'gobbi2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
    elif options.similarityType == 'Morgan':
        fpBuilder = FingerprintUtils.BuildMorganFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.morganFpDbName)
        fpTableName = options.morganFpTableName
        fpColName = options.morganFpColName

    extraArgs = {}
    if options.similarityMetric == 'tanimoto':
        simMetric = DataStructs.TanimotoSimilarity
    elif options.similarityMetric == 'dice':
        simMetric = DataStructs.DiceSimilarity
    elif options.similarityMetric == 'tversky':
        simMetric = DataStructs.TverskySimilarity
        extraArgs['tverskyA'] = options.tverskyA
        extraArgs['tverskyB'] = options.tverskyB

    if options.smilesQuery:
        mol = Chem.MolFromSmiles(options.smilesQuery)
        if not mol:
            logger.error('could not build query molecule from smiles "%s"' %
                         options.smilesQuery)
            sys.exit(-1)
        options.queryMol = mol
    elif options.smartsQuery:
        mol = Chem.MolFromSmarts(options.smartsQuery)
        if not mol:
            logger.error('could not build query molecule from smarts "%s"' %
                         options.smartsQuery)
            sys.exit(-1)
        options.queryMol = mol

    if options.outF == '-':
        outF = sys.stdout
    elif options.outF == '':
        outF = None
    else:
        outF = open(options.outF, 'w+')

    molsOut = False
    if options.sdfOut:
        molsOut = True
        if options.sdfOut == '-':
            sdfOut = sys.stdout
        else:
            sdfOut = open(options.sdfOut, 'w+')
    else:
        sdfOut = None
    if options.smilesOut:
        molsOut = True
        if options.smilesOut == '-':
            smilesOut = sys.stdout
        else:
            smilesOut = open(options.smilesOut, 'w+')
    else:
        smilesOut = None

    if queryFilename:
        try:
            tmpF = open(queryFilename, 'r')
        except IOError:
            logger.error('could not open query file %s' % queryFilename)
            sys.exit(1)

        if options.molFormat == 'smiles':
            func = GetMolsFromSmilesFile
        elif options.molFormat == 'sdf':
            func = GetMolsFromSDFile

        if not options.silent:
            msg = 'Reading query molecules'
            if fpBuilder: msg += ' and generating fingerprints'
            logger.info(msg)
        probes = []
        i = 0
        nms = []
        for nm, smi, mol in func(queryFilename, None, options.nameProp):
            i += 1
            nms.append(nm)
            if not mol:
                logger.error('query molecule %d could not be built' % (i))
                probes.append((None, None))
                continue
            if fpBuilder:
                probes.append((mol, fpBuilder(mol)))
            else:
                probes.append((mol, None))
            if not options.silent and not i % 1000:
                logger.info("  done %d" % i)
    else:
        probes = None

    conn = None
    idName = options.molIdName
    ids = None
    names = None
    molDbName = os.path.join(options.dbDir, options.molDbName)
    molIdName = options.molIdName
    mConn = DbConnect(molDbName)
    cns = [(x.lower(), y)
           for x, y in mConn.GetColumnNamesAndTypes('molecules')]
    idCol, idTyp = cns[0]
    if options.propQuery or options.queryMol:
        conn = DbConnect(molDbName)
        curs = conn.GetCursor()
        if options.queryMol:
            if not options.silent: logger.info('Doing substructure query')
            if options.propQuery:
                where = 'where %s' % options.propQuery
            else:
                where = ''
            if not options.silent:
                curs.execute('select count(*) from molecules %(where)s' %
                             locals())
                nToDo = curs.fetchone()[0]

            join = ''
            doSubstructFPs = False
            fpDbName = os.path.join(options.dbDir, options.fpDbName)
            if os.path.exists(fpDbName) and not options.negateQuery:
                curs.execute("attach database '%s' as fpdb" % (fpDbName))
                try:
                    curs.execute('select * from fpdb.%s limit 1' %
                                 options.layeredTableName)
                except:
                    pass
                else:
                    doSubstructFPs = True
                    join = 'join fpdb.%s using (%s)' % (
                        options.layeredTableName, idCol)
                    query = LayeredOptions.GetQueryText(options.queryMol)
                    if query:
                        if not where:
                            where = 'where'
                        else:
                            where += ' and'
                        where += ' ' + query

            cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s' % locals(
            )
            curs.execute(cmd)
            row = curs.fetchone()
            nDone = 0
            ids = []
            while row:
                id, molpkl = row
                if not options.zipMols:
                    m = _molFromPkl(molpkl)
                else:
                    m = Chem.Mol(zlib.decompress(molpkl))
                matched = m.HasSubstructMatch(options.queryMol)
                if options.negateQuery:
                    matched = not matched
                if matched:
                    ids.append(id)
                nDone += 1
                if not options.silent and not nDone % 500:
                    if not doSubstructFPs:
                        logger.info(
                            '  searched %d (of %d) molecules; %d hits so far' %
                            (nDone, nToDo, len(ids)))
                    else:
                        logger.info(
                            '  searched through %d molecules; %d hits so far' %
                            (nDone, len(ids)))
                row = curs.fetchone()
            if not options.silent and doSubstructFPs and nToDo:
                nFiltered = nToDo - nDone
                logger.info(
                    '   Fingerprint screenout rate: %d of %d (%%%.2f)' %
                    (nFiltered, nToDo, 100. * nFiltered / nToDo))

        elif options.propQuery:
            if not options.silent: logger.info('Doing property query')
            propQuery = options.propQuery.split(';')[0]
            curs.execute(
                'select %(idCol)s from molecules where %(propQuery)s' %
                locals())
            ids = [x[0] for x in curs.fetchall()]
        if not options.silent:
            logger.info('Found %d molecules matching the query' % (len(ids)))

    t1 = time.time()
    if probes:
        if not options.silent: logger.info('Finding Neighbors')
        conn = DbConnect(dbName)
        cns = conn.GetColumnNames(fpTableName)
        curs = conn.GetCursor()

        if ids:
            ids = [(x, ) for x in ids]
            curs.execute(
                'create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                locals())
            curs.executemany('insert into _tmpTbl values (?)', ids)
            join = 'join  _tmpTbl using (%(idCol)s)' % locals()
        else:
            join = ''

        if cns[0].lower() != idCol.lower():
            # backwards compatibility to the days when mol tables had a guid and
            # the fps tables did not:
            curs.execute("attach database '%(molDbName)s' as mols" % locals())
            curs.execute("""
  select %(idCol)s,%(fpColName)s from %(fpTableName)s join
      (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)
    using (%(molIdName)s)
""" % (locals()))
        else:
            curs.execute(
                'select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'
                % locals())

        def poolFromCurs(curs, similarityMethod):
            row = curs.fetchone()
            while row:
                id, pkl = row
                fp = DepickleFP(pkl, similarityMethod)
                yield (id, fp)
                row = curs.fetchone()

        topNLists = GetNeighborLists(probes,
                                     options.topN,
                                     poolFromCurs(curs,
                                                  options.similarityType),
                                     simMetric=simMetric,
                                     simThresh=options.simThresh,
                                     **extraArgs)
        uniqIds = set()
        nbrLists = {}
        for i, nm in enumerate(nms):
            topNLists[i].reverse()
            scores = topNLists[i].GetPts()
            nbrNames = topNLists[i].GetExtras()
            nbrs = []
            for j, nbrGuid in enumerate(nbrNames):
                if nbrGuid is None:
                    break
                else:
                    uniqIds.add(nbrGuid)
                    nbrs.append((nbrGuid, scores[j]))
            nbrLists[(i, nm)] = nbrs
        t2 = time.time()
        if not options.silent:
            logger.info('The search took %.1f seconds' % (t2 - t1))

        if not options.silent: logger.info('Creating output')

        curs = mConn.GetCursor()
        ids = list(uniqIds)

        ids = [(x, ) for x in ids]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)

        ks = list(nbrLists.keys())
        ks.sort()
        if not options.transpose:
            for i, nm in ks:
                nbrs = nbrLists[(i, nm)]
                nbrTxt = options.outputDelim.join([nm] + [
                    '%s%s%.3f' % (nmDict[id], options.outputDelim, score)
                    for id, score in nbrs
                ])
                if outF: print(nbrTxt, file=outF)
        else:
            labels = [
                '%s%sSimilarity' % (x[1], options.outputDelim) for x in ks
            ]
            if outF: print(options.outputDelim.join(labels), file=outF)
            for i in range(options.topN):
                outL = []
                for idx, nm in ks:
                    nbr = nbrLists[(idx, nm)][i]
                    outL.append(nmDict[nbr[0]])
                    outL.append('%.3f' % nbr[1])
                if outF: print(options.outputDelim.join(outL), file=outF)
    else:
        if not options.silent: logger.info('Creating output')
        curs = mConn.GetCursor()
        ids = [(x, ) for x in set(ids)]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        molIdName = options.molIdName
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)
        if outF: print('\n'.join(nmDict.values()), file=outF)
    if molsOut and ids:
        molDbName = os.path.join(options.dbDir, options.molDbName)
        cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
        if cns[-1] != 'molpkl':
            cns.remove('molpkl')
            cns.append('molpkl')

        curs = mConn.GetCursor()
        #curs.execute('create temporary table _tmpTbl (guid integer)'%locals())
        #curs.executemany('insert into _tmpTbl values (?)',ids)
        cnText = ','.join(cns)
        curs.execute(
            'select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)' %
            locals())

        row = curs.fetchone()
        molD = {}
        while row:
            row = list(row)
            m = _molFromPkl(row[-1])
            guid = row[0]
            nm = nmDict[guid]
            if sdfOut:
                m.SetProp('_Name', nm)
                print(Chem.MolToMolBlock(m), file=sdfOut)
                for i in range(1, len(cns) - 1):
                    pn = cns[i]
                    pv = str(row[i])
                    print >> sdfOut, '> <%s>\n%s\n' % (pn, pv)
                print('$$$$', file=sdfOut)
            if smilesOut:
                smi = Chem.MolToSmiles(m, options.chiralSmiles)
            if smilesOut:
                print('%s %s' % (smi, str(row[1])), file=smilesOut)
            row = curs.fetchone()
    if not options.silent: logger.info('Done!')