Esempio n. 1
0
def getSequenceIdToTermsMap(release, sequenceIds, conn=None):
    '''
    Look up the GO terms recorded for each sequence id in a release.
    release: passed to releaseTable() to pick the release's sequence_to_go_term table.
    sequenceIds: the sequence ids to look up.  They are queried in groups of 1000 to bound the size of each IN (...) clause.
    conn: optional database connection.  It is handed to connCM(), as in getSequenceIdToSequenceDataMap, and whatever connCM() yields is used for every group's query.
    note: a sequence id for which the query returns no rows (not in the table, or no terms) is not added to the map.
    returns: a tuple (seqIdToTermsMap, termMap).
      seqIdToTermsMap: dict mapping sequence id to a list of GO term accessions, one entry per row returned for that id.
      termMap: dict mapping GO term accession to GO term name.
    '''
    seqIdToTermsMap = {}
    termMap = {}
    with connCM(conn=conn) as conn:
        for group in util.groupsOfN(sequenceIds, 1000):
            sql = 'SELECT rs2gt.sequence_id, rs2gt.go_term_acc, rs2gt.go_term_name'
            sql += ' FROM {} AS rs2gt'.format(releaseTable(release, 'sequence_to_go_term'))
            sql += ' WHERE rs2gt.sequence_id IN ('+', '.join([str(seqId) for seqId in group])+') '
            rows = dbutil.selectSQL(sql=sql, conn=conn)
            # only ids that appear in a returned row get an entry; each row adds one accession.
            for seqId, acc, name in rows:
                termMap[acc] = name
                seqIdToTermsMap.setdefault(seqId, []).append(acc)
    return (seqIdToTermsMap, termMap)
Esempio n. 2
0
def loadReleaseResults(release, genomeToId, divToId, evalueToId, geneToId, resultsGen):
    '''
    Convert results into rows and insert them into the release's results table.
    release: passed to releaseTable() to pick the release's results table.
    genomeToId, divToId, evalueToId, geneToId: mappings used to translate the genome, divergence, evalue and gene
      values found in each result into the database ids stored in the table.
    resultsGen: a generator that yields ((qdb, sdb, div, evalue), orthologs) tuples, where orthologs is an
      iterable of (query gene, subject gene, distance) triples.
    Rows are written with INSERT IGNORE in multi-row statements of up to 400 results each.  mod_time is set to NOW().
    Every statement is run inside its own connCM() block.
    The results table columns, for reference:
    `id` int(10) unsigned NOT NULL auto_increment,
    `query_db` smallint(5) unsigned NOT NULL,
    `subject_db` smallint(5) unsigned NOT NULL,
    `divergence` tinyint(3) unsigned NOT NULL,
    `evalue` tinyint(3) unsigned NOT NULL,
    `filename` text,
    `mod_time` datetime default NULL,
    `orthologs` longblob,
    `num_orthologs` int(10) unsigned NOT NULL,
    '''
    def convertForDb(result):
        # convert various items into the form the database table wants.  Change strings into database ids.  Encode orthologs, etc.
        (qdb, sdb, div, evalue), orthologs = result
        qdbId = genomeToId[qdb]
        sdbId = genomeToId[sdb]
        divId = divToId[div]
        evalueId = evalueToId[evalue]
        # orthologs using db ids and floats, not strings.
        dbOrthologs = [(geneToId[qid], geneToId[sid], float(dist)) for qid, sid, dist in orthologs]
        encodedOrthologs = encodeOrthologs(dbOrthologs)
        # count the converted list, so orthologs may be any iterable, not only a sized sequence.
        numOrthologs = len(dbOrthologs)
        return qdbId, sdbId, divId, evalueId, encodedOrthologs, numOrthologs

    numPerGroup = 400 # not too huge, not too slow.
    rowPlaceholder = '(%s, %s, %s, %s, NOW(), %s, %s) '
    sql1 = 'INSERT IGNORE INTO {} (query_db, subject_db, divergence, evalue, mod_time, orthologs, num_orthologs) VALUES '.format(releaseTable(release, 'results'))
    for group in util.groupsOfN(resultsGen, numPerGroup):
        # cannot just use numPerGroup, b/c last group can have fewer results.
        sql = sql1 + ', '.join([rowPlaceholder] * len(group))
        argsLists = [convertForDb(result) for result in group]
        args = list(itertools.chain.from_iterable(argsLists)) # flatten args into one long list for the sql
        with connCM() as conn:
            dbutil.insertSQL(conn, sql, args=args)
Esempio n. 3
0
def getSequenceIdToSequenceDataMap(release, sequenceIds, conn=None):
    '''
    Fetch the sequence table row data for the given sequence ids, querying in groups of 1000 ids.
    release: passed to releaseTable() to pick the release's sequence table.
    sequenceIds: the ids to look up.  An id that matches no row gets no entry in the result.
    conn: optional database connection, handed to connCM().
    returns: dict mapping sequence id to a dict holding external_sequence_id, genome_id and gene_name under the keys
      roundup_common.EXTERNAL_SEQUENCE_ID_KEY, roundup_common.GENOME_ID_KEY and roundup_common.GENE_NAME_KEY.
    '''
    seqIdToData = {}

    with connCM(conn=conn) as conn:
        for chunk in util.groupsOfN(sequenceIds, 1000):
            selectClause = 'SELECT id, external_sequence_id, genome_id, gene_name FROM {} '.format(releaseTable(release, 'sequence'))
            idList = ', '.join([str(seqId) for seqId in chunk])
            query = selectClause + ' WHERE id IN (' + idList + ')'
            for seqId, externalSequenceId, genomeId, geneName in dbutil.selectSQL(sql=query, conn=conn):
                seqIdToData[seqId] = {
                    roundup_common.EXTERNAL_SEQUENCE_ID_KEY: externalSequenceId,
                    roundup_common.GENOME_ID_KEY: genomeId,
                    roundup_common.GENE_NAME_KEY: geneName,
                }
    return seqIdToData