def getSequenceIdToTermsMap(release, sequenceIds, conn=None):
    '''
    Look up the GO terms recorded for each sequence id in a release.

    release: release identifier; passed to releaseTable() to name the
      sequence_to_go_term table for that release.
    sequenceIds: iterable of sequence ids.  They are queried in groups of 1000
      so that no single IN (...) list grows without bound.
    conn: optional database connection.  It is passed through
      connCM(conn=conn), the same way getSequenceIdToSequenceDataMap handles
      its conn argument.

    note: a sequence id is only added to the first map when the query returns
      at least one row for it.  Ids that are not in the table are absent from
      the map; they are NOT mapped to an empty list.

    returns: tuple (seqIdToTermsMap, termMap) where
      seqIdToTermsMap: dict mapping sequence id to the list of go term
        accessions returned for it, in row order.
      termMap: dict mapping go term accession to go term name.
    '''
    seqIdToTermsMap = {}
    termMap = {}
    with connCM(conn=conn) as conn:
        for group in util.groupsOfN(sequenceIds, 1000):
            # NOTE(review): ids are interpolated into the SQL text via str();
            # this presumably relies on callers passing integer ids -- confirm.
            idList = ', '.join([str(seqId) for seqId in group])
            sql = 'SELECT rs2gt.sequence_id, rs2gt.go_term_acc, rs2gt.go_term_name'
            sql += ' FROM {} AS rs2gt'.format(releaseTable(release, 'sequence_to_go_term'))
            sql += ' WHERE rs2gt.sequence_id IN (' + idList + ') '
            rows = dbutil.selectSQL(sql=sql, conn=conn)
            for seqId, acc, name in rows:
                termMap[acc] = name
                seqIdToTermsMap.setdefault(seqId, []).append(acc)
    return (seqIdToTermsMap, termMap)
def loadReleaseResults(release, genomeToId, divToId, evalueToId, geneToId, resultsGen):
    '''
    Insert ortholog results into the results table of a release, in batches.

    release: release identifier; passed to releaseTable() to name the table.
    genomeToId, divToId, evalueToId, geneToId: lookups (indexed with []) used
      to turn the genome, divergence, evalue and gene values of each result
      into the ids stored in the table.
    resultsGen: a generator that yields ((qdb, sdb, div, evalue), orthologs)
      tuples, where orthologs is a sequence of (query gene, subject gene,
      distance) triples.

    Every result becomes one row.  Rows are written with multi-row
    INSERT IGNORE statements of up to 400 rows each, and a connection is
    taken from connCM() for each statement.  Nothing is returned.

    Columns of the results table, for reference:
    `id` int(10) unsigned NOT NULL auto_increment,
    `query_db` smallint(5) unsigned NOT NULL,
    `subject_db` smallint(5) unsigned NOT NULL,
    `divergence` tinyint(3) unsigned NOT NULL,
    `evalue` tinyint(3) unsigned NOT NULL,
    `filename` text,
    `mod_time` datetime default NULL,
    `orthologs` longblob,
    `num_orthologs` int(10) unsigned NOT NULL,
    '''
    rowsPerInsert = 400  # not too huge, not too slow.
    rowPlaceholder = '(%s, %s, %s, %s, NOW(), %s, %s) '
    insertPrefix = 'INSERT IGNORE INTO {} (query_db, subject_db, divergence, evalue, mod_time, orthologs, num_orthologs) VALUES '.format(releaseTable(release, 'results'))

    def toDbRow(result):
        # Swap names for database ids and encode the orthologs, yielding the
        # values in the same order as the placeholders in rowPlaceholder.
        params, orthologs = result
        qdb, sdb, div, evalue = params
        return (
            genomeToId[qdb],
            genomeToId[sdb],
            divToId[div],
            evalueToId[evalue],
            # orthologs are encoded using db ids and floats, not strings.
            encodeOrthologs([(geneToId[qid], geneToId[sid], float(dist))
                             for qid, sid, dist in orthologs]),
            len(orthologs),
        )

    for batch in util.groupsOfN(resultsGen, rowsPerInsert):
        # Size the statement from the batch itself, because the final batch
        # can hold fewer than rowsPerInsert results.
        statement = insertPrefix + ', '.join([rowPlaceholder] * len(batch))
        # One flat argument list covering every placeholder in the statement.
        flatArgs = [value for result in batch for value in toDbRow(result)]
        with connCM() as conn:
            dbutil.insertSQL(conn, statement, args=flatArgs)
def getSequenceIdToSequenceDataMap(release, sequenceIds, conn=None):
    '''
    Fetch the stored data for a collection of sequence ids in a release.

    release: release identifier; passed to releaseTable() to name the
      sequence table for that release.
    sequenceIds: iterable of sequence ids, queried in groups of 1000.
    conn: optional database connection, passed through connCM(conn=conn).

    returns: dict mapping sequence id to a dict holding that row's
      external_sequence_id, genome_id and gene_name under the keys
      roundup_common.EXTERNAL_SEQUENCE_ID_KEY, roundup_common.GENOME_ID_KEY
      and roundup_common.GENE_NAME_KEY.  Only ids for which a row is returned
      appear in the result.
    '''
    seqIdToData = {}
    with connCM(conn=conn) as conn:
        for group in util.groupsOfN(sequenceIds, 1000):
            selectClause = 'SELECT id, external_sequence_id, genome_id, gene_name FROM {} '.format(releaseTable(release, 'sequence'))
            idList = ', '.join([str(seqId) for seqId in group])
            query = selectClause + ' WHERE id IN (' + idList + ')'
            for seqId, externalSeqId, genomeId, geneName in dbutil.selectSQL(sql=query, conn=conn):
                seqIdToData[seqId] = {
                    roundup_common.EXTERNAL_SEQUENCE_ID_KEY: externalSeqId,
                    roundup_common.GENOME_ID_KEY: genomeId,
                    roundup_common.GENE_NAME_KEY: geneName,
                }
    return seqIdToData