def __init__(self, args, assembly, ver, nbins, j):
    """Set up directories, external tools and input files for minipeaks.

    args     -- stored as-is on the instance
    assembly -- assembly name; "GRCh38" selects a fixed registry bed file
    ver      -- minipeaks version, used as a directory component
    nbins    -- number of bins, used as a directory component
    j        -- stored as-is (presumably a worker count -- TODO confirm)

    Raises Exception when no bwtool binary exists in any known location or
    when the compiled bwtool filter is missing.
    """
    self.args = args
    self.assembly = assembly
    self.ver = ver
    # The version used to be stored only under the misspelled name `var`;
    # keep that name as an alias so existing readers keep working.
    self.var = ver
    self.nbins = nbins
    self.j = j

    self.raw = paths.path(assembly, "raw")
    self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
    Utils.mkdir_p(self.minipeaks)

    # Use the first bwtool install that exists, in priority order.
    bwtoolCandidates = ("/data/cherrypy/bin/bwtool",
                        "/usr/local/bin/bwtool",
                        "/data/common/tools/bwtool")
    for candidate in bwtoolCandidates:
        if os.path.exists(candidate):
            self.bwtool = candidate
            break
    else:
        raise Exception("no bwtool found")

    # The filter binary lives next to this source file.
    self.bwtoolFilter = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'minipeaks/bin/read_json'))
    if not os.path.exists(self.bwtoolFilter):
        raise Exception("missing C++ bwtool filter; please compile?")

    if "GRCh38" == assembly:
        self.masterPeakFnp = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
    else:
        self.masterPeakFnp = os.path.join(self.raw, "cREs.bed")
    self.numPeaks = numLines(self.masterPeakFnp)
    printt(self.masterPeakFnp, "has", self.numPeaks)

    self.miniPeaksBedFnp = os.path.join(self.minipeaks, "miniPeakSites.bed.gz")
    self.debug = False
def _load(self):
    """Populate self.rankMethodToCtAndFileID from the "<assay>-list.txt" files.

    Each list file is tab separated; the first `ctIdx` columns are joined
    into the name of a signal-output file and column `ctIdx` holds the cell
    type.  Every kept row contributes [assay, cellType, signalFilePath].

    Raises Exception when a list file is missing.
    """
    assays = [("CTCF", 2), ("DNase", 2), ("Enhancer", 4), ("H3K27ac", 2),
              ("H3K4me3", 2), ("Insulator", 4), ("Promoter", 4)]
    # (prefix, suffix) pairs of cell types that are skipped on purpose.
    skipped = [
        ("LNCaP_clone_FGC_immortalized_cell_line_treated_with_1_nM",
         "hydroxy-17-methylestra-4"),
        ("SK-N-SH_immortalized_cell_line_treated_with",
         "all-trans-retinoic_acid_for_48_hours"),
    ]

    self.rankMethodToCtAndFileID = []
    for assay, ctCol in assays:
        listFn = (assay + "-list.txt").lower()
        listFnp = paths.path(self.assembly, "raw", listFn)
        if not os.path.exists(listFnp):
            raise Exception("missing " + listFnp)
        with open(listFnp) as f:
            rows = [x.rstrip('\n').split('\t') for x in f]

        for r in rows:
            ids = r[:ctCol]
            if 2 == len(ids):
                signalFn = '-'.join(ids) + ".txt"
            else:
                signalFn = '.'.join(['-'.join(ids[:2]),
                                     '-'.join(ids[2:4])]) + ".txt"
            signalFnp = paths.path(self.assembly, "raw", "signal-output",
                                   signalFn)
            if 0:  # existence check is switched off
                if not os.path.exists(signalFnp):
                    raise Exception("missing", signalFnp)
            ct = r[ctCol]
            if any(ct.startswith(pre) and ct.endswith(suf)
                   for pre, suf in skipped):
                continue
            self.rankMethodToCtAndFileID.append([assay, ct, signalFnp])
def run(self):
    """Load raw/tads.txt into the table with gene names mapped to IDs.

    Column 3 of each TAD row is a comma separated gene list; each entry is
    translated through extras/ensebleToID.txt (column 0 -> column 2) and
    stored as a Postgres array literal.  A gene missing from the lookup
    raises KeyError.
    """
    tadFnp = paths.path(self.assembly, "raw", "tads.txt")
    printt("reading", tadFnp)
    with open(tadFnp) as tadF:
        tadRows = [line.rstrip('\n').split('\t') for line in tadF]

    idFnp = paths.path(self.assembly, "extras", "ensebleToID.txt")
    printt("reading", idFnp)
    with open(idFnp) as idF:
        idRows = [line.rstrip('\n').split(',') for line in idF]
    geneIdFor = {}
    for toks in idRows:
        geneIdFor[toks[0]] = toks[2]

    buf = StringIO.StringIO()
    for tr in tadRows:
        geneIDs = [geneIdFor[g] for g in tr[3].split(',')]
        arrayLiteral = '{' + ','.join(geneIDs) + '}'
        buf.write('\t'.join([tr[1], tr[2], arrayLiteral]) + '\n')
    buf.seek(0)

    self.setupTable()
    self.curs.copy_from(buf, self.tableName, '\t',
                        columns=("accession", "tadName", "geneIDs"))
    printt("copied in TADs", self.curs.rowcount)
def doImport(curs, assembly):
    """(Re)create the <assembly>_rampage table from the RAMPAGE matrix.

    Matrix rows are keyed by transcript (column 0) and joined with the
    filtered TSS bed file (keyed on its column 3) to add gene, coordinates,
    strand and gene info.  One real-valued column is created per file ID in
    the matrix header, and maxVal is set to the greatest of them.
    """
    matrixFnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", matrixFnp)
    with gzip.open(matrixFnp) as f:
        header = f.readline().rstrip('\n').split('\t')
        rows = [line.rstrip('\n').split('\t') for line in f]
    printt("read header and", len(rows), "rows")

    tssFnp = paths.path(assembly, "hg19-tss-filtered.bed.gz")
    with gzip.open(tssFnp) as f:
        tsses = [line.rstrip('\n').split('\t') for line in f]
    tssByTranscript = {}
    for t in tsses:
        tssByTranscript[t[3]] = t

    printt("rewriting")
    outF = StringIO.StringIO()
    for row in rows:
        transcript = row[0]
        t = tssByTranscript[transcript]
        outRow = [
            transcript,
            t[6],  # gene
            t[0],  # chrom
            t[1],  # start
            t[2],  # stop
            t[5],  # strand
            t[7].replace("_", " "),  # gene info
        ] + row[1:]
        outF.write('\t'.join(outRow) + '\n')
    outF.seek(0)

    fileIDs = header[1:]
    cols = ["transcript", "ensemblid_ver", "chrom", "start", "stop",
            "strand", "geneInfo"] + fileIDs

    tableName = assembly + "_rampage"
    printt("copy into", tableName)
    fieldDefs = ','.join([fileID + " real" for fileID in fileIDs])
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn}
    (id serial PRIMARY KEY,
    transcript text,
    ensemblid_ver text,
    chrom text,
    start integer,
    stop integer,
    strand VARCHAR(1),
    geneInfo text,
    maxVal real,
    {fields}
    );""".format(tn=tableName, fields=fieldDefs))

    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("inserted", curs.rowcount)

    curs.execute("""
    UPDATE {tn}
    SET maxVal = GREATEST( {fields} )
    """.format(tn=tableName, fields=','.join(fileIDs)))
def _overlap(self, bedFnp):
    """Intersect GWAS SNPs with cREs and load the pairs into the overlap table.

    bedFnp -- GWAS bed file; columns 1-4 and 11+ are piped through
              `bedtools intersect -wo` against the sorted cRE bed file.

    Each intersect line must carry an author/pubmed/trait token containing
    '_' (after '-' is replaced), an "rs" SNP id and an "EH3" accession;
    otherwise the line is printed and an Exception is raised.
    """
    printt("******************* GWAS overlap")
    self._setupOverlap()

    cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
    if not os.path.exists(cresFnp):
        rawFnp = paths.path(self.assembly, "raw", "cREs.bed")
        Utils.sortFile(rawFnp, cresFnp)

    printt("running bedtools intersect...")
    cmds = [cat(bedFnp),
            '|', "cut -f -4,11-",
            '|', "bedtools intersect",
            "-a", "-",
            "-b", cresFnp,
            "-wo"]
    snpsIntersecting = Utils.runCmds(cmds)
    print("example", snpsIntersecting[0].rstrip('\n').split('\t'))

    def reject(line, toks, message):
        # Show the offending line in both raw and split form, then abort.
        print(line)
        print(toks)
        raise Exception(message)

    printt("rewriting...")
    outF = StringIO.StringIO()
    count = {}
    for r in snpsIntersecting:
        toks = r.rstrip('\n').split('\t')
        snp = toks[3]
        authorPubmedTrait = toks[4].replace('-', '_')
        accession = toks[9]

        if '_' not in authorPubmedTrait:
            reject(r, toks, "bad authorPubmedTrait?")
        if not snp.startswith("rs"):
            reject(r, toks, "bad rs?")
        if not accession.startswith("EH3"):
            reject(r, toks, "bad line?")

        outF.write('\t'.join([authorPubmedTrait, accession, snp]) + '\n')
        count[authorPubmedTrait] = count.get(authorPubmedTrait, 0) + 1

    # Echo the last row that was written as a sample of the output format.
    print("example", '\t'.join([authorPubmedTrait, accession, snp]))
    for k, v in count.iteritems():
        print("%s: %d" % (k, v))
    outF.seek(0)

    printt("copying into DB...")
    cols = "authorPubmedTrait accession snp".split(' ')
    self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols)
    importedNumRows(self.curs)

    makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
def __init__(self, assembly, ver, nbins, assay):
    """Record the run parameters and derive the input/output locations.

    The master peak file is raw/cREs.bed, except for "GRCh38" where a fixed
    registry path is used.
    """
    self.assembly = assembly
    self.ver = ver
    self.nbins = nbins
    self.assay = assay

    self.raw = paths.path(assembly, "raw")
    self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))

    if "GRCh38" == assembly:
        self.masterPeakFnp = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
    else:
        self.masterPeakFnp = os.path.join(self.raw, "cREs.bed")
def computeIntersections(self):
    """Run all intersect jobs in parallel and write two gzipped result files.

    peakIntersections.json.gz holds one line per accession:
    accession, JSON of {label: [fileIDs]} for "tf", and the same for
    "histone".  chipseqIntersectionsWithCres.json.gz holds one line per
    file ID with the JSON of its accessions.  Both go under
    extras/<runDate>/.
    """
    sortedBed = paths.path(self.assembly, "extras", "cREs.sorted.bed")
    if not os.path.exists(sortedBed):
        Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"),
                       sortedBed)

    jobs = self.makeJobs()
    results = Parallel(n_jobs=self.args.j)(
        delayed(runIntersectJob)(job, sortedBed) for job in jobs)

    printt("\n")
    printt("merging intersections into hash...")
    tfImap = {}
    fileJsons = []
    filesToAccessions = {}
    for fileJson, accessions in results:
        if not accessions:
            continue
        for etype, label, fileID, accs in accessions:
            filesToAccessions[fileID] = accs
            for acc in accs:
                perType = tfImap.setdefault(acc, {"tf": {}, "histone": {}})
                perType[etype].setdefault(label, []).append(fileID)
        fileJsons += fileJson
    printt("completed hash merge")
    printt("runDate:", self.runDate)

    def writeLines(fn, lines):
        # Write the lines to extras/<runDate>/<fn>, gzip compressed.
        outFnp = paths.path(self.assembly, "extras", self.runDate, fn)
        Utils.ensureDir(outFnp)
        with gzip.open(outFnp, 'w') as f:
            for line in lines:
                f.write(line + '\n')
        printt("wrote", outFnp)

    writeLines("peakIntersections.json.gz",
               ('\t'.join([k, json.dumps(v["tf"]), json.dumps(v["histone"])])
                for k, v in tfImap.iteritems()))
    writeLines("chipseqIntersectionsWithCres.json.gz",
               ('\t'.join([k, json.dumps(v)])
                for k, v in filesToAccessions.iteritems()))
def run(args, DBCONN):
    """Write the minipeaks cqlsh import script and print how to run it.

    args   -- uses .assembly (restrict to one assembly), .j (cores),
              .sample (forces a single core and is passed on to the
              importer) and .yes (skip the confirmation prompt)
    DBCONN -- accepted but not used by this function

    The script is written to ../../../minipeak_import.txt relative to this
    source file.
    """
    fnp = os.path.join(os.path.dirname(__file__),
                       '../../../minipeak_import.txt')

    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]

    cores = args.j
    if args.sample:
        cores = 1

    with open(fnp, 'w') as outF:
        for assembly in assemblies:
            for ver, nbins in [(6, 20)]:
                if not args.yes:
                    s = "(Re)import %s, version %s, %s nbins?" % (
                        assembly, ver, nbins)
                    if not GetYesNoToQuestion.immediate(s):
                        print("skipping", assembly, ver, nbins)
                        continue
                # The unused local holding the minipeaks directory path was
                # dropped; nothing in this function read it.
                outF.write("use minipeaks;\n")
                im = ImportMinipeaks(assembly, nbins, ver, cores)
                im.prepImportAndWriteScript(outF, args.sample)
    printWroteNumLines(fnp)

    cmds = ['CQLSH_HOST="{hosts}"'.format(hosts=Config.cassandra[0]),
            "cqlsh", "-f", fnp]
    print("please run this command:")
    print(' '.join(cmds))
def run(self):
    """Spot-check DB z-scores against the signal files for sampled cREs.

    A random subset of raw/cREs.bed (awk, rand() <= .00001) on known
    chromosomes is selected.  For every (rank method, cell type, file) in
    self.rankMethodToCtAndFileID, the z-score grepped from the file is
    compared with the value from CRE(...).allRanks().  The first mismatch is
    reported on stderr and raises Exception("error").
    """
    creFnp = paths.path(self.assembly, "raw", "cREs.bed")
    sampleCmds = [
        "cat", creFnp, '|',
        """awk 'BEGIN {srand()} !/^$/ { if(rand() <= .00001) print $0}'"""
    ]
    sampled = [x.rstrip('\n').split('\t')
               for x in Utils.runCmds(sampleCmds) if x]
    sampled = [toks for toks in sampled
               if toks[0] in chroms[self.assembly]]
    print("selected", len(sampled), "cres")

    # chr1    10244   10357   EH37D0000001    EH37E1055273    Promoter-like   proximal
    cres = [CREnt(*toks[:5]) for toks in sampled]

    for cre in cres:
        allRanks = CRE(self.pgSearch, cre.accession, self.cache).allRanks()
        for rm, ct, signalFnp in self.rankMethodToCtAndFileID:
            grepOut = Utils.runCmds(['grep', cre.mpName, signalFnp])
            zscore = float(grepOut[0].split('\t')[1])
            # The cell type map is 1-based; rank arrays are 0-based.
            ctIdx = self.rankMethodToIDxToCellType[rm][ct] - 1
            zscoreDb = allRanks[rm.lower()][ctIdx]
            if isclose(zscore, zscoreDb, 0.001):
                sys.stdout.write('.')
                continue
            eprint("PROBLEM")
            eprint(cre)
            eprint(rm, ct)
            eprint("from", signalFnp)
            eprint(zscore)
            eprint("from DB lookup")
            eprint(zscoreDb)
            # eprint(allRanks)
            raise Exception("error")
        print(cre.accession, "ok")
def _makeFile(self, assay, title):
    """Export, per cell type, the cREs whose <assay> z-score exceeds 1.64.

    assay -- assay name; used for the DB column prefix and output directory
    title -- key into self.ctmap giving the z-score array index per cell type

    One TSV (chrom, start, stop, accession, z-score; highest z-score first)
    is written per cell type under export/<assay>-like/.
    """
    print("********************", title)
    datasets = self.pgc.datasets_multi(assay)
    self.assaymap = {assay: datasets}
    tableName = self.assembly + "_cre_all"

    for ct in sorted(set(datasets.keys())):
        print(ct)
        cti = self.ctmap[title][ct]
        self.curs.execute("""
        SELECT accession, {assay}_zscores[{cti}], chrom, start, stop
        FROM {tn}
        WHERE {assay}_zscores[{cti}] > 1.64
        ORDER BY 2 DESC
        """.format(assay=assay, cti=cti, tn=tableName))
        rows = self.curs.fetchall()

        # Keep only characters that are safe in a file name.
        ctSan = "".join(ch for ch in ct if ch.isalnum() or ch == '_')
        info = self.assaymap[assay][ct]
        dnase_expID = info["dnase_expid"]
        other_expID = info["other_expid"]
        outFn = '_'.join([ctSan, dnase_expID, other_expID]) + ".tsv"
        outFnp = paths.path(self.assembly, "export", assay + "-like", outFn)
        Utils.ensureDir(outFnp)

        with open(outFnp, 'w') as outF:
            for r in rows:
                fields = (r[2], r[3], r[4], r[0], r[1])
                outF.write('\t'.join(map(str, fields)) + '\n')
        printWroteNumLines(outFnp)
def __init__(self, curs, assembly, args):
    """Store the cursor and arguments; derive the table name and input file.

    The table is "<assembly>_nine_state" and the input is
    "<assembly>-Look-Up-Matrix.txt" under the assembly's path.
    """
    self.args = args
    self.curs = curs
    self.assembly = assembly

    matrixFn = assembly + "-Look-Up-Matrix.txt"
    self.inFnp = paths.path(assembly, matrixFn)
    self.tableName = assembly + "_nine_state"
def run(self):
    """Load the mm10 <-> hg19 synteny pairs into the liftover table.

    Each line of mm10/Two-Way-Synteny.txt is tab separated with an hg19
    accession in column 0 and an mm10 accession in column 1; both are
    resolved through self.ccREmaps (KeyError when unknown).

    Fix: the accumulator was created as `mmToHG` but appended to and read
    as `mmToHg`, which raised NameError on the first line.
    """
    fnp = paths.path("mm10", "Two-Way-Synteny.txt")
    self.setupLiftover()

    printt("reading", fnp)
    mmToHg = []
    with open(fnp, 'r') as f:
        for line in f:
            toks = line.strip().split('\t')
            # Each map entry unpacks to four values; only the last mouse
            # value and all four human values are used.
            _mc, _ms, _me, md = self.ccREmaps["mm10"][toks[1]]
            hc, hs, he, hd = self.ccREmaps["hg19"][toks[0]]
            mmToHg.append([hc, hs, he, md, toks[1], hd, toks[0]])

    cols = "chrom start stop mouseAccession humanAccession overlap".split(' ')
    printt("writing stringio...")
    outF = StringIO.StringIO()
    for r in mmToHg:
        # NOTE(review): rows built above have 7 items (indexes 0-6), so r[7]
        # raises IndexError for any non-empty input.  The source of the
        # "overlap" column is not visible here -- TODO confirm which value
        # belongs in this position before relying on this import.
        outF.write("\t".join([r[0], r[1], r[2], r[4], r[6], r[7]]) + '\n')
    outF.seek(0)

    printt("copy into db...")
    self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
    printt("\tok", self.curs.rowcount)

    makeIndex(self.curs, self.tableName, ["mouseAccession", "humanAccession"])
def _import(self):
    """Recreate the table and fill it from "<assembly>-Look-Up-Matrix.txt".

    Every stripped line of the matrix is handed to ontologyToCellTypes in
    parallel (self.args.j workers); each call returns a sequence of rows,
    which are copied into the cellTypeName and biosample_term_id columns.
    The synonyms column is created but not filled here.

    Change: the never-read local `lookup` list was removed.
    """
    fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
    printt("parsing", fnp)
    with open(fnp) as f:
        rows = [x.strip() for x in f]

    outRows = Parallel(n_jobs=self.args.j)(
        delayed(ontologyToCellTypes)(line) for line in rows)

    printt('***********', "drop and create", self.tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tableName};
    CREATE TABLE {tableName}
    (id serial PRIMARY KEY,
    cellTypeName text,
    biosample_term_id text,
    synonyms jsonb
    );""".format(tableName=self.tableName))

    printt('***********', "import lookup")
    printt("rewrite rows")
    outF = StringIO.StringIO()
    for row in outRows:
        for r in row:
            outF.write('\t'.join(r) + '\n')
    outF.seek(0)

    cols = ["cellTypeName", "biosample_term_id"]
    self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
    printt("copied in", self.curs.rowcount)
class FCPaths:
    """File locations rooted at paths.fantomcat, plus the hg19 cRE bed file."""

    base = paths.fantomcat

    # cRE input
    cres = paths.path("hg19", "raw/cREs.sorted.bed.gz")

    # gene tables and per-gene JSON directory
    genetsv = os.path.join(base, "gene.info.tsv")
    genebed = os.path.join(base, "gene.info.bed")
    genesdir = os.path.join(base, "genes")

    # intersections and their statistics
    intersected = os.path.join(base, "gene.info.intersected.bed")
    global_statistics = os.path.join(base, "global_statistics.json")
    enhancer_intersected = os.path.join(base,
                                        "cREs.enhancers.intersected.bed")
    CAGE_intersected = os.path.join(base, "cREs.CAGE.intersected.bed")

    # 2kb TSS files
    twokb = os.path.join(base, "gene.2kbtss.bed")
    twokb_intersected = os.path.join(base, "gene.2kbtss.intersected.bed")
    twokb_statistics = os.path.join(base, "twokb_statistics.json")

    # enhancer / CAGE bed inputs
    permissive_enhancers = os.path.join(base, "permissive_enhancers.bed.gz")
    robust_CAGE = os.path.join(base, "robust.cage.bed.gz")

    # files prepared for DB import
    forimport = dict(
        genes=os.path.join(base, "gene.import.tsv"),
        intersections=os.path.join(base, "intersections.tsv"),
        twokb_intersections=os.path.join(base, "twokb.intersections.tsv"),
    )

    zenbu_track = os.path.join(base, "web_zenbu_downloads",
                               "5BFANTOMCAT5DRobustgene.bed")

    @staticmethod
    def genepath(acc):
        """Return the path of the JSON file for gene accession `acc`."""
        return os.path.join(FCPaths.genesdir, acc + ".json")
def _load_ccRE_map(assembly):
    """Map column 4 of raw/cREs.sorted.bed to a tuple of columns 0-3.

    Lines are stripped and split on tabs; when a key repeats, the last
    line wins.
    """
    bedFnp = paths.path(assembly, "raw", "cREs.sorted.bed")
    with open(bedFnp, 'r') as f:
        rows = (line.strip().split('\t') for line in f)
        return {toks[4]: tuple(toks[:4]) for toks in rows}
def metadata(curs, assembly):
    """(Re)create <assembly>_rampage_info with one row per RAMPAGE file ID.

    File IDs come from the header of the RAMPAGE matrix.  For each one the
    experiment is fetched through QueryDCC, and the strand is taken from the
    first word of the matching file's output_type ("plus" -> '+',
    "minus" -> '-'); anything else raises Exception.
    """
    matrixFnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", matrixFnp)
    with gzip.open(matrixFnp) as f:
        fileIDs = f.readline().rstrip('\n').split('\t')[1:]

    tableName = assembly + "_rampage_info"
    printt("dropping and creating", tableName)
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn}
    (id serial PRIMARY KEY,
    expID text,
    fileID text,
    biosample_term_name text,
    biosample_type text,
    biosample_summary text,
    tissue text,
    strand VARCHAR(1)
    ) """.format(tn=tableName))

    outF = StringIO.StringIO()

    mc = MemCacheWrapper(Config.memcache) if Config.memcache else None
    qd = QueryDCC(auth=False, cache=mc)

    strandSymbols = {"plus": '+', "minus": '-'}
    for fileID in fileIDs:
        exp = qd.getExpFromFileID(fileID)
        expID = exp.encodeID
        tissue = DetermineTissue.TranslateTissue(assembly, exp).strip()
        for expFile in exp.files:
            if expFile.fileID != fileID:
                continue
            print(expFile)
            print(expFile.output_type)
            strandWord = expFile.output_type.split()[0]
            if strandWord not in strandSymbols:
                raise Exception("unknown strand " + expFile.output_type)
            strand = strandSymbols[strandWord]
            outF.write('\t'.join([
                expID, fileID, exp.biosample_term_name, exp.biosample_type,
                exp.getExpJson()["biosample_summary"], tissue, strand
            ]) + '\n')
    outF.seek(0)

    cols = ["expID", "fileID", "biosample_term_name", "biosample_type",
            "biosample_summary", "tissue", "strand"]
    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("\tok", curs.rowcount)
def run(self):
    """Create the table and bulk-copy raw/vista.tsv into it.

    The file is streamed as-is; its two tab separated columns go into
    accession and vistaids.
    """
    self.setupTable()

    vistaFnp = paths.path(self.assembly, "raw", "vista.tsv")
    printt("reading", vistaFnp)
    targetCols = ("accession", "vistaids")
    with open(vistaFnp) as inF:
        self.curs.copy_from(inF, self.tableName, '\t', columns=targetCols)
    printt("copied in %d vista entries" % self.curs.rowcount)
def loadJobs(assembly, runDate):
    """Return the jobs parsed from extras/<runDate>/cistromeJobs.json.gz."""
    jobsFnp = paths.path(assembly, "extras", runDate, "cistromeJobs.json.gz")
    printt("reading", jobsFnp)
    with gzip.open(jobsFnp) as inF:
        loaded = json.load(inF)
    printt("loaded", len(loaded))
    return loaded
def _getFileIDs(self, fn):
    """Return (assay, sorted file IDs) for a list file under raw/.

    fn -- file name; the text before its first '-' is the assay name.
    File IDs are read from column 1 of the tab separated file.
    """
    assay = fn.split('-')[0]
    printt("***********************", self.assembly, assay)

    listFnp = paths.path(self.assembly, "raw", fn)
    with open(listFnp) as inF:
        rows = [line.rstrip('\n').split('\t') for line in inF]

    fileIDs = [row[1] for row in rows]
    fileIDs.sort()
    return assay, fileIDs
def __init__(self, args, assembly, runDate=None):
    """Store the arguments and make sure the jobs file's directory exists.

    runDate -- directory name for this run; when falsy, today's date in
               YYYY-MM-DD form is used.
    """
    self.args = args
    self.assembly = assembly
    self.runDate = runDate or arrow.now().format('YYYY-MM-DD')

    self.jobsFnp = paths.path(self.assembly, "extras", self.runDate,
                              "jobs.json.gz")
    Utils.ensureDir(self.jobsFnp)
def computeIntersections(args, assembly):
    """Intersect every cistrome job with the sorted cRE bed file.

    The sorted bed file is created from raw/cREs.bed when missing.  Jobs run
    in parallel with args.j workers, and the results are handed to
    processResults together with the output path under extras/<today>/.
    """
    sortedBed = paths.path(assembly, "extras", "cREs.sorted.bed")
    if not os.path.exists(sortedBed):
        rawBed = paths.path(assembly, "raw", "cREs.bed")
        Utils.sortFile(rawBed, sortedBed)

    runDate = arrow.now().format('YYYY-MM-DD')
    jobs = makeJobs(assembly, paths.cistrome("data", "raw"), runDate)

    runner = Parallel(n_jobs=args.j)
    results = runner(delayed(cistromeIntersectJob)(job, sortedBed)
                     for job in jobs)

    print("\n")
    printt("merging intersections into hash...")
    outFnp = paths.path(assembly, "extras", runDate,
                        "cistromeIntersections.json.gz")
    processResults(results, outFnp)
def run(self):
    """Write the rows from self._getRowsFromFiles() to geneExp/<today>.tsv.gz.

    self.ensemblIDtoGeneName() is called first; each row is then written as
    one tab separated line.
    """
    self.ensemblIDtoGeneName()

    stamp = arrow.now().format('YYYY-MM-DD')
    outFnp = paths.path(self.assembly, "geneExp", stamp + ".tsv.gz")
    Utils.ensureDir(outFnp)
    with gzip.open(outFnp, 'wb') as outF:
        for fields in self._getRowsFromFiles():
            line = '\t'.join(fields)
            outF.write(line + '\n')
    printWroteNumLines(outFnp)
def run(self):
    """Run the GWAS import steps in order.

    The sorted GWAS bed file for this assembly and version feeds _gwas and
    _overlap; the header returned by _enrichment feeds _studies.

    Change: two path locals that were computed but never read (the
    gwas/h3k27ac directory and the unsorted ".bed" file) were removed.
    """
    bedFnp = paths.gwasFnp(self.assembly, self.version, ".sorted.bed")

    self._gwas(bedFnp)
    header = self._enrichment()
    self._studies(header)
    self._overlap(bedFnp)
def run(self):
    """Create the table and bulk-copy the gzipped intersections file into it.

    The input is extras/<runDate>/<self._tsuffix>.json.gz; its three tab
    separated columns go into accession, tf and histone.
    """
    self.setupTable()

    inFn = "%s.json.gz" % self._tsuffix
    inFnp = paths.path(self.assembly, "extras", self.runDate, inFn)
    printt("copying in data", inFnp)
    with gzip.open(inFnp) as inF:
        self.curs.copy_from(inF, self.tableName, '\t',
                            columns=["accession", "tf", "histone"])
    importedNumRows(self.curs)
def makeSubsampledAccessions(assembly, ver, nbins, accessionsFnp):
    """Write a random ~1% sample of minipeak accessions to accessionsFnp.

    Column 4 of miniPeakSites.bed.gz is extracted and each non-empty line
    is kept when awk's rand() <= .01.
    """
    Utils.ensureDir(accessionsFnp)

    sitesFnp = os.path.join(
        paths.path(assembly, "minipeaks", str(ver), str(nbins)),
        "miniPeakSites.bed.gz")

    pipeline = [
        "zcat", sitesFnp,
        "| awk '{ print $4 }' ",
        "| awk 'BEGIN {srand()} !/^$/ { if (rand() <= .01) print $0}'",
        ">", accessionsFnp,
    ]
    Utils.runCmds(pipeline)
    printWroteNumLines(accessionsFnp)
def __init__(self, assembly, nbins, ver, cores):
    """Connect to Cassandra and select (creating if needed) the minipeaks keyspace.

    Note the parameter order: nbins comes before ver.
    """
    self.assembly = assembly
    self.nbins = nbins
    self.ver = ver
    self.cores = cores

    self.cluster = Cluster(Config.cassandra)
    self.session = self.cluster.connect()
    createKeyspace = """CREATE KEYSPACE IF NOT EXISTS minipeaks
WITH replication = {'class':'SimpleStrategy', 'replication_factor':1};"""
    self.session.execute(createKeyspace)
    self.session.set_keyspace("minipeaks")

    self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
def run(self):
    """Generate ontology terms and save them as gzipped JSON.

    The `run` function of the "10_generate_ontology_actual" module is called
    with the uberon, EFO and OBI sources, and its result is written to
    ontology/<downloadDate>/ontology.json.gz.
    """
    generate = getattr(import_module("10_generate_ontology_actual"), "run")

    downloadDate = '2017-10Oct-25'

    # Remote URLs are used; the local copies are kept as a disabled option.
    useRemote = 1
    if useRemote:
        uberon_url = "http://ontologies.berkeleybop.org/uberon/composite-metazoan.owl"
        efo_url = "http://sourceforge.net/p/efo/code/HEAD/tree/trunk/src/efoinowl/InferredEFOOWLview/EFO_inferred.owl?format=raw"
        obi_url = "http://purl.obolibrary.org/obo/obi.owl"
    else:
        uberon_url = paths.path("ontology", downloadDate,
                                "composite-metazoan.owl")
        efo_url = paths.path("ontology", downloadDate, "EFO_inferred.owl")
        obi_url = paths.path("ontology", downloadDate, "obi.owl")

    printt("running ENCODE DCC generate ontology...")
    terms = generate(uberon_url, efo_url, obi_url)

    outFnp = paths.path("ontology", downloadDate, "ontology.json.gz")
    Utils.ensureDir(outFnp)
    printt("done; about to write", outFnp)
    with gzip.open(outFnp, 'wb') as outF:
        json.dump(terms, outF)
    printWroteNumLines(outFnp)
def run(self):
    """Load extras/TADs.bed.gz into the table.

    Each line is stripped of trailing whitespace, split on tabs and written
    back tab separated, then copied into chrom, start, stop and tadName.
    """
    tadFnp = paths.path(self.assembly, "extras", "TADs.bed.gz")
    printt("reading", tadFnp)
    with gzip.open(tadFnp) as inF:
        tadRows = [line.rstrip().split('\t') for line in inF]

    buf = StringIO.StringIO()
    for fields in tadRows:
        buf.write('\t'.join(fields))
        buf.write('\n')
    buf.seek(0)

    self.setupTable()
    targetCols = ("chrom", "start", "stop", "tadName")
    self.curs.copy_from(buf, self.tableName, '\t', columns=targetCols)
    printt("copied in TADs", self.curs.rowcount)
def _import(self):
    """Recreate the ontology table and fill it from ontology.json.gz.

    The JSON maps an ontology ID to an info object, for example:

        "AEO:0001021": {
            "assay": [], "category": [], "developmental": [],
            "name": "stem cell population", "objectives": [],
            "organs": [], "part_of": [], "preferred_name": "",
            "slims": [], "synonyms": [], "systems": [], "types": []
        }

    runOntology is applied to every (oid, info) pair in parallel and must
    return one ready-to-copy line per pair (oid and info, tab separated).
    Raises Exception when no rows were inserted.
    """
    printt('***********', "drop and create", self.tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tableName};
    CREATE TABLE {tableName}
    (id serial PRIMARY KEY,
    oid text,
    info jsonb
    );""".format(tableName=self.tableName))

    printt('***********', "import ontology info")
    downloadDate = '2017-10Oct-25'
    inFnp = paths.path("ontology", downloadDate, "ontology.json.gz")
    with gzip.open(inFnp, "rb") as inF:
        kv = json.load(inF)

    worker = Parallel(n_jobs=self.args.j)
    lines = worker(delayed(runOntology)(oid, infos)
                   for oid, infos in kv.iteritems())

    outF = StringIO.StringIO()
    for line in lines:
        outF.write(line + '\n')
    outF.seek(0)

    self.curs.copy_from(outF, self.tableName, '\t', columns=["oid", "info"])
    inserted = self.curs.rowcount
    if 0 == inserted:
        raise Exception("error: no rows inserted")
    printt("imported", self.curs.rowcount, "rows", self.tableName)
def loadGidsToDbIds(assembly):
    """Read extras/ensebleToID.txt and return (gidsToDbID, requiredGids).

    Every line must have exactly three comma separated tokens, otherwise the
    row is printed and Exception("wrong num toks") is raised.  gidsToDbID
    maps both token 0 and token 1 to token 2; requiredGids maps only token 0
    to token 2.
    """
    idFnp = paths.path(assembly, "extras", "ensebleToID.txt")
    printt("reading", idFnp)
    with open(idFnp) as inF:
        rows = [line.rstrip('\n').split(',') for line in inF if line]

    gidsToDbID = {}
    requiredGids = {}
    for toks in rows:
        if len(toks) != 3:
            print(toks)
            raise Exception("wrong num toks")
        gid, altGid, dbID = toks
        gidsToDbID[gid] = dbID
        requiredGids[gid] = dbID
        gidsToDbID[altGid] = dbID
    return gidsToDbID, requiredGids