def run(self): tableName = self.assembly + "_snps" printt("dropping and creating", tableName) self.curs.execute(""" DROP TABLE IF EXISTS {tableName}; CREATE TABLE {tableName} (id serial PRIMARY KEY, snp varchar(15), chrom text, start integer, stop integer ); """.format(tableName=tableName)) fns = { "mm10": "dbsnps.142.mm10.ucsc_table_browser.tsv.gz", "hg19": "dbsnps.144.hg19.ucsc_table_browser.tsv.gz", # "hg38": "dbsnps.144.hg38.ucsc_table_browser.tsv.gz" } fnp = os.path.join(Dirs.dbsnps, fns[self.assembly]) printt("importing", fnp) with gzip.open(fnp) as f: cols = ["chrom", "start", "stop", "snp"] self.curs.copy_from(f, tableName, '\t', columns=cols) printt("imported", self.curs.rowcount) makeIndex(self.curs, tableName, ["snp", "chrom"]) makeIndexIntRange(self.curs, tableName, ["start", "stop"])
def _gwas(self, fnp): printt("******************* GWAS") printt("reading", fnp) with open(fnp) as f: rows = [r.rstrip().split('\t') for r in f if r] self.setupGWAS() printt("rewrite rows") outF = StringIO.StringIO() for r in rows: if 'Lead' == r[4]: r[4] = r[3] r[5] = "{%s}" % r[5].replace('*', "-1") r[2] = str(int(r[2]) + 1) r[-1] = r[-1].replace('-', '_') outF.write('\t'.join(r) + '\n') print("example", '\t'.join(r)) outF.seek(0) cols = "chrom start stop snp taggedSNP r2 ldblock trait author pubmed authorPubmedTrait".split(' ') self.curs.copy_from(outF, self.tableNameGwas, '\t', columns=cols) importedNumRows(self.curs) makeIndex(self.curs, self.tableNameGwas, ["chrom", "authorPubmedTrait", "ldblock", "snp"]) makeIndexIntRange(self.curs, self.tableNameGwas, ["start", "stop"])
def run(self): fnp = paths.path("mm10", "Two-Way-Synteny.txt") self.setupLiftover() printt("reading", fnp) mmToHG = [] with open(fnp, 'r') as f: for line in f: line = line.strip().split('\t') mc, ms, me, md = self.ccREmaps["mm10"][line[1]] hc, hs, he, hd = self.ccREmaps["hg19"][line[0]] mmToHg.append([hc, hs, he, md, line[1], hd, line[0]]) cols = "chrom start stop mouseAccession humanAccession overlap".split(' ') printt("writing stringio...") outF = StringIO.StringIO() for r in mmToHg: outF.write("\t".join([r[0], r[1], r[2], r[4], r[6], r[7]]) + '\n') outF.seek(0) printt("copy into db...") self.curs.copy_from(outF, self.tableName, '\t', columns=cols) printt("\tok", self.curs.rowcount) makeIndex(self.curs, self.tableName, ["mouseAccession", "humanAccession"])
def run(self): with db_connect_single(os.path.realpath(__file__)) as conn: with conn.cursor() as curs: tn = "r_rnas_" + self.assembly makeIndex(curs, tn, ["celltype"]) tn = "r_expression_" + self.assembly makeIndex(curs, tn, ["dataset"])
def _overlap(self, bedFnp): printt("******************* GWAS overlap") self._setupOverlap() cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed") if not os.path.exists(cresFnp): Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"), cresFnp) printt("running bedtools intersect...") cmds = [cat(bedFnp), '|', "cut -f -4,11-", '|', "bedtools intersect", "-a", "-", "-b", cresFnp, "-wo" ] snpsIntersecting = Utils.runCmds(cmds) print("example", snpsIntersecting[0].rstrip('\n').split('\t')) printt("rewriting...") outF = StringIO.StringIO() count = {} for r in snpsIntersecting: toks = r.rstrip('\n').split('\t') snp = toks[3] authorPubmedTrait = toks[4].replace('-', '_') accession = toks[9] if '_' not in authorPubmedTrait: print(r) print(toks) raise Exception("bad authorPubmedTrait?") if not snp.startswith("rs"): print(r) print(toks) raise Exception("bad rs?") if not accession.startswith("EH3"): print(r) print(toks) raise Exception("bad line?") outF.write('\t'.join([authorPubmedTrait, accession, snp]) + '\n') if authorPubmedTrait not in count: count[authorPubmedTrait] = 0 count[authorPubmedTrait] += 1 print("example", '\t'.join([authorPubmedTrait, accession, snp])) for k, v in count.iteritems(): print("%s: %d" % (k, v)) outF.seek(0) printt("copying into DB...") cols = "authorPubmedTrait accession snp".split(' ') self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols) importedNumRows(self.curs) makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
def run(self): fnp, filetype = paths.gene_files[self.assembly] ts = Transcripts(fnp, filetype) rows = [] for t in ts.getTranscripts(): tss = t.getTSS() rows.append([t.geneid_, tss[0], tss[1], tss[2], tss[3]]) rows.sort(key=lambda x: x[0]) ret = [] for gid, group in groupby(rows, lambda x: x[0]): group = list(group) if 1 == len(group): g = group[0] r = [g[0], g[1], str(g[2]), str(g[3])] ret.append(r) continue chroms = list(set([x[1] for x in group])) if 1 != len(chroms): print("Warning: multiple chroms") #raise Exception("mutliple chroms") continue start = min([x[2] for x in group]) end = max([x[3] for x in group]) r = [gid, chroms[0], str(start), str(end)] print("merge", end - start) ret.append(r) print("merged from", len(rows), "to", len(ret)) tableName = self.assembly + "_tss_info" self.curs.execute(""" DROP TABLE IF EXISTS {tableName}; CREATE TABLE {tableName} (id serial PRIMARY KEY, ensemblid_ver text, chrom text, start integer, stop integer );""".format(tableName=tableName)) cols = ["ensemblid_ver", "chrom", "start", "stop"] outF = StringIO.StringIO() for r in ret: outF.write('\t'.join(r) + '\n') outF.seek(0) self.curs.copy_from(outF, tableName, '\t', columns=cols) printt("updated", tableName) makeIndex(self.curs, tableName, ["ensemblid_ver"])
def run(self): gidsToDbID, requiredGids = loadGidsToDbIds(self.assembly) genes = GeneInfo(self.assembly).genes ret = {} for gid, info in genes.iteritems(): ret[gid] = GeneRow(gid, info, gidsToDbID) printt("merged", len(ret)) count = len(ret) for gid, ver in requiredGids.iteritems(): if gid not in ret: ret[gid] = GeneRow(gid, {}, gidsToDbID) printt("loaded missing genes for DB", len(ret) - count) ret = ret.values() print("example\n", ret[0].output()) tableName = self.assembly + "_gene_info" self.curs.execute(""" DROP TABLE IF EXISTS {tableName} CASCADE; CREATE TABLE {tableName} (id serial PRIMARY KEY, geneid integer, ensemblid text, ver integer, ensemblid_ver text, approved_symbol text, chrom text, start integer, stop integer, strand varchar(1), gene_type text, info jsonb); """.format(tableName=tableName)) cols = [ "geneid", "ensemblid", "ver", "ensemblid_ver", "approved_symbol", "chrom", "start", "stop", "strand", "gene_type", "info" ] outF = StringIO.StringIO() for r in ret: outF.write(r.output() + '\n') outF.seek(0) self.curs.copy_from(outF, tableName, '\t', columns=cols) print("updated", tableName, self.curs.rowcount) makeIndex(self.curs, tableName, [ "geneid", "ensemblid_ver", "ensemblid", "approved_symbol", "chrom" ])
def run(self): tn = self.tableName with db_connect_single(os.path.realpath(__file__)) as conn: with conn.cursor() as curs: makeIndex(curs, tn, ["accession", "chrom"]) #makeIndexInt4Range(curs, tn, ["start", "stop"]) makeIndexRev(curs, tn, [ "maxz", "dnase_max", "h3k4me3_max", "h3k27ac_max", "ctcf_max" ]) conn.commit() for col in self.zscore_cols: makeIndexArr(curs, tn, col, conn)
def _import_tads(self): tableName = self.assembly + "_all_tads" printt("dropping and creating", tableName) self.curs.execute(""" DROP TABLE IF EXISTS {tableName}; CREATE TABLE {tableName} (id serial PRIMARY KEY, acc TEXT, chrom TEXT, start INTEGER, stop INTEGER); """.format(tableName=tableName)) with open( os.path.join(rootDir, "%s/CTCF/all_tads.tsv" % self.assembly), "r") as f: cols = ["acc", "chrom", "start", "stop"] self.curs.copy_from(f, tableName, '\t', columns=cols) printt("imported", self.curs.rowcount) makeIndex(self.curs, tableName, ["acc", "chrom"])
def _import_biosamples(self): tableName = self.assembly + "_tad_biosamples" printt("dropping and creating", tableName) self.curs.execute(""" DROP TABLE IF EXISTS {tableName}; CREATE TABLE {tableName} (id serial PRIMARY KEY, acc TEXT, name TEXT); """.format(tableName=tableName)) with open( os.path.join(rootDir, "%s/CTCF/tad_biosamples.tsv" % self.assembly), "r") as f: cols = ["acc", "name"] self.curs.copy_from(f, tableName, '\t', columns=cols) printt("imported", self.curs.rowcount) makeIndex(self.curs, tableName, ["acc", "name"])
def makeMV(curs, assembly): mv = assembly + "_gene_info_mv" printt("making", mv) curs.execute(""" DROP MATERIALIZED VIEW IF EXISTS {mv} CASCADE; CREATE MATERIALIZED VIEW {mv} AS SELECT q.id AS geneid, LOWER(d.value) AS value FROM {assembly}_gene_info as q JOIN jsonb_each_text(q.info) as d ON true UNION SELECT q.id AS geneid, LOWER(q.ensemblid) AS value FROM {assembly}_gene_info as q UNION SELECT q.id AS geneid, LOWER(q.ensemblid_ver) AS value FROM {assembly}_gene_info as q """.format(mv=mv, assembly=assembly)) makeIndex(curs, mv, ["geneid", "value"])
def doIndex(curs, assembly): tableName = assembly + "_rampage" makeIndex(curs, tableName, ["ensemblid_ver", "chrom"]) makeIndexIntRange(curs, tableName, ["start", "stop"])
def index(self): makeIndex(self.curs, self.tableName, ["leftCtId", "rightCtId", "ensembl"])
def index(self): makeIndexTextPatternOps(self.curs, self.tableName, ["name"]) makeIndexGinTrgmOps(self.curs, self.tableName, ["name"]) makeIndex(self.curs, self.tableName, ["oname"])
def index(self): makeIndex(self.curs, self.tableName, ["tadName"]) makeIndexIntRange(self.curs, self.tableName, ["start", "stop"])
def _doIndex(self): makeIndex(self.curs, self.tableName, ["cellTypeName"])
def _doIndex(self): #makeIndexTextPatternOps(self.curs, self.tableName, ["synonym"]) #makeIndexGinTrgmOps(self.curs, self.tableName, ["synonym"]) makeIndex(self.curs, self.tableName, ["oid"])
def _doIndex(self): makeIndex(self.curs, self.tableName, ["startpos", "endpos", "feature", "transcript_id", "gene_id"])
def index(self): makeIndex(self.curs, self.tableName, ["accession", "tadName"])
def run(self): outF, cols = self._jobgen(self.assembly, self.tableName, self.curs, self.runDate) self.curs.copy_from(outF, self.tableName, '\t', columns=cols) importedNumRows(self.curs) makeIndex(self.curs, self.tableName, ["label", "fileID"])
def _doIndex(self): makeIndex(self.curs, self.tableName, ["cre"])
def doIndex(self): tableName = self._tableNameMetadata() makeIndex(self.curs, tableName, ["expID", "celltype"]) makeIndexMultiCol(self.curs, tableName, ["cellCompartment", "biosample_type"])
def _doIndex(self): makeIndex(self.curs, self.tableName, ["accession"])
def _doIndexData(self, tableNameData): printt("creating indices in", tableNameData, "...") makeIndex(self.curs, tableNameData, ["gene_name", "tpm"])