def __init__(self, args, assembly, ver, nbins, j):
    self.args = args
    self.assembly = assembly
    self.ver = ver
    self.nbins = nbins
    self.j = j
    self.raw = paths.path(assembly, "raw")
    self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
    Utils.mkdir_p(self.minipeaks)

    # probe the known install locations for the bwtool binary
    self.bwtool = "/data/cherrypy/bin/bwtool"
    if not os.path.exists(self.bwtool):
        self.bwtool = "/usr/local/bin/bwtool"
    if not os.path.exists(self.bwtool):
        self.bwtool = "/data/common/tools/bwtool"
    if not os.path.exists(self.bwtool):
        raise Exception("no bwtool found")

    self.bwtoolFilter = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'minipeaks/bin/read_json'))
    if not os.path.exists(self.bwtoolFilter):
        raise Exception("missing C++ bwtool filter; please compile?")

    self.masterPeakFnp = os.path.join(self.raw, "cREs.bed")
    if "GRCh38" == assembly:
        self.masterPeakFnp = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
    self.numPeaks = numLines(self.masterPeakFnp)
    printt(self.masterPeakFnp, "has", self.numPeaks)
    self.miniPeaksBedFnp = os.path.join(self.minipeaks, "miniPeakSites.bed.gz")
    self.debug = False

def setupAll(self, sample):
    self.setupDb()
    cts, fnps = self.loadFileLists()
    ctsToId = self.setupCellTypes(cts)

    # baseMean log2FoldChange lfcSE stat pvalue padj
    cols = ["leftCtId", "rightCtId", "ensembl", "log2FoldChange", "padj"]

    counter = 0
    for fnp, ct1, ct2 in fnps:
        counter += 1
        if sample:
            # if "_0" not in ct1 and "_0" not in ct2:
            if "limb_15" not in ct1 or "limb_11" not in ct2:
                continue
        printt(counter, len(fnps), fnp)
        data, skipped = self.readFile(fnp)
        outF = StringIO.StringIO()
        for d in data:
            outF.write('\t'.join([str(ctsToId[ct1]), str(ctsToId[ct2])] + d) + '\n')
        outF.seek(0)
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        printt("copied in", self.curs.rowcount, "skipped", skipped)

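# A staged line for the copy above, matching the column list
# (cell-type IDs, Ensembl ID, and values hypothetical):
#   12<TAB>37<TAB>ENSMUSG00000000001.4<TAB>1.73<TAB>0.0004
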
def run(args, DBCONN):
    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]

    def doRun(args, assembly, curs, tsuffix, jobgen, runDate):
        if args.metadata:
            PI.ImportPeakIntersectionMetadata(curs, assembly, tsuffix, jobgen, runDate).run()
        elif args.index:
            PI.ImportPeakIntersections(curs, assembly, tsuffix, runDate).index()
        else:
            m = PI.ImportPeakIntersectionMetadata(curs, assembly, tsuffix, jobgen, runDate)
            runDate = m.run()
            ipi = PI.ImportPeakIntersections(curs, assembly, tsuffix, runDate)
            ipi.run()
            ipi.index()

    runDate = Config.cistromePeakIntersectionRunDate
    for assembly in assemblies:
        printt('***********', assembly)
        with getcursor(DBCONN, "main") as curs:
            if assembly in ["hg38", "mm10"]:
                doRun(args, assembly, curs, "cistromeIntersections",
                      cistrome_peak_metadata, runDate)

def processGwasBed(self, origBedFnp, bedFnp):
    printt("reading", origBedFnp)
    with open(origBedFnp) as f:
        rows = [r.rstrip().split('\t') for r in f if r]
    printt("split rows")

    # rows whose tagged-SNP column holds a comma-separated list are
    # expanded into one row per (tagged SNP, r2) pair
    split = []
    for r in rows:
        if ',' not in r[4]:
            split.append(r)
            continue
        taggedSNPs = r[4].split(',')
        r2s = r[5].split(',')
        for snp, r2 in zip(taggedSNPs, r2s):
            a = list(r)
            a[4] = snp
            a[5] = r2
            split.append(a)
    printt("split rows", len(rows), "to", len(split))

    printt("adding authorPubmedTrait")
    for r in split:
        authorPubmedTrait = r[-1]
        r[-1] = authorPubmedTrait.replace('-', '_')
    print("***********", split[0][-1])

    printt("writing", bedFnp)
    with open(bedFnp, 'w') as f:
        for r in split:
            f.write('\t'.join(r) + '\n')
    Utils.sortFile(bedFnp)
    printWroteNumLines(bedFnp)

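# Hypothetical illustration of the expansion above: a row whose fifth and
# sixth columns hold comma-separated tagged SNPs and r2 values, e.g.
#   chr1  100  101  rs123  rs111,rs222  0.9,0.8  Smith-12345678-Height
# is emitted as two rows (with the trait's dashes also rewritten):
#   chr1  100  101  rs123  rs111  0.9  Smith_12345678_Height
#   chr1  100  101  rs123  rs222  0.8  Smith_12345678_Height
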
def run(args, DBCONN):
    assemblies = Config.assemblies
    for assembly in assemblies:
        printt('***********', assembly)
        with getcursor(DBCONN, "07_setup_cart") as curs:
            setupCart(curs, assembly)

def _build(self, assay_term_name, atn, exps):
    printt("building", assay_term_name, "...")

    def sorter(exp):
        return exp.label
    exps.sort(key=sorter)

    self.labelNToNormal[atn] = assay_term_name
    for label, exps in groupby(exps, sorter):
        exps = list(exps)
        labelN = Helpers.sanitize(label)
        self.labelNToNormal[labelN] = label
        fnp = os.path.join(BaseWwwTmpDir, self.assembly, "subtracks",
                           atn, labelN + '.txt')
        self.byAssayBiosampleType[atn][labelN] = {
            "assay_term_name": assay_term_name,
            "atn": atn,
            "label": label,
            "labelN": labelN,
            "fnp": fnp,
            "exps": exps,
            "assembly": self.assembly
        }

    printt("making tracks and subtracks...")
    self._makeSubTracks()

def ensemblIDtoGeneName(self):
    fnp, filetype = paths.gene_files[self.assembly]
    printt("loading", fnp)
    ggff = Genes(fnp, filetype)
    self.ensemToGene = {}
    for g in ggff.getGenes():
        self.ensemToGene[g.geneid_] = g.genename_

def _getRowsFromFiles(self):
    counter = 0
    for exp, expF in self._getFiles():
        counter += 1
        printt(counter, exp.encodeID, expF.fileID,
               expF.biological_replicates, expF.output_type)
        try:
            with open(expF.fnp()) as f:
                lines = [x.strip().split('\t') for x in f]
            header = lines[0]
            gene_id_idx = self.gene_id_idx
            TPM_idx = 5
            FPKM_idx = 6
            assert "gene_id" == header[gene_id_idx]
            assert "TPM" == header[TPM_idx]
            assert "FPKM" == header[FPKM_idx]
            for row in lines[1:]:
                # if "0.00" == row[TPM_idx] and "0.00" == row[FPKM_idx]:
                #     continue
                geneID = row[gene_id_idx]
                yield (expF.expID, expF.fileID, geneID,
                       self.ensemToGene.get(geneID, geneID),
                       '_'.join([str(x) for x in expF.biological_replicates]),
                       row[TPM_idx], row[FPKM_idx])
        except:
            eprint("error reading:", expF.fnp())
            raise

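# The asserts above imply an RSEM-style gene quantification layout; a
# hypothetical input file would look like:
#   gene_id  transcript_id(s)  length  effective_length  expected_count  TPM  FPKM
#   ENSG00000000003.14  ENST00000373020.8  2206.0  2056.5  3251.0  7.76  5.12
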
def setupDB(self):
    tableName = self._tableNameMetadata()
    printt("dropping and creating", tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn} (
        id serial PRIMARY KEY,
        expID text,
        fileID text,
        replicate INT NOT NULL,
        cellType text,
        organ text,
        cellCompartment text,
        target text,
        lab text,
        assay_term_name text,
        biosample_type text,
        biosample_term_name text,
        biosample_summary text,
        ageTitle text,
        assay_title text,
        signal_files jsonb
    )""".format(tn=tableName))

def __init__(self, assembly, assay, toks):
    self.assembly = assembly
    self.assay = assay
    self.expID = toks[0]
    self.fileID = toks[1]
    self.cellTypeName = toks[2]

    exp = Exp.fromJsonFile(self.expID)
    self.tissue = DetermineTissue.TranslateTissue(assembly, exp)
    self.biosample_term_name = exp.biosample_term_name
    self.biosample_summary = exp.jsondata.get("biosample_summary",
                                              self.biosample_term_name)
    self.biosample_type = exp.jsondata["biosample_type"]
    if isinstance(self.biosample_type, list):
        if len(self.biosample_type) > 1:
            printt("multiple items for biosample_type:", self.biosample_type)
        self.biosample_type = self.biosample_type[0]

    if "mm10" == self.assembly:
        # normalize mouse biosample summaries to short age labels
        bs = self.biosample_summary
        bs = bs.replace("C57BL/6 ", "")
        matches = re.findall(r"\ (\((.*) days\))", bs)
        # printt(bs, matches, len(matches))
        if matches and 1 == len(matches):
            bs = bs.replace(matches[0][0], "e" + matches[0][1])
            bs = bs.replace("postnatal e", "p").replace("embryo e", "e")
            # printt("new bs", bs)
        self.biosample_summary = bs

    out = self.output().encode('ascii', 'ignore').decode('ascii')

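# Worked examples of the mm10 summary rewrite above (input strings hypothetical):
#   "C57BL/6 embryo (11.5 days)"  -> "embryo (11.5 days)" -> "embryo e11.5" -> "e11.5"
#   "C57BL/6 postnatal (0 days)"  -> "postnatal (0 days)" -> "postnatal e0" -> "p0"
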
def run(inFnp, outFnp):
    printt("making hammock from", inFnp)
    with open(inFnp) as inF:
        with open(outFnp, 'w') as outF:
            for idx, line in enumerate(inF):
                toks = line.rstrip().split('\t')
                attrs = "id:" + str(idx) + ',name:"' + toks[3] + '"'
                if 9 == len(toks):
                    attrs += ",struct:{{thick:[[{s},{e}],],}}".format(
                        s=toks[1], e=toks[2])
                out = toks[:3] + [attrs]
                outF.write("\t".join(out) + '\n')

    printt("sorting")
    Utils.sortFile(outFnp)
    printWroteNumLines(outFnp)

    printt("bgzip")
    cmds = ["bgzip", '-f', outFnp]
    Utils.runCmds(cmds)

    printt("tabix")
    cmds = ["tabix", '-f', outFnp + '.gz']
    Utils.runCmds(cmds)

    printt("wrote", inFnp, outFnp)

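# Example conversion (hypothetical nine-column BED input, first line, so idx 0):
#   chr1  100  200  EH37E0001  ...  (remaining columns elided)
# becomes the hammock line:
#   chr1  100  200  id:0,name:"EH37E0001",struct:{thick:[[100,200],],}
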
def _import(self):
    fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
    printt("parsing", fnp)
    with open(fnp) as f:
        rows = [x.strip() for x in f]
    outRows = Parallel(n_jobs=self.args.j)(
        delayed(ontologyToCellTypes)(line) for line in rows)

    printt('***********', "drop and create", self.tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tableName};
    CREATE TABLE {tableName} (
        id serial PRIMARY KEY,
        cellTypeName text,
        biosample_term_id text,
        synonyms jsonb
    );""".format(tableName=self.tableName))

    printt('***********', "import lookup")
    printt("rewrite rows")
    outF = StringIO.StringIO()
    for row in outRows:
        for r in row:
            outF.write('\t'.join(r) + '\n')
    outF.seek(0)
    cols = ["cellTypeName", "biosample_term_id"]
    self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
    printt("copied in", self.curs.rowcount)

def testHub():
    printt("checking hub...")
    cmds = [
        "/data/common/tools/ucsc.v350/hubCheck",
        "-noTracks",
        os.path.join(BaseWwwDir, 'hub.txt')
    ]
    printt(Utils.runCmds(cmds))

def doImport(curs, assembly):
    fnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", fnp)
    with gzip.open(fnp) as f:
        header = f.readline().rstrip('\n').split('\t')
        rows = [line.rstrip('\n').split('\t') for line in f]
    printt("read header and", len(rows), "rows")

    fnp = paths.path(assembly, "hg19-tss-filtered.bed.gz")
    with gzip.open(fnp) as f:
        tsses = [line.rstrip('\n').split('\t') for line in f]
    lookup = {r[3]: r for r in tsses}

    printt("rewriting")
    outF = StringIO.StringIO()
    for row in rows:
        r = [row[0]]
        t = lookup[row[0]]
        r.append(t[6])  # gene
        r.append(t[0])  # chrom
        r.append(t[1])  # start
        r.append(t[2])  # stop
        r.append(t[5])  # strand
        r.append(t[7].replace("_", " "))  # gene info
        r += row[1:]
        outF.write('\t'.join(r) + '\n')
    outF.seek(0)

    fileIDs = header[1:]
    cols = ["transcript", "ensemblid_ver", "chrom", "start", "stop",
            "strand", "geneInfo"] + fileIDs

    tableName = assembly + "_rampage"
    printt("copy into", tableName)
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn} (
        id serial PRIMARY KEY,
        transcript text,
        ensemblid_ver text,
        chrom text,
        start integer,
        stop integer,
        strand VARCHAR(1),
        geneInfo text,
        maxVal real,
        {fields}
    );""".format(tn=tableName,
                 fields=','.join([f + " real" for f in fileIDs])))
    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("inserted", curs.rowcount)

    curs.execute("""
    UPDATE {tn} SET maxVal = GREATEST( {fields} )
    """.format(tn=tableName, fields=','.join(fileIDs)))

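# The final UPDATE collapses the per-file signal columns into a single
# sortable maximum; with two hypothetical file-ID columns it expands to:
#   UPDATE hg19_rampage SET maxVal = GREATEST( ENCFF001AAA, ENCFF001AAB )
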
def makeIndexMultiCol(curs, tableName, cols):
    name = '_'.join(cols)
    idx = _idx(tableName, name)
    printt("indexing", idx)
    curs.execute("""
    DROP INDEX IF EXISTS {idx};
    CREATE INDEX {idx} on {tableName} ({col});
    """.format(idx=idx, tableName=tableName, col=','.join(cols)))

def makeIndex(curs, tableName, cols):
    for col in cols:
        idx = _idx(tableName, col)
        printt("indexing", idx)
        curs.execute("""
        DROP INDEX IF EXISTS {idx};
        CREATE INDEX {idx} on {tableName} ({col});
        """.format(idx=idx, tableName=tableName, col=col))

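# Minimal usage sketch for the two index helpers above (table and column
# names hypothetical):
#   makeIndex(curs, "hg19_cre_all", ["accession", "chrom"])      # one index per column
#   makeIndexMultiCol(curs, "hg19_cre_all", ["chrom", "start"])  # one composite index
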
def _getFileIDs(self, fn):
    assay = fn.split('-')[0]
    printt("***********************", self.assembly, assay)
    fnp = paths.path(self.assembly, "raw", fn)
    with open(fnp) as f:
        rows = [x.rstrip('\n').split('\t') for x in f]
    fileIDs = sorted([r[1] for r in rows])
    return assay, fileIDs

def makeIndexRange(curs, tableName, cols):
    for col in cols:
        idx = _idx(tableName, col)
        printt("indexing int range", idx)
        curs.execute("""
        DROP INDEX IF EXISTS {idx};
        CREATE INDEX {idx} on {tableName} USING gist(intarray2int4range({col}));
        """.format(idx=idx, tableName=tableName, col=col))

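# intarray2int4range appears to be a custom SQL function defined elsewhere
# in this schema; a GiST index over it is what allows range-overlap queries
# such as the following to use the index (table and column hypothetical):
#   SELECT * FROM hg19_cre_all
#   WHERE intarray2int4range(col) && int4range(100, 200);
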
def loadJobs(assembly, runDate):
    fnp = paths.path(assembly, "extras", runDate, "cistromeJobs.json.gz")
    printt("reading", fnp)
    with gzip.open(fnp) as f:
        jobs = json.load(f)
    printt("loaded", len(jobs))
    return jobs

def run(self):
    self.setupTable()
    fnp = paths.path(self.assembly, "raw", "vista.tsv")
    printt("reading", fnp)
    with open(fnp) as f:
        self.curs.copy_from(f, self.tableName, '\t',
                            columns=("accession", "vistaids"))
    printt("copied in %d vista entries" % self.curs.rowcount)

def makeIndexInt4Range(curs, tableName, cols):
    # http://stackoverflow.com/a/14407839
    idx = _idx(tableName, "_".join(cols))
    printt("indexing int4range", idx)
    curs.execute("""
    DROP INDEX IF EXISTS {idx};
    CREATE INDEX {idx} on {tableName} (int4range({cols}));
    """.format(idx=idx, tableName=tableName, cols=",".join(cols)))

def run(args, DBCONN):
    printt('***********', "mm10")
    with getcursor(DBCONN, "import DEs") as curs:
        ide = ImportDE(curs)
        if args.index:
            return ide.index()
        ide.setupAll(args.sample)
        ide.index()

def tag_delete():
    uid = request.args.get('uid')
    try:
        storage = state.get_storage()
        storage.remove_tag(uid)
    except Exception as e:
        printt(e)
    return redirect('/tags')

def _setupTable(self):
    printt("drop and create", self.tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn} (
        id serial PRIMARY KEY,
        accession text,
        creGroupsSpecific VARCHAR[]
    );""".format(tn=self.tableName))

def _setupTable(self):
    printt("drop and create", self.tableName)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn} (
        id serial PRIMARY KEY,
        accession text,
        concordant BOOLEAN NOT NULL DEFAULT FALSE
    );""".format(tn=self.tableName))

def run(args, DBCONN):
    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]
    for assembly in assemblies:
        printt('***********', assembly)
        ci = CreateIndices(assembly)
        ci.run()

def run(args, DBCONN):
    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]
    for assembly in assemblies:
        printt('***********', assembly)
        args.assembly = assembly
        rna1.run(args, DBCONN)
        rna2.run(args, DBCONN)

def makeIndexTextPatternOps(curs, tableName, cols):
    for col in cols:
        idx = _idx(tableName, col, "text_pattern_ops")
        printt("indexing", idx)
        curs.execute("""
        DROP INDEX IF EXISTS {idx};
        CREATE INDEX {idx} on {tableName} USING btree ({col} text_pattern_ops);
        """.format(idx=idx, tableName=tableName, col=col))

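# text_pattern_ops btree indexes support left-anchored LIKE queries even in
# non-C locales, e.g. (table and column hypothetical):
#   SELECT * FROM hg19_cre_all WHERE accession LIKE 'EH37E%';
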
def addCol(self):
    printt("adding col...")
    self.curs.execute("""
    ALTER TABLE {tn} ADD COLUMN creGroup integer;
    UPDATE {tn} SET ...
    """.format(tn=self.tableName_cre_all))

def run(args):
    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]
    for assembly in assemblies:
        printt('***********', assembly)
        rs = ExtractRNAseq(assembly)
        rs.run()