def runLiftover(self):
    """Download every released ENCODE Hi-C TAD file, run it through the
    liftover parser, then dump the accumulated tables to TSV files.

    Side effects: downloads files from the DCC, populates
    self.lengths_orig / self.lengths_filtered / self.oldVsNew via
    self.parseOutFile, and writes three TSVs under
    /home/mjp/tadsLiftOverHg19ToHg38/.
    """
    mc = MemCacheWrapper()
    qd = QueryDCC(cache=mc)
    url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"
    for exp in qd.getExps(url):
        for f in exp.getTADs():
            f.download()
            self.parseOutFile(f.fileID, f.fnp())

    outDir = "/home/mjp/tadsLiftOverHg19ToHg38/"
    # both length tables share the same row format, so share one writer
    # (was three copy-pasted write loops)
    self._writeTsvRows(outDir + "lengths_orig.tsv", self.lengths_orig)
    self._writeTsvRows(outDir + "lengths_filtered.tsv", self.lengths_filtered)

    fnp = outDir + "oldVsNew.tsv"
    with open(fnp, 'w') as f:
        for r in self.oldVsNew:
            # original behavior preserved: r[0] is written verbatim and only
            # r[1] gets the terminating newline
            f.write(r[0])
            f.write(r[1] + '\n')
    print("wrote", fnp)

def _writeTsvRows(self, fnp, rows):
    """Write rows (iterables of arbitrary values) to fnp as
    tab-separated lines, stringifying each cell."""
    with open(fnp, 'w') as f:
        for r in rows:
            f.write('\t'.join([str(x) for x in r]) + '\n')
    print("wrote", fnp)
def metadata(curs, assembly):
    """Rebuild the <assembly>_rampage_info Postgres table from the RAMPAGE
    TSS matrix header plus per-file metadata fetched from the ENCODE DCC.

    curs: an open psycopg2-style cursor (supports execute / copy_from).
    assembly: genome assembly name, e.g. "hg19"; used for paths and the
    table name.

    Raises Exception when a file's output_type does not start with
    "plus" or "minus".
    NOTE(review): the matrix filename is hard-coded to the hg19 file even
    though assembly is a parameter — confirm that is intentional.
    """
    fnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", fnp)
    with gzip.open(fnp) as f:
        # header row: first column is the gene label, remaining columns
        # are ENCODE file accessions
        header = f.readline().rstrip('\n').split('\t')
    fileIDs = header[1:]

    tableName = assembly + "_rampage_info"
    printt("dropping and creating", tableName)
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn}
    (id serial PRIMARY KEY,
    expID text,
    fileID text,
    biosample_term_name text,
    biosample_type text,
    biosample_summary text,
    tissue text,
    strand VARCHAR(1)
    ) """.format(tn=tableName))

    # buffer rows in memory, then bulk-load with COPY at the end
    outF = StringIO.StringIO()

    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    for fileID in fileIDs:
        exp = qd.getExpFromFileID(fileID)
        expID = exp.encodeID
        tissue = DetermineTissue.TranslateTissue(assembly, exp).strip()
        for f in exp.files:
            if f.fileID == fileID:
                print(f)
                print(f.output_type)
                # output_type is assumed to look like "plus strand ..." /
                # "minus strand ..." — anything else is a hard error
                strand = f.output_type.split()[0]
                if "plus" == strand:
                    strand = '+'
                elif "minus" == strand:
                    strand = '-'
                else:
                    raise Exception("unknown strand " + f.output_type)
                outF.write('\t'.join([expID,
                                      fileID,
                                      exp.biosample_term_name,
                                      exp.biosample_type,
                                      exp.getExpJson()["biosample_summary"],
                                      tissue,
                                      strand]) + '\n')
    outF.seek(0)

    cols = ["expID", "fileID", "biosample_term_name", "biosample_type",
            "biosample_summary", "tissue", "strand"]
    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("\tok", curs.rowcount)
def _doImport(self):
    """For every cell type in the nine-state genome browser table, collect
    the bigBed/bigWig tracks of all experiments sharing that cell type's
    biosample_term_name, and insert one JSON row per cell type into
    self.tableName.

    Reads: self.assembly, self.pgSearch, self.curs, self.tableName.
    Side effects: queries the ENCODE DCC (cached via memcache when
    configured) and INSERTs into the database.
    """
    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    m = MetadataWS.byAssembly(self.assembly)
    allExps = m.all_bigBeds_bigWigs(self.assembly)
    printt("found", len(allExps))

    ret = {}
    ns = self.pgSearch.loadNineStateGenomeBrowser()
    total = len(ns)
    counter = 1
    for ctn, v in ns.iteritems():
        printt(counter, 'of', total, ctn)
        counter += 1

        # gather every biosample_term_name behind this cell type's four
        # nine-state assay files; "NA" marks a missing assay
        btns = set()
        for fileID in [v["dnase"], v["h3k4me3"], v["h3k27ac"], v["ctcf"]]:
            if 'NA' == fileID:
                continue
            exp = qd.getExpFromFileID(fileID)
            btns.add(exp.biosample_term_name)

        exps = filter(lambda e: e.biosample_term_name in btns, allExps)

        ret[ctn] = []
        for e in exps:
            q = {"expID": e.encodeID,
                 "assay_term_name": e.assay_term_name,
                 "target": e.target,
                 "tf": e.tf,
                 "bigWigs": [{"fileID": f.fileID,
                              "techRep": f.technical_replicates}
                             for f in e.files if f.isBigWig()],
                 "beds": [{"fileID": f.fileID,
                           "techRep": f.technical_replicates}
                          for f in e.files if f.isBigBed()]}
            ret[ctn].append(q)
        # stable display order: assay, then target, then TF
        ret[ctn] = sorted(ret[ctn], key=lambda q: (q["assay_term_name"],
                                                   q["target"],
                                                   q["tf"]))
        self.curs.execute("""
INSERT INTO {tableName} (cellTypeName, tracks)
VALUES (%s, %s)""".format(tableName=self.tableName),
                          (ctn, json.dumps(ret[ctn])))
def runSubmit(self):
    """Authenticate against the ENCODE DCC, then download each TAD file
    of every released Hi-C experiment and submit it via self.submitFile."""
    authenticateEncodeTxt(self.args)
    cache = MemCacheWrapper()
    dcc = QueryDCC(cache=cache)
    url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"
    for experiment in dcc.getExps(url):
        for tadFile in experiment.getTADs():
            tadFile.download()
            self.submitFile(experiment, tadFile)
def get9stateInfo(assembly, r):
    """Return row r (a list of strings) extended with the assembly and the
    first tissue that can be resolved from the row's ENCODE file accessions.

    Columns r[2]..r[5] hold candidate file accessions; only values starting
    with "EN" are real accessions (placeholders such as "NA" are skipped).
    """
    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    fileIDs = filter(lambda x: x.startswith("EN"), [r[2], r[3], r[4], r[5]])
    # robustness fix: tissue was unbound when fileIDs was empty
    tissue = ""
    for fileID in fileIDs:
        # bug fix: previously looked up fileIDs[0] on every iteration, so
        # accessions after the first were never consulted
        exp = qd.getExpFromFileID(fileID)
        tissue = DetermineTissue.TranslateTissue(assembly, exp)
        if tissue:
            break
    return '\t'.join(r + [assembly, tissue])
def ontologyToCellTypes(line):
    """Parse one tab-separated lookup line into [cellType, biosample_term_id]
    pairs.

    Layout: column 0 is the cell type; columns 2+ are ENCODE file accessions
    (empty cells and "NA" are skipped). Each accession's experiment is fetched
    from the DCC and every biosample_term_id it reports yields one pair.
    """
    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    toks = line.strip().split('\t')
    ct = toks[0]
    ret = []
    for fileID in toks[2:]:
        fileID = fileID.strip()
        if not fileID or 'NA' == fileID:
            continue
        exp = qd.getExpFromFileID(fileID)
        bsi = exp.jsondata.get("biosample_term_id", [])
        if not bsi:
            # bug fix: was printt(expID, ...) — expID is undefined here and
            # raised NameError whenever the term id was missing
            printt(fileID, "missing biosample_term_id")
        if not isinstance(bsi, list):
            bsi = [bsi]
        for i in bsi:
            ret.append([ct, i])
    return ret
from cache_memcache import MemCacheWrapper from querydcc import QueryDCC from metadataws import MetadataWS from files_and_paths import Datasets, Dirs from exp import Exp from get_tss import Genes AddPath(__file__, '../common/') from constants import paths, chroms from common import printr, printt from config import Config mc = None if Config.memcache: mc = MemCacheWrapper(Config.memcache) qd = QueryDCC(cache=mc) class ExtractRNAseq: def __init__(self, assembly): self.assembly = assembly geneIdIdxs = {"hg19": 0, "mm10": 0} self.gene_id_idx = geneIdIdxs[self.assembly] def run(self): self.ensemblIDtoGeneName() today = arrow.now().format('YYYY-MM-DD') fnp = paths.path(self.assembly, "geneExp", today + ".tsv.gz") Utils.ensureDir(fnp) with gzip.open(fnp, 'wb') as f:
def _import(self):
    """Populate self.tableName with (celltype, dcc_accession, typ) rows by
    matching ENCODE v4 Annotation bed files' aliases against the assembly's
    look-up matrix of cell types.

    Reads: self.assembly, self.curs, self.tableName.
    Side effects: queries the ENCODE DCC, drops/recreates the table, and
    bulk-loads it via COPY.
    Raises Exception when an alias names a cell type absent from the
    look-up matrix.
    """
    url = "https://www.encodeproject.org/search/?type=Annotation&encyclopedia_version=4"
    url += "&files.file_type=bed+bed3%2B&assembly=" + self.assembly
    url += "&format=json&limit=all"

    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
    printt("parsing", fnp)
    # map sanitized biosample id (non-alphanumerics collapsed to '-') back
    # to the original cell type name from column 0
    btidToCt = {}
    with open(fnp) as f:
        for line in f:
            line = line.strip().split('\t')
            btid = re.sub('[^0-9a-zA-Z]+', '-', line[0])
            btidToCt[btid] = line[0]

    printt("looking up ENCODE accessions...")
    rows = []
    for exp in qd.getExps(url):
        for f in exp.files:
            if not f.isBed():
                continue
            # [u'zhiping-weng:cREs-hg19-v10-ganglionic...-5group-bigBed',
            #  u'zhiping-weng:cREs-hg19-v10-ganglionic-...-9state-H3K4me3-bigBed']
            aliases = f.jsondata["aliases"]
            typs = ["5group", "9state-H3K4me3", "9state-DNase",
                    "9state-H3K27ac", "9state-CTCF"]
            for a in aliases:
                # strip the fixed alias prefix/suffix so only
                # "<celltype>-<typ>" remains; NOTE(review): assumes aliases
                # end in "-bed" (not "-bigBed") — confirm against the DCC
                a = a.replace("zhiping-weng:cREs-" + self.assembly + "-v10-", '')
                a = a.replace("-bed", '')
                for t in typs:
                    if t in a:
                        ct = a.replace(t, '')[:-1]  # remove trailing hyphen
                        if not ct:
                            # no cell type in the alias => agnostic track
                            rows.append(["_agnostic", str(f.fileID), t])
                            continue
                        if ct not in btidToCt:
                            raise Exception("missing " + ct)
                        rows.append([btidToCt[ct], str(f.fileID), t])

    printt('***********', "drop and create", self.tableName)
    self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
celltype text,
dcc_accession text,
typ text
);""".format(tableName=self.tableName))

    printt('***********', "import ENCODE files")
    printt("rewrite rows")
    # buffer everything, then one COPY for speed
    outF = StringIO.StringIO()
    for r in rows:
        outF.write('\t'.join(r) + '\n')
    outF.seek(0)
    cols = ["celltype", "dcc_accession", "typ"]
    self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
    print("copied in", self.curs.rowcount)