コード例 #1
0
    def runLiftover(self):
        """Download TAD files for every released Hi-C experiment, parse
        them, then dump the accumulated tables to TSV files.

        Side effects: downloads files via the DCC query helper, calls
        self.parseOutFile per TAD file, and writes three TSVs under
        /home/mjp/tadsLiftOverHg19ToHg38/.
        """
        mc = MemCacheWrapper()
        qd = QueryDCC(cache=mc)
        url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"

        for exp in qd.getExps(url):
            for f in exp.getTADs():
                f.download()
                self.parseOutFile(f.fileID, f.fnp())

        # The two length tables share the same row format; write them
        # through one helper instead of duplicating the loop.
        self._writeTsv("/home/mjp/tadsLiftOverHg19ToHg38/lengths_orig.tsv",
                       self.lengths_orig)
        self._writeTsv("/home/mjp/tadsLiftOverHg19ToHg38/lengths_filtered.tsv",
                       self.lengths_filtered)

        # oldVsNew rows are (prefix, rest) pairs emitted as a single line
        # each, so they do not fit the tab-join helper above.
        fnp = "/home/mjp/tadsLiftOverHg19ToHg38/oldVsNew.tsv"
        with open(fnp, 'w') as f:
            for r in self.oldVsNew:
                f.write(r[0])
                f.write(r[1] + '\n')
        print("wrote", fnp)

    def _writeTsv(self, fnp, rows):
        # Write each row as one tab-separated line; values are stringified.
        with open(fnp, 'w') as f:
            for r in rows:
                f.write('\t'.join([str(x) for x in r]) + '\n')
        print("wrote", fnp)
コード例 #2
0
ファイル: 21_rampage.py プロジェクト: wanliu2019/SCREEN
def metadata(curs, assembly):
    """Load per-file RAMPAGE metadata into <assembly>_rampage_info.

    File accessions come from the header row of the TSS expression matrix;
    each is resolved to its experiment at the DCC, and one row of
    biosample / tissue / strand metadata per file is bulk-copied into
    Postgres via copy_from.
    """
    matrixFnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")

    printt("reading", matrixFnp)
    with gzip.open(matrixFnp) as inF:
        headerToks = inF.readline().rstrip('\n').split('\t')

    # Column 0 is the row label; every remaining column is a file accession.
    fileIDs = headerToks[1:]

    tableName = assembly + "_rampage_info"
    printt("dropping and creating", tableName)

    curs.execute("""
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}
(id serial PRIMARY KEY,
expID text,
fileID text,
biosample_term_name text,
biosample_type text,
biosample_summary text,
tissue text,
strand VARCHAR(1)
) """.format(tn=tableName))

    # DCC client, optionally backed by memcache when one is configured.
    mc = MemCacheWrapper(Config.memcache) if Config.memcache else None
    qd = QueryDCC(auth=False, cache=mc)

    outF = StringIO.StringIO()
    for fileID in fileIDs:
        exp = qd.getExpFromFileID(fileID)
        tissue = DetermineTissue.TranslateTissue(assembly, exp).strip()
        for expFile in exp.files:
            if expFile.fileID != fileID:
                continue
            print(expFile)
            print(expFile.output_type)
            # output_type begins with "plus" or "minus"; anything else is fatal.
            strand = expFile.output_type.split()[0]
            if strand == "plus":
                strand = '+'
            elif strand == "minus":
                strand = '-'
            else:
                raise Exception("unknown strand " + expFile.output_type)
            row = [
                exp.encodeID, fileID, exp.biosample_term_name,
                exp.biosample_type, exp.getExpJson()["biosample_summary"],
                tissue, strand
            ]
            outF.write('\t'.join(row) + '\n')
    outF.seek(0)

    cols = [
        "expID", "fileID", "biosample_term_name", "biosample_type",
        "biosample_summary", "tissue", "strand"
    ]
    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("\tok", curs.rowcount)
コード例 #3
0
ファイル: 25_more_tracks.py プロジェクト: wanliu2019/SCREEN
    def _doImport(self):
        mc = None
        if Config.memcache:
            mc = MemCacheWrapper(Config.memcache)
        qd = QueryDCC(auth=False, cache=mc)

        m = MetadataWS.byAssembly(self.assembly)
        allExps = m.all_bigBeds_bigWigs(self.assembly)
        printt("found", len(allExps))

        ret = {}
        ns = self.pgSearch.loadNineStateGenomeBrowser()
        total = len(ns)
        counter = 1
        for ctn, v in ns.iteritems():
            printt(counter, 'of', total, ctn)
            counter += 1
            btns = set()
            for fileID in [v["dnase"], v["h3k4me3"], v["h3k27ac"], v["ctcf"]]:
                if 'NA' == fileID:
                    continue
                exp = qd.getExpFromFileID(fileID)
                btns.add(exp.biosample_term_name)

            exps = filter(lambda e: e.biosample_term_name in btns, allExps)
            ret[ctn] = []
            for e in exps:
                q = {
                    "expID":
                    e.encodeID,
                    "assay_term_name":
                    e.assay_term_name,
                    "target":
                    e.target,
                    "tf":
                    e.tf,
                    "bigWigs": [{
                        "fileID": f.fileID,
                        "techRep": f.technical_replicates
                    } for f in e.files if f.isBigWig()],
                    "beds": [{
                        "fileID": f.fileID,
                        "techRep": f.technical_replicates
                    } for f in e.files if f.isBigBed()]
                }
                ret[ctn].append(q)

            ret[ctn] = sorted(ret[ctn],
                              key=lambda q:
                              (q["assay_term_name"], q["target"], q["tf"]))
            self.curs.execute(
                """
            INSERT INTO {tableName} (cellTypeName, tracks)
VALUES (%s, %s)""".format(tableName=self.tableName),
                (ctn, json.dumps(ret[ctn])))
コード例 #4
0
    def runSubmit(self):
        """Authenticate against ENCODE, then download and submit every TAD
        file from all released Hi-C experiments."""
        authenticateEncodeTxt(self.args)

        cache = MemCacheWrapper()
        qd = QueryDCC(cache=cache)
        url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"

        for exp in qd.getExps(url):
            for tadFile in exp.getTADs():
                tadFile.download()
                self.submitFile(exp, tadFile)
コード例 #5
0
def get9stateInfo(assembly, r):
    """Return row *r* extended with the assembly and a tissue label.

    Columns r[2]..r[5] may hold ENCODE accessions ("EN..."); each is tried
    in order and the first that translates to a non-empty tissue wins.
    """
    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    fileIDs = filter(lambda x: x.startswith("EN"),
                     [r[2], r[3], r[4], r[5]])
    # Fall back to empty string; the original left `tissue` undefined
    # (NameError) when no accession matched.
    tissue = ''
    for fileID in fileIDs:
        # BUG FIX: the original called getExpFromFileID(fileIDs[0]) inside
        # the loop, so accessions after the first were never consulted.
        exp = qd.getExpFromFileID(fileID)
        tissue = DetermineTissue.TranslateTissue(assembly, exp)
        if tissue:
            break
    return '\t'.join(r + [assembly, tissue])
コード例 #6
0
def ontologyToCellTypes(line):
    """Map one look-up-matrix line to [cellType, biosample_term_id] pairs.

    *line* is tab-separated: column 0 is the cell type, columns 2+ hold
    ENCODE file accessions (blank or 'NA' entries are skipped).  Each
    accession is resolved at the DCC and every biosample_term_id found is
    paired with the cell type.
    """
    mc = None
    if Config.memcache:
        mc = MemCacheWrapper(Config.memcache)
    qd = QueryDCC(auth=False, cache=mc)

    toks = line.strip().split('\t')
    ct = toks[0]

    ret = []

    for fileID in toks[2:]:
        fileID = fileID.strip()
        if not fileID or 'NA' == fileID:
            continue
        exp = qd.getExpFromFileID(fileID)
        bsi = exp.jsondata.get("biosample_term_id", [])
        if not bsi:
            # BUG FIX: was printt(expID, ...) -- expID is undefined in this
            # scope and raised a NameError whenever the term id was missing.
            printt(fileID, "missing biosample_term_id")
        if not isinstance(bsi, list):
            bsi = [bsi]
        for i in bsi:
            ret.append([ct, i])
    return ret
コード例 #7
0
ファイル: 08_rna_seq.py プロジェクト: wanliu2019/SCREEN
from cache_memcache import MemCacheWrapper
from querydcc import QueryDCC
from metadataws import MetadataWS
from files_and_paths import Datasets, Dirs
from exp import Exp
from get_tss import Genes

AddPath(__file__, '../common/')
from constants import paths, chroms
from common import printr, printt
from config import Config

# Module-level DCC query client shared by the code below; the memcache
# wrapper is attached only when a memcache server is configured.
mc = None
if Config.memcache:
    mc = MemCacheWrapper(Config.memcache)
qd = QueryDCC(cache=mc)


class ExtractRNAseq:
    def __init__(self, assembly):
        """Remember the assembly and resolve which column of the gene
        expression matrix holds the gene ID (column 0 for both supported
        builds; any other assembly raises KeyError)."""
        self.assembly = assembly
        self.gene_id_idx = {"hg19": 0, "mm10": 0}[assembly]

    def run(self):
        self.ensemblIDtoGeneName()

        today = arrow.now().format('YYYY-MM-DD')
        fnp = paths.path(self.assembly, "geneExp", today + ".tsv.gz")
        Utils.ensureDir(fnp)
        with gzip.open(fnp, 'wb') as f:
コード例 #8
0
ファイル: 38_dcc_cres_beds.py プロジェクト: wanliu2019/SCREEN
    def _import(self):
        """Catalog the v4 encyclopedia cRE BED files hosted at the DCC.

        Finds every released bed3+ Annotation file for this assembly, maps
        each file's alias back to a cell type via the Look-Up-Matrix file,
        then drops/recreates self.tableName and bulk-loads one
        (celltype, dcc_accession, typ) row per matched alias.
        """
        url = "https://www.encodeproject.org/search/?type=Annotation&encyclopedia_version=4"
        url += "&files.file_type=bed+bed3%2B&assembly=" + self.assembly
        url += "&format=json&limit=all"

        # DCC client, optionally backed by memcache when configured.
        mc = None
        if Config.memcache:
            mc = MemCacheWrapper(Config.memcache)
        qd = QueryDCC(auth=False, cache=mc)

        # Build a map from the sanitized biosample id (non-alphanumeric runs
        # collapsed to '-') back to the original cell-type string, since the
        # file aliases below use the sanitized form.
        fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
        printt("parsing", fnp)
        btidToCt = {}
        with open(fnp) as f:
            for line in f:
                line = line.strip().split('\t')
                btid = re.sub('[^0-9a-zA-Z]+', '-', line[0])
                btidToCt[btid] = line[0]
        printt("looking up ENCODE accessions...")
        rows = []
        for exp in qd.getExps(url):
            for f in exp.files:
                if not f.isBed():
                    continue
                # Alias examples:
                # [u'zhiping-weng:cREs-hg19-v10-ganglionic...-5group-bigBed',
                #  u'zhiping-weng:cREs-hg19-v10-ganglionic-...-9state-H3K4me3-bigBed']
                aliases = f.jsondata["aliases"]
                typs = [
                    "5group", "9state-H3K4me3", "9state-DNase",
                    "9state-H3K27ac", "9state-CTCF"
                ]
                for a in aliases:
                    # Strip the fixed prefix/suffix so only
                    # "<celltype>-<typ>" (or just "<typ>") remains.
                    a = a.replace(
                        "zhiping-weng:cREs-" + self.assembly + "-v10-", '')
                    a = a.replace("-bed", '')
                    for t in typs:
                        if t in a:
                            ct = a.replace(t,
                                           '')[:-1]  # remove trailing hyphen
                            # An empty remainder means a cell-type-agnostic
                            # (aggregate) track.
                            if not ct:
                                rows.append(["_agnostic", str(f.fileID), t])
                                continue
                            if ct not in btidToCt:
                                raise Exception("missing " + ct)
                            rows.append([btidToCt[ct], str(f.fileID), t])

        printt('***********', "drop and create", self.tableName)
        self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
celltype text,
dcc_accession text,
typ text
);""".format(tableName=self.tableName))

        printt('***********', "import ENCODE files")
        printt("rewrite rows")
        # Stream the rows through an in-memory buffer so Postgres COPY can
        # ingest them in one round trip.
        outF = StringIO.StringIO()
        for r in rows:
            outF.write('\t'.join(r) + '\n')
        outF.seek(0)

        cols = ["celltype", "dcc_accession", "typ"]
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        print("copied in", self.curs.rowcount)