Esempio n. 1
0
    def __init__(self, args, assembly, ver, nbins, j):
        """Store the run parameters and resolve the paths and tools used later.

        Args:
            args: stored unchanged as ``self.args``.
            assembly: assembly name; used to build paths and to pick the
                master peak file.
            ver: version component of the minipeaks directory.
            nbins: bin-count component of the minipeaks directory.
            j: stored unchanged as ``self.j``.

        Raises:
            Exception: if no bwtool binary exists at any of the known
                locations, or if the compiled bwtool filter is missing.
        """
        self.args = args
        self.assembly = assembly
        # NOTE(review): the attribute is named ``var`` while the parameter is
        # ``ver``; left untouched because other code may read ``self.var``.
        self.var = ver
        self.nbins = nbins
        self.j = j

        self.raw = paths.path(assembly, "raw")
        self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
        Utils.mkdir_p(self.minipeaks)

        # Keep the first bwtool location that exists, in order of preference.
        for candidate in ("/data/cherrypy/bin/bwtool",
                          "/usr/local/bin/bwtool",
                          "/data/common/tools/bwtool"):
            self.bwtool = candidate
            if os.path.exists(candidate):
                break
        else:
            raise Exception("no bwtool found")

        # The filter binary is located relative to this source file.
        here = os.path.dirname(__file__)
        self.bwtoolFilter = os.path.abspath(
            os.path.join(here, 'minipeaks/bin/read_json'))
        if not os.path.exists(self.bwtoolFilter):
            raise Exception("missing C++ bwtool filter; please compile?")

        # GRCh38 reads its master peak list from a fixed registry path.
        if "GRCh38" == assembly:
            self.masterPeakFnp = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
        else:
            self.masterPeakFnp = os.path.join(self.raw, "cREs.bed")
        self.numPeaks = numLines(self.masterPeakFnp)
        printt(self.masterPeakFnp, "has", self.numPeaks)

        self.miniPeaksBedFnp = os.path.join(self.minipeaks, "miniPeakSites.bed.gz")

        self.debug = False
Esempio n. 2
0
    def setupAll(self, sample):
        """Set up the database, then bulk-copy every listed file into the table.

        Args:
            sample: when truthy, only load files whose first cell type
                contains "limb_15" and whose second contains "limb_11".
        """
        self.setupDb()

        cellTypes, fileInfos = self.loadFileLists()
        ctsToId = self.setupCellTypes(cellTypes)

        # Input file columns, for reference:
        # baseMean log2FoldChange lfcSE stat pvalue padj
        cols = ["leftCtId", "rightCtId", "ensembl", "log2FoldChange", "padj"]

        for counter, (fnp, ct1, ct2) in enumerate(fileInfos, start=1):
            if sample and not ("limb_15" in ct1 and "limb_11" in ct2):
                continue
            printt(counter, len(fileInfos), fnp)
            data, skipped = self.readFile(fnp)

            # Stage the rows in memory as tab-separated text for copy_from;
            # each row is prefixed with the two cell-type ids.
            buf = StringIO.StringIO()
            for fields in data:
                prefix = [str(ctsToId[ct1]), str(ctsToId[ct2])]
                buf.write('\t'.join(prefix + fields) + '\n')
            buf.seek(0)

            self.curs.copy_from(buf, self.tableName, '\t', columns=cols)
            printt("copied in", self.curs.rowcount, "skipped", skipped)
def run(args, DBCONN):
    """Import cistrome peak-intersection data for the selected assemblies.

    Args:
        args: parsed options; reads ``assembly``, ``metadata`` and ``index``.
        DBCONN: database connection handed to ``getcursor``.

    Only the "hg38" and "mm10" assemblies are processed; any other assembly
    is announced and then skipped.
    """
    def doRun(args, assembly, curs, tsuffix, jobgen, runDate):
        """Run metadata only, indexing only, or the full import."""
        if args.metadata:
            PI.ImportPeakIntersectionMetadata(curs, assembly, tsuffix, jobgen,
                                              runDate).run()
            return
        if args.index:
            PI.ImportPeakIntersections(curs, assembly, tsuffix,
                                       runDate).index()
            return
        metadata = PI.ImportPeakIntersectionMetadata(curs, assembly, tsuffix,
                                                     jobgen, runDate)
        # The value returned by the metadata run replaces the run date
        # passed to the intersections import.
        runDate = metadata.run()
        intersections = PI.ImportPeakIntersections(curs, assembly, tsuffix,
                                                   runDate)
        intersections.run()
        intersections.index()

    assemblies = [args.assembly] if args.assembly else Config.assemblies
    runDate = Config.cistromePeakIntersectionRunDate

    for assembly in assemblies:
        printt('***********', assembly)
        with getcursor(DBCONN, "main") as curs:
            if assembly not in ("hg38", "mm10"):
                continue
            doRun(args, assembly, curs, "cistromeIntersections",
                  cistrome_peak_metadata, runDate)
Esempio n. 4
0
    def processGwasBed(self, origBedFnp, bedFnp):
        """Rewrite a tab-separated GWAS BED file into a sorted, cleaned copy.

        Rows whose fifth column contains a comma are split into two rows,
        pairing the first two comma-separated entries of the fifth column
        with the first two of the sixth. In every output row, '-' in the
        last column is replaced by '_'. The result is written to ``bedFnp``
        and sorted in place with ``Utils.sortFile``.

        Args:
            origBedFnp: path of the input file.
            bedFnp: path of the output file (overwritten).

        Raises:
            IndexError: if a non-blank row has fewer than five columns, or a
                row being split has fewer than two entries in column six.
        """
        printt("reading", origBedFnp)
        with open(origBedFnp) as f:
            # A line read from a file still carries its newline and is
            # always truthy, so test the stripped text to skip blank lines
            # instead of crashing on them below.
            rows = [r.rstrip().split('\t') for r in f if r.strip()]
        printt("split rows")
        split = []
        for r in rows:
            if ',' not in r[4]:
                split.append(r)
                continue
            taggedSNPs = r[4].split(',')
            r2s = r[5].split(',')
            # Only the first two entries of each list are used.
            for i in (0, 1):
                dup = list(r)
                dup[4] = taggedSNPs[i]
                dup[5] = r2s[i]
                split.append(dup)
        printt("split rows", len(rows), "to", len(split))

        printt("adding authorPubmedTrait")
        for r in split:
            r[-1] = r[-1].replace('-', '_')

        # Debug peek at the first row; guarded so empty input does not crash.
        if split:
            print("***********", split[0][-1])

        printt("writing", bedFnp)
        with open(bedFnp, 'w') as f:
            for r in split:
                f.write('\t'.join(r) + '\n')
        Utils.sortFile(bedFnp)
        printWroteNumLines(bedFnp)
Esempio n. 5
0
def run(args, DBCONN):
    """Call ``setupCart`` for every configured assembly.

    Each assembly gets its own cursor from ``getcursor``. ``args`` is
    accepted but not used here.
    """
    for assembly in Config.assemblies:
        printt('***********', assembly)
        with getcursor(DBCONN, "07_setup_cart") as cursor:
            setupCart(cursor, assembly)
    def _build(self, assay_term_name, atn, exps):
        """Group experiments by label, register each group, then make tracks.

        Args:
            assay_term_name: assay name; recorded under ``atn`` in
                ``self.labelNToNormal`` and stored in every group record.
            atn: key under which the groups are stored in
                ``self.byAssayBiosampleType``; also a path component.
            exps: list of experiments; sorted in place by ``label``.
        """
        printt("building", assay_term_name, "...")

        def labelOf(exp):
            return exp.label

        # groupby only merges adjacent items, so sort by the same key first.
        exps.sort(key=labelOf)

        self.labelNToNormal[atn] = assay_term_name

        for label, group in groupby(exps, labelOf):
            members = list(group)
            labelN = Helpers.sanitize(label)
            self.labelNToNormal[labelN] = label

            record = {
                "assay_term_name": assay_term_name,
                "atn": atn,
                "label": label,
                "labelN": labelN,
                "fnp": os.path.join(BaseWwwTmpDir, self.assembly, "subtracks",
                                    atn, labelN + '.txt'),
                "exps": members,
                "assembly": self.assembly
            }
            self.byAssayBiosampleType[atn][labelN] = record

        printt("making tracks and subtracks...")
        self._makeSubTracks()
Esempio n. 7
0
 def ensemblIDtoGeneName(self):
     """Build ``self.ensemToGene``, mapping each gene's ``geneid_`` to its
     ``genename_``, from the gene file configured for this assembly."""
     fnp, filetype = paths.gene_files[self.assembly]
     printt("loading", fnp)
     genes = Genes(fnp, filetype).getGenes()
     self.ensemToGene = {gene.geneid_: gene.genename_ for gene in genes}
Esempio n. 8
0
 def _getRowsFromFiles(self):
     """Yield one tuple per data row of every file from ``self._getFiles()``.

     Each file is read as tab-separated text whose first line is a header.
     The header must have "gene_id" at ``self.gene_id_idx``, "TPM" at
     column 5 and "FPKM" at column 6.

     Yields:
         ``(expID, fileID, geneID, geneName, replicates, TPM, FPKM)`` where
         ``geneName`` falls back to ``geneID`` when it is not in
         ``self.ensemToGene`` and ``replicates`` is the file's
         ``biological_replicates`` joined with '_'.

     Raises:
         AssertionError: if a header column is not where it is expected.
         Exception: any other error while reading a file is reported with
             ``eprint`` and re-raised.
     """
     TPM_idx = 5
     FPKM_idx = 6
     for counter, (exp, expF) in enumerate(self._getFiles(), start=1):
         printt(counter, exp.encodeID, expF.fileID,
                expF.biological_replicates, expF.output_type)
         try:
             with open(expF.fnp()) as f:
                 lines = [x.strip().split('\t') for x in f]
             header = lines[0]
             gene_id_idx = self.gene_id_idx
             assert ("gene_id" == header[gene_id_idx])
             assert ("TPM" == header[TPM_idx])
             assert ("FPKM" == header[FPKM_idx])
             # Same for every row of this file, so build it once.
             replicates = '_'.join(
                 str(x) for x in expF.biological_replicates)
             for row in lines[1:]:
                 geneID = row[gene_id_idx]
                 yield (expF.expID, expF.fileID, geneID,
                        self.ensemToGene.get(geneID, geneID), replicates,
                        row[TPM_idx], row[FPKM_idx])
         except Exception:
             # Not a bare except: GeneratorExit (consumer closed the
             # generator at the yield) and KeyboardInterrupt are not read
             # errors and must pass through without a misleading message.
             eprint("error reading:", expF.fnp())
             raise
Esempio n. 9
0
    def setupDB(self):
        """Drop the metadata table if it exists and recreate it empty.

        The table name comes from ``self._tableNameMetadata()``.
        """
        tn = self._tableNameMetadata()
        printt("dropping and creating", tn)

        ddl = """
    DROP TABLE IF EXISTS {tn};

    CREATE TABLE {tn}
    (id serial PRIMARY KEY,
    expID text,
    fileID text,
    replicate INT NOT NULL,
    cellType text,
    organ text,
    cellCompartment text,
    target text,
    lab text,
    assay_term_name text,
    biosample_type text,
    biosample_term_name text,
    biosample_summary text,
    ageTitle text,
    assay_title text,
    signal_files jsonb
    )""".format(tn=tn)
        self.curs.execute(ddl)
Esempio n. 10
0
    def __init__(self, assembly, assay, toks):
        """Describe one experiment file using its experiment's JSON metadata.

        Args:
            assembly: assembly name; "mm10" triggers the summary clean-up.
            assay: stored unchanged as ``self.assay``.
            toks: sequence whose first three items are the experiment id,
                the file id and the cell type name.
        """
        self.assembly = assembly
        self.assay = assay
        self.expID, self.fileID, self.cellTypeName = toks[0], toks[1], toks[2]

        exp = Exp.fromJsonFile(self.expID)
        self.tissue = DetermineTissue.TranslateTissue(assembly, exp)

        self.biosample_term_name = exp.biosample_term_name
        # Fall back to the term name when the JSON has no summary.
        self.biosample_summary = exp.jsondata.get("biosample_summary",
                                                  self.biosample_term_name)

        # biosample_type may be a list; keep only its first entry.
        bsType = exp.jsondata["biosample_type"]
        self.biosample_type = bsType
        if isinstance(bsType, list):
            if len(bsType) > 1:
                printt("multiple items for biosample_type:", bsType)
            self.biosample_type = bsType[0]

        if "mm10" == self.assembly:
            summary = self.biosample_summary.replace("C57BL/6 ", "")
            # Rewrite a single "(N days)" suffix as "eN", then collapse
            # "postnatal eN" to "pN" and "embryo eN" to "eN".
            found = re.findall(r"\ (\((.*) days\))", summary)
            if 1 == len(found):
                parenthesized, age = found[0]
                summary = summary.replace(parenthesized, "e" + age)
                summary = summary.replace("postnatal e", "p")
                summary = summary.replace("embryo e", "e")
            self.biosample_summary = summary

        # ASCII-only form of output(); not used within the visible code.
        out = self.output().encode('ascii', 'ignore').decode('ascii')
Esempio n. 11
0
def run(inFnp, outFnp):
    """Convert a tab-separated file to hammock rows, then sort, bgzip and
    tabix-index the output.

    Each output row keeps the first three input columns and adds an
    attribute column holding the row index and the fourth column as name.
    Rows with exactly nine columns also get a ``struct`` attribute built
    from columns two and three.

    Args:
        inFnp: path of the input file.
        outFnp: path of the output file; bgzip then produces outFnp + '.gz'.
    """
    printt("making hammock from", inFnp)
    with open(inFnp) as src, open(outFnp, 'w') as dst:
        for idx, line in enumerate(src):
            toks = line.rstrip().split('\t')
            attrs = ''.join(["id:", str(idx), ',name:"', toks[3], '"'])
            if 9 == len(toks):
                attrs += ",struct:{{thick:[[{s},{e}],],}}".format(
                    s=toks[1], e=toks[2])
            dst.write("\t".join(toks[:3] + [attrs]) + '\n')

    printt("sorting")
    Utils.sortFile(outFnp)
    printWroteNumLines(outFnp)

    # Compress first; tabix indexes the compressed file.
    steps = (
        ("bgzip", ["bgzip", '-f', outFnp]),
        ("tabix", ["tabix", '-f', outFnp + '.gz']),
    )
    for label, cmds in steps:
        printt(label)
        Utils.runCmds(cmds)

    printt("wrote", inFnp, outFnp)
Esempio n. 12
0
    def _import(self):
        lookup = []

        fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
        printt("parsing", fnp)
        with open(fnp) as f:
            rows = [x.strip() for x in f]

        outRows = Parallel(n_jobs=self.args.j)(delayed(ontologyToCellTypes)(
            line) for line in rows)

        printt('***********', "drop and create", self.tableName)
        self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
cellTypeName text,
biosample_term_id text,
synonyms jsonb
);""".format(tableName=self.tableName))

        printt('***********', "import lookup")
        printt("rewrite rows")
        outF = StringIO.StringIO()
        for row in outRows:
            for r in row:
                outF.write('\t'.join(r) + '\n')
        outF.seek(0)

        cols = ["cellTypeName", "biosample_term_id"]
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        printt("copied in", self.curs.rowcount)
def testHub():
    """Run hubCheck with -noTracks on hub.txt under BaseWwwDir and print
    whatever ``Utils.runCmds`` returns."""
    printt("checking hub...")
    hubFnp = os.path.join(BaseWwwDir, 'hub.txt')
    result = Utils.runCmds(
        ["/data/common/tools/ucsc.v350/hubCheck", "-noTracks", hubFnp])
    printt(result)
Esempio n. 14
0
def doImport(curs, assembly):
    """Load the RAMPAGE matrix into the ``<assembly>_rampage`` table.

    The matrix's first column is looked up in the TSS file (keyed on that
    file's fourth column). Each matrix row is prefixed with fields from the
    matching TSS record, and one ``real`` column is created per remaining
    header entry. After the copy, ``maxVal`` is set to the greatest of those
    per-file columns.

    Args:
        curs: database cursor used for all statements.
        assembly: assembly name; selects the data directory and table name.

    Raises:
        KeyError: if a matrix row's first column is not in the TSS file.
    """
    matrixFnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", matrixFnp)
    with gzip.open(matrixFnp) as f:
        header = f.readline().rstrip('\n').split('\t')
        rows = [line.rstrip('\n').split('\t') for line in f]
    printt("read header and", len(rows), "rows")

    tssFnp = paths.path(assembly, "hg19-tss-filtered.bed.gz")
    lookup = {}
    with gzip.open(tssFnp) as f:
        for line in f:
            rec = line.rstrip('\n').split('\t')
            lookup[rec[3]] = rec

    printt("rewriting")
    outF = StringIO.StringIO()
    for row in rows:
        transcript = row[0]
        t = lookup[transcript]
        annotated = [
            transcript,
            t[6],  # gene
            t[0],  # chrom
            t[1],  # start
            t[2],  # stop
            t[5],  # strand
            t[7].replace("_", " "),  # gene info
        ]
        outF.write('\t'.join(annotated + row[1:]) + '\n')
    outF.seek(0)

    fileIDs = header[1:]
    cols = [
        "transcript", "ensemblid_ver", "chrom", "start", "stop", "strand",
        "geneInfo"
    ] + fileIDs

    tableName = assembly + "_rampage"
    printt("copy into", tableName)
    fieldDefs = ','.join([fid + " real" for fid in fileIDs])
    createSql = """
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}
(id serial PRIMARY KEY,
transcript text,
ensemblid_ver text,
chrom text,
start integer,
stop integer,
strand VARCHAR(1),
geneInfo text,
maxVal real,
{fields}
);""".format(tn=tableName, fields=fieldDefs)
    curs.execute(createSql)

    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("inserted", curs.rowcount)

    updateSql = """
UPDATE {tn}
SET maxVal = GREATEST( {fields} )
""".format(tn=tableName, fields=','.join(fileIDs))
    curs.execute(updateSql)
Esempio n. 15
0
def makeIndexMultiCol(curs, tableName, cols):
    """Drop and recreate one index on ``tableName`` covering all of ``cols``.

    The index name comes from ``_idx`` using the column names joined
    with '_'.
    """
    idx = _idx(tableName, '_'.join(cols))
    printt("indexing", idx)
    sql = """
DROP INDEX IF EXISTS {idx};
CREATE INDEX {idx} on {tableName} ({col});
""".format(idx=idx, tableName=tableName, col=','.join(cols))
    curs.execute(sql)
Esempio n. 16
0
def makeIndex(curs, tableName, cols):
    """Drop and recreate a separate index on ``tableName`` for each column
    in ``cols``; index names come from ``_idx``."""
    template = """
DROP INDEX IF EXISTS {idx};
CREATE INDEX {idx} on {tableName} ({col});
"""
    for col in cols:
        idx = _idx(tableName, col)
        printt("indexing", idx)
        curs.execute(template.format(idx=idx, tableName=tableName, col=col))
Esempio n. 17
0
 def _getFileIDs(self, fn):
     """Return ``(assay, fileIDs)`` for a raw listing file.

     ``assay`` is the part of ``fn`` before the first '-'. ``fileIDs`` is
     the sorted second tab-separated column of every line in the file.
     """
     assay = fn.split('-')[0]
     printt("***********************", self.assembly, assay)
     fnp = paths.path(self.assembly, "raw", fn)
     with open(fnp) as f:
         fileIDs = [line.rstrip('\n').split('\t')[1] for line in f]
     fileIDs.sort()
     return assay, fileIDs
Esempio n. 18
0
def makeIndexRange(curs, tableName, cols):
    """Drop and recreate, for each column in ``cols``, a GiST index on
    ``intarray2int4range(<column>)``; index names come from ``_idx``."""
    template = """
DROP INDEX IF EXISTS {idx};
create index {idx} on {tableName} using gist(intarray2int4range({col}));
"""
    for col in cols:
        idx = _idx(tableName, col)
        printt("indexing int range", idx)
        curs.execute(template.format(idx=idx, tableName=tableName, col=col))
Esempio n. 19
0
def loadJobs(assembly, runDate):
    """Read and return the parsed contents of ``cistromeJobs.json.gz``
    found under the assembly's ``extras/<runDate>`` directory."""
    jobsFnp = paths.path(assembly, "extras", runDate, "cistromeJobs.json.gz")

    printt("reading", jobsFnp)
    with gzip.open(jobsFnp) as gz:
        jobs = json.loads(gz.read())
    printt("loaded", len(jobs))
    return jobs
Esempio n. 20
0
 def run(self):
     """Set up the table, then bulk-copy ``raw/vista.tsv`` into its
     ``accession`` and ``vistaids`` columns."""
     self.setupTable()
     fnp = paths.path(self.assembly, "raw", "vista.tsv")
     printt("reading", fnp)
     columns = ("accession", "vistaids")
     with open(fnp) as vista:
         self.curs.copy_from(vista, self.tableName, '\t', columns=columns)
     printt("copied in %d vista entries" % self.curs.rowcount)
Esempio n. 21
0
def makeIndexInt4Range(curs, tableName, cols):
    """Drop and recreate one index on ``int4range(<cols>)`` for ``tableName``.

    The columns are passed together to ``int4range``; the index name comes
    from ``_idx`` using the column names joined with '_'.
    """
    # http://stackoverflow.com/a/14407839
    idx = _idx(tableName, "_".join(cols))
    printt("indexing int4range", idx)
    sql = """
DROP INDEX IF EXISTS {idx};
    create index {idx} on {tableName} (int4range({cols}));
""".format(idx=idx, tableName=tableName, cols=",".join(cols))
    curs.execute(sql)
Esempio n. 22
0
def run(args, DBCONN):
    """Import DE data, or only rebuild the indexes when ``args.index`` is set.

    Returns:
        The result of ``ImportDE.index()`` in index-only mode, else None.
    """
    printt('***********', "mm10")
    with getcursor(DBCONN, "import DEs") as curs:
        importer = ImportDE(curs)
        if not args.index:
            importer.setupAll(args.sample)
            importer.index()
            return None
        return importer.index()
Esempio n. 23
0
def tag_delete():
    """Remove the tag named by the ``uid`` request argument and redirect
    to /tags.

    Any exception raised while removing the tag is printed and otherwise
    ignored, so the redirect always happens.
    """
    uid = request.args.get('uid')
    try:
        state.get_storage().remove_tag(uid)
    except Exception as err:
        printt(err)
    return redirect('/tags')
Esempio n. 24
0
 def _setupTable(self):
     """Drop ``self.tableName`` if it exists and recreate it with the
     ``accession`` and ``creGroupsSpecific`` columns."""
     printt("drop and create", self.tableName)
     ddl = """
     DROP TABLE IF EXISTS {tn};
     CREATE TABLE {tn}
     (id serial PRIMARY KEY,
     accession text,
     creGroupsSpecific VARCHAR[]
     );""".format(tn=self.tableName)
     self.curs.execute(ddl)
Esempio n. 25
0
 def _setupTable(self):
     """Drop ``self.tableName`` if it exists and recreate it with the
     ``accession`` and ``concordant`` (default FALSE) columns."""
     printt("drop and create", self.tableName)
     ddl = """
     DROP TABLE IF EXISTS {tn};
     CREATE TABLE {tn}
     (id serial PRIMARY KEY,
     accession text,
     concordant BOOLEAN NOT NULL DEFAULT FALSE
     );""".format(tn=self.tableName)
     self.curs.execute(ddl)
Esempio n. 26
0
def run(args, DBCONN):
    """Run ``CreateIndices`` for ``args.assembly`` if given, otherwise for
    every configured assembly. ``DBCONN`` is accepted but not used here."""
    targets = [args.assembly] if args.assembly else Config.assemblies
    for assembly in targets:
        printt('***********', assembly)
        CreateIndices(assembly).run()
Esempio n. 27
0
def run(args, DBCONN):
    """Run ``rna1`` then ``rna2`` for each selected assembly.

    Uses ``args.assembly`` if given, otherwise every configured assembly.
    ``args.assembly`` is overwritten with the current assembly before each
    pair of calls, so it holds the last one processed on return.
    """
    targets = [args.assembly] if args.assembly else Config.assemblies
    for assembly in targets:
        printt('***********', assembly)
        args.assembly = assembly
        for step in (rna1, rna2):
            step.run(args, DBCONN)
Esempio n. 28
0
def makeIndexTextPatternOps(curs, tableName, cols):
    """Drop and recreate, for each column in ``cols``, a btree index using
    the ``text_pattern_ops`` operator class; index names come from ``_idx``."""
    template = """
DROP INDEX IF EXISTS {idx};
CREATE INDEX {idx} on {tableName} 
USING btree ({col} text_pattern_ops);
"""
    for col in cols:
        idx = _idx(tableName, col, "text_pattern_ops")
        printt("indexing", idx)
        curs.execute(template.format(idx=idx, tableName=tableName, col=col))
Esempio n. 29
0
    def addCol(self):
        """Add an integer ``creGroup`` column to ``self.tableName_cre_all``
        and run the follow-up UPDATE."""
        printt("adding col...")
        # NOTE(review): the UPDATE below still contains the placeholder
        # "SET ..." and does not look like complete SQL - confirm.
        sql = """
    ALTER TABLE {tn}
    ADD COLUMN creGroup integer;

    UPDATE {tn}
    SET ...
    """.format(tn=self.tableName_cre_all)
        # NOTE(review): ``curs`` is not defined in this method; presumably
        # ``self.curs`` was intended - verify against the enclosing module.
        curs.execute(sql)
Esempio n. 30
0
def run(args):
    """Run ``ExtractRNAseq`` for ``args.assembly`` if given, otherwise for
    every configured assembly."""
    targets = [args.assembly] if args.assembly else Config.assemblies
    for assembly in targets:
        printt('***********', assembly)
        ExtractRNAseq(assembly).run()