Esempio n. 1
0
    def __init__(self, args, assembly, ver, nbins, j):
        """Resolve the tools and data paths used for one minipeaks run.

        Stores the arguments, creates the minipeaks output directory, locates
        a ``bwtool`` binary and the compiled ``read_json`` filter, and counts
        the lines of the master peak BED file.

        Raises:
            Exception: if no bwtool binary or no compiled filter exists.
        """
        self.args = args
        self.assembly = assembly
        # NOTE(review): the version is stored as ``var`` (not ``ver``); the
        # attribute name is kept because other code may read it -- confirm.
        self.var = ver
        self.nbins = nbins
        self.j = j

        self.raw = paths.path(assembly, "raw")
        self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
        Utils.mkdir_p(self.minipeaks)

        # Probe the known install locations in order; the first hit wins.
        for candidate in ("/data/cherrypy/bin/bwtool",
                          "/usr/local/bin/bwtool",
                          "/data/common/tools/bwtool"):
            self.bwtool = candidate
            if os.path.exists(candidate):
                break
        else:
            raise Exception("no bwtool found")

        here = os.path.dirname(__file__)
        self.bwtoolFilter = os.path.abspath(
            os.path.join(here, 'minipeaks/bin/read_json'))
        if not os.path.exists(self.bwtoolFilter):
            raise Exception("missing C++ bwtool filter; please compile?")

        # GRCh38 reads its master peaks from a fixed registry location.
        defaultPeaks = os.path.join(self.raw, "cREs.bed")
        grch38Peaks = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
        self.masterPeakFnp = grch38Peaks if "GRCh38" == assembly else defaultPeaks
        self.numPeaks = numLines(self.masterPeakFnp)
        printt(self.masterPeakFnp, "has", self.numPeaks)

        self.miniPeaksBedFnp = os.path.join(self.minipeaks, "miniPeakSites.bed.gz")

        self.debug = False
Esempio n. 2
0
 def _load(self):
     """Build ``self.rankMethodToCtAndFileID`` from the per-assay list files.

     For every rank method, reads ``<method>-list.txt`` (lower-cased) from
     the assembly's ``raw`` directory and records one
     ``[method, cell type, signal file path]`` entry per row, skipping two
     treated cell lines.  The signal file's existence is not checked.

     Raises:
         Exception: if a list file is missing.
     """
     # (prefix, suffix) pairs of cell-type names that are left out.
     excluded = (
         ("LNCaP_clone_FGC_immortalized_cell_line_treated_with_1_nM",
          "hydroxy-17-methylestra-4"),
         ("SK-N-SH_immortalized_cell_line_treated_with",
          "all-trans-retinoic_acid_for_48_hours"),
     )
     # (rank method, index of the cell-type column in its list file)
     methods = (("CTCF", 2), ("DNase", 2), ("Enhancer", 4), ("H3K27ac", 2),
                ("H3K4me3", 2), ("Insulator", 4), ("Promoter", 4))

     self.rankMethodToCtAndFileID = []
     for fnBase, ctIdx in methods:
         listFnp = paths.path(self.assembly, "raw",
                              (fnBase + "-list.txt").lower())
         if not os.path.exists(listFnp):
             raise Exception("missing " + listFnp)
         with open(listFnp) as f:
             rows = [x.rstrip('\n').split('\t') for x in f]

         for r in rows:
             ids = r[:ctIdx]
             if 2 == len(ids):
                 efn = '-'.join(ids) + ".txt"
             else:
                 # two ID pairs: "a-b.c-d.txt"
                 efn = '-'.join(ids[:2]) + '.' + '-'.join(ids[2:4]) + ".txt"
             signalFnp = paths.path(self.assembly, "raw", "signal-output", efn)

             ct = r[ctIdx]
             if any(ct.startswith(pre) and ct.endswith(suf)
                    for pre, suf in excluded):
                 continue
             self.rankMethodToCtAndFileID.append([fnBase, ct, signalFnp])
Esempio n. 3
0
    def run(self):
        """Load TAD rows, translate their gene IDs and COPY them into the table.

        Reads ``tads.txt`` (tab-separated) and ``ensebleToID.txt``
        (comma-separated; first field mapped to third), then writes columns
        2 and 3 of each TAD row plus a ``{id,id,...}`` list of the mapped
        genes from column 4 into ``self.tableName``.
        """
        tadFnp = paths.path(self.assembly, "raw", "tads.txt")
        printt("reading", tadFnp)
        with open(tadFnp) as tadFile:
            tadRows = [line.rstrip('\n').split('\t') for line in tadFile]

        idFnp = paths.path(self.assembly, "extras", "ensebleToID.txt")
        printt("reading", idFnp)
        lookup = {}
        with open(idFnp) as idFile:
            for line in idFile:
                toks = line.rstrip('\n').split(',')
                lookup[toks[0]] = toks[2]

        buf = StringIO.StringIO()
        for tr in tadRows:
            first, second = tr[1], tr[2]
            geneIDs = ','.join(lookup[g] for g in tr[3].split(','))
            buf.write('\t'.join([first, second, '{' + geneIDs + '}']) + '\n')
        buf.seek(0)

        self.setupTable()
        self.curs.copy_from(buf, self.tableName, '\t',
                            columns=("accession", "tadName", "geneIDs"))
        printt("copied in TADs", self.curs.rowcount)
Esempio n. 4
0
def doImport(curs, assembly):
    """(Re)create ``<assembly>_rampage`` and fill it from the RAMPAGE matrix.

    Each matrix row (transcript followed by one value per file ID) is
    annotated with fields of the matching row of the filtered TSS BED file
    (matched on its 4th column), copied into the table, and ``maxVal`` is
    then set to the greatest of the per-file values.
    """
    matrixFnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")
    printt("reading", matrixFnp)
    with gzip.open(matrixFnp) as f:
        header = f.readline().rstrip('\n').split('\t')
        rows = [line.rstrip('\n').split('\t') for line in f]
    printt("read header and", len(rows), "rows")

    tssFnp = paths.path(assembly, "hg19-tss-filtered.bed.gz")
    lookup = {}
    with gzip.open(tssFnp) as f:
        for line in f:
            toks = line.rstrip('\n').split('\t')
            lookup[toks[3]] = toks

    printt("rewriting")
    outF = StringIO.StringIO()
    for row in rows:
        transcript = row[0]
        t = lookup[transcript]
        annotated = [
            transcript,
            t[6],  # gene
            t[0],  # chrom
            t[1],  # start
            t[2],  # stop
            t[5],  # strand
            t[7].replace("_", " "),  # gene info
        ]
        outF.write('\t'.join(annotated + row[1:]) + '\n')
    outF.seek(0)

    fileIDs = header[1:]
    cols = ["transcript", "ensemblid_ver", "chrom", "start", "stop",
            "strand", "geneInfo"]
    cols = cols + fileIDs

    tableName = assembly + "_rampage"
    printt("copy into", tableName)
    # one "real" column per file ID
    realColumns = ','.join(fid + " real" for fid in fileIDs)
    curs.execute("""
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}
(id serial PRIMARY KEY,
transcript text,
ensemblid_ver text,
chrom text,
start integer,
stop integer,
strand VARCHAR(1),
geneInfo text,
maxVal real,
{fields}
);""".format(tn=tableName, fields=realColumns))

    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("inserted", curs.rowcount)

    allValues = ','.join(fileIDs)
    curs.execute("""
UPDATE {tn}
SET maxVal = GREATEST( {fields} )
""".format(tn=tableName, fields=allValues))
Esempio n. 5
0
    def _overlap(self, bedFnp):
        """Intersect the GWAS SNP BED file with cREs and load the overlaps.

        Runs ``bedtools intersect`` on selected columns of ``bedFnp`` against
        the sorted cRE BED file (sorting it first if the sorted file is
        missing), validates every hit, copies
        (authorPubmedTrait, accession, snp) rows into
        ``self.tableNameOverlap`` and indexes the table on authorPubmedTrait.

        Args:
            bedFnp: path of the BED file handed to ``cat`` for intersection.

        Raises:
            Exception: if bedtools reports no intersections at all, or if a
                hit has a trait without '_', a SNP id not starting with
                "rs", or an accession not starting with "EH3".
        """
        printt("******************* GWAS overlap")
        self._setupOverlap()

        cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
        if not os.path.exists(cresFnp):
            Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"),
                           cresFnp)

        printt("running bedtools intersect...")
        cmds = [cat(bedFnp),
                '|', "cut -f -4,11-",
                '|', "bedtools intersect",
                "-a", "-",
                "-b", cresFnp,
                "-wo"]
        snpsIntersecting = Utils.runCmds(cmds)
        # Fail with a clear message instead of an IndexError/NameError from
        # the "example" prints below when nothing intersected.
        if not snpsIntersecting:
            raise Exception("bedtools intersect returned no GWAS/cRE overlaps")
        print("example", snpsIntersecting[0].rstrip('\n').split('\t'))

        printt("rewriting...")
        outF = StringIO.StringIO()
        count = {}
        lastRow = None
        for r in snpsIntersecting:
            toks = r.rstrip('\n').split('\t')
            snp = toks[3]
            # '-' is normalized to '_' so the trait always uses one separator
            authorPubmedTrait = toks[4].replace('-', '_')
            accession = toks[9]

            checks = (
                ('_' in authorPubmedTrait, "bad authorPubmedTrait?"),
                (snp.startswith("rs"), "bad rs?"),
                (accession.startswith("EH3"), "bad line?"),
            )
            for ok, message in checks:
                if not ok:
                    print(r)
                    print(toks)
                    raise Exception(message)

            lastRow = [authorPubmedTrait, accession, snp]
            outF.write('\t'.join(lastRow) + '\n')
            count[authorPubmedTrait] = count.get(authorPubmedTrait, 0) + 1
        print("example", '\t'.join(lastRow))
        for k, v in count.iteritems():
            print("%s: %d" % (k, v))
        outF.seek(0)

        printt("copying into DB...")
        cols = "authorPubmedTrait accession snp".split(' ')
        self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols)
        importedNumRows(self.curs)

        makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
Esempio n. 6
0
    def __init__(self, assembly, ver, nbins, assay):
        """Record the run parameters and derive the data paths for them."""
        self.assembly = assembly
        self.nbins = nbins
        self.ver = ver
        self.assay = assay

        self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
        self.raw = paths.path(assembly, "raw")

        # GRCh38 reads its master peaks from a fixed registry location.
        defaultPeaks = os.path.join(self.raw, "cREs.bed")
        grch38Peaks = "/data/projects/encode/Registry/V2/GRCh38/GRCh38-ccREs.bed"
        self.masterPeakFnp = grch38Peaks if "GRCh38" == assembly else defaultPeaks
Esempio n. 7
0
    def computeIntersections(self):
        """Run all intersect jobs in parallel and write the merged results.

        Produces two gzipped, tab-separated files under
        ``extras/<runDate>``: per accession the JSON-encoded "tf" and
        "histone" label -> file ID maps, and per file ID the JSON-encoded
        accessions it intersected.
        """
        bedFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
        if not os.path.exists(bedFnp):
            rawBed = paths.path(self.assembly, "raw", "cREs.bed")
            Utils.sortFile(rawBed, bedFnp)

        jobs = self.makeJobs()

        runner = Parallel(n_jobs=self.args.j)
        results = runner(delayed(runIntersectJob)(job, bedFnp) for job in jobs)

        printt("\n")
        printt("merging intersections into hash...")

        tfImap = {}
        fileJsons = []
        filesToAccessions = {}
        for fileJson, accessions in results:
            if not accessions:
                continue
            for etype, label, fileID, accs in accessions:
                filesToAccessions[fileID] = accs
                for acc in accs:
                    byType = tfImap.setdefault(acc, {"tf": {}, "histone": {}})
                    byType[etype].setdefault(label, []).append(fileID)
            fileJsons += fileJson

        printt("completed hash merge")

        printt("runDate:", self.runDate)
        peaksFnp = paths.path(self.assembly, "extras", self.runDate,
                              "peakIntersections.json.gz")
        Utils.ensureDir(peaksFnp)
        with gzip.open(peaksFnp, 'w') as out:
            for acc, byType in tfImap.iteritems():
                fields = [acc,
                          json.dumps(byType["tf"]),
                          json.dumps(byType["histone"])]
                out.write('\t'.join(fields) + '\n')
        printt("wrote", peaksFnp)

        chipFnp = paths.path(self.assembly, "extras", self.runDate,
                             "chipseqIntersectionsWithCres.json.gz")
        Utils.ensureDir(chipFnp)
        with gzip.open(chipFnp, 'w') as out:
            for fileID, accs in filesToAccessions.iteritems():
                out.write('\t'.join([fileID, json.dumps(accs)]) + '\n')
        printt("wrote", chipFnp)
Esempio n. 8
0
def run(args, DBCONN):
    """Write the minipeaks import script and print the cqlsh command to run it.

    For each selected assembly (all configured ones unless ``args.assembly``
    is set) the user is asked for confirmation unless ``args.yes`` is set;
    confirmed imports are prepared and appended to the script file.
    """
    scriptFnp = os.path.join(os.path.dirname(__file__),
                             '../../../minipeak_import.txt')

    assemblies = Config.assemblies
    if args.assembly:
        assemblies = [args.assembly]

    # sampling runs single-core
    requestedCores = args.j
    cores = 1 if args.sample else requestedCores

    with open(scriptFnp, 'w') as outF:
        for assembly in assemblies:
            for ver, nbins in [(6, 20)]:
                if not args.yes:
                    question = "(Re)import %s, version %s, %s nbins?" % (assembly, ver, nbins)
                    if not GetYesNoToQuestion.immediate(question):
                        print("skipping", assembly, ver, nbins)
                        continue

                minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))

                outF.write("use minipeaks;\n")
                importer = ImportMinipeaks(assembly, nbins, ver, cores)
                importer.prepImportAndWriteScript(outF, args.sample)

    printWroteNumLines(scriptFnp)
    hostSetting = 'CQLSH_HOST="{hosts}"'.format(hosts=Config.cassandra[0])
    print("please run this command:")
    print(' '.join([hostSetting, "cqlsh", "-f", scriptFnp]))
Esempio n. 9
0
    def run(self):
        """Spot-check DB z-scores against the raw signal files.

        Randomly samples lines of the raw cRE BED file with awk, keeps those
        on chromosomes listed for the assembly, and for every rank
        method / cell type greps the cRE's value out of the signal file and
        compares it (``isclose(..., 0.001)``) with the value from the DB.

        Raises:
            Exception: on the first mismatch, after printing details.
        """
        bedFnp = paths.path(self.assembly, "raw", "cREs.bed")
        sampleCmds = [
            "cat", bedFnp, '|',
            """awk 'BEGIN {srand()} !/^$/ { if(rand() <= .00001) print $0}'"""
        ]
        sampled = Utils.runCmds(sampleCmds)
        cres = [x.rstrip('\n').split('\t') for x in sampled if x]
        cres = [toks for toks in cres if toks[0] in chroms[self.assembly]]
        print("selected", len(cres), "cres")
        # chr1    10244   10357   EH37D0000001    EH37E1055273    Promoter-like   proximal
        cres = [CREnt(*toks[:5]) for toks in cres]

        for cre in cres:
            allRanks = CRE(self.pgSearch, cre.accession, self.cache).allRanks()
            for rm, ct, fnp in self.rankMethodToCtAndFileID:
                grepped = Utils.runCmds(['grep', cre.mpName, fnp])
                zscore = float(grepped[0].split('\t')[1])
                # stored indices are 1-based
                ctIdx = self.rankMethodToIDxToCellType[rm][ct] - 1
                zscoreDb = allRanks[rm.lower()][ctIdx]
                if isclose(zscore, zscoreDb, 0.001):
                    sys.stdout.write('.')
                    continue
                eprint("PROBLEM")
                eprint(cre)
                eprint(rm, ct)
                eprint("from", fnp)
                eprint(zscore)
                eprint("from DB lookup")
                eprint(zscoreDb)
                raise Exception("error")
            print(cre.accession, "ok")
Esempio n. 10
0
    def _makeFile(self, assay, title):
        """Export, per cell type, the cREs whose ``assay`` z-score exceeds 1.64.

        One TSV (chrom, start, stop, accession, z-score; highest z-score
        first) is written per cell type under ``export/<assay>-like``.

        Args:
            assay: assay name; used for the z-score column and dataset lookup.
            title: key into ``self.ctmap`` giving the cell-type indices.
        """
        print("********************", title)
        self.assaymap = {assay: self.pgc.datasets_multi(assay)}
        datasets = self.assaymap[assay]
        cts = sorted(set(datasets.keys()))
        tableName = self.assembly + "_cre_all"

        for ct in cts:
            print(ct)
            cti = self.ctmap[title][ct]
            self.curs.execute("""
            SELECT accession, {assay}_zscores[{cti}], chrom, start, stop
            FROM {tn}
            WHERE {assay}_zscores[{cti}] > 1.64
            ORDER BY 2 DESC
            """.format(assay=assay, cti=cti, tn=tableName))
            rows = self.curs.fetchall()

            # keep only alphanumerics and '_' so the name is file-system safe
            ctSan = "".join(ch for ch in ct if ch.isalnum() or ch == '_')
            dnase_expID = self.assaymap[assay][ct]["dnase_expid"]
            other_expID = self.assaymap[assay][ct]["other_expid"]
            baseName = '_'.join([ctSan, dnase_expID, other_expID]) + ".tsv"
            outFnp = paths.path(self.assembly, "export", assay + "-like",
                                baseName)
            Utils.ensureDir(outFnp)
            with open(outFnp, 'w') as outF:
                for r in rows:
                    # reorder to chrom, start, stop, accession, zscore
                    line = '\t'.join(str(r[i]) for i in (2, 3, 4, 0, 1))
                    outF.write(line + '\n')
            printWroteNumLines(outFnp)
Esempio n. 11
0
 def __init__(self, curs, assembly, args):
     """Store the cursor/arguments and derive the table and input file names."""
     self.curs = curs
     self.assembly = assembly
     self.args = args
     self.tableName = assembly + "_nine_state"
     matrixName = self.assembly + "-Look-Up-Matrix.txt"
     self.inFnp = paths.path(self.assembly, matrixName)
Esempio n. 12
0
    def run(self):
        """Load the mm10 <-> hg19 synteny pairs into the liftover table.

        Each line of ``Two-Way-Synteny.txt`` is read as
        ``humanAccession<TAB>mouseAccession[<TAB>...]``.  Both accessions
        must be present in ``self.ccREmaps``; the row written to the table
        carries the human cRE's chrom/start/stop plus both accessions.

        NOTE(review): the previous code referenced an ``overlap`` value at
        an index that was never populated.  Here it is taken from a third
        column of the input file when present and written as NULL (``\\N``)
        otherwise -- confirm the intended source of ``overlap`` and that the
        column accepts NULL.

        Raises:
            KeyError: if an accession is missing from ``self.ccREmaps``.
        """
        fnp = paths.path("mm10", "Two-Way-Synteny.txt")
        self.setupLiftover()

        printt("reading", fnp)
        mmToHg = []
        with open(fnp, 'r') as f:
            for line in f:
                toks = line.strip().split('\t')
                humanAcc, mouseAcc = toks[0], toks[1]
                # Unpacking checks both accessions are known 4-tuples; only
                # the human coordinates are written out.
                mc, ms, me, md = self.ccREmaps["mm10"][mouseAcc]
                hc, hs, he, hd = self.ccREmaps["hg19"][humanAcc]
                overlap = toks[2] if len(toks) > 2 else "\\N"
                mmToHg.append([hc, hs, he, mouseAcc, humanAcc, overlap])

        cols = "chrom start stop mouseAccession humanAccession overlap".split(' ')
        printt("writing stringio...")
        outF = StringIO.StringIO()
        for r in mmToHg:
            outF.write("\t".join(r) + '\n')
        outF.seek(0)

        printt("copy into db...")
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        printt("\tok", self.curs.rowcount)

        makeIndex(self.curs, self.tableName, ["mouseAccession", "humanAccession"])
Esempio n. 13
0
    def _import(self):
        lookup = []

        fnp = paths.path(self.assembly, self.assembly + "-Look-Up-Matrix.txt")
        printt("parsing", fnp)
        with open(fnp) as f:
            rows = [x.strip() for x in f]

        outRows = Parallel(n_jobs=self.args.j)(delayed(ontologyToCellTypes)(
            line) for line in rows)

        printt('***********', "drop and create", self.tableName)
        self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
cellTypeName text,
biosample_term_id text,
synonyms jsonb
);""".format(tableName=self.tableName))

        printt('***********', "import lookup")
        printt("rewrite rows")
        outF = StringIO.StringIO()
        for row in outRows:
            for r in row:
                outF.write('\t'.join(r) + '\n')
        outF.seek(0)

        cols = ["cellTypeName", "biosample_term_id"]
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        printt("copied in", self.curs.rowcount)
Esempio n. 14
0
class FCPaths:
    """Filesystem locations used by the FANTOM CAT steps.

    Attributes are path strings (``forimport`` is a dict of them).  All
    are built under ``base`` except ``cres``, which comes from
    ``paths.path``.
    """
    base = paths.fantomcat

    # gene tables and per-gene JSON directory
    genetsv = os.path.join(base, "gene.info.tsv")
    genebed = os.path.join(base, "gene.info.bed")
    genesdir = os.path.join(base, "genes")

    # cRE input and intersection outputs
    cres = paths.path("hg19", "raw/cREs.sorted.bed.gz")
    intersected = os.path.join(base, "gene.info.intersected.bed")
    enhancer_intersected = os.path.join(base, "cREs.enhancers.intersected.bed")
    CAGE_intersected = os.path.join(base, "cREs.CAGE.intersected.bed")
    permissive_enhancers = os.path.join(base, "permissive_enhancers.bed.gz")
    robust_CAGE = os.path.join(base, "robust.cage.bed.gz")

    # 2kb TSS files
    twokb = os.path.join(base, "gene.2kbtss.bed")
    twokb_intersected = os.path.join(base, "gene.2kbtss.intersected.bed")

    # statistics
    global_statistics = os.path.join(base, "global_statistics.json")
    twokb_statistics = os.path.join(base, "twokb_statistics.json")

    # files prepared for import, keyed by what they contain
    forimport = dict(
        genes=os.path.join(base, "gene.import.tsv"),
        intersections=os.path.join(base, "intersections.tsv"),
        twokb_intersections=os.path.join(base, "twokb.intersections.tsv"),
    )
    zenbu_track = os.path.join(base, "web_zenbu_downloads",
                               "5BFANTOMCAT5DRobustgene.bed")

    @staticmethod
    def genepath(acc):
        """Return the path of the JSON file for accession ``acc``."""
        filename = acc + ".json"
        return os.path.join(FCPaths.genesdir, filename)
Esempio n. 15
0
 def _load_ccRE_map(assembly):
     """Map the 5th column of each sorted cRE BED line to its first four columns.

     Later lines overwrite earlier ones that share the same 5th-column value.
     """
     bedFnp = paths.path(assembly, "raw", "cREs.sorted.bed")
     with open(bedFnp, 'r') as f:
         rows = (line.strip().split('\t') for line in f)
         return {toks[4]: tuple(toks[:4]) for toks in rows}
Esempio n. 16
0
def metadata(curs, assembly):
    """(Re)create ``<assembly>_rampage_info`` with one row per RAMPAGE file.

    File IDs come from the header of the RAMPAGE matrix; for each one the
    experiment is looked up and its metadata, tissue and strand ('+' or
    '-', from the first word of the file's output type) are stored.

    Raises:
        Exception: if the output type starts with neither "plus" nor "minus".
    """
    fnp = paths.path(assembly, "hg19-tss-rampage-matrix.txt.gz")

    printt("reading", fnp)
    with gzip.open(fnp) as f:
        header = f.readline().rstrip('\n').split('\t')

    fileIDs = header[1:]

    tableName = assembly + "_rampage_info"
    printt("dropping and creating", tableName)

    curs.execute("""
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}
(id serial PRIMARY KEY,
expID text,
fileID text,
biosample_term_name text,
biosample_type text,
biosample_summary text,
tissue text,
strand VARCHAR(1)
) """.format(tn=tableName))

    outF = StringIO.StringIO()

    # the memcache wrapper is optional
    mc = MemCacheWrapper(Config.memcache) if Config.memcache else None
    qd = QueryDCC(auth=False, cache=mc)

    strandSymbols = {"plus": '+', "minus": '-'}
    for fileID in fileIDs:
        exp = qd.getExpFromFileID(fileID)
        expID = exp.encodeID
        tissue = DetermineTissue.TranslateTissue(assembly, exp).strip()
        for expFile in exp.files:
            if expFile.fileID != fileID:
                continue
            print(expFile)
            print(expFile.output_type)
            firstWord = expFile.output_type.split()[0]
            if firstWord not in strandSymbols:
                raise Exception("unknown strand " + expFile.output_type)
            strand = strandSymbols[firstWord]
            fields = [
                expID, fileID, exp.biosample_term_name, exp.biosample_type,
                exp.getExpJson()["biosample_summary"], tissue, strand
            ]
            outF.write('\t'.join(fields) + '\n')
    outF.seek(0)

    cols = "expID fileID biosample_term_name biosample_type".split(' ')
    cols += ["biosample_summary", "tissue", "strand"]
    curs.copy_from(outF, tableName, '\t', columns=cols)
    printt("\tok", curs.rowcount)
Esempio n. 17
0
 def run(self):
     """Recreate the table and COPY ``vista.tsv`` straight into it."""
     self.setupTable()
     vistaFnp = paths.path(self.assembly, "raw", "vista.tsv")
     printt("reading", vistaFnp)
     columns = ("accession", "vistaids")
     with open(vistaFnp) as vistaFile:
         self.curs.copy_from(vistaFile, self.tableName, '\t', columns=columns)
     printt("copied in %d vista entries" % self.curs.rowcount)
Esempio n. 18
0
def loadJobs(assembly, runDate):
    """Return the jobs stored in ``extras/<runDate>/cistromeJobs.json.gz``."""
    jobsFnp = paths.path(assembly, "extras", runDate, "cistromeJobs.json.gz")

    printt("reading", jobsFnp)
    with gzip.open(jobsFnp) as gz:
        loaded = json.load(gz)
    printt("loaded", len(loaded))
    return loaded
Esempio n. 19
0
 def _getFileIDs(self, fn):
     """Return ``(assay, sorted file IDs)`` for the list file ``fn``.

     The assay is the part of ``fn`` before the first '-'; the file IDs are
     the second tab-separated column of the file in the ``raw`` directory.
     """
     assay = fn.split('-')[0]
     printt("***********************", self.assembly, assay)
     listFnp = paths.path(self.assembly, "raw", fn)
     with open(listFnp) as f:
         fileIDs = [line.rstrip('\n').split('\t')[1] for line in f]
     fileIDs.sort()
     return assay, fileIDs
Esempio n. 20
0
 def __init__(self, args, assembly, runDate=None):
     """Store the arguments and make sure the jobs file's directory exists.

     Args:
         runDate: run date string; today's date (YYYY-MM-DD) when falsy.
     """
     self.args = args
     self.assembly = assembly
     self.runDate = runDate or arrow.now().format('YYYY-MM-DD')
     self.jobsFnp = paths.path(self.assembly, "extras", self.runDate,
                               "jobs.json.gz")
     Utils.ensureDir(self.jobsFnp)
Esempio n. 21
0
def computeIntersections(args, assembly):
    """Intersect the cistrome data with cREs and store the merged result.

    Sorts the cRE BED file if the sorted copy is missing, runs one
    ``cistromeIntersectJob`` per job in parallel and hands the results to
    ``processResults`` together with today's output path.
    """
    sortedBed = paths.path(assembly, "extras", "cREs.sorted.bed")
    if not os.path.exists(sortedBed):
        rawBed = paths.path(assembly, "raw", "cREs.bed")
        Utils.sortFile(rawBed, sortedBed)

    runDate = arrow.now().format('YYYY-MM-DD')
    jobs = makeJobs(assembly, paths.cistrome("data", "raw"), runDate)

    runner = Parallel(n_jobs=args.j)
    results = runner(delayed(cistromeIntersectJob)(job, sortedBed)
                     for job in jobs)

    print("\n")
    printt("merging intersections into hash...")

    outFnp = paths.path(assembly, "extras", runDate,
                        "cistromeIntersections.json.gz")
    processResults(results, outFnp)
Esempio n. 22
0
    def run(self):
        """Write today's gene expression rows to a gzipped TSV file."""
        self.ensemblIDtoGeneName()

        stamp = arrow.now().format('YYYY-MM-DD')
        outFnp = paths.path(self.assembly, "geneExp", stamp + ".tsv.gz")
        Utils.ensureDir(outFnp)
        with gzip.open(outFnp, 'wb') as out:
            for fields in self._getRowsFromFiles():
                line = '\t'.join(fields)
                out.write(line + '\n')
        printWroteNumLines(outFnp)
Esempio n. 23
0
    def run(self):
        """Run the GWAS import steps in order: gwas, enrichment, studies, overlap."""
        # NOTE(review): dataF and origBedFnp are not used below; the lookups
        # are kept in case the path helpers have side effects -- confirm.
        dataF = paths.path(self.assembly, "gwas", "h3k27ac")
        origBedFnp = paths.gwasFnp(self.assembly, self.version, ".bed")

        sortedBedFnp = paths.gwasFnp(self.assembly, self.version, ".sorted.bed")

        self._gwas(sortedBedFnp)
        enrichmentHeader = self._enrichment()
        self._studies(enrichmentHeader)
        self._overlap(sortedBedFnp)
Esempio n. 24
0
    def run(self):
        """Recreate the table and COPY the run's gzipped intersections into it."""
        self.setupTable()

        dataFnp = paths.path(self.assembly, "extras", self.runDate,
                             "%s.json.gz" % self._tsuffix)
        printt("copying in data", dataFnp)
        with gzip.open(dataFnp) as gz:
            self.curs.copy_from(gz, self.tableName, '\t',
                                columns=["accession", "tf", "histone"])
        importedNumRows(self.curs)
Esempio n. 25
0
def makeSubsampledAccessions(assembly, ver, nbins, accessionsFnp):
    """Write a random ~1% sample of minipeak accessions to ``accessionsFnp``.

    The 4th column of ``miniPeakSites.bed.gz`` is piped through awk, which
    keeps each non-empty line when ``rand() <= .01``.
    """
    Utils.ensureDir(accessionsFnp)

    miniPeaksBedFnp = os.path.join(
        paths.path(assembly, "minipeaks", str(ver), str(nbins)),
        "miniPeakSites.bed.gz")

    pickColumn = "| awk '{ print $4 }' "
    subsample = "| awk 'BEGIN {srand()} !/^$/ { if (rand() <= .01) print $0}'"
    Utils.runCmds(["zcat", miniPeaksBedFnp, pickColumn, subsample,
                   ">", accessionsFnp])
    printWroteNumLines(accessionsFnp)
Esempio n. 26
0
    def __init__(self, assembly, nbins, ver, cores):
        self.assembly = assembly
        self.nbins = nbins
        self.ver = ver
        self.cores = cores

        self.cluster = Cluster(Config.cassandra)
        self.session = self.cluster.connect()
        self.session.execute("""CREATE KEYSPACE IF NOT EXISTS minipeaks
WITH replication = {'class':'SimpleStrategy', 'replication_factor':1};""")
        self.session.set_keyspace("minipeaks")
        self.minipeaks = paths.path(assembly, "minipeaks", str(ver), str(nbins))
Esempio n. 27
0
    def run(self):
        """Generate the ontology terms and save them as gzipped JSON.

        Calls ``run`` of the ``10_generate_ontology_actual`` module with the
        Uberon, EFO and OBI OWL sources and dumps the result to
        ``ontology/<downloadDate>/ontology.json.gz``.
        """
        generator = import_module("10_generate_ontology_actual")
        runF = generator.run

        downloadDate = '2017-10Oct-25'
        # Switch to False to read previously downloaded OWL files instead.
        useRemoteSources = True
        if useRemoteSources:
            uberon_url = "http://ontologies.berkeleybop.org/uberon/composite-metazoan.owl"
            efo_url = "http://sourceforge.net/p/efo/code/HEAD/tree/trunk/src/efoinowl/InferredEFOOWLview/EFO_inferred.owl?format=raw"
            obi_url = "http://purl.obolibrary.org/obo/obi.owl"
        else:
            uberon_url = paths.path("ontology", downloadDate,
                                    "composite-metazoan.owl")
            efo_url = paths.path("ontology", downloadDate, "EFO_inferred.owl")
            obi_url = paths.path("ontology", downloadDate, "obi.owl")

        printt("running ENCODE DCC generate ontology...")
        terms = runF(uberon_url, efo_url, obi_url)

        outFnp = paths.path("ontology", downloadDate, "ontology.json.gz")
        Utils.ensureDir(outFnp)
        printt("done; about to write", outFnp)
        with gzip.open(outFnp, 'wb') as out:
            json.dump(terms, out)
        printWroteNumLines(outFnp)
Esempio n. 28
0
    def run(self):
        """Recreate the table and COPY the gzipped TADs BED file into it."""
        fnp = paths.path(self.assembly, "extras", "TADs.bed.gz")

        printt("reading", fnp)
        with gzip.open(fnp) as gz:
            lines = [line.rstrip() for line in gz]

        # Splitting on tabs and re-joining with tabs leaves a line unchanged,
        # so stripping trailing whitespace is the only normalization needed.
        buf = StringIO.StringIO()
        for line in lines:
            buf.write(line + '\n')
        buf.seek(0)

        self.setupTable()
        tadColumns = ("chrom", "start", "stop", "tadName")
        self.curs.copy_from(buf, self.tableName, '\t', columns=tadColumns)
        printt("copied in TADs", self.curs.rowcount)
Esempio n. 29
0
    def _import(self):
        printt('***********', "drop and create", self.tableName)

        # "AEO:0001021": {
        #                 "assay": [],
        #                 "category": [],
        #                 "developmental": [],
        #                 "name": "stem cell population",
        #                 "objectives": [],
        #                 "organs": [],
        #                 "part_of": [],
        #                 "preferred_name": "",
        #                 "slims": [],
        #                 "synonyms": [],
        #                 "systems": [],
        #                 "types": []
        #             },

        self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
oid text,
info jsonb
);""".format(tableName=self.tableName))

        printt('***********', "import ontology info")
        downloadDate = '2017-10Oct-25'
        fnp = paths.path("ontology", downloadDate, "ontology.json.gz")

        with gzip.open(fnp, "rb") as f:
            kv = json.load(f)

        outRows = Parallel(n_jobs=self.args.j)(delayed(runOntology)(
            oid, infos) for oid, infos in kv.iteritems())

        outF = StringIO.StringIO()
        for r in outRows:
            outF.write(r + '\n')
        outF.seek(0)

        cols = ["oid", "info"]
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        if 0 == self.curs.rowcount:
            raise Exception("error: no rows inserted")
        printt("imported", self.curs.rowcount, "rows", self.tableName)
Esempio n. 30
0
def loadGidsToDbIds(assembly):
    """Read ``ensebleToID.txt`` and build the gene-ID lookup tables.

    Each non-blank line must hold exactly three comma-separated fields.

    Returns:
        A ``(gidsToDbID, requiredGids)`` tuple of dicts.  ``gidsToDbID``
        maps both the first and the second field of every row to the third
        field; ``requiredGids`` maps only the first field to the third.

    Raises:
        Exception: if a non-blank line does not have exactly three fields.
    """
    fnp = paths.path(assembly, "extras", "ensebleToID.txt")

    printt("reading", fnp)
    with open(fnp) as f:
        # Lines still carry their newline, so a bare truthiness test never
        # skips anything; strip first so blank lines really are ignored
        # instead of failing the field-count check below.
        rows = [line.rstrip('\n').split(',') for line in f if line.strip()]

    gidsToDbID = {}
    requiredGids = {}
    for r in rows:
        if 3 != len(r):
            print(r)
            raise Exception("wrong num toks")
        gid, altGid, dbID = r
        gidsToDbID[gid] = dbID
        requiredGids[gid] = dbID
        gidsToDbID[altGid] = dbID
    return gidsToDbID, requiredGids