Esempio n. 1
0
    def run(self):
        tableName = self.assembly + "_snps"
        printt("dropping and creating", tableName)
        self.curs.execute("""
DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
snp varchar(15),
chrom text,
start integer,
stop integer
);
""".format(tableName=tableName))

        fns = {
            "mm10": "dbsnps.142.mm10.ucsc_table_browser.tsv.gz",
            "hg19": "dbsnps.144.hg19.ucsc_table_browser.tsv.gz",
            #       "hg38": "dbsnps.144.hg38.ucsc_table_browser.tsv.gz"
        }
        fnp = os.path.join(Dirs.dbsnps, fns[self.assembly])

        printt("importing", fnp)
        with gzip.open(fnp) as f:
            cols = ["chrom", "start", "stop", "snp"]
            self.curs.copy_from(f, tableName, '\t', columns=cols)
        printt("imported", self.curs.rowcount)

        makeIndex(self.curs, tableName, ["snp", "chrom"])
        makeIndexIntRange(self.curs, tableName, ["start", "stop"])
Esempio n. 2
0
    def _gwas(self, fnp):
        printt("******************* GWAS")
        printt("reading", fnp)
        with open(fnp) as f:
            rows = [r.rstrip().split('\t') for r in f if r]

        self.setupGWAS()

        printt("rewrite rows")
        outF = StringIO.StringIO()
        for r in rows:
            if 'Lead' == r[4]:
                r[4] = r[3]
            r[5] = "{%s}" % r[5].replace('*', "-1")
            r[2] = str(int(r[2]) + 1)
            r[-1] = r[-1].replace('-', '_')
            outF.write('\t'.join(r) + '\n')
        print("example", '\t'.join(r))
        outF.seek(0)

        cols = "chrom start stop snp taggedSNP r2 ldblock trait author pubmed authorPubmedTrait".split(' ')
        self.curs.copy_from(outF, self.tableNameGwas, '\t', columns=cols)
        importedNumRows(self.curs)

        makeIndex(self.curs, self.tableNameGwas,
                  ["chrom", "authorPubmedTrait", "ldblock", "snp"])
        makeIndexIntRange(self.curs, self.tableNameGwas, ["start", "stop"])
Esempio n. 3
0
    def run(self):
        fnp = paths.path("mm10", "Two-Way-Synteny.txt")
        self.setupLiftover()

        printt("reading", fnp)
        mmToHG = []
        with open(fnp, 'r') as f:
            for line in f:
                line = line.strip().split('\t')
                mc, ms, me, md = self.ccREmaps["mm10"][line[1]]
                hc, hs, he, hd = self.ccREmaps["hg19"][line[0]]
                mmToHg.append([hc, hs, he, md, line[1], hd, line[0]])

        cols = "chrom start stop mouseAccession humanAccession overlap".split(' ')
        printt("writing stringio...")
        outF = StringIO.StringIO()
        for r in mmToHg:
            outF.write("\t".join([r[0], r[1], r[2], r[4], r[6], r[7]]) + '\n')
        outF.seek(0)

        printt("copy into db...")
        self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
        printt("\tok", self.curs.rowcount)

        makeIndex(self.curs, self.tableName, ["mouseAccession", "humanAccession"])
Esempio n. 4
0
    def run(self):
        with db_connect_single(os.path.realpath(__file__)) as conn:
            with conn.cursor() as curs:
                tn = "r_rnas_" + self.assembly
                makeIndex(curs, tn, ["celltype"])

                tn = "r_expression_" + self.assembly
                makeIndex(curs, tn, ["dataset"])
Esempio n. 5
0
    def _overlap(self, bedFnp):
        printt("******************* GWAS overlap")
        self._setupOverlap()

        cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
        if not os.path.exists(cresFnp):
            Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"),
                           cresFnp)

        printt("running bedtools intersect...")
        cmds = [cat(bedFnp),
                '|', "cut -f -4,11-",
                '|', "bedtools intersect",
                "-a", "-",
                "-b", cresFnp,
                "-wo" ]
        snpsIntersecting = Utils.runCmds(cmds)
        print("example", snpsIntersecting[0].rstrip('\n').split('\t'))

        printt("rewriting...")
        outF = StringIO.StringIO()
        count = {}
        for r in snpsIntersecting:
            toks = r.rstrip('\n').split('\t')
            snp = toks[3]
            authorPubmedTrait = toks[4].replace('-', '_')
            accession = toks[9]

            if '_' not in authorPubmedTrait:
                print(r)
                print(toks)
                raise Exception("bad authorPubmedTrait?")
            if not snp.startswith("rs"):
                print(r)
                print(toks)
                raise Exception("bad rs?")
            if not accession.startswith("EH3"):
                print(r)
                print(toks)
                raise Exception("bad line?")
            outF.write('\t'.join([authorPubmedTrait, accession, snp]) + '\n')
            if authorPubmedTrait not in count: count[authorPubmedTrait] = 0
            count[authorPubmedTrait] += 1
        print("example", '\t'.join([authorPubmedTrait, accession, snp]))
        for k, v in count.iteritems():
            print("%s: %d" % (k, v))
        outF.seek(0)

        printt("copying into DB...")
        cols = "authorPubmedTrait accession snp".split(' ')
        self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols)
        importedNumRows(self.curs)

        makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
Esempio n. 6
0
    def run(self):
        fnp, filetype = paths.gene_files[self.assembly]
        ts = Transcripts(fnp, filetype)

        rows = []
        for t in ts.getTranscripts():
            tss = t.getTSS()
            rows.append([t.geneid_, tss[0], tss[1], tss[2], tss[3]])

        rows.sort(key=lambda x: x[0])

        ret = []
        for gid, group in groupby(rows, lambda x: x[0]):
            group = list(group)
            if 1 == len(group):
                g = group[0]
                r = [g[0], g[1], str(g[2]), str(g[3])]
                ret.append(r)
                continue
            chroms = list(set([x[1] for x in group]))
            if 1 != len(chroms):
                print("Warning: multiple chroms")
                #raise Exception("mutliple chroms")
                continue
            start = min([x[2] for x in group])
            end = max([x[3] for x in group])
            r = [gid, chroms[0], str(start), str(end)]
            print("merge", end - start)
            ret.append(r)

        print("merged from", len(rows), "to", len(ret))

        tableName = self.assembly + "_tss_info"
        self.curs.execute("""
    DROP TABLE IF EXISTS {tableName};
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
ensemblid_ver text,
chrom text,
start integer,
stop integer
);""".format(tableName=tableName))

        cols = ["ensemblid_ver", "chrom", "start", "stop"]

        outF = StringIO.StringIO()
        for r in ret:
            outF.write('\t'.join(r) + '\n')
        outF.seek(0)
        self.curs.copy_from(outF, tableName, '\t', columns=cols)
        printt("updated", tableName)

        makeIndex(self.curs, tableName, ["ensemblid_ver"])
Esempio n. 7
0
    def run(self):
        gidsToDbID, requiredGids = loadGidsToDbIds(self.assembly)

        genes = GeneInfo(self.assembly).genes

        ret = {}
        for gid, info in genes.iteritems():
            ret[gid] = GeneRow(gid, info, gidsToDbID)
        printt("merged", len(ret))
        count = len(ret)

        for gid, ver in requiredGids.iteritems():
            if gid not in ret:
                ret[gid] = GeneRow(gid, {}, gidsToDbID)
        printt("loaded missing genes for DB", len(ret) - count)

        ret = ret.values()
        print("example\n", ret[0].output())

        tableName = self.assembly + "_gene_info"
        self.curs.execute("""
DROP TABLE IF EXISTS {tableName} CASCADE;
CREATE TABLE {tableName}
(id serial PRIMARY KEY,
geneid integer,
ensemblid text,
ver integer,
ensemblid_ver text,
approved_symbol text,
chrom text,
start integer,
stop integer,
strand varchar(1),
gene_type text,
info jsonb);
""".format(tableName=tableName))

        cols = [
            "geneid", "ensemblid", "ver", "ensemblid_ver", "approved_symbol",
            "chrom", "start", "stop", "strand", "gene_type", "info"
        ]

        outF = StringIO.StringIO()
        for r in ret:
            outF.write(r.output() + '\n')
        outF.seek(0)
        self.curs.copy_from(outF, tableName, '\t', columns=cols)
        print("updated", tableName, self.curs.rowcount)

        makeIndex(self.curs, tableName, [
            "geneid", "ensemblid_ver", "ensemblid", "approved_symbol", "chrom"
        ])
Esempio n. 8
0
 def run(self):
     tn = self.tableName
     with db_connect_single(os.path.realpath(__file__)) as conn:
         with conn.cursor() as curs:
             makeIndex(curs, tn, ["accession", "chrom"])
             #makeIndexInt4Range(curs, tn, ["start", "stop"])
             makeIndexRev(curs, tn, [
                 "maxz", "dnase_max", "h3k4me3_max", "h3k27ac_max",
                 "ctcf_max"
             ])
             conn.commit()
             for col in self.zscore_cols:
                 makeIndexArr(curs, tn, col, conn)
Esempio n. 9
0
 def _import_tads(self):
     tableName = self.assembly + "_all_tads"
     printt("dropping and creating", tableName)
     self.curs.execute("""
         DROP TABLE IF EXISTS {tableName};
         CREATE TABLE {tableName}
         (id serial PRIMARY KEY,
         acc TEXT, chrom TEXT, start INTEGER, stop INTEGER);
     """.format(tableName=tableName))
     with open(
             os.path.join(rootDir, "%s/CTCF/all_tads.tsv" % self.assembly),
             "r") as f:
         cols = ["acc", "chrom", "start", "stop"]
         self.curs.copy_from(f, tableName, '\t', columns=cols)
     printt("imported", self.curs.rowcount)
     makeIndex(self.curs, tableName, ["acc", "chrom"])
Esempio n. 10
0
    def _import_biosamples(self):
        tableName = self.assembly + "_tad_biosamples"
        printt("dropping and creating", tableName)
        self.curs.execute("""
            DROP TABLE IF EXISTS {tableName};
            CREATE TABLE {tableName}
            (id serial PRIMARY KEY,
            acc TEXT, name TEXT);
        """.format(tableName=tableName))

        with open(
                os.path.join(rootDir,
                             "%s/CTCF/tad_biosamples.tsv" % self.assembly),
                "r") as f:
            cols = ["acc", "name"]
            self.curs.copy_from(f, tableName, '\t', columns=cols)
        printt("imported", self.curs.rowcount)
        makeIndex(self.curs, tableName, ["acc", "name"])
Esempio n. 11
0
def makeMV(curs, assembly):
    mv = assembly + "_gene_info_mv"
    printt("making", mv)
    curs.execute("""
DROP MATERIALIZED VIEW IF EXISTS {mv} CASCADE;

CREATE MATERIALIZED VIEW {mv} AS
SELECT q.id AS geneid, LOWER(d.value) AS value
FROM {assembly}_gene_info as q
JOIN jsonb_each_text(q.info) as d ON true
UNION
SELECT q.id AS geneid, LOWER(q.ensemblid) AS value
FROM {assembly}_gene_info as q
UNION
SELECT q.id AS geneid, LOWER(q.ensemblid_ver) AS value
FROM {assembly}_gene_info as q
""".format(mv=mv, assembly=assembly))

    makeIndex(curs, mv, ["geneid", "value"])
Esempio n. 12
0
def doIndex(curs, assembly):
    tableName = assembly + "_rampage"
    makeIndex(curs, tableName, ["ensemblid_ver", "chrom"])
    makeIndexIntRange(curs, tableName, ["start", "stop"])
Esempio n. 13
0
 def index(self):
     makeIndex(self.curs, self.tableName, ["leftCtId", "rightCtId", "ensembl"])
Esempio n. 14
0
 def index(self):
     makeIndexTextPatternOps(self.curs, self.tableName, ["name"])
     makeIndexGinTrgmOps(self.curs, self.tableName, ["name"])
     makeIndex(self.curs, self.tableName, ["oname"])
Esempio n. 15
0
 def index(self):
     makeIndex(self.curs, self.tableName, ["tadName"])
     makeIndexIntRange(self.curs, self.tableName, ["start", "stop"])
Esempio n. 16
0
 def _doIndex(self):
     makeIndex(self.curs, self.tableName, ["cellTypeName"])
Esempio n. 17
0
 def _doIndex(self):
     #makeIndexTextPatternOps(self.curs, self.tableName, ["synonym"])
     #makeIndexGinTrgmOps(self.curs, self.tableName, ["synonym"])
     makeIndex(self.curs, self.tableName, ["oid"])
Esempio n. 18
0
 def _doIndex(self):
     makeIndex(self.curs, self.tableName, ["startpos", "endpos", "feature", "transcript_id", "gene_id"])
Esempio n. 19
0
 def index(self):
     makeIndex(self.curs, self.tableName, ["accession", "tadName"])
Esempio n. 20
0
 def run(self):
     outF, cols = self._jobgen(self.assembly, self.tableName, self.curs,
                               self.runDate)
     self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
     importedNumRows(self.curs)
     makeIndex(self.curs, self.tableName, ["label", "fileID"])
Esempio n. 21
0
 def _doIndex(self):
     makeIndex(self.curs, self.tableName, ["cre"])
Esempio n. 22
0
 def doIndex(self):
     tableName = self._tableNameMetadata()
     makeIndex(self.curs, tableName, ["expID", "celltype"])
     makeIndexMultiCol(self.curs, tableName,
                       ["cellCompartment", "biosample_type"])
Esempio n. 23
0
 def _doIndex(self):
     makeIndex(self.curs, self.tableName, ["accession"])
Esempio n. 24
0
 def _doIndexData(self, tableNameData):
     printt("creating indices in", tableNameData, "...")
     makeIndex(self.curs, tableNameData, ["gene_name", "tpm"])