Beispiel #1
0
    def _gwas(self, fnp):
        """Load the tab-separated GWAS file ``fnp`` into ``self.tableNameGwas``.

        The file columns are imported, in order, as: chrom, start, stop, snp,
        taggedSNP, r2, ldblock, trait, author, pubmed, authorPubmedTrait.
        Every row is rewritten before the import:

        * a taggedSNP (index 4) equal to the literal 'Lead' is replaced by
          the row's snp (index 3);
        * r2 (index 5) is wrapped in braces, with '*' replaced by "-1";
        * stop (index 2) is incremented by one;
        * '-' is replaced by '_' in the last column.

        After the bulk copy the table is indexed.

        Blank lines in the input are skipped, and an input without any rows
        results in an empty import rather than an exception.
        """
        printt("******************* GWAS")
        printt("reading", fnp)
        with open(fnp) as f:
            # Lines read from a file keep their newline, so a plain
            # truthiness test never filters anything; test the stripped line
            # so blank lines do not blow up on r[4] below.
            rows = [r.rstrip().split('\t') for r in f if r.strip()]

        self.setupGWAS()

        printt("rewrite rows")
        outF = StringIO.StringIO()
        for r in rows:
            if 'Lead' == r[4]:
                r[4] = r[3]
            r[5] = "{%s}" % r[5].replace('*', "-1")
            r[2] = str(int(r[2]) + 1)
            r[-1] = r[-1].replace('-', '_')
            outF.write('\t'.join(r) + '\n')
        if rows:
            # Show the last rewritten row as a sample; guarded because there
            # is nothing to show (and no loop variable) for an empty input.
            print("example", '\t'.join(rows[-1]))
        outF.seek(0)

        cols = "chrom start stop snp taggedSNP r2 ldblock trait author pubmed authorPubmedTrait".split(' ')
        self.curs.copy_from(outF, self.tableNameGwas, '\t', columns=cols)
        importedNumRows(self.curs)

        makeIndex(self.curs, self.tableNameGwas,
                  ["chrom", "authorPubmedTrait", "ldblock", "snp"])
        makeIndexIntRange(self.curs, self.tableNameGwas, ["start", "stop"])
Beispiel #2
0
 def import_cage_fromfile(self, fnp):
     """Copy the file ``fnp`` into the "cage" table.

     The column list is ``PGFantomCat.CAGEFIELDS`` without its first entry.
     The value returned by ``self.pw.copy_from`` is handed to
     ``importedNumRows``.
     """
     copyInto = self.pw.copy_from
     cageTable = self._tables["cage"]
     cageColumns = list(PGFantomCat.CAGEFIELDS[1:])
     rowCount = copyInto("import_cage_fromfile", fnp, cageTable,
                         columns=cageColumns)
     importedNumRows(rowCount)
Beispiel #3
0
 def import_genes_fromfile(self, fnp):
     """Copy the file ``fnp`` into the "genes" table.

     The column list is ``PGFantomCat.GENEFIELDS`` without its first entry.
     The value returned by ``self.pw.copy_from`` is handed to
     ``importedNumRows``.
     """
     copyInto = self.pw.copy_from
     genesTable = self._tables["genes"]
     geneColumns = list(PGFantomCat.GENEFIELDS[1:])
     rowCount = copyInto("import_genes_fromfile", fnp, genesTable,
                         columns=geneColumns)
     importedNumRows(rowCount)
Beispiel #4
0
 def import_enhancers_fromfile(self, fnp):
     """Copy the file ``fnp`` into the "enhancers" table.

     The column list is ``PGFantomCat.ENHANCERFIELDS`` without its first
     entry. The value returned by ``self.pw.copy_from`` is handed to
     ``importedNumRows``.
     """
     copyInto = self.pw.copy_from
     enhancersTable = self._tables["enhancers"]
     enhancerColumns = list(PGFantomCat.ENHANCERFIELDS[1:])
     rowCount = copyInto("import_enhancers_fromfile", fnp, enhancersTable,
                         columns=enhancerColumns)
     importedNumRows(rowCount)
Beispiel #5
0
    def _overlap(self, bedFnp):
        """Intersect the SNPs in ``bedFnp`` with the cREs and import the hits.

        The sorted cRE BED file is created from the raw one when it does not
        exist yet. ``bedFnp`` is piped through ``cut`` and
        ``bedtools intersect -wo`` against that file. From every output line
        the SNP id (token 3), the authorPubmedTrait (token 4, '-' replaced by
        '_') and the cRE accession (token 9) are taken and copied into
        ``self.tableNameOverlap``, which is then indexed on authorPubmedTrait.

        A per-authorPubmedTrait hit count is printed. An empty intersection
        produces an empty import instead of an exception.

        Raises:
            Exception: if an output line has an authorPubmedTrait without
                '_', a SNP id not starting with "rs", or an accession not
                starting with "EH3".
        """
        printt("******************* GWAS overlap")
        self._setupOverlap()

        cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
        if not os.path.exists(cresFnp):
            Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"),
                           cresFnp)

        printt("running bedtools intersect...")
        cmds = [cat(bedFnp),
                '|', "cut -f -4,11-",
                '|', "bedtools intersect",
                "-a", "-",
                "-b", cresFnp,
                "-wo"]
        snpsIntersecting = Utils.runCmds(cmds)
        if snpsIntersecting:
            # Guarded: indexing an empty result would raise IndexError.
            print("example", snpsIntersecting[0].rstrip('\n').split('\t'))

        printt("rewriting...")
        outF = StringIO.StringIO()
        count = {}
        lastRecord = None
        for r in snpsIntersecting:
            toks = r.rstrip('\n').split('\t')
            snp = toks[3]
            authorPubmedTrait = toks[4].replace('-', '_')
            accession = toks[9]

            # Sanity checks, evaluated in this order; the first failure wins.
            problem = None
            if '_' not in authorPubmedTrait:
                problem = "bad authorPubmedTrait?"
            elif not snp.startswith("rs"):
                problem = "bad rs?"
            elif not accession.startswith("EH3"):
                problem = "bad line?"
            if problem is not None:
                # Dump the offending line (raw and tokenized) before failing.
                print(r)
                print(toks)
                raise Exception(problem)

            lastRecord = '\t'.join([authorPubmedTrait, accession, snp])
            outF.write(lastRecord + '\n')
            count[authorPubmedTrait] = count.get(authorPubmedTrait, 0) + 1
        if lastRecord is not None:
            # Only show a sample when at least one record was written; the
            # loop variables do not exist for an empty intersection.
            print("example", lastRecord)
        # items() works on both Python 2 and 3, unlike iteritems().
        for k, v in count.items():
            print("%s: %d" % (k, v))
        outF.seek(0)

        printt("copying into DB...")
        cols = "authorPubmedTrait accession snp".split(' ')
        self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols)
        importedNumRows(self.curs)

        makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
Beispiel #6
0
    def run(self):
        """Call ``setupTable`` and bulk-copy the gzipped data file into the table.

        The file is ``<tsuffix>.json.gz`` under the assembly's
        ``extras/<runDate>`` path and is copied into the columns accession,
        tf and histone.
        """
        self.setupTable()

        # NOTE(review): despite the .json.gz name, the file is handed to
        # copy_from with a tab separator -- confirm the on-disk format.
        srcFnp = paths.path(self.assembly, "extras", self.runDate,
                            "%s.json.gz" % self._tsuffix)
        printt("copying in data", srcFnp)
        with gzip.open(srcFnp) as gz:
            self.curs.copy_from(gz, self.tableName, '\t',
                                columns=["accession", "tf", "histone"])
        importedNumRows(self.curs)
Beispiel #7
0
    def _extractExpIDs(self, tableNameData, tableNameExperimentList):
        """Rebuild ``tableNameExperimentList`` from ``tableNameData``.

        The list table is dropped if present and recreated as the distinct
        (expID, fileID, replicate) combinations found in the data table.
        """
        printt("dropping and creating", tableNameExperimentList)
        queryTemplate = """
    DROP TABLE IF EXISTS {tableNameExperimentList};

    CREATE TABLE {tableNameExperimentList} AS
    SELECT DISTINCT expID, fileID, replicate
    FROM {tableNameData}
    """
        tableNames = {
            "tableNameData": tableNameData,
            "tableNameExperimentList": tableNameExperimentList,
        }
        self.curs.execute(queryTemplate.format(**tableNames))
        importedNumRows(self.curs)
Beispiel #8
0
    def _import(self):
        fnp = paths.path('', "extras", "google-help-text.json")
        printt("reading", fnp)
        with open(fnp) as f:
            j = json.load(f)
        records_list_template = j["records_list_template"]
        rows = [tuple(r) for r in j["rows"]]  # for psycopg2

        keys = [r[0] for r in rows]
        print('\n'.join(keys))

        # from http://stackoverflow.com/a/30985541
        q = """
INSERT INTO helpkeys (key, title, summary)
VALUES {}
""".format(records_list_template)

        self.curs.execute(q, rows)
        importedNumRows(self.curs)
Beispiel #9
0
    def _studies(self, header):
        """Fill the studies table from the GWAS table and list its studies.

        One row per (authorpubmedtrait, author, pubmed, trait) group is
        inserted together with its number of distinct LD blocks.

        ``header`` is accepted but not used by this method.

        Returns:
            The authorpubmedtrait values of the studies table, in ascending
            order.
        """
        printt("******************* GWAS studies")
        self._setupStudies()

        printt("import to db")
        insertSql = """
    INSERT INTO {tn} (authorpubmedtrait,  author, pubmed, trait, numLDblocks)
    SELECT DISTINCT(authorpubmedtrait), author, pubmed, trait, COUNT(DISTINCT(ldblock))
    FROM {gwasTn}
    GROUP BY authorpubmedtrait, author, pubmed, trait
     """
        self.curs.execute(insertSql.format(tn=self.tableNameStudies,
                                           gwasTn=self.tableNameGwas))
        importedNumRows(self.curs)

        selectSql = """
    SELECT authorpubmedtrait
    FROM {tn}
    ORDER BY authorpubmedtrait
     """
        self.curs.execute(selectSql.format(tn=self.tableNameStudies))
        studies = []
        for row in self.curs.fetchall():
            studies.append(row[0])
        return studies
Beispiel #10
0
def encode_peak_metadata(assembly, t, curs, runDate):
    """Recreate the peak metadata tables and prepare the rows to import.

    Table ``t`` is dropped and recreated, and one tab-separated line per job
    from ``peakIntersections.loadJobs`` is written to an in-memory buffer.
    A companion table ``<t>_runDate`` is recreated and receives ``runDate``
    as its single row. The buffer is NOT copied into ``t`` here; that is
    left to the caller.

    Returns:
        A tuple ``(outF, cols)``: the buffer rewound to its start, and the
        list of column names its fields correspond to.
    """
    printt("dropping and creating table", t)
    createMetadataSql = """
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}(
id serial PRIMARY KEY,
expID text,
fileID text,
assay text,
label text,
biosample_term_name text
)"""
    curs.execute(createMetadataSql.format(tn=t))

    jobs = peakIntersections.loadJobs(assembly, runDate)
    outF = StringIO.StringIO()
    for job in jobs:
        bed = job["bed"]
        record = [bed["expID"], bed["fileID"], job["etype"]]
        exp = job["exp"]
        record.append(exp["label"])
        record.append(exp["biosample_term_name"])
        outF.write('\t'.join(record) + '\n')
    outF.seek(0)
    cols = ["expID", "fileID", "assay", "label", "biosample_term_name"]

    tableName = t + '_runDate'
    printt("dropping and creating table", tableName)
    createRunDateSql = """
DROP TABLE IF EXISTS {tn};
CREATE TABLE {tn}(
id serial PRIMARY KEY,
runDate text
)"""
    curs.execute(createRunDateSql.format(tn=tableName))

    # Same text as the original multi-line literal, spelled with explicit
    # escapes so its leading/trailing whitespace is visible.
    insertRunDateSql = " \nINSERT into {tn}\n(runDate)\nVALUES (%s)\n    "
    curs.execute(insertRunDateSql.format(tn=tableName), (runDate, ))
    importedNumRows(curs)

    return (outF, cols)
Beispiel #11
0
    def _setupAndCopy(self, tableNameData, fnp):
        """Recreate ``tableNameData`` and bulk-load the gzipped file ``fnp``.

        The file's tab-separated fields are copied, in order, into the
        columns expID, replicate, ensembl_id, gene_name, fileID, tpm, fpkm.
        """
        printt("dropping and creating", tableNameData)
        createSql = """
    DROP TABLE IF EXISTS {tableNameData};

    CREATE TABLE {tableNameData} (
    id serial PRIMARY KEY,
    ensembl_id VARCHAR(256) NOT NULL,
    gene_name VARCHAR(256) NOT NULL,
    expID VARCHAR(256) NOT NULL,
    fileID VARCHAR(256) NOT NULL,
    replicate INT NOT NULL,
    fpkm NUMERIC NOT NULL,
    tpm NUMERIC NOT NULL);
        """
        self.curs.execute(createSql.format(tableNameData=tableNameData))

        printt("importing", fnp)
        # Order of the fields in the file, which differs from the order of
        # the columns in the table definition above.
        fileColumns = ("expID", "replicate", "ensembl_id", "gene_name",
                       "fileID", "tpm", "fpkm")
        with gzip.open(fnp) as gz:
            self.curs.copy_from(gz, tableNameData, '\t', columns=fileColumns)
        importedNumRows(self.curs)
Beispiel #12
0
    def _do_enrichment(self, fnBase, tableName, takeLog, skip=()):
        """Import a tab-separated enrichment matrix into ``tableName``.

        The first line of the file is the header. Its columns from index 2
        onwards become the value columns of the table, with '-' and "'"
        replaced by '_'. In every data row the first two columns are imported
        as expID and cellTypeName, and the remaining values are normalized
        through ``float``. After the import, cellTypeName and
        biosample_summary are refreshed from ``self.tableNameDatasets`` by
        matching expID.

        Args:
            fnBase: base name passed to ``paths.gwasFnp`` to locate the file.
            tableName: table to set up and fill.
            takeLog: accepted but not used by this method.
            skip: file column indexes (0-based, counted over the whole line)
                of value columns to leave out. Indexes 0 and 1 never match;
                the first two columns are always kept.

        Returns:
            The header of the file as a list of column names, unmodified.
        """
        fnp = paths.gwasFnp(self.assembly, self.version, fnBase)
        printt("reading", fnp)
        with open(fnp) as f:
            header = f.readline().rstrip('\n').split('\t')
            # Lines keep their newline, so test the stripped line to actually
            # skip blank lines.
            rows = [r.rstrip('\n').split('\t') for r in f if r.strip()]

        # enumerate(..., 2) numbers the value columns with their index in the
        # file, which is what ``skip`` refers to.
        fields = [name.replace('-', '_').replace("'", '_')
                  for name in header[2:]]
        fields = [name for col, name in enumerate(fields, 2)
                  if col not in skip]
        self._setupEnrichment(fields, tableName)

        printt("rewrite rows")
        outF = StringIO.StringIO()
        for r in rows:
            # Every value is converted, including skipped ones, so malformed
            # numbers are still reported.
            values = [str(float(v)) for v in r[2:]]
            # Drop exactly the columns dropped from the header. The previous
            # filter tested ``i + 2`` against indexes of the full row, which
            # removed the column two places to the left of the intended one.
            kept = r[:2] + [v for col, v in enumerate(values, 2)
                            if col not in skip]
            outF.write('\t'.join(kept) + '\n')
        outF.seek(0)
        cols = ["expID", "cellTypeName"] + fields
        print(cols)

        printt("import to db")
        self.curs.copy_from(outF, tableName, '\t', columns=cols)
        importedNumRows(self.curs)

        self.curs.execute("""
        UPDATE {tne} as ge
        set cellTypeName = d.cellTypeName,
        biosample_summary = d.biosample_summary
        from {tnd} as d
        where ge.expID = d.expID
        """.format(tne=tableName, tnd=self.tableNameDatasets))
        updatedNumRows(self.curs)

        return header
Beispiel #13
0
 def import_intersections_fromfile(self, fnp, key="intersections"):
     """Copy the file ``fnp`` into the table registered under ``key``.

     The file's fields are copied into the columns geneid and cre. The value
     returned by ``self.pw.copy_from`` is handed to ``importedNumRows``.
     """
     copyInto = self.pw.copy_from
     targetTable = self._tables[key]
     rowCount = copyInto("import_intersections_fromfile", fnp, targetTable,
                         columns=["geneid", "cre"])
     importedNumRows(rowCount)
Beispiel #14
0
 def run(self):
     """Generate the rows via ``self._jobgen``, copy them in and index the table.

     ``self._jobgen`` returns the buffer to import and the matching column
     names. After the copy, ``makeIndex`` is called for label and fileID.
     """
     generated = self._jobgen(self.assembly, self.tableName, self.curs,
                              self.runDate)
     buf, columnNames = generated
     self.curs.copy_from(buf, self.tableName, '\t', columns=columnNames)
     importedNumRows(self.curs)
     makeIndex(self.curs, self.tableName, ["label", "fileID"])