def _gwas(self, fnp):
    printt("******************* GWAS")
    printt("reading", fnp)
    with open(fnp) as f:
        rows = [r.rstrip().split('\t') for r in f if r]
    self.setupGWAS()

    printt("rewrite rows")
    outF = StringIO.StringIO()
    for r in rows:
        if 'Lead' == r[4]:
            r[4] = r[3]  # 'Lead' rows tag themselves: copy the SNP into the taggedSNP column
        r[5] = "{%s}" % r[5].replace('*', "-1")  # r2 values as a Postgres array literal; '*' becomes -1
        r[2] = str(int(r[2]) + 1)  # shift the stop coordinate by one base
        r[-1] = r[-1].replace('-', '_')  # normalize authorPubmedTrait
        outF.write('\t'.join(r) + '\n')
    print("example", '\t'.join(r))
    outF.seek(0)

    cols = "chrom start stop snp taggedSNP r2 ldblock trait author pubmed authorPubmedTrait".split(' ')
    self.curs.copy_from(outF, self.tableNameGwas, '\t', columns=cols)
    importedNumRows(self.curs)

    makeIndex(self.curs, self.tableNameGwas,
              ["chrom", "authorPubmedTrait", "ldblock", "snp"])
    makeIndexIntRange(self.curs, self.tableNameGwas, ["start", "stop"])
def import_cage_fromfile(self, fnp):
    numRows = self.pw.copy_from(
        "import_cage_fromfile", fnp, self._tables["cage"],
        columns=[x for x in PGFantomCat.CAGEFIELDS[1:]])
    importedNumRows(numRows)
def import_genes_fromfile(self, fnp):
    numRows = self.pw.copy_from(
        "import_genes_fromfile", fnp, self._tables["genes"],
        columns=[x for x in PGFantomCat.GENEFIELDS[1:]])
    importedNumRows(numRows)
def import_enhancers_fromfile(self, fnp):
    numRows = self.pw.copy_from(
        "import_enhancers_fromfile", fnp, self._tables["enhancers"],
        columns=[x for x in PGFantomCat.ENHANCERFIELDS[1:]])
    importedNumRows(numRows)
def _overlap(self, bedFnp):
    printt("******************* GWAS overlap")
    self._setupOverlap()

    cresFnp = paths.path(self.assembly, "extras", "cREs.sorted.bed")
    if not os.path.exists(cresFnp):
        Utils.sortFile(paths.path(self.assembly, "raw", "cREs.bed"), cresFnp)

    printt("running bedtools intersect...")
    cmds = [cat(bedFnp),
            '|', "cut -f -4,11-",  # keep fields 1-4 and 11 onward
            '|', "bedtools intersect",
            "-a", "-",
            "-b", cresFnp,
            "-wo"]
    snpsIntersecting = Utils.runCmds(cmds)
    print("example", snpsIntersecting[0].rstrip('\n').split('\t'))

    printt("rewriting...")
    outF = StringIO.StringIO()
    count = {}  # overlaps per study (authorPubmedTrait)
    for r in snpsIntersecting:
        toks = r.rstrip('\n').split('\t')
        snp = toks[3]
        authorPubmedTrait = toks[4].replace('-', '_')
        accession = toks[9]

        # sanity-check the intersected line before writing it out
        if '_' not in authorPubmedTrait:
            print(r)
            print(toks)
            raise Exception("bad authorPubmedTrait?")
        if not snp.startswith("rs"):
            print(r)
            print(toks)
            raise Exception("bad rs?")
        if not accession.startswith("EH3"):
            print(r)
            print(toks)
            raise Exception("bad line?")

        outF.write('\t'.join([authorPubmedTrait, accession, snp]) + '\n')
        if authorPubmedTrait not in count:
            count[authorPubmedTrait] = 0
        count[authorPubmedTrait] += 1
    print("example", '\t'.join([authorPubmedTrait, accession, snp]))

    for k, v in count.iteritems():
        print("%s: %d" % (k, v))
    outF.seek(0)

    printt("copying into DB...")
    cols = "authorPubmedTrait accession snp".split(' ')
    self.curs.copy_from(outF, self.tableNameOverlap, '\t', columns=cols)
    importedNumRows(self.curs)

    makeIndex(self.curs, self.tableNameOverlap, ["authorPubmedTrait"])
def run(self):
    self.setupTable()
    cols = ["accession", "tf", "histone"]
    fnp = paths.path(self.assembly, "extras", self.runDate,
                     "%s.json.gz" % self._tsuffix)
    printt("copying in data", fnp)
    with gzip.open(fnp) as f:
        self.curs.copy_from(f, self.tableName, '\t', columns=cols)
    importedNumRows(self.curs)
def _extractExpIDs(self, tableNameData, tableNameExperimentList):
    printt("dropping and creating", tableNameExperimentList)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tableNameExperimentList};
    CREATE TABLE {tableNameExperimentList} AS
    SELECT DISTINCT expID, fileID, replicate
    FROM {tableNameData}
    """.format(tableNameData=tableNameData,
               tableNameExperimentList=tableNameExperimentList))
    importedNumRows(self.curs)
def _import(self):
    fnp = paths.path('', "extras", "google-help-text.json")
    printt("reading", fnp)
    with open(fnp) as f:
        j = json.load(f)

    records_list_template = j["records_list_template"]
    rows = [tuple(r) for r in j["rows"]]  # for psycopg2
    keys = [r[0] for r in rows]
    print('\n'.join(keys))

    # from http://stackoverflow.com/a/30985541
    q = """
    INSERT INTO helpkeys (key, title, summary)
    VALUES {}
    """.format(records_list_template)
    self.curs.execute(q, rows)
    importedNumRows(self.curs)
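A minimal sketch of the bulk-insert pattern _import relies on (the Stack Overflow answer cited in the comment above), assuming records_list_template holds one %s placeholder per row; psycopg2 then adapts each Python tuple into a parenthesized row value. The rows below are made up purely for illustration.

# illustration only; in _import the rows and template come from google-help-text.json
rows = [("gwas", "GWAS", "Help text for genome-wide association studies"),
        ("cre", "cRE", "Help text for candidate regulatory elements")]
records_list_template = ', '.join(['%s'] * len(rows))  # -> "%s, %s"
q = """
INSERT INTO helpkeys (key, title, summary)
VALUES {}
""".format(records_list_template)
# curs.execute(q, rows)  # each tuple expands to ('gwas', 'GWAS', ...), etc.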
def _studies(self, header):
    printt("******************* GWAS studies")
    self._setupStudies()

    printt("import to db")
    self.curs.execute("""
    INSERT INTO {tn} (authorpubmedtrait, author, pubmed, trait, numLDblocks)
    SELECT DISTINCT(authorpubmedtrait), author, pubmed, trait,
        COUNT(DISTINCT(ldblock))
    FROM {gwasTn}
    GROUP BY authorpubmedtrait, author, pubmed, trait
    """.format(tn=self.tableNameStudies, gwasTn=self.tableNameGwas))
    importedNumRows(self.curs)

    self.curs.execute("""
    SELECT authorpubmedtrait FROM {tn}
    ORDER BY authorpubmedtrait
    """.format(tn=self.tableNameStudies))
    return [r[0] for r in self.curs.fetchall()]
def encode_peak_metadata(assembly, t, curs, runDate):
    printt("dropping and creating table", t)
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn}(
    id serial PRIMARY KEY,
    expID text,
    fileID text,
    assay text,
    label text,
    biosample_term_name text
    )""".format(tn=t))

    jobs = peakIntersections.loadJobs(assembly, runDate)
    outF = StringIO.StringIO()
    for r in jobs:
        outF.write('\t'.join([r["bed"]["expID"],
                              r["bed"]["fileID"],
                              r["etype"],
                              r["exp"]["label"],
                              r["exp"]["biosample_term_name"]]) + '\n')
    outF.seek(0)
    cols = "expID fileID assay label biosample_term_name".split(' ')

    # companion table recording which peak-intersection run produced the data
    tableName = t + '_runDate'
    printt("dropping and creating table", tableName)
    curs.execute("""
    DROP TABLE IF EXISTS {tn};
    CREATE TABLE {tn}(
    id serial PRIMARY KEY,
    runDate text
    )""".format(tn=tableName))
    curs.execute("""
    INSERT into {tn} (runDate) VALUES (%s)
    """.format(tn=tableName), (runDate, ))
    importedNumRows(curs)

    return (outF, cols)
def _setupAndCopy(self, tableNameData, fnp):
    printt("dropping and creating", tableNameData)
    self.curs.execute("""
    DROP TABLE IF EXISTS {tableNameData};
    CREATE TABLE {tableNameData}
    (id serial PRIMARY KEY,
    ensembl_id VARCHAR(256) NOT NULL,
    gene_name VARCHAR(256) NOT NULL,
    expID VARCHAR(256) NOT NULL,
    fileID VARCHAR(256) NOT NULL,
    replicate INT NOT NULL,
    fpkm NUMERIC NOT NULL,
    tpm NUMERIC NOT NULL);
    """.format(tableNameData=tableNameData))

    printt("importing", fnp)
    with gzip.open(fnp) as f:
        self.curs.copy_from(f, tableNameData, '\t',
                            columns=("expID", "replicate", "ensembl_id",
                                     "gene_name", "fileID", "tpm", "fpkm"))
    importedNumRows(self.curs)
def _do_enrichment(self, fnBase, tableName, takeLog, skip=[]):
    fnp = paths.gwasFnp(self.assembly, self.version, fnBase)
    printt("reading", fnp)
    with open(fnp) as f:
        header = f.readline().rstrip('\n').split('\t')
        rows = [r.rstrip('\n').split('\t') for r in f if r]

    # header[0:2] are expID and cellTypeName; the rest are enrichment columns.
    # Sanitize the field names for SQL and drop fields whose header column index is in skip.
    fields = [h.replace('-', '_').replace("'", '_') for h in header[2:]]
    fields = [fields[i] for i in xrange(len(fields)) if i + 2 not in skip]
    self._setupEnrichment(fields, tableName)

    printt("rewrite rows")
    outF = StringIO.StringIO()
    for r in rows:
        for idx in xrange(2, len(r)):
            r[idx] = str(float(r[idx]))
        r = [r[i] for i in xrange(len(r)) if i + 2 not in skip]
        outF.write('\t'.join(r) + '\n')
    outF.seek(0)

    cols = ["expID", "cellTypeName"] + fields
    print(cols)
    printt("import to db")
    self.curs.copy_from(outF, tableName, '\t', columns=cols)
    importedNumRows(self.curs)

    self.curs.execute("""
    UPDATE {tne} as ge
    set cellTypeName = d.cellTypeName,
        biosample_summary = d.biosample_summary
    from {tnd} as d
    where ge.expID = d.expID
    """.format(tne=tableName, tnd=self.tableNameDatasets))
    updatedNumRows(self.curs)

    return header
def import_intersections_fromfile(self, fnp, key="intersections"):
    numRows = self.pw.copy_from(
        "import_intersections_fromfile", fnp, self._tables[key],
        columns=["geneid", "cre"])
    importedNumRows(numRows)
def run(self):
    outF, cols = self._jobgen(self.assembly, self.tableName,
                              self.curs, self.runDate)
    self.curs.copy_from(outF, self.tableName, '\t', columns=cols)
    importedNumRows(self.curs)
    makeIndex(self.curs, self.tableName, ["label", "fileID"])