def createViewMapping(infile, outfile):
    '''Rebuild the alignment statistics table for ``outfile``.

    The target name is obtained from ``P.toTable(outfile)``. An existing
    table of that name is dropped and a new one is created holding all
    rows and columns of ``bam_stats``.

    The original description lists ``mapping_stats`` and ``bam_stats``
    as sources; the statement issued here reads from ``bam_stats`` only.

    ``infile`` is accepted but not used.
    '''
    # a view can not span multiple databases, hence a table is created
    params = {"tablename": P.toTable(outfile), "view_type": "TABLE"}

    dbhandle = connect()

    templates = (
        "DROP %(view_type)s IF EXISTS %(tablename)s",
        ''' CREATE %(view_type)s %(tablename)s AS SELECT * FROM bam_stats AS b ''',
    )
    for template in templates:
        Database.executewait(dbhandle, template % params)
def loadCodingPotential(infile, outfile):
    '''Load a gzipped coding-potential table and add an ``is_coding`` flag.

    ``infile`` is decompressed and piped through ``csv2db.py`` into the
    table named by ``P.toTable(outfile)``; the tool's output is written
    to ``outfile``. Afterwards an integer column ``is_coding`` is added
    and set to the result of the SQL comparison ``result == 'coding'``.
    '''
    table = P.toTable(outfile)

    # NOTE(review): P.run() is called without arguments; presumably it
    # picks up ``statement`` and the placeholders from this frame, so
    # the local names ``statement`` and ``table`` must not be renamed.
    statement = (
        " gunzip < %(infile)s"
        " | python %(scriptsdir)s/csv2db.py"
        " %(csv2db_options)s"
        " --allow-empty"
        " --index=gene_id"
        " --map=gene_id:str"
        " --table=%(table)s"
        " > %(outfile)s")
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    for template in (
            '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''',
            '''UPDATE %(table)s SET is_coding = (result == 'coding')'''):
        Database.executewait(dbhandle, template % {"table": table})
    dbhandle.commit()
def loadHypergeometricAnalysis(infile, outfile):
    '''Load hypergeometric (GO) analysis results into the database.

    Three groups of tables are written:

    1. ``infile`` is loaded as ``hypergeometric_<track>_summary``, where
       ``<track>`` is ``P.toTable(outfile)``.
    2. For each of the sections ``results``, ``parameters`` and
       ``withgenes`` the files
       ``hypergeometric.dir/<track>.tsv.dir/*.<section>`` are
       concatenated with ``combine_tables.py`` and loaded as
       ``hypergeometric_<track>_<section>``.
    3. For every distinct ontology found in the summary table the
       matrices ``all_alldesc.<ontology>.{l2fold,l10pvalue,l10qvalue}``
       in ``<infile>.dir`` are loaded as
       ``hypergeometric_<track>_<ontology>_<suffix>``. An ontology is
       skipped with a warning if its ``l2fold`` file does not exist.
    '''
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [row[0] for row in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        # NOTE(review): P.run() is called without arguments; presumably
        # it reads ``statement``, ``track``, ``section``, ``tablename``
        # and ``outfile`` from this frame - keep these names.
        # The regex fragment is a raw literal so that ``\S`` is passed
        # on verbatim instead of being an invalid escape sequence.
        statement = (
            " python %(scriptsdir)s/combine_tables.py"
            " --cat=track"
            r' --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"'
            " hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s"
            " | python %(scriptsdir)s/csv2db.py"
            " %(csv2db_options)s"
            " --table=%(tablename)s"
            " >> %(outfile)s")
        P.run()

    matrix_dir = infile + ".dir"
    for ontology in ontologies:
        # the presence of the l2fold matrix decides whether the
        # ontology is loaded at all
        fn = os.path.join(matrix_dir, "all_alldesc.%s.l2fold" % ontology)
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for suffix in ("l2fold", "l10pvalue", "l10qvalue"):
            fn = os.path.join(
                matrix_dir, "all_alldesc.%s.%s" % (ontology, suffix))
            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, suffix),
                   options='--allow-empty')
def createViewMapping(infile, outfile):
    """Create the per-track alignment statistics table.

    The table is named ``P.toTable(outfile)``. It is dropped if it
    already exists and then re-created from ``SELECT * FROM bam_stats``.

    The original description mentions ``mapping_stats`` and
    ``bam_stats`` as inputs; only ``bam_stats`` is queried here.

    ``infile`` is not used.
    """
    tablename = P.toTable(outfile)
    # can not create views across multiple database, so use table
    view_type = "TABLE"
    substitutions = dict(view_type=view_type, tablename=tablename)

    dbhandle = connect()

    drop_sql = "DROP %(view_type)s IF EXISTS %(tablename)s" % substitutions
    create_sql = (
        """ CREATE %(view_type)s %(tablename)s AS SELECT * FROM bam_stats AS b """
        % substitutions)

    Database.executewait(dbhandle, drop_sql)
    Database.executewait(dbhandle, create_sql)
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """Read GO assignments from a database.

    The query is built by ``GetGOStatement(go_type, database, species)``
    and executed once without retries. Each result row is expected to
    unpack into ``(gene_id, goid, description, evidence)``.

    Returns a tuple ``(gene2go, go2info)``:

    gene2go
        dictionary mapping a gene identifier to the list of its
        ``GOMatch`` objects (one-to-many).
    go2info
        ``collections.defaultdict(GOInfo)`` mapping a GO identifier to
        a ``GOInfo`` object; a later row for the same identifier
        replaces the earlier entry.

    Note (from the original documentation): assumes that
    external_db_id for GO is 1000.
    """
    query = GetGOStatement(go_type, database, species)
    rows = Database.executewait(dbhandle, query, retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in rows:
        match = GOMatch(goid, go_type, description, evidence)
        info = GOInfo(goid, go_type, description)
        gene2go.setdefault(gene_id, []).append(match)
        go2info[goid] = info

    return gene2go, go2info
def importCodingPotential(infile, outfile):
    '''Import a coding-potential table and add an ``is_coding`` flag.

    ``infile`` is fed to ``csv2db.py`` and stored in the table whose
    name is ``outfile`` without its trailing ``.import``; the tool's
    output goes to ``outfile``. An integer column ``is_coding`` is then
    added and set to the SQL expression
    ``(f_iscoding == 'coding') OR (r_iscoding == 'coding')``.
    '''
    table = outfile[:-len(".import")]

    # NOTE(review): P.run() is called without arguments; presumably it
    # picks up ``statement`` and the placeholders from this frame, so
    # the local names ``statement`` and ``table`` must not be renamed.
    statement = (
        " python %(scriptsdir)s/csv2db.py"
        " %(csv2db_options)s"
        " --allow-empty"
        " --index=gene_id"
        " --map=gene_id:str"
        " --table=%(table)s"
        " < %(infile)s"
        " > %(outfile)s")
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    substitutions = {"table": table}
    add_column = (
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER'''
        % substitutions)
    set_flag = (
        '''UPDATE %(table)s SET is_coding = (f_iscoding == 'coding') OR (r_iscoding == 'coding') '''
        % substitutions)
    Database.executewait(dbhandle, add_column)
    Database.executewait(dbhandle, set_flag)
    dbhandle.commit()
def mergeEffectsPerGene(infile, outfile):
    '''Summarize transcript effects per gene and track.

    Joins ``annotations.transcript_info`` with ``effects`` on
    ``transcript_id`` and aggregates per ``(gene_id, track)``: the
    number of transcripts, minima/maxima of ``nalleles``, ``stop_min``
    and ``stop_max``, and the counts ``nmd_knockout``/``nmd_affected``.

    The result replaces the table named ``outfile`` without ``.load``;
    an index on ``gene_id`` is added and ``outfile`` is touched.
    ``infile`` is not used.
    '''
    tablename = outfile[:-len(".load")]
    substitutions = {"tablename": tablename}

    dbhandle = connect()

    create_sql = (
        " CREATE TABLE %(tablename)s AS"
        " SELECT DISTINCT"
        " track,"
        " gene_id,"
        " COUNT(*) AS ntranscripts,"
        " MIN(e.nalleles) AS min_nalleles,"
        " MAX(e.nalleles) AS max_nalleles,"
        " MIN(e.stop_min) AS min_stop_min,"
        " MAX(e.stop_min) AS max_stop_min,"
        " MIN(e.stop_max) AS min_stop_max,"
        " MAX(e.stop_max) AS max_stop_max,"
        " SUM( CASE WHEN stop_min > 0 AND"
        " cds_len - stop_min * 3 < last_exon_start"
        " THEN 1 ELSE 0 END) AS nmd_knockout,"
        " SUM( CASE WHEN stop_max > 0 AND"
        " cds_len - stop_max * 3 < last_exon_start"
        " THEN 1 ELSE 0 END) AS nmd_affected"
        " FROM annotations.transcript_info as i,"
        " effects AS e"
        " WHERE i.transcript_id = e.transcript_id"
        " GROUP BY i.gene_id, track "
    ) % substitutions
    drop_sql = "DROP TABLE IF EXISTS %(tablename)s" % substitutions
    index_sql = (
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)"
        % substitutions)

    for sql in (drop_sql, create_sql, index_sql):
        Database.executewait(dbhandle, sql)

    dbhandle.commit()
    P.touch(outfile)
def loadCodingPotential(infile, outfile):
    '''Load coding-potential annotations and derive ``is_coding``.

    The gzipped ``infile`` is streamed into ``csv2db.py``, which fills
    the table ``P.toTable(outfile)`` and logs to ``outfile``. The table
    then receives an extra integer column ``is_coding`` holding the
    value of the SQL comparison ``result == 'coding'``.
    '''
    table = P.toTable(outfile)

    # NOTE(review): P.run() takes no arguments here; presumably it
    # reads ``statement`` and the values for its placeholders from the
    # caller's frame - do not rename ``statement`` or ``table``.
    statement = (
        " gunzip < %(infile)s | python %(scriptsdir)s/csv2db.py"
        " %(csv2db_options)s --allow-empty"
        " --index=gene_id --map=gene_id:str"
        " --table=%(table)s > %(outfile)s")
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    names = dict(table=table)
    add_column = (
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % names)
    set_flag = (
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % names)
    Database.executewait(dbhandle, add_column)
    Database.executewait(dbhandle, set_flag)
    dbhandle.commit()
def DumpGOFromDatabase(outfile, dbhandle, options):
    """Dump GO assignments from a database into a flat file.

    For every ontology in ``options.ontology`` the statement built by
    ``GetGOStatement`` (using ``options.database_name`` and
    ``options.species``) is executed and each row is written to
    ``outfile`` as a tab-separated line prefixed with the ontology.
    A header line is written first.

    Per ontology, and once more over all ontologies, the number of
    rows, distinct genes and distinct GO categories is logged through
    ``E.info``. Nothing is returned.
    """
    E.info("category\ttotal\tgenes\tcategories")

    genes_seen_overall = set()
    categories_seen_overall = set()
    rows_overall = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:
        genes_seen = set()
        categories_seen = set()

        query = GetGOStatement(
            go_type, options.database_name, options.species)
        rows = Database.executewait(dbhandle, query, retries=0).fetchall()

        for row in rows:
            outfile.write("\t".join(map(str, (go_type,) + row)) + "\n")
            gene_id, goid, description, evidence = row
            genes_seen.add(gene_id)
            categories_seen.add(goid)
            genes_seen_overall.add(gene_id)
            categories_seen_overall.add(goid)

        rows_overall += len(rows)

        E.info("%s\t%i\t%i\t%i" %
               (go_type, len(rows), len(genes_seen), len(categories_seen)))

    E.info("%s\t%i\t%i\t%i" % ("all",
                               rows_overall,
                               len(genes_seen_overall),
                               len(categories_seen_overall)))
def DumpGOFromDatabase(outfile, dbhandle, options):
    """Write all GO assignments found in the database to ``outfile``.

    A header line is written, followed by one tab-separated line per
    result row of the query returned by ``GetGOStatement`` for each
    ontology listed in ``options.ontology``; every line starts with the
    ontology name.

    Summary counts (rows, distinct genes, distinct categories) are
    logged with ``E.info`` for each ontology and for the total. The
    function has no return value.
    """

    def _log_summary(label, nrows, genes, categories):
        # one summary line: label, rows, distinct genes, distinct terms
        E.info("%s\t%i\t%i\t%i" % (label, nrows, len(genes), len(categories)))

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.Counter()
    all_categories = collections.Counter()
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:
        genes = collections.Counter()
        categories = collections.Counter()
        ntotal = 0

        sql = GetGOStatement(go_type, options.database_name, options.species)
        for record in Database.executewait(
                dbhandle, sql, retries=0).fetchall():
            fields = (go_type, ) + record
            outfile.write("\t".join(map(str, fields)) + "\n")
            gene_id, goid, description, evidence = record
            for counter in (genes, all_genes):
                counter[gene_id] += 1
            for counter in (categories, all_categories):
                counter[goid] += 1
            ntotal += 1
            all_ntotal += 1

        _log_summary(go_type, ntotal, genes, categories)

    _log_summary("all", all_ntotal, all_genes, all_categories)
def summarizeEffectsPerGene(infile, outfile):
    '''Summarize the transcript effects of one track per gene.

    The track is ``infile`` without ``_effects.load``. The table
    ``<track>_effects`` is joined with ``annotations.transcript_info``
    on ``transcript_id`` and aggregated per ``gene_id``: number of
    transcripts, minima/maxima of ``nalleles``, ``stop_min`` and
    ``stop_max``, and the counts ``nmd_knockout``/``nmd_affected``.

    The result replaces the table named ``outfile`` without ``.load``;
    an index on ``gene_id`` is added and ``outfile`` is touched.
    '''
    substitutions = {
        "tablename": outfile[:-len(".load")],
        "track": infile[:-len("_effects.load")],
    }

    dbhandle = connect()

    create_sql = (
        " CREATE TABLE %(tablename)s AS"
        " SELECT DISTINCT"
        " gene_id,"
        " COUNT(*) AS ntranscripts,"
        " MIN(e.nalleles) AS min_nalleles,"
        " MAX(e.nalleles) AS max_nalleles,"
        " MIN(e.stop_min) AS min_stop_min,"
        " MAX(e.stop_min) AS max_stop_min,"
        " MIN(e.stop_max) AS min_stop_max,"
        " MAX(e.stop_max) AS max_stop_max,"
        " SUM( CASE WHEN stop_min > 0 AND"
        " cds_len - stop_min * 3 < last_exon_start"
        " THEN 1 ELSE 0 END) AS nmd_knockout,"
        " SUM( CASE WHEN stop_max > 0 AND"
        " cds_len - stop_max * 3 < last_exon_start"
        " THEN 1 ELSE 0 END) AS nmd_affected"
        " FROM annotations.transcript_info as i,"
        " %(track)s_effects AS e"
        " WHERE i.transcript_id = e.transcript_id"
        " GROUP BY i.gene_id "
    ) % substitutions

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % substitutions)
    Database.executewait(dbhandle, create_sql)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)"
        % substitutions)

    dbhandle.commit()
    P.touch(outfile)
def buildDMRStats(tables, method, outfile):
    '''Build summary statistics for differentially methylated regions.

    For each table in ``tables`` one line per
    ``(treatment_name, control_name)`` pair is written to ``outfile``
    with the number of tested intervals, the counts per status
    (OK, NOTEST, FAIL, NOCALL), the number of significant calls and
    their break-down into up/down and at-least-two-fold changes.

    Table names must have the form ``<tileset>_<design>_<method>``.

    For every table with more than 10 significant intervals a smooth
    scatter plot of log10(length) against log10(pvalue) is written to
    ``<exportdir>/diff_methylation``.

    Arguments
    ---------
    tables : list
        Names of the tables to summarize.
    method : string
        Method suffix that is stripped from each table name.
    outfile : string
        Name of the tab-separated output file.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    def toDict(vals, l=2):
        # key: the first ``l`` columns of a row, value: column ``l``;
        # missing keys read as 0
        return collections.defaultdict(
            int, [(tuple(x[:l]), x[l]) for x in vals])

    def countPairs(tablename, condition=None):
        # number of rows per (treatment, control) pair, optionally
        # restricted by an SQL condition
        where = "WHERE %s" % condition if condition else ""
        return toDict(Database.executewait(
            dbhandle,
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %s
            %s
            GROUP BY treatment_name,control_name""" %
            (tablename, where)).fetchall())

    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    outf = IOTools.openFile(outfile, "w")
    # make sure the file is closed even if a query or plot fails
    try:
        outf.write("\t".join(
            ("tileset", "design", "track1", "track2", "tested",
             "\t".join(["status_%s" % x for x in keys_status]),
             "significant",
             "up", "down",
             "twofold",
             "twofold_up", "twofold_down")) + "\n")

        for tablename in tables:

            prefix = P.snip(tablename, "_%s" % method)
            tileset, design = prefix.split("_")

            E.info("collecting data from %s" % tablename)

            tested = countPairs(tablename)
            status = toDict(Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, status, COUNT(*)
                FROM %(tablename)s
                GROUP BY treatment_name,control_name,status""" %
                locals()).fetchall(), 3)
            signif = countPairs(tablename, "significant")
            fold2 = countPairs(
                tablename,
                "(l2fold >= 1 or l2fold <= -1) AND significant")
            up = countPairs(tablename, "l2fold > 0 AND significant")
            down = countPairs(tablename, "l2fold < 0 AND significant")
            fold2up = countPairs(tablename, "l2fold > 1 AND significant")
            fold2down = countPairs(tablename, "l2fold < -1 AND significant")

            for treatment_name, control_name in tested.keys():
                k = (treatment_name, control_name)
                outf.write("\t".join(map(str, (
                    tileset,
                    design,
                    treatment_name,
                    control_name,
                    tested[k],
                    "\t".join([str(status[(treatment_name, control_name, x)])
                               for x in keys_status]),
                    signif[k],
                    up[k], down[k],
                    fold2[k],
                    fold2up[k], fold2down[k]))) + "\n")

            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                '''SELECT end - start, pvalue
                FROM %(tablename)s
                WHERE significant''' % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter
            # fails
            if len(data) > 10:
                # materialize: zip() returns an iterator in Python 3
                # and could not be indexed below
                data = list(zip(*data))

                pngfile = ("%(outdir)s/%(tileset)s_%(design)s_%(method)s"
                           "_pvalue_vs_length.png") % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)

                R['dev.off']()
    finally:
        outf.close()
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    For every table with more than 10 significant results a smooth
    scatter plot of log10(length) against log10(pvalue) is written to
    `outdir`; lengths are read from column ``sum`` of the table
    ``<geneset>_geneinfo``.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    outfile : string
        Output filename in :term:`tsv` format.
    tablenames : list
        List of tables to process.
    outdir : string
        Output directory for diagnostic plots.
    regex_table : string
        Regular expression to extract experimental information
        from table name. It needs to define the named groups
        ``design``, ``geneset``, ``counting_method`` and ``level``.

    Raises
    ------
    ValueError
        If a table name does not match `regex_table`.
    """

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    def toDict(vals, l=2):
        # key: the first ``l`` columns of a row, value: column ``l``;
        # missing keys read as 0
        return collections.defaultdict(
            int, [(tuple(x[:l]), x[l]) for x in vals])

    outf = IOTools.openFile(outfile, "w")
    # make sure the file is closed even if a query or plot fails
    try:
        outf.write("\t".join(
            ("design", "geneset", "level", "counting_method",
             "treatment_name", "control_name", "tested",
             "\t".join(["status_%s" % x for x in keys_status]),
             "significant",
             "twofold")) + "\n")

        for tablename in tablenames:
            r = re.search(regex_table, tablename)
            if r is None:
                raise ValueError(
                    "can't match tablename '%s' to regex" % tablename)
            geneset = r.group("geneset")
            design = r.group("design")
            level = r.group("level")
            counting_method = r.group("counting_method")

            tested = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename)s "
                "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())
            status = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, status, "
                "COUNT(*) FROM %(tablename)s "
                "GROUP BY treatment_name,control_name,status" %
                locals()).fetchall(), 3)
            signif = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename)s "
                "WHERE significant "
                "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())
            fold2 = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename)s "
                "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                "GROUP BY treatment_name,control_name,significant" %
                locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                outf.write("\t".join(map(str, (
                    design,
                    geneset,
                    level,
                    counting_method,
                    treatment_name,
                    control_name,
                    tested[(treatment_name, control_name)],
                    "\t".join(
                        [str(status[(treatment_name, control_name, x)])
                         for x in keys_status]),
                    signif[(treatment_name, control_name)],
                    fold2[(treatment_name, control_name)]))) + "\n")

            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                "SELECT i.sum, pvalue "
                "FROM %(tablename)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter
            # fails
            if len(data) > 10:
                # materialize: zip() returns an iterator in Python 3
                # and could not be indexed below
                data = list(zip(*data))

                pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s"
                           "_pvalue_vs_length.png") % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)

                R['dev.off']()
    finally:
        outf.close()
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.

    The track is ``infile`` without ``.gtf.gz``. The table
    ``<track>_evol`` is re-created from ``<track>_annotation`` (gene
    id, length, GC content) and LEFT JOINed with each of the following
    tables whose ``<name>.load`` file exists in the current directory:
    ``<track>_rates``, ``<track>_coverage``, ``<track>_repeats_gc``,
    ``<track>_repeats_rates`` and ``<track>_introns_rates``.

    Thresholds are read from ``PARAMS`` (``rates_min_aligned``,
    ``rates_max_rate``, ``rates_min_repeats``). ``outfile`` is touched
    on completion.
    '''
    # the base columns are part of the list, so the SELECT clause stays
    # valid SQL even if none of the optional tables is present
    stmt_select = ["CAST(r.gene_id AS TEXT) AS gene_id",
                   "r.exons_sum as length",
                   "r.exons_pGC as pgc"]
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]
    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append(
            "cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov "
            "ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc "
            "ON r.gene_id = ar_gc.gene_id")

    # NOTE(review): the ratios kska and kski below refer to alias ``a``,
    # which is only joined if <track>_rates.load exists.
    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, "
            "a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
        ON r.gene_id = ar.gene_id AND
        ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, "
            "a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
        ON r.gene_id = ir.gene_id AND
        ir.aligned >= %(rates_min_aligned)i''')

    # explicit substitution table instead of mutating locals();
    # values in PARAMS take precedence, as before
    substitutions = {"track": track}
    substitutions.update(PARAMS)

    select_sql = ", ".join(stmt_select) % substitutions
    from_sql = " ".join(stmt_from) % substitutions
    where_sql = " AND ".join(stmt_where) % substitutions

    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %s " % tablename)

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT %(select_sql)s
    FROM %(track)s_annotation AS r
    %(from_sql)s
    WHERE %(where_sql)s
    ''' % {"tablename": tablename,
           "track": track,
           "select_sql": select_sql,
           "from_sql": from_sql,
           "where_sql": where_sql}

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic
    plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level

    ``infile`` without ``.load`` has to be of the form
    ``<geneset>_<method>``. One plot is attempted for every level in
    ``CUFFDIFF_LEVELS`` and every pair of ``EXPERIMENTS``; pairs
    without data are skipped with a warning. ``outfile`` is touched on
    completion.
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")

    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results. The query keeps rows with status 'OK' and
        # with treatment_mean and control_mean both above 0, so that
        # "undefined" lfold values are not plotted.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
                        THEN d.treatment_mean
                        ELSE d.control_mean END,
                   d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
                  control_name = '%(track2)s' AND
                  status = 'OK' AND
                  treatment_mean > 0 AND
                  control_mean > 0
            """ % {"tablename_diff": tablename_diff,
                   "track1": track1,
                   "track2": track2}

            # materialize the columns: zip() returns an iterator in
            # Python 3, which supports neither len() nor indexing
            data = list(zip(*Database.executewait(dbhandle, statement)))

            pngfile = ("%(outdir)s/%(geneset)s_%(method)s_%(level)s_"
                       "%(track1)s_vs_%(track2)s_significance.png") % {
                "outdir": outdir,
                "geneset": geneset,
                "method": method,
                "level": level,
                "track1": track1,
                "track2": track2}

            # ian: Bug fix: moved R.png to after data check so that no
            #     plot is started if there is no data this was leading
            #     to R falling over from too many open devices
            if not data:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x", pch=20, cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))

            R['dev.off']()

    P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    For every level in ``CUFFDIFF_LEVELS`` and every entry of `tables`
    the table ``<table>_<level>_diff`` is summarized, if it exists in
    the database: one line per ``(treatment_name, control_name)`` pair
    is written to `outfile` with the number of tests, the counts per
    status (OK, NOTEST, FAIL, NOCALL), the number of significant calls
    and the number of significant calls with at least a two-fold
    change.

    For tables with more than 10 significant results a smooth scatter
    plot of log10(length) against log10(pvalue) is created in `outdir`.

    Arguments
    ---------
    tables : list
        Table name prefixes; these need to be of the form
        ``<design>_vs_<geneset>_<counting_method>_<method>`` or
        ``<design>_<geneset>_<method>``.
    method : string
        Name of the method, used to parse the table names.
    outfile : string
        Name of the tab-separated output file.
    outdir : string
        Output directory for the diagnostic plots.

    Raises
    ------
    ValueError
        If a table name can not be parsed.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        matched = re.match(
            "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename)
        if matched is not None:
            design, geneset, counting_method = matched.groups()
            return design, geneset, counting_method

        matched = re.match("([^_]+)_([^_]+)_%s" % method, tablename)
        if matched is None:
            raise ValueError("can't parse tablename %s" % tablename)
        design, geneset = matched.groups()
        return design, geneset, "na"

    def toDict(vals, l=2):
        # key: the first ``l`` columns of a row, value: column ``l``;
        # missing keys read as 0
        return collections.defaultdict(
            int, [(tuple(x[:l]), x[l]) for x in vals])

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    # make sure the file is closed even if a query or plot fails
    try:
        # the column order has to agree with the rows written below:
        # counting_method precedes treatment_name
        outf.write("\t".join(
            ("design", "geneset", "level", "counting_method",
             "treatment_name", "control_name", "tested",
             "\t".join(["status_%s" % x for x in keys_status]),
             "significant",
             "twofold")) + "\n")

        all_tables = set(Database.getTables(dbhandle))

        for level in CUFFDIFF_LEVELS:

            for tablename in tables:

                tablename_diff = "%s_%s_diff" % (tablename, level)
                design, geneset, counting_method = _split(tablename_diff)
                if tablename_diff not in all_tables:
                    continue

                tested = toDict(Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" % locals()
                ).fetchall())
                status = toDict(Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name,status"
                    % locals()).fetchall(), 3)
                signif = toDict(Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE significant "
                    "GROUP BY treatment_name,control_name" % locals()
                ).fetchall())
                fold2 = toDict(Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                    "GROUP BY treatment_name,control_name,significant"
                    % locals()).fetchall())

                for treatment_name, control_name in tested.keys():
                    outf.write("\t".join(map(str, (
                        design,
                        geneset,
                        level,
                        counting_method,
                        treatment_name,
                        control_name,
                        tested[(treatment_name, control_name)],
                        "\t".join(
                            [str(status[(treatment_name, control_name, x)])
                             for x in keys_status]),
                        signif[(treatment_name, control_name)],
                        fold2[(treatment_name, control_name)]))) + "\n")

                # plot length versus P-Value
                data = Database.executewait(
                    dbhandle,
                    "SELECT i.sum, pvalue "
                    "FROM %(tablename_diff)s, "
                    "%(geneset)s_geneinfo as i "
                    "WHERE i.gene_id = test_id AND "
                    "significant" % locals()).fetchall()

                # require at least 10 datapoints - otherwise smooth
                # scatter fails
                if len(data) > 10:
                    # materialize: zip() returns an iterator in
                    # Python 3 and could not be indexed below
                    data = list(zip(*data))

                    pngfile = ("%(outdir)s/%(design)s_%(geneset)s_"
                               "%(level)s_pvalue_vs_length.png") % locals()
                    R.png(pngfile)
                    R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                    R.log10(ro.FloatVector(data[1])),
                                    xlab='log10( length )',
                                    ylab='log10( pvalue )',
                                    log="x", pch=20, cex=.1)

                    R['dev.off']()
    finally:
        outf.close()
def importLincRNA(infile, outfile):
    '''build a linc RNA set.

        * no coding potential
        * unknown and intergenic transcripts
        * no overlap with ``linc_exclude`` (usually: human refseq)
        * at least ``linc_min_length`` bp in length
        * at least ``linc_min_reads`` reads in transcript

    The table is ``outfile`` without ``.import`` and the track is the
    table name without its trailing ``Linc``. The table is re-created
    with a single indexed column ``gene_id`` and filled from
    ``<track>_annotation``. The coverage filter is only applied if
    ``linc_min_reads`` is above 0 and the overlap filter only if
    ``linc_exclude`` is set.

    Finally, the matching entries of ``<track>_gtf`` are exported to
    ``<table>.gtf.gz``. ``infile`` is not used.
    '''
    table = outfile[:-len(".import")]
    track = table[:-len("Linc")]

    dbhandle = sqlite3.connect(PARAMS["database"])

    Database.executewait(
        dbhandle, '''DROP TABLE IF EXISTS %(table)s''' % locals())
    Database.executewait(
        dbhandle, '''CREATE TABLE %(table)s (gene_id TEXT)''' % locals())
    Database.executewait(
        dbhandle,
        '''CREATE INDEX %(table)s_index1 ON %(table)s (gene_id)''' %
        locals())

    joins, wheres = [], ["1"]

    if PARAMS["linc_min_reads"] > 0:
        joins.append(", %(track)s_coverage as cov" % locals())
        # NOTE(review): this condition used to read
        # "cov.gene_id = m.gene_id2", but no alias ``m`` exists in the
        # statement below; joining on the annotation alias ``a`` is
        # presumably what was intended - please confirm.
        wheres.append(
            "cov.gene_id = a.gene_id AND cov.nmatches >= %i" %
            PARAMS["linc_min_reads"])

    # linc_exclude is used as a table name prefix; test for a value
    # being set instead of comparing against a number
    if PARAMS["linc_exclude"]:
        joins.append(
            "LEFT JOIN %s_vs_%s_ovl as ovl on ovl.gene_id2 = a.gene_id" %
            (PARAMS["linc_exclude"], track))
        wheres.append("ovl.gene_id1 IS NULL")

    wheres = " AND ".join(wheres)
    joins = " ".join(joins)

    # substitution table: configuration values, overridden by the
    # values computed above
    substitutions = dict(PARAMS.items())
    substitutions.update(
        table=table, track=track, joins=joins, wheres=wheres)

    statement = '''INSERT INTO %(table)s
    SELECT DISTINCT(a.gene_id) FROM
    %(track)s_annotation as a
    %(joins)s
    LEFT JOIN %(track)s_coding AS c on c.gene_id = a.gene_id
    WHERE is_unknown
    AND is_intergenic
    AND exons_sum >= %(linc_min_length)i
    AND (c.is_coding IS NULL or not c.is_coding)
    AND %(wheres)s
    ''' % substitutions

    E.debug("statement to build lincRNA: %s" % statement)

    # the statement is fully formatted at this point and must not be
    # interpolated a second time
    Database.executewait(dbhandle, statement)

    dbhandle.commit()

    cc = dbhandle.cursor()
    result = cc.execute(
        "SELECT COUNT(*) FROM %(table)s" % locals()).fetchall()[0][0]
    E.info("build lincRNA set for %s: %i entries" % (track, result))

    outgtf = "%s.gtf.gz" % table

    E.info("creating gtf file `%s`" % outgtf)
    # output gtf file
    # NOTE(review): P.run() is called without arguments; presumably it
    # reads ``statement``, ``track``, ``table``, ``outfile`` and
    # ``outgtf`` from this frame - keep these names.
    statement = (
        '%(cmd-sql)s %(database)s'
        ' "SELECT g.* FROM %(track)s_gtf as g, %(table)s AS t'
        ' WHERE t.gene_id = g.gene_id"'
        ' | python %(scriptsdir)s/gtf2tsv.py --invert --log=%(outfile)s'
        ' | gzip > %(outgtf)s')
    P.run()
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.

    The track is ``infile`` without ``.gtf.gz``. The table
    ``<track>_evol`` is dropped and re-created from
    ``<track>_annotation`` (gene id, length, GC content). Each of the
    tables ``<track>_rates``, ``<track>_coverage``,
    ``<track>_repeats_gc``, ``<track>_repeats_rates`` and
    ``<track>_introns_rates`` is LEFT JOINed if the corresponding
    ``<name>.load`` file exists in the current directory.

    The thresholds ``rates_min_aligned``, ``rates_max_rate`` and
    ``rates_min_repeats`` are read from ``PARAMS``. ``outfile`` is
    touched on completion.
    '''
    track = infile[:-len(".gtf.gz")]
    tablename = "%s_evol" % track

    # (suffix of the optional table, columns to select, join clause)
    # NOTE(review): the ratios kska and kski refer to alias ``a``,
    # which is only joined if <track>_rates.load exists.
    optional_tables = (
        ("rates",
         "a.distance AS ks, a.aligned AS aligned",
         '''LEFT JOIN %(track)s_rates AS a
         ON r.gene_id = a.gene_id AND
         a.aligned >= %(rates_min_aligned)i AND
         a.distance <= %(rates_max_rate)f'''),
        ("coverage",
         "cov.nmatches AS nreads, cov.mean AS meancoverage",
         "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id"),
        ("repeats_gc",
         "ar_gc.exons_mean AS repeats_gc",
         "LEFT JOIN %(track)s_repeats_gc AS ar_gc "
         "ON r.gene_id = ar_gc.gene_id"),
        ("repeats_rates",
         "ar.exons_length AS ar_aligned, ar.exons_median AS ka, "
         "a.distance/ar.exons_median AS kska",
         '''LEFT JOIN %(track)s_repeats_rates AS ar
         ON r.gene_id = ar.gene_id AND
         ar.exons_nval >= %(rates_min_repeats)i'''),
        ("introns_rates",
         "ir.aligned AS ir_aligned, ir.distance AS ki, "
         "a.distance/ir.distance AS kski",
         '''LEFT JOIN %(track)s_introns_rates AS ir
         ON r.gene_id = ir.gene_id AND
         ir.aligned >= %(rates_min_aligned)i'''),
    )

    # the base columns are part of the list, so the SELECT clause stays
    # valid SQL even if none of the optional tables is present
    stmt_select = ["CAST(r.gene_id AS TEXT) AS gene_id",
                   "r.exons_sum as length",
                   "r.exons_pGC as pgc"]
    stmt_from = []
    stmt_where = ["1"]

    for suffix, columns, join in optional_tables:
        if os.path.exists("%s_%s.load" % (track, suffix)):
            stmt_select.append(columns)
            stmt_from.append(join)

    # explicit substitution table instead of mutating locals();
    # values in PARAMS take precedence, as before
    substitutions = {"track": track}
    substitutions.update(PARAMS)

    select_sql = ", ".join(stmt_select) % substitutions
    from_sql = " ".join(stmt_from) % substitutions
    where_sql = " AND ".join(stmt_where) % substitutions

    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %s " % tablename)

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT %(select_sql)s
    FROM %(track)s_annotation AS r
    %(from_sql)s
    WHERE %(where_sql)s
    ''' % {"tablename": tablename,
           "track": track,
           "select_sql": select_sql,
           "from_sql": from_sql,
           "where_sql": where_sql}

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Each combination of a level in ``CUFFDIFF_LEVELS`` and an entry of
    `tables` names a table ``<table>_<level>_diff``. If that table
    exists, one line per ``(treatment_name, control_name)`` pair is
    written to `outfile`: number of tests, counts per status (OK,
    NOTEST, FAIL, NOCALL), number of significant calls and number of
    significant calls with at least a two-fold change.

    A smooth scatter plot of log10(length) against log10(pvalue) is
    created in `outdir` for tables with more than 10 significant
    results.

    Arguments
    ---------
    tables : list
        Table name prefixes of the form
        ``<design>_vs_<geneset>_<counting_method>_<method>`` or
        ``<design>_<geneset>_<method>``.
    method : string
        Name of the method, used to parse the table names.
    outfile : string
        Name of the tab-separated output file.
    outdir : string
        Output directory for the diagnostic plots.

    Raises
    ------
    ValueError
        If a table name can not be parsed.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        patterns = (
            ("([^_]+)_vs_([^_]+)_(.*)_%s" % method, ()),
            # no counting method in the name: report it as "na"
            ("([^_]+)_([^_]+)_%s" % method, ("na",)),
        )
        for pattern, extra in patterns:
            matched = re.match(pattern, tablename)
            if matched is not None:
                return matched.groups() + extra
        raise ValueError("can't parse tablename %s" % tablename)

    def toDict(vals, l=2):
        # key: the first ``l`` columns of a row, value: column ``l``;
        # missing keys read as 0
        return collections.defaultdict(
            int, [(tuple(x[:l]), x[l]) for x in vals])

    def query(sql):
        return Database.executewait(dbhandle, sql).fetchall()

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    # make sure the file is closed even if a query or plot fails
    try:
        # the column order has to agree with the rows written below:
        # counting_method precedes treatment_name
        outf.write("\t".join(
            ("design", "geneset", "level", "counting_method",
             "treatment_name", "control_name", "tested",
             "\t".join(["status_%s" % x for x in keys_status]),
             "significant",
             "twofold")) + "\n")

        all_tables = set(Database.getTables(dbhandle))

        for level in CUFFDIFF_LEVELS:

            for tablename in tables:

                tablename_diff = "%s_%s_diff" % (tablename, level)
                design, geneset, counting_method = _split(tablename_diff)
                if tablename_diff not in all_tables:
                    continue

                tested = toDict(query(
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" % locals()))
                status = toDict(query(
                    "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name,status"
                    % locals()), 3)
                signif = toDict(query(
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE significant "
                    "GROUP BY treatment_name,control_name" % locals()))
                fold2 = toDict(query(
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                    "GROUP BY treatment_name,control_name,significant"
                    % locals()))

                for pair in tested.keys():
                    treatment_name, control_name = pair
                    outf.write("\t".join(map(str, (
                        design,
                        geneset,
                        level,
                        counting_method,
                        treatment_name,
                        control_name,
                        tested[pair],
                        "\t".join(
                            [str(status[(treatment_name, control_name, x)])
                             for x in keys_status]),
                        signif[pair],
                        fold2[pair]))) + "\n")

                # plot length versus P-Value
                data = query(
                    "SELECT i.sum, pvalue "
                    "FROM %(tablename_diff)s, "
                    "%(geneset)s_geneinfo as i "
                    "WHERE i.gene_id = test_id AND "
                    "significant" % locals())

                # require at least 10 datapoints - otherwise smooth
                # scatter fails
                if len(data) > 10:
                    # materialize: zip() returns an iterator in
                    # Python 3 and could not be indexed below
                    data = list(zip(*data))

                    pngfile = ("%(outdir)s/%(design)s_%(geneset)s_"
                               "%(level)s_pvalue_vs_length.png") % locals()
                    R.png(pngfile)
                    R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                    R.log10(ro.FloatVector(data[1])),
                                    xlab='log10( length )',
                                    ylab='log10( pvalue )',
                                    log="x", pch=20, cex=.1)

                    R['dev.off']()
    finally:
        outf.close()
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    NOTE(review): a function with the same name and signature is
    defined a second time further down in this file; if both are at
    module level the later definition is the one in effect.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    Raises
    ------
    ValueError
        If the tables differ in their number of distinct join values
        or if the resulting view is empty.
    '''

    Database.executewait(
        dbhandle,
        "DROP %s IF EXISTS %s" % (view_type, tablename))

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower() for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))

    expected_rows = min(tracks)
    if expected_rows != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])
    # with a single table there is nothing to join on; an empty WHERE
    # clause would be a syntax error.
    where_clause = "WHERE %s" % where_statement if where_statement else ""

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = taken.intersection(c)
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                # NOTE(review): block structure reconstructed - the
                # duplicates are only removed from the plain column
                # list when they are re-added with a table prefix.
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(c)

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    %(where_clause)s
    ''' % {"view_type": view_type,
           "tablename": tablename,
           "all_columns": all_columns,
           "from_statement": from_statement,
           "where_clause": where_clause}
    Database.executewait(dbhandle, statement)

    # count the rows of the object just created, not of a fixed table.
    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %s" % tablename).fetchone()[0]

    if nrows == 0:
        # statement is already fully interpolated at this point.
        raise ValueError(
            "empty view mapping, check statement = %s" % statement)
    if nrows != expected_rows:
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, expected_rows))

    E.info("created %s with %i rows" % (tablename, nrows))
    touchFile(outfile)
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.

    Loads `infile` into the table ``hypergeometric_<track>_summary``,
    where ``<track>`` is derived from `outfile`. It then loads the
    per-section output files (``results``, ``parameters``,
    ``withgenes``) and, for each ontology found in the summary table,
    the ``l2fold``, ``l10pvalue`` and ``l10qvalue`` matrices located in
    ``<infile>.dir``.

    NOTE(review): a function of the same name is already defined near
    the top of this file; if both are at module level this later
    definition is the one in effect.
    '''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        # P.run() is called without arguments - presumably it picks up
        # `statement` and the %(name)s values from this frame, so the
        # local names used below must not be renamed.
        # Raw literals: the regular expression contains ``\S``, which
        # is an invalid escape sequence in a normal string literal.
        statement = (
            r''' python %(scriptsdir)s/combine_tables.py'''
            r''' --cat=track'''
            r''' --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"'''
            r''' hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s'''
            r''' | python %(scriptsdir)s/csv2db.py'''
            r''' %(csv2db_options)s'''
            r''' --table=%(tablename)s'''
            r''' >> %(outfile)s''')
        P.run()

    matrix_dir = infile + ".dir"
    for ontology in ontologies:
        # only the presence of the l2fold matrix is checked; if it is
        # missing, the whole ontology is skipped.
        fn = os.path.join(matrix_dir, "all_alldesc.%s.l2fold" % ontology)
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for suffix in ("l2fold", "l10pvalue", "l10qvalue"):
            fn = os.path.join(
                matrix_dir, "all_alldesc.%s.%s" % (ontology, suffix))
            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, suffix),
                   options='--allow-empty')
def buildDMRStats(tables, method, outfile, dbhandle):
    """build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    ``<exportdir>/diff_methylation`` directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab separated file with one row per
        (treatment_name, control_name) pair of each table.
    dbhandle :
        A database handle.
    """

    def _to_dict(rows, nkeys=2):
        # first `nkeys` columns form the key, the following column is
        # the count; missing keys default to 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _count_pairs(tablename, where=None,
                     group_by="treatment_name,control_name"):
        # count rows per (treatment_name, control_name), optionally
        # restricted by a WHERE condition.
        sql = ("SELECT treatment_name, control_name, COUNT(*) FROM %s" %
               tablename)
        if where:
            sql += " WHERE %s" % where
        sql += " GROUP BY %s" % group_by
        return _to_dict(Database.executewait(dbhandle, sql).fetchall())

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
    by_significant = "treatment_name,control_name,significant"

    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    outf = IOTools.openFile(outfile, "w")
    # try/finally so that the output file is closed even if a query or
    # the plotting fails.
    try:
        outf.write(
            "\t".join(
                (
                    "tileset",
                    "design",
                    "track1",
                    "track2",
                    "tested",
                    "\t".join(["status_%s" % x for x in keys_status]),
                    "significant",
                    "up",
                    "down",
                    "twofold",
                    "twofold_up",
                    "twofold_down",
                )
            )
            + "\n"
        )

        for tablename in tables:
            prefix = P.snip(tablename, "_%s" % method)
            tileset, design = prefix.split("_")

            E.info("collecting data from %s" % tablename)

            tested = _count_pairs(tablename)
            status = _to_dict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %s "
                    "GROUP BY treatment_name,control_name,status"
                    % tablename,
                ).fetchall(),
                3,
            )
            signif = _count_pairs(tablename, "significant")
            fold2 = _count_pairs(
                tablename,
                "(l2fold >= 1 or l2fold <= -1) AND significant",
                by_significant,
            )
            up = _count_pairs(
                tablename, "l2fold > 0 AND significant", by_significant
            )
            down = _count_pairs(
                tablename, "l2fold < 0 AND significant", by_significant
            )
            fold2up = _count_pairs(
                tablename, "l2fold > 1 AND significant", by_significant
            )
            fold2down = _count_pairs(
                tablename, "l2fold < -1 AND significant", by_significant
            )

            for treatment_name, control_name in list(tested.keys()):
                k = (treatment_name, control_name)
                outf.write(
                    "\t".join(
                        map(
                            str,
                            (
                                tileset,
                                design,
                                treatment_name,
                                control_name,
                                tested[k],
                                "\t".join(
                                    [
                                        str(status[(treatment_name,
                                                    control_name, x)])
                                        for x in keys_status
                                    ]
                                ),
                                signif[k],
                                up[k],
                                down[k],
                                fold2[k],
                                fold2up[k],
                                fold2down[k],
                            ),
                        )
                    )
                    + "\n"
                )

            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                "SELECT end - start, pvalue "
                "FROM %s "
                "WHERE significant" % tablename,
            ).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter
            # fails
            if len(data) > 10:
                # materialise: zip() returns a one-shot iterator on
                # python 3 and can not be indexed.
                data = list(zip(*data))

                pngfile = (
                    "%(outdir)s/%(tileset)s_%(design)s_%(method)s_"
                    "pvalue_vs_length.png"
                    % {
                        "outdir": outdir,
                        "tileset": tileset,
                        "design": design,
                        "method": method,
                    }
                )
                R.png(pngfile)
                R.smoothScatter(
                    R.log10(ro.FloatVector(data[0])),
                    R.log10(ro.FloatVector(data[1])),
                    xlab="log10(length)",
                    ylab="log10(pvalue)",
                    log="x",
                    pch=20,
                    cex=0.1,
                )

                R["dev.off"]()
    finally:
        outf.close()
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    NOTE(review): a function with the same name and signature is also
    defined earlier in this file; if both are at module level this
    later definition is the one in effect.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    Raises
    ------
    ValueError
        If the tables differ in their number of distinct join values
        or if the resulting view is empty.
    '''

    Database.executewait(
        dbhandle,
        "DROP %s IF EXISTS %s" % (view_type, tablename))

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower() for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))

    expected_rows = min(tracks)
    if expected_rows != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])
    # with a single table there is nothing to join on; an empty WHERE
    # clause would be a syntax error.
    where_clause = "WHERE %s" % where_statement if where_statement else ""

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = taken.intersection(c)
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                # NOTE(review): block structure reconstructed - the
                # duplicates are only removed from the plain column
                # list when they are re-added with a table prefix.
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(c)

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    %(where_clause)s
    ''' % {"view_type": view_type,
           "tablename": tablename,
           "all_columns": all_columns,
           "from_statement": from_statement,
           "where_clause": where_clause}
    Database.executewait(dbhandle, statement)

    # count the rows of the object just created, not of a fixed table.
    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %s" % tablename).fetchone()[0]

    if nrows == 0:
        # statement is already fully interpolated at this point.
        raise ValueError(
            "empty view mapping, check statement = %s" % statement)
    if nrows != expected_rows:
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, expected_rows))

    E.info("created %s with %i rows" % (tablename, nrows))
    touchFile(outfile)
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level

    `infile` is expected to be named ``<geneset>_<method>.load``;
    `outfile` is touched once all plots have been attempted.
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")

    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results.  do not plot "undefined" lfold values
        # (where treatment_mean or control_mean = 0) do not plot lfold
        # values where the confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
            THEN d.treatment_mean
            ELSE d.control_mean END,
            d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
            control_name = '%(track2)s' AND
            status = 'OK' AND
            treatment_mean > 0 AND
            control_mean > 0
            """ % {"tablename_diff": tablename_diff,
                   "track1": track1,
                   "track2": track2}

            # materialise the columns: zip() returns an iterator on
            # python 3, which supports neither len() nor indexing.
            data = list(zip(*Database.executewait(dbhandle, statement)))

            pngfile = ("%(outdir)s/%(geneset)s_%(method)s_%(level)s_"
                       "%(track1)s_vs_%(track2)s_significance.png" % {
                           "outdir": outdir,
                           "geneset": geneset,
                           "method": method,
                           "level": level,
                           "track1": track1,
                           "track2": track2})

            # ian: Bug fix: moved R.png to after data check so that no
            #     plot is started if there is no data this was leading
            #     to R falling over from too many open devices
            if not data:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x", pch=20, cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))

            R['dev.off']()

    P.touch(outfile)