def calculateFalsePositiveRate(infiles, outfile):
    '''compute true and false positive rates for taxonomic
    assignments at each taxonomic level.
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):] ==
         os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            # the true table stores relative abundance as a proportion,
            # the estimate table as a percentage, hence the /100 below
            for taxa in cc.execute("""SELECT taxa FROM %s
                                      WHERE level == '%s'
                                      AND relab > %f""" %
                                   (tablename_true, level,
                                    float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s
                                      WHERE taxon_level == '%s'
                                      AND rel_abundance > %f""" %
                                   (tablename_estimate, level,
                                    float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)
            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
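# A minimal, self-contained sketch (hypothetical data, not pipeline code) of
# the rate computation above: the true positive rate is the size of the
# intersection of the estimate with the truth set relative to the truth set,
# and the false positive rate is the fraction of estimated taxa absent from
# the truth set.
def _demo_rates():
    true_set = set(["a", "b", "c", "d"])
    estimate_set = set(["b", "c", "e"])
    tp = true_set.intersection(estimate_set)       # {"b", "c"}
    fp = estimate_set.difference(true_set)         # {"e"}
    tp_rate = float(len(tp)) / len(true_set)       # 2 / 4 = 0.5
    fp_rate = float(len(fp)) / len(estimate_set)   # 1 / 3 = 0.33...
    return tp_rate, fp_rate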
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''boxplot the relative abundance of false positive species
    compared to true positives.
    '''
    tablename_estimate = P.toTable(infiles[0])
    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [P.toTable(x) for x in infiles[1:]
                      if P.snip(os.path.basename(x), ".load") == track][0]
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")
    estimate = {}
    true = set()
    for data in cc.execute("""SELECT taxon, rel_abundance
                              FROM %s
                              WHERE taxon_level == 'species'""" %
                           tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s
                              WHERE level == 'species'""" %
                           tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"
    else:
        # fall back to a default colour to avoid a NameError
        # for unrecognised tracks
        col = "black"

    R('''dat <- read.csv("%s", header = T,
         stringsAsFactors = F, sep = "\t")''' % inf)
    R('''library(ggplot2)''')
    # the ggplot2 argument is "yintercept", not "yintersect"
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) +
         geom_boxplot(colour = "%s") +
         geom_hline(yintercept = 0, linetype = "dashed")''' % col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    '''load gene information gleaned from the attributes
    in the gene set gtf file.

    *infile* is an ENSEMBL gtf file.
    '''
    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = ''' awk '$2 == "protein_coding"' '''
    else:
        filter_cmd = "cat"

    statement = '''
    gunzip < %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py --full --only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
      --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
      --index=gene_id
      --index=gene_name
      --map=gene_name:str
      --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''load the table of exon counts and transcript lengths.'''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''loads the summary of reads contributing to transcripts.'''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''load the counts for the multi and single exon lincRNA.'''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10pvalue" % ontology)
        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10qvalue" % ontology)
        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    # use a distinct loop variable so that the target table
    # name in *table* is not overwritten
    for subtable in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(subtable)s" %
            locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v))
                                            for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)

    os.unlink(tmpf.name)
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" % locals())
        outf.write(
            "".join(["promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c)
                     for x in cc]))
        i += 1
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
            locals())
        outf.write(
            "".join(["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
                     for x in cc]))
        i += 1

    outf.close()
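# A tiny sketch of the synthetic GO identifiers written above: "%07i"
# zero-pads the running counter so the generated IDs mimic the shape of
# real GO accessions.
def _demo_go_id(i):
    return "GO:%07i" % i
# _demo_go_id(1) -> "GO:0000001"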
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''
    tablename = P.toTable(outfile)

    # can not create views across multiple database, so use table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    # note: despite the docstring, only bam_stats is currently
    # included in the statement below
    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise, as sketched below.
    '''
    header = ",".join([P.quote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                   --headers=%(header)s
                   --missing=0
                   --ignore-empty
                   %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | python %(scriptsdir)s/table2table.py --transpose
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s
                """
    P.run()
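# A sketch (with invented data, assuming the combine/transpose semantics of
# combine_tables.py and table2table.py) of what the shell pipeline above
# produces: each two-column input becomes one column of the merged table,
# and after transposition each input file ends up as one row keyed by track.
def _demo_merge_and_transpose():
    tables = {"trackA": {"bin1": 3, "bin2": 0},
              "trackB": {"bin1": 1, "bin2": 7}}
    bins = sorted(set(b for d in tables.values() for b in d))
    rows = [["track"] + bins]
    for track in sorted(tables):
        # --missing=0 fills bins absent from an input with 0
        rows.append([track] + [tables[track].get(b, 0) for b in bins])
    return ["\t".join(map(str, r)) for r in rows]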
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def loadLowerStringencyDeNovos(infile, outfile):
    '''Load lower stringency de novos into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadGeneListStats(infiles, outfile):
    '''Merge gene list stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = open("genelist_stats.txt", "w")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".genelist.stats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def filterByCoverage(infiles, outfile):
    '''filter contigs by average coverage.'''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            # note the comma between contig_id and ave in the outer SELECT
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
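# A dependency-free sketch of the final filtering step above: keep only
# FASTA records whose identifier (first word of the header) is in the
# selected set. FastaIterator plays the role of this hypothetical parser
# in the pipeline.
def _demo_filter_fasta(lines, keep):
    out, writing = [], False
    for line in lines:
        if line.startswith(">"):
            identifier = line[1:].split(" ")[0].strip()
            writing = identifier in keep
        if writing:
            out.append(line)
    return out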
def loadCodingPotential(infile, outfile):
    '''load annotations'''
    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --allow-empty
          --index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' %
        locals())
    dbhandle.commit()
def loadProteinStats(infile, outfile):
    '''load protein statistics to database.

    The *infile* is an ENSEMBL peptide file.
    '''
    to_cluster = True

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2table.py
          --log=%(outfile)s
          --type=aa
          --section=length
          --section=hid
          --section=aa
          --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --index=protein_id
          --map=protein_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadTranscriptStats(infile, outfile):
    '''load gene statistics to database.

    The *infile* is the *outfile* from :meth:`buildTranscripts`
    '''
    to_cluster = True

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --reporter=transcripts
          --counter=position
          --counter=length
          --counter=composition-na
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadRecs(infile, outfile):
    '''Load homozygous recessive disease candidates into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadTranscriptInformation(infile, outfile, only_proteincoding=False):
    '''load the transcript set.

    *infile* is an ENSEMBL gtf file.
    '''
    to_cluster = True

    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = ''' awk '$2 == "protein_coding"' '''
    else:
        filter_cmd = "cat"

    statement = '''gunzip < %(infile)s
    | %(filter_cmd)s
    | awk '$3 == "CDS"'
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gtf2tsv.py --full --only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
          --index=transcript_id
          --index=gene_id
          --index=protein_id
          --index=gene_name
          --map=transcript_name:str
          --map=gene_name:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadCoverageStats(infiles, outfile):
    '''Import coverage statistics into SQLite'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    outf = open('coverage.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   --ignore-empty
                   --retry
                   > %(outfile)s'''
    P.run()
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in cc.execute(
        "SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name
        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadCompoundHets(infile, outfile):
    '''Load compound heterozygous variants into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def buildContigSummary(infiles, outfile):
    '''merge the contig summary statistics.'''
    stats = collections.defaultdict(list)
    for filepath in infiles:
        dirname = os.path.dirname(filepath)
        stats[dirname].append(os.path.basename(filepath))

    N = PARAMS["scaffold_n"]

    # connect to database
    dbh = connect()
    cc = dbh.cursor()
    for dirname in stats.keys():
        outfname = os.path.join(dirname, "contig.summary.tsv")
        outf = open(outfname, "w")
        outf.write("track\tnscaffolds\tscaffold_length\tN%i\t"
                   "mean_length\tmedian_length\tmax_length\n" % N)
        for infile in stats[dirname]:
            track = P.snip(
                infile.split(dirname.split(".dir")[0])[1][1:],
                ".summary.load")
            table = P.toTable(infile)
            # note: the query always reads the N50 column, regardless of
            # the value of scaffold_n used in the header
            data = cc.execute("""SELECT nscaffolds, scaffold_length, N50,
                                 mean_length, median_length, max_length
                                 FROM %s""" % table).fetchone()
            outf.write("\t".join([track] + [str(x) for x in data]) + "\n")
        outf.close()
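# The summary above reports an N50 column. As a point of reference, this is
# a standalone sketch of the standard N50 computation: the length of the
# contig at which the cumulative length, summing from longest to shortest,
# first reaches half of the total assembly length.
def _demo_n50(lengths):
    total = sum(lengths)
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running * 2 >= total:
            return length
# _demo_n50([10, 5, 3, 2]) -> 10 (10 covers half of the total of 20)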
def loadDeNovos(infile, outfile):
    '''load de novo variants into the database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   sed 's/#CHROM/CHROM/g;s/EFF\[\*\]/EFF/g;s/GEN\[0\]/GEN/g' |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats"):
    '''extract a histogram from a picard output file and load
    it into database.'''

    tablename = P.toTable(outfile)
    tname = "%s_%s" % (tablename, suffix)

    tname = P.snip(tname, "_metrics") + "_histogram"

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tname)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    statement = """python %(scriptsdir)s/combine_tables.py
                   --regex-start="## HISTOGRAM"
                   --missing=0
                   --take=2
                   %(filenames)s
                   | python %(scriptsdir)s/csv2db.py
                   --header=%(column)s,%(header)s
                   --replace-header
                   --index=track
                   --table=%(tname)s
                   >> %(outfile)s
                """
    P.run()
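# A sketch of how loadPicardHistogram derives its target table name; the
# P.snip behaviour (removing a known suffix) is assumed and re-implemented
# here with a plain endswith check for illustration.
def _demo_histogram_table_name(tablename, suffix):
    tname = "%s_%s" % (tablename, suffix)
    # strip a trailing "_metrics" before appending "_histogram"
    if tname.endswith("_metrics"):
        tname = tname[:-len("_metrics")]
    return tname + "_histogram"
# _demo_histogram_table_name("picard_stats", "insert_size_metrics")
# -> "picard_stats_insert_size_histogram"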
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadEdgeRResults(infile, outfile):
    '''load EdgeR results into database.'''
    tableName = P.toTable(outfile)
    # use the %(infile)s/%(outfile)s placeholders rather than the
    # literal words "infile" and "outfile"
    statement = '''python %(scriptsdir)s/csv2db.py
                   --table=%(tableName)s
                   --index=id
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def loadSnpeffAnnotation(infile, outfile):
    '''Load snpeff annotations into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                       --missing=0
                       %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                       --header=%(column)s,%(header)s
                       --replace-header
                       --index=track
                       --table=%(tname)s
                       >> %(outfile)s
                    """
        P.run()

    os.unlink(tmpfilename)
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # load the file that was written above ("effects.txt", not "effect.txt")
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       %(csv2db_options)s
                       --index=transcript_id
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        statement = '''cat %(tmpfilename)s
                       | python %(scriptsdir)s/csv2db.py
                           %(csv2db_options)s
                           --allow-empty
                           --index=transcript_id
                           --table=%(tablename)s_%(suffix)s
                           --ignore-column=seq_na
                           --ignore-column=seq_aa
                       >> %(outfile)s'''
        P.run()
def loadNCG(infile, outfile):
    '''Load NCG into database'''
    dbh = connect()
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadMutectFilteringSummary(infile, outfile):
    '''Load mutect extended output into database'''
    dbh = connect()
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadROI2Gene(infile, outfile):
    '''Import genes mapping to regions of interest bed file into SQLite.'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --ignore-empty
                   --retry
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadSamples(infile, outfile):
    '''Import sample information into SQLite.'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --ignore-empty
                   --retry
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadCoverageStats(infile, outfile):
    '''load coverage stats.'''
    tablename = P.toTable(
        P.snip(os.path.dirname(infile), ".dir") + "_%s" %
        os.path.basename(outfile))
    statement = '''zcat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --index=contig
                   --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()