def buildPicardGCStats(infile, outfile, genome_file):
    """Collect GC bias metrics with picard:CollectGcBiasMetrics.

    If :func:`BamTools.getNumReads` reports zero reads for `infile`, a
    warning is issued, an empty `outfile` is created and picard is not
    run.

    Otherwise picard is asked to write the metrics to `outfile`, the
    chart to ``outfile + ".pdf"`` and the summary to
    ``outfile + ".summary"``. The console output of picard is captured
    in ``outfile + ".log"`` so that it cannot overwrite the metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    # job_memory and job_threads are not referenced again in this
    # function; presumably P.run reads them from this scope (it is handed
    # the statement with its %(...)s placeholders still unfilled), so the
    # local names must not be renamed or removed without checking.
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # The redirect goes to a separate .log file: sending stdout/stderr to
    # the same path as OUTPUT lets the log and the metrics clobber each
    # other.
    statement = (
        "picard %(picard_opts)s CollectGcBiasMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        "CHART_OUTPUT=%(outfile)s.pdf "
        "SUMMARY_OUTPUT=%(outfile)s.summary "
        ">& %(outfile)s.log"
    )

    P.run(statement)
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''Collect insert size statistics with picard:CollectInsertSizeMetrics.

    If :func:`BamTools.getNumReads` reports zero reads for `infile`, a
    warning is issued, an empty `outfile` is created and picard is not
    run.

    Otherwise picard is asked to write the metrics to `outfile`. The
    console output of picard is captured in ``outfile + ".log"`` so that
    it cannot overwrite the metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # job_memory and job_threads are not referenced again in this
    # function; presumably P.run reads them from this scope (it is handed
    # the statement with its %(...)s placeholders still unfilled), so the
    # local names must not be renamed or removed without checking.
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # The redirect goes to a separate .log file: sending stdout/stderr to
    # the same path as OUTPUT lets the log and the metrics clobber each
    # other.
    statement = (
        "picard %(picard_opts)s CollectInsertSizeMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s.log"
    )

    P.run(statement, job_memory=PICARD_MEMORY)
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''Compute coverage statistics with picard:CollectHsMetrics.

    Coverage is reported by picard for the bait and target intervals
    given in `baits` and `regions`. If :func:`BamTools.getNumReads`
    reports zero reads for `infile`, a warning is issued, an empty
    `outfile` is created and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : string
        :term:`bed` formatted file of bait regions, passed to picard as
        ``BAIT_INTERVALS``.
    regions : string
        :term:`bed` formatted file of target regions, passed to picard
        as ``TARGET_INTERVALS``.
    '''
    # job_threads and job_memory are not referenced again below;
    # presumably P.run reads them from this scope - keep the names.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % {
        "job_memory": job_memory}

    if BamTools.getNumReads(infile) == 0:
        # nothing to measure: leave an empty placeholder behind
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
    else:
        # unlike its siblings, this statement is fully interpolated here
        # before being handed to P.run
        template = (
            "picard %(picard_opts)s CollectHsMetrics "
            "BAIT_INTERVALS=%(baits)s "
            "TARGET_INTERVALS=%(regions)s "
            "INPUT=%(infile)s "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=LENIENT"
        )
        statement = template % locals()
        P.run(statement)
def buildPicardDuplicateStats(infile, outfile):
    '''Mark duplicates with picard:MarkDuplicates and keep the result.

    The statement built here asks picard to write the processed
    :term:`BAM` file to `outfile`, the duplication metrics to
    ``outfile + ".duplicate_metrics"`` and its console output to
    ``outfile + ".log"``. On success `outfile` is indexed with
    ``samtools index``.

    If :func:`BamTools.getNumReads` reports zero reads for `infile`, a
    warning is issued, an empty `outfile` is created and picard is not
    run.

    Note carried over from the previous documentation: picard reports
    "reads", but what it counts are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename; receives the :term:`BAM` file written by picard.
    '''
    # job_memory and job_threads are not referenced again below;
    # presumably P.run reads them from this scope - keep the names.
    job_memory = PICARD_MEMORY
    job_threads = 3
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % {
        "job_memory": job_memory}

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # placeholders are left unfilled here; P.run receives the template
    statement = (
        "picard %(picard_opts)s MarkDuplicates "
        "INPUT=%(infile)s "
        "ASSUME_SORTED=true "
        "METRICS_FILE=%(outfile)s.duplicate_metrics "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s.log "
        "&& samtools index %(outfile)s"
    )

    P.run(statement)
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''Collect RNA-seq metrics with picard:CollectRnaSeqMetrics.

    If :func:`BamTools.getNumReads` reports zero reads for the BAM file,
    a warning is issued, an empty `outfile` is created and picard is not
    run.

    Arguments
    ---------
    infiles : pair of strings
        Unpacked as ``(infile, genome)``: the input filename in
        :term:`BAM` format, and the gene annotation passed to picard as
        ``REF_FLAT`` (refflat format, see
        http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand : string
        Value passed verbatim to picard as ``STRAND``.
    outfile : string
        Output filename with picard output.
    '''
    infile, genome = infiles

    # job_threads and job_memory are not referenced again below;
    # presumably P.run reads them from this scope - keep the names.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % {
        "job_memory": job_memory}

    if BamTools.getNumReads(infile) == 0:
        # nothing to measure: leave an empty placeholder behind
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
    else:
        # placeholders are left unfilled here; P.run receives the template
        statement = (
            "picard %(picard_opts)s CollectRnaSeqMetrics "
            "REF_FLAT=%(genome)s "
            "INPUT=%(infile)s "
            "ASSUME_SORTED=true "
            "OUTPUT=%(outfile)s "
            "STRAND=%(strand)s "
            "VALIDATION_STRINGENCY=SILENT "
        )
        P.run(statement)
def buildPicardDuplicationStats(infile, outfile):
    '''Record duplication metrics with picard:MarkDuplicates.

    Only the metrics are kept: picard writes them to `outfile`, while
    the marked records are sent to ``/dev/null``.

    If :func:`BamTools.getNumReads` reports zero reads for `infile`, a
    warning is issued, an empty `outfile` is created and picard is not
    run.

    For input files whose name contains ``.gsnap.bam``, alignments
    carrying the custom ``XT`` tag are filtered into a temporary BAM
    file in the current directory first, and picard is run on that file.
    The temporary file is removed afterwards, also when the run fails.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    # job_memory and job_threads are not referenced again in this
    # function; presumably P.run reads them from this scope (it is handed
    # the picard part of the statement with its %(...)s placeholders
    # still unfilled), so the local names must not be renamed or removed
    # without checking.
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.get_temp_file(".")
        tmpfile_name = tmpf.name
        # Only the name is needed (the shell redirect below writes the
        # file), so release the handle instead of leaking it.
        # NOTE(review): assumes closing the handle does not make the name
        # unusable; the explicit unlink below suggests the file is not
        # auto-deleted - confirm against P.get_temp_file.
        tmpf.close()
        statement = '''samtools view -h %(infile)s | awk "!/\\tXT:/" | samtools view /dev/stdin -S -b > %(tmpfile_name)s; ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += (
        "picard %(picard_opts)s MarkDuplicates "
        "INPUT=%(data_source)s "
        "ASSUME_SORTED=true "
        "METRICS_FILE=%(outfile)s "
        "OUTPUT=/dev/null "
        "VALIDATION_STRINGENCY=SILENT "
    )

    try:
        P.run(statement)
    finally:
        # Remove the filtered copy even if the run failed; the existence
        # check keeps a cleanup problem from masking the original error.
        if ".gsnap.bam" in infile and os.path.exists(tmpfile_name):
            os.unlink(tmpfile_name)
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''Collect alignment metrics with picard:CollectMultipleMetrics.

    The BAM file is streamed through ``cgat bam2bam
    --method=set-sequence`` before it reaches picard; the bam2bam log is
    written to ``outfile + ".bam2bam.log"`` and the console output of
    picard is redirected to `outfile`.

    If :func:`BamTools.getNumReads` reports zero reads for `infile`, a
    warning is issued, an empty `outfile` is created and picard is not
    run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # job_threads and job_memory are not referenced again below;
    # presumably P.run reads them from this scope - keep the names.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % {
        "job_memory": job_memory}

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # Picard seems to have problems if quality information is missing
    # or there is no sequence/quality information within the bam file,
    # hence the bam2bam step that adds it explicitly.
    # NOTE(review): OUTPUT and the redirect share the same path;
    # presumably CollectMultipleMetrics treats OUTPUT as a base name for
    # its result files - confirm.
    statement = (
        "cat %(infile)s "
        "| cgat bam2bam -v 0 --method=set-sequence --output-sam "
        "--log=%(outfile)s.bam2bam.log "
        "| picard %(picard_opts)s CollectMultipleMetrics "
        "INPUT=/dev/stdin "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s"
    )

    P.run(statement)