def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Assemble a ``picard CollectHsMetrics`` command line for the given
    bait and target intervals and submit it via ``P.run()``.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits :
        :term:`bed` formatted file of bait regions
        (passed as ``BAIT_INTERVALS``).
    regions :
        :term:`bed` formatted file of target regions
        (passed as ``TARGET_INTERVALS``).
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory` and `job_threads` from this
    # frame - keep these local names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        # The pieces are joined with single spaces into one command line;
        # it is interpolated here, before being handed to P.run().
        statement = (
            "picard %(picard_opts)s CollectHsMetrics "
            "BAIT_INTERVALS=%(baits)s "
            "TARGET_INTERVALS=%(regions)s "
            "INPUT=%(infile)s "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=LENIENT"
        ) % locals()
        P.run()
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics

    Collect GC bias metrics.

    Builds a ``picard CollectGcBiasMetrics`` command and submits it via
    ``P.run()``. The chart is written to ``<outfile>.pdf`` and the
    summary to ``<outfile>.summary``; stdout/stderr of the command are
    redirected to `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        # NOTE(review): OUTPUT= and the ">&" redirect both target
        # `outfile` - confirm this is intended.
        statement = (
            "picard %(picard_opts)s CollectGcBiasMetrics "
            "INPUT=%(infile)s "
            "REFERENCE_SEQUENCE=%(genome_file)s "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=SILENT "
            "CHART_OUTPUT=%(outfile)s.pdf "
            "SUMMARY_OUTPUT=%(outfile)s.summary "
            ">& %(outfile)s"
        )
        P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Builds a ``picard CollectMultipleMetrics`` command that reads
    `infile` directly and submits it via ``P.run()``. stdout/stderr of
    the command are redirected to `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # NOTE(review): a function with this same name is defined again
    # further down in this file; in Python the last definition wins, so
    # this variant may be dead code - confirm and remove the duplicate.
    #
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        statement = (
            "picard %(picard_opts)s CollectMultipleMetrics "
            "INPUT=%(infile)s "
            "REFERENCE_SEQUENCE=%(genome_file)s "
            "ASSUME_SORTED=true "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=SILENT "
            ">& %(outfile)s"
        )
        P.run()
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the resulting .bam
    file: MarkDuplicates writes its BAM output to `outfile`
    (``OUTPUT=``), the metrics go to ``<outfile>.duplicate_metrics``
    and `outfile` is indexed with ``samtools index`` afterwards.

    Pair duplication is properly handled, including inter-chromosomal
    cases. SE data is also handled. These stats also contain a
    histogram that estimates the return from additional sequencing.

    Note that picard counts reads but they are in fact alignments.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename; receives the BAM output of MarkDuplicates.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        # Two shell commands in one statement: mark duplicates, then
        # index the BAM that was just written.
        statement = (
            "picard %(picard_opts)s MarkDuplicates "
            "INPUT=%(infile)s "
            "ASSUME_SORTED=true "
            "METRICS_FILE=%(outfile)s.duplicate_metrics "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=SILENT; "
            "samtools index %(outfile)s ;"
        )
        P.run()
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Builds a ``picard CollectInsertSizeMetrics`` command and submits it
    via ``P.run()``. stdout/stderr of the command are redirected to
    `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # NOTE(review): a function with this same name is defined again
    # further down in this file; in Python the last definition wins, so
    # this copy may be dead code - confirm and remove the duplicate.
    #
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_memory = PICARD_MEMORY
    # Pass the memory limit on to the JVM, as the other picard wrappers
    # in this file do; previously job_memory was set but the tool was
    # invoked without any JVM options.
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # NOTE(review): OUTPUT= and the ">&" redirect both target `outfile`
    # - confirm this is intended.
    statement = (
        "picard %(picard_opts)s CollectInsertSizeMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s"
    )
    P.run()
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:RNASeqMetrics

    Builds a ``picard CollectRnaSeqMetrics`` command and submits it via
    ``P.run()``.

    If the BAM file contains no reads a warning is issued, `outfile`
    is touched and picard is not run.

    Arguments
    ---------
    infiles : pair
        Unpacked as ``(infile, genome)``: the input filename in
        :term:`BAM` format and the genome file in refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand :
        Value passed to picard as ``STRAND=``.
    outfile : string
        Output filename with picard output.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    infile, genome = infiles

    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        statement = (
            "picard %(picard_opts)s CollectRnaSeqMetrics "
            "REF_FLAT=%(genome)s "
            "INPUT=%(infile)s "
            "ASSUME_SORTED=true "
            "OUTPUT=%(outfile)s "
            "STRAND=%(strand)s "
            "VALIDATION_STRINGENCY=SILENT "
        )
        P.run()
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Builds a ``picard CollectInsertSizeMetrics`` command and submits it
    via ``P.run()``. stdout/stderr of the command are redirected to
    `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_memory = PICARD_MEMORY
    # Pass the memory limit on to the JVM, as the other picard wrappers
    # in this file do; previously job_memory was set but the tool was
    # invoked without any JVM options.
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # NOTE(review): OUTPUT= and the ">&" redirect both target `outfile`
    # - confirm this is intended.
    statement = (
        "picard %(picard_opts)s CollectInsertSizeMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s"
    )
    P.run()
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records are
    discarded (``OUTPUT=/dev/null``); the metrics are written to
    `outfile`.

    For input files whose name contains ``.gsnap.bam``, records
    carrying an ``XT`` tag are filtered out into a temporary BAM file
    first, and MarkDuplicates is run on that file instead.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    # NOTE(review): a function with this same name is defined again
    # further down in this file; in Python the last definition wins, so
    # this copy may be dead code - confirm and remove the duplicate.
    #
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    tmpfile_name = None

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        # Only the name is needed - the shell redirect below writes the
        # file - so release the handle instead of leaking it.
        # NOTE(review): assumes P.getTempFile returns an open file object
        # that is not deleted on close (the explicit unlink below
        # suggests so) - confirm.
        tmpf.close()
        statement = (
            'samtools view -h %(infile)s | '
            'awk "!/\\tXT:/" | '
            'samtools view /dev/stdin -S -b > %(tmpfile_name)s; '
        ) % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += (
        "MarkDuplicates "
        "INPUT=%(data_source)s "
        "ASSUME_SORTED=true "
        "METRICS_FILE=%(outfile)s "
        "OUTPUT=/dev/null "
        "VALIDATION_STRINGENCY=SILENT "
    )

    # JVM options are handed over through the environment (presumably
    # read by the MarkDuplicates wrapper - verify).
    os.environ["CGAT_JAVA_OPTS"] = (
        "-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC" % job_memory)
    try:
        P.run()
    finally:
        # os.unsetenv() would leave os.environ stale; removing the key
        # from os.environ updates both the mapping and the process
        # environment. Clean up even if the job failed.
        os.environ.pop("CGAT_JAVA_OPTS", None)
        if tmpfile_name is not None:
            os.unlink(tmpfile_name)
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records are
    discarded (``OUTPUT=/dev/null``); the metrics are written to
    `outfile`.

    For input files whose name contains ``.gsnap.bam``, records
    carrying an ``XT`` tag are filtered out into a temporary BAM file
    first, and MarkDuplicates is run on that file instead.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    tmpfile_name = None

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        # Only the name is needed - the shell redirect below writes the
        # file - so release the handle instead of leaking it.
        # NOTE(review): assumes P.getTempFile returns an open file object
        # that is not deleted on close (the explicit unlink below
        # suggests so) - confirm.
        tmpf.close()
        statement = (
            'samtools view -h %(infile)s | '
            'awk "!/\\tXT:/" | '
            'samtools view /dev/stdin -S -b > %(tmpfile_name)s; '
        ) % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += (
        "MarkDuplicates "
        "INPUT=%(data_source)s "
        "ASSUME_SORTED=true "
        "METRICS_FILE=%(outfile)s "
        "OUTPUT=/dev/null "
        "VALIDATION_STRINGENCY=SILENT "
    )

    # JVM options are handed over through the environment (presumably
    # read by the MarkDuplicates wrapper - verify).
    os.environ["CGAT_JAVA_OPTS"] = (
        "-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC" % job_memory)
    try:
        P.run()
    finally:
        # os.unsetenv() would leave os.environ stale; removing the key
        # from os.environ updates both the mapping and the process
        # environment. Clean up even if the job failed.
        os.environ.pop("CGAT_JAVA_OPTS", None)
        if tmpfile_name is not None:
            os.unlink(tmpfile_name)
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Pipes `infile` through ``cgat bam2bam --method=set-sequence`` and
    feeds the resulting SAM stream to ``picard CollectMultipleMetrics``
    on stdin; the command is submitted via ``P.run()``. stdout/stderr
    of the command are redirected to `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # NOTE(review): a function with this same name is defined again
    # further down in this file; in Python the last definition wins, so
    # this copy may be dead code - confirm and remove the duplicate.
    #
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        # Picard seems to have problem if quality information is missing
        # or there is no sequence/quality information within the bam
        # file. Thus, add it explicitly.
        statement = (
            "cat %(infile)s | "
            "cgat bam2bam -v 0 --method=set-sequence --output-sam | "
            "picard %(picard_opts)s CollectMultipleMetrics "
            "INPUT=/dev/stdin "
            "REFERENCE_SEQUENCE=%(genome_file)s "
            "ASSUME_SORTED=true "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=SILENT "
            ">& %(outfile)s"
        )
        P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Pipes `infile` through ``cgat bam2bam --method=set-sequence`` and
    feeds the resulting SAM stream to ``picard CollectMultipleMetrics``
    on stdin; the command is submitted via ``P.run()``. stdout/stderr
    of the command are redirected to `outfile`.

    If `infile` contains no reads a warning is issued, `outfile` is
    touched and picard is not run.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up `statement`, `job_memory`, `job_threads` and the values
    # for the %(...)s placeholders from this frame - keep the local
    # names; confirm against the Pipeline module.
    job_threads = 3
    job_memory = PICARD_MEMORY
    picard_opts = ('-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC'
                   % (job_memory,))

    if BamTools.getNumReads(infile) == 0:
        # Nothing to measure: warn and only touch the output.
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
    else:
        # Picard seems to have problem if quality information is missing
        # or there is no sequence/quality information within the bam
        # file. Thus, add it explicitly.
        statement = (
            "cat %(infile)s | "
            "cgat bam2bam -v 0 --method=set-sequence --output-sam | "
            "picard %(picard_opts)s CollectMultipleMetrics "
            "INPUT=/dev/stdin "
            "REFERENCE_SEQUENCE=%(genome_file)s "
            "ASSUME_SORTED=true "
            "OUTPUT=%(outfile)s "
            "VALIDATION_STRINGENCY=SILENT "
            ">& %(outfile)s"
        )
        P.run()