def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM stream from stdin ("-") and writes mapping statistics
    (alignment-, read- and pair-level counts plus NH/NM/MAPQ tables) to
    the output files managed by the E framework.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                             usage = globals()["__doc__"] )

    parser.add_option( "-r", "--filename-rna", dest="filename_rna", type="string", metavar='GFF',
                       help = "gff formatted file with rna locations. Note that the computation currently "
                       "does not take into account indels, so it is an approximate count only [%default]" )

    parser.add_option( "-f", "--remove-rna", dest="remove_rna", action="store_true",
                       help = "as well as counting, also remove rna reads for duplicate and other counts [%default]" )

    parser.add_option( "-i", "--input-reads", dest="input_reads", type="int",
                       help = "the number of reads - if given, used to provide percentages [%default]" )

    parser.add_option( "--force-output", dest="force_output", action="store_true",
                       help = "output nh/nm stats even if there is only a single count [%default]" )

    parser.add_option( "-d", "--output-details", dest="output_details", action="store_true",
                       help = "output per-read details [%default]" )

    parser.add_option( "-q", "--filename-fastq", dest = "filename_fastq",
                       help = "filename with sequences and quality scores. This file is only "
                       "used to collect sequence identifiers. Thus, for paired end data a "
                       "single file is sufficient [%default]" )

    parser.set_defaults(
        filename_rna = None,
        remove_rna = False,
        input_reads = 0,
        force_output = False,
        filename_fastq = None,
        output_details = False,
        )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    # optional mask: index of RNA intervals used for the rna/no_rna split below
    if options.filename_rna:
        rna = GTF.readAndIndex( GTF.iterator( IOTools.openFile( options.filename_rna ) ) )
    else:
        rna = None

    # BAM input is always taken from stdin in this version
    pysam_in = pysam.Samfile( "-", "rb" )

    if options.output_details:
        outfile_details = E.openOutputFile( "details", "w")
    else:
        outfile_details = None

    if options.filename_fastq and not os.path.exists( options.filename_fastq ):
        raise IOError("file %s does not exist" % options.filename_fastq)

    # single pass over the BAM file; all counting is done in the C extension
    counter, flags_counts, nh_filtered, nh_all, nm_filtered, nm_all, mapq, mapq_all, max_hi = \
        _bam2stats.count( pysam_in,
                          options.remove_rna,
                          rna,
                          filename_fastq = options.filename_fastq,
                          outfile_details = outfile_details )

    if max_hi > 0 and max_hi != max( nh_all.keys() ):
        E.warn( "max_hi(%i) is inconsistent with max_nh (%i) - counts will be corrected" \
                    % (max_hi, max(nh_all.keys())))

    # NOTE(review): 'flags' is computed but never used below — presumably a leftover.
    flags = sorted(flags_counts.keys())

    ###############################
    # alignment level statistics
    ###############################
    outs = options.stdout
    outs.write( "category\tcounts\tpercent\tof\n" )
    outs.write( "alignments_total\t%i\t%5.2f\ttotal\n" % (counter.input, 100.0 ) )

    if counter.input == 0:
        E.warn( "no input - skipped" )
        E.Stop()
        return

    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.input - nalignments_unmapped

    outs.write( "alignments_mapped\t%i\t%5.2f\ttotal\n" % \
                    (nalignments_mapped, 100.0 * nalignments_mapped / counter.input ) )
    outs.write( "alignments_unmapped\t%i\t%5.2f\ttotal\n" % \
                    ( nalignments_unmapped, 100.0 * nalignments_unmapped / counter.input ) )

    if nalignments_mapped == 0:
        E.warn( "no alignments - skipped" )
        E.Stop()
        return

    # per-flag breakdown, normalized by mapped alignments
    for flag, counts in flags_counts.iteritems():
        if flag == "unmapped": continue
        outs.write( "%s\t%i\t%5.2f\talignments_mapped\n" % ( flag, counts, 100.0 * counts / nalignments_mapped ) )

    if options.filename_rna:
        outs.write( "alignments_rna\t%i\t%5.2f\talignments_mapped\n" %
                    (counter.rna, 100.0 * counter.rna / nalignments_mapped ) )
        outs.write( "alignments_no_rna\t%i\t%5.2f\talignments_mapped\n" %
                    (counter.no_rna, 100.0 * counter.no_rna / nalignments_mapped ) )

    outs.write( "alignments_filtered\t%i\t%5.2f\talignments_mapped\n" %
                (counter.filtered, 100.0 * counter.filtered / nalignments_mapped ) )

    # choose normalization label for the duplicate/unique rows
    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        outs.write( "alignments_duplicates\t%i\t%5.2f\t%s\n" %
                    (counter.duplicates, 100.0* counter.duplicates / counter.filtered,
                     normby))
        outs.write( "alignments_unique\t%i\t%5.2f\t%s\n" %
                    (counter.filtered - counter.duplicates,
                     100.0*(counter.filtered - counter.duplicates)/counter.filtered,
                     normby) )

    ###############################
    # read level statistics
    ###############################
    # derive the number of mapped reads in file from alignment counts
    nreads_unmapped = flags_counts["unmapped"]
    nreads_mapped = computeMappedReadsFromAlignments( nalignments_mapped, nh_all, max_hi )

    nreads_missing = 0
    if options.input_reads:
        nreads_total = options.input_reads
        # unmapped reads in bam file?
        if nreads_unmapped:
            nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
        else:
            nreads_unmapped = nreads_total - nreads_mapped
    elif nreads_unmapped:
        # if unmapped reads are in bam file, take those
        nreads_total = nreads_mapped + nreads_unmapped
    else:
        # otherwise normalize by mapped reads
        nreads_unmapped = 0
        nreads_total = nreads_mapped

    outs.write( "reads_total\t%i\t%5.2f\treads_total\n" %
                (nreads_total, 100.0 ) )
    outs.write( "reads_mapped\t%i\t%5.2f\treads_total\n" %
                (nreads_mapped, 100.0 * nreads_mapped / nreads_total ) )
    outs.write( "reads_unmapped\t%i\t%5.2f\treads_total\n" %
                (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total ) )
    outs.write( "reads_missing\t%i\t%5.2f\treads_total\n" %
                (nreads_missing, 100.0 * nreads_missing / nreads_total ) )

    if len(nh_all) > 1:
        outs.write( "reads_unique\t%i\t%5.2f\treads_mapped\n" %
                    (nh_all[1], 100.0 * nh_all[1] / nreads_mapped ) )

    # compute after filtering
    # not that these are rough guesses
    if options.filename_rna:
        nreads_norna = computeMappedReadsFromAlignments( counter.filtered,
                                                         nh_filtered,
                                                         max_hi )
        outs.write( "reads_norna\t%i\t%5.2f\treads_mapped\n" %
                    (nreads_norna, 100.0 * nreads_norna / nreads_mapped ) )
        if len(nh_filtered) > 1:
            outs.write( "reads_norna_unique\t%i\t%5.2f\treads_norna\n" %
                        (nh_filtered[1], 100.0 * nh_filtered[1] / nreads_norna ) )

    pysam_in.close()

    ###############################
    # pair level statistics
    ###############################
    # output paired end data
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            # exact pair counts are available only when read ids were collected
            pairs_mapped = counter.total_pairs - counter.total_pair_is_unmapped
            outs.write( "pairs_total\t%i\t%5.2f\tpairs_total\n" % \
                            (counter.total_pairs, 100.0 * counter.total_pairs / counter.total_pairs ) )
            outs.write( "pairs_mapped\t%i\t%5.2f\tpairs_total\n" % \
                            (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write( "pairs_unmapped\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_unmapped, 100.0 * counter.total_pair_is_unmapped / counter.total_pairs ) )
            outs.write( "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_proper_uniq, 100.0 * counter.total_pair_is_proper_uniq / counter.total_pairs ) )
            outs.write( "pairs_incomplete\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_incomplete, 100.0 * counter.total_pair_is_incomplete / counter.total_pairs ) )
            outs.write( "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_proper_duplicate, 100.0 * counter.total_pair_is_proper_duplicate / counter.total_pairs ) )
            outs.write( "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_proper_mmap, 100.0 * counter.total_pair_is_proper_mmap / counter.total_pairs ) )
            outs.write( "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_not_proper_uniq, 100.0 * counter.total_pair_not_proper_uniq / counter.total_pairs ) )
            outs.write( "pairs_other\t%i\t%5.2f\tpairs_total\n" % \
                            ( counter.total_pair_is_other, 100.0 * counter.total_pair_is_other / counter.total_pairs ) )
        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            outs.write( "pairs_total\t%i\t%5.2f\tpairs_total\n" % \
                            (pairs_total, 100.0))
            outs.write( "pairs_mapped\t%i\t%5.2f\tpairs_total\n" % \
                            (pairs_mapped, 100.0 * pairs_mapped / pairs_total))
    else:
        # single-end data: emit zero rows so the table layout is stable
        pairs_total = pairs_mapped = 0
        outs.write( "pairs_total\t%i\t%5.2f\tpairs_total\n" % (pairs_total,0.0))
        outs.write( "pairs_mapped\t%i\t%5.2f\tpairs_total\n" % (pairs_mapped, 0.0))

    ###############################
    # histogram outputs (NM, NH, MAPQ)
    ###############################
    if options.force_output or len(nm_filtered) > 0:
        outfile = E.openOutputFile( "nm", "w" )
        outfile.write( "NM\talignments\n" )
        if len(nm_filtered) > 0:
            for x in xrange( 0, max( nm_filtered.keys() ) + 1 ):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write( "0\t%i\n" % (counter.filtered) )
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.openOutputFile( "nh_all", "w")
        outfile.write( "NH\treads\n" )
        if len(nh_all) > 0:
            writeNH( outfile, nh_all, max_hi )
        else:
            # assume all are unique if NH flag not set
            outfile.write( "1\t%i\n" % (counter.mapped_reads) )
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.openOutputFile( "nh", "w")
        outfile.write( "NH\treads\n" )
        if len(nh_filtered) > 0:
            writeNH( outfile, nh_filtered, max_hi )
        else:
            # assume all are unique if NH flag not set
            outfile.write( "1\t%i\n" % (counter.filtered) )
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.openOutputFile( "mapq", "w")
        outfile.write( "mapq\tall_reads\tfiltered_reads\n" )
        for x in xrange( 0, max( mapq_all.keys() ) + 1 ):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    ## write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM file (positional argument, stdin, or -I redirect) and
    writes mapping statistics (alignment-, read- and pair-level counts
    plus NM/NH/MAPQ tables) to the output files managed by E.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--mask-bed-file", dest="filename_rna", type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f", "--ignore-masked-reads", dest="remove_rna", action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i", "--num-reads", dest="input_reads", type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d", "--output-details", dest="output_details", action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "-q", "--fastq-file", dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.set_defaults(
        filename_rna=None,
        remove_rna=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        output_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # optional mask: interval index used for the rna/no_rna split below
    if options.filename_rna:
        rna = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.filename_rna)))
    else:
        rna = None

    # input priority: positional argument > true stdin > redirected stream
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")

    if options.output_details:
        outfile_details = E.openOutputFile("details", "w")
    else:
        outfile_details = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    # single pass over the BAM file; all counting is done in the C extension
    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all,
     mapq, mapq_all, max_hi) = \
        _bam2stats.count(pysam_in,
                         options.remove_rna, rna,
                         filename_fastq=options.filename_fastq,
                         outfile_details=outfile_details)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected"
               % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        # emit one table row: category, count, percent (of denominator), base
        percent = IOTools.prettyPercent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs,
           "alignments_total",
           counter.alignments_input,
           counter.alignments_input,
           "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.Stop()
        return

    _write(outs,
           "alignments_mapped",
           nalignments_mapped,
           counter.alignments_input,
           'alignments_total')
    _write(outs,
           "alignments_unmapped",
           nalignments_unmapped,
           counter.alignments_input,
           'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.Stop()
        return

    # per-flag breakdown, normalized by mapped alignments
    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs,
               'alignments_' + flag,
               counts,
               nalignments_mapped,
               'alignments_mapped')

    if options.filename_rna:
        _write(outs,
               "alignments_rna", counter.alignments_rna,
               nalignments_mapped,
               'alignments_mapped')
        _write(outs,
               "alignments_no_rna", counter.alignments_no_rna,
               nalignments_mapped,
               'alignments_mapped')

    _write(outs,
           "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped,
           "alignments_mapped")

    # choose normalization label for the duplicate/unique rows
    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs,
               "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)
        # BUG FIX: was counter.aligmnments_filtered (misspelled attribute);
        # counter.alignments_filtered is the attribute used above, so the
        # original raised AttributeError whenever counter.filtered > 0.
        _write(outs,
               "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq:
        # exact read counts collected via the fastq identifiers
        nreads_total = counter.total_read

        _write(outs,
               "reads_total",
               counter.total_read,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_unmapped",
               counter.total_read_is_unmapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped",
               counter.total_read_is_mapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_missing",
               counter.total_read_is_missing,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped_unique",
               counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_multimapping",
               counter.total_read_is_mmap,
               counter.total_read_is_mapped,
               'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(nalignments_mapped,
                                                         nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped
        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    # compute after filtering
    # not that these are rough guesses
    # NOTE(review): nreads_mapped is only bound in the non-fastq branch
    # above; combining --mask-bed-file with --fastq-file would raise a
    # NameError here — confirm intended option combinations.
    if options.filename_rna:
        nreads_norna = computeMappedReadsFromAlignments(
            counter.filtered, nh_filtered, max_hi)

        _write(outs,
               "reads_norna", nreads_norna,
               nreads_mapped,
               "reads_mapped")
        if len(nh_filtered) > 1:
            _write(outs,
                   "reads_norna_unique", nh_filtered[1],
                   nreads_norna,
                   "reads_mapped")

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                       (pairs_mapped,
                        100.0 * pairs_mapped / counter.total_pairs))
            outs.write(
                "pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_unmapped,
                 100.0 * counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq,
                 100.0 * counter.total_pair_is_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq,
                 100.0 * counter.total_pair_is_incomplete_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap,
                 100.0 * counter.total_pair_is_incomplete_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate,
                 100.0 * counter.total_pair_is_proper_duplicate /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap,
                 100.0 * counter.total_pair_is_proper_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq,
                 100.0 * counter.total_pair_not_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_other\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_other,
                 100.0 * counter.total_pair_is_other /
                 counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs,
                   "read1_total",
                   counter.total_read1,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_unmapped",
                   counter.total_read1_is_unmapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped",
                   counter.total_read1_is_mapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            # NOTE(review): label repeats "reads_multimapping" from the read
            # section — likely intended "read1_multimapping"; kept as-is to
            # avoid breaking downstream parsers. Confirm before renaming.
            _write(outs,
                   "reads_multimapping",
                   counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            # NOTE(review): denominator is read1_mapped but base says
            # read1_total — confirm which normalization is intended.
            _write(outs,
                   "read1_missing",
                   counter.total_read1_is_missing,
                   counter.total_read1_is_mapped,
                   'read1_total')

            nread2_total = counter.total_read2
            _write(outs,
                   "read2_total",
                   counter.total_read2,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_unmapped",
                   counter.total_read2_is_unmapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped",
                   counter.total_read2_is_mapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            # NOTE(review): see read1 note — likely "read2_multimapping".
            _write(outs,
                   "reads_multimapping",
                   counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            _write(outs,
                   "read2_missing",
                   counter.total_read2_is_missing,
                   counter.total_read2_is_mapped,
                   'read2_total')
        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs,
                   "pairs_total", pairs_total,
                   pairs_total, "pairs_total")
            _write(outs,
                   "pairs_mapped", pairs_mapped,
                   pairs_total, "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    ###############################
    # histogram outputs (NM, NH, MAPQ)
    ###############################
    if options.force_output or len(nm_filtered) > 0:
        outfile = E.openOutputFile("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.openOutputFile("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.openOutputFile("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.openOutputFile("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM stream from stdin and writes mapping statistics
    (alignment-, read- and pair-level counts plus NM/NH/MAPQ tables)
    to the output files managed by E.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--mask-bed-file", dest="filename_rna", type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f", "--ignore-masked-reads", dest="remove_rna", action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i", "--num-reads", dest="input_reads", type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d", "--output-details", dest="output_details", action="store_true",
        help="output per-read details [%default]")

    parser.add_option(
        "-q", "--fastq-file", dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.set_defaults(
        filename_rna=None,
        remove_rna=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        output_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # optional mask: interval index used for the rna/no_rna split below
    if options.filename_rna:
        rna = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.filename_rna)))
    else:
        rna = None

    # this version only reads from true stdin; -I redirection unsupported
    if options.stdin == sys.stdin:
        pysam_in = pysam.Samfile("-", "rb")
    else:
        raise NotImplementedError("-I option not implemented")

    if options.output_details:
        outfile_details = E.openOutputFile("details", "w")
    else:
        outfile_details = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    # single pass over the BAM file; all counting is done in the C extension
    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all,
     mapq, mapq_all, max_hi) = \
        _bam2stats.count(pysam_in,
                         options.remove_rna, rna,
                         filename_fastq=options.filename_fastq,
                         outfile_details=outfile_details)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) - counts will be corrected"
               % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        # emit one table row: category, count, percent (of denominator), base
        percent = IOTools.prettyPercent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs,
           "alignments_total",
           counter.alignments_input,
           counter.alignments_input,
           "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.Stop()
        return

    _write(outs,
           "alignments_mapped",
           nalignments_mapped,
           counter.alignments_input,
           'alignments_total')
    _write(outs,
           "alignments_unmapped",
           nalignments_unmapped,
           counter.alignments_input,
           'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.Stop()
        return

    # per-flag breakdown, normalized by mapped alignments
    for flag, counts in flags_counts.iteritems():
        if flag == "unmapped":
            continue
        _write(outs,
               'alignments_' + flag,
               counts,
               nalignments_mapped,
               'alignments_mapped')

    if options.filename_rna:
        _write(outs,
               "alignments_rna", counter.alignments_rna,
               nalignments_mapped,
               'alignments_mapped')
        _write(outs,
               "alignments_no_rna", counter.alignments_no_rna,
               nalignments_mapped,
               'alignments_mapped')

    _write(outs,
           "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped,
           "alignments_mapped")

    # choose normalization label for the duplicate/unique rows
    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs,
               "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)
        # BUG FIX: was counter.aligmnments_filtered (misspelled attribute);
        # counter.alignments_filtered is the attribute used above, so the
        # original raised AttributeError whenever counter.filtered > 0.
        _write(outs,
               "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq:
        # exact read counts collected via the fastq identifiers
        nreads_total = counter.total_read

        _write(outs,
               "reads_total",
               counter.total_read,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_unmapped",
               counter.total_read_is_unmapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped",
               counter.total_read_is_mapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_missing",
               counter.total_read_is_missing,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped_unique",
               counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_multimapping",
               counter.total_read_is_mmap,
               counter.total_read_is_mapped,
               'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(nalignments_mapped,
                                                         nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped
        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    # compute after filtering
    # not that these are rough guesses
    # NOTE(review): nreads_mapped is only bound in the non-fastq branch
    # above; combining --mask-bed-file with --fastq-file would raise a
    # NameError here — confirm intended option combinations.
    if options.filename_rna:
        nreads_norna = computeMappedReadsFromAlignments(
            counter.filtered, nh_filtered, max_hi)

        _write(outs,
               "reads_norna", nreads_norna,
               nreads_mapped,
               "reads_mapped")
        if len(nh_filtered) > 1:
            _write(outs,
                   "reads_norna_unique", nh_filtered[1],
                   nreads_norna,
                   "reads_mapped")

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                       (pairs_mapped,
                        100.0 * pairs_mapped / counter.total_pairs))
            outs.write(
                "pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_unmapped,
                 100.0 * counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq,
                 100.0 * counter.total_pair_is_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq,
                 100.0 * counter.total_pair_is_incomplete_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap,
                 100.0 * counter.total_pair_is_incomplete_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate,
                 100.0 * counter.total_pair_is_proper_duplicate /
                 counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap,
                 100.0 * counter.total_pair_is_proper_mmap /
                 counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq,
                 100.0 * counter.total_pair_not_proper_uniq /
                 counter.total_pairs))
            outs.write(
                "pairs_other\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_other,
                 100.0 * counter.total_pair_is_other /
                 counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs,
                   "read1_total",
                   counter.total_read1,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_unmapped",
                   counter.total_read1_is_unmapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped",
                   counter.total_read1_is_mapped,
                   nread1_total,
                   'read1_total')
            _write(outs,
                   "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            # NOTE(review): label repeats "reads_multimapping" from the read
            # section — likely intended "read1_multimapping"; kept as-is to
            # avoid breaking downstream parsers. Confirm before renaming.
            _write(outs,
                   "reads_multimapping",
                   counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped,
                   'read1_mapped')
            # NOTE(review): denominator is read1_mapped but base says
            # read1_total — confirm which normalization is intended.
            _write(outs,
                   "read1_missing",
                   counter.total_read1_is_missing,
                   counter.total_read1_is_mapped,
                   'read1_total')

            nread2_total = counter.total_read2
            _write(outs,
                   "read2_total",
                   counter.total_read2,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_unmapped",
                   counter.total_read2_is_unmapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped",
                   counter.total_read2_is_mapped,
                   nread2_total,
                   'read2_total')
            _write(outs,
                   "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            # NOTE(review): see read1 note — likely "read2_multimapping".
            _write(outs,
                   "reads_multimapping",
                   counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped,
                   'read2_mapped')
            _write(outs,
                   "read2_missing",
                   counter.total_read2_is_missing,
                   counter.total_read2_is_mapped,
                   'read2_total')
        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs,
                   "pairs_total", pairs_total,
                   pairs_total, "pairs_total")
            _write(outs,
                   "pairs_mapped", pairs_mapped,
                   pairs_total, "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    ###############################
    # histogram outputs (NM, NH, MAPQ)
    ###############################
    if options.force_output or len(nm_filtered) > 0:
        outfile = E.openOutputFile("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in xrange(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.openOutputFile("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.openOutputFile("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.openOutputFile("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in xrange(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    # write footer and output benchmark information.
    E.Stop()