def frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, ref_cov=True, verbose=True):
    """ Calculate fragment coverage vectors on the forward and reverse strands.

    :param bam: Input BAM file.
    :param chrom_lengths: Dictionary of chromosome names and lengths.
    :param region: Restrict parsing to the specified region.
    :param min_aqual: Minimum mapping quality.
    :param ref_cov: Also record per-alignment reference coverage fractions.
    :param verbose: Display progress bar.
    :returns: Forward and reverse fragment coverage vectors (plus per-alignment
        reference coverage fractions if ref_cov is True).
    :rtype: dict
    """
    frags_fwd = defaultdict(lambda: defaultdict(int))
    frags_rev = defaultdict(lambda: defaultdict(int))
    aln_ref_cov = defaultdict(list)

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')

    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except (AttributeError, ValueError):
        total_reads = None

    if verbose and region is None:
        sys.stdout.write(
            "Gathering fragment statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Skip unmapped reads:
        if r.is_unmapped:
            continue
        # Skip if mapping quality is too low:
        if r.mapq < min_aqual:
            continue
        pos = r.reference_start
        ref = r.reference_name
        if r.is_reverse:
            frags_rev[ref][pos] += 1
        else:
            frags_fwd[ref][pos] += 1
        if ref_cov:
            aln_ref_cov[ref].append(
                r.reference_length / float(chrom_lengths[ref]))

    frags_fwd = _frag_dict_to_array(frags_fwd, chrom_lengths)
    frags_rev = _frag_dict_to_array(frags_rev, chrom_lengths)

    res = {'frags_fwd': frags_fwd,
           'frags_rev': frags_rev,
           'ref_cov': aln_ref_cov}
    return res
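# Example usage (a minimal sketch; 'reads.bam' and the chromosome name/length
# below are hypothetical, and the coverage vectors are assumed to come back as
# per-chromosome numpy arrays from _frag_dict_to_array):
#
#     chrom_lengths = {'chr1': 248956422}
#     cov = frag_coverage('reads.bam', chrom_lengths, min_aqual=10)
#     total_fwd_frags = cov['frags_fwd']['chr1'].sum()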
def _process_bam(bam, out_tsv, chrom_lengths, region=None, min_aqual=0, verbose=True):
    """ Write per-read alignment statistics to a tab-separated file.

    :param bam: Input BAM file.
    :param out_tsv: Output TSV file.
    :param chrom_lengths: Dictionary of chromosome names and lengths.
    :param region: Restrict parsing to the specified region.
    :param min_aqual: Minimum mapping quality.
    :param verbose: Display progress bar.
    :returns: None
    """
    bam_reader = bam_common.pysam_open(bam, in_format='BAM')

    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except (AttributeError, ValueError):
        total_reads = None

    if verbose and region is None:
        sys.stdout.write(
            "Gathering read statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    tsv = open(out_tsv, "w")
    tsv.write(
        "Read\tRef\tStrand\tRefCov\tReadCov\tReadLength\tReadAlnLength\tRefLength\tRefAlnLength\tMapQual\n")

    for r in bam_iter:
        # Skip unmapped reads:
        if r.is_unmapped:
            continue
        # Skip if mapping quality is too low:
        if r.mapq < min_aqual:
            continue
        strand = '-' if r.is_reverse else '+'
        ref = r.reference_name
        ref_cov = r.reference_length / float(chrom_lengths[ref])
        read = r.query_name
        read_length = r.infer_read_length()
        mapq = r.mapping_quality
        read_aln_len = r.query_alignment_length
        read_cov = read_aln_len / float(read_length)
        ref_aln_length = r.reference_length

        tsv.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            read, ref, strand, ref_cov, read_cov, read_length,
            read_aln_len, chrom_lengths[ref], ref_aln_length, mapq))

    tsv.flush()
    tsv.close()
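# Example usage (a sketch; the file names are hypothetical):
#
#     _process_bam('reads.bam', 'read_stats.tsv', {'chr1': 248956422}, min_aqual=10)
#     # read_stats.tsv now contains one tab-separated row per mapped read,
#     # with the columns listed in the header written above.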
def pileup_stats(bam, region=None, verbose=True, with_quals=True):
    """ Parse pileup columns and extract quality values.

    :param bam: Input BAM file.
    :param region: samtools region.
    :param verbose: Show progress bar.
    :param with_quals: Return quality values per position.
    :returns: Dictionaries per reference with per-base coverage and quality values.
    :rtype: dict
    """
    st = defaultdict(lambda: defaultdict(list))
    cst = defaultdict(lambda: defaultdict(int))
    samfile = bam_common.pysam_open(bam, in_format='BAM')
    pileup_iter = samfile.pileup(region=region, min_base_quality=0)

    start, end = None, None
    if region is not None:
        # Parse a samtools-style region of the form chrom:start-end:
        _, _, coords = region.partition(":")
        start_str, _, end_str = coords.partition("-")
        start, end = int(start_str) - 1, int(end_str)

    if verbose:
        sys.stdout.write(
            "Gathering pileup statistics from file: {}\n".format(bam))
        total_bases = sum(samfile.lengths)
        if region is not None:
            total_bases = end - start
        pileup_iter = tqdm.tqdm(pileup_iter, total=total_bases)

    for pileupcolumn in pileup_iter:
        # Skip columns outside the requested region:
        if region is not None and (pileupcolumn.reference_pos < start or pileupcolumn.reference_pos >= end):
            continue
        cst[pileupcolumn.reference_name][
            pileupcolumn.reference_pos] = pileupcolumn.nsegments
        for pileupread in pileupcolumn.pileups:
            if not pileupread.is_del and not pileupread.is_refskip:
                if (pileupread.alignment.query_qualities is not None) and with_quals:
                    st[pileupcolumn.reference_name][
                        pileupcolumn.reference_pos].append(
                            pileupread.alignment.query_qualities[
                                pileupread.query_position])

    samfile.close()
    return {'qualities': dict(st), 'coverage': dict(cst)}
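# Example usage (a sketch; 'reads.bam' must be indexed and the region string
# is hypothetical):
#
#     ps = pileup_stats('reads.bam', region='chr1:1000-2000')
#     quals = ps['qualities']['chr1']  # position -> list of base qualities
#     mean_quals = {pos: sum(q) / float(len(q)) for pos, q in quals.items()}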
parser.add_argument(
    '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM')
parser.add_argument(
    '-s', metavar='strategy', type=str,
    help="Filtering strategy: top_per_query, query_coverage, ref_coverage (top_per_query).",
    default="top_per_query",
    choices=['top_per_query', 'query_coverage', 'ref_coverage'])
parser.add_argument(
    '-q', metavar='query_cover', type=float,
    help="Minimum query coverage fraction (0.8).", default=0.8)
parser.add_argument(
    'infile', metavar='input_file', type=str, help="Input file.")
parser.add_argument(
    'outfile', metavar='output_file', type=str, help="Output SAM file.")


if __name__ == '__main__':
    args = parser.parse_args()

    input_iter = bam_common.pysam_open(args.infile, args.f)

    if args.s == 'top_per_query':
        output_iter = bam_filter.filter_top_per_query(
            input_iter.fetch(until_eof=True))
    elif args.s == 'query_coverage':
        output_iter = bam_filter.filter_query_coverage(
            input_iter.fetch(until_eof=True), args.q)
    elif args.s == 'ref_coverage':
        output_iter = bam_filter.filter_ref_coverage(
            input_iter.fetch(until_eof=True), args.q, input_iter.header)
    else:
        raise Exception('Filtering strategy not implemented!')

    writer = pysam.AlignmentFile(
        args.outfile, "wh", template=input_iter, header=input_iter.header)

    for record in output_iter:
        writer.write(record)

    writer.close()
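# Example invocation (a sketch; the script and file names are hypothetical):
#
#     python bam_filter.py -f SAM -s query_coverage -q 0.9 input.sam filtered.sam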
parser.add_argument(
    '-q', metavar='fastq', type=str, help="Input fastq.", required=True)
parser.add_argument(
    'infile', metavar='input_file', type=str, help="Input file.")
parser.add_argument(
    'outfile', metavar='output_file', type=str, help="Output SAM file.")


if __name__ == '__main__':
    args = parser.parse_args()

    input_iter = bam_common.pysam_open(args.infile, args.f).fetch(until_eof=True)

    # Get SAM record names (a set for fast membership tests):
    sam_names = set(record.query_name for record in input_iter)

    writer = sam_writer.SamWriter(args.outfile)

    for read in seq_util.read_seq_records(args.q, 'fastq'):
        if read.id not in sam_names:
            qual = seq_util.quality_array_to_string(
                read.letter_annotations["phred_quality"])
            sam_record = writer.new_sam_record(qname=read.id, flag=4, rname="*", pos=0, mapq=0,
parser.add_argument(
    '-x', action="store_true",
    help="Sort by number of read bases instead of number of aligned reference bases.",
    default=False)
parser.add_argument(
    '-Q', action="store_true",
    help="Be quiet and do not print progress bar (False).", default=False)
parser.add_argument(
    'bam', metavar='bam', type=str, help="Input BAM file.")


if __name__ == '__main__':
    args = parser.parse_args()
    verbose = not args.Q

    bam_reader = bam_common.pysam_open(args.bam, in_format='BAM')

    if verbose:
        sys.stdout.write(
            "Gathering read and alignment lengths from file: {}\n".format(
                args.bam))
        try:
            total_reads = bam_reader.mapped + bam_reader.unmapped
        except (AttributeError, ValueError):
            total_reads = None
        bam_reader = tqdm.tqdm(bam_reader, total=total_reads)

    read_names = []
    ref_names = []
    ref_lengths = []
    read_lengths = []
def error_and_read_stats(bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=True):
    """Gather read statistics and context-dependent error statistics from BAM file.

    WARNING: contexts overstepping the reference start/end boundaries are not registered.

    Definition of context: for substitutions the event is happening from the
    "central base"; in the case of indels the events are located between the
    central base and the base before.

    :param bam: Input BAM file.
    :param refs: Dictionary of references.
    :param context_sizes: The size of the left and right contexts.
    :param region: samtools region.
    :param min_aqual: Minimum mapping quality.
    :param verbose: Show progress bar.
    :returns: Dictionary with read and error statistics.
    :rtype: dict
    """
    events = defaultdict(lambda: defaultdict(int))
    read_stats = {'unmapped': 0,
                  'mapped': 0,
                  'unaligned_quals': [],
                  'unaligned_lengths': [],
                  'aligned_quals': [],
                  'alignment_lengths': [],
                  'aligned_lengths': [],
                  'mqfail_aligned_quals': [],
                  'mqfail_alignment_lengths': [],
                  'mapping_quals': [],
                  }
    indel_dists = {'insertion_lengths': defaultdict(int),
                   'deletion_lengths': defaultdict(int),
                   'insertion_composition': defaultdict(int)}

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    base_stats = {'match': 0, 'mismatch': 0, 'deletion': 0, 'insertion': 0}
    read_iter = bam_reader.fetch(region=region, until_eof=True)

    if verbose:
        sys.stdout.write(
            "Gathering read and error statistics from file: {}\n".format(bam))
        try:
            total_reads = bam_reader.mapped + bam_reader.unmapped
        except (AttributeError, ValueError):
            total_reads = None
        read_iter = tqdm.tqdm(read_iter, total=total_reads)

    for r in read_iter:
        _update_read_stats(r, read_stats, min_aqual)
        if r.is_unmapped:
            continue
        if r.query_sequence is None:
            continue
        if r.mapping_quality < min_aqual:
            continue
        ref = refs[r.reference_name]
        _update_events(r, ref, events, indel_dists, context_sizes, base_stats)

    base_stats['aln_length'] = base_stats['match'] + base_stats['mismatch'] + \
        base_stats['insertion'] + base_stats['deletion']
    if base_stats['match'] + base_stats['mismatch'] == 0:
        base_stats['identity'] = 0
    else:
        base_stats['identity'] = float(
            base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
    if base_stats['aln_length'] == 0:
        base_stats['accuracy'] = 0
    else:
        base_stats['accuracy'] = 1.0 - \
            float(base_stats['mismatch'] + base_stats['insertion'] + base_stats['deletion']) / \
            base_stats['aln_length']

    res = {'events': dict(events),
           'read_stats': dict(read_stats),
           'indel_dists': dict(indel_dists),
           'base_stats': base_stats}
    return res
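# Example usage (a sketch; the file names are hypothetical and refs is assumed
# to map reference names to sequences in whatever form _update_events expects):
#
#     refs = {rec.id: rec.seq for rec in seq_util.read_seq_records('ref.fasta', 'fasta')}
#     st = error_and_read_stats('reads.bam', refs, context_sizes=(2, 2))
#     print(st['base_stats']['accuracy'], st['base_stats']['identity'])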
def read_stats(bam, min_aqual=0, region=None, with_clipps=False, verbose=True):
    """ Parse reads in BAM file and record various statistics.

    :param bam: BAM file.
    :param min_aqual: Minimum mapping quality, skip read if mapping quality is lower.
    :param region: samtools region.
    :param with_clipps: Take clipping into account when calculating accuracy.
    :param verbose: Show progress bar.
    :returns: A dictionary with various global and per-read statistics.
    :rtype: dict
    """
    res = {'unmapped': 0,
           'mapped': 0,
           'unaligned_quals': [],
           'unaligned_lengths': [],
           'aligned_quals': [],
           'alignment_lengths': [],
           'aligned_lengths': [],
           'mqfail_aligned_quals': [],
           'mqfail_alignment_lengths': [],
           'mapping_quals': [],
           }
    base_stats = {'aln_length': 0, 'match': 0, 'mismatch': 0,
                  'deletion': 0, 'insertion': 0, 'clipps': 0}
    read_stats = OrderedDict([
        ("name", []),
        ("ref", []),
        ("coverage", []),
        ("direction", []),
        ("aln_length", []),
        ("insertion", []),
        ("deletion", []),
        ("mismatch", []),
        ("match", []),
        ("identity", []),
        ("accuracy", []),
        ("clipps", [])
    ])

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')

    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except (AttributeError, ValueError):
        total_reads = None

    if verbose and region is None:
        sys.stdout.write(
            "Gathering read statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Update basic read statistics:
        _update_read_stats(r, res, min_aqual)

        # Get detailed statistics from aligned read and
        # update global stats:
        bs = stats_from_aligned_read(r, with_clipps)

        # bs is None for unaligned reads.
        if bs is not None:
            for k in six.iterkeys(base_stats):
                base_stats[k] += bs[k]
            for stat, value in six.iteritems(bs):
                read_stats[stat].append(value)

    # Calculate global identity and accuracy (guarding against empty input):
    if base_stats['match'] + base_stats['mismatch'] > 0:
        base_stats['identity'] = float(
            base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
    else:
        base_stats['identity'] = 0.0

    clipps = 0
    if with_clipps:
        clipps = base_stats['clipps']

    if base_stats['aln_length'] > 0:
        base_stats['accuracy'] = 1.0 - (float(base_stats['insertion'] +
                                              base_stats['deletion'] +
                                              base_stats['mismatch'] + clipps) /
                                        base_stats['aln_length'])
    else:
        base_stats['accuracy'] = 0.0

    res['base_stats'] = base_stats
    res['read_stats'] = read_stats

    bam_reader.close()
    return res
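# Example usage (a sketch; 'reads.bam' is hypothetical and pandas is an
# assumed, optional dependency):
#
#     import pandas as pd
#     st = read_stats('reads.bam', min_aqual=5, with_clipps=True)
#     df = pd.DataFrame(st['read_stats'])  # one row per aligned read
#     worst = df.sort_values('accuracy').head()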
def bam_compare(aln_one, aln_two, coarse_tolerance=50, strict_flags=False, in_format='BAM', verbose=False):
    """Compare two BAM files record by record and gather matching statistics.

    :param aln_one: First BAM file.
    :param aln_two: Second BAM file.
    :param coarse_tolerance: Tolerance (in bases) when performing coarse matching of alignments.
    :param strict_flags: Require exact flag matches between records.
    :param in_format: Input format (BAM or SAM).
    :param verbose: Show progress bar.
    :returns: Dictionary with comparison statistics.
    :rtype: dict
    """
    aln_iter_one = bam_common.pysam_open(aln_one, in_format)
    aln_iter_two = bam_common.pysam_open(aln_two, in_format)

    total = None
    if in_format == "BAM":
        total_one = aln_iter_one.mapped + aln_iter_one.unmapped
        total_two = aln_iter_two.mapped + aln_iter_two.unmapped
        if total_one != total_two:
            raise Exception(
                "The two input files ({} {}) have a different number of records!"
                .format(aln_one, aln_two))
        total = total_one

    # Comparison summary structure:
    stats = OrderedDict([
        ('BamFiles', [aln_one, aln_two]),
        ('TotalQueries', 0),
        ('DirectionMismatch', 0),
        ('RefMismatch', 0),
        ('StrictFlagMismatch', 0),
        ('SeqMismatch', 0),
        ('CoarseMatches', 0),
        ('CommonAlignedBases', 0),
        ('CommonMatchingBases', 0),
        ('PerQueryBaseSim', []),
        ('PerQueryBaseSimClipped', []),
        (aln_one, {
            'HardClippedBases': 0,
            'SoftClippedBases': 0,
            'AlignedBases': 0,
            'UnalignedQueries': 0,
            'AlignedQueries': 0
        }),
        (aln_two, {
            'HardClippedBases': 0,
            'SoftClippedBases': 0,
            'AlignedBases': 0,
            'UnalignedQueries': 0,
            'AlignedQueries': 0
        }),
        ('AlignedSimilarity', 0.0),
    ])

    records_iter = zip(
        aln_iter_one.fetch(until_eof=True),
        aln_iter_two.fetch(until_eof=True))

    if verbose and in_format == "BAM":
        records_iter = tqdm.tqdm(records_iter, total=total)

    for segments in records_iter:
        aln_diff = compare_alignments(segments[0], segments[1], strict_flags)
        stats['TotalQueries'] += 1

        # Register hard and soft clipped bases:
        stats[aln_one]['HardClippedBases'] += aln_diff['hard_clipped'][0]
        stats[aln_two]['HardClippedBases'] += aln_diff['hard_clipped'][1]
        stats[aln_one]['SoftClippedBases'] += aln_diff['soft_clipped'][0]
        stats[aln_two]['SoftClippedBases'] += aln_diff['soft_clipped'][1]

        # Both reads are aligned:
        if aln_diff['mapped'] == (True, True):
            stats[aln_one]['AlignedQueries'] += 1
            stats[aln_two]['AlignedQueries'] += 1
            # Reference mismatch:
            if aln_diff['ref_match'] is False:
                stats['RefMismatch'] += 1
                continue
            # Orientation mismatch:
            if aln_diff['dir_match'] is False:
                stats['DirectionMismatch'] += 1
                continue
            # Flag mismatch:
            if aln_diff['flag_match'] is False:
                stats['StrictFlagMismatch'] += 1
                continue
            # Sequence mismatch:
            if aln_diff['seq_match'] is False:
                stats['SeqMismatch'] += 1

            stats['CommonAlignedBases'] += aln_diff['bases']
            stats['CommonMatchingBases'] += aln_diff['cons_score']
            stats['PerQueryBaseSim'].append(
                aln_diff['cons_score'] / float(aln_diff['bases']))
            stats['PerQueryBaseSimClipped'].append(
                float(aln_diff['cons_score']) /
                min(segments[0].infer_query_length(),
                    segments[1].infer_query_length()))

            if is_coarse_match(aln_diff, coarse_tolerance):
                stats['CoarseMatches'] += 1

            stats[aln_one]['AlignedBases'] += aln_diff['bases']
            stats[aln_two]['AlignedBases'] += aln_diff['bases']

        # Read from first BAM is aligned:
        elif aln_diff['mapped'] == (True, False):
            stats[aln_one]['AlignedQueries'] += 1
            stats[aln_one]['AlignedBases'] += aln_diff['bases_one']
            stats[aln_two]['UnalignedQueries'] += 1

        # Read from second BAM is aligned:
        elif aln_diff['mapped'] == (False, True):
            stats[aln_two]['AlignedQueries'] += 1
            stats[aln_two]['AlignedBases'] += aln_diff['bases_two']
            stats[aln_one]['UnalignedQueries'] += 1

        # Both unaligned:
        elif aln_diff['mapped'] == (False, False):
            stats[aln_one]['UnalignedQueries'] += 1
            stats[aln_two]['UnalignedQueries'] += 1

    if stats['CommonAlignedBases'] > 0:
        stats['AlignedSimilarity'] = stats['CommonMatchingBases'] / \
            float(stats['CommonAlignedBases'])
    else:
        stats['AlignedSimilarity'] = 0.0

    return stats
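# Example usage (a sketch; the BAM names are hypothetical; both files must
# contain the same queries in the same order, e.g. name-sorted output of two
# aligners run on the same read set):
#
#     cmp_stats = bam_compare('aln_one.bam', 'aln_two.bam', coarse_tolerance=50)
#     print(cmp_stats['AlignedSimilarity'])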