def get_rough_insert_median(opts, bam, pairs_to_check=10000):
    # check min_mapq, neither unmapped, neither supp
    ilen = []
    seen = {}
    rej = set()
    for aln in bam.fetch_unsorted():
        if aln.qname in seen:
            if aln.mapq < opts['min_mapq_reads'] or aln.is_unmapped \
                    or not_primary(aln):
                del seen[aln.qname]
            else:
                pair = (aln, seen[aln.qname])
                process_insert_len(pair, ilen, opts['min_mapq_reads'],
                                   opts['read_len'], truncate=False)
                del seen[aln.qname]
        elif aln.qname in rej:
            # mate was already rejected, so drop the pair entirely
            # (previously rej was populated but never consulted, letting
            # half-rejected pairs accumulate in seen)
            rej.discard(aln.qname)
        elif aln.mapq < opts['min_mapq_reads'] or aln.is_unmapped \
                or not_primary(aln):
            rej.add(aln.qname)
        else:
            seen[aln.qname] = aln
        if len(ilen) >= pairs_to_check:
            break
    return np.median(ilen)
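
# Why the median and not the mean for the rough estimate above: a small
# fraction of chimeric or mis-mapped pairs can report enormous "insert sizes"
# that would wreck a mean but leave the median untouched. A minimal,
# self-contained sketch with made-up numbers (not used by the pipeline):
def _example_median_vs_mean_insert():
    import numpy as np
    ilen = [290, 295, 300, 305, 310] * 2000 + [150000] * 50  # ~0.5% outliers
    # mean is pulled above 1000; median stays at 300
    return np.mean(ilen), np.median(ilen)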
def process_discordant_pair(aln1, aln2, chrom, discordant_pairs, min_mapq,
                            ilen, min_insert, max_insert, is_rf=False):
    # concordant pair: opposite strands with insert size in the expected range
    if (aln1.is_reverse != aln2.is_reverse) and (ilen is not None) and \
            (ilen >= min_insert) and (ilen <= max_insert):
        return None
    if aln1.mapq < min_mapq or aln2.mapq < min_mapq or aln1.is_unmapped or \
            aln2.is_unmapped or not_primary(aln1) or not_primary(aln2):
        return None
    # "First" is -> if FR (-> <-) and <- if RF (<- ->)
    # i.e. the read we expect on the "left" in ref. coords
    if aln1.is_reverse != aln2.is_reverse:
        if ilen is None:
            # no usable insert size; the comparisons below would raise
            # TypeError on None
            return None
        second = aln1 if (aln1.is_reverse ^ is_rf) else aln2
        first = aln1 if second is aln2 else aln2
        if ilen > max_insert:
            dtype = 'Del'
            disc = DiscordantPair(chrom, first.reference_end,
                                  second.reference_start, ilen, first.qname)
        elif (first.reference_start > second.reference_start) or \
                (first.reference_end > second.reference_end):
            dtype = 'Dup'
            disc = DiscordantPair(chrom, second.reference_start,
                                  first.reference_end, ilen, second.qname)
        elif ilen < min_insert:
            dtype = 'Ins'
            disc = DiscordantPair(chrom, first.reference_end,
                                  second.reference_start, ilen, first.qname)
        else:
            # in-range opposite-strand pairs were returned above, so this
            # branch should be unreachable; bail out rather than hit a
            # NameError on dtype below
            return None
    else:
        dtype = 'InvR' if (aln1.is_reverse ^ is_rf) else 'InvL'
        if dtype == 'InvL':
            pos1, pos2 = sorted([aln1.reference_end, aln2.reference_end])
        else:
            pos1, pos2 = sorted([aln1.reference_start, aln2.reference_start])
        disc = DiscordantPair(chrom, pos1, pos2, ilen, aln1.qname)
    discordant_pairs[dtype] = discordant_pairs.get(dtype, []) + [disc]
    if disc.pos1 > disc.pos2 and dtype != 'Ins':
        raise Warning(
            '[process_disc_pair] discordant type {0} pos1 > pos2'.format(
                dtype))
    return dtype
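
# The first/second assignment above reduces to an XOR of strand and library
# orientation. A hedged, standalone restatement of just that logic (the
# _example_* name is illustrative; nothing in the pipeline calls this):
def _example_expected_left_read(aln1_is_reverse, aln2_is_reverse, is_rf=False):
    # only meaningful for opposite-strand pairs
    assert aln1_is_reverse != aln2_is_reverse
    # FR library: the reverse-strand read is expected on the right;
    # RF library: the roles flip, hence the XOR with is_rf
    second_is_aln1 = aln1_is_reverse ^ is_rf
    return 'aln2' if second_is_aln1 else 'aln1'
# e.g. FR pair with aln1 on the forward strand:
# _example_expected_left_read(False, True) == 'aln1', matching "-> <-"
# with aln1 on the left.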
def process_softclip(opts, pair, pair_split_found, softclips, lib_idx):
    min_mapq = opts['min_mapq_softclip']
    min_clipped_bases = opts['min_clipped_bases']
    min_clipped_qual = opts['min_clipped_qual']
    lowqual_trim_extra = opts['lowqual_trim_extra']
    for (aln, split_found) in zip(pair, pair_split_found):
        if aln is None or aln.is_unmapped or \
                aln.mapq < min_mapq or not_primary(aln) or \
                split_found:
            continue
        # count number of phred qual > 2 clipped bases and adjust nclip
        nclip = [aln.query_alignment_start,
                 len(aln.seq) - aln.query_alignment_end]
        if nclip == [0, 0]:
            continue
        pos = (aln.reference_start, aln.reference_end)
        lowqual = count_lowqual_bases(aln, lowqual_trim_extra)
        for o in (LEFT, RIGHT):
            if lowqual[o] > 0:
                nclip[o] = max(0, nclip[o] - lowqual[o])
            if nclip[o] < min_clipped_bases:
                continue
            if o == LEFT:
                med_qual = np.median(
                    aln.query_qualities[lowqual[o]:(lowqual[o] + nclip[o])])
            else:
                med_qual = np.median(
                    aln.query_qualities[(-lowqual[o] - nclip[o]):
                                        (-lowqual[o] or None)])
            if med_qual < min_clipped_qual:
                continue
            this_nclip = nclip[o]
            this_pos = pos[o]
            this_nmapped = aln.query_alignment_end - aln.query_alignment_start
            sc = SoftclipCluster(is_right=(o == RIGHT),
                                 pos=this_pos,
                                 bases_clipped=this_nclip,
                                 bases_mapped=this_nmapped,
                                 num_reads=1,
                                 num_reads_exact=1,
                                 sum_mapq=aln.mapq,
                                 num_minus=int(aln.is_reverse),
                                 num_plus=1 - int(aln.is_reverse),
                                 which_libs=(1 << lib_idx))
            softclips[o][this_pos].append(sc)
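
# The RIGHT-side quality slice above relies on `(-lowqual[o] or None)`: when
# no low-quality bases were trimmed, a literal end index of -0 would select
# nothing, so `or None` turns it into "through the end of the read". A
# minimal sketch with toy values (not part of the pipeline):
def _example_right_clip_slice():
    quals = [30, 31, 32, 18, 19, 20]  # last three bases are soft-clipped
    nclip, lowqual = 3, 0
    assert quals[(-lowqual - nclip):(-lowqual or None)] == [18, 19, 20]
    nclip, lowqual = 2, 1             # final base counted as low quality
    assert quals[(-lowqual - nclip):(-lowqual or None)] == [18, 19]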
def extract_approximate_library_stats(opts, bam, rough_insert_median):
    reads_per_chunk = int(
        np.floor(opts['approx_stats_nreads'] / opts['approx_stats_nchunks']))

    # lib_patterns, lib_stats = parse_library_stats(meta)
    # maps read groups matching lib_patterns to indices in lib_stats
    # lib_dict = {}
    # MULTILIB
    nlib = opts['nlib']
    insert_len = [[] for i in range(nlib)]
    read_len_shorter = [[] for i in range(nlib)]
    read_len_longer = [[] for i in range(nlib)]

    chrom_name = opts['chromosome']
    chrom_size = get_chrom_size_from_bam(chrom_name, bam)
    chunk_size = 10 * opts['insert_max_mu_multiple'] * rough_insert_median
    rough_insert_max = opts['insert_max_mu_multiple'] * rough_insert_median

    reads_processed = [0 for i in range(nlib)]
    chunks_processed = 0
    # MINOR reads_per_chunk should mean completed
    while min(reads_processed) < opts['approx_stats_nreads']:
        # extract random chunk
        start = np.random.randint(0, chrom_size - chunk_size)
        end = start + chunk_size
        # parse reads
        seen_aln = {}
        chunk_reads_seen = 0
        alns = list(bam.fetch_unsorted(chrom_name, start, end))
        if bam.num_bam > 1:
            alns.sort(key=lambda a: a.pos)
        # iterate over the (possibly sorted) list; the loop previously
        # re-fetched the alignments, which silently discarded the sort
        for aln in alns:
            # conditioning on mate position introduces slight bias,
            # but insignificant if chunk_size >> insert size
            if not_primary(aln) or aln.is_duplicate or aln.is_unmapped or \
                    aln.mpos < start or aln.mpos >= end or \
                    aln.mate_is_unmapped:
                continue
            if aln.qname not in seen_aln:
                if chunk_reads_seen < reads_per_chunk:
                    seen_aln[aln.qname] = aln
                    chunk_reads_seen += 1
                continue
            # pair completed
            mate = seen_aln[aln.qname]
            pair = (aln, mate)
            del seen_aln[aln.qname]
            lib_idx = 0  # get_lib_idx(aln.get_tag('RG'), lib_dict, lib_patterns)
            process_insert_len(pair, insert_len[lib_idx],
                               opts['min_mapq_reads'], opts['read_len'],
                               maximum_insert_size=rough_insert_max)
            process_read_len(pair, read_len_shorter[lib_idx],
                             read_len_longer[lib_idx])
            reads_processed[lib_idx] += 1
            if min(reads_processed) % 200000 == 0 and opts['verbosity'] > 0:
                print('[library_stats] processed {0} reads ({1} chunks) for each lib'
                      .format(min(reads_processed), chunks_processed))
        chunks_processed += 1

    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    insert_lower = [np.percentile(il, 0.15) for il in insert_len]
    insert_upper = [np.percentile(il, 99.85) for il in insert_len]
    insert_pmf = [
        pmf_kernel_smooth(il, 0, opts['insert_max_mu_multiple'] * mu,
                          opts['max_kde_samples'])
        for (il, mu) in zip(insert_len, insert_mean)
    ]
    rlen_short = [round(np.median(rl)) for rl in read_len_shorter]
    rlen_long = [round(np.median(rl)) for rl in read_len_longer]
    rlen_medians = list(zip(rlen_short, rlen_long))
    return insert_mean, insert_sd, insert_pmf, insert_lower, insert_upper, \
        rlen_medians
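
# The 0.15 / 99.85 percentiles above are (approximately) the normal-
# equivalent of mu +/- 3 sigma, since P(Z < -3) ~= 0.00135. A quick
# self-contained check on simulated data (illustrative only):
def _example_three_sigma_percentiles():
    import numpy as np
    rng = np.random.default_rng(0)
    x = rng.normal(300, 30, 500000)   # mu = 300, sigma = 30
    lo, hi = np.percentile(x, (0.15, 99.85))
    return lo, hi                     # roughly (210, 390) = 300 +/- 3*30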
def parse_bam(opts, reference_files, bamfiles):
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    outdir = opts['outdir']
    min_mapq_reads = opts['min_mapq_reads']
    nlib = opts['nlib']
    # MULTILIB
    # lib_patterns, lib_stats = parse_library_stats(meta)
    # lib_dict = {}

    bam = BamGroup(bamfiles)
    opts['read_len'] = bam_read_len(bam)
    # bam_has_unmapped = has_unmapped_records(bam)
    # if opts['verbosity'] > 0:
    #     if bam_has_unmapped:
    #         print('[parse_bam] bam file DOES contain unmapped records')
    #     else:
    #         print('[parse_bam] bam file DOES NOT contain unmapped records')

    if opts['verbosity'] > 0:
        print('\n[parse_bam] extracting approximate library stats')
    rough_insert_median = get_rough_insert_median(opts, bam)
    if opts['verbosity'] > 0:
        print('[parse_bam] read_len: {0}; rough_insert_median: {1}'.format(
            opts['read_len'], rough_insert_median))

    als = extract_approximate_library_stats(opts, bam, rough_insert_median)
    mean_approx, sd_approx, pmf_approx, qlower, qupper, rlen_medians = als
    for i in range(len(pmf_approx)):
        with open(os.path.join(outdir, 'logging',
                               '{0}_insert_pmf.txt'.format(
                                   opts['library_names'][i])), 'w') as f:
            for j in range(len(pmf_approx[i])):
                f.write('{0}\t{1}\n'.format(j, pmf_approx[i][j]))
    if opts['verbosity'] > 0:
        print('[parse_bam] library stats:\n\tmu = {0}\n\tsigma = {1}'.format(
            mean_approx, sd_approx))
    add_time_checkpoint(opts, 'lib. stats')

    def get_lr_cutoff(opts, pmf, do_min=False):
        cutoff_normal_equivalent = opts['insert_cutoff']
        lr_cutoff = normpdf(0) - normpdf(cutoff_normal_equivalent)
        mode = max(pmf)
        logmode = np.log(mode)
        which_mode = [i for i in range(len(pmf)) if pmf[i] == mode]
        cutoff = None
        if do_min:
            for i in range(1, len(pmf)):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i - 1
                    break
        else:
            for i in range(len(pmf) - 2, -1, -1):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i + 1
                    break
        if opts['verbosity'] > 0:
            print('[insert_cutoff] lr_cutoff is {0}'.format(lr_cutoff))
            print('[insert_cutoff] mode (log) {0} at {1}'.format(
                logmode, which_mode))
            print('[insert_cutoff] cutoff ratio (log) {0} at {1}'.format(
                logmode - np.log(pmf[i]), cutoff))
        return cutoff

    min_concordant_insert = [
        get_lr_cutoff(opts, pmf, do_min=True) for pmf in pmf_approx
    ]
    max_concordant_insert = [get_lr_cutoff(opts, pmf) for pmf in pmf_approx]
    if opts['verbosity'] > 0:
        print('[parse_bam] insert size cutoffs:')
        print('[parse_bam]' + '\n'.join([
            '{0}-{1}'.format(min_concordant_insert[i],
                             max_concordant_insert[i])
            for i in range(len(mean_approx))
        ]))
        print('[parse_bam] equivalent to mu +/- 3 sigma in normal:\n\t{0}\n\t{1}\n'
              .format(qlower, qupper))

    seen_aln = {}
    nreads, npairs = 0, 0
    num_read_through = 0
    insert_len = [[] for i in range(nlib)]
    softclips = [(defaultdict(list), defaultdict(list)) for i in range(nlib)]
    splits = [[] for i in range(nlib)]
    if opts['do_pecluster']:
        discordant_pairs = [OrderedDict() for i in range(nlib)]
    if not opts['use_mate_tags']:
        # need to estimate mappability proportions
        mapstats = [defaultdict(int) for i in range(nlib)]
    else:
        mapstats = None

    if opts['verbosity'] > 0:
        print('[parse_bam] starting alignment parsing. . .')
    alignments = bam.fetch_unsorted(chrom_name, start, end)
    for aln in alignments:
        if not_primary(aln) or aln.is_unmapped or aln.is_duplicate:
            continue
        nreads += 1
        if opts['verbosity'] > 0 and nreads % 1000000 == 0:
            print('[parse_bam] %d reads processed' % nreads)
        # TODO this can be done cleaner -- check for is_unmapped above
        # and use handle_unpaired for everything with mate_is_unmapped
        if aln.qname not in seen_aln:
            if aln.mate_is_unmapped or aln.rname != aln.mrnm:
                # read is not going to pair, so handle now
                handle_unpaired_read(opts, aln, softclips, splits, bam,
                                     mapstats)
            else:
                # waiting for this read's pair
                seen_aln[aln.qname] = aln
            continue

        # completed a pair!
        npairs += 1
        mate = seen_aln[aln.qname]
        pair = (aln, mate)
        del seen_aln[aln.qname]
        if opts['filter_read_through'] and is_read_through(opts, pair):
            num_read_through += 1
            continue
        # MULTILIB
        lib_idx = 0
        # handle softclip information, insert len, mapping stats,
        # splits/discordants
        if not opts['use_mate_tags']:
            process_aggregate_mapstats(pair, mapstats[lib_idx],
                                       min_mapq_reads,
                                       opts['max_pair_distance'])
        ilen = process_insert_len(pair, insert_len[lib_idx],
                                  opts['min_mapq_reads'], opts['read_len'])
        if opts['do_pecluster']:
            process_discordant_pair(pair[0], pair[1], chrom_name,
                                    discordant_pairs[lib_idx],
                                    min_mapq_reads, ilen,
                                    min_concordant_insert[lib_idx],
                                    max_concordant_insert[lib_idx],
                                    opts['library_is_rf'])
        if any(op == CIGAR_SOFT_CLIP
               for (op, oplen) in itertools.chain(aln.cigartuples,
                                                  mate.cigartuples)):
            if opts['do_splits']:
                a1_split = process_splits(pair[0], splits[lib_idx], bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[1])
                a2_split = process_splits(pair[1], splits[lib_idx], bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[0])
            else:
                a1_split, a2_split = False, False
            # if we found the same breakpoint in both reads, it's quite
            # likely that the reads were overlapping due to a short insert
            if a1_split and a2_split and splits_are_mirrored(
                    splits[lib_idx][-1], splits[lib_idx][-2]):
                if opts['verbosity'] > 1:
                    print('[bamparser] mirrored split: {0} {1} {2}'.format(
                        chrom_name, splits[lib_idx][-1].bp2, pair[0].qname))
                del splits[lib_idx][-1]
            process_softclip(opts, pair, (a1_split, a2_split),
                             softclips[lib_idx], lib_idx)

    # handle unpaired reads
    if opts['verbosity'] > 0:
        print('[parse_bam] handling unpaired reads')
    for aln in seen_aln.values():
        handle_unpaired_read(opts, aln, softclips, splits, bam, mapstats)

    if any(len(ins) == 0 for ins in insert_len):
        # MULTILIB should only fail if all()
        print('Error: region specified contains no reads!')
        sys.exit(1)

    # report stats
    if opts['verbosity'] > 0:
        print('[parse_bam] processed a total of {0} reads'.format(nreads))
        if opts['filter_read_through']:
            print('[parse_bam] found {0} read-through pairs out of {1} total'
                  .format(num_read_through, npairs))
    add_time_checkpoint(opts, 'parse bam')

    # compute insert length distributions and save plots
    if opts['verbosity'] > 1:
        print('[parse_bam] observed insert size min:')
        print('\n'.join([str(min(insert_len[i])) for i in range(nlib)]))
        print('\n'.join(
            [str(Counter(sorted(insert_len[i]))) for i in range(nlib)]))
        print('[parse_bam] insert 25-50-75 percentiles by library:')
        percentiles = [np.percentile(ins, (25, 50, 75))
                       for ins in insert_len]
        print(''.join([
            '{0}: {1}\n'.format(opts['library_names'][l],
                                tuple(percentiles[l])) for l in range(nlib)
        ]))
    if opts['verbosity'] > 0:
        print('[parse_bam] computing insert length pmfs')
    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    max_mult = opts['insert_max_mu_multiple']
    insert_len_dist = [
        pmf_kernel_smooth(insert_len[i], 0, max_mult * mu,
                          opts['max_kde_samples'])
        for (i, mu) in zip(range(nlib), insert_mean)
    ]
    if opts['verbosity'] > 1:
        for i in range(nlib):
            print('[parse_bam] lib {0} mu {1} sigma {2}'.format(
                i, insert_mean[i], insert_sd[i]))

    # insert dist plots
    plot_insert_dist(opts, insert_len_dist, outdir)

    # compute average coverage
    # MULTILIB this needs adjusting -- keeping track of nreads from each
    # bamgroup
    region_len = len_without_gaps(chrom_name, start, end,
                                  reference_files['gap'])
    opts['seq_coverage'] = [
        nreads * opts['read_len'] / (nlib * region_len) for _ in range(nlib)
    ]
    opts['phys_coverage'] = [npairs * m / region_len for m in insert_mean]
    opts['max_pecluster_size'] = [
        pc * opts['pecluster_size_coverage_ratio']
        for pc in opts['phys_coverage']
    ]
    if opts['verbosity'] > 0:
        print('[parse_bam] average sequence coverage: %.1fx' %
              opts['seq_coverage'][0])
        print('[parse_bam] average physical coverage: %.1fx' %
              opts['phys_coverage'][0])

    if opts['do_pecluster']:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, discordant_pairs,
                min_concordant_insert, max_concordant_insert)
    else:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, None, None, None)
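
# Sketch of the two coverage formulas assigned above: sequence coverage
# counts read bases per reference base, while physical coverage counts
# inserts spanning a base, so it scales with insert size rather than read
# length. Toy numbers only (not part of the pipeline):
def _example_coverage_formulas():
    nreads, npairs = 2000000, 1000000
    read_len, insert_mean, region_len = 150, 350, 10000000
    seq_cov = nreads * read_len / region_len      # 30.0x
    phys_cov = npairs * insert_mean / region_len  # 35.0x
    return seq_cov, phys_cov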
def parse_reads_with_blocks(opts, reference_files, bamgroups, breakpoints,
                            insert_ranges, map_models):
    # get gaps
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    gaps = load_genome_gaps(reference_files['gap'], chrom_name)
    cb_out = create_blocks(breakpoints, gaps, chrom_name, start, end,
                           opts['verbosity'])
    blocks, gap_indices, left_breakpoints, right_breakpoints = cb_out
    block_ends = [0] + sorted(b.end for b in blocks)
    bploc = list(breakpoints.keys())
    bploc.sort()
    if opts['verbosity'] > 1:
        print('\n\nbreakpoints:')
        print(bploc)
        print('\ngaps:')
        print(gaps)
        print('gap_indices:')
        print(gap_indices)
        print('blocks_after_gaps:')
        print([blocks[i] for i in range(len(blocks))
               if i > 0 and i in gap_indices])
        print('\nBLOCKS:')
        print(blocks)
        print('\n')

    g = GenomeGraph(len(blocks))
    cached_dist = {}
    npairs = 0
    for bam in bamgroups:
        seen_aln = {}
        # rejected_aln = set()
        cur_idx = 0
        cur_block = blocks[cur_idx]
        prev_block_end = block_ends[cur_idx]

        # parse reads from this chromosome
        alignments = bam.fetch_unsorted(chrom_name, start, end)
        if opts['verbosity'] > 0:
            print('[parse_reads] fetching alignments from chromosome {0}'
                  .format(chrom_name))
        # SPEEDUP handle hanging reads (mate unmapped or rname != mrnm, but
        # not distant) as we go to save memory. but, careful not to add them
        # twice...
        for aln in alignments:
            if not_primary(aln) or aln.is_unmapped or aln.is_duplicate or \
                    aln.pos >= blocks[-1].end:
                continue
            if not (prev_block_end <= aln.pos < cur_block.end):
                cur_idx = find_block_idx(aln.pos, block_ends)
                cur_block = blocks[cur_idx]
                prev_block_end = block_ends[cur_idx]
            if aln.qname in seen_aln:
                mate, mate_block_idx = seen_aln[aln.qname]
                del seen_aln[aln.qname]
                block_parser_handle_pair(opts, aln, mate, bam, g, blocks,
                                         block_ends, insert_ranges,
                                         cached_dist, map_models,
                                         block_idx1=cur_idx,
                                         block_idx2=mate_block_idx)
                npairs += 1
            else:
                seen_aln[aln.qname] = (aln, cur_idx)

        if opts['verbosity'] > 1:
            print('\nreads missing pairs are on these chromosomes:')
            print(Counter(
                [bam.getrname(a[0].rname) for a in seen_aln.values()]))
            print('\nreads missing pairs have mates on these chromosomes:')
            print(Counter(
                [bam.getrname(a[0].mrnm) for a in seen_aln.values()]))
            print('')

        for (aln, block_idx) in seen_aln.values():
            block_parser_handle_hanging(opts, aln, bam, g, blocks,
                                        block_ends, insert_ranges,
                                        cached_dist, map_models, block_idx)

    if opts['verbosity'] > 1:
        print('within-block insert size stats:\n')
        for i in range(g.size):
            edge = g.get_edge(2 * i, 2 * i + 1)
            if len(edge['offset']) > 100:
                print('block {0}: '.format(blocks[i]))
                ulibs = set(edge['lib'])
                for l in ulibs:
                    which_lib = [
                        edge['lib'][j] == l and
                        j not in edge['which_hanging']
                        for j in range(len(edge['offset']))
                    ]
                    if any(which_lib):
                        med = np.median([
                            edge['offset'][j]
                            for j in range(len(edge['offset']))
                            if which_lib[j]
                        ])
                        print('\tlib {0} median {1} ({2} reads)'.format(
                            l, med, sum(which_lib)))
        print('\n')
    return g, blocks, gap_indices, left_breakpoints, right_breakpoints
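
# find_block_idx above locates the block containing a position given the
# sorted block_ends list ([0, end_0, end_1, ...]). Assuming contiguous
# blocks, that lookup amounts to a binary search; a hedged standalone sketch
# of the same idea (the pipeline's own find_block_idx is what actually runs):
def _example_find_block_idx(pos, block_ends):
    import bisect
    # block i spans [block_ends[i], block_ends[i + 1])
    return bisect.bisect_right(block_ends, pos) - 1
# _example_find_block_idx(150, [0, 100, 200, 300]) == 1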