Example #1
def get_rough_insert_median(opts, bam, pairs_to_check=10000):
    # pair reads by name; skip low-MAPQ, unmapped, and non-primary alignments
    ilen = []
    seen = {}
    rej = set()
    for aln in bam.fetch_unsorted():
        fails_filter = (aln.mapq < opts['min_mapq_reads'] or aln.is_unmapped
                        or not_primary(aln))
        if aln.qname in seen:
            if not fails_filter:
                pair = (aln, seen[aln.qname])
                process_insert_len(pair,
                                   ilen,
                                   opts['min_mapq_reads'],
                                   opts['read_len'],
                                   truncate=False)
            del seen[aln.qname]
        elif aln.qname in rej:
            # mate was already rejected, so drop this read as well
            rej.discard(aln.qname)
        elif fails_filter:
            rej.add(aln.qname)
        else:
            seen[aln.qname] = aln
        if len(ilen) >= pairs_to_check:
            break
    return np.median(ilen)
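The core of this function is a one-pass pairing scan: hold each read in a dict until its mate shows up, then record the pair's insert length. Below is a minimal, self-contained sketch of that idea using a hypothetical namedtuple in place of pysam alignments; the real function additionally filters on MAPQ/flags and delegates to the project helper process_insert_len.

# Sketch only: toy records stand in for pysam alignments.
from collections import namedtuple
import numpy as np

Aln = namedtuple('Aln', 'qname pos tlen')

def rough_median(alns, pairs_to_check=10000):
    ilen, seen = [], {}
    for a in alns:
        if a.qname in seen:
            seen.pop(a.qname)
            ilen.append(abs(a.tlen))  # template length as the insert size
        else:
            seen[a.qname] = a
        if len(ilen) >= pairs_to_check:
            break
    return np.median(ilen)

reads = [Aln('r1', 100, 350), Aln('r2', 120, 410),
         Aln('r1', 450, -350), Aln('r2', 530, -410)]
print(rough_median(reads))  # 380.0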
Example #2
def process_discordant_pair(aln1,
                            aln2,
                            chrom,
                            discordant_pairs,
                            min_mapq,
                            ilen,
                            min_insert,
                            max_insert,
                            is_rf=False):
    # concordant: opposite strands and insert size within the expected range
    if (aln1.is_reverse != aln2.is_reverse) and (ilen is not None) and \
       (min_insert <= ilen <= max_insert):
        return None
    if aln1.mapq < min_mapq or aln2.mapq < min_mapq or aln1.is_unmapped or \
       aln2.is_unmapped or not_primary(aln1) or not_primary(aln2):
        return None
    # "First" is -> if FR (->  <-) and <- if RF (<-  ->)
    # i.e. the read we expect on the "left" in ref. coords
    if aln1.is_reverse != aln2.is_reverse:
        if ilen is None:
            # cannot classify Del/Dup/Ins without a computed insert length
            return None
        second = aln1 if (aln1.is_reverse ^ is_rf) else aln2
        first = aln1 if second is aln2 else aln2
        if ilen > max_insert:
            dtype = 'Del'
            disc = DiscordantPair(chrom, first.reference_end,
                                  second.reference_start, ilen, first.qname)
        elif (first.reference_start > second.reference_start) or \
             (first.reference_end > second.reference_end):
            dtype = 'Dup'
            disc = DiscordantPair(chrom, second.reference_start,
                                  first.reference_end, ilen, second.qname)
        elif ilen < min_insert:
            dtype = 'Ins'
            disc = DiscordantPair(chrom, first.reference_end,
                                  second.reference_start, ilen, first.qname)
    else:
        dtype = 'InvR' if (aln1.is_reverse ^ is_rf) else 'InvL'
        if dtype == 'InvL':
            pos1, pos2 = sorted([aln1.reference_end, aln2.reference_end])
        else:
            pos1, pos2 = sorted([aln1.reference_start, aln2.reference_start])
        disc = DiscordantPair(chrom, pos1, pos2, ilen, aln1.qname)
    discordant_pairs.setdefault(dtype, []).append(disc)
    if disc.pos1 > disc.pos2 and dtype != 'Ins':
        raise Warning(
            '[process_disc_pair] discordant type {0} pos1 > pos2'.format(
                dtype))
    return dtype
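The first/second bookkeeping above hinges on a single XOR: in an FR library the forward-strand read is expected on the left in reference coordinates, while in an RF library it is the reverse-strand read. A small sketch of just that rule, with toy booleans rather than the project's alignment objects:

# Sketch of the orientation rule used by process_discordant_pair.
def expected_left(read1_is_reverse, read2_is_reverse, is_rf=False):
    assert read1_is_reverse != read2_is_reverse  # opposite strands
    # FR (->  <-): forward read is left; RF (<-  ->): reverse read is left
    second_is_read1 = read1_is_reverse ^ is_rf
    return 'read2' if second_is_read1 else 'read1'

print(expected_left(False, True))              # FR pair: 'read1'
print(expected_left(False, True, is_rf=True))  # RF pair: 'read2'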
Example #3
def process_softclip(opts, pair, pair_split_found, softclips, lib_idx):
    min_mapq = opts['min_mapq_softclip']
    min_clipped_bases = opts['min_clipped_bases']
    min_clipped_qual = opts['min_clipped_qual']
    lowqual_trim_extra = opts['lowqual_trim_extra']
    for (aln, split_found) in zip(pair, pair_split_found):
        if aln is None or aln.is_unmapped or \
           aln.mapq < min_mapq or not_primary(aln) or \
           split_found:
            continue

        # count number of phred qual > 2 clipped bases and adjust nclip
        nclip = [aln.query_alignment_start, len(aln.seq) - aln.query_alignment_end]
        if nclip == [0, 0]:
            continue
        pos = (aln.reference_start, aln.reference_end)
        lowqual = count_lowqual_bases(aln, lowqual_trim_extra)

        for o in (LEFT, RIGHT):
            if lowqual[o] > 0:
                nclip[o] = max(0, nclip[o] - lowqual[o])
            if nclip[o] < min_clipped_bases:
                continue
            # median base quality over the clipped bases, after skipping the
            # low-quality trim; "-lowqual[o] or None" yields None when the
            # trim is zero, so the slice runs to the end of the read
            if o == LEFT:
                clipped_quals = aln.query_qualities[
                    lowqual[o]:(lowqual[o] + nclip[o])]
            else:
                clipped_quals = aln.query_qualities[
                    (-lowqual[o] - nclip[o]):(-lowqual[o] or None)]
            med_qual = np.median(clipped_quals)
            if med_qual < min_clipped_qual:
                continue
            this_nclip = nclip[o]
            this_pos = pos[o]
            this_nmapped = aln.query_alignment_end - aln.query_alignment_start
            sc = SoftclipCluster(is_right=(o == RIGHT), pos=this_pos,
                                 bases_clipped=this_nclip, bases_mapped=this_nmapped,
                                 num_reads=1, num_reads_exact=1,
                                 sum_mapq=aln.mapq,
                                 num_minus=int(aln.is_reverse),
                                 num_plus=1-int(aln.is_reverse),
                                 which_libs=(1 << lib_idx))
            softclips[o][this_pos].append(sc)
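The right-hand slice uses the "-lowqual or None" idiom so that a zero-length trim still reaches the end of the array (a bare -0 end index would produce an empty slice). A standalone illustration with toy quality values:

import numpy as np

quals = np.array([30, 31, 32, 8, 9, 2, 2])  # low-quality tail: last 2 bases
nclip, lowqual = 2, 2
# right-side clipped bases, skipping the lowqual tail; "or None" turns a
# zero trim into an end index of None, i.e. slice to the array's end
clipped = quals[(-lowqual - nclip):(-lowqual or None)]
print(clipped, np.median(clipped))  # [8 9] 8.5

lowqual = 0
print(quals[(-lowqual - 2):(-lowqual or None)])  # [2 2]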
Example #4
def extract_approximate_library_stats(opts, bam, rough_insert_median):
    reads_per_chunk = int(
        np.floor(opts['approx_stats_nreads'] / opts['approx_stats_nchunks']))

    # lib_patterns, lib_stats = parse_library_stats(meta)
    # maps read groups matching lib_patterns to indices in lib_stats
    # lib_dict = {}
    # MULTILIB
    nlib = opts['nlib']
    insert_len = [[] for i in range(nlib)]
    read_len_shorter = [[] for i in range(nlib)]
    read_len_longer = [[] for i in range(nlib)]

    chrom_name = opts['chromosome']
    chrom_size = get_chrom_size_from_bam(chrom_name, bam)
    # coordinates handed to fetch must be integers
    chunk_size = int(10 * opts['insert_max_mu_multiple'] * rough_insert_median)

    rough_insert_max = opts['insert_max_mu_multiple'] * rough_insert_median
    reads_processed = [0 for i in range(nlib)]
    chunks_processed = 0
    # MINOR reads_per_chunk should mean completed
    while min(reads_processed) < opts['approx_stats_nreads']:
        # extract random chunk
        start = np.random.randint(0, chrom_size - chunk_size)
        end = start + chunk_size
        # parse reads
        seen_aln = {}
        chunk_reads_seen = 0
        alns = list(bam.fetch_unsorted(chrom_name, start, end))
        if bam.num_bam > 1:
            alns.sort(key=lambda a: a.pos)
        for aln in alns:
            # conditioning on mate position introduces slight bias,
            # but insignificant if chunk_size >> insert size
            if not_primary(aln) or aln.is_duplicate or aln.is_unmapped or \
               aln.mpos < start or aln.mpos >= end or aln.mate_is_unmapped:
                continue
            if aln.qname not in seen_aln:
                if chunk_reads_seen < reads_per_chunk:
                    seen_aln[aln.qname] = aln
                    chunk_reads_seen += 1
                    continue
                else:
                    continue
            # pair completed
            mate = seen_aln[aln.qname]
            pair = (aln, mate)
            del seen_aln[aln.qname]

            lib_idx = 0  # get_lib_idx(aln.get_tag('RG'), lib_dict, lib_patterns)
            process_insert_len(pair,
                               insert_len[lib_idx],
                               opts['min_mapq_reads'],
                               opts['read_len'],
                               maximum_insert_size=rough_insert_max)
            process_read_len(pair, read_len_shorter[lib_idx],
                             read_len_longer[lib_idx])
            reads_processed[lib_idx] += 1
            if min(reads_processed) % 200000 == 0 and opts['verbosity'] > 0:
                print(
                    '[library_stats] processed {0} reads ({1} chunks) for each lib'
                    .format(min(reads_processed), chunks_processed))
        chunks_processed += 1

    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    insert_lower = [np.percentile(il, 0.15) for il in insert_len]
    insert_upper = [np.percentile(il, 99.85) for il in insert_len]
    insert_pmf = [
        pmf_kernel_smooth(il, 0, opts['insert_max_mu_multiple'] * mu,
                          opts['max_kde_samples'])
        for (il, mu) in zip(insert_len, insert_mean)
    ]
    rlen_short = [round(np.median(rl)) for rl in read_len_shorter]
    rlen_long = [round(np.median(rl)) for rl in read_len_longer]
    rlen_medians = list(zip(rlen_short, rlen_long))
    return insert_mean, insert_sd, insert_pmf, insert_lower, insert_upper, rlen_medians
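The summary statistics use a median plus a robust SD, with the 0.15/99.85 percentiles as the empirical analogue of mu +/- 3 sigma. robust_sd is project-internal; the sketch below assumes a common MAD-based definition and shows why those percentiles match roughly three normal standard deviations:

import numpy as np

def robust_sd_mad(x):
    # assumption: MAD-based robust SD, scaled by the usual consistency
    # constant 1.4826 so it estimates sigma for normal data
    x = np.asarray(x)
    return 1.4826 * np.median(np.abs(x - np.median(x)))

rng = np.random.default_rng(0)
ins = rng.normal(400, 50, size=100000)
print(np.median(ins), robust_sd_mad(ins))  # ~400, ~50
print(np.percentile(ins, (0.15, 99.85)))   # ~400 -/+ 3 * 50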
Example #5
def parse_bam(opts, reference_files, bamfiles):
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    outdir = opts['outdir']
    min_mapq_reads = opts['min_mapq_reads']
    nlib = opts['nlib']  # MULTILIB
    # lib_patterns, lib_stats = parse_library_stats(meta)
    # lib_dict = {}

    bam = BamGroup(bamfiles)
    opts['read_len'] = bam_read_len(bam)
    # bam_has_unmapped = has_unmapped_records(bam)
    # if opts['verbosity'] > 0:
    #     if bam_has_unmapped:
    #         print('[parse_bam] bam file DOES contain unmapped records')
    #     else:
    #         print('[parse_bam] bam file DOES NOT contain unmapped records')

    if opts['verbosity'] > 0:
        print('\n[parse_bam] extracting approximate library stats')
    rough_insert_median = get_rough_insert_median(opts, bam)
    if opts['verbosity'] > 0:
        print('[parse_bam] read_len: {0}; rough_insert_median: {1}'.format(
            opts['read_len'], rough_insert_median))
    als = extract_approximate_library_stats(opts, bam, rough_insert_median)
    mean_approx, sd_approx, pmf_approx, qlower, qupper, rlen_medians = als
    for i in range(len(pmf_approx)):
        pmf_file = os.path.join(
            outdir, 'logging',
            '{0}_insert_pmf.txt'.format(opts['library_names'][i]))
        with open(pmf_file, 'w') as f:
            for j in range(len(pmf_approx[i])):
                f.write('{0}\t{1}\n'.format(j, pmf_approx[i][j]))
    if opts['verbosity'] > 0:
        print('[parse_bam] library stats:\n\tmu = {0}\n\tsigma = {1}'.format(
            mean_approx, sd_approx))
        add_time_checkpoint(opts, 'lib. stats')

    def get_lr_cutoff(opts, pmf, do_min=False):
        cutoff_normal_equivalent = opts['insert_cutoff']
        lr_cutoff = normpdf(0) - normpdf(cutoff_normal_equivalent)
        mode = max(pmf)
        logmode = np.log(mode)
        which_mode = [i for i in range(len(pmf)) if pmf[i] == mode]
        cutoff = None
        if do_min:
            for i in range(1, len(pmf)):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i - 1
                    break
        else:
            for i in range(len(pmf) - 2, -1, -1):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i + 1
                    break
        if opts['verbosity'] > 0:
            print('[insert_cutoff] lr_cutoff is {0}'.format(lr_cutoff))
            print('[insert_cutoff] mode (log) {0} at {1}'.format(
                logmode, which_mode))
            if cutoff is not None:
                print('[insert_cutoff] cutoff ratio (log) {0} at {1}'.format(
                    logmode - np.log(pmf[i]), cutoff))
        return cutoff

    min_concordant_insert = [
        get_lr_cutoff(opts, pmf, do_min=True) for pmf in pmf_approx
    ]
    max_concordant_insert = [get_lr_cutoff(opts, pmf) for pmf in pmf_approx]
    if opts['verbosity'] > 0:
        print('[parse_bam] insert size cutoffs:')
        print('\n'.join([
            '[parse_bam] {0}-{1}'.format(min_concordant_insert[i],
                                         max_concordant_insert[i])
            for i in range(len(mean_approx))
        ]))
        print(
            '[parse_bam] equivalent to mu +/- 3 sigma in normal:\n\t{0}\n\t{1}\n'
            .format(qlower, qupper))

    seen_aln = {}
    nreads, npairs = 0, 0
    num_read_through = 0
    insert_len = [[] for i in range(nlib)]
    softclips = [(defaultdict(list), defaultdict(list)) for i in range(nlib)]
    splits = [[] for i in range(nlib)]
    if opts['do_pecluster']:
        discordant_pairs = [OrderedDict() for i in range(nlib)]
    if not opts['use_mate_tags']:  # need to estimate mappability proportions
        mapstats = [defaultdict(int) for i in range(nlib)]
    else:
        mapstats = None

    if opts['verbosity'] > 0:
        print('[parse_bam] starting alignment parsing. . .')
    alignments = bam.fetch_unsorted(chrom_name, start, end)
    for aln in alignments:
        if not_primary(aln) or aln.is_unmapped or aln.is_duplicate:
            continue
        nreads += 1
        if opts['verbosity'] > 0 and nreads % (1000000) == 0:
            print('[parse_bam] %d reads processed' % nreads)

        # TODO this can be done cleaner -- check for is_unmapped above
        #    and use handle_unpaired for everything with mate_is_unmapped
        if aln.qname not in seen_aln:
            # read is not going to pair, so handle now
            if aln.mate_is_unmapped or aln.rname != aln.mrnm:
                handle_unpaired_read(opts, aln, softclips, splits, bam,
                                     mapstats)
            # waiting for this read's pair
            else:
                seen_aln[aln.qname] = aln
            continue

        # Completed a pair!
        npairs += 1
        mate = seen_aln[aln.qname]
        pair = (aln, mate)
        del seen_aln[aln.qname]

        if opts['filter_read_through'] and is_read_through(opts, pair):
            num_read_through += 1
            continue

        # MULTILIB
        lib_idx = 0

        # handle softclip information, insert len, mapping stats, splits/discordants
        if not opts['use_mate_tags']:
            process_aggregate_mapstats(pair, mapstats[lib_idx], min_mapq_reads,
                                       opts['max_pair_distance'])
        ilen = process_insert_len(pair, insert_len[lib_idx],
                                  opts['min_mapq_reads'], opts['read_len'])
        if opts['do_pecluster']:
            process_discordant_pair(pair[0], pair[1], chrom_name,
                                    discordant_pairs[lib_idx], min_mapq_reads,
                                    ilen, min_concordant_insert[lib_idx],
                                    max_concordant_insert[lib_idx],
                                    opts['library_is_rf'])
        if any(op == CIGAR_SOFT_CLIP for (op, oplen) in
               itertools.chain(aln.cigartuples, mate.cigartuples)):
            if opts['do_splits']:
                a1_split = process_splits(pair[0],
                                          splits[lib_idx],
                                          bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[1])
                a2_split = process_splits(pair[1],
                                          splits[lib_idx],
                                          bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[0])
            else:
                a1_split, a2_split = False, False
            # if we found the same breakpoint in both reads,
            # it's quite likely that the reads were overlapping due to a short insert
            if a1_split and a2_split and splits_are_mirrored(
                    splits[lib_idx][-1], splits[lib_idx][-2]):
                if opts['verbosity'] > 1:
                    print('[bamparser] mirrored split: {0} {1} {2}'.format(
                        chrom_name, splits[lib_idx][-1].bp2, pair[0].qname))
                del splits[lib_idx][-1]

            process_softclip(opts, pair, (a1_split, a2_split),
                             softclips[lib_idx], lib_idx)

    # handle unpaired reads
    if opts['verbosity'] > 0:
        print('[parse_bam] handling unpaired reads')
    for aln in seen_aln.values():
        handle_unpaired_read(opts, aln, softclips, splits, bam, mapstats)

    if any(len(ins) == 0
           for ins in insert_len):  # MULTILIB should only fail if all()
        print('Error: region specified contains no reads!')
        sys.exit(1)

    # report stats
    if opts['verbosity'] > 0:
        print('[parse_bam] processed a total of {0} reads'.format(nreads))
        if opts['filter_read_through']:
            print('[parse_bam] found {0} read-through pairs out of {1} total'.
                  format(num_read_through, npairs))
    add_time_checkpoint(opts, 'parse bam')

    # compute insert length distributions and save plots
    if opts['verbosity'] > 1:
        print('[parse_bam] observed insert size min:')
        print('\n'.join([str(min(insert_len[i])) for i in range(nlib)]))
        print('\n'.join(
            [str(Counter(sorted(insert_len[i]))) for i in range(nlib)]))
        print('[parse_bam] insert 25-50-75 percentiles by library:')
        percentiles = [np.percentile(ins, (25, 50, 75)) for ins in insert_len]
        print(''.join([
            '{0}: {1}\n'.format(opts['library_names'][l],
                                tuple(percentiles[l])) for l in range(nlib)
        ]))
    if opts['verbosity'] > 0:
        print('[parse_bam] computing insert length pmfs')
    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    max_mult = opts['insert_max_mu_multiple']
    insert_len_dist = [
        pmf_kernel_smooth(insert_len[i], 0, max_mult * mu,
                          opts['max_kde_samples'])
        for (i, mu) in zip(range(nlib), insert_mean)
    ]

    if opts['verbosity'] > 1:
        for i in range(nlib):
            print('[parse_bam] lib {0} mu {1} sigma {2}'.format(
                i, insert_mean[i], insert_sd[i]))

    # insert dist plots
    plot_insert_dist(opts, insert_len_dist, outdir)

    # compute average coverage
    # MULTILIB this needs adjusting -- keeping track of nreads from each bamgroup
    region_len = len_without_gaps(chrom_name, start, end,
                                  reference_files['gap'])
    opts['seq_coverage'] = [
        nreads * opts['read_len'] / (nlib * region_len) for _ in range(nlib)
    ]
    opts['phys_coverage'] = [npairs * m / region_len for m in insert_mean]
    opts['max_pecluster_size'] = [
        pc * opts['pecluster_size_coverage_ratio']
        for pc in opts['phys_coverage']
    ]

    if opts['verbosity'] > 0:
        print('[parse_bam] average sequence coverage: %.1fx' %
              opts['seq_coverage'][0])
        print('[parse_bam] average physical coverage: %.1fx' %
              opts['phys_coverage'][0])

    if opts['do_pecluster']:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, discordant_pairs,
                min_concordant_insert, max_concordant_insert)
    else:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, None, None, None)
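The nested get_lr_cutoff walks inward from each tail of the insert-size pmf and stops at the first bin whose log-likelihood ratio against the mode drops below the cutoff; the bin just outside becomes the concordant-insert bound. A self-contained sketch of that scan on a toy pmf (assuming, as the original appears to, that the cutoff is a log-density ratio; for a standard normal at c sigmas that ratio is 0.5 * c**2):

import numpy as np

def lr_cutoffs(pmf, cutoff_log_ratio):
    # first/last bins whose density is within cutoff_log_ratio of the mode;
    # sketch only: no None fallback as in the original
    logmode = np.log(max(pmf))
    lo = next(i - 1 for i in range(1, len(pmf))
              if pmf[i] > 0 and logmode - np.log(pmf[i]) < cutoff_log_ratio)
    hi = next(i + 1 for i in range(len(pmf) - 2, -1, -1)
              if pmf[i] > 0 and logmode - np.log(pmf[i]) < cutoff_log_ratio)
    return lo, hi

# toy pmf: a discretized bell around bin 5
pmf = [0.0, 0.01, 0.05, 0.10, 0.20, 0.28, 0.20, 0.10, 0.05, 0.01, 0.0]
print(lr_cutoffs(pmf, 0.5 * 2**2 / 2))  # 2-sigma equivalent: (1, 9)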
Example #6
def parse_reads_with_blocks(opts, reference_files, bamgroups, breakpoints,
                            insert_ranges, map_models):
    # get gaps
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    gaps = load_genome_gaps(reference_files['gap'], chrom_name)
    cb_out = create_blocks(breakpoints, gaps, chrom_name, start, end,
                           opts['verbosity'])
    blocks, gap_indices, left_breakpoints, right_breakpoints = cb_out
    block_ends = [0] + sorted(b.end for b in blocks)

    bploc = list(breakpoints.keys())
    bploc.sort()

    if opts['verbosity'] > 1:
        print('\n\nbreakpoints:')
        print(bploc)
        print('\ngaps:')
        print(gaps)
        print('gap_indices:')
        print(gap_indices)
        print('blocks_after_gaps:')
        print([
            blocks[i] for i in range(len(blocks)) if i > 0 and i in gap_indices
        ])
        print('\nBLOCKS:')
        print(blocks)
        print('\n')

    g = GenomeGraph(len(blocks))
    cached_dist = {}

    npairs = 0

    for bam in bamgroups:
        seen_aln = {}
        # rejected_aln = set()
        cur_idx = 0
        cur_block = blocks[cur_idx]
        prev_block_end = block_ends[cur_idx]

        # parse reads from this chromosome
        alignments = bam.fetch_unsorted(chrom_name, start, end)
        if opts['verbosity'] > 0:
            print(
                '[parse_reads] fetching alignments from chromosome {0}'.format(
                    chrom_name))
        # SPEEDUP: handle hanging reads (mate unmapped or rname != mrnm, but
        # not distant) as we go, to save memory; careful not to add them twice
        for aln in alignments:
            if (not_primary(aln) or aln.is_unmapped or aln.is_duplicate
                    or aln.pos >= blocks[-1].end):
                continue
            if not (prev_block_end <= aln.pos < cur_block.end):
                cur_idx = find_block_idx(aln.pos, block_ends)
                cur_block = blocks[cur_idx]
                prev_block_end = block_ends[cur_idx]
            if aln.qname in seen_aln:
                mate, mate_block_idx = seen_aln[aln.qname]
                del seen_aln[aln.qname]
                block_parser_handle_pair(opts,
                                         aln,
                                         mate,
                                         bam,
                                         g,
                                         blocks,
                                         block_ends,
                                         insert_ranges,
                                         cached_dist,
                                         map_models,
                                         block_idx1=cur_idx,
                                         block_idx2=mate_block_idx)
                npairs += 1
            else:
                seen_aln[aln.qname] = (aln, cur_idx)

        if opts['verbosity'] > 1:
            print('\nreads missing pairs are on these chromosomes:')
            print(
                Counter([bam.getrname(a[0].rname) for a in seen_aln.values()]))
            print('\nreads missing pairs have mates on these chromosomes:')
            print(Counter([bam.getrname(a[0].mrnm)
                           for a in seen_aln.values()]))
            print('')
        for (aln, block_idx) in seen_aln.values():
            block_parser_handle_hanging(opts, aln, bam, g, blocks, block_ends,
                                        insert_ranges, cached_dist, map_models,
                                        block_idx)

    if opts['verbosity'] > 1:
        print('within-block insert size stats:\n')
        for i in range(g.size):
            edge = g.get_edge(2 * i, 2 * i + 1)
            if len(edge['offset']) > 100:
                print('block {0}: '.format(blocks[i]))
                ulibs = set(edge['lib'])
                for l in ulibs:
                    which_lib = [
                        edge['lib'][j] == l and j not in edge['which_hanging']
                        for j in range(len(edge['offset']))
                    ]
                    if any(which_lib):
                        med = np.median([
                            edge['offset'][j]
                            for j in range(len(edge['offset'])) if which_lib[j]
                        ])
                        print('\tlib {0} median {1} ({2} reads)'.format(
                            l, med, sum(which_lib)))
                        print('\n')

    return g, blocks, gap_indices, left_breakpoints, right_breakpoints
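find_block_idx is a project helper, but given the block_ends list built above (a leading 0 plus each block's sorted end), locating the block containing a position reduces to a bisect. A hedged sketch of that lookup:

import bisect

# toy model: block i covers [block_ends[i], block_ends[i + 1])
block_ends = [0, 100, 250, 400]

def find_block_idx_sketch(pos, block_ends):
    # index i such that block_ends[i] <= pos < block_ends[i + 1], matching
    # the prev_block_end <= aln.pos < cur_block.end invariant in the loop
    return bisect.bisect_right(block_ends, pos) - 1

for pos in (0, 99, 100, 399):
    print(pos, find_block_idx_sketch(pos, block_ends))
# 0 -> 0, 99 -> 0, 100 -> 1, 399 -> 2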