Example #1
def main(args, outs):
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    chrom = args.chrom

    poses = []
    mol_qs = []
    bcs = []

    for read in in_bam.fetch(str(chrom), int(args.start_pos),
                             int(args.end_pos)):
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
            not read.is_unmapped and read.mapq >= args.mapq:
            poses.append(read.pos)
            mol_qs.append(tk_io.get_read_molecule_conf(read))
            bcs.append(tk_io.get_read_barcode(read))
    ret_df = pd.DataFrame({
        'chrom': chrom,
        'pos': poses,
        'bc': bcs,
        'mol_qual': mol_qs
    })

    if len(ret_df) > 0:
        start_pos = poses[0]
        end_pos = poses[-1]
        cov_df = tk_hdf5.read_data_frame_indexed(
            args.coverage, [(chrom, start_pos, end_pos + 1)])

        # Boolean array with length equal to the range of positions in ret_df
        on_target = np.zeros((end_pos - start_pos + 1, ), dtype=np.bool)
        on_target[cov_df.pos - start_pos] = True
        cum_on_target = np.cumsum(on_target)

        ret_df['on_target'] = on_target[ret_df.pos - start_pos]
        # Note that fraction is set to 1 if there are no bases
        frac_on_target = np.ones((len(ret_df), )) * on_target[0]
        for i, p in enumerate(poses):
            if i > 0:
                nbp = float(p - poses[i - 1] - 1)
                if nbp > 0:
                    frac_on_target[i] = (
                        cum_on_target[p - start_pos] -
                        cum_on_target[poses[i - 1] - start_pos] -
                        int(on_target[p - start_pos])) / nbp
                else:
                    frac_on_target[i] = float(on_target[p - start_pos])
        ret_df['frac_on_target'] = frac_on_target
    else:
        ret_df['on_target'] = False
        ret_df['frac_on_target'] = 0.0
    tk_hdf5.write_data_frame(outs.reads, ret_df)
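The frac_on_target loop above uses a cumulative-sum trick: the number of on-target bases strictly between two consecutive reads is a difference of cumsums minus the current position's own flag. A minimal standalone sketch of the same arithmetic (toy data, no tenkit dependencies):

import numpy as np

# On-target flags per base of a toy region; reads sit at positions 2 and 7
on_target = np.array([0, 1, 1, 0, 0, 1, 0, 0], dtype=bool)
cum_on_target = np.cumsum(on_target)
p_prev, p = 2, 7
nbp = float(p - p_prev - 1)  # bases strictly between the two reads
frac = (cum_on_target[p] - cum_on_target[p_prev] - int(on_target[p])) / nbp
print frac  # 0.25 -- one of the four intervening bases is on-target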
Example #2
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default,
                                                  None,
                                                  None,
                                                  template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads,
                                         min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
Example #3
def sort_by_bc(file_name, sorted_name, store_in_memory=True):
    """ Sorts a bam file by the 10X barcode (specified in the tags BC field)
    if store_in_memory is True, avoids file seeks by keeping reads in memory
    """
    in_file = create_bam_infile(file_name)
    out_file, tids = create_bam_outfile(sorted_name, None, None, template=in_file)

    if store_in_memory:
        bc_reads = {}

        for read in in_file:
            bc = tk_io.get_read_barcode(read)
            this_bc_reads = bc_reads.setdefault(bc, [])
            this_bc_reads.append(read)

        sorted_bcs = sorted(bc_reads.keys())
        for bc in sorted_bcs:
            for read in bc_reads[bc]:
                out_file.write(read)
    else:
        # Store the file offset locations (in bytes) by bc
        bc_locs = {}

        file_offset = in_file.tell()
        for read in in_file:
            bc = tk_io.get_read_barcode(read)
            this_bc_locs = bc_locs.setdefault(bc, [])
            this_bc_locs.append(file_offset)
            file_offset = in_file.tell() # has to be before next read

        sorted_bcs = sorted(bc_locs.keys())
        for bc in sorted_bcs:
            for offset in bc_locs[bc]:
                in_file.seek(offset)
                out_file.write(in_file.next())

    out_file.close()
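A minimal usage sketch (hypothetical file names):

sort_by_bc('possorted.bam', 'bc_sorted.bam', store_in_memory=True)   # fast, holds all reads in RAM
sort_by_bc('possorted.bam', 'bc_sorted.bam', store_in_memory=False)  # seek-based, for BAMs too large for memory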
Example #4
def get_reads(in_bam, chrom, start, stop, min_mapq=60):
    bcs = []
    poses = []

    for read in in_bam.fetch(str(chrom), start, stop):
        mapq = read.mapq
        if mapq < min_mapq or read.is_secondary or read.is_duplicate:
            continue
        bc = tk_io.get_read_barcode(read)
        if bc is None:
            continue

        bcs.append(bc)
        poses.append(read.pos)
    df = pd.DataFrame({'pos': poses, 'bc': bcs})
    df.sort('bc', inplace=True)
    return df
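A usage sketch (hypothetical path and locus; fetch coordinates are 0-based, as in pysam):

in_bam = tk_bam.create_bam_infile('possorted.bam')
df = get_reads(in_bam, 'chr1', 1000000, 1100000, min_mapq=30)
print len(df.bc.unique())  # distinct barcodes observed in the locus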
Example #5
def get_bcs_at_region(in_bam,
                      chrom,
                      start,
                      end,
                      min_mapq=60,
                      bc_map=None,
                      other_chrom=None,
                      other_start=None,
                      other_end=None,
                      read_to_bc=None):

    bc_list = {}

    for read in in_bam.fetch(str(chrom), int(start), int(end)):
        if read.mapq < min_mapq or read.is_duplicate or read.is_secondary:
            continue
        if read.pos < start or read.pos > end:
            continue

        if other_start is not None and other_end is not None:
            tag_names = [t[0] for t in read.tags]
            tag_vals = [t[1] for t in read.tags]
            if 'XA' in tag_names:
                # XA holds the alternate alignment as 'chrom,pos,CIGAR,NM'
                idx = tag_names.index('XA')
                xa_chrom = tag_vals[idx].split(',')[0]
                xa_pos = int(tag_vals[idx].split(',')[1].strip('+-'))
                if xa_chrom == other_chrom and xa_pos >= other_start and xa_pos <= other_end:
                    continue
        if read_to_bc is None:
            bc = tk_io.get_read_barcode(read)
        else:
            bc = read_to_bc[read.qname]

        if not (bc is None or bc == '') and (bc_map is None or bc in bc_map):
            if not bc in bc_list:
                bc_list[bc] = []
            bc_list[bc].append(read.qname)

    for bc in bc_list.keys():
        bc_list[bc] = list(set(bc_list[bc]))
    return bc_list
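A usage sketch (hypothetical locus); the return value maps each barcode to the unique qnames supporting it:

in_bam = tk_bam.create_bam_infile('possorted.bam')
bcs = get_bcs_at_region(in_bam, 'chr1', 500000, 510000, min_mapq=30)
print len(bcs)  # number of distinct barcodes covering the region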
Example #6
def main_report_single_partition(args, outs):
    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)

    if args.targets_file is None:
        target_regions = None
    else:
        target_regions = tk_io.get_target_regions(open(args.targets_file))

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: tk_io.get_read_barcode(x))

    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance,
                                 bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index fragment table, so that we can combine the fragments files per-chromosome to reduce memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos',
                                       'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
Example #7
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSED_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name != None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name == None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if not (read_name is None):
                assert(read.qname == read_name)

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if not(tk_io.get_read_barcode(read) is None):
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
Example #8
def read_tuple(r):
    bc = tk_io.get_read_barcode(r)
    return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)
Example #9
    def test_big_dedup(self):
        tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.000003, 0.000015]
        args = martian.Record({
            'input': IN_BAM_BIG,
            'estimated_coverage': 100.0,
            'perfect_read_count': 100000,
            'chunk_start': None,
            'chunk_end': None
        })
        outs = martian.Record({
            'output': OUT_BAM,
            'duplicate_summary': OUT_JSON
        })
        main_mark_duplicates(args, outs)

        out_bam = pysam.Samfile(OUT_BAM)
        out_reads = list(out_bam)

        in_bam = pysam.Samfile(IN_BAM_BIG)
        in_reads = list(in_bam)

        # Check we haven't lost any reads
        self.assertEqual(len(out_reads), len(in_reads))

        def read_tuple(r):
            bc = tk_io.get_read_barcode(r)
            return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)
            #return (bc, r.is_read1, r.is_reverse, r.tid, r.pos, r.mrnm, r.mpos)

        def mark_duplicates(read_set):

            # Re-run the dup analysis manually
            read_tups = [(read_tuple(r), r) for r in read_set]
            read_tups.sort(key=lambda x: x[0])

            groups = itertools.groupby(read_tups, lambda x: x[0])

            for (k, reads) in groups:
                rl = list(reads)
                rl[0][1].is_duplicate = False

                for i in range(1, len(rl)):
                    rl[i][1].is_duplicate = True

        mark_duplicates(in_reads)

        # Make sure our 'all-reads' analysis matches the code
        out_dup_marks = np.array([
            r.is_duplicate for r in out_reads
            if (not r.is_unmapped) and (not r.mate_is_unmapped)
        ])
        test_dup_marks = np.array([
            r.is_duplicate for r in in_reads
            if (not r.is_unmapped) and (not r.mate_is_unmapped)
        ])

        print "len(start_bam): %d  -- len(out_bam): %d" % (len(out_dup_marks),
                                                           len(test_dup_marks))
        eq = (out_dup_marks == test_dup_marks).all()

        print "mean dups code: %f" % out_dup_marks.mean()
        print "mean dups test: %f" % test_dup_marks.mean()

        self.assertTrue(eq)

        # Read the molecule count histogram and verify
        count_hist = json.load(file(OUT_JSON))['no_filter_full_use_bcs']

        dups = sum([(int(times_observed) - 1) * n
                    for (times_observed, n) in count_hist.items()])
        total_reads = sum([
            int(times_observed) * n
            for (times_observed, n) in count_hist.items()
        ])
        summary_dup_rate = float(dups) / total_reads

        mapped_in_reads = np.array([
            r.is_duplicate for r in in_reads
            if not (r.is_unmapped or r.mate_is_unmapped)
            and tk_io.get_read_barcode(r) is not None
        ])
        self.assertEqual(summary_dup_rate, mapped_in_reads.mean())

        # Get the perfect reads, mark dups and compare stats
        perfect_reads = [
            x for x in in_reads
            if tenkit.read_filter.stringent_read_filter(x, True)
        ]
        mark_duplicates(perfect_reads)

        # Read the molecule count histogram and verify -- perfect reads
        count_hist = json.load(file(OUT_JSON))['full_use_bcs']

        dups = sum([(int(times_observed) - 1) * n
                    for (times_observed, n) in count_hist.items()])
        total_reads = sum([
            int(times_observed) * n
            for (times_observed, n) in count_hist.items()
        ])
        summary_dup_rate = float(dups) / total_reads

        mapped_in_reads = np.array([r.is_duplicate for r in perfect_reads])
        self.assertEqual(summary_dup_rate, mapped_in_reads.mean())
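The summary duplicate rate above is derived from the molecule-count histogram: a group observed t times contributes t reads and t - 1 duplicates. A toy check of that arithmetic (hypothetical counts):

count_hist = {'1': 90, '2': 5}  # 90 groups seen once, 5 seen twice
dups = sum([(int(t) - 1) * n for (t, n) in count_hist.items()])  # 5
total_reads = sum([int(t) * n for (t, n) in count_hist.items()])  # 100
print float(dups) / total_reads  # 0.05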
Example #10
def get_allele_read_info(chrom,
                         pos,
                         ref,
                         alt_alleles,
                         min_mapq_counts,
                         min_mapq_for_mean,
                         min_mapq_for_bc,
                         default_indel_qual,
                         bam,
                         reference_pyfasta,
                         max_reads=1000,
                         match=1,
                         mismatch=-4,
                         gap_open=-6,
                         gap_extend=-1):
    all_alleles = [ref] + alt_alleles
    bc_qual_maps = [{} for j in xrange(len(all_alleles))]
    counts = [0 for x in all_alleles]
    diffs = [[] for x in all_alleles]
    mapq_sums = [0.0 for x in all_alleles]
    mapq_denoms = [0.0 for x in all_alleles]
    molecule_differences = [[] for x in all_alleles]
    rescued = [[] for x in all_alleles]
    num_reads = 0
    qnames = set()
    for read in bam.fetch(chrom, pos, pos + 1):
        num_reads += 1
        if read.qname in qnames:
            continue
        qnames.add(read.qname)
        if read.is_duplicate:
            continue
        if num_reads > max_reads:
            break

        is_indel_variant = False
        for allele in alt_alleles:
            if len(allele) != len(ref):
                is_indel_variant = True

        allele_index_in_read = read_contains_allele_sw(
            ref,
            all_alleles,
            pos,
            read,
            reference_pyfasta[chrom],
            match=match,
            mismatch=mismatch,
            gap_open=gap_open,
            gap_extend=gap_extend)
        for (allele_index, allele) in enumerate(all_alleles):
            if allele_index == allele_index_in_read:
                if dict(read.tags).get("AS") is not None and dict(
                        read.tags).get("XS") is not None:
                    diffs[allele_index].append(
                        float(dict(read.tags).get("AS")) -
                        float(dict(read.tags).get("XS")))
                if dict(read.tags).get('OM') is not None:
                    if read.mapq >= 30 and dict(read.tags).get('OM') < 30:
                        rescue = 1
                    else:
                        rescue = 0
                    rescued[allele_index].append(rescue)
                if dict(read.tags).get("DM") is not None:
                    molecule_differences[allele_index].append(
                        float(dict(read.tags).get("DM")))
                if read.mapq >= min_mapq_for_mean:
                    mapq_sums[allele_index] += read.mapq
                    mapq_denoms[allele_index] += 1
                if read.mapq >= min_mapq_counts:
                    counts[allele_index] += 1
                if read.mapq >= min_mapq_for_bc:
                    bc = tk_io.get_read_barcode(read)
                    if bc is None:
                        continue
                    cigar_map = tk_seq.get_cigar_map(read.cigar)
                    try:
                        read_offset = cigar_map.index(pos - read.pos - 1)
                    except ValueError:  # pos not covered by this read's cigar
                        continue
                    if allele == ref:
                        if is_indel_variant:
                            qual = str(default_indel_qual)
                        else:
                            qual = str(
                                ord(
                                    min(read.qual[read_offset:read_offset +
                                                  len(allele)])))
                        bc_quals = bc_qual_maps[allele_index].setdefault(
                            bc, [])
                        bc_quals.append(qual)
                    # SNP
                    elif len(allele) == 1 and len(ref) == 1:
                        if is_indel_variant:
                            qual = str(default_indel_qual)
                        else:
                            qual = str(ord(read.qual[read_offset]))
                        bc_quals = bc_qual_maps[allele_index].setdefault(
                            bc, [])
                        bc_quals.append(qual)
                    # Insert
                    elif len(allele) > len(ref) and allele.startswith(ref):
                        bc_quals = bc_qual_maps[allele_index].setdefault(
                            bc, [])
                        bc_quals.append(str(default_indel_qual))
                    # Deletion
                    elif len(allele) < len(ref) and ref.startswith(allele):
                        bc_quals = bc_qual_maps[allele_index].setdefault(
                            bc, [])
                        bc_quals.append(str(default_indel_qual))
                    else:
                        bc_quals = bc_qual_maps[allele_index].setdefault(
                            bc, [])
                        bc_quals.append(str(default_indel_qual))
    bc_qual_strings = []
    for bc_qual_map in bc_qual_maps:
        bc_qual_strings.append([])
        for bc, bc_quals in bc_qual_map.iteritems():
            bc_qual_strings[-1].append(bc + '_' + '_'.join(bc_quals))

    mapq_means = [
        mapq_sums[i] / mapq_denoms[i] if mapq_denoms[i] > 0 else 31
        for i in range(len(all_alleles))
    ]
    return (counts, mapq_means, bc_qual_strings, molecule_differences, diffs,
            rescued)
Example #11
def bc_key_func(read):
    return tk_io.get_read_barcode(read)
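Typically used as a groupby key over a barcode-sorted stream; a sketch (bc_sorted_reads is a hypothetical iterator of reads already sorted by barcode):

import itertools
for bc, bc_reads in itertools.groupby(bc_sorted_reads, key=bc_key_func):
    print bc, len(list(bc_reads))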
Example #12
def create_bc_matrix_step(bam_filename,
                          chrom,
                          starts,
                          stops,
                          win,
                          step,
                          bc_map,
                          min_mapq=30,
                          read1_only=False,
                          no_split=False):
    """ Creates a (BCs X Windows) sparse matrix of barcodes (columns) versus windowed locations
    """

    step = min(max(1, step), win)
    in_bam = tk_bam.create_bam_infile(bam_filename)
    nsteps_per_win = int(np.ceil(win / float(step)))
    nbcs = len(bc_map.values())
    bc_idx = []
    win_idx = []
    win_starts = []
    win_stops = []

    for start, stop in zip(starts, stops):
        nwin = int(np.ceil((stop - start) / float(step)))
        for read in in_bam.fetch(str(chrom), start, stop):
            mapq = read.mapq
            if (mapq < min_mapq or (read1_only and read.is_read2)
                    or read.is_duplicate or read.is_secondary):
                continue

            if no_split and ('SA' in [t[0] for t in read.tags]):
                continue

            bc = tk_io.get_read_barcode(read)
            if bc is None or bc == '' or not bc in bc_map:
                continue

            pos = read.pos
            if pos < start or pos > stop:
                continue

            last_win_idx = int(np.floor((pos - start) / float(step)))
            first_win_idx = max(0, last_win_idx - nsteps_per_win + 1)
            last_win_idx = min(last_win_idx, nwin - 1)
            first_win_idx = min(first_win_idx, nwin - 1)
            bc_idx.extend(
                [bc_map[bc] for i in range(first_win_idx, last_win_idx + 1)])
            win_idx.extend([
                i + len(win_starts)
                for i in range(first_win_idx, last_win_idx + 1)
            ])
        win_starts.extend([start + i * step for i in range(nwin)])
        win_stops.extend(
            [min(stop, start + i * step + win) for i in range(nwin)])
    bc_idx = np.reshape(np.array(bc_idx, ndmin=2), (1, len(bc_idx)))
    win_idx = np.reshape(np.array(win_idx, ndmin=2), (1, len(win_idx)))
    bc_mat = sp.csc_matrix((np.ones((bc_idx.shape[1], ), dtype=np.float32),
                            np.concatenate((bc_idx, win_idx), axis=0)),
                           (nbcs, len(win_starts))).tolil()
    win_starts = np.array(win_starts).flatten()
    win_stops = np.array(win_stops).flatten()
    return (bc_mat, win_starts, win_stops)
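A usage sketch (hypothetical BAM path, locus, and whitelist; bc_map assigns each barcode a row index):

bc_map = {bc: i for (i, bc) in enumerate(sorted(whitelist))}
bc_mat, win_starts, win_stops = create_bc_matrix_step(
    'possorted.bam', 'chr20', [0], [2000000], win=100000, step=10000,
    bc_map=bc_map)
print bc_mat.shape  # (n_barcodes, n_windows)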
Example #13
def chunk_split_func(r):
    return tk_io.get_read_barcode(r)
Example #14
def read_has_barcode(r):
    bc = tk_io.get_read_barcode(r)
    if bc is None:
        return False
    else:
        return True
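As in Example #6 above, this predicate is handy for dropping barcode-less reads from a BAM chunk (Python 2 itertools):

bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)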
Example #15
def main(args, outs):
    """ Outputs barcode file """
    args.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.input)
    unsorted_temp_name = martian.make_path(outs.contig_output +
                                           '_TEMPUNSORTED')
    sorted_temp_name = martian.make_path(outs.contig_output + '_TEMPSORTED')
    base_dir = os.path.dirname(outs.contig_output)
    unsorted_temp_file = open(unsorted_temp_name, 'w')
    contig_output_file = open(outs.contig_output, 'w')
    window_size = args.window_size

    chroms = bam_in.references

    # Output the raw poses
    unsorted_temp_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ']) +
                             '\n')
    if args.restrict_locus is None:
        bam_iter = bam_in.fetch(args.chrom)
    else:
        restrict_chrom, restrict_start, restrict_stop = tk_io.get_locus_info(
            args.restrict_locus)
        assert (args.chrom == restrict_chrom)
        bam_iter = bam_in.fetch(restrict_chrom, restrict_start, restrict_stop)

    for read in bam_iter:
        chrom = chroms[read.tid]
        start = read.pos
        end = read.aend

        if end is None:
            end = start + len(read.seq)

        bc = tk_io.get_read_barcode(read)

        if not (bc is None):
            unsorted_temp_file.write(
                '\t'.join([chrom, str(start), str(end), bc]) + '\n')

    # Sort the poses
    unsorted_temp_file.close()
    tk_tabix.sort_bc_loc_tabix(unsorted_temp_name,
                               sorted_temp_name,
                               temp_dir_name=base_dir)

    # Infer the contig locations
    # This header is written during join
    #contig_output_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ', 'NUM_READS']) + '\n')
    sorted_temp_file = open(sorted_temp_name, 'r')
    sorted_temp_file.readline()
    old_bc_seq = None
    bc_poses = []
    for line in sorted_temp_file:
        (chrom, start, end, bc_seq) = line.strip('\n').split('\t')
        start = int(start)
        end = int(end)

        if not (bc_seq == old_bc_seq):
            if not (old_bc_seq is None):
                frags = infer_fragments(bc_poses, window_size)
                for (frag_chrom, frag_start, frag_end, num_reads) in frags:
                    contig_output_file.write('\t'.join([
                        frag_chrom,
                        str(frag_start - BUFFER),
                        str(frag_end + BUFFER), old_bc_seq,
                        str(num_reads)
                    ]) + '\n')
            bc_poses = []
        old_bc_seq = bc_seq
        bc_poses.append((chrom, start, end))

    # Output for the last barcode
    if not (old_bc_seq is None):
        frags = infer_fragments(bc_poses, window_size)
        for (frag_chrom, frag_start, frag_end, num_reads) in frags:
            contig_output_file.write('\t'.join([
                frag_chrom,
                str(frag_start - BUFFER),
                str(frag_end + BUFFER), old_bc_seq,
                str(num_reads)
            ]) + '\n')

    sorted_temp_file.close()
    subprocess.check_call(['rm', sorted_temp_name])
    subprocess.check_call(['rm', unsorted_temp_name])
    contig_output_file.close()
Example #16
def bc_sort_key(read):
    return (tk_io.get_read_barcode(read), read.qname)
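As used in Example #2 above to order each bucket by (barcode, qname) before writing:

bucket.sort(key=bc_sort_key)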
Example #17
        def process_read_block(reads, count_hist, optical_dup_hist,
                               diffusion_dup_hist):
            ''' dedup a block of reads, then write to BAM in original order '''
            read_tuples = []
            for read in reads:
                # Don't consider unmapped reads
                if read.is_unmapped:
                    continue

                bc_sequence = None

                if self.split_bcs:
                    bc_sequence = tk_io.get_read_barcode(read)

                read_tup = (bc_sequence, read.is_read1, read.is_reverse,
                            read.tid, read.pos, read.mrnm, read.mpos)

                # Include the read and the original index so that we can get back to the original order
                read_tuples.append((read_tup, read, len(read_tuples)))

            # Sort by the read tuple -- need to do this for the groupby to get all the common items together
            read_tuples.sort(key=lambda x: x[0])

            # Both reads in a pair must be mapped, otherwise we drop the pair
            mapped_pair_tuples = itertools.ifilter(
                lambda x:
                (not x[1].is_unmapped) and (not x[1].mate_is_unmapped),
                read_tuples)

            # Group the reads by the read_tuple
            dup_groups = itertools.groupby(mapped_pair_tuples, lambda x: x[0])

            for (key, dup_group) in dup_groups:
                # Note how many dups we have
                dup_group = list(dup_group)
                n_dups = len(dup_group)

                if n_dups > 1:
                    optical_dups, diffusion_dups = self.count_dups_by_distance(
                        dup_group)
                else:
                    optical_dups = 0
                    diffusion_dups = 0

                # Diffusion dups encompass optical dups, so take the max rather than summing
                non_proximal_dups = n_dups - max(diffusion_dups, optical_dups)

                # If we are splitting on bcs, then only counts stats for read groups with BCs
                group_bc = key[0]
                if not self.split_bcs or group_bc is not None:
                    count_hist[non_proximal_dups] = count_hist.setdefault(
                        non_proximal_dups, 0) + 1
                    if optical_dups > 0:
                        optical_dup_hist[
                            'count'] = optical_dup_hist.setdefault(
                                'count', 0) + optical_dups
                        diffusion_dup_hist[
                            'count'] = diffusion_dup_hist.setdefault(
                                'count', 0) + diffusion_dups

                # Mark dups
                if self.output_bam:
                    # mark the first read in the set as not-a-dup
                    dup_group[0][1].is_duplicate = False

                    for i in range(1, n_dups):
                        dup_group[i][1].is_duplicate = True

            if self.output_bam:
                for read in reads:
                    self.output_bam.write(read)

            # done processing block of reads
            return
Example #18
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))
    pgs = [
        tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
        tk_bam.make_terminal_pg_header(martian.get_pipelines_version())
    ]
    # Don't duplicate the header if it's already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []
    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None, template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)

        total_reads += 1
        tags = r.tags

        # Strip out RX and QX tags
        #strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragments file to get records that should cover many future reads
        # fragment phasing file may not exist in ALIGNER only pipeline - may need to skip
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]
                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(valid_phasing) < 2)
                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]
                # See if we have a molecule id for this fragment
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(molecule_ids) < 2)
                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))


        r.tags = tags
        bam_out.write(r)

    bam_out.close()
    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads