Example #1
def split(args):
    # Split the input BAM into ~0.5 GB chunks; each chunk requests 8 GB of memory.
    bam_in = create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in,
                                          chunk_bound_key=None,
                                          chunk_size_gb=0.5)
    for chunk in chunk_defs:
        chunk["__mem_gb"] = 8.0
    return {'chunks': chunk_defs}
def split(args):
    # Chunk the BAM so each chunk covers roughly 0.75 GB on disk
    bam_in = create_bam_infile(args.input)
    bam_chunk_size_disk = 0.75
    chunk_defs = tk_bam.chunk_bam_records(bam_in,
                                          chunk_bound_func,
                                          chunk_size_gb=bam_chunk_size_disk)

    for chunk in chunk_defs:
        chunk['__mem_gb'] = 4
        chunk['__vmem_gb'] = 5 + int(
            np.ceil(2 * whitelist_mem_gb(args.barcode_whitelist) +
                    bam_chunk_size_disk * 10))

    lane_coord_sys = tk_lane.LaneCoordinateSystem()

    # Reopen BAM for estimating tile extents
    bam_in = create_bam_infile(args.input)
    lane_coord_sys.estimate_tile_extents(bam_in)
    for cnum, chunk in enumerate(chunk_defs):
        chunk['lane_map'] = lane_coord_sys.to_dict()
        chunk['chunk_num'] = cnum

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 8, '__threads': 4}}
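# whitelist_mem_gb() is not shown in these examples. A minimal sketch of a
# plausible implementation, assuming the whitelist is a plain-text file of
# barcode sequences and that the in-memory footprint is roughly an order of
# magnitude larger than the file on disk; both the helper and the 10x factor
# are assumptions for illustration only.
import os

def whitelist_mem_gb(barcode_whitelist_path):
    if barcode_whitelist_path is None:
        return 0.0
    file_gb = os.path.getsize(barcode_whitelist_path) / float(1024 ** 3)
    return 10.0 * file_gb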
def main(args, outs):
    outs.coerce_strings()
    bam_in = create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(
        outs.bcsorted_bam,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "sort_reads_by_bc", TENX_PRODUCT_NAME)
        ])
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()
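# bc_sort_key() and merge_by_key() come from the surrounding pipeline and are
# not shown here. A minimal sketch of a plausible bc_sort_key, assuming reads
# are ordered by the processed barcode tag and then by query name; the tag
# lookup and the tie-break are assumptions for illustration.
def bc_sort_key(read):
    barcode = dict(read.tags).get(PROCESSED_BARCODE_TAG, '')
    return (barcode, read.qname)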
Example #4
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = create_bam_infile(args.input)

    buckets = {prefix: [] for prefix in prefixes}

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    non_bc_reads = []
    for r in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = get_read_barcode(r, None)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads, min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    outs.buckets = {}
    files_dir = os.path.dirname(outs.default)
    for prefix, bucket in buckets.iteritems():
        filename = os.path.join(files_dir, "bc_%s.bam" % prefix)
        outs.buckets[prefix] = filename
        bucket.sort(key=bc_sort_key)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        try:
            for r in bucket:
                bam_out.write(r)
        finally:
            bam_out.close()
    bam_in.close()
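# get_seqs(nbases) is referenced above but not defined in these examples. A
# minimal sketch, assuming it enumerates every DNA sequence of length nbases in
# lexicographic order so that each barcode prefix maps to exactly one bucket.
import itertools

def get_seqs(nbases):
    return [''.join(p) for p in itertools.product('ACGT', repeat=nbases)]

# e.g. get_seqs(2) -> ['AA', 'AC', 'AG', 'AT', 'CA', ...], 16 prefixes in total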
Example #5
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for lo, hi in qname_ranges:
            if lo <= r.qname <= hi:
                qname = lo
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
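# Illustration only, with hypothetical boundary qnames: a standalone
# restatement of the range lookup above. Consecutive qnames form inclusive
# ranges, and any read whose name falls outside every range drops into the
# last bucket.
def _bucket_for(qname, qnames):
    for lo, hi in zip(qnames, qnames[1:]):
        if lo <= qname <= hi:
            return lo
    return qnames[-1]

# e.g. _bucket_for('K', ['A', 'M', 'Z']) == 'A' and _bucket_for('ZZZ', ['A', 'M', 'Z']) == 'Z'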
def join(args, outs, chunk_defs, chunk_outs):
    # Group the (chunk_def, chunk_out) pairs by index and sum the read counts.
    chunk_lists = [[], []]
    outs.total_reads = 0
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        chunk_lists[chunk_def.index].append((chunk_def, chunk_out))
        outs.total_reads += chunk_out.total_reads

    # Sanity check vs. position-sorted BAM
    with create_bam_infile(args.possorted_bam) as possorted_bam_in:
        assert possorted_bam_in.unmapped + possorted_bam_in.mapped == outs.total_reads

    # Concatenate the per-chunk barcode-sorted BAMs in prefix order.
    buckets = []
    for chunks in chunk_lists:
        chunks = sorted(chunks, key=lambda pair: pair[0].prefix)
        buckets += [chunk_out.bcsorted_bam for (_, chunk_out) in chunks]
    tk_bam.concatenate(outs.bcsorted_bam, buckets)

    print "%s indexing BAM file" % PROCESSED_BARCODE_TAG
    index = tenkit.bam.BamBCIndex(outs.bcsorted_bam)
    index.save_index()
    outs.bcsorted_bam_index = outs.bcsorted_bam + ".bxi"
    print "Wrote bx index to %s" % outs.bcsorted_bam_index
def main(args, outs):
    """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics.
    PCR duplicates have the same read1 start site and read2 start site.
    """
    args.coerce_strings()
    outs.coerce_strings()

    # Chunk output doesn't get indexed
    outs.fragments_index = None
    outs.index = None

    # Pull in prior likelihoods for barcodes
    raw_barcode_abundance = None
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
    if args.raw_barcode_counts is not None and barcode_whitelist is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        raw_barcode_abundance = {
            '{}-{}'.format(barcode, gem_group): count
            for gem_group, subdict in raw_counts.iteritems()
            for barcode, count in zip(barcode_whitelist, subdict['bc_counts'])
        }

    bam_in = create_bam_infile(args.input)
    bam_refs = bam_in.references

    bam_prefix, ext = os.path.splitext(outs.output)
    raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' +
                                     ext)

    frag_prefix, ext = os.path.splitext(outs.fragments)
    raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext)

    # only write CO line for one chunk, so we don't have duplicates after samtools merge
    if args.chunk_num == 0:
        COs = [
            '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)',
            '10x_bam_to_fastq_seqnames:R1,R3,I1,R2'
        ]
    else:
        COs = None

    bam_out, _ = tk_bam.create_bam_outfile(
        raw_bam_file,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "mark_duplicates", TENX_PRODUCT_NAME)
        ],
        cos=COs)
    fragments_out = open(raw_frag_file, 'w')
    bam_in.reset()

    # Ensure the summary key indicates what kind of dup marking was actually performed.
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)
    reference_manager = ReferenceManager(args.reference_path)
    summarizer = DupSummary(split_bcs=False,
                            lane_coordinate_system=lane_coord_sys,
                            output_bam=bam_out,
                            output_tsv=fragments_out,
                            ref=reference_manager,
                            bam_refs=bam_refs,
                            priors=raw_barcode_abundance)

    # Now broadcast the selected reads to the summarizers
    consumers = [summarizer.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close outfiles
    bam_out.close()
    fragments_out.close()

    # Feed the chunk barcode_counts data back to join()
    with open(outs.singlecell_mapping, 'w') as outfile:
        pickle.dump(summarizer.bc_counts, outfile)

    # Sort the output bam & tsv files
    sort_bam(raw_bam_file,
             outs.output,
             threads=martian.get_threads_allocation())
    sort_bed(raw_frag_file,
             outs.fragments,
             genome=reference_manager.fasta_index,
             threads=martian.get_threads_allocation(),
             leave_key=True)
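# broadcast() and DupSummary.read_consumer() belong to the surrounding pipeline
# and are not shown here. A minimal sketch of the fan-out pattern they appear
# to implement, assuming each consumer is a generator coroutine that receives
# reads via send(); the actual tenkit implementation may differ.
def broadcast(source, consumers):
    # Prime each consumer so it is paused at its first yield, ready to receive.
    for consumer in consumers:
        consumer.next()
    for read in source:
        for consumer in consumers:
            consumer.send(read)
    # Signal end-of-stream so consumers can flush and finish.
    for consumer in consumers:
        consumer.close()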
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    bam_in = create_bam_infile(args.align_chunk)

    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs", TENX_PRODUCT_NAME)
        ])

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx
            for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    if chunk['barcode'] is not None:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index'] is not None:
        sample_index_iter = tk_fasta.read_generator_fastq(
            open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    if chunk['trim'] is not None:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(
            chunk['trim']),
                                                  paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    # First read
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        # r1_tags aliases the shared tag list; r2_tags is a separate copy so the
        # read-specific trim tags appended below do not leak between mates.
        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if get_read_barcode(r) is not None:
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if get_read_barcode(r) is not None:
                    bam_out.write(r)
            else:
                bam_out.write(r)
        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert (reads_attached >= 2)

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
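# get_read_barcode() is used above (and in Example #4) but not defined in these
# examples. A minimal sketch, assuming it returns the corrected barcode stored
# under PROCESSED_BARCODE_TAG, with a caller-supplied fallback when the tag is
# absent; the real helper may differ.
def get_read_barcode(read, default=None):
    return dict(read.tags).get(PROCESSED_BARCODE_TAG, default)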
def main(args, outs):
    bam_in = create_bam_infile(args.chunk_bam)
    references = bam_in.references
    misc_sm = compute_basic_stats(bam_in, references)
    misc_sm.save(outs.misc_sm)