Example 1
    def test_targets(self):
        bam_bc_file = tk_test.in_path("namesort_test.bam")
        read_info_out = tk_test.out_path("read_info.h5")
        barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")

        targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
        with open(targets_filename, 'r') as targets_file:
            target_regions = tk_io.get_target_regions(targets_file)

        bam_in = tk_bam.create_bam_infile(bam_bc_file)
        r = compute_basic_stats(bam_in,
                                target_regions,
                                1000,
                                bam_in.references,
                                barcode_whitelist=barcode_whitelist,
                                read_h5_out=read_info_out)
        # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
        misc_sm, bc_sms = r

        nearest_targ_dists = bc_sms.get('nearest_targ_dists')
        max_target_dist = max(nearest_targ_dists.get_summarizer(60).dict.keys())
        min_target_dist = min(nearest_targ_dists.get_summarizer(60).dict.keys())

        self.assertEqual(min_target_dist, 130)
        self.assertEqual(max_target_dist, 10000)
Example 2
def split(args):
    if args.bcsorted_bam is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}
    # if args.bcsorted_bam

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}
    # if barcode_whitelist

    min_chunks = 4
    if len(barcode_whitelist) > 1e6:
        min_chunks = 8
    # if barcode_whitelist

    bam_in = tk_bam.create_bam_infile(args.bcsorted_bam)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 12
    # for c

    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
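
The chunk_split_func passed to tk_bam.chunk_bam_records above is not defined in this snippet. A minimal sketch of what such a key function could look like for a barcode-sorted BAM, assuming the barcode lives in a BX-style tag (hypothetical tag name; the real function may differ):

def chunk_split_func(read):
    # Key each record by its barcode tag so that chunk boundaries only
    # fall between barcodes, never inside one (the same idea as the
    # qname keying in Example 4, which keeps read pairs together).
    tags = dict(read.tags)
    return tags.get('BX')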
Example 3
    def test_attach_bcs(self):
        #  --align_input alignment_output.bam --barcode_input phix_I2.fastq --output test2.out --complete ~/c --stats ~/s
        args = {
            'barcode_whitelist' : IN_WHITELIST,
            'align_chunk' : IN_BAM,
            'barcode_chunk' : IN_I2,
            'sample_index_chunk' : IN_I1,
            'gem_group' : None,
            'paired_end' : True,
            'exclude_non_bc_reads' : False,
            'max_expected_bc_error': 0.75,
            'subsample_rate' : 1.0,
        }
        outs = { 'output': OUT_BAM }

        args = martian.Record(args)
        outs = martian.Record(outs)

        main(args, outs)

        # Get the barcodes
        barcode_whitelist = bc_utils.load_barcode_whitelist(IN_WHITELIST)

        # Ensure each read has a barcode
        out_bam = pysam.Samfile(OUT_BAM)
        for r in out_bam:
            tag_dict = { k:v for (k,v) in r.tags }
            tag_names = [ k for (k,v) in r.tags ]
            self.assertTrue(RAW_BARCODE_TAG in tag_names)

            if tag_dict[RAW_BARCODE_TAG] in barcode_whitelist:
                self.assertTrue(PROCESSED_BARCODE_TAG in tag_names)

            self.assertTrue(SAMPLE_INDEX_TAG in tag_names)


        # Make sure we put out the full BAM file
        out_len = len([ x for x in pysam.Samfile(OUT_BAM)])
        in_len  = len([ x for x in pysam.Samfile(IN_BAM)])
        self.assertEqual(out_len, in_len)


        def get_bc(r):
            tags = { k:v for (k,v) in r.tags }
            return tags[RAW_BARCODE_TAG]

        # Ensure each read pair has the same barcode
        out_bam = pysam.Samfile(OUT_BAM)
        reads = [ x for x in out_bam ]

        for (qname, pair_reads) in groupby(reads, lambda x: x.qname):
            bcs = set(crdna_io.get_read_barcode(r) for r in pair_reads)
            self.assertEqual(len(bcs), 1)
Example 4
def split(args):
    bam = pysam.Samfile(args.input, check_sq=False)

    min_chunks = 1
    if args.barcode_whitelist is not None:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        if len(barcode_whitelist) > 1e6:
            min_chunks = 4

    # Split to ensure read pairs always go together
    chunks = tk_bam.chunk_bam_records(bam,
                                      lambda x: x.qname,
                                      min_chunks=min_chunks)
    for chunk in chunks:
        chunk['n_chunks'] = len(chunks)
        chunk['__mem_gb'] = 3
    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
Example 5
def main_report_basic(args, outs):
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        n_chunk = int(math.ceil(float(n_mapped) / args.n_chunks))
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        with open(targets_filename, 'r') as targets_file:
            target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = compute_basic_stats(bam_slice,
                                            target_regions,
                                            n_chunk,
                                            references,
                                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
Example 6
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start':"0", 'chunk_end':"0"}]
        return {'chunks': chunk_defs}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start':"0", 'chunk_end':"0"}]
        return {'chunks': chunk_defs}

    min_chunks = 20
    if len(barcode_whitelist) > 1e6:
        min_chunks = 100

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      groupbybarcode,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 6}}
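
groupbybarcode is likewise not defined in this snippet; a plausible sketch is the same barcode-keyed function shown after Example 2 (hypothetical tag name):

def groupbybarcode(read):
    # All reads carrying one barcode land in the same chunk.
    return dict(read.tags).get('BX')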
Example 7
    def test_barcode_counts(self):
        bam_bc_file = tk_test.in_path("attach_bcs/attach_bcs_output.bam")
        read_info_out = tk_test.out_path("read_info.h5")
        barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")
        bam_in = tk_bam.create_bam_infile(bam_bc_file)
        r = compute_basic_stats(bam_in, {},
                                2000,
                                bam_in.references,
                                barcode_whitelist=barcode_whitelist,
                                read_h5_out=read_info_out)
        # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
        misc_sm, bc_sms = r

        # Look at the barcode results -- there should be a raw bc count for each read pair
        # n_raw_bcs = bc_table["count"].sum()
        n_reads = len([x for x in tk_bam.create_bam_infile(bam_bc_file)])

        # self.assertEqual(n_raw_bcs, n_reads / 2)

        # Load the per-cluster table -- there should be a row for each read pair
        read_info = tenkit.hdf5.read_data_frame(read_info_out)

        self.assertEqual(read_info.shape[0], n_reads / 2)
Example 8
def join(args, outs, chunk_defs, chunk_outs):
    ''' join the various outputs created by report_basic '''
    chunk_outs = list(chunk_outs)

    martian.log_info("combining misc summary managers")
    misc_sm_outs = [x.misc_sm for x in chunk_outs]
    misc_sm = combine_summary_managers(misc_sm_outs)

    martian.log_info("combining nested summary managers")
    qual_sms_outs = [x.qual_sms for x in chunk_outs]
    qual_sms = combine_nested_summary_managers(qual_sms_outs)

    martian.log_info("computing summary metrics")
    compute_summary_metrics(misc_sm, qual_sms)

    metrics = misc_sm.get_summarizer("metrics")
    if metrics["unmapped_fract"] > 0.90:
        martian.exit("%.1f %% of reads were not mapped to the supplied "\
            "reference genome. This is likely the consequence of a sample "\
            "mixup or very low sequencing quality. Further execution will "\
            "be halted." % (metrics["unmapped_fract"]*100))

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    # barcode hdf5
    if outs.barcode_counts:
        bc_table = summarize_barcode_data(misc_sm, qual_sms, barcode_whitelist)
        tenkit.hdf5.write_data_frame(outs.barcode_counts, bc_table)

    # insert sizes output
    insert_size_dists = {}
    for qual in INSERT_MAPQ_CUTOFFS:
        insert_size_dists[qual] = qual_sms['insert_size_dists'].get_summarizer(
            qual).dict
    with open(outs.insert_sizes, 'w') as insert_sizes_output_file:
        insert_sizes_output_file.write(
            tenkit.safe_json.safe_jsonify(insert_size_dists) + '\n')

    # target distances
    nearest_targ_dists = {}
    for qual in TARGET_MAPQ_CUTOFFS:
        nearest_targ_dists[qual] = qual_sms[
            'nearest_targ_dists'].get_summarizer(qual).dict
    with open(outs.target_dists, 'w') as target_dists_output_file:
        target_dists_output_file.write(
            tenkit.safe_json.safe_jsonify(nearest_targ_dists))

    # overall summary metrics
    with open(outs.summary, 'w') as summary_output_file:
        summary_output_file.write(
            tenkit.safe_json.safe_jsonify(misc_sm.get_summarizer('metrics').dict,
                                          pretty=True))

    # mapq counts
    with open(outs.mapq_counts, 'w') as mapq_output_file:
        mapq_output_file.write(
            tenkit.safe_json.safe_jsonify(
                misc_sm.get_summarizer('mapq_counts').dict))

    # logging
    print tenkit.safe_json.safe_jsonify(misc_sm.get_summarizer('metrics').dict,
                                        pretty=True)
Example 9
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []

    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)

    outs.chunks = final_chunks
    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}

    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group, {
            'bad_bc_count': 0,
            'bc_counts': None
        })

        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    total_counts = 0
    total_errors = 0
    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hard-coded bail-out if the barcode error rate is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > 0.97:
        martian.exit(
            "Extremely high rate of incorrect barcodes observed (%.2f %%). Check that input is 10x Chromium data, and that there are no missing cycles in the first 16bp of Read 1."
            % (bc_error_rate * 100.0))

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # get BC counts histogram
        # for now, just sum over all gem groups
        bc_seq = sorted(
            list(bc_utils.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence,
         gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)
        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {
            lot: count
            for lot, count in gelbead_lot_counts.items() if count > 0
        }

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" %
                         (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
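
Examples 9, 10 and 11 share one convention: per-GEM-group barcode counts are a flat array ordered by the sorted whitelist. A minimal sketch of that contract, assuming load_barcode_whitelist returns a set-like collection of barcode strings (which the len(), sorted() and membership tests above imply):

wl = sorted(bc_utils.load_barcode_whitelist("737K-april-2014"))
bc_idx = {bc: i for (i, bc) in enumerate(wl)}  # barcode -> array index
# bc_counts[bc_idx[bc]] is the count for barcode bc, and
# zip(wl, bc_counts) rebuilds the histogram exactly as the
# lot-detection block above does.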
Example 10
def main(args, outs):
    """ Trim the reads in a series of fasta files """

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length

    subsample_rate = chunk['subsample_rate']

    # BC config -- BC come from separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have bc in read, use that & ignore a separate BC read
    if chunk.get('bc_in_read') is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit(
                "bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True

    have_sample_index = chunk['sample_index'] is not None
    have_trim1 = args.read1_trim_length > 0
    have_trim2 = args.read2_trim_length > 0

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    # counter for sub-chunked files
    file_number = 1

    # open the available read files and make the appropriate iterators
    if interleaved:
        read_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    else:
        if have_read2:
            read1_in = openfq(chunk['read1'])
            read1_iter = tk_fasta.read_generator_fastq(read1_in)

            read2_in = openfq(chunk['read2'])
            read2_iter = tk_fasta.read_generator_fastq(read2_in)

            read_iter = itertools.imap(
                lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter,
                read2_iter)
        else:
            read1_in = openfq(chunk['read1'])
            read_iter = tk_fasta.read_generator_fastq(read1_in)

    # open read file
    read_name = output_directory + "/read" + str(file_number) + ".fastq"
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # If there's no whitelist, skip barcode counting
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(
            list(bc_utils.load_barcode_whitelist(args.barcode_whitelist)))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = output_directory + "/BC" + str(file_number) + ".fastq"
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]
        if bc_in_fastq:
            bc_in = openfq(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = output_directory + "/SI" + str(file_number) + ".fastq"
        out_si_fastq = open(si_name, 'w')
        si_in = openfq(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # open trim_read file if there is one
    if have_trim1 or have_trim2:
        trim_name = output_directory + "/TRIM" + str(file_number) + ".fastq"
        out_trim_fastq = open(trim_name, 'w')
        trim_names = [trim_name]
    else:
        trim_names = [None]

    # loop through reads
    read_num = 0
    for read, barcode_read, sample_index_read in itertools.izip(
            read_iter, bc_iter, si_iter):
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        if len(seq1) != len(qual1):
            martian.exit(
                "Invalid FASTQ file: read and qual lengths don't match")

        new_seq1 = seq1[read1_trim:]
        trim_seq1 = seq1[:read1_trim]
        new_qual1 = qual1[read1_trim:]
        trim_qual1 = qual1[:read1_trim]
        if paired:
            if len(seq2) != len(qual2):
                martian.exit(
                    "Invalid FASTQ file: read and qual lengths don't match")
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]
            trim_seq2 = seq2[:read2_trim]
            trim_qual2 = qual2[:read2_trim]

        # Get BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])
            trim_seq1 = trim_seq1[bc_length:]
            trim_qual1 = trim_qual1[bc_length:]

        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])
            trim_seq2 = trim_seq2[bc_length:]
            trim_qual2 = trim_qual2[bc_length:]

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1
            read_name = output_directory + "/read" + str(
                file_number) + ".fastq"
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = output_directory + "/BC" + str(
                    file_number) + ".fastq"
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_trim1 or have_trim2:
                trim_name = output_directory + "/TRIM" + str(
                    file_number) + ".fastq"
                out_trim_fastq.close()
                out_trim_fastq = open(trim_name, 'w')
                trim_names.append(trim_name)
            else:
                trim_names.append(None)

            if have_sample_index:
                si_name = output_directory + "/SI" + str(
                    file_number) + ".fastq"
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                barcode_qual = barcode_qual[::-1]  # reverse the qual string
            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if have_trim1 or have_trim2:
            tk_fasta.write_read_fastq(out_trim_fastq, name1, trim_seq1,
                                      trim_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2,
                                      new_qual2)
            if have_trim1 or have_trim2:
                tk_fasta.write_read_fastq(out_trim_fastq, name2, trim_seq2,
                                          trim_qual2)

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)
    if have_sample_index:
        out_si_fastq.close()

    if have_trim1 or have_trim2:
        out_trim_fastq.close()

    out_read_fastq.close()

    chunks = []
    for (r, bc, si, trim) in zip(read_names, bc_names, si_names, trim_names):
        new_chunk = {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'trim': trim,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        chunks.append(new_chunk)

    outs.chunks = chunks
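
openfq, used throughout this example, is another helper that is not shown. A minimal sketch, assuming it mirrors the open_maybe_gzip helper defined inline in Example 11 below:

import gzip

def openfq(fn):
    # Hypothetical stand-in: open gzipped FASTQ transparently,
    # fall back to plain text otherwise.
    if fn[-2:] == "gz":
        return gzip.open(fn)
    return open(fn)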
Example 11
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.outputs = []

    chunk = args.chunk

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG)

    # Only add the CO header comments to the first chunk; otherwise a later
    # samtools merge would duplicate them and can fail with:
    # '[finish_merged_header] Output header text too long'
    if args.chunk_index > 0:
        COs = None
    elif chunk['trim']:
        COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']
    else:
        COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']

    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output, None, None, template=bam_in,
        pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")],
        cos=COs)

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        with open(args.bc_counts, 'r') as f:
            counts = json.load(f)
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
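        # e.g. raw counts [9, 0, 1] -> pseudo-counts [10., 1., 2.]
        #      -> prior [10/13, 1/13, 2/13]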
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['trim']:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['trim']), paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(tags)

        if trim_info:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.qname == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if read_name is not None:
                assert(read.qname == read_name)

            if reads_to_attach and (read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if crdna_io.get_read_barcode(r) is not None:
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if crdna_io.get_read_barcode(r) is not None:
                    bam_out.write(r)
            else:
                bam_out.write(r)

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()