def test_targets(self):
    bam_bc_file = tk_test.in_path("namesort_test.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")

    targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
    targets_file = open(targets_filename, 'r')
    target_regions = tk_io.get_target_regions(targets_file)

    bam_in = tk_bam.create_bam_infile(bam_bc_file)

    r = compute_basic_stats(bam_in, target_regions, 1000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    misc_sm, bc_sms = r

    nearest_targ_dists = bc_sms.get('nearest_targ_dists')
    max_target_dist = max(nearest_targ_dists.get_summarizer(60).dict.keys())
    min_target_dist = min(nearest_targ_dists.get_summarizer(60).dict.keys())

    self.assertEqual(min_target_dist, 130)
    self.assertEqual(max_target_dist, 10000)
def split(args):
    if args.bcsorted_bam is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    # Some R&D barcode sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    min_chunks = 4
    if len(barcode_whitelist) > 1e6:
        min_chunks = 8

    bam_in = tk_bam.create_bam_infile(args.bcsorted_bam)
    chunks = tk_bam.chunk_bam_records(bam_in, chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 12

    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
def test_attach_bcs(self):
    # --align_input alignment_output.bam --barcode_input phix_I2.fastq
    # --output test2.out --complete ~/c --stats ~/s
    args = {
        'barcode_whitelist':     IN_WHITELIST,
        'align_chunk':           IN_BAM,
        'barcode_chunk':         IN_I2,
        'sample_index_chunk':    IN_I1,
        'gem_group':             None,
        'paired_end':            True,
        'exclude_non_bc_reads':  False,
        'max_expected_bc_error': 0.75,
        'subsample_rate':        1.0,
    }
    outs = {'output': OUT_BAM}

    args = martian.Record(args)
    outs = martian.Record(outs)

    main(args, outs)

    # Get the barcodes
    barcode_whitelist = bc_utils.load_barcode_whitelist(IN_WHITELIST)

    # Ensure each read has a barcode
    out_bam = pysam.Samfile(OUT_BAM)
    for r in out_bam:
        tag_dict = {k: v for (k, v) in r.tags}
        tag_names = [k for (k, v) in r.tags]
        self.assertTrue(RAW_BARCODE_TAG in tag_names)

        if tag_dict[RAW_BARCODE_TAG] in barcode_whitelist:
            self.assertTrue(PROCESSED_BARCODE_TAG in tag_names)

        self.assertTrue(SAMPLE_INDEX_TAG in tag_names)

    # Make sure we put out the full BAM file
    out_len = len([x for x in pysam.Samfile(OUT_BAM)])
    in_len = len([x for x in pysam.Samfile(IN_BAM)])
    self.assertEqual(out_len, in_len)

    def get_bc(r):
        tags = {k: v for (k, v) in r.tags}
        return tags[RAW_BARCODE_TAG]

    # Ensure each read pair has the same barcode
    out_bam = pysam.Samfile(OUT_BAM)
    reads = [x for x in out_bam]
    for (qname, pair_reads) in groupby(reads, lambda x: x.qname):
        bcs = set(crdna_io.get_read_barcode(r) for r in pair_reads)
        self.assertEqual(len(bcs), 1)
def split(args):
    bam = pysam.Samfile(args.input, check_sq=False)

    min_chunks = 1
    if args.barcode_whitelist is not None:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
        if len(barcode_whitelist) > 1e6:
            min_chunks = 4

    # Split on qname to ensure read pairs always go together
    chunks = tk_bam.chunk_bam_records(bam, lambda x: x.qname,
                                      min_chunks=min_chunks)
    for chunk in chunks:
        chunk['n_chunks'] = len(chunks)
        chunk['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
def main_report_basic(args, outs):
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        n_chunk = math.ceil(n_mapped / args.n_chunks)
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        targets_file = open(targets_filename, 'r')
        target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = compute_basic_stats(bam_slice, target_regions,
                                            n_chunk, references,
                                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0"}]
        return {'chunks': chunk_defs}

    min_chunks = 20
    if len(barcode_whitelist) > 1e6:
        min_chunks = 100

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, groupbybarcode,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for c in chunks:
        c['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 6}}
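# NOTE: `groupbybarcode` (like `chunk_split_func` in the bc-sorted split
# earlier) is referenced but not defined in this excerpt. The sketch below is
# a hypothetical reconstruction, not the pipeline's actual implementation:
# tk_bam.chunk_bam_records uses the key function to avoid placing a chunk
# boundary inside a run of records that share a key, so keying on the barcode
# keeps all reads of one barcode together in a barcode-sorted BAM.
def groupbybarcode(read):
    # Fall back to a single shared bucket for reads without a barcode tag.
    bc = crdna_io.get_read_barcode(read)
    return bc if bc is not None else ''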
def test_barcode_counts(self):
    bam_bc_file = tk_test.in_path("attach_bcs/attach_bcs_output.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")

    bam_in = tk_bam.create_bam_infile(bam_bc_file)
    r = compute_basic_stats(bam_in, {}, 2000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    misc_sm, bc_sms = r

    n_reads = len([x for x in tk_bam.create_bam_infile(bam_bc_file)])

    # Load the per-cluster table -- there should be a row for each read pair
    read_info = tenkit.hdf5.read_data_frame(read_info_out)
    self.assertEqual(read_info.shape[0], n_reads / 2)
def join(args, outs, chunk_defs, chunk_outs):
    '''Join the various outputs created by report_basic.'''
    chunk_outs = list(chunk_outs)

    martian.log_info("combining misc summary managers")
    misc_sm_outs = [x.misc_sm for x in chunk_outs]
    misc_sm = combine_summary_managers(misc_sm_outs)

    martian.log_info("combining nested summary managers")
    qual_sms_outs = [x.qual_sms for x in chunk_outs]
    qual_sms = combine_nested_summary_managers(qual_sms_outs)

    martian.log_info("computing summary metrics")
    compute_summary_metrics(misc_sm, qual_sms)

    metrics = misc_sm.get_summarizer("metrics")
    if metrics["unmapped_fract"] > 0.90:
        martian.exit("%.1f %% of reads were not mapped to the supplied "
                     "reference genome. This is likely the consequence of a "
                     "sample mixup or very low sequencing quality. Further "
                     "execution will be halted." %
                     (metrics["unmapped_fract"] * 100))

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    else:
        barcode_whitelist = None

    # barcode hdf5
    if outs.barcode_counts:
        bc_table = summarize_barcode_data(misc_sm, qual_sms, barcode_whitelist)
        tenkit.hdf5.write_data_frame(outs.barcode_counts, bc_table)

    # insert sizes output
    insert_size_dists = {}
    for qual in INSERT_MAPQ_CUTOFFS:
        insert_size_dists[qual] = \
            qual_sms['insert_size_dists'].get_summarizer(qual).dict
    with open(outs.insert_sizes, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(insert_size_dists) + '\n')

    # target distances
    nearest_targ_dists = {}
    for qual in TARGET_MAPQ_CUTOFFS:
        nearest_targ_dists[qual] = \
            qual_sms['nearest_targ_dists'].get_summarizer(qual).dict
    with open(outs.target_dists, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(nearest_targ_dists))

    # overall summary metrics
    with open(outs.summary, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(
            misc_sm.get_summarizer('metrics').dict, pretty=True))

    # mapq counts
    with open(outs.mapq_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(
            misc_sm.get_summarizer('mapq_counts').dict))

    # logging
    print tenkit.safe_json.safe_jsonify(
        misc_sm.get_summarizer('metrics').dict, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []
    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)
    outs.chunks = final_chunks

    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts if there's no whitelist or no actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    result = {}
    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is None:
            continue

        with open(c_out.bc_counts) as f:
            r = json.load(f)

        gg_result = result.setdefault(gem_group,
                                      {'bad_bc_count': 0, 'bc_counts': None})
        gg_result['bad_bc_count'] += r['bad_bc_count']

        if gg_result['bc_counts'] is None:
            gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
        else:
            gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    total_counts = 0
    total_errors = 0
    for gg in result.keys():
        rgg = result[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hardcoded bail-out if the rate of incorrect barcodes is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > 0.97:
        martian.exit("Extremely high rate of incorrect barcodes observed "
                     "(%.2f %%). Check that input is 10x Chromium data, and "
                     "that there are no missing cycles in the first 16bp of "
                     "Read 1." % (bc_error_rate * 100.0))

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist)
    if lot_map is not None:
        # get BC counts histogram; for now, just sum over all gem groups
        bc_seq = sorted(list(bc_utils.load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in result.values()], axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence, gelbead_lot_counts) = \
            identify_gelbead_lot(bc_hist, lot_map)

        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {lot: count
                                      for lot, count in gelbead_lot_counts.items()
                                      if count > 0}

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" %
                         (gelbead_lot, gelbead_lot_confidence))

    with open(outs.lot_info, 'w') as f:
        tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    with open(outs.bc_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(result, f)
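# NOTE: `identify_gelbead_lot` is not defined in this excerpt. Below is a
# minimal sketch of one plausible implementation, assuming `lot_map` maps a
# lot name to the set of whitelist barcodes specific to that lot, and matching
# the (lot, reason-if-None, per-lot counts) return shape the caller above
# unpacks. The `min_fraction` threshold is a hypothetical parameter chosen for
# illustration only.
def identify_gelbead_lot(bc_hist, lot_map, min_fraction=0.95):
    # Score each lot by the total reads observed on its lot-specific barcodes.
    lot_counts = {lot: sum(bc_hist.get(bc, 0) for bc in bcs)
                  for lot, bcs in lot_map.items()}
    total = sum(lot_counts.values())
    if total == 0:
        return None, "no reads on lot-specific barcodes", lot_counts
    best_lot, best_count = max(lot_counts.items(), key=lambda kv: kv[1])
    # Only call a lot when it clearly dominates the lot-specific counts.
    if float(best_count) / total < min_fraction:
        return None, "ambiguous lot-specific barcode counts", lot_counts
    return best_lot, None, lot_counts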
def main(args, outs): """ Trim the reads in a series of fasta files """ # Set a fixed random seed to eliminate noise in metrics random.seed(0) chunk = args.chunk interleaved = chunk['reads_interleaved'] have_read2 = chunk['read2'] is not None paired = interleaved or have_read2 read1_trim = args.read1_trim_length read2_trim = args.read2_trim_length subsample_rate = chunk['subsample_rate'] # BC config -- BC come from separate fastq, or are embedded in R1 or R2 have_barcode = False bc_in_read1 = False bc_in_read2 = False bc_in_fastq = False # If we have bc in read, use that & ignore a separate BC read if chunk.get('bc_in_read', None) is not None and chunk.get('bc_length', 0) > 0: have_barcode = True bc_length = chunk['bc_length'] if chunk['bc_in_read'] == 1: bc_in_read1 = True read1_trim += bc_length elif chunk['bc_in_read'] == 2: bc_in_read2 = True read2_trim += bc_length else: martian.exit( "bc_in_read configuration incorrect -- read must be 1 or 2") # Otherwise use the BC file elif chunk['barcode'] is not None: have_barcode = True bc_in_fastq = True have_sample_index = chunk['sample_index'] is not None have_trim1 = args.read1_trim_length > 0 have_trim2 = args.read2_trim_length > 0 output_directory = os.path.dirname(os.path.realpath(outs.placeholder)) max_read_num = args.max_read_num # counter for sub-chunked files file_number = 1 # open the available read files and make the appropriate iterators if interleaved: read_in = openfq(chunk['read1']) read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True) else: if have_read2: read1_in = openfq(chunk['read1']) read1_iter = tk_fasta.read_generator_fastq(read1_in) read2_in = openfq(chunk['read2']) read2_iter = tk_fasta.read_generator_fastq(read2_in) read_iter = itertools.imap( lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter, read2_iter) else: read1_in = openfq(chunk['read1']) read_iter = tk_fasta.read_generator_fastq(read1_in) # open read file read_name = output_directory + "/read" + str(file_number) + ".fastq" read_names = [read_name] out_read_fastq = open(read_name, 'w') # Bail out if there's no barcodes or whitelist if args.barcode_whitelist is None: outs.bc_counts = None bc_idx = None else: barcode_whitelist = sorted( list(bc_utils.load_barcode_whitelist(args.barcode_whitelist))) bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)} bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32) bad_count = 0 # open barcode file if there is one if have_barcode: bc_name = output_directory + "/BC" + str(file_number) + ".fastq" out_bc_fastq = open(bc_name, 'w') bc_names = [bc_name] if bc_in_fastq: bc_in = openfq(chunk['barcode']) bc_iter = tk_fasta.read_generator_fastq(bc_in) elif bc_in_read1 or bc_in_read2: # BC in read -- have output file but no input file bc_iter = itertools.repeat(None) else: bc_iter = itertools.repeat(None) bc_names = [None] outs.bc_counts = None # open sample_index file if there is one if have_sample_index: si_name = output_directory + "/SI" + str(file_number) + ".fastq" out_si_fastq = open(si_name, 'w') si_in = openfq(chunk['sample_index']) si_iter = tk_fasta.read_generator_fastq(si_in) si_names = [si_name] else: si_iter = itertools.repeat(None) si_names = [None] # open trim_read file if there is one if have_trim1 or have_trim2: trim_name = output_directory + "/TRIM" + str(file_number) + ".fastq" out_trim_fastq = open(trim_name, 'w') trim_names = [trim_name] else: trim_names = [None] # loop through reads read_num = 0 for read, barcode_read, sample_index_read in itertools.izip( read_iter, 
bc_iter, si_iter): if read_num > 0 and random.random() > subsample_rate: continue if paired: (name1, seq1, qual1, name2, seq2, qual2) = read else: (name1, seq1, qual1) = read if len(seq1) != len(qual1): martian.exit( "Invalid FASTQ file: read and qual lengths don't match") new_seq1 = seq1[read1_trim:] trim_seq1 = seq1[:read1_trim] new_qual1 = qual1[read1_trim:] trim_qual1 = qual1[:read1_trim] if paired: if len(seq1) != len(qual1): martian.exit( "Invalid FASTQ file: read and qual lengths don't match") new_seq2 = seq2[read2_trim:] new_qual2 = qual2[read2_trim:] trim_seq2 = seq2[:read2_trim] trim_qual2 = qual2[:read2_trim] # Get BC sequence out of the read, for BC-in-read schemes if bc_in_read1: barcode_read = (name1, seq1[:bc_length], qual1[:bc_length]) trim_seq1 = trim_seq1[bc_length:] trim_qual1 = trim_qual1[bc_length:] if bc_in_read2: barcode_read = (name2, seq2[:bc_length], qual2[:bc_length]) trim_seq2 = trim_seq2[bc_length:] trim_qual2 = trim_qual2[bc_length:] read_num += 1 if read_num > max_read_num: read_num = 1 file_number += 1 read_name = output_directory + "/read" + str( file_number) + ".fastq" out_read_fastq.close() out_read_fastq = open(read_name, 'w') read_names.append(read_name) if have_barcode: bc_name = output_directory + "/BC" + str( file_number) + ".fastq" out_bc_fastq.close() out_bc_fastq = open(bc_name, 'w') bc_names.append(bc_name) else: bc_names.append(None) if have_trim1 or have_trim2: trim_name = output_directory + "/TRIM" + str( file_number) + ".fastq" out_trim_fastq.close() out_trim_fastq = open(trim_name, 'w') trim_names.append(trim_name) else: trim_names.append(None) if have_sample_index: si_name = output_directory + "/SI" + str( file_number) + ".fastq" out_si_fastq.close() out_si_fastq = open(si_name, 'w') si_names.append(si_name) else: si_names.append(None) if have_barcode: barcode_seq = barcode_read[1] barcode_qual = barcode_read[2] if chunk['barcode_reverse_complement']: barcode_seq = tk_seq.get_rev_comp(barcode_seq) barcode_qual = barcode_qual[:: -1] # obscure way to reverse string if bc_idx is not None: idx = bc_idx.get(barcode_seq) if idx is not None: bc_counts[idx] += 1 else: bad_count += 1 tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0], barcode_seq, barcode_qual) if have_sample_index: tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0], sample_index_read[1], sample_index_read[2]) tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1) if have_trim1 or have_trim2: tk_fasta.write_read_fastq(out_trim_fastq, name1, trim_seq1, trim_qual1) if paired: tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2, new_qual2) if have_trim1 or have_trim2: tk_fasta.write_read_fastq(out_trim_fastq, name2, trim_seq2, trim_qual2) if have_barcode: out_bc_fastq.close() # Only emit BC counts if we had a whitelist if outs.bc_counts is not None: result = {} result['bad_bc_count'] = bad_count result['bc_counts'] = list(bc_counts) with open(outs.bc_counts, 'w') as bc_counts_out: tenkit.safe_json.dump_numpy(result, bc_counts_out) if have_sample_index: out_si_fastq.close() if have_trim1 or have_trim2: out_trim_fastq.close() out_read_fastq.close() chunks = [] for (r, bc, si, trim) in zip(read_names, bc_names, si_names, trim_names): new_chunk = { 'read1': r, 'read2': None, 'barcode': bc, 'sample_index': si, 'trim': trim, 'barcode_reverse_complement': False, 'reads_interleaved': have_read2 or interleaved, 'gem_group': chunk['gem_group'], 'read_group': chunk['read_group'] } chunks.append(new_chunk) outs.chunks = chunks
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ # this silences a weird non-failure in --strict=error mode # TODO(lhepler): remove this when martian upstream handles this itself outs.outputs = [] chunk = args.chunk bam_in = tk_bam.create_bam_infile(args.align_chunk) bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG) # only comment the first chunk, otherwise later merge will duplicate the comments and could lead to: # samtools merge ... : '[finish_merged_header] Output header text too long' if args.chunk_index > 0: COs = None elif chunk['trim']: COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'] else: COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'] bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")], cos = COs) gp_tagger = GlobalFivePrimePosTagger(bam_in) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) def open_maybe_gzip(fn): if fn[-2:] == "gz": return gzip.open(fn) else: return open(fn) if chunk['barcode']: processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['trim']: trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['trim']), paired_end=True) else: trim_iter = itertools.repeat(None) if chunk['sample_index']: sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter) # First read read = bam_in.next() # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" %
                                     (si_read_name, read_name))
                assert si_read_name.split()[0] == read_name
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(tags)

        if trim_info:
            (trim1_read_name, trim1_seq, trim1_qual,
             trim2_read_name, trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.qname == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if read_name is not None:
                assert read.qname == read_name

            # flush the previous qname group once a new qname shows up
            if reads_to_attach and (read.query_name != reads_to_attach[0].query_name
                                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if crdna_io.get_read_barcode(r) is not None:
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()
            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if crdna_io.get_read_barcode(r) is not None:
                    bam_out.write(r)
            else:
                bam_out.write(r)

        # We may have more than 2 reads if there was a secondary alignment,
        # but fewer than 2 means something went wrong
        assert reads_attached >= 2

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
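# NOTE: `get_raw_processed_barcodes` is defined elsewhere. The helper below is
# a hypothetical sketch of the correction step it presumably performs with
# `wl_idxs` and `bc_dist`: for a barcode missing from the whitelist, try every
# single-base substitution, weight each whitelist hit by the count-based prior
# times the error probability implied by the base's quality, and accept the
# best candidate only if its posterior clears the confidence threshold. Names
# and details are illustrative, not the pipeline's actual implementation.
def correct_barcode_sketch(seq, qual, wl_idxs, bc_dist, confidence_threshold):
    if seq in wl_idxs:
        return seq  # already a whitelist barcode

    candidates = []  # (posterior weight, candidate sequence)
    total = 0.0
    for pos in xrange(len(seq)):
        qv = ord(qual[pos]) - 33  # Phred+33 quality of the substituted base
        p_err = min(0.5, 10.0 ** (-qv / 10.0))
        for base in 'ACGT':
            if base == seq[pos]:
                continue
            cand = seq[:pos] + base + seq[pos + 1:]
            idx = wl_idxs.get(cand)
            if idx is not None:
                w = bc_dist[idx] * p_err
                candidates.append((w, cand))
                total += w

    if not candidates:
        return None
    best_w, best_cand = max(candidates)
    if total > 0 and best_w / total >= confidence_threshold:
        return best_cand
    return None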