def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_index = args.chunk_index

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(chunk_index == 0))

    bams_out = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_{}.bam".format(prefix))
        bams_out[prefix], _ = tk_bam.create_bam_outfile(filename, None, None, template=template)

    non_bc_bam = martian.make_path("bc_{}.bam".format(None))
    non_bc_bam_out, _ = tk_bam.create_bam_outfile(non_bc_bam, None, None, template=template)

    for read in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            non_bc_bam_out.write(read)
        else:
            prefix = barcode[:args.nbases]
            bams_out[prefix].write(read)

    bam_in.close()

    non_bc_bam_out.close()
    sort_bam(non_bc_bam)
    outs.non_bc_bams = [non_bc_bam]

    outs.buckets = {}
    for prefix in prefixes:
        filename = bams_out[prefix].filename
        bams_out[prefix].close()
        sort_bam(filename)
        outs.buckets[prefix] = filename
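# The stage above fans reads out over every possible barcode prefix of length
# args.nbases. get_seqs is defined elsewhere in the codebase; a minimal sketch
# of what it plausibly does (an assumption for illustration, not the pipeline's
# actual implementation):
def get_seqs_sketch(nbases):
    """Enumerate all 4**nbases DNA sequences of the given length."""
    import itertools
    return [''.join(bases) for bases in itertools.product('ACGT', repeat=nbases)]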
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads, min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
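# bc_sort_key (and the bc_and_qname_sort_key used by the merge stages further
# down) come from the same codebase and are not shown here. A plausible sketch,
# assuming reads are ordered by barcode and then by query name (hypothetical,
# not the actual definitions):
def bc_sort_key_sketch(read):
    # Reads without a barcode sort first (empty string), then lexicographically
    # by barcode and query name
    return (tk_io.get_read_barcode(read) or '', read.qname)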
def process_alignments(genome_bam_file, trimmed_bam_file, out_bam_file, bam_comments,
                       reporter, gene_index, star_index, args):
    in_genome_bam = tk_bam.create_bam_infile(genome_bam_file)
    bam_open_time = time.time()

    in_trimmed_bam = tk_bam.create_bam_infile(trimmed_bam_file) if trimmed_bam_file else None

    out_bam, _ = tk_bam.create_bam_outfile(out_bam_file, None, None,
                                           template=in_genome_bam, cos=bam_comments)

    bam_iter = cr_utils.iter_by_qname(in_genome_bam, in_trimmed_bam)

    num_alignments = 0
    read_consume_time = None
    for qname, reads_iter, trimmed_iter in bam_iter:
        reads = list(reads_iter)

        if read_consume_time is None:
            read_consume_time = time.time()
            # if streaming, verify we're actually streaming
            print "Time to first read: %f seconds" % (read_consume_time - bam_open_time)

        num_alignments += len(reads)
        trimmed = list(trimmed_iter)
        trimmed_read = trimmed[0] if len(trimmed) > 0 else None

        for read in process_qname(qname, reads, trimmed_read, reporter, gene_index, star_index, args):
            out_bam.write(read)

    in_genome_bam.close()
    if in_trimmed_bam:
        in_trimmed_bam.close()
    out_bam.close()

    return num_alignments
def update_mapqs(bamfilename, outfile, reference_path):
    bam = tk_bam.create_bam_infile(bamfilename)
    bam_out, _ = tk_bam.create_bam_outfile(outfile, None, None, template=bam)
    variant_heap = []
    variant_map = {}
    read_heap = []
    primary_contigs = tk_reference.load_primary_contigs(reference_path)
    for read in bam:
        # Stash the original MAPQ in the OM tag
        tags = [(key, value) for (key, value) in dict(read.tags).iteritems()]
        tags.append(('OM', int(read.mapq)))
        read.tags = tags
        if bam.references[read.tid] not in primary_contigs:
            # Reads off the primary contigs pass straight through, minus the AC/XC tags
            read.tags = [(key, value) for (key, value) in read.tags
                         if key != 'AC' and key != 'XC']
            bam_out.write(read)
            continue
        add_variant_counts(read, variant_heap, variant_map)
        heapq.heappush(read_heap, (read.pos, read))
        update_updatable(read_heap, read.pos, variant_heap, variant_map, bam_out)
    # Flush everything left on the heaps using a position past any real coordinate
    update_updatable(read_heap, 500000000, variant_heap, variant_map, bam_out, empty_me=True)
    bam_out.close()
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for qnames in qname_ranges:
            if qnames[0] <= r.qname <= qnames[1]:
                qname = qnames[0]
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input_bam)
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None, template=in_bam)

    cell_bcs = set(cr_utils.load_barcode_tsv(args.cell_barcodes))

    for (tid, pos), reads_iter in itertools.groupby(in_bam, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cell_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq(args.align)):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                out_bam.write(read)
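# The groupby above assumes the input BAM is position-sorted, so records with
# the same (tid, pos) arrive consecutively. cr_utils.pos_sort_key is not shown
# here; a minimal sketch of the key it plausibly computes (an assumption):
def pos_sort_key_sketch(read):
    # Group alignments by reference id and leftmost coordinate
    return (read.tid, read.pos)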
def main(args, outs):
    outs.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in,
                                           pgs=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                     "sort_reads_by_bc"))
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()
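# merge_by_key streams the pre-sorted per-chunk buckets into one sorted output
# (it is also used below with bc_and_qname_sort_key). A minimal k-way merge
# sketch using heapq.merge, assuming every input bucket is already sorted by
# the same key (a hypothetical reconstruction, not the tenkit implementation):
import heapq

def merge_by_key_sketch(bam_filenames, key_func, bam_out):
    total_reads = 0
    bams = [tk_bam.create_bam_infile(fn) for fn in bam_filenames]
    # Decorate each record with its key so heapq.merge can compare the streams
    streams = [((key_func(read), read) for read in bam) for bam in bams]
    for _, read in heapq.merge(*streams):
        bam_out.write(read)
        total_reads += 1
    for bam in bams:
        bam.close()
    return total_reads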
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None, template=in_bam)

    chroms = in_bam.references
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
                                  chroms=chroms)

    for (gg, bc, gene_ids), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.barcode_sort_key):
        # Ignore reads w/o a valid barcode, unmapped reads, and reads that map to more than one gene
        if bc is None or gg is None or gene_ids is None or len(gene_ids) != 1:
            for read in reads_iter:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue

        reads = list(reads_iter)
        gene_id = gene_ids[0]

        # Count cDNA PCR duplicates with uncorrected UMIs
        dupe_key_umi_counts = mark_dupes(bc, gene_id, reads, args,
                                         cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
                                         cr_utils.cdna_pcr_dupe_func, reporter)

        # Record UMI corrections
        umi_corrections = correct_umis(dupe_key_umi_counts)

        # Mark duplicates for cDNA PCR duplicates with corrected UMIs
        mark_dupes(bc, gene_id, reads, args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func, reporter,
                   corrected_dupe_keys=umi_corrections, out_bam=out_bam)

        # Count duplicates for SI PCR duplicates with uncorrected UMIs
        mark_dupes(bc, gene_id, reads, args,
                   cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func, reporter)

    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    outs.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_and_qname_sort_key, bam_out)
    bam_out.close()
def main(args, outs):
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(list(set(lib['gem_group'] for lib in library_info)))

    # Define buckets
    bucket_names = []
    prefixes = cr_utils.get_seqs(args.nbases)
    for gg in gem_groups:
        for prefix in prefixes:
            bucket_names.append('%s-%d' % (prefix, gg))
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for bucket_name in bucket_names:
        filename = martian.make_path("bc-%s.bam" % bucket_name)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in,
                                               rgs=args.read_groups, replace_rg=True)
        bams_out[bucket_name] = bam_out
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            bucket_name = ''
        else:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            prefix = barcode_seq[:args.nbases]
            bucket_name = '%s-%d' % (prefix, gem_group)
        buckets[bucket_name].append(r)

    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[bucket_name]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
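# cr_utils.split_barcode_seq splits a gel-bead barcode of the form
# 'ACGT...-<gem_group>' back into its sequence and gem group, mirroring the
# '%s-%d' bucket naming above. A plausible sketch (an assumption, not the
# actual cellranger helper):
def split_barcode_seq_sketch(barcode):
    barcode_seq, gem_group = barcode.split('-')
    return barcode_seq, int(gem_group)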
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.filtered_bam, None, None, template=in_bam)
    cluster_bcs = set(args.cluster_bcs)

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cluster_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq({"high_conf_mapq": 60})):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                read.is_duplicate = False
                out_bam.write(read)
def main(args, outs):
    prefixes = cr_utils.get_seqs(args.nbases)
    prefixes.append('')

    bam_in = tk_bam.create_bam_infile(args.chunk_input)
    reads = [read for read in bam_in]

    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in,
                                               rgs=args.read_groups, replace_rg=True)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            prefix = ''
        else:
            prefix = barcode[:args.nbases]
        buckets[prefix].append(r)

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
def main(args, outs): """ Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site """ lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) args.coerce_strings() outs.coerce_strings() bam_in = tk_bam.create_bam_infile(args.input) template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0)) if args.write_bam: bam_prefix, ext = os.path.splitext(outs.output) out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates")]) outs.index = None # chunk bams don't get indexed else: bam_out = None outs.output = None outs.index = None # Determine whether the BAM has 10x barcodes bam_in.reset() has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)] have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1 # All read duplicate marking - these dup decisions are written to bam_out # the output bam has BC aware dup marking if available. # Ensure the summary key indicates what kind of dup marking was actually performed. if have_barcodes: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) else: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) # Dup marking on all perfect reads full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True) full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs] # Now broadcast the selected reads to the summarizers # We can't do the points the require a sample_rate > 1.0 so, skip those. # If we don't have barcodes, don't run the set that are split by barcode. consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # We close the BAM if bam_out: bam_out.close() # Note - the indexing happens in join bam_prefix, _ = os.path.splitext(outs.output) tk_bam.sort(out_bam_name, bam_prefix) # Package up the summaries: dup_results = {} for x in dup_sums: (dups, optical_dups, diff_dups, custom_diff_dups) = x.result desc = x.description dup_results[desc] = dups optical_desc = "optical_" + desc dup_results[optical_desc] = optical_dups diff_desc = "diffusion_old_" + desc dup_results[diff_desc] = diff_dups custom_diff_desc = "diffusion_" + desc dup_results[custom_diff_desc] = custom_diff_dups if outs.duplicate_summary: with open(outs.duplicate_summary, 'w') as f: json.dump(dup_results, f, indent=4)
def main(args, outs):
    reporter = vdj_report.VdjReporter(vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    assert paired_end != (args.read2_chunk is None)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams, None, None, template=bam1)

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam, paired_end)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc, open(outs.chunked_gene_umi_counts, 'w'))

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ chunk = args.chunk bam_in = create_bam_infile(args.align_chunk) bam_out, _ = tk_bam.create_bam_outfile( outs.output, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs", TENX_PRODUCT_NAME) ]) gp_tagger = GlobalFivePrimePosTagger(bam_in) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc: idx for (idx, bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) if chunk['barcode'] is not None: processed_barcode_iter = get_raw_processed_barcodes( open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['sample_index'] is not None: sample_index_iter = tk_fasta.read_generator_fastq( open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) if chunk['trim'] is not None: trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip( chunk['trim']), paired_end=True) else: trim_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter) # First read try: read = bam_in.next() except StopIteration: read = None # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
for (barcode_info, sample_index_info, trim_info) in iters: tags = [] read_name = None if read is None: break if barcode_info is not None: (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info tags.append((RAW_BARCODE_TAG, raw_bc_seq)) tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual)) if processed_bc_seq is not None: tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq)) read_name = bc_read_name.split()[0] if sample_index_info is not None: (si_read_name, seq, qual) = sample_index_info tags.append((SAMPLE_INDEX_TAG, seq)) tags.append((SAMPLE_INDEX_QUAL_TAG, qual)) if read_name is not None: if si_read_name.split()[0] != read_name: martian.log_info( "mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name)) assert (si_read_name.split()[0] == read_name) else: read_name = si_read_name.split()[0] r1_tags = tags r2_tags = list(r1_tags) if trim_info is not None: (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info if len(trim1_seq) > 0: r1_tags.append((TRIM_TAG, trim1_seq)) r1_tags.append((TRIM_QUAL_TAG, trim1_qual)) if len(trim2_seq) > 0: r2_tags.append((TRIM_TAG, trim2_seq)) r2_tags.append((TRIM_QUAL_TAG, trim2_qual)) reads_attached = 0 reads_to_attach = [] while read.query_name == read_name or read_name is None: tags = r1_tags if read.is_read1 else r2_tags if len(tags) > 0: existing_tags = read.tags existing_tags.extend(tags) read.tags = existing_tags if reads_to_attach and ( read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None): gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not (get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) reads_to_attach = [] reads_to_attach.append(read) try: read = bam_in.next() except StopIteration: read = None break gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not (get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) # We may have more than 2 reads if there was a # secondary alignment, but less than 2 means # something went wrong assert (reads_attached >= 2) outs.perfect_read_count = perfect_read_count bam_out.close()
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))

    pgs = [tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
           tk_bam.make_terminal_pg_header(martian.get_pipelines_version())]

    # Don't duplicate the header if it's already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []

    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None,
                                           template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)

        total_reads += 1

        tags = r.tags

        # Strip out RX and QX tags
        # strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragments file to get records that should cover many future reads.
        # The fragment phasing file may not exist in an ALIGNER-only pipeline - may need to skip.
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = \
                    get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]

                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert len(valid_phasing) < 2

                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = \
                    get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]

                # See if we have a molecule id for this fragment
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert len(molecule_ids) < 2

                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))

        r.tags = tags
        bam_out.write(r)

    bam_out.close()

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads
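# get_frag_phasing_db pulls a window of fragment records around the current
# read so that most subsequent (position-sorted) reads hit the cached window.
# A sketch of that windowed tabix fetch, assuming a tab-delimited
# fragment_phasing file with chrom/start/end/bc/ps/hap/pc columns (the column
# layout is an assumption for illustration, not the actual file format):
def get_frag_phasing_db_sketch(frag_phasing, chrom, pos, window):
    start = max(0, pos - window)
    end = pos + window
    db = {}
    for line in frag_phasing.fetch(chrom, start, end):
        fields = line.strip().split('\t')
        frag = {'start': int(fields[1]), 'end': int(fields[2]),
                'ps': int(fields[4]), 'hap': int(fields[5]), 'pc': float(fields[6])}
        # Index fragments by barcode for O(1) lookup per read
        db.setdefault(fields[3], []).append(frag)
    return db, (chrom, start, end)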
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ chunk = args.chunk #subsample_rate = 1.0 #if args.subsample_rate is not None: # subsample_rate = args.subsample_rate bam_in = tk_bam.create_bam_infile(args.align_chunk) bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) def open_maybe_gzip(fn): if fn[-2:] == "gz": return gzip.open(fn) else: return open(fn) if chunk['barcode']: processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['sample_index']: sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter) # First read read = bam_in.next() # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
for (barcode_info, sample_index_info) in iters: tags = [] read_name = None if read is None: break if barcode_info: (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info tags.append((RAW_BARCODE_TAG, raw_bc_seq)) tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual)) if processed_bc_seq is not None: tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq)) read_name = bc_read_name.split()[0] if sample_index_info: (si_read_name, seq, qual) = sample_index_info tags.append((SAMPLE_INDEX_TAG, seq)) tags.append((SAMPLE_INDEX_QUAL_TAG, qual)) if read_name != None: if si_read_name.split()[0] != read_name: martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name)) assert(si_read_name.split()[0] == read_name) else: read_name = si_read_name.split()[0] reads_attached = 0 #emit_read_pair = random.random() < subsample_rate emit_read_pair = True while read.qname == read_name or read_name == None: if len(tags) > 0: existing_tags = read.tags existing_tags.extend(tags) read.tags = existing_tags reads_attached += 1 if not (read_name is None): assert(read.qname == read_name) if emit_read_pair: # Count the perfect reads -- will be used when subsampling in dedup if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not(tk_io.get_read_barcode(read) is None): bam_out.write(read) else: bam_out.write(read) try: read = bam_in.next() except StopIteration: read = None break # We may have more than 2 reads is there was a # secondary alignment, but less than 2 means # something went wrong assert(reads_attached >= 2) outs.perfect_read_count = perfect_read_count bam_out.close()
def main(args, outs): """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics. PCR duplicates have the same read1 start site and read2 start site. """ args.coerce_strings() outs.coerce_strings() # Chunk output doesn't get indexed outs.fragments_index = None outs.index = None # Pull in prior likelihoods for barcodes raw_barcode_abundance = None barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) if args.raw_barcode_counts is not None and barcode_whitelist is not None: with open(args.raw_barcode_counts, 'r') as infile: raw_counts = json.load(infile) raw_barcode_abundance = { '{}-{}'.format(barcode, gem_group): count for gem_group, subdict in raw_counts.iteritems() for barcode, count in zip(barcode_whitelist, subdict['bc_counts']) } bam_in = create_bam_infile(args.input) bam_refs = bam_in.references bam_prefix, ext = os.path.splitext(outs.output) raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' + ext) frag_prefix, ext = os.path.splitext(outs.fragments) raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext) # only write CO line for one chunk, so we don't have duplicates after samtools merge if args.chunk_num == 0: COs = [ '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)', '10x_bam_to_fastq_seqnames:R1,R3,I1,R2' ] else: COs = None bam_out, _ = tk_bam.create_bam_outfile( raw_bam_file, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates", TENX_PRODUCT_NAME) ], cos=COs) fragments_out = open(raw_frag_file, 'w') bam_in.reset() # Ensure the summary key indicates what kind of dup marking was actually performed. lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) reference_manager = ReferenceManager(args.reference_path) summarizer = DupSummary(split_bcs=False, lane_coordinate_system=lane_coord_sys, output_bam=bam_out, output_tsv=fragments_out, ref=reference_manager, bam_refs=bam_refs, priors=raw_barcode_abundance) # Now broadcast the selected reads to the summarizers consumers = [summarizer.read_consumer()] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # Close outfiles bam_out.close() fragments_out.close() # Feed the chunk barcode_counts data back to join() with open(outs.singlecell_mapping, 'w') as outfile: pickle.dump(summarizer.bc_counts, outfile) # Sort the output bam & tsv files sort_bam(raw_bam_file, outs.output, threads=martian.get_threads_allocation()) sort_bed(raw_frag_file, outs.fragments, genome=reference_manager.fasta_index, threads=martian.get_threads_allocation(), leave_key=True)
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ # this silences a weird non-failure in --strict=error mode # TODO(lhepler): remove this when martian upstream handles this itself outs.outputs = [] chunk = args.chunk bam_in = tk_bam.create_bam_infile(args.align_chunk) bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG) # only comment the first chunk, otherwise later merge will duplicate the comments and could lead to: # samtools merge ... : '[finish_merged_header] Output header text too long' if args.chunk_index > 0: COs = None elif chunk['trim']: COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'] else: COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'] bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")], cos = COs) gp_tagger = GlobalFivePrimePosTagger(bam_in) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) def open_maybe_gzip(fn): if fn[-2:] == "gz": return gzip.open(fn) else: return open(fn) if chunk['barcode']: processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['trim']: trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['trim']), paired_end=True) else: trim_iter = itertools.repeat(None) if chunk['sample_index']: sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter) # First read read = bam_in.next() # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
for (barcode_info, sample_index_info, trim_info) in iters: tags = [] read_name = None if read is None: break if barcode_info: (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info tags.append((RAW_BARCODE_TAG, raw_bc_seq)) tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual)) if processed_bc_seq is not None: tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq)) read_name = bc_read_name.split()[0] if sample_index_info: (si_read_name, seq, qual) = sample_index_info tags.append((SAMPLE_INDEX_TAG, seq)) tags.append((SAMPLE_INDEX_QUAL_TAG, qual)) if read_name != None: if si_read_name.split()[0] != read_name: martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name)) assert(si_read_name.split()[0] == read_name) else: read_name = si_read_name.split()[0] r1_tags = tags r2_tags = list(tags) if trim_info: (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info if len(trim1_seq) > 0: r1_tags.append((TRIM_TAG, trim1_seq)) r1_tags.append((TRIM_QUAL_TAG, trim1_qual)) if len(trim2_seq) > 0: r2_tags.append((TRIM_TAG, trim2_seq)) r2_tags.append((TRIM_QUAL_TAG, trim2_qual)) reads_attached = 0 reads_to_attach = [] while read.qname == read_name or read_name == None: tags = r1_tags if read.is_read1 else r2_tags if len(tags) > 0: existing_tags = read.tags existing_tags.extend(tags) read.tags = existing_tags if not (read_name is None): assert(read.qname == read_name) if reads_to_attach and (read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None): gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not(crdna_io.get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) reads_to_attach = [] reads_to_attach.append(read) try: read = bam_in.next() except StopIteration: read = None break gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not(crdna_io.get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) # We may have more than 2 reads is there was a # secondary alignment, but less than 2 means # something went wrong assert(reads_attached >= 2) outs.perfect_read_count = perfect_read_count bam_out.close()
def open_file(self, filename):
    # Create a dummy header to prevent samtools / pysam crashing
    return tk_bam.create_bam_outfile(filename, ['dummy'], [8])[0]
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's
      sequence and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name,
             out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence, or None if no consensus could be built
      (e.g. not enough reads for consensus).
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence.
    """
    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities.
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name, 'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()
                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG
                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq, quals)

    out_bam.close()
    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = ['vdj_asm', 'asm', out_bam_name, out_dir,
               '--single-end',
               '--cons',  # required so we produce a single output sequence
               '--kmers=0',
               '--min-qual=0',
               '--score-factor=0.0']
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'), 'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus.
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name, out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = ['vdj_asm', 'base-quals',
           martian.make_path(clonotype_name + '_contigs'),
           out_dir,
           '--single-end']
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (will overwrite this bam, which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'), out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name, out_fastq_name, out_fasta_name)
def main(args, outs):
    reporter = vdj_report.VdjReporter(vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    fq_prefix = re.sub('_1.fastq', '', args.read1_chunk)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    run_read_match(fq_prefix, ref_fasta, filter_bam, args.chemistry_def, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = tk_bam.create_bam_infile(filter_bam)
    bam2 = tk_bam.create_bam_infile(filter_bam)
    bc_iter1 = get_bc_grouped_pair_iter(bam1)
    bc_iter2 = get_bc_grouped_pair_iter(bam2)

    reads_per_bc = open(outs.reads_per_bc, 'w')

    if args.output_fastqs:
        out_fastq1 = open(outs.barcode_chunked_read1, 'w')
        out_fastq2 = open(outs.barcode_chunked_read2, 'w')
        out_bam = None
    else:
        out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams, None, None, template=bam1)
        out_fastq1 = None
        out_fastq2 = None

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            if is_mapped(read1, read2):
                umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        write_barcode_fastq(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand,
                            out_bam, out_fastq1, out_fastq2)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    if args.output_fastqs:
        out_fastq1.close()
        out_fastq2.close()
    else:
        out_bam.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc, open(outs.chunked_gene_umi_counts, 'w'))

    reporter.save(outs.chunked_reporter)
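# correct_umis (used in pass 1 above and in the other UMI-handling stages)
# collapses sequencing errors in UMIs. A common approach, and a plausible
# sketch of what the helper does (an assumption for illustration, not the
# pipeline's exact algorithm): remap each UMI to a higher-count UMI within
# Hamming distance 1, if one exists.
def correct_umis_sketch(umi_counts):
    corrected = {}
    for umi, count in sorted(umi_counts.items()):
        best, best_count = umi, count
        for i in xrange(len(umi)):
            for base in 'ACGT':
                if base == umi[i]:
                    continue
                # Single-base substitution neighbor
                neighbor = umi[:i] + base + umi[i + 1:]
                if umi_counts.get(neighbor, 0) > best_count:
                    best, best_count = neighbor, umi_counts[neighbor]
        corrected[umi] = best
    return corrected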
def main_mark_duplicates(args, outs):
    """Mark exact duplicate reads in the BAM file.

    Duplicates have the same read1 start site and read2 start site.
    """
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in,
                                              pgs=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                        "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_, as
    # defined by tenkit.read_filter.stringent_read_filter. This is tallied in ATTACH_BCS,
    # and passed into the perfect_read_count argument. We will fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All-read duplicate marking - these dup decisions are written to bam_out;
    # the output bam has BC-aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs",
                                        lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs",
                                           lane_coord_sys, write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs",
                                        lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs",
                                           lane_coord_sys, bam_out, write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and w/o
    # barcode splitting
    split_options = [True, False]

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs' if split_bc else '_ignore_bcs')
            dup_sums.append(DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers.
    # We can't do the points that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that are split by barcode.
    consumers = [x.read_consumer() for x in dup_sums
                 if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    bam_out.close()
    # Note - the indexing happens in join

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_" + desc
        dup_results[diff_desc] = diff_dups

    if outs.duplicate_summary:
        f = open(outs.duplicate_summary, 'w')
        json.dump(dup_results, f)
        f.close()