def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_index = args.chunk_index

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(chunk_index == 0))

    # One output BAM per barcode prefix, plus one for reads without a barcode
    bams_out = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_{}.bam".format(prefix))
        bams_out[prefix], _ = tk_bam.create_bam_outfile(filename, None, None, template=template)
    non_bc_bam = martian.make_path("bc_{}.bam".format(None))
    non_bc_bam_out, _ = tk_bam.create_bam_outfile(non_bc_bam, None, None, template=template)

    # Bucket each read by the first nbases of its barcode
    for read in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            non_bc_bam_out.write(read)
        else:
            prefix = barcode[:args.nbases]
            bams_out[prefix].write(read)
    bam_in.close()

    non_bc_bam_out.close()
    sort_bam(non_bc_bam)
    outs.non_bc_bams = [non_bc_bam]

    outs.buckets = {}
    for prefix in prefixes:
        filename = bams_out[prefix].filename
        bams_out[prefix].close()
        sort_bam(filename)
        outs.buckets[prefix] = filename
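# --- hedged sketch (not part of the original source) ------------------------------
# get_seqs(nbases) above must yield every possible barcode prefix of length nbases,
# since barcode[:args.nbases] is looked up directly in bams_out. Assuming the usual
# A/C/G/T alphabet, a minimal stand-in could be:
import itertools

def get_seqs(nbases):
    """Return all 4**nbases DNA strings of length nbases, e.g. get_seqs(1) -> ['A', 'C', 'G', 'T']."""
    return [''.join(bases) for bases in itertools.product('ACGT', repeat=nbases)]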
def main(args, outs):
    if args.flowcell_geometry is None:
        return

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)

    null_distribution = compute_null_distribution(args.flowcell_geometry, seed=args.seed)

    estimator = DupSummary.diffusion_estimator(lane_coord_sys, args.flowcell_geometry)

    consumers = [estimator.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Package up the summaries:
    dup_results = {
        'null_distribution': null_distribution,
        'observed_distribution': estimator.result,
    }

    if outs.summary:
        with open(outs.summary, 'w') as f:
            json.dump(dup_results, f, indent=4)
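# --- hedged sketch (not part of the original source) ------------------------------
# broadcast(source, consumers) fans one BAM read stream out to several consumer
# coroutines (the DupSummary read_consumer() generators used throughout this file).
# The sketch assumes consumers are plain Python generators that need one priming
# next() before accepting reads via send(); the real tenkit helper may differ.
def broadcast(source, consumers):
    for consumer in consumers:
        next(consumer)              # prime each coroutine up to its first yield
    for read in source:
        for consumer in consumers:
            consumer.send(read)     # every consumer sees every read, in order
    for consumer in consumers:
        consumer.close()            # let each consumer finalize its summary state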
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)

    # One output BAM per qname bucket
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    # Consecutive qnames define the bucket boundaries; reads past the last boundary
    # fall into the final bucket
    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for qnames in qname_ranges:
            if qnames[0] <= r.qname and r.qname <= qnames[1]:
                qname = qnames[0]
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
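# --- hedged sketch (not part of the original source) ------------------------------
# bc_sort_key orders reads inside each bucket before writing, both here and in
# main_bucket_reads_by_bc below. The assumption is that reads are sorted by
# (barcode, qname) so that downstream barcode-ordered merging works; the real key
# function in the pipeline may differ.
def bc_sort_key(read):
    return (tk_io.get_read_barcode(read), read.qname)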
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None, template=in_bam)

    chroms = in_bam.references
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
                                  chroms=chroms)

    for (gg, bc, gene_ids), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.barcode_sort_key):
        # Ignore reads w/o a valid barcode, unmapped reads and reads that map to more than 1 gene
        if bc is None or gg is None or gene_ids is None or len(gene_ids) != 1:
            for read in reads_iter:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue

        reads = list(reads_iter)
        gene_id = gene_ids[0]

        # Count cDNA PCR duplicates with uncorrected UMIs
        dupe_key_umi_counts = mark_dupes(bc, gene_id, reads, args,
                                         cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
                                         cr_utils.cdna_pcr_dupe_func,
                                         reporter)

        # Record UMI corrections
        umi_corrections = correct_umis(dupe_key_umi_counts)

        # Mark duplicates for cDNA PCR duplicates with corrected UMIs
        mark_dupes(bc, gene_id, reads, args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func,
                   reporter,
                   corrected_dupe_keys=umi_corrections,
                   out_bam=out_bam)

        # Count duplicates for SI PCR duplicates with uncorrected UMIs
        mark_dupes(bc, gene_id, reads, args,
                   cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func,
                   reporter)

    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)
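# --- hedged sketch (not part of the original source) ------------------------------
# correct_umis() above folds sequencing errors in UMIs onto a more abundant UMI
# before dup marking. The sketch assumes the conventional single-substitution rule
# (a UMI is corrected to a 1-mismatch neighbour with a strictly higher read count)
# and assumes dupe_key_umi_counts looks like {dupe_key: {umi: count}}; the real
# data structures in the pipeline may differ.
def correct_umis(dupe_key_umi_counts):
    corrections = {}
    for dupe_key, umi_counts in dupe_key_umi_counts.iteritems():
        for umi, count in umi_counts.iteritems():
            best_umi, best_count = umi, count
            for pos in xrange(len(umi)):
                for base in 'ACGT':
                    if base == umi[pos]:
                        continue
                    neighbour = umi[:pos] + base + umi[pos + 1:]
                    neighbour_count = umi_counts.get(neighbour, 0)
                    if neighbour_count > best_count:
                        best_umi, best_count = neighbour, neighbour_count
            if best_umi != umi:
                corrections.setdefault(dupe_key, {})[umi] = best_umi
    return corrections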
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads, min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None, template=in_bam)

    cell_bcs = set(cr_utils.load_barcode_tsv(args.cell_barcodes))

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            # Only keep reads from cell barcodes, and only one read per
            # (dupe key, UMI) at each position
            if cr_utils.get_read_barcode(read) not in cell_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq(args.align)):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))

                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                out_bam.write(read)
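# --- hedged sketch (not part of the original source) ------------------------------
# cr_utils.pos_sort_key is used as the groupby key above and is unpacked as
# (tid, pos), so it is assumed to be equivalent to the following. The input BAM
# must already be position-sorted for groupby to visit each locus exactly once.
def pos_sort_key(read):
    return (read.tid, read.pos)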
def main_report_basic(args, outs):
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        n_chunk = math.ceil(n_mapped / args.n_chunks)
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        targets_file = open(targets_filename, 'r')
        target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = compute_basic_stats(bam_slice,
                                            target_regions,
                                            n_chunk,
                                            references,
                                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.filtered_bam, None, None, template=in_bam)

    cluster_bcs = set(args.cluster_bcs)

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            # Only keep reads from the cluster's barcodes, and only one read per
            # (dupe key, UMI) at each position; clear any existing duplicate flag
            if cr_utils.get_read_barcode(read) not in cluster_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq({"high_conf_mapq": 60})):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))

                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                read.is_duplicate = False
                out_bam.write(read)
def main_report_single_partition(args, outs):
    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)

    if args.targets_file is None:
        target_regions = None
    else:
        target_regions = tk_io.get_target_regions(open(args.targets_file))

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt, lambda x: tk_io.get_read_barcode(x))

    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance, bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index the fragment table, so that we can combine the fragments files
        # per-chromosome to reduce memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
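# --- hedged sketch (not part of the original source) ------------------------------
# read_has_barcode(read) filters out reads that were never assigned a barcode before
# grouping by barcode. Assuming the processed barcode is stored in the standard 10x
# "BX" tag, a minimal pysam-based stand-in could be:
def read_has_barcode(read):
    try:
        return read.get_tag('BX') is not None
    except KeyError:
        return False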
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)

    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt, lambda x: crdna_io.get_read_barcode(x))

    counts = {}
    for bc, reads in bc_read_iter:
        for r in reads:
            contig = bam_in.references[r.tid]
            species = ref.species_from_contig(contig)
            if species not in counts:
                counts[species] = {}
            if bc not in counts[species]:
                counts[species][bc] = 0
            if r.is_secondary or r.is_supplementary:
                ## we are ignoring alternate alignments
                continue
            if (r.is_unmapped or
                    r.mapping_quality < CELL_DETECT_MAPQ_THRESHOLD or
                    r.is_duplicate):
                ## skip reads that are unmapped, of poor mapping quality, or duplicates
                continue
            counts[species][bc] += 1

    outs.counts = counts
def main(args, outs): """ Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site """ lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) args.coerce_strings() outs.coerce_strings() bam_in = tk_bam.create_bam_infile(args.input) template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0)) if args.write_bam: bam_prefix, ext = os.path.splitext(outs.output) out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates")]) outs.index = None # chunk bams don't get indexed else: bam_out = None outs.output = None outs.index = None # Determine whether the BAM has 10x barcodes bam_in.reset() has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)] have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1 # All read duplicate marking - these dup decisions are written to bam_out # the output bam has BC aware dup marking if available. # Ensure the summary key indicates what kind of dup marking was actually performed. if have_barcodes: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) else: no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold) no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold) # Dup marking on all perfect reads full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True) full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold) dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs] # Now broadcast the selected reads to the summarizers # We can't do the points the require a sample_rate > 1.0 so, skip those. # If we don't have barcodes, don't run the set that are split by barcode. consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # We close the BAM if bam_out: bam_out.close() # Note - the indexing happens in join bam_prefix, _ = os.path.splitext(outs.output) tk_bam.sort(out_bam_name, bam_prefix) # Package up the summaries: dup_results = {} for x in dup_sums: (dups, optical_dups, diff_dups, custom_diff_dups) = x.result desc = x.description dup_results[desc] = dups optical_desc = "optical_" + desc dup_results[optical_desc] = optical_dups diff_desc = "diffusion_old_" + desc dup_results[diff_desc] = diff_dups custom_diff_desc = "diffusion_" + desc dup_results[custom_diff_desc] = custom_diff_dups if outs.duplicate_summary: with open(outs.duplicate_summary, 'w') as f: json.dump(dup_results, f, indent=4)
def main(args, outs): """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics. PCR duplicates have the same read1 start site and read2 start site. """ args.coerce_strings() outs.coerce_strings() # Chunk output doesn't get indexed outs.fragments_index = None outs.index = None # Pull in prior likelihoods for barcodes raw_barcode_abundance = None barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) if args.raw_barcode_counts is not None and barcode_whitelist is not None: with open(args.raw_barcode_counts, 'r') as infile: raw_counts = json.load(infile) raw_barcode_abundance = { '{}-{}'.format(barcode, gem_group): count for gem_group, subdict in raw_counts.iteritems() for barcode, count in zip(barcode_whitelist, subdict['bc_counts']) } bam_in = create_bam_infile(args.input) bam_refs = bam_in.references bam_prefix, ext = os.path.splitext(outs.output) raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' + ext) frag_prefix, ext = os.path.splitext(outs.fragments) raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext) # only write CO line for one chunk, so we don't have duplicates after samtools merge if args.chunk_num == 0: COs = [ '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)', '10x_bam_to_fastq_seqnames:R1,R3,I1,R2' ] else: COs = None bam_out, _ = tk_bam.create_bam_outfile( raw_bam_file, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates", TENX_PRODUCT_NAME) ], cos=COs) fragments_out = open(raw_frag_file, 'w') bam_in.reset() # Ensure the summary key indicates what kind of dup marking was actually performed. lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) reference_manager = ReferenceManager(args.reference_path) summarizer = DupSummary(split_bcs=False, lane_coordinate_system=lane_coord_sys, output_bam=bam_out, output_tsv=fragments_out, ref=reference_manager, bam_refs=bam_refs, priors=raw_barcode_abundance) # Now broadcast the selected reads to the summarizers consumers = [summarizer.read_consumer()] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # Close outfiles bam_out.close() fragments_out.close() # Feed the chunk barcode_counts data back to join() with open(outs.singlecell_mapping, 'w') as outfile: pickle.dump(summarizer.bc_counts, outfile) # Sort the output bam & tsv files sort_bam(raw_bam_file, outs.output, threads=martian.get_threads_allocation()) sort_bed(raw_frag_file, outs.fragments, genome=reference_manager.fasta_index, threads=martian.get_threads_allocation(), leave_key=True)
def main_mark_duplicates(args, outs):
    """ Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in,
                                              pg=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                       "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_, as
    # defined by tenkit.read_filter.stringent_read_filter. This is tallied in ATTACH_BCS
    # and passed in as the perfect_read_count argument. We will fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All-read duplicate marking - these dup decisions are written to bam_out.
    # The output bam has BC-aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys,
                                           write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True, "no_filter_full_use_bcs", lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys,
                                           bam_out, write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and w/o
    # barcode splitting
    split_options = [True, False]

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs' if split_bc else '_ignore_bcs')
            dup_sums.append(DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers.
    # We can't do the points that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that is split by barcode.
    consumers = [x.read_consumer() for x in dup_sums
                 if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    bam_out.close()
    # Note - the indexing happens in join

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_" + desc
        dup_results[diff_desc] = diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f)
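# --- hedged sketch (not part of the original source) ------------------------------
# The docstring above defines duplicates by identical read1/read2 start sites. The
# hypothetical helper below only illustrates that definition as a hashable key for
# one read of a pair (ignoring barcode splitting and subsampling, which DupSummary
# layers on top); it is not the pipeline's actual dup key.
def pair_dup_key(read):
    # Orientation-aware 5' start of this read, plus the mate's mapping coordinates
    self_start = read.reference_end if read.is_reverse else read.reference_start
    return (read.reference_id, self_start, read.is_reverse,
            read.next_reference_id, read.next_reference_start, read.mate_is_reverse)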
def main(args, outs):
    #min_insert_size = 0
    #max_insert_size = 1e4

    ## sc purity threshold: what fraction of contamination by another species
    ## will we tolerate
    SC_PURITY_THRESHOLD = 0.95

    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    ## group bam records by barcode NO_BARCODE/raw barcode tag/processed barcode tag
    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    bc_read_iter = itertools.groupby(bam_chunk, groupbybarcode)

    ## compute species_list
    refs = bam_in.references
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    has_species_info = (species_list != [""])
    species_list.sort()
    genome_size = sum(ref.get_contig_lengths().values())

    ## index cells of each species
    cell_index = {}
    for sp in species_list:
        bc_list = args.cell_barcodes.get(sp, {}).keys()
        bc_list.sort()
        for i, b in enumerate(bc_list):
            y = cell_index.get(b, "")
            if len(y) == 0:
                cell_index[b] = "%s_cell_%d" % (sp, i)
            else:
                cell_index[b] = y + "_" + "%s_cell_%d" % (sp, i)

    ## construct and write header for barnyard file
    barnyard_file = open(outs.barnyard, 'w')
    barnyard_header = (['BC'] +
                       ["cell_id"] +
                       [s + ("_" if has_species_info else "") + "reads_mapq_60" for s in species_list] +
                       [s + ("_" if has_species_info else "") + "contigs" for s in species_list] +
                       ['mapped', 'num_mapped_bases', 'soft_clip_frac', 'insert_p50',
                        'num_mapped_pos', 'mapped_frac', 'amp_rate', 'library_complexity',
                        'dup_ratio', 'num_pairs'] +
                       ["is_%s_cell_barcode" % s for s in species_list])

    ## wasted data categories
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped",
                  "low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD, "dups",
                  "denominator", "unusable_read"]
    fractional_waste_keys = ["no_barcode_frac", "non_cell_barcode_frac", "unmapped_frac",
                             "low_mapq_lt_%d_frac" % PROFILE_MAPQ_THRESHOLD, "dups_frac"]
    barnyard_header.extend(waste_keys)
    barnyard_header.extend(fractional_waste_keys)
    barnyard_file.write(",".join(barnyard_header) + "\n")

    ## construct and write header for barnyard_hits file
    barnyard_hits_file = open(outs.barnyard_hits, "w")
    bh_header = ["barcode", "is_whitelisted"]
    bh_header.extend(["is_%s_cell_barcode" % s for s in species_list])
    bh_header.extend([refname for refname in bam_in.references])
    barnyard_hits_file.write(",".join(bh_header) + "\n")

    ## For each barcode, count reads per contig, per window (for each window size),
    ## and per species (when the contig carries species info)
    ## TODO: Add detailed matrix by contigs, windows output
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0

    ploidy = 2
    bc_hist = {}

    ## count number of raw barcodes that exactly match whitelist
    ## without any error correction
    raw_bc_on_whitelist = 0

    # dup_summary = json.load(open(args.duplicate_summary))
    # pcr_dup_fraction = dup_summary['dup_fraction']['pcr']
    # barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)

    for bc, reads in bc_read_iter:
        ## collect various forms of wasted data here per barcode
        wastebin = defaultdict(int)
        bh_hits = [0 for _ in bam_in.references]

        dup_count = 1
        non_dup = 1
        bc_count = 0
        num_per_species = defaultdict(int)
        contigs_per_species = defaultdict(set)
        total_reads_by_clip = np.zeros(2, dtype=float)
        insert_length = []
        num_pairs = 0
        num_mapped = 0
        num_mapped_bases = 0
        pos_set = set([])

        for r in reads:
            ## secondary/supplementary alignments are never counted towards anything
            if r.is_secondary or r.is_supplementary:
                continue

            ## include everything in the denominator
            wastebin["denominator"] += 1

            ## how many reads have >= 10 soft-clipped bases
            if r.cigartuples is not None:
                cigar_dict = dict(r.cigartuples)
                soft_clip_index = int(cigar_dict.get(4, 0) >= 10)
                total_reads_by_clip[soft_clip_index] += 1

            if barnyard_hits_include(r):
                bh_hits[r.tid] += 1

            ## non-whitelisted barcodes count as wasted data
            if "-" not in bc:
                wastebin["no_barcode"] += 1
                continue

            if bc[:-2] == r.get_tag(RAW_BARCODE_TAG):
                raw_bc_on_whitelist += 1

            is_cell_bc_read = True

            ## waste hierarchy:
            ## if not a cell or if read doesn't belong to species, then waste
            ## else if not mapped, then waste
            ## else if mapq < 30, then waste
            ## else if dup, then waste

            ## if this is a contaminant read from a different species, it is wasted
            contig = refs[r.tid]
            read_species = ref.species_from_contig(contig)
            if (read_species not in args.cell_barcodes or
                    bc not in args.cell_barcodes[read_species]):
                wastebin["non_cell_barcode"] += 1
                is_cell_bc_read = False
            elif r.is_unmapped:
                wastebin["unmapped"] += 1
            elif r.mapq < PROFILE_MAPQ_THRESHOLD:
                wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] += 1
            elif r.is_duplicate:
                wastebin["dups"] += 1

            bad_map_or_dup = (r.is_unmapped or
                              (r.mapq < PROFILE_MAPQ_THRESHOLD) or
                              r.is_duplicate)

            if is_cell_bc_read:
                bc_count += 1
                # if (stringent_read_filter(r, True) and
                #         not(r.is_unmapped) and not(r.mate_is_unmapped)):
                #     if r.is_duplicate:
                #         dup_count += 1
                #     else:
                #         non_dup += 1
                if r.has_tag(DUPLICATE_COUNT_TAG):
                    dup_count += r.get_tag(DUPLICATE_COUNT_TAG)
                    non_dup += 1
            elif bad_map_or_dup:
                # unusable reads are non-cell-barcode reads that are also
                # unmapped, low mapq, or dups
                wastebin['unusable_read'] += 1

            ## whether we have a cell barcode or not, count these stats
            if not bad_map_or_dup:
                num_mapped += 1
                num_mapped_bases += r.reference_length
                pos_set.add((r.reference_name, r.reference_start / 1000))

                ## if read is part of a proper pair, only count one read of the pair
                if r.is_proper_pair:
                    if r.is_read1:
                        insert_length.append(r.template_length)
                        num_pairs += 1
                    else:
                        continue

            ## Use MAPQ >= 60 to get accurate mappings only for barnyard stuff
            if r.mapq < 60:
                continue
            num_qual_reads += 1
            if has_species_info:
                num_per_species[read_species] += 1
                contigs_per_species[read_species].add(contig)
        ## end of loop over reads in this barcode

        assert (wastebin['denominator'] - wastebin['no_barcode'] - wastebin['unusable_read'] ==
                num_mapped + wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] +
                wastebin['unmapped'] + wastebin['dups'])

        ## compute the library complexity and amp rate
        ## NOTE: insert length is hardcoded as 250, so the amp rate is really the
        ## library complexity in different units
        num_amplicons = num_mapped - num_pairs
        dup_ratio = tk_stats.robust_divide(float(dup_count + non_dup), float(non_dup))
        library_complexity = tk_stats.robust_divide(num_amplicons, (dup_ratio - 1.0) * 2)
        amp_rate = tk_stats.robust_divide(float(library_complexity * DEFAULT_AMPLICON_LENGTH),
                                          float(ploidy * genome_size))
        bc_hist[bc] = bc_count
        map_rate = tk_stats.robust_divide(float(num_mapped), wastebin["denominator"])

        ## write row to barnyard_hits file
        bh_row = [bc, int("-" in bc)]
        for s in species_list:
            bh_row.append(int(s in args.cell_barcodes and bc in args.cell_barcodes[s]))
        bh_row.extend(bh_hits)
        barnyard_hits_file.write(",".join(map(str, bh_row)) + "\n")

        ## write row to barnyard file
        barnyard_row = ([bc, cell_index.get(bc, "None")] +
                        [num_per_species[s] for s in species_list] +
                        [len(contigs_per_species[s]) for s in species_list] +
                        [num_mapped, num_mapped_bases] +
                        [tk_stats.robust_divide(total_reads_by_clip[1], sum(total_reads_by_clip)),
                         np.median(insert_length) if len(insert_length) else np.nan,
                         len(pos_set), map_rate, amp_rate, library_complexity,
                         dup_ratio, num_pairs])
        for speci in species_list:
            barnyard_row.append(int((speci in args.cell_barcodes) and (bc in args.cell_barcodes[speci])))

        for key in waste_keys:
            fkey = key + "_frac"
            if fkey in fractional_waste_keys:
                wastebin[fkey] = tk_stats.robust_divide(float(wastebin[key]), float(wastebin["denominator"]))
        barnyard_row.extend([wastebin[x] for x in waste_keys])
        barnyard_row.extend([wastebin[x] for x in fractional_waste_keys])
        barnyard_file.write(",".join(map(str, barnyard_row)) + "\n")

        ## metrics relating to purity - only for multi species
        if has_species_info and len(species_list) >= 2:
            counts_by_species = [float(num_per_species[s]) for s in species_list]
            major_species_index = np.argmax(counts_by_species)
            major_species = species_list[major_species_index]
            species_purity = tk_stats.robust_divide(counts_by_species[major_species_index],
                                                    np.sum(counts_by_species))

            if species_purity >= SC_PURITY_THRESHOLD:
                num_sc_bcs += 1
                num_sc_reads += num_per_species[major_species]
    ## END of loop over barcodes

    summary_info = {}
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['raw_bc_on_whitelist'] = raw_bc_on_whitelist

    barnyard_file.close()
    barnyard_hits_file.close()

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))
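# --- hedged sketch (not part of the original source) ------------------------------
# groupbybarcode is the groupby key used in main() above. Per the comment there
# ("group bam records by barcode NO_BARCODE/raw barcode tag/processed barcode tag"),
# it is assumed to return the processed barcode when present, otherwise the raw
# barcode tag, otherwise a NO_BARCODE sentinel; the sentinel name below is an
# assumption, and the BAM must be barcode-sorted for the groupby to be meaningful.
NO_BARCODE = "NO_BARCODE"

def groupbybarcode(read):
    bc = crdna_io.get_read_barcode(read)        # processed barcode, e.g. "ACGT...-1"
    if bc is not None:
        return bc
    if read.has_tag(RAW_BARCODE_TAG):
        return read.get_tag(RAW_BARCODE_TAG)    # uncorrected raw barcode
    return NO_BARCODE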
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))

    pgs = [tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
           tk_bam.make_terminal_pg_header(martian.get_pipelines_version())]
    # don't duplicate the header if it's already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []
    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None,
                                           template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)
        total_reads += 1

        tags = r.tags

        # Strip out RX and QX tags
        # strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragments file to get records that should cover many future reads.
        # The fragment phasing file may not exist in an ALIGNER-only pipeline - may need to skip
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]

                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(valid_phasing) < 2)

                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]

                # See if we have a molecule id for this fragment
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(molecule_ids) < 2)

                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))

        r.tags = tags
        bam_out.write(r)

    bam_out.close()

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads