def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    #
    # Reference
    #
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks format check and nonoverlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters files
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # run this only non-locally
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
        if chrom not in contig_lens:
            martian.exit("contig {} not present in reference".format(chrom))
        if stop > contig_lens[chrom]:
            martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(
                chrom, start, stop, contig_lens[chrom]))

    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference".format(args.fragments))

    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run".format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            en = 0
            for chrom, start, end, bc, dups in parsed_fragments_from_contig(
                    contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except:
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
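
# Illustrative sketch (not pipeline code): the checks above assume each fragments
# record carries five fields -- contig, start, stop, barcode and a duplicate count --
# with the barcode suffixed by its GEM group ("-1", "-2", ...). The helper below is
# hypothetical and only demonstrates that assumed record layout.
def _example_parse_fragment_line(line):
    """Parse one tab-separated fragments record into typed fields (sketch only)."""
    chrom, start, stop, bc, dup_count = line.rstrip("\n").split("\t")
    barcode_seq, gem_group = bc.split("-")
    return chrom, int(start), int(stop), barcode_seq, gem_group, int(dup_count)

# _example_parse_fragment_line("chr1\t10085\t10338\tACGTACGTACGTACGT-1\t2")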
def main(args, outs): """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics. PCR duplicates have the same read1 start site and read2 start site. """ args.coerce_strings() outs.coerce_strings() # Chunk output doesn't get indexed outs.fragments_index = None outs.index = None # Pull in prior likelihoods for barcodes raw_barcode_abundance = None barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) if args.raw_barcode_counts is not None and barcode_whitelist is not None: with open(args.raw_barcode_counts, 'r') as infile: raw_counts = json.load(infile) raw_barcode_abundance = { '{}-{}'.format(barcode, gem_group): count for gem_group, subdict in raw_counts.iteritems() for barcode, count in zip(barcode_whitelist, subdict['bc_counts']) } bam_in = create_bam_infile(args.input) bam_refs = bam_in.references bam_prefix, ext = os.path.splitext(outs.output) raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' + ext) frag_prefix, ext = os.path.splitext(outs.fragments) raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext) # only write CO line for one chunk, so we don't have duplicates after samtools merge if args.chunk_num == 0: COs = [ '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)', '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)', '10x_bam_to_fastq_seqnames:R1,R3,I1,R2' ] else: COs = None bam_out, _ = tk_bam.create_bam_outfile( raw_bam_file, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "mark_duplicates", TENX_PRODUCT_NAME) ], cos=COs) fragments_out = open(raw_frag_file, 'w') bam_in.reset() # Ensure the summary key indicates what kind of dup marking was actually performed. lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map) reference_manager = ReferenceManager(args.reference_path) summarizer = DupSummary(split_bcs=False, lane_coordinate_system=lane_coord_sys, output_bam=bam_out, output_tsv=fragments_out, ref=reference_manager, bam_refs=bam_refs, priors=raw_barcode_abundance) # Now broadcast the selected reads to the summarizers consumers = [summarizer.read_consumer()] source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end)) broadcast(source, consumers) # Close outfiles bam_out.close() fragments_out.close() # Feed the chunk barcode_counts data back to join() with open(outs.singlecell_mapping, 'w') as outfile: pickle.dump(summarizer.bc_counts, outfile) # Sort the output bam & tsv files sort_bam(raw_bam_file, outs.output, threads=martian.get_threads_allocation()) sort_bed(raw_frag_file, outs.fragments, genome=reference_manager.fasta_index, threads=martian.get_threads_allocation(), leave_key=True)
def generate_cellcalling_fragment_counts_plot(singlecell_df, cell_parameters, barcode_whitelist,
                                              excluded_barcodes=None, species=""):
    if species:
        fragment_counts = np.array(singlecell_df['passed_filters_{}'.format(species)].values)
    else:
        fragment_counts = np.array(singlecell_df['peak_region_fragments'].values)

    if excluded_barcodes is None:
        valid_barcode_mask = singlecell_df['barcode'] != NO_BARCODE
    else:
        barcodes = singlecell_df['barcode'].values
        valid_barcode_mask = np.array(
            [(bc not in excluded_barcodes[species]) and (bc != NO_BARCODE) for bc in barcodes],
            dtype=bool)

    threshold = cell_parameters[species]['cell_threshold']
    cell_mask = (fragment_counts >= threshold) & valid_barcode_mask
    noncell_mask = (fragment_counts < threshold) & valid_barcode_mask

    logbinmax = int(np.ceil(np.log10(fragment_counts.max())))
    xbins = list(np.hstack([np.arange(100), np.logspace(np.log10(100), logbinmax, 350)]))
    data_subplots = []
    for name, mask in zip(["Non-cells", "{} Cells".format(species)], [noncell_mask, cell_mask]):
        if mask.sum() > 0:
            counts, _ = np.histogram(fragment_counts[mask], xbins)
            data_subplots.append({
                "name": name,
                "x": xbins,
                "y": list(counts),
                "type": "scatter",
                "connectgaps": True,
                "fill": "tozeroy",
            })

    whitelist_length = len(load_barcode_whitelist(barcode_whitelist))
    fragment_depth = sum(singlecell_df['passed_filters'].values)
    count_shift = max(MINIMUM_COUNT,
                      int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))

    def get_fitted_counts(barcode_total, bins, species, parameters):
        max_count = max(bins)
        count_values = np.arange(max_count + 1)

        frac_noise = parameters[species]['fraction_noise']
        mean_noise = parameters[species]['noise_mean']
        mean_signal = parameters[species]['signal_mean']
        dispersion_noise = parameters[species]['noise_dispersion']
        dispersion_signal = parameters[species]['signal_dispersion']

        estimated_noise_counts = stats.nbinom.pmf(
            count_values, 1 / dispersion_noise, 1 / (1 + dispersion_noise * mean_noise))
        estimated_signal_counts = stats.nbinom.pmf(
            count_values, 1 / dispersion_signal, 1 / (1 + dispersion_signal * mean_signal))
        estimated_noise_counts *= frac_noise * barcode_total
        estimated_signal_counts *= (1 - frac_noise) * barcode_total

        noise_bin_counts = np.array([
            estimated_noise_counts[(count_values >= lower) & (count_values < upper)].sum()
            for lower, upper in zip(bins[:-1], bins[1:])
        ])
        signal_bin_counts = np.array([
            estimated_signal_counts[(count_values >= lower) & (count_values < upper)].sum()
            for lower, upper in zip(bins[:-1], bins[1:])
        ])
        noise_bin_counts[noise_bin_counts < 1.0] = 0.0
        signal_bin_counts[signal_bin_counts < 1.0] = 0.0
        return bins[:-1], noise_bin_counts, signal_bin_counts

    xvals, noise, signal = get_fitted_counts(
        (fragment_counts >= count_shift).sum(), xbins, species, cell_parameters)
    data_subplots.append({
        "name": "Noise fit",
        "x": list(xvals),
        "y": list(noise),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "grey",
            "width": 1
        },
    })
    data_subplots.append({
        "name": "Signal fit",
        "x": list(xvals),
        "y": list(signal),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "black",
            "width": 1
        },
    })
    data_subplots.append({
        "name": "Joint fit",
        "x": list(xvals),
        "y": list(signal + noise),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "red",
            "width": 1
        },
    })

    return {
        "layout": {
            "xaxis": {
                "type": "log",
                "title": "{} Fragments Per Barcode".format(species),
            },
            "yaxis": {
                "type": "log",
                "title": "Barcodes",
            },
            "title": "{} Fragment Distribution".format(species),
        },
        "data": data_subplots,
    }
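
# Illustrative sketch (not pipeline code): the fitted curves above parameterize scipy's
# negative binomial by mean and dispersion, mapping (mu, alpha) to n = 1 / alpha and
# p = 1 / (1 + alpha * mu). The quick check below confirms that this mapping reproduces
# the intended mean; the helper name and default values are hypothetical.
def _example_nbinom_mean(mu=500.0, alpha=0.2):
    from scipy import stats as _stats
    n = 1.0 / alpha
    p = 1.0 / (1.0 + alpha * mu)
    # stats.nbinom.mean(n, p) == n * (1 - p) / p, which simplifies back to mu
    return _stats.nbinom.mean(n, p)

# _example_nbinom_mean() -> 500.0 (up to floating point)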
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Merge the output bam files with duplicates marked
    hierarchical_merge_bam([c.output for c in chunk_outs],
                           outs.output,
                           tag=None,
                           threads=martian.get_threads_allocation())
    outs.index = index_bam(outs.output, martian.get_threads_allocation())

    # Merge the barcode counts from each chunk and write out our singlecell_mapping file
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist, ordered=True)
    sorted_barcodes = []
    if args.raw_barcode_counts is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        sorted_barcodes = [
            '{}-{}'.format(barcode, gem_group)
            for gem_group in raw_counts
            for barcode in sorted(barcode_whitelist)
        ]
    barcode_counts = {}
    for chunk in chunk_outs:
        with open(chunk.singlecell_mapping, 'r') as infile:
            chunk_counts = pickle.load(infile)
        for barcode, count_dict in chunk_counts.iteritems():
            if barcode not in barcode_counts:
                barcode_counts[barcode] = Counter()
            barcode_counts[barcode] += Counter(count_dict)

    with open(outs.singlecell_mapping, 'w') as outfile:
        outfile.write("barcode,")
        outfile.write(",".join(SINGLE_CELL_KEYS))
        outfile.write("\n")
        if None in barcode_counts:
            outfile.write("{},".format(NO_BARCODE))
            outfile.write(",".join(
                [str(barcode_counts[None][key]) for key in SINGLE_CELL_KEYS]))
            outfile.write("\n")
        for barcode in (bc for bc in sorted_barcodes if bc in barcode_counts):
            outfile.write("{},".format(barcode))
            outfile.write(",".join(
                [str(barcode_counts[barcode][key]) for key in SINGLE_CELL_KEYS]))
            outfile.write("\n")

    # Merge the fragment file
    base_file, extension = os.path.splitext(outs.fragments)
    if not extension == '.gz':
        raise ValueError('Expecting compressed file output')
    input_tsvs = [str(chunk.fragments) for chunk in chunk_outs]
    merge_keyed_bed(input_tsvs, base_file, threads=martian.get_threads_allocation())
    if os.path.getsize(base_file) == 0:
        outs.fragments = None
        outs.fragments_index = None
        return

    # N.B. tabix_index will automatically compress the input file, adding the .gz suffix
    pysam.tabix_index(base_file, preset='bed', index=outs.fragments_index)
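
# Illustrative sketch (not pipeline code): once the merged fragments file has been
# compressed and tabix-indexed as above, per-region access works through pysam's
# TabixFile interface. The file paths and region below are hypothetical placeholders.
def _example_fetch_fragments(fragments_gz="fragments.tsv.gz",
                             index="fragments.tsv.gz.tbi",
                             contig="chr1", start=0, end=100000):
    import pysam
    with pysam.TabixFile(fragments_gz, index=index) as tbx:
        # Each row comes back as a raw tab-separated string in BED-style coordinates.
        return [row.split("\t") for row in tbx.fetch(contig, start, end)]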
def main(args, outs): """ Trim the reads in a series of fasta files """ chunk = args.chunk subsample_rate = chunk['subsample_rate'] have_barcode = chunk['barcode'] is not None have_sample_index = chunk['sample_index'] is not None # STEP 1: We run the R1/R2 reads through cutadapt, writing them to a temporary file with appropriate adapters # trimmed, optionally filtering out reads where adapters weren't found interleaved = chunk['read2'] is None # can't do discard_untrimmed because we're running cutadapt in single-end mode if args.trim_def['discard_untrimmed']: martian.exit("discard_untrimmed was set in trim_def") if interleaved: trimmed_reads = martian.make_path("trimmed_reads.fastq") trim_info_fn = martian.make_path("trim_info.txt") initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end( chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def, args.adapters) else: trimmed_r1 = martian.make_path("trimmed_r1.fastq") trimmed_r2 = martian.make_path("trimmed_r2.fastq") trim_info_r1_fn = martian.make_path("trim_info_r1.txt") trim_info_r2_fn = martian.make_path("trim_info_r2.txt") initial1, trimmed1 = run_cutadapt_single_end(chunk['read1'], trimmed_r1, trim_info_r1_fn, args.trim_def, args.adapters, read_id="R1") initial2, trimmed2 = run_cutadapt_single_end(chunk['read2'], trimmed_r2, trim_info_r2_fn, args.trim_def, args.adapters, read_id="R2") initial_read_pairs = initial1 + initial2 trimmed_read_pairs = trimmed1 + trimmed2 if initial1 != initial2: martian.exit( "Input fastq files for R1 and R2 are not the same length") if trimmed1 != trimmed2: raise ValueError( "Cutadapt produced differing numbers of reads for R1 and R2") # STEP 2: We run through the trimmed R1/R2 reads along with sample index and barcode reads, chunking into files of # max_read_num reads or less, and skipping sample index/barcode reads that don't match the trimmed & filtered R1/R2 # reads max_read_num = args.max_read_num file_number = 1 # open the available input read files and get the iterator over them if interleaved: reads_in = open_maybe_gzip(trimmed_reads, 'r') read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True) trim_info = open_maybe_gzip(trim_info_fn, 'r') trim_iter = read_generator_trim_info(trim_info, paired_end=True) else: r1_in = open_maybe_gzip(trimmed_r1, 'r') r2_in = open_maybe_gzip(trimmed_r2, 'r') read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2]) for r1, r2 in itertools.izip_longest( tk_fasta.read_generator_fastq(r1_in), tk_fasta.read_generator_fastq(r2_in))) trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r') trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r') trim_iter = (t1 + t2 for t1, t2 in itertools.izip( read_generator_trim_info(trim_info_r1), read_generator_trim_info(trim_info_r2))) # open output read file, which will be interleaved read_name = martian.make_path("read{}.fastq".format(file_number)) out_readfiles = [read_name] out_read_fastq = open(read_name, 'w') # open trimmed read file, which will be interleaved trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number)) out_trimfiles = [trim_out_name] out_trim_fastq = open(trim_out_name, 'w') if args.barcode_whitelist is None: outs.bc_counts = None barcode_indices = None else: barcode_whitelist = sorted( list(load_barcode_whitelist(args.barcode_whitelist))) barcode_indices = { bc: idx for (idx, bc) in enumerate(barcode_whitelist) } bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32) bad_count = 0 # open barcode file if there is one if have_barcode: bc_name = 
martian.make_path("BC{}.fastq".format(file_number)) out_bc_fastq = open(bc_name, 'w') out_barcodefiles = [bc_name] barcode_read = None bc_in = open_maybe_gzip(chunk['barcode'], 'r') bc_iter = tk_fasta.read_generator_fastq(bc_in) # Determine if barcode sequences need to be reverse complemented. with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2: bc_iter2 = tk_fasta.read_generator_fastq(bc_in2) barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) barcode_rc = infer_barcode_reverse_complement( barcode_whitelist, bc_iter2) else: out_barcodefiles = [None] outs.bc_counts = None # open sample_index file if there is one if have_sample_index: si_name = martian.make_path("SI{}.fastq".format(file_number)) out_si_fastq = open(si_name, 'w') si_in = open_maybe_gzip(chunk['sample_index'], 'r') sample_index_read = None si_iter = tk_fasta.read_generator_fastq(si_in) out_sampleindex_files = [si_name] else: out_sampleindex_files = [None] read_num = 0 random.seed(0) for (read, trim) in itertools.izip(read_iter, trim_iter): # Downsample (other than the first read). Note we've set a fixed seed to make this deterministic. if read_num > 0 and random.random() > subsample_rate: continue # Now we need to step through the barcode and sample index reads to find the matching reads if have_barcode: try: while barcode_read is None or not read_match( read, barcode_read): barcode_read = bc_iter.next() # reverse complement if all barcodes are RC-ed if barcode_rc: barcode_read = (barcode_read[0], tk_seq.get_rev_comp(barcode_read[1]), barcode_read[2][::-1]) except StopIteration: raise ValueError( "Couldn't find barcode read matching {}".format( get_read_name(read))) if have_sample_index: try: while sample_index_read is None or not read_match( read, sample_index_read): sample_index_read = si_iter.next() except StopIteration: raise ValueError( "Couldn't find sample index read matching {}".format( get_read_name(read))) (name1, seq1, qual1, name2, seq2, qual2) = read (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim read_num += 1 if read_num > max_read_num: read_num = 1 file_number += 1 read_name = martian.make_path("read{}.fastq".format(file_number)) out_read_fastq.close() out_read_fastq = open(read_name, 'w') out_readfiles.append(read_name) trim_out_name = martian.make_path( "TRIM{}.fastq".format(file_number)) out_trim_fastq.close() out_trim_fastq = open(trim_out_name, 'w') out_trimfiles.append(trim_out_name) if have_barcode: bc_name = martian.make_path("BC{}.fastq".format(file_number)) out_bc_fastq.close() out_bc_fastq = open(bc_name, 'w') out_barcodefiles.append(bc_name) else: out_barcodefiles.append(None) if have_sample_index: si_name = martian.make_path("SI{}.fastq".format(file_number)) out_si_fastq.close() out_si_fastq = open(si_name, 'w') out_sampleindex_files.append(si_name) else: out_sampleindex_files.append(None) if have_barcode: barcode_seq = barcode_read[1] barcode_qual = barcode_read[2] if barcode_indices is not None: idx = barcode_indices.get(barcode_seq) if idx is not None: bc_counts[idx] += 1 else: bad_count += 1 tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0], barcode_seq, barcode_qual) if have_sample_index: tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0], sample_index_read[1], sample_index_read[2]) tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1) tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2) tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1) tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, 
tr_qual2) if interleaved: reads_in.close() else: r1_in.close() r2_in.close() if have_barcode: out_bc_fastq.close() # Only emit BC counts if we had a whitelist if outs.bc_counts is not None: result = {} result['bad_bc_count'] = bad_count result['bc_counts'] = list(bc_counts) with open(outs.bc_counts, 'w') as bc_counts_out: tenkit.safe_json.dump_numpy(result, bc_counts_out) with open(outs.read_counts, 'w') as outfile: read_counts = { 'total_read_pairs': initial_read_pairs, 'filtered_read_pairs': trimmed_read_pairs } tenkit.safe_json.dump_numpy(read_counts, outfile) if have_sample_index: out_si_fastq.close() out_read_fastq.close() out_trim_fastq.close() outs.chunks = [ { 'read1': r, # output chunked trimmed read file 'read2': None, 'trim': t, # output chunked trim file 'barcode': bc, # output chunked barcode file 'sample_index': si, # output chunked sample index file 'barcode_reverse_complement': False, # we always keep BC in correct orientation 'reads_interleaved': True, 'gem_group': chunk['gem_group'], 'read_group': chunk['read_group'] } for (r, t, bc, si) in zip(out_readfiles, out_trimfiles, out_barcodefiles, out_sampleindex_files) ]
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []
    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)
    outs.chunks = final_chunks

    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]
    # No counts if there's no whitelist or actual counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    bc_counts = {}
    read_counts = {}
    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        # Sum up total and trimmed read counts
        with open(c_out.read_counts) as f:
            r = json.load(f)
        for key in ['filtered_read_pairs', 'total_read_pairs']:
            read_counts[key] = read_counts.get(key, 0) + r[key]

        # Sum up barcode counts
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is not None:
            with open(c_out.bc_counts) as f:
                r = json.load(f)
            gg_result = bc_counts.setdefault(gem_group, {'bad_bc_count': 0, 'bc_counts': None})
            gg_result['bad_bc_count'] += r['bad_bc_count']
            if gg_result['bc_counts'] is None:
                gg_result['bc_counts'] = np.array(r['bc_counts'], dtype=np.int32)
            else:
                gg_result['bc_counts'] += np.array(r['bc_counts'], dtype=np.int32)

    total_counts = 0
    total_errors = 0
    for gg in bc_counts.keys():
        rgg = bc_counts[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hardcoded bail-out if the BC-error rate is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > MAX_BARCODE_ERROR_RATE:
        martian.exit(
            "Extremely high rate of incorrect barcodes observed (%.2f %%). "
            "Check that input is 10x Chromium data, "
            "and that there are no missing cycles in first 16 bases of the index read I2."
            % (bc_error_rate * 100.0))

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist, None)
    if lot_map is not None:
        # get BC counts histogram
        # for now, just sum over all gem groups
        bc_seq = sorted(list(load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in bc_counts.values()], axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence, gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)
        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {
            lot: count for lot, count in gelbead_lot_counts.items() if count > 0
        }

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s"
                         % (gelbead_lot, gelbead_lot_confidence))

    if outs.lot_info is not None:
        with open(outs.lot_info, 'w') as f:
            tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    if outs.bc_counts is not None:
        with open(outs.bc_counts, 'w') as f:
            tenkit.safe_json.dump_numpy(bc_counts, f)

    with open(outs.read_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(read_counts, f, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.cell_barcodes = None
        outs.cell_calling_summary = None
        outs.singlecell = None
        return

    if args.excluded_barcodes is not None:
        with open(args.excluded_barcodes, 'r') as infile:
            excluded_barcodes = json.load(infile)
    else:
        excluded_barcodes = None

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {species: Counter() for species in species_list}
    targeted_counts_by_species = {species: Counter() for species in species_list}
    fragment_depth = 0
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)
        with open(chunk_out.barcode_counts, 'r') as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, 'r') as infile:
            targeted_counts_by_species[species] += pickle.load(infile)
        fragment_depth += chunk_out.fragment_depth
    print('Total fragments across all chunks: {}'.format(fragment_depth))

    barcodes = list({
        bc for species in species_list for bc in barcode_counts_by_species[species]
    })
    non_excluded_barcodes = {
        species: [bc for bc in barcodes if bc not in excluded_barcodes[species]]
        for species in species_list
    }
    print('Total barcodes observed: {}'.format(len(barcodes)))

    retained_counts = {}
    for species in species_list:
        if excluded_barcodes is None:
            retained_counts[species] = np.array(
                [targeted_counts_by_species[species][bc] for bc in barcodes])
        else:
            retained_counts[species] = np.array([
                targeted_counts_by_species[species][bc]
                for bc in barcodes
                if bc not in excluded_barcodes[species]
            ])
            print('Barcodes excluded for species {}: {}'.format(
                species, len(excluded_barcodes[species])))
            print('Barcodes remaining for species {}: {}'.format(
                species, len(non_excluded_barcodes[species])))

    parameters = {}

    whitelist_length = len(load_barcode_whitelist(args.barcode_whitelist))
    count_shift = max(MINIMUM_COUNT,
                      int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))
    print('Count shift for whitelist contamination: {}'.format(count_shift))

    for (species, count_data) in retained_counts.iteritems():
        print('Analyzing species {}'.format(species))
        # Subtract MINIMUM_COUNT from all counts to remove the effects of whitelist contamination
        shifted_data = count_data[count_data >= count_shift] - count_shift
        print('Number of barcodes analyzed: {}'.format(len(shifted_data)))
        count_dict = Counter(shifted_data)
        parameters[species] = {}

        forced_cell_count = None
        if args.force_cells is not None:
            if species in args.force_cells:
                forced_cell_count = int(args.force_cells[species])
            elif "default" in args.force_cells:
                forced_cell_count = int(args.force_cells["default"])
            if forced_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES
                martian.log_info(
                    'Attempted to force cells to {}. Overriding to maximum allowed cells.'
                    .format(forced_cell_count))

        # Initialize parameters to empty
        parameters[species]['noise_mean'] = None
        parameters[species]['noise_dispersion'] = None
        parameters[species]['signal_mean'] = None
        parameters[species]['signal_dispersion'] = None
        parameters[species]['fraction_noise'] = None
        parameters[species]['cell_threshold'] = None
        parameters[species]['goodness_of_fit'] = None
        parameters[species]['estimated_cells_present'] = 0

        # Corner case where FRIP is 0 because the number of peaks is tiny (fuzzer tests)
        if len(count_dict) < 10:
            parameters[species]['cells_detected'] = 0
            forced_cell_count = None
        elif forced_cell_count is None:
            print('Estimating parameters')
            fitted_params = estimate_parameters(count_dict)
            signal_threshold = estimate_threshold(fitted_params, CELL_CALLING_THRESHOLD) + count_shift
            print('Primary threshold: {}'.format(signal_threshold))
            parameters[species]['noise_mean'] = fitted_params.mu_noise
            parameters[species]['noise_dispersion'] = fitted_params.alpha_noise
            parameters[species]['signal_mean'] = fitted_params.mu_signal
            parameters[species]['signal_dispersion'] = fitted_params.alpha_signal
            parameters[species]['fraction_noise'] = fitted_params.frac_noise
            parameters[species]['cell_threshold'] = signal_threshold
            parameters[species]['goodness_of_fit'] = goodness_of_fit(shifted_data, fitted_params)
            called_cell_count = np.sum(count_data >= signal_threshold)
            parameters[species]['cells_detected'] = called_cell_count
            parameters[species]['estimated_cells_present'] = int(
                (1 - fitted_params.frac_noise) * len(shifted_data))
            if called_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                # Abort the model fitting and instead force cells to the maximum
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        if forced_cell_count is not None:
            print('Forcing cells to {}'.format(forced_cell_count))
            if forced_cell_count <= 0:
                raise ValueError("Force cells must be positive")
            else:
                adj_data = shifted_data[shifted_data > 0]
                print('Total barcodes considered for forcing cells: {}'.format(len(adj_data)))
                parameters[species]['cell_threshold'] = min(adj_data) if forced_cell_count >= len(adj_data) else \
                    sorted(adj_data, reverse=True)[forced_cell_count - 1]
                parameters[species]['cell_threshold'] += count_shift
                parameters[species]['cells_detected'] = np.sum(
                    count_data >= parameters[species]['cell_threshold'])

    # For barnyard samples, mask out the noise distribution and re-fit to get cleaner separation
    if len(retained_counts) == 2 and (args.force_cells is None or not args.force_cells):
        print('Estimating secondary thresholds')
        sp1, sp2 = species_list

        sp1_threshold = -1 if parameters[sp1]['cell_threshold'] is not None else parameters[sp1]['cell_threshold']
        sp2_threshold = -1 if parameters[sp2]['cell_threshold'] is not None else parameters[sp2]['cell_threshold']

        if parameters[sp1]['cell_threshold'] is not None:
            sp1_counts = np.array([
                targeted_counts_by_species[sp1][bc]
                for bc in non_excluded_barcodes[sp1]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and
                   (targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp1_params = estimate_parameters(Counter(sp1_counts), threshold=sp1_threshold)
            if not np.isnan(sp1_params.frac_noise):
                parameters[sp1]['cell_threshold'] = max(sp1_threshold, estimate_threshold(sp1_params, 20))
            parameters[sp1]['cells_detected'] = np.sum(sp1_counts >= parameters[sp1]['cell_threshold'])
        else:
            parameters[sp1]['cells_detected'] = 0

        if parameters[sp2]['cell_threshold'] is not None:
            sp2_counts = np.array([
                targeted_counts_by_species[sp2][bc]
                for bc in non_excluded_barcodes[sp2]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and
                   (targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp2_params = estimate_parameters(Counter(sp2_counts), threshold=sp2_threshold)
            if not np.isnan(sp2_params.frac_noise):
                parameters[sp2]['cell_threshold'] = max(sp2_threshold, estimate_threshold(sp2_params, 20))
            parameters[sp2]['cells_detected'] = np.sum(sp2_counts >= parameters[sp2]['cell_threshold'])
        else:
            parameters[sp2]['cells_detected'] = 0

        print('Secondary threshold ({}): {}'.format(sp1, parameters[sp1]['cell_threshold']))
        print('Secondary threshold ({}): {}'.format(sp2, parameters[sp2]['cell_threshold']))

    print('Writing out cell barcodes')
    cell_barcodes = {}
    for (species, count_data) in retained_counts.iteritems():
        threshold = parameters[species]['cell_threshold']
        cell_barcodes[species] = {}
        print('Cell threshold for species {}: {}'.format(species, threshold))
        if threshold is not None:
            for count, barcode in zip(count_data, non_excluded_barcodes[species]):
                if count >= threshold:
                    print('{} - Total {}, Targeted {}, Count {}, Threshold {}'.format(
                        barcode, barcode_counts_by_species[species][barcode],
                        targeted_counts_by_species[species][barcode], count, threshold))
                    cell_barcodes[species][barcode] = count
        if len(cell_barcodes[species]) != parameters[species]['cells_detected']:
            print(len(cell_barcodes[species]), parameters[species]['cells_detected'])
            raise ValueError('Mismatch in called cells identified - failure in threshold setting')
        print('Selected {} barcodes of species {}'.format(len(cell_barcodes[species]), species))

    with open(outs.cell_barcodes, 'w') as outfile:
        # low mem reduce op to merge-sort bcs across species
        for species in cell_barcodes.keys():
            outfile.write(species + ",")
            outfile.write(",".join(cell_barcodes[species]) + "\n")

    cell_index = compute_cell_index(species_list, cell_barcodes)

    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,cell_id,")
        outfile.write(",".join(["is_{}_cell_barcode".format(species) for species in species_list]))
        if len(species_list) > 1:
            for species in species_list:
                outfile.write(",passed_filters_{}".format(species))
                outfile.write(",peak_region_fragments_{}".format(species))
        outfile.write("\n")
        for barcode in [NO_BARCODE] + sorted(barcodes):
            outfile.write("{},".format(barcode))
            outfile.write("{},".format(cell_index.get(barcode, "None")))
            values = [
                str(int(species in cell_barcodes and barcode in cell_barcodes[species]))
                for species in species_list
            ]
            outfile.write(",".join(values))
            if len(species_list) > 1:
                for species in species_list:
                    outfile.write(",{:d}".format(barcode_counts_by_species[species][barcode]))
                    outfile.write(",{:d}".format(targeted_counts_by_species[species][barcode]))
            outfile.write("\n")

    # process data into summary metrics
    summary_info = {}
    summary_info.update(generate_cell_calling_metrics(parameters, cell_barcodes))
    summary_info.update(generate_gb_metrics(cell_barcodes, excluded_barcodes))

    with open(outs.cell_calling_summary, 'w') as outfile:
        outfile.write(json.dumps(summary_info, indent=4))
def main(args, outs): """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """ chunk = args.chunk bam_in = create_bam_infile(args.align_chunk) bam_out, _ = tk_bam.create_bam_outfile( outs.output, None, None, template=bam_in, pgs=[ tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs", TENX_PRODUCT_NAME) ]) gp_tagger = GlobalFivePrimePosTagger(bam_in) if args.barcode_whitelist is None or args.bc_counts is None: # If there's no whitelist or counts then all high quality BC reads get allowed. barcode_whitelist = None wl_idxs = None bc_dist = None else: barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) # Load the bc counts for this GEM group counts = json.load(open(args.bc_counts, 'r')) counts = counts[str(chunk['gem_group'])]['bc_counts'] # Prior distribution over barcodes, with pseudo-count bc_dist = np.array(counts, dtype=np.float) + 1.0 bc_dist = bc_dist / bc_dist.sum() wl_idxs = { bc: idx for (idx, bc) in enumerate(sorted(list(barcode_whitelist))) } # set random seed to get deterministic subsampling random.seed(0) if chunk['barcode'] is not None: processed_barcode_iter = get_raw_processed_barcodes( open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist) require_barcode_for_stringent = True else: processed_barcode_iter = itertools.repeat(None) require_barcode_for_stringent = False if chunk['sample_index'] is not None: sample_index_iter = tk_fasta.read_generator_fastq( open_maybe_gzip(chunk['sample_index'])) else: sample_index_iter = itertools.repeat(None) if chunk['trim'] is not None: trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip( chunk['trim']), paired_end=True) else: trim_iter = itertools.repeat(None) iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter) # First read try: read = bam_in.next() except StopIteration: read = None # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates perfect_read_count = 0 # Due to secondary alignments, we must apply the tags to all # reads with the same cluster name. 
for (barcode_info, sample_index_info, trim_info) in iters: tags = [] read_name = None if read is None: break if barcode_info is not None: (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info tags.append((RAW_BARCODE_TAG, raw_bc_seq)) tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual)) if processed_bc_seq is not None: tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq)) read_name = bc_read_name.split()[0] if sample_index_info is not None: (si_read_name, seq, qual) = sample_index_info tags.append((SAMPLE_INDEX_TAG, seq)) tags.append((SAMPLE_INDEX_QUAL_TAG, qual)) if read_name is not None: if si_read_name.split()[0] != read_name: martian.log_info( "mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name)) assert (si_read_name.split()[0] == read_name) else: read_name = si_read_name.split()[0] r1_tags = tags r2_tags = list(r1_tags) if trim_info is not None: (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info if len(trim1_seq) > 0: r1_tags.append((TRIM_TAG, trim1_seq)) r1_tags.append((TRIM_QUAL_TAG, trim1_qual)) if len(trim2_seq) > 0: r2_tags.append((TRIM_TAG, trim2_seq)) r2_tags.append((TRIM_QUAL_TAG, trim2_qual)) reads_attached = 0 reads_to_attach = [] while read.query_name == read_name or read_name is None: tags = r1_tags if read.is_read1 else r2_tags if len(tags) > 0: existing_tags = read.tags existing_tags.extend(tags) read.tags = existing_tags if reads_to_attach and ( read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None): gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not (get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) reads_to_attach = [] reads_to_attach.append(read) try: read = bam_in.next() except StopIteration: read = None break gp_tagger.tag_reads(reads_to_attach) reads_attached += len(reads_to_attach) for r in reads_to_attach: if stringent_read_filter(r, require_barcode_for_stringent): perfect_read_count += 1 if args.exclude_non_bc_reads: if not (get_read_barcode(r) is None): bam_out.write(r) else: bam_out.write(r) # We may have more than 2 reads if there was a # secondary alignment, but less than 2 means # something went wrong assert (reads_attached >= 2) outs.perfect_read_count = perfect_read_count bam_out.close()
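
# Illustrative sketch (not pipeline code): tags are applied per query name because a
# template can produce more than two alignment records (secondary alignments). Grouping
# a name-sorted BAM by query_name, as below, yields one batch of records per read pair;
# the BAM path is a hypothetical placeholder.
def _example_group_by_query_name(bam_path="namesorted.bam"):
    import itertools
    import pysam
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam:
        for name, records in itertools.groupby(bam, key=lambda rec: rec.query_name):
            yield name, list(records)  # all alignments sharing one read name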