def main(args, outs):
    """Count confidently-mapped, deduplicated reads per (feature, barcode)
    from a BAM chunk and write the resulting count matrix plus a chunked
    reporter with per-library-type metrics.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    # Library info is stored in the BAM header; one metric prefix per library.
    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted(
        list(set([x['library_type'] for x in libraries])))
    library_prefixes = map(
        lambda lib: rna_library.get_library_type_metric_prefix(lib[
            'library_type']), libraries)
    chroms = in_bam.references
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Only fall back to the detected-barcode list when there is no whitelist.
    barcode_summary = cr_utils.load_barcode_tsv(
        args.barcodes_detected) if not barcode_whitelist else None
    # TODO: this is redundant
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups,
                                  library_types=distinct_library_types)
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary
    matrix = cr_matrix.CountMatrix.empty(feature_ref, barcode_seqs,
                                         dtype='int32')
    # Iterate read groups sharing a qname; the reporter callback decides
    # whether the group counts (confidently mapped + deduped) and under
    # which feature/barcode.
    for qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        is_conf_mapped_deduped, genome, feature_id, bc = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=cr_chem.has_umis(args.chemistry_def))
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)
    in_bam.close()
    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)
    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def get_constants_for_pipeline(pipeline):
    """Return (metrics, alarms, charts, metric_prefixes) for a pipeline.

    VDJ pipelines use the VDJ websummary constants and VdjReporter prefixes;
    every other pipeline gets the GEX constants and plain Reporter prefixes.
    """
    is_vdj = pipeline == shared_constants.PIPELINE_VDJ
    constants_mod = ws_vdj_constants if is_vdj else ws_gex_constants
    if is_vdj:
        metric_prefixes = vdj_report.VdjReporter().get_all_prefixes()
    else:
        metric_prefixes = cr_report.Reporter().get_all_prefixes()
    return (constants_mod.METRICS, constants_mod.METRIC_ALARMS,
            constants_mod.CHARTS, metric_prefixes)
def main(args, outs):
    """Mark PCR duplicates in a sorted BAM chunk.

    Reads are grouped by barcode-sort key (gem group, barcode, gene ids);
    for each valid group, duplicates are counted/marked for cDNA PCR
    (uncorrected and UMI-corrected) and SI PCR dupe types. Marked reads are
    written to the output BAM and metrics saved to the chunked reporter.
    """
    outs.coerce_strings()
    in_bam = tk_bam.create_bam_infile(args.input)
    # Only process the reads in this chunk's byte range.
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam,
                                         (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None,
                                           template=in_bam)
    chroms = in_bam.references
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  chroms=chroms)
    # groupby assumes the BAM chunk is already sorted by this key.
    for (gg, bc, gene_ids), reads_iter in itertools.groupby(
            in_bam_chunk, key=cr_utils.barcode_sort_key):
        # Ignore reads w/o a valid barcode, unmapped reads and reads that map to more than 1 gene
        if bc is None or gg is None or gene_ids is None or len(gene_ids) != 1:
            for read in reads_iter:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue
        # Materialize the group: it is iterated three times below.
        reads = list(reads_iter)
        gene_id = gene_ids[0]
        # Count cDNA PCR duplicates with uncorrected UMIs
        dupe_key_umi_counts = mark_dupes(
            bc, gene_id, reads, args,
            cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
            cr_utils.cdna_pcr_dupe_func, reporter)
        # Record UMI corrections
        umi_corrections = correct_umis(dupe_key_umi_counts)
        # Mark duplicates for cDNA PCR duplicates with corrected UMIs
        # (this pass is the one that actually writes reads to out_bam).
        mark_dupes(bc, gene_id, reads, args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func,
                   reporter,
                   corrected_dupe_keys=umi_corrections,
                   out_bam=out_bam)
        # Count duplicates for SI PCR duplicates with uncorrected UMIs
        mark_dupes(bc, gene_id, reads, args, cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func, reporter)
    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Subsample a slice of the molecule info file, build raw and
    cell-filtered gene-barcode matrices from the subsampled molecules, and
    record the subsampled duplication fraction in the chunked reporter.

    No-op (returns early) when this chunk has no subsample rate.
    """
    # Deterministic subsampling across runs.
    np.random.seed(0)
    subsample_rate = args.subsample_info.get('subsample_rate')
    if subsample_rate is None:
        return
    # Open only this chunk's slice of the molecule info file.
    mol_counter = MoleculeCounter.open(args.molecule_info, 'r',
                                       start=int(args.chunk_start),
                                       length=int(args.chunk_len))
    # Subsample the matrices
    # subsample_result is filled in by the builder (e.g. 'mapped_reads').
    subsample_result = {}
    subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
        mol_counter,
        subsample_rate=subsample_rate,
        subsample_result=subsample_result)
    # Filter the subsampled matrices
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
        filtered_bcs_per_genome)
    # Calculations for subsampled duplication rate
    reporter = cr_report.Reporter(
        genomes=map(str, mol_counter.get_ref_column('genome_ids')),
        subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
        subsample_depths=args.subsample_info['all_target_rpc'])
    reporter.subsampled_duplication_frac_cb(
        subsampled_raw_mats,
        mol_counter,
        args.subsample_info['subsample_rate'],
        args.subsample_info['subsample_type'],
        args.subsample_info['target_rpc'],
        subsample_result['mapped_reads'],
    )
    mol_counter.close()
    reporter.save(outs.chunked_reporter)
    # Stage outputs: paths to the two subsampled matrix HDF5 files.
    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = martian.make_path(
        'raw_matrices.h5')
    outs.subsampled_matrices['filtered_matrices'] = martian.make_path(
        'filtered_matrices.h5')
    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
def get_constants_for_pipeline(pipeline, sample_properties):
    """Get the appropriate metrics/alarms/charts for a pipeline.

    For VDJ, the metric prefixes and alarms are additionally filtered by the
    sample properties; GEX uses its constants unfiltered.
    """
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics = ws_vdj_constants.METRICS
        charts = ws_vdj_constants.CHARTS
        alarms = filter_vdj_alarms(ws_vdj_constants.METRIC_ALARMS,
                                   sample_properties)
        metric_prefixes = filter_vdj_prefixes(
            vdj_report.VdjReporter().get_all_prefixes(), sample_properties)
    else:
        metrics = ws_gex_constants.METRICS
        charts = ws_gex_constants.CHARTS
        alarms = ws_gex_constants.METRIC_ALARMS
        metric_prefixes = cr_report.Reporter().get_all_prefixes()
    return metrics, alarms, charts, metric_prefixes
def main(args, outs):
    """Process STAR alignments for one chunk: attach barcodes/UMIs via the
    reporter callbacks and record the number of alignments processed.
    """
    reference_star_path = cr_utils.get_reference_star_path(args.reference_path)
    star_index = cr_transcriptome.build_star_index(reference_star_path)
    # star_index[0][0] holds the chromosome names — TODO confirm against
    # build_star_index's return structure.
    chroms = star_index[0][0]
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Barcode count distribution for this chunk's gem group.
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)
    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_dist=barcode_dist,
        gem_groups=args.gem_groups,
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        umi_min_qual_threshold=args.umi_min_qual_threshold)
    # init/finalize bracket the per-alignment processing.
    reporter.attach_bcs_init()
    outs.num_alignments = process_alignments(args.chunk_genome_input,
                                             args.chunk_trimmed_input,
                                             outs.output, args.bam_comments,
                                             reporter, gene_index, star_index,
                                             args)
    reporter.attach_bcs_finalize()
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Count confidently-mapped, deduplicated reads per (genome, gene,
    barcode) from a BAM chunk into per-genome gene-barcode matrices, then
    save the matrices and the chunked reporter.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    chroms = in_bam.references
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Only fall back to the barcode summary when there is no whitelist.
    barcode_summary = cr_utils.load_barcode_summary(
        args.barcode_summary) if not barcode_whitelist else None
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups)
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary
    # One matrix per genome in the reference.
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(), genomes)
    matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)
    for read in in_bam:
        is_conf_mapped_deduped, genome, gene_id, bc = reporter.count_genes_bam_cb(
            read, use_umis=cr_chem.has_umis(args.chemistry_def))
        if is_conf_mapped_deduped:
            matrices.add(genome, gene_id, bc)
    in_bam.close()
    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk molecule info files into one HDF5 output.

    Combines upstream summary JSONs with reference metadata, reads library
    info from the first input BAM's header, sums per-library usable-read
    metrics across chunks, and concatenates the chunk HDF5 files.

    Fix: the input BAM handle was opened but never closed (resource leak);
    it is now closed as soon as the library info has been read.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM; close the handle immediately — only the
    # header is needed (the original code leaked this file handle).
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    library_info = rna_library.get_bam_library_info(in_bam)
    in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary, library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]
    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output, input_h5_filenames,
                                metrics=metrics)
def main(args, outs):
    """Extract RNA reads, barcodes, sample indices and UMIs from input FASTQ
    chunks; write tagged read-1 (and read-2 if paired-end) FASTQs, trimmed
    sequence data as an unaligned BAM, barcode counts, and a chunked
    reporter with raw-FASTQ metrics.
    """
    # Deterministic downsampling across runs.
    random.seed(0)
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    # Tag pairs (seq tag, qual tag) parallel to read_defs; RNA reads carry
    # no tags of their own.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None
    # First reader seen for each read_type claims the trim defs.
    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)
    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # No UMIs in this chemistry: a no-op reader keeps the tuple shape.
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)
    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Zip all readers in lockstep; izip_longest pads exhausted readers with None.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        # flag 4 = unmapped segment.
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()
    trimmed_seq_writer.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        # No reads written at all: emit empty, mutually consistent outputs.
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def join(args, outs, chunk_defs, chunk_outs):
    """Join stage: concatenate chunk outputs (read paths, gem groups,
    library info, read groups), merge barcode and feature counts, and write
    a per-library-type-prefixed metrics summary JSON.
    """
    # Concatenate per-chunk output lists in chunk order.
    outs.reads, outs.read2s, outs.tags = [], [], []
    outs.gem_groups, outs.library_types, outs.library_ids, outs.read_groups = [], [], [], []

    for chunk_out in chunk_outs:
        outs.reads += [read for read in chunk_out.reads]
        outs.read2s += [read2 for read2 in chunk_out.read2s]
        outs.tags += [tags for tags in chunk_out.tags]
        outs.gem_groups += [gem_group for gem_group in chunk_out.gem_groups]
        outs.library_types += [lt for lt in chunk_out.library_types]
        outs.library_ids += [li for li in chunk_out.library_ids]
        outs.read_groups += [
            read_group for read_group in chunk_out.read_groups
        ]

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )
    # Ensure consistency of BAM comments
    assert all(chunk_out.bam_comments == chunk_outs[0].bam_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = chunk_outs[0].bam_comments

    # Write barcode counts (merged by library_type)
    bc_counters = BarcodeCounter.merge_by(
        [co.barcode_counts for co in chunk_outs],
        [cd.library_type for cd in chunk_defs], args.barcode_whitelist,
        outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts
    # Element-wise sum of the per-chunk count arrays.
    feature_counts = None
    for chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
            if feature_counts is None:
                feature_counts = chunk_counts
            else:
                feature_counts += chunk_counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        # Each chunk must contain exactly one library type.
        chunk_lib_types = set(lt for lt in chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = list(chunk_lib_types)[0]
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        j = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        j_prefixed = dict((prefix + k, v) for k, v in j.iteritems())
        summary.update(j_prefixed)

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)
def main(args, outs):
    """Extract reads for one library chunk with optional feature-barcode
    matching: writes (possibly header-augmented) read-1/read-2 FASTQs, a
    tags FASTQ when headers are not augmented, barcode counts, per-feature
    read counts, and a chunked reporter.
    """
    # Deterministic downsampling across runs.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    # Tag pairs (seq tag, qual tag) parallel to read_defs; RNA reads carry
    # no tags of their own.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)
        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        # No UMIs in this chemistry: a no-op reader keeps the tuple shape.
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file,
                                          compression=COMPRESSION)

    # When not augmenting FASTQ headers, the tags go to a dedicated writer.
    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags, args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Zip all readers in lockstep; izip_longest pads exhausted readers with None.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        # NOTE(review): lib_idx is loop-invariant (depends only on args) and
        # could be hoisted above the loop — confirm before changing.
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              lib_idx, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual
            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)
                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            # Plain header to the read writer; tags go to the tag writer.
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        # Per-file metadata for this chunk's library.
        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]
        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        # No reads written at all: emit empty, mutually consistent outputs.
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def main(args, outs): random.seed(0) paired_end = cr_chem.is_paired_end(args.chemistry_def) # Use the chemistry to get the locations of various sequences rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def) rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def) bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def) si_read_def = cr_chem.get_si_read_def(args.chemistry_def) umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def) read_defs = [rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def] read_tags = [None, None, (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG), (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG), (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG), ] # Determine which trimmed sequences need to be retained for bamtofastq trim_defs = get_bamtofastq_defs(read_defs, read_tags) outs.bam_comments = sorted(set(trim_defs.itervalues())) gem_groups = [chunk['gem_group'] for chunk in args.chunks] reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def), primers=cr_utils.get_primers_from_dicts(args.primers), gem_groups=gem_groups) # Determine if barcode sequences need to be reverse complemented. 
bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None) barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist) barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter) bc_check_rc.close() # Log the untrimmed read lengths to stdout r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None) r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None) r1_untrimmed_len = 0 for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS): r1_untrimmed_len = max(r1_untrimmed_len, len(read[1])) print "Read 1 untrimmed length = ", r1_untrimmed_len print "Input arg r1_length = ", args.r1_length r1_reader.close() if paired_end: r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None) r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None) r2_untrimmed_len = 0 for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS): r2_untrimmed_len = max(r2_untrimmed_len, len(read[1])) print "Read 2 untrimmed length = ", r2_untrimmed_len print "Input arg r2_length = ", args.r2_length r2_reader.close() # Setup read iterators. 
r1_length = args.r1_length r2_length = args.r2_length rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length) rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length) bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length) si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length) if cr_chem.has_umis(args.chemistry_def): umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length) else: umi_reads = FastqReader(None, None, False, r1_length, r2_length) fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads) read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION) if paired_end: read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION) bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts) all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers]) EMPTY_READ = (None, '', '') reporter.extract_reads_init() for extractions in itertools.islice(all_read_iter, args.initial_reads): # Downsample if random.random() > args.subsample_rate: continue rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ si_read = si_extraction if si_extraction is not None else EMPTY_READ umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ if (not rna_read[1]) or (paired_end and (not rna_read2[1])): # Read 1 is empty or read 2 is empty (if paired_end) # Empty reads causes issue with STAR aligner, so eliminate # them here continue if bc_read != EMPTY_READ: # Reverse complement the barcode 
if necessary if barcode_rc: bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1]) # Track the barcode count distribution bc_counter.count(*bc_read) # Calculate metrics on raw sequences reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics) # Construct new fastq headers fastq_header1 = AugmentedFastqHeader(rna_read[0]) fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1]) fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2]) fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1]) fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2]) fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1]) fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2]) fastq_header_str1 = fastq_header1.to_string() read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2])) if paired_end: fastq_header2 = AugmentedFastqHeader(rna_read2[0]) fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1]) fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2]) fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1]) fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2]) fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1]) fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2]) read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2])) reporter.extract_reads_finalize() # Close input and output files. rna_reads.close() if paired_end: rna_read2s.close() bc_reads.close() si_reads.close() umi_reads.close() read1_writer.close() if paired_end: read2_writer.close() bc_counter.close() # Set stage output parameters. 
if len(read1_writer.file_paths) > 0: outs.reads = read1_writer.get_out_paths() if paired_end: outs.read2s = read2_writer.get_out_paths(len(outs.reads)) else: outs.read2s = [] outs.gem_groups = [args.gem_group] * len(outs.reads) outs.read_groups = [args.read_group] * len(outs.reads) else: outs.reads = [] outs.read2s = [] outs.gem_groups = [] outs.read_groups = [] assert len(outs.gem_groups) == len(outs.reads) if paired_end: assert len(outs.reads) == len(outs.read2s) # this is the first reporter stage, so store the pipeline metadata reporter.store_pipeline_metadata(martian.get_pipelines_version()) reporter.save(outs.chunked_reporter)