def build_from_mol_counter(molecule_counter, subsample_rate=1.0, subsample_result=None): """ Construct a GeneBCMatrices object from a MoleculeCounter. Args: subsample_result (dict) - Return some metrics results into this dict. """ # Reconstruct all barcode sequences in the original matrices barcode_whitelist = cr_utils.load_barcode_whitelist(molecule_counter.get_barcode_whitelist()) barcode_length = molecule_counter.get_barcode_length() or len(barcode_whitelist[0]) gem_groups = molecule_counter.get_gem_groups() barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist, gem_groups) # Reconstruct Gene tuples from the molecule info ref columns gene_ids = molecule_counter.get_ref_column('gene_ids') genome_ids = molecule_counter.get_ref_column('genome_ids') gene_names = molecule_counter.get_ref_column('gene_names') gene_tuples = [cr_constants.Gene(gid, gname, None, None, None) for (gid, gname) in itertools.izip(gene_ids, gene_names)] genes = cr_utils.split_genes_by_genomes(gene_tuples, genome_ids) matrices = GeneBCMatrices(genome_ids, genes, barcode_seqs) # Track results of subsampling reads = 0 for mol in molecule_counter.get_molecule_iter(barcode_length, subsample_rate=subsample_rate): matrices.add(mol.genome, mol.gene_id, mol.barcode) reads += mol.reads if subsample_result is not None: subsample_result['mapped_reads'] = reads return matrices
def main(args, outs):
    """Build a feature x barcode CountMatrix from one BAM chunk.

    Groups BAM records by query name, lets the reporter decide whether each
    group is a confidently-mapped, deduplicated molecule, and if so increments
    the (feature, barcode) entry of the matrix. Saves the matrix and the
    chunked reporter for the join step.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted(list(set([x['library_type'] for x in libraries])))
    # Per-library metric prefixes, index-aligned with `libraries`
    library_prefixes = map(lambda lib: rna_library.get_library_type_metric_prefix(lib['library_type']),
                           libraries)

    chroms = in_bam.references
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Only fall back to the detected-barcode list when there is no whitelist
    barcode_summary = cr_utils.load_barcode_tsv(args.barcodes_detected) if not barcode_whitelist else None

    # TODO: this is redundant
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))

    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups,
                                  library_types=distinct_library_types)

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Matrix columns: whitelist barcodes expanded by gem group, or else the
    # observed-barcode summary.
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist, args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    matrix = cr_matrix.CountMatrix.empty(feature_ref, barcode_seqs, dtype='int32')

    # Iterate reads grouped by qname so the callback sees all alignments of a
    # read pair at once.
    for qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        is_conf_mapped_deduped, genome, feature_id, bc = reporter.count_genes_bam_cb(
            reads_iter, libraries, library_prefixes,
            use_umis=cr_chem.has_umis(args.chemistry_def))
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def __init__(self, barcode_whitelist, out_counts, gem_groups=None):
    """Prepare per-barcode counting against a gem-group-expanded whitelist.

    When no whitelist can be loaded, the counting structures remain None
    and only the output path is recorded.
    """
    self.out_counts = out_counts
    self.barcode_counts = None
    self.barcode_index = None

    seqs = cr_utils.load_barcode_whitelist(barcode_whitelist)
    if seqs:
        # Expand each whitelist sequence across the given gem groups
        seqs = cr_utils.format_barcode_seqs(seqs, gem_groups)
        self.barcode_counts = np.zeros(len(seqs), dtype=np.uint32)
        # Reverse lookup: barcode sequence -> position in the count vector
        self.barcode_index = dict((bc, i) for i, bc in enumerate(seqs))
    self.barcode_seqs = seqs
def split(args):
    """Compute the memory request for the single processing chunk.

    The chunk must hold the umi_info data plus a JSON dict with one key per
    whitelist barcode; the request never drops below the pipeline minimum.
    """
    # Memory to hold the umi_info contents (doubled for working copies)
    umi_mem_gb = 2 * int(np.ceil(vdj_umi_info.get_mem_gb(args.umi_info)))

    # Memory for a dict keyed by every whitelist barcode
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    summary_mem_gb = tk_stats.robust_divide(len(whitelist), DICT_BCS_PER_MEM_GB)

    requested = max(cr_constants.MIN_MEM_GB, umi_mem_gb + summary_mem_gb)
    return {
        'chunks': [{
            '__mem_gb': int(np.ceil(requested)),
        }]
    }
def split(args):
    """Plan chunks and memory requests for the count-genes stage.

    One chunk per input BAM; the join step's request additionally accounts
    for the reporter's barcode- and UMI-diversity dicts.
    """
    chunk_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(args.barcode_whitelist)

    chunks = []
    for chunk_input in args.inputs:
        chunks.append({
            'chunk_input': chunk_input,
            '__mem_gb': chunk_mem_gb,
        })

    # Base join request scaled by gem groups (no minimum clamp here; clamped below)
    join_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Account for memory used by reporters (particularly the bc and umi diversity dicts)
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if barcode_whitelist is not None:
        num_barcodes = len(barcode_whitelist) * max(args.gem_groups)
    else:
        num_barcodes = cr_utils.get_num_barcodes_from_barcode_summary(args.barcode_summary)

    max_bc_diversity_entries = num_barcodes
    # All possible UMI sequences for this chemistry's UMI length
    max_umi_diversity_entries = 4**cr_chem.get_umi_length(args.chemistry_def)

    # Multiply by 2 to hold the current reporter + accumulating reporter in the merge
    bc_diversity_mem_gb = (2 * max_bc_diversity_entries *
                           cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                           (len(genomes) + 1) * len(cr_constants.READ_TYPES)) / 1e9
    umi_diversity_mem_gb = (2 * max_umi_diversity_entries *
                            cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                            (len(genomes) + 1) * len(cr_constants.READ_TYPES)) / 1e9

    # Clamp the final join request between the pipeline min and stage max
    join_mem_gb = min(
        cr_constants.COUNT_GENES_MAX_MEM_GB,
        max(cr_constants.MIN_MEM_GB,
            int(join_mem_gb + bc_diversity_mem_gb + umi_diversity_mem_gb)))

    join = {
        '__mem_gb': join_mem_gb,
    }
    return {'chunks': chunks, 'join': join}
def main(args, outs):
    """Call cell barcodes for one molecule-info chunk / gem group.

    Reads a slice of the molecule info HDF5, calls cellular barcodes per
    genome (fixed cutoff when --force-cells is given, otherwise the
    order-of-magnitude filter), writes the filtered barcodes, and reports
    cell and confidently-mapped-read totals.
    """
    # Seed both RNGs so any downstream sampling is deterministic
    random.seed(0)
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(args.molecule_h5, 'r',
                                             start=int(args.chunk_start),
                                             length=int(args.chunk_len)) as ctr_in:
        genome_ids = ctr_in.get_ref_column('genome_ids')
        gene_ids = ctr_in.get_ref_column('gene_ids')
        barcode_whitelist = cr_utils.load_barcode_whitelist(ctr_in.get_barcode_whitelist())

        # Estimate BC diversity and recovered cells per gem group
        gg_total_diversity = len(barcode_whitelist)

        bc_counts_per_genome = get_bc_counts(genome_ids, gene_ids, ctr_in)

        top_bcs_per_genome = {}
        total_conf_mapped_cell_reads = 0
        total_cells = 0
        recovered_cells = args.recovered_cells or cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

        for genome, (barcodes, umi_counts, read_counts) in bc_counts_per_genome.iteritems():
            if args.force_cells is not None:
                # Take exactly the top force_cells barcodes by UMI count
                top_bc_indices, filter_summary, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    umi_counts, args.force_cells)
            else:
                top_bc_indices, filter_summary, _ = cr_stats.filter_cellular_barcodes_ordmag(
                    umi_counts, recovered_cells, gg_total_diversity)

            top_bcs_per_genome[genome] = barcodes[top_bc_indices]
            total_conf_mapped_cell_reads += read_counts[top_bc_indices].sum()
            total_cells += filter_summary['filtered_bcs']

        write_filtered_barcodes(outs.cell_barcodes, args.gem_group, ctr_in, top_bcs_per_genome)

    outs.gem_group_metrics = {
        'cells': int(total_cells),
        'cmb_reads': int(total_conf_mapped_cell_reads)
    }
def main(args, outs):
    """Attach barcodes to one chunk of alignments.

    Loads the STAR index, gene index, whitelist, and barcode distribution,
    then runs process_alignments over the chunk and saves the reporter.
    """
    reference_star_path = cr_utils.get_reference_star_path(args.reference_path)
    star_index = cr_transcriptome.build_star_index(reference_star_path)
    # First element of the first index entry holds the chromosome names
    chroms = star_index[0][0]
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_dist=barcode_dist,
                                  gem_groups=args.gem_groups,
                                  umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  umi_min_qual_threshold=args.umi_min_qual_threshold)

    reporter.attach_bcs_init()
    outs.num_alignments = process_alignments(args.chunk_genome_input,
                                             args.chunk_trimmed_input,
                                             outs.output,
                                             args.bam_comments,
                                             reporter,
                                             gene_index,
                                             star_index,
                                             args)
    reporter.attach_bcs_finalize()
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Build per-genome gene x barcode matrices from one BAM chunk.

    Per-read variant (no qname grouping): each BAM record is passed to the
    reporter callback, and confidently-mapped, deduplicated reads increment
    the (genome, gene, barcode) matrix entry.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    chroms = in_bam.references
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Only fall back to the barcode summary when there is no whitelist
    barcode_summary = cr_utils.load_barcode_summary(args.barcode_summary) if not barcode_whitelist else None

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))

    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups)

    # Matrix columns: whitelist expanded by gem group, or observed barcodes
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist, args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(), genomes)
    matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

    for read in in_bam:
        is_conf_mapped_deduped, genome, gene_id, bc = reporter.count_genes_bam_cb(
            read, use_umis=cr_chem.has_umis(args.chemistry_def))
        if is_conf_mapped_deduped:
            matrices.add(genome, gene_id, bc)

    in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Call VDJ cell barcodes per gem group and write support/summary outputs.

    For each gem group: loads the raw barcode read-count distribution, calls
    cell barcodes (optionally overridden by --force-cells, which takes the
    top-N barcodes by support), accumulates the called-cell set and
    per-barcode support, then writes the cell-barcode JSON, the support CSV,
    the per-barcode UMI summary, and the metrics summary.
    """
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(all_gem_groups)

    # NOTE(review): if barcode_whitelist is None, `barcodes` and `counts`
    # below are never bound and the vdj_filter_barcodes_cb call would raise
    # NameError — presumably a whitelist is always present for VDJ; confirm.
    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold, confidence = call_cell_barcodes(
            args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Override the caller: take the top-N barcodes by support
                sorted_bcs = map(
                    lambda kv: kv[0],
                    sorted(gg_bc_support.items(), key=lambda kv: kv[1], reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.iteritems():
                bc_support[bc] += count

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                     'total_read_pairs')

    # NOTE(review): `barcodes`/`counts` here carry values from the *last*
    # gem-group iteration only — confirm this is the intended input.
    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info,
                              reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi,
                              cell_barcodes)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    """Extract, tag, and rewrite reads for one FASTQ chunk.

    Reads RNA/barcode/sample-index/UMI (and optionally feature-barcode)
    sequences in lockstep from the input FASTQs, downsamples, counts raw
    barcodes, attaches tags to the read headers (or to a separate tag file
    when augment_fastq is off), writes chunked FASTQ output, and records
    feature-barcode read counts plus reporter metrics.
    """
    # Deterministic downsampling
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def]
    # Tag pairs (seq tag, qual tag), index-aligned with read_defs; the two
    # RNA reads carry no tags.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)
        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter,
                                     cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        # Chemistry has no UMIs: NOOP reader yields nothing
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved,
                                           r1_length, r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        # Tags go to a sidecar FASTQ instead of the read headers
        tag_writer = ChunkedFastqWriter(outs.tags, args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Lockstep iteration over all readers; shorter streams yield None
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0],
                           tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        # NOTE(review): lib_idx is loop-invariant and could be hoisted.
        lib_idx = [i for i, x in enumerate(args.library_info)
                   if x['library_id'] == args.library_id][0]
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              lib_idx, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG, feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG, feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write((fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            # Empty seq/qual: the tag file only carries the augmented header
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG, feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG, feat_qual)
            if feat_ids:
                fastq_header2.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            # NOTE(review): len(outs.tags) looks suspicious here — the read2s
            # branch above uses len(outs.reads); confirm which is intended.
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [li for li in libraries
                   if li['library_id'] == args.library_id][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
class MoleculeCounter: """ Streams a list of tuples w/named elements to or from an h5 file """ def __init__(self): self.file_version = None self.h5 = None self.columns = OrderedDict() self.ref_columns = OrderedDict() self.library_info = None def get_barcode_whitelist(self): return self.get_metric(BC_WHITELIST_METRIC) def get_gem_groups(self): return map(int, self.get_metric(GEM_GROUPS_METRIC).keys()) def is_aggregated(self): ret = self.get_metric(IS_AGGREGATED_METRIC) return ret if ret is not None else False @staticmethod def get_column_dtype(k): return np.dtype(MOLECULE_INFO_COLUMNS[k]) @staticmethod def get_record_bytes(): return sum([np.dtype(x).itemsize for x in MOLECULE_INFO_COLUMNS.values()]) @staticmethod def estimate_mem_gb(chunk_len, scale=1.0, cap=True): """ Estimate memory usage of this object given a number of records. """ mol_entries_per_gb = int(1e9 / MoleculeCounter.get_record_bytes()) mem_gb = round(math.ceil(scale * chunk_len / mol_entries_per_gb)) if cap: return max(h5_constants.MIN_MEM_GB, mem_gb) else: return mem_gb @staticmethod def build_barcode_info(filtered_barcodes_by_genome, library_info, barcodes): """Generate numpy arrays for per-barcode info Args: filtered_barcodes_by_genome (dict of str:list(str)): Keys are genomes, values are lists of filtered barcode strings. library_info (list of dict): Per-library metadata. barcodes (list of str): All barcode sequences (e.g. ['ACGT', ...] 
Returns: BarcodeInfo object """ # Replace a genome string with its lexicographical rank genome_to_idx = {g:i for i, g in \ enumerate(sorted(filtered_barcodes_by_genome.keys()))} libraries_for_gem_group = defaultdict(list) for lib_idx, lib in enumerate(library_info): libraries_for_gem_group[lib['gem_group']].append(lib_idx) # Map a barcode sequence to its index into the MoleculeCounter # 'barcodes' array bc_seq_to_idx = {bc:i for i, bc in enumerate(barcodes)} # Populate the "pass filter" array of tuples pf_tuples = [] for genome, bcs in filtered_barcodes_by_genome.iteritems(): genome_idx = genome_to_idx[genome] for bc_str in bcs: seq, gg = cr_utils.split_barcode_seq(bc_str) barcode_idx = bc_seq_to_idx[seq] # FIXME: Assumes no per-library filtering, just per-gem-group library_inds = libraries_for_gem_group[gg] for library_idx in library_inds: pf_tuples.append((barcode_idx, library_idx, genome_idx)) if len(pf_tuples) > 0: pass_filter = np.array(pf_tuples, dtype=BARCODE_INFO_DTYPES['pass_filter']) else: pass_filter = np.zeros((0,3), dtype=BARCODE_INFO_DTYPES['pass_filter']) assert pass_filter.shape[0] == len(pf_tuples) assert pass_filter.shape[1] == 3 # Sort by barcode index pass_filter = pass_filter[np.argsort(pass_filter[:,0]), :] return BarcodeInfo( pass_filter, genomes=sorted(filtered_barcodes_by_genome.keys()), ) @staticmethod def get_filtered_barcodes(barcode_info, library_info, barcodes, genome_idx=None, library_type=None): """Get a list of filtered barcode strings e.g. ['ACGT-1',...] Args: barcode_info (BarcodeInfo): Barcode info object. library_info (list of dict): Library info. barcodes (np.array): Barcode sequences. genome_idx (int): Restrict passing definition to this genome. None for no restriction. library_type (str): Restrict passing definition to this library type. None for no restriction. Returns: list of str """ # Without restrictions, assumes passing filter in a single library or genome is sufficient # for a barcode to be passing filter overall. 
pass_filter = barcode_info.pass_filter pf_barcode_idx = pass_filter[:,0] pf_library_idx = pass_filter[:,1] pf_genome_idx = pass_filter[:,2] mask = np.ones(pass_filter.shape[0], dtype=bool) if genome_idx is not None: mask &= pf_genome_idx == genome_idx if library_type is not None: library_inds = np.array([i for i,lib in enumerate(library_info) if lib['library_type'] == library_type], dtype=MOLECULE_INFO_COLUMNS['library_idx']) mask &= np.isin(pf_library_idx, library_inds) inds = np.flatnonzero(mask) lib_to_gg = np.array([lib['gem_group'] for lib in library_info], dtype='uint64') pf_gem_group = lib_to_gg[pf_library_idx[inds]] # Take unique, sorted barcodes (sorted by (gem_group, barcode_idx)) gg_bcs = np.unique(np.column_stack((pf_gem_group, pf_barcode_idx[inds])), axis=0) # Create barcode strings return [cr_utils.format_barcode_seq(barcodes[gg_bcs[i, 1]], gg_bcs[i, 0]) for i in xrange(gg_bcs.shape[0])] @staticmethod def save_barcode_info(bc_info, group): """Save barcode info to HDF5. Args: barcode_info (BarcodeInfo): Data. group (h5py.Group): Output group. """ group.create_dataset('pass_filter', data=bc_info.pass_filter, maxshape=(None, bc_info.pass_filter.shape[1]), compression=HDF5_COMPRESSION, shuffle=True) cr_io.create_hdf5_string_dataset(group, 'genomes', bc_info.genomes, compression=HDF5_COMPRESSION, shuffle=True) @staticmethod def load_barcode_info(group): """Load barcode info from an HDF5 group. Args: group (h5py.Group): Input group. Returns: BarcodeInfo object """ return BarcodeInfo( pass_filter=group['pass_filter'][:], genomes=cr_io.read_hdf5_string_dataset(group['genomes']), ) def get_barcode_info(self): return MoleculeCounter.load_barcode_info(self.h5[BARCODE_INFO_GROUP_NAME]) @staticmethod def open(filename, mode, feature_ref=None, barcodes=None, library_info=None, barcode_info=None): """Open a molecule info object. Args: filename (str): Filename to open or create mode (str): 'r' for reading, 'w' for writing. 
feature_ref (FeatureReference): Required when mode is 'w'. barcodes (list of str): All possible barcode sequences. Required when mode is 'w'. library_info (list of dict): Library metadata. Required when mode is 'w'. barcode_info (BarcodeInfo): Per-barcode metadata. Returns: MoleculeInfo: A new object """ assert mode == 'r' or mode == 'w' mc = MoleculeCounter() if mode == 'w': if feature_ref is None: raise ValueError('Feature reference must be specified when opening a molecule info object for writing') if barcodes is None: raise ValueError('Barcodes must be specified when opening a molecule info object for writing') if library_info is None: raise ValueError('Library info must be specified when opening a molecule info object for writing') if barcode_info is None: raise ValueError('Barcode info must be specified when opening a molecule info object for writing') mc.h5 = h5py.File(filename, 'w') cr_io.set_hdf5_attr(mc.h5, FILE_VERSION_KEY, CURR_FILE_VERSION) cr_io.set_hdf5_attr(mc.h5, h5_constants.H5_FILETYPE_KEY, MOLECULE_H5_FILETYPE) cr_io.set_hdf5_attr(mc.h5, FILE_VERSION_KEY, CURR_FILE_VERSION) mc.h5.create_group(METRICS_GROUP_NAME) # Write feature reference fref_group = mc.h5.create_group(h5_constants.H5_FEATURE_REF_ATTR) feature_ref.to_hdf5(fref_group) # Write barcodes # If there are multiple barcode lengths, use the largest for the numpy dtype. 
max_barcode_len = np.max(map(len, barcodes)) barcode_dtype = np.dtype('S%d' % max_barcode_len) mc.h5.create_dataset('barcodes', data=np.fromiter(barcodes, barcode_dtype, count=len(barcodes)), compression=HDF5_COMPRESSION) # Write library info lib_info_json = json.dumps(library_info, indent=4, sort_keys=True) cr_io.create_hdf5_string_dataset(mc.h5, 'library_info', [lib_info_json]) # Write barcode info g = mc.h5.create_group(BARCODE_INFO_GROUP_NAME) MoleculeCounter.save_barcode_info(barcode_info, g) # Create empty per-molecule datasets for name, col_type in MOLECULE_INFO_COLUMNS.iteritems(): mc.columns[name] = mc.h5.create_dataset(name, (0,), maxshape=(None,), dtype=col_type, compression=HDF5_COMPRESSION, chunks=(HDF5_CHUNK_SIZE,)) elif mode == 'r': mc.h5 = h5py.File(filename, 'r') try: mc.file_version = mc.h5.attrs[FILE_VERSION_KEY] except AttributeError: mc.file_version = 1 # V1 doesn't have version field if mc.file_version < CURR_FILE_VERSION: raise ValueError('The molecule info HDF5 file (format version %d) was produced by an older version of Cell Ranger. Reading these files is unsupported.' % mc.file_version) if mc.file_version > CURR_FILE_VERSION: raise ValueError('The molecule info HDF5 file (format version %d) was produced by an newer version of Cell Ranger. Reading these files is unsupported.' 
                             % mc.file_version)
        # NOTE(review): the lines above are the tail of a classmethod whose
        # opening (presumably `open`) is outside this view.
        # Bind each HDF5 dataset to the matching in-memory map based on which
        # schema group its key belongs to; unknown keys are a hard error.
        for key in mc.h5.keys():
            if key in MOLECULE_INFO_COLUMNS:
                mc.columns[key] = mc.h5[key]
            elif key in MOLECULE_REF_COLUMNS:
                mc.ref_columns[key] = mc.h5[key]
            elif key == h5_constants.H5_FEATURE_REF_ATTR:
                mc.feature_reference = FeatureReference.from_hdf5(mc.h5[key])
            elif key == METRICS_GROUP_NAME \
                 or key == BARCODE_INFO_GROUP_NAME:
                pass
            else:
                raise AttributeError("Unrecognized dataset key: %s" % key)

        # Load library info
        mc.library_info = json.loads(cr_io.read_hdf5_string_dataset(mc.h5['library_info'])[0])

        return mc

    def nrows(self):
        # Number of molecule rows; all per-molecule columns share this length.
        return self.get_column_lazy(MOLECULE_INFO_COLUMNS.keys()[0]).shape[0]

    def get_chunk_key(self, idx):
        # Tuple of the CHUNK_COLUMNS values at row idx; used to keep rows with
        # equal keys inside the same chunk (see get_chunks).
        return tuple(self.get_column_lazy(col)[idx] for col in CHUNK_COLUMNS)

    def set_metric(self, key, value):
        """Set a metric. Serialize to Pickle."""
        self.h5[METRICS_GROUP_NAME].attrs[key] = cPickle.dumps(value)

    def get_metric(self, key):
        """Get a metric. Returns None if the metric is absent."""
        try:
            value = cPickle.loads(self.h5[METRICS_GROUP_NAME].attrs[key])
        except KeyError:
            value = None
        return value

    def set_all_metrics(self, metrics):
        # Write every (key, value) pair via set_metric (pickled h5 attrs).
        for (k,v) in metrics.iteritems():
            self.set_metric(k, v)

    def get_all_metrics(self):
        # Unpickle every metric attribute stored on the metrics group.
        return {k:cPickle.loads(v) for k,v in self.h5[METRICS_GROUP_NAME].attrs.iteritems()}

    def append_column(self, name, values):
        """Append an array of values to a column.

        Resizes the underlying HDF5 dataset in place, so the dataset must
        have been created resizable along its first axis.
        """
        ds = self.columns[name]
        start = len(ds)
        end = start + len(values)
        ds.resize((end,))
        ds[start:end] = values

    def get_column_lazy(self, col_name):
        """ Retrieve column. Depending on how the file was opened,
        this may only be a file view instead of a full array. """
        return self.columns[col_name]

    def get_column(self, col_name):
        """Load an entire column of data into memory"""
        return self.get_column_lazy(col_name)[:]

    def set_ref_column(self, col_name, values):
        # NOTE(review): create_carray is a PyTables-style API; other methods
        # use h5py-style access — confirm which handle self.h5 carries here.
        assert col_name in MOLECULE_REF_COLUMNS
        self.ref_columns[col_name] = self.h5.create_carray(self.h5.root, col_name, obj=np.array(values))

    def get_ref_column(self, col_name):
        """Load a reference array into memory as a numpy array"""
        return self.get_ref_column_lazy(col_name)[:]

    def get_ref_column_lazy(self, col_name):
        """Get a reference array as a lazy h5py Dataset"""
        return self.ref_columns[col_name]

    def get_feature_ref(self):
        # Deserialize the FeatureReference stored under the feature-ref key.
        return FeatureReference.from_hdf5(self.h5[h5_constants.H5_FEATURE_REF_ATTR])

    def get_barcodes(self):
        # Full barcode list, loaded into memory.
        return self.h5['barcodes'][:]

    def get_num_filtered_barcodes_for_library(self, library_idx):
        """Count the number of barcodes passing filter for a library.
        Args:
          library_idx (int): Index of library to count.
        Returns:
          int: Number of filtered barcodes for this library."""
        # pass_filter rows are (barcode_idx, library_idx, genome_idx) triples;
        # column 1 selects the library, column 0 is deduplicated by barcode.
        pass_filter = self.h5[BARCODE_INFO_GROUP_NAME]['pass_filter'][:]
        this_lib = np.flatnonzero(pass_filter[:,1] == library_idx)
        barcode_inds = pass_filter[this_lib, 0]
        return len(np.unique(barcode_inds))

    def get_library_info(self):
        # Library info is stored as a JSON string dataset.
        return json.loads(self.h5['library_info'][0])

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # (parameter `type` shadows the builtin; kept for signature stability)
        self.close()

    def close(self):
        self.h5.close()

    def save(self):
        # Identical to close(); closing the HDF5 handle flushes it to disk.
        self.h5.close()

    @staticmethod
    def merge_barcode_infos(bc_infos):
        """Merge a BarcodeInfo into another BarcodeInfo.
        Args:
          src_bc_infos (list of BarcodeInfo): Input BarcodeInfos.
        Returns:
          BarcodeInfo"""
        assert len(bc_infos) > 0
        # All inputs must describe the same genome list.
        genomes = bc_infos[0].genomes

        # Total number of barcodes with any information
        pfs = []
        for bc_info in bc_infos:
            # Each pass_filter row is a (barcode, library, genome) triple.
            assert bc_info.pass_filter.shape[1] == 3
            assert bc_info.genomes == genomes
            pfs.append(bc_info.pass_filter)

        new_pf = np.concatenate(pfs, axis=0)

        # Deduplicate the tuples. Unique throws an error on a zero-row array.
        if new_pf.shape[0] > 0:
            new_pf = np.unique(new_pf, axis=0)

        return BarcodeInfo(
            pass_filter=new_pf,
            genomes=genomes,
        )

    @staticmethod
    def concatenate(out_filename, in_filenames, metrics=None):
        """Concatenate MoleculeCounter HDF5 files
        Args:
          out_filename (str): Output HDF5 filename
          in_filenames (list of str): Input HDF5 filenames
          metrics (dict): Metrics to write
        """
        # Load reference info from first file; all inputs are asserted below
        # to carry identical barcodes, library info, and feature ids.
        first_mc = MoleculeCounter.open(in_filenames[0], 'r')
        feature_ref = first_mc.get_feature_ref()
        barcodes = first_mc.get_barcodes()
        library_info = first_mc.get_library_info()

        feature_ids = [f.id for f in feature_ref.feature_defs]

        # print 'Merging barcode info'
        bc_infos = []
        for filename in in_filenames:
            with MoleculeCounter.open(filename, 'r') as mc:
                bc_infos.append(mc.get_barcode_info())
        merged_bc_info = MoleculeCounter.merge_barcode_infos(bc_infos)

        # print 'Concatenating molecule info files'
        out_mc = MoleculeCounter.open(out_filename, mode='w',
                                      feature_ref=feature_ref,
                                      barcodes=barcodes,
                                      library_info=library_info,
                                      barcode_info=merged_bc_info)

        for filename in in_filenames:
            with MoleculeCounter.open(filename, mode='r') as in_mc:
                # Assert that these data are compatible
                assert in_mc.get_library_info() == library_info
                assert np.array_equal(in_mc.get_barcodes(), barcodes)
                fref = in_mc.get_feature_ref()
                assert [f.id for f in fref.feature_defs] == feature_ids

                # if no metrics specified, copy them from the first file
                if metrics is None:
                    metrics = in_mc.get_all_metrics()

                # Concatenate per-molecule datasets
                for name, ds in in_mc.columns.iteritems():
                    out_mc.append_column(name, ds[:])

        out_mc.set_all_metrics(metrics)
        out_mc.save()

    def find_last_occurrence_of_chunk_key(self, from_row):
        # Linear scan forward from from_row for the last row that still has
        # the same chunk key; returns the final row index if it never changes.
        num_rows = self.nrows()
        initial_chunk_key = self.get_chunk_key(from_row)
        for i in xrange(from_row, num_rows):
            chunk_key = self.get_chunk_key(i)
            if not chunk_key == initial_chunk_key:
                return i - 1
        return num_rows - 1

    def bisect(self, query, key_func):
        return MoleculeCounter.bisect_static(self.nrows(),
                                             query, key_func)

    @staticmethod
    def bisect_static(num_rows, query, key_func):
        """ Performs a binary search to find the leftmost insertion point of query.
        Takes a key function, where key_func(i) = the value to compare to at index i."""
        # NOTE(review): `/` below is Python-2 integer division; this module is
        # Py2 throughout (xrange/iteritems), so the midpoint stays an int.
        lo = 0
        hi = num_rows
        exists = True
        while True:
            i = (hi + lo) / 2
            curr = key_func(i)
            if curr == query:
                break
            elif hi - lo <= 1:
                # non-matching case
                exists = False
                break
            elif curr < query:
                lo = i
            else:
                hi = i

        if exists:
            # backtrack to first occurrence
            for j in xrange(i, -1, -1):
                curr = key_func(j)
                if curr != query:
                    return j + 1

        return 0

    def get_chunks_from_partition(self, values, key_func):
        return MoleculeCounter.get_chunks_from_partition_static(self.nrows(), values, key_func)

    @staticmethod
    def get_chunks_from_partition_static(num_rows, values, key_func):
        """ Get chunks by partitioning on the specified values.

        Assumes the data are sorted by the partition key; yields
        (chunk_start, chunk_len) per value, in order.
        """
        # First chunk always starts at row 0; each later chunk starts at the
        # leftmost occurrence of its value.
        starts = [0] + [MoleculeCounter.bisect_static(num_rows, val, key_func) for val in values[1:]]
        n = len(starts)
        for i in xrange(n):
            chunk_start = starts[i]
            chunk_end = starts[i+1] if i+1 < n else num_rows
            yield (chunk_start, chunk_end - chunk_start)

    def get_chunks(self, target_chunk_len, preserve_boundaries=True):
        """ Get chunks, optionally preserving boundaries defined by get_chunk_key().
        Yields (chunk_start, chunk_len) which are closed intervals """
        num_rows = self.nrows()
        chunk_start, chunk_end = 0, 0
        while chunk_end < (num_rows - 1):
            target_chunk_end = min(num_rows - 1, chunk_start + target_chunk_len - 1)
            # Optionally extend each chunk so rows sharing a chunk key are
            # never split across chunks.
            chunk_end = self.find_last_occurrence_of_chunk_key(target_chunk_end) if preserve_boundaries else target_chunk_end
            chunk_len = 1 + chunk_end - chunk_start
            yield (chunk_start, chunk_len)
            chunk_start = 1 + chunk_end

    @staticmethod
    def compress_gem_group(x):
        # Cast a gem group to the storage dtype declared for the column.
        return MOLECULE_INFO_COLUMNS['gem_group'](x)

    @staticmethod
    def compress_umi_seq(x, umi_bits):
        # 2-bit pack a UMI sequence into an integer of umi_bits bits.
        return cr_utils.compress_seq(x, umi_bits)

    @staticmethod
    def get_metrics_from_summary(summary, libraries, total_recovered_cells=None, total_force_cells=None):
        """ Extract relevant metrics from a summary dict."""
        mol_metrics = {}

        # Software/reference provenance carried through verbatim.
        version_metrics = ['cellranger_version', 'reference_mkref_version', 'reference_fasta_hash', 'reference_gtf_hash']
        for m in version_metrics:
            mol_metrics[m] = summary[m]

        # All chemistry_* keys are copied as-is.
        chemistry_metrics = [m for m in summary if m.startswith('chemistry')]
        for m in chemistry_metrics:
            mol_metrics[m] = summary[m]

        # Per-library values
        lib_metrics = {}
        for lib_idx, lib in enumerate(libraries):
            lib_type_prefix = rna_library.get_library_type_metric_prefix(lib['library_type'])
            summary_name = '%s%s_total_read_pairs_per_library' % (lib_type_prefix, lib_idx)

            lib_metrics[str(lib_idx)] = {
                TOTAL_READS_METRIC: summary[summary_name],
            }

        # Per-gem-group values
        gg_metrics = {}
        gem_groups = sorted([lib['gem_group'] for lib in libraries])
        for gg in gem_groups:
            # Distribute the toplevel expected and forced cells parameters
            # evenly among the gem groups.
            # (Python-2 integer division when the totals are ints.)
            recovered_cells = total_recovered_cells / len(gem_groups) if total_recovered_cells is not None else None
            force_cells = total_force_cells / len(gem_groups) if total_force_cells is not None else None
            gg_metrics[str(gg)] = {
                GG_RECOVERED_CELLS_METRIC: recovered_cells,
                GG_FORCE_CELLS_METRIC: force_cells,
            }

        mol_metrics[LIBRARIES_METRIC] = lib_metrics
        mol_metrics[GEM_GROUPS_METRIC] = gg_metrics
        return mol_metrics

    @staticmethod
    def naive_concatenate_metrics(mol_h5_list):
        # Merge metrics across molecule-info files: scalar metrics come from
        # the first file; per-gem-group and per-library maps are unioned.
        combined_metrics = None
        gg_metrics = {}
        lib_metrics = {}
        for mol_h5 in mol_h5_list:
            with MoleculeCounter.open(mol_h5, mode='r') as counter:
                single_metrics = counter.get_all_metrics()
                if combined_metrics is None:
                    combined_metrics = single_metrics
                    gg_metrics = counter.get_metric(GEM_GROUPS_METRIC)
                    lib_metrics = counter.get_metric(LIBRARIES_METRIC)
                else:
                    # concatenate new gem groups to the metrics. if it collides with an existing
                    # gem group, the old one will be overwritten.
                    new_gg_metrics = counter.get_metric(GEM_GROUPS_METRIC)
                    new_lib_metrics = counter.get_metric(LIBRARIES_METRIC)
                    gg_metrics.update(new_gg_metrics)
                    lib_metrics.update(new_lib_metrics)

        combined_metrics[GEM_GROUPS_METRIC] = gg_metrics
        combined_metrics[LIBRARIES_METRIC] = lib_metrics
        return combined_metrics

    @staticmethod
    def get_compressed_bc_iter(barcodes):
        """ Yields compressed barcode tuples that can be compared against
        a MoleculeCounter's data. Useful for filtering a MoleculeCounter by barcode.
        Args: barcodes (iterable) - list of barcode strings (e.g., ACGT-1)
        Yields: (compressed_bc, compressed_gem_group) tuples """
        for barcode in barcodes:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            compressed_bc = MoleculeCounter.compress_barcode_seq(barcode_seq)
            compressed_gg = MoleculeCounter.compress_gem_group(gem_group)
            yield compressed_bc, compressed_gg

    def get_raw_read_pairs_per_library(self):
        """ Get raw read pairs per library.
        Returns:
          list of int: Order is by library index """
        return [self.get_metric(LIBRARIES_METRIC)[str(li)][TOTAL_READS_METRIC] for li,_ in enumerate(self.library_info)]

    def get_usable_read_pairs_per_library(self):
        """ Get usable read pairs per library.
        Returns:
          list of int: Order is by library index """
        return [self.get_metric(LIBRARIES_METRIC)[str(li)][USABLE_READS_METRIC] for li,_ in enumerate(self.library_info)]

    @staticmethod
    def _sum_metric(mol_h5_list, metric_name, metric_type):
        """ Combine a library- or gemgroup- level integer metric across multiple h5 files """
        assert metric_type is LIBRARIES_METRIC or \
            metric_type is GEM_GROUPS_METRIC
        combined = defaultdict(int)
        for mol_h5 in mol_h5_list:
            with MoleculeCounter.open(mol_h5, mode='r') as counter:
                for key, metrics in counter.get_metric(metric_type).iteritems():
                    combined[key] += metrics[metric_name]
        return combined

    @staticmethod
    def sum_library_metric(mol_h5_list, metric_name):
        return MoleculeCounter._sum_metric(mol_h5_list, metric_name, LIBRARIES_METRIC)

    @staticmethod
    def get_total_conf_mapped_reads_in_cells_chunk(filename, filtered_bcs_set, start, length, queue):
        # Worker for a producer/consumer setup: sums reads for cell-associated
        # barcodes over rows [start, start+length) and puts the total on queue.
        total_mapped_reads = 0
        with MoleculeCounter.open(filename, 'r', start, length) as mc:
            for barcode, gem_group, reads in itertools.izip(mc.get_column('barcode'),
                                                            mc.get_column('gem_group'),
                                                            mc.get_column('reads')):
                if reads < 1:
                    continue
                if (barcode, gem_group) not in filtered_bcs_set:
                    continue
                total_mapped_reads += reads
        queue.put(total_mapped_reads)

    @staticmethod
    def convert_v2_to_v3(v2_mole_info_h5, out_v3_mole_info_h5):
        """ Given the input v2 molecule info h5 file, convert it into v3 file.
        """
        def get_v2_metrics(h5_file):
            # v2 stored metrics as PyTables node attributes.
            group = tables.open_file(h5_file, 'r').get_node('/metrics')
            attrset = group._v_attrs
            return {k: attrset[k] for k in attrset._f_list()}

        def decompress_barcode_seq(x, barcode_length, bits=64):
            # Invert 2-bit packing; the high bit flags an all-N barcode.
            x = np.uint64(x)
            assert barcode_length <= (bits/2 - 1)
            if x & (1L << (bits-1)):
                return 'N' * barcode_length
            result = bytearray(barcode_length)
            for i in xrange(barcode_length):
                result[(barcode_length-1)-i] = tk_seq.NUCS[x & np.uint64(0b11)]
                x = x >> np.uint64(2)
            return str(result)

        def build_feature_ref(gene_ids, gene_names, genome_index):
            # Single-genome refs tag every gene with that genome; multi-genome
            # refs recover the genome from the gene id prefix ('<genome>_...').
            feature_defs = []
            if len(genome_index) == 1:
                genome = genome_index.keys()[0]
                for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names)):
                    feature_defs.append(FeatureDef(index=idx,
                                                   id=gene_id,
                                                   name=gene_name,
                                                   feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                                                   tags={'genome': genome}))
            else:
                for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names)):
                    genome = gene_id.split('_')[0]
                    feature_defs.append(FeatureDef(index=idx,
                                                   id=gene_id,
                                                   name=gene_name,
                                                   feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                                                   tags={'genome': genome}))
            return FeatureReference(feature_defs, ['genome'])

        def get_chunks_by_gem_group(gem_group_arr):
            """ Return exactly one chunk per gem group."""
            # verify gem groups are sorted
            assert np.all(np.diff(gem_group_arr)>=0)
            num_rows = gem_group_arr.shape[0]
            unique_ggs = np.unique(gem_group_arr)
            gg_key = lambda i: gem_group_arr[i]
            chunk_iter = MoleculeCounter.get_chunks_from_partition_static(num_rows, unique_ggs, gg_key)
            for (gg, chunk) in zip(unique_ggs, chunk_iter):
                yield (gg, chunk[0], chunk[1])

        # Deterministic conversion (cell calling below may sample).
        random.seed(0)
        np.random.seed(0)

        v2_mc_in = h5py.File(v2_mole_info_h5, 'r')
        v2_metrics = get_v2_metrics(v2_mole_info_h5)

        v2_genome_ids = v2_mc_in['genome_ids']
        v2_genome_name_to_index = {g:i for i, g in enumerate(v2_genome_ids)}

        # Feature Ref
        new_feature_ref = build_feature_ref(v2_mc_in['gene_ids'], v2_mc_in['gene_names'], v2_genome_name_to_index)

        # barcode whitelist
        barcode_length = v2_metrics[BC_LENGTH_METRIC]
        barcode_whitelist = cr_utils.load_barcode_whitelist(v2_metrics[BC_WHITELIST_METRIC])
        barcode_to_idx = OrderedDict((k, i) for i,k in enumerate(barcode_whitelist))
        gg_total_diversity = len(barcode_whitelist)

        # v2 column -> v3 column mapping (see inline notes).
        v2_genomes = np.asarray(v2_mc_in['genome'], dtype=np.uint8) # <-> genome information goes into feature_idx in v3
        v2_gene = np.asarray(v2_mc_in['gene'], dtype=MOLECULE_INFO_COLUMNS['feature_idx'])# <-> feature_idx in v3
        v2_conf_mapped_reads = np.asarray(v2_mc_in['reads'], dtype=MOLECULE_INFO_COLUMNS['count']) # <-> count in v3
        v2_barcodes = np.asarray(v2_mc_in['barcode'], dtype=np.uint64) # <-> transit into barcode_idx in v3
        v2_umis = np.asarray(v2_mc_in['umi'], dtype=MOLECULE_INFO_COLUMNS['umi']) # <-> umi in v3
        v2_gem_groups = np.asarray(v2_mc_in['gem_group'], dtype=MOLECULE_INFO_COLUMNS['gem_group']) # <-> gem_group in v3

        library_info = []
        barcode_info_genomes, barcode_info_pass_filter = [], []

        barcode_idx_list, feature_idx_list, library_idx_list = [], [], []
        gem_group_list, count_list, umi_list = [], [], []

        v2_metrics[LIBRARIES_METRIC] = {}

        # each gem_group is a library
        for lib_idx, (gem_group, chunk_start, chunk_len) in enumerate(get_chunks_by_gem_group(v2_gem_groups)):
            library_info.append({
                'gem_group': int(gem_group),
                'library_id': str(lib_idx),
                'library_type': lib_constants.GENE_EXPRESSION_LIBRARY_TYPE
            })

            # per library, raw_read_pairs and usable_read_pairs info
            v2_metrics[LIBRARIES_METRIC][str(lib_idx)] = {
                USABLE_READS_METRIC : v2_metrics[GEM_GROUPS_METRIC][gem_group]['conf_mapped_filtered_bc_reads'],
                TOTAL_READS_METRIC : v2_metrics[GEM_GROUPS_METRIC][gem_group]['total_reads']
            }

            recovered_cells = v2_metrics[GEM_GROUPS_METRIC][gem_group].get(GG_RECOVERED_CELLS_METRIC, None)
            force_cells = v2_metrics[GEM_GROUPS_METRIC][gem_group].get(GG_FORCE_CELLS_METRIC, None)

            chunk_end = chunk_start + chunk_len
            genomes_for_gem_group = v2_genomes[chunk_start:chunk_end]
            bcs_for_gem_group = v2_barcodes[chunk_start:chunk_end]
            reads_for_gem_group = v2_conf_mapped_reads[chunk_start:chunk_end]
            gene_for_gem_group = v2_gene[chunk_start:chunk_end]
            umis_for_gem_group = v2_umis[chunk_start:chunk_end]

            for genome_id in v2_genome_ids:
                g_idx = v2_genome_name_to_index[genome_id]
                genome_indices = genomes_for_gem_group == g_idx

                if genome_indices.sum() == 0:
                    # edge case - there's no data for this genome (e.g. empty sample, false barnyard sample, or nothing confidently mapped)
                    continue

                bcs_for_genome = bcs_for_gem_group[genome_indices]
                reads_for_genome = reads_for_gem_group[genome_indices]
                gene_for_genome = gene_for_gem_group[genome_indices]
                umis_for_genome = umis_for_gem_group[genome_indices]

                # only count UMIs with at least one conf mapped read
                umi_conf_mapped_to_genome = reads_for_genome > 0
                # Find runs of equal barcodes (assumes barcode-sorted input)
                # and count qualifying UMIs per barcode via reduceat.
                bc_breaks = bcs_for_genome[1:] - bcs_for_genome[:-1]
                bc_breaks = np.concatenate(([1], bc_breaks)) # first row is always a break
                bc_break_indices = np.nonzero(bc_breaks)[0]
                unique_bcs = bcs_for_genome[bc_break_indices]
                umis_per_bc = np.add.reduceat(umi_conf_mapped_to_genome, bc_break_indices)

                if force_cells is not None:
                    top_bc_indices, _, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(umis_per_bc, force_cells)
                else:
                    top_bc_indices, _, _ = cr_stats.filter_cellular_barcodes_ordmag(umis_per_bc, recovered_cells, gg_total_diversity)

                # barcode info
                barcode_seq_to_idx = {b:barcode_to_idx[decompress_barcode_seq(b, barcode_length)] for b in unique_bcs}
                barcode_info_genomes.append(genome_id)
                for b in unique_bcs[top_bc_indices]:
                    barcode_info_pass_filter.append((barcode_seq_to_idx[b], lib_idx, g_idx))

                # data
                barcode_idx_list.append(np.vectorize(barcode_seq_to_idx.get)(bcs_for_genome))
                count_list.append(reads_for_genome)
                gem_group_list.append(np.full(reads_for_genome.shape[0], gem_group, dtype=MOLECULE_INFO_COLUMNS['gem_group']))
                library_idx_list.append(np.full(reads_for_genome.shape[0], lib_idx, dtype=MOLECULE_INFO_COLUMNS['library_idx']))
                feature_idx_list.append(gene_for_genome)
                umi_list.append(umis_for_genome)

        new_barcode_info = BarcodeInfo(
            pass_filter=np.array(barcode_info_pass_filter, dtype=BARCODE_INFO_DTYPES['pass_filter']),
            genomes=barcode_info_genomes,
        )

        with MoleculeCounter.open(out_v3_mole_info_h5, 'w',
                                  feature_ref=new_feature_ref,
                                  barcodes=barcode_whitelist,
                                  library_info=library_info,
                                  barcode_info=new_barcode_info,
                                  ) as out_mc:
            out_mc.append_column('barcode_idx', np.concatenate(barcode_idx_list))
            out_mc.append_column('count', np.concatenate(count_list))
            out_mc.append_column('feature_idx', np.concatenate(feature_idx_list))
            out_mc.append_column('gem_group', np.concatenate(gem_group_list))
            out_mc.append_column('umi', np.concatenate(umi_list))
            # library_idx is the same as gem_group_list
            out_mc.append_column('library_idx', np.concatenate(library_idx_list))

            out_mc.set_all_metrics(v2_metrics)

        return
def join(args, outs, chunk_defs, chunk_outs):
    """Decide whether barcodes from different library types are compatible.

    Aggregates barcodes sampled by the chunk stages, compares each non-GEX
    library's barcode count distribution against the GEX library's (with and
    without whitelist translation) using cosine similarity, and exits the
    pipeline when any pair falls below the cutoff.
    """
    outs.barcode_compatible = True
    outs.barcode_compatibility_info = {} # record sampled barcode info
    outs.skip_translate = {}

    if chunk_outs is None or len(chunk_outs) == 0:
        return

    # aggregate barcodes from chunks, {gem_group : {library_type : [barcodes]} }
    sampled_barcodes = defaultdict(lambda: defaultdict(list))
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        gem_group, lib = chunk_def.gem_group, chunk_def.library_type
        sampled_barcodes[gem_group][lib].extend(chunk_out.sampled_barcodes)

    barcodes_in_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist, as_set=True)
    barcode_translate_map = cr_utils.load_barcode_translate_map(args.barcode_whitelist)

    # {gem_group: {library_type: {barcode: count}}}, whitelist barcodes only
    sampled_bc_counter_in_wl = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for gem_group in sampled_barcodes:
        outs.barcode_compatibility_info[gem_group] = {}
        for lib in sampled_barcodes[gem_group]:
            sampled_bc = sampled_barcodes[gem_group][lib]
            unique_bc = set(sampled_bc)
            unique_bc_in_wl = unique_bc.intersection(barcodes_in_whitelist)

            outs.barcode_compatibility_info[gem_group][lib] = {}
            outs.barcode_compatibility_info[gem_group][lib]['num_barcodes_sampled'] = len(sampled_bc)
            outs.barcode_compatibility_info[gem_group][lib]['num_barcodes_sampled_unique'] = len(unique_bc)
            outs.barcode_compatibility_info[gem_group][lib]['num_barcodes_sampled_unique_in_whitelist'] = len(unique_bc_in_wl)

            sampled_bc_counter_in_wl[gem_group][lib] = {
                k: v for (k, v) in Counter(sampled_bc).iteritems() if k in unique_bc_in_wl
            }

    barcode_compatibility_cutoff = cr_constants.BARCODE_COMPATIBILITY_CUTOFF if args.barcode_compatibility_cutoff is None else args.barcode_compatibility_cutoff

    pairwise_compatibility = {}
    exit_log_msg = "Barcodes from libraries are not compatible."
    for gem_group in sampled_barcodes:
        outs.skip_translate[gem_group] = {}
        pairwise_compatibility[gem_group] = {}
        library_types = sampled_barcodes[gem_group].keys()
        # Nothing to compare with fewer than two library types.
        if len(library_types) < 2:
            continue

        if GENE_EXPRESSION_LIBRARY_TYPE in library_types:
            # GEX is the reference library every other type is compared to.
            base_lib = GENE_EXPRESSION_LIBRARY_TYPE
            library_types.remove(base_lib)
            outs.skip_translate[gem_group][base_lib] = True
        else:
            # TODO: as for CR3.0, we need GEX for cell calling et al
            # at some point, we might support samples without GEX
            martian.exit("Gene expression data not found in the GEM group {}.".format(gem_group))

        base_lib_counter = sampled_bc_counter_in_wl[gem_group][base_lib]
        for lib in library_types:
            pair_key = '{}/{}'.format(base_lib, lib)
            pairwise_compatibility[gem_group][pair_key] = {}
            lib_counter = sampled_bc_counter_in_wl[gem_group][lib]

            # without translate
            overlap_size = len(set(base_lib_counter).intersection(set(lib_counter)))
            cosine_sim = robust_cosine_similarity(base_lib_counter, lib_counter)
            outs.skip_translate[gem_group][lib] = True

            # with translate: keep whichever direction scores higher
            if (lib != GENE_EXPRESSION_LIBRARY_TYPE) and (barcode_translate_map is not None):
                translated_counter = {barcode_translate_map.get(k, k): v for (k, v) in lib_counter.iteritems()}
                overlap_size_translated = len(set(base_lib_counter).intersection(set(translated_counter)))
                cosine_sim_translated = robust_cosine_similarity(base_lib_counter, translated_counter)
                if cosine_sim_translated > cosine_sim:
                    outs.skip_translate[gem_group][lib] = False
                    overlap_size = overlap_size_translated
                    cosine_sim = cosine_sim_translated

            pairwise_compatibility[gem_group][pair_key]['overlap_size'] = overlap_size
            pairwise_compatibility[gem_group][pair_key]['cosine_similarity'] = cosine_sim

            if cosine_sim < barcode_compatibility_cutoff:
                outs.barcode_compatible = False
                exit_log_msg += '\n - GEM group {}: Barcodes from [{}] and [{}] have cosine similarity {:.4f}'.format(gem_group, base_lib, lib, cosine_sim)

    outs.barcode_compatibility_info['pairwise_compatibility'] = pairwise_compatibility

    # format warning/error message if incompatible
    if outs.barcode_compatible is False:
        martian.log_info(exit_log_msg)
        martian.exit(exit_log_msg)

    return
def main(args, outs):
    """Correct raw VDJ barcodes against the whitelist.

    Reads barcode tags from the read-1 FASTQ headers, error-corrects barcodes
    not on the whitelist, and writes one (possibly empty) corrected barcode
    per read pair to outs.corrected_bcs. Also accumulates a corrected-barcode
    count distribution and per-barcode reporter metrics.
    """
    # BUG FIX: barcode_whitelist was previously assigned only inside an
    # `if args.barcode_whitelist is not None:` guard but then used
    # unconditionally below, raising NameError on the no-whitelist path.
    # load_barcode_whitelist is called with a possibly-None argument
    # elsewhere in this codebase, so call it unconditionally here.
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution (used for error correction)
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    barcode_whitelist_set = set(barcode_whitelist) if barcode_whitelist is not None else None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    # read2 may be absent; an empty list yields nothing from the generator.
    in_read2_fastq = cr_io.open_maybe_gzip(args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(tk_fasta.read_generator_fastq(in_read1_fastq),
                                            tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)
        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                # Off-whitelist barcode: attempt quality-weighted correction.
                processed_bc = cr_stats.correct_bc_error(args.barcode_confidence_threshold,
                                                         raw_bc, bc_qual, barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(processed_bc,
                                                           gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        # One output line per read pair; blank when no barcode survived.
        out_file.write('%s\n' % (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Correct VDJ barcodes and rewrite the read1/read2 FASTQ pair.

    Like the corrected-bcs variant of this stage, but writes full corrected
    FASTQ files with a processed-barcode tag stamped into both read headers.
    """
    # Load barcode whitelist
    # NOTE(review): barcode_whitelist is only bound when args.barcode_whitelist
    # is not None, yet it is used unconditionally below — NameError hazard on
    # the no-whitelist path. Confirm whether this stage can ever run without
    # a whitelist before changing.
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(tk_fasta.read_generator_fastq(in_read1_fastq), \
                                    tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        # NOTE(review): unlike the sibling stage, processed_bc is not
        # initialized to None before this branch — verify it cannot be read
        # unbound when raw_bc is empty.
        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(args.barcode_confidence_threshold,
                                                         raw_bc, bc_qual, barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(processed_bc,
                                                           gem_group=args.gem_group)
                # Stamp the corrected barcode into both mates' headers.
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG, processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG, processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Build a molecule info (v3) HDF5 file from a barcode-sorted BAM chunk.

    Groups BAM records by (gem_group, barcode), deduplicates reads into
    molecules keyed by (umi, library, feature), and appends per-molecule
    columns to a MoleculeCounter, sorted by feature within each barcode.
    """
    outs.coerce_strings()

    # Load whitelist
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict((k, i) for i, k in enumerate(whitelist))

    # Load feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(args.reference_path,
                                                             args.feature_reference)

    # Load library info from BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Get cell-associated barcodes by genome
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(args.filtered_barcodes)

    # Create the barcode info
    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Create the molecule info file
    mc = MoleculeCounter.open(outs.output, mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Initialize per-library metrics
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {}
        lib_metrics[str(lib_idx)][cr_mol_counter.USABLE_READS_METRIC] = 0

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.
    prev_gem_group = None
    prev_barcode_idx = None

    for (gem_group, barcode_seq), reads_iter in \
        itertools.groupby(in_bam, key=cr_utils.barcode_sort_key_no_umi):
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        is_cell_barcode = cr_utils.format_barcode_seq(barcode_seq, gem_group) in filtered_bc_union

        # (umi, library_idx, feature_idx) -> read count for this barcode
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or \
               read.is_read2 or \
               cr_utils.is_read_low_support_umi(read) or \
               not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            # Pack the UMI into the full bit width of the umi column dtype.
            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq, MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            # Confidently-mapped reads carry exactly one feature id.
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][cr_mol_counter.USABLE_READS_METRIC] += 1

        prev_gem_group = gem_group
        prev_barcode_idx = barcode_idx

        # Record data for this barcode
        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, len(counts)))
        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, len(counts)))

        feature_ints = np.fromiter((k[2] for k in counts.iterkeys()),
                                   dtype=MoleculeCounter.get_column_dtype('feature_idx'),
                                   count=len(counts))
        # Sort by feature for fast matrix construction
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        # The remaining columns iterate the same dict, so key order matches
        # the feature_ints iteration above and `order` applies to all of them.
        li_ints = np.fromiter((k[1] for k in counts.iterkeys()),
                              dtype=MoleculeCounter.get_column_dtype('library_idx'),
                              count=len(counts))[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = np.fromiter((k[0] for k in counts.iterkeys()),
                               dtype=MoleculeCounter.get_column_dtype('umi'),
                               count=len(counts))[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = np.fromiter(counts.itervalues(),
                                 dtype=MoleculeCounter.get_column_dtype('count'),
                                 count=len(counts))[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
def main(args, outs):
    """Extract barcode/UMI/sample-index sequences from raw FASTQs.

    Uses the chemistry definition to locate each sub-sequence, optionally
    reverse-complements barcodes, stamps all tags into augmented FASTQ
    headers, and writes chunked read1 (and read2, if paired-end) FASTQs.
    """
    # Deterministic downsampling.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
                 ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter,
                                     cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        # Chemistry has no UMIs: use a null reader so izip_longest pads
        # with None and UMI fields stay empty.
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                # Sequence is rev-comped; qualities are reversed to match.
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read,
                              args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            # Mirror the same tags onto read 2's header.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def _get_barcode_whitelist_set(chemistry):
    """Load the barcode whitelist for this chemistry and return it as a set."""
    whitelist_seqs = cr_utils.load_barcode_whitelist(get_barcode_whitelist(chemistry))
    return set(whitelist_seqs)
def main(args, outs):
    """Call VDJ cell barcodes per gem group and write barcode-level outputs.

    Reads per-gem-group UMI summaries, calls cell barcodes, records thresholds
    into the reporter, and writes the cell barcode JSON, per-barcode support
    CSV, barcode UMI summary, and the metrics summary JSON.
    """
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(all_gem_groups)

    # BUGFIX: barcodes/counts were previously assigned only inside the loop
    # below; with no whitelist (immediate break) or no gem groups they were
    # unbound when passed to vdj_filter_barcodes_cb, raising NameError.
    # Note: as before, only the LAST gem group's distribution is passed on.
    barcodes = np.array([])
    counts = np.array([])

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Take the top force_cells barcodes by support
                sorted_bcs = [kv[0] for kv in
                              sorted(gg_bc_support.items(),
                                     key=lambda kv: kv[1], reverse=True)]
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(
        args.extract_reads_summary, 'total_read_pairs')

    # Load the assembly metrics summary to get the total assemblable reads
    assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
        args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
    assemblable_read_pairs = sum(
        assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, assemblable_read_pairs,
                                    recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi,
                              cell_barcodes)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    """Extract barcode/UMI/sample-index reads, tag R1/R2 fastq headers, and
    write trimmed sequence to an unaligned BAM.

    Reads interleaved/paired fastq chunks per the chemistry definition,
    optionally downsamples, reverse-complements barcodes when needed, tracks
    the raw barcode count distribution, and emits chunked fastq outputs plus
    stage metadata on `outs`.
    """
    # Deterministic seed so downsampling is reproducible across runs
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    # Parallel to read_defs: (seq_tag, qual_tag) pairs; None for RNA reads,
    # which carry no raw-sequence tags of their own.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    # Uses a throwaway reader over the barcode reads to sample sequences.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # Chemistry has no UMIs: use an empty reader so the zip below still works
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # izip_longest: readers may be exhausted at different points; missing
    # entries come through as None and are mapped to EMPTY_READ below.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        # Each read is a (header, seq, qual) tuple
        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(
            rna_read,
            rna_read2,
            bc_read,
            si_read,
            umi_read,
            args.gem_group,
            skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        trimmed_seq_data.flag = 4  # unmapped
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])
            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    # NOTE(review): rna_read2s is always constructed above but only closed
    # when paired_end — looks like a possible leak in the single-end path;
    # confirm whether FastqReader(None-def) holds resources.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    trimmed_seq_writer.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        # No reads survived extraction/downsampling
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
def _get_barcode_whitelist_set(chemistry):
    """Return the whitelist barcodes for this chemistry, loaded directly as a set."""
    whitelist_name = get_barcode_whitelist(chemistry)
    return cr_utils.load_barcode_whitelist(whitelist_name, as_set=True)