def write_filtered_barcodes(out_csv, gem_group, mol_counter, bcs_per_genome):
    """Write one ``genome,barcode`` CSV row per filtered barcode.

    Args:
        out_csv (str): Path of the CSV file to create.
        gem_group: Gem group passed to ``cr_utils.format_barcode_seq`` for
            every barcode written.
        mol_counter: Object whose ``decompress_barcode_seq`` turns a barcode
            id into its sequence.
        bcs_per_genome (dict): Maps a genome to an iterable of barcode ids.
    """
    # 'wb' is the mode the Python 2 csv module expects for output files.
    with open(out_csv, 'wb') as handle:
        emit_row = csv.writer(handle).writerow
        for genome, bc_ids in bcs_per_genome.iteritems():
            for bc_id in bc_ids:
                raw_seq = mol_counter.decompress_barcode_seq(bc_id)
                emit_row([genome,
                          cr_utils.format_barcode_seq(raw_seq, gem_group)])
def get_filtered_barcodes(barcode_info, library_info, barcodes,
                          genome_idx=None, library_type=None):
    """Get a list of filtered barcode strings e.g. ['ACGT-1',...]

    Without restrictions, passing filter in a single library or genome is
    sufficient for a barcode to be passing filter overall.

    Args:
        barcode_info (BarcodeInfo): Barcode info object; its ``pass_filter``
            array is read as columns (barcode_idx, library_idx, genome_idx).
        library_info (list of dict): Library info; the 'gem_group' and
            'library_type' keys are used.
        barcodes (np.array): Barcode sequences, indexed by barcode_idx.
        genome_idx (int): Restrict passing definition to this genome.
            None for no restriction.
        library_type (str): Restrict passing definition to this library type.
            None for no restriction.

    Returns:
        list of str: Unique formatted barcodes ordered by
        (gem_group, barcode_idx).
    """
    pass_filter = barcode_info.pass_filter
    bc_col = pass_filter[:, 0]
    lib_col = pass_filter[:, 1]
    genome_col = pass_filter[:, 2]

    # Start with every pass-filter row selected, then narrow down.
    keep = np.ones(pass_filter.shape[0], dtype=bool)
    if genome_idx is not None:
        keep &= genome_col == genome_idx
    if library_type is not None:
        wanted_libs = np.array(
            [i for i, lib in enumerate(library_info)
             if lib['library_type'] == library_type],
            dtype=MOLECULE_INFO_COLUMNS['library_idx'])
        keep &= np.isin(lib_col, wanted_libs)
    rows = np.flatnonzero(keep)

    # Translate each selected row's library index into its gem group.
    gem_group_of_lib = np.array([lib['gem_group'] for lib in library_info],
                                dtype='uint64')
    row_gem_groups = gem_group_of_lib[lib_col[rows]]

    # np.unique over rows both de-duplicates and sorts the pairs.
    gg_bcs = np.unique(np.column_stack((row_gem_groups, bc_col[rows])),
                       axis=0)

    return [cr_utils.format_barcode_seq(barcodes[bc], gg)
            for gg, bc in gg_bcs]
def get_molecule_iter(self, barcode_length, subsample_rate=1.0):
    """Return an iterator on Molecule tuples.

    Args:
        barcode_length: Forwarded to ``decompress_barcode_seq`` when a
            compressed barcode has to be expanded.
        subsample_rate (float): Value in [0, 1]. When below 1.0, every
            molecule's read count is replaced by a binomial draw with this
            success probability.

    Yields:
        Molecule: One tuple per molecule whose (possibly subsampled) read
        count is non-zero, carrying the formatted barcode, the genome id,
        the gene id and the read count.
    """
    assert subsample_rate >= 0 and subsample_rate <= 1.0

    # Cache of the last decompressed barcode so that consecutive rows sharing
    # the same (barcode, gem_group) are only decompressed once.
    prev_compressed_bc = None
    prev_gem_group = None
    prev_bc = None

    # Load the molecule data
    mol_barcodes = self.get_column('barcode')
    mol_gem_groups = self.get_column('gem_group')
    mol_genome_ints = self.get_column('genome')
    mol_gene_ints = self.get_column('gene')
    mol_reads = self.get_column('reads')

    gene_ids = self.get_ref_column('gene_ids')
    genome_ids = self.get_ref_column('genome_ids')

    if subsample_rate < 1.0:
        mol_reads = np.random.binomial(mol_reads, subsample_rate)

    for compressed_bc, gem_group, genome_int, gene_int, reads in itertools.izip(
            mol_barcodes, mol_gem_groups, mol_genome_ints, mol_gene_ints,
            mol_reads):
        if reads == 0:
            continue

        # Decompress the cell barcode if necessary
        if compressed_bc == prev_compressed_bc and gem_group == prev_gem_group:
            bc = prev_bc
        else:
            bc = cr_utils.format_barcode_seq(
                self.decompress_barcode_seq(compressed_bc,
                                            barcode_length=barcode_length),
                gem_group)
            # Refresh the cache; without this the shortcut above never hits.
            prev_compressed_bc = compressed_bc
            prev_gem_group = gem_group
            prev_bc = bc

        yield Molecule(barcode=bc,
                       genome=genome_ids[genome_int],
                       gene_id=gene_ids[gene_int],
                       reads=reads)
def main(args, outs):
    """Count reads per molecule in a BAM chunk and write a molecule info file.

    Reads are grouped by (gem_group, barcode). Within each group, reads are
    tallied per (UMI, library index, feature index), and the tallies are
    appended to the molecule info columns sorted by feature index. A
    per-library count of usable reads that fall in cell-associated barcodes
    is stored as a metric.
    """
    outs.coerce_strings()

    # Barcode whitelist and the position of each sequence in it
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict(
        (seq, idx) for idx, seq in enumerate(whitelist))

    # Feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Library info comes from the BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Cell-associated barcodes, per genome and as a single set
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    barcode_info = MoleculeCounter.build_barcode_info(
        filtered_bcs_by_genome, library_info, whitelist)

    # Output molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Per-library metrics, keyed by the library index as a string
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {cr_mol_counter.USABLE_READS_METRIC: 0}

    def column_array(values, column_name, n_rows):
        # Pack `values` into a 1-D array typed like the named column.
        return np.fromiter(values,
                           dtype=MoleculeCounter.get_column_dtype(column_name),
                           count=n_rows)

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.
    prev_gem_group = None
    prev_barcode_idx = None

    grouped_reads = itertools.groupby(in_bam,
                                      key=cr_utils.barcode_sort_key_no_umi)
    for (gem_group, barcode_seq), reads_iter in grouped_reads:
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        formatted_bc = cr_utils.format_barcode_seq(barcode_seq, gem_group)
        is_cell_barcode = formatted_bc in filtered_bc_union

        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation
            # should be equivalent.
            if read.is_secondary or read.is_read2:
                continue
            if cr_utils.is_read_low_support_umi(read):
                continue
            if not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq,
                MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][
                    cr_mol_counter.USABLE_READS_METRIC] += 1

        prev_gem_group, prev_barcode_idx = gem_group, barcode_idx

        # Record data for this barcode
        n_mols = len(counts)

        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, n_mols))

        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, n_mols))

        # Sort by feature for fast matrix construction. The same permutation
        # is applied to every column built from `counts` below.
        feature_ints = column_array((key[2] for key in counts.iterkeys()),
                                    'feature_idx', n_mols)
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        li_ints = column_array((key[1] for key in counts.iterkeys()),
                               'library_idx', n_mols)[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = column_array((key[0] for key in counts.iterkeys()),
                                'umi', n_mols)[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = column_array(counts.itervalues(), 'count', n_mols)[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
def main(args, outs):
    """Call cell barcodes per gem group and write the filtering outputs.

    For every gem group the cell-calling thresholds are recorded on the
    reporter, the called barcodes are added to the overall set, and barcode
    support is summed. The cell barcode JSON, the barcode support CSV, the
    barcode/UMI summary and the summary JSON are then written.
    """
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    recovered_cells = args.recovered_cells
    if not recovered_cells:
        recovered_cells = (cr_constants.DEFAULT_TOP_BARCODE_CUTOFF *
                           len(all_gem_groups))

    # Without a whitelist no gem group is processed at all.
    gem_groups_to_process = [] if barcode_whitelist is None else all_gem_groups

    for gem_group in gem_groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        (gg_bc_support, gg_cell_bcs,
         rpu_threshold, umi_threshold, confidence) = call_cell_barcodes(
             args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        threshold_metrics = (
            ('vdj_filter_bcs_rpu_threshold', rpu_threshold),
            ('vdj_filter_bcs_umi_threshold', umi_threshold),
            ('vdj_filter_bcs_confidence', confidence),
        )
        for metric_name, metric_value in threshold_metrics:
            reporter._get_metric_attr(metric_name,
                                      gem_group).set_value(metric_value)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Take the best-supported barcodes instead of the called ones.
                by_support = sorted(gg_bc_support.items(),
                                    key=lambda kv: kv[1],
                                    reverse=True)
                sorted_bcs = [bc for bc, _ in by_support]
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs),
                                              args.force_cells)]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, support in gg_bc_support.iteritems():
                bc_support[bc] += support

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(
        args.extract_reads_summary, 'total_read_pairs')

    # NOTE(review): `barcodes` and `counts` are whatever the last loop
    # iteration produced, and are unbound if the loop body never ran
    # (no whitelist / no gem groups) - confirm this is intended.
    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as support_file:
        support_file.write('barcode,count\n')
        for bc, support in bc_support.iteritems():
            support_file.write('%s,%d\n' % (bc, support))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    """Call cell barcodes per gem group using a read-pair threshold.

    For every gem group the threshold returned by ``call_cell_barcodes`` is
    recorded on the reporter, and the called barcodes and their support are
    merged into the overall results. The cell barcode JSON, the barcode
    support CSV, the barcode/UMI summary and the summary JSON are then
    written.
    """
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    recovered_cells = args.recovered_cells
    if not recovered_cells:
        recovered_cells = (cr_constants.DEFAULT_TOP_BARCODE_CUTOFF *
                           len(all_gem_groups))

    # Without a whitelist no gem group is processed at all.
    gem_groups_to_process = [] if barcode_whitelist is None else all_gem_groups

    for gem_group in gem_groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        threshold_attr = reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold', gem_group)
        threshold_attr.set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Take the best-supported barcodes instead of the called ones.
                by_support = sorted(gg_bc_support.items(),
                                    key=lambda kv: kv[1],
                                    reverse=True)
                sorted_bcs = [bc for bc, _ in by_support]
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs),
                                              args.force_cells)]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(
        args.extract_reads_summary, 'total_read_pairs')

    # Load the assembly metrics summary to get the total assemblable reads
    assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
        args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
    assemblable_read_pairs = sum(
        assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

    # NOTE(review): `barcodes` and `counts` are whatever the last loop
    # iteration produced, and are unbound if the loop body never ran
    # (no whitelist / no gem groups) - confirm this is intended.
    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, assemblable_read_pairs,
                                    recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as support_file:
        support_file.write('barcode,count\n')
        for bc, support in bc_support.iteritems():
            support_file.write('%s,%d\n' % (bc, support))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
def make_barcode_tags(qname, reporter, args):
    """Build barcode, UMI and sample-index tags from an augmented read name.

    Args:
        qname (str): Read name parsed with ``AugmentedFastqHeader``.
        reporter: Provides ``barcode_whitelist``, ``barcode_dist`` and the
            ``raw_barcode_cb`` / ``raw_umi_cb`` callbacks.
        args: Provides ``gem_group``, ``correct_barcodes`` and
            ``barcode_confidence_threshold``.

    Returns:
        tuple: ``(stripped_qname, tags, barcode_info, umi_info)`` where
        ``tags`` is a list of (tag, value) pairs and the two info entries are
        ``cr_constants.ProcessedRead`` objects, or None when the respective
        raw sequence is empty.
    """
    gem_group = args.gem_group
    correct_barcodes = args.correct_barcodes
    barcode_confidence_threshold = args.barcode_confidence_threshold
    barcode_whitelist = reporter.barcode_whitelist
    barcode_dist = reporter.barcode_dist

    def with_gem_group(seq):
        # Attach the gem group to a barcode, passing None through untouched.
        if seq is None:
            return None
        return cr_utils.format_barcode_seq(seq, gem_group=gem_group)

    tags = []
    fastq_header = AugmentedFastqHeader(qname)

    # Barcode tags
    raw_bc_seq = fastq_header.get_tag(cr_constants.RAW_BARCODE_TAG)
    bc_qual = fastq_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)
    barcode_info = None

    if len(raw_bc_seq) > 0:
        processed_bc_seq = with_gem_group(
            reporter.raw_barcode_cb(raw_bc_seq, bc_qual))

        if processed_bc_seq is None and barcode_whitelist is not None:
            if correct_barcodes:
                # Try to correct the barcode
                processed_bc_seq = with_gem_group(
                    cr_stats.correct_bc_error(barcode_confidence_threshold,
                                              raw_bc_seq, bc_qual,
                                              barcode_dist))
            else:
                # If the barcode was already corrected, take that
                # (gem group is included)
                processed_bc_seq = fastq_header.get_tag(
                    cr_constants.PROCESSED_BARCODE_TAG)

        tags.extend([
            (cr_constants.RAW_BARCODE_TAG, raw_bc_seq),
            (cr_constants.RAW_BARCODE_QUAL_TAG, bc_qual),
        ])
        if processed_bc_seq is not None:
            tags.append((cr_constants.PROCESSED_BARCODE_TAG,
                         processed_bc_seq))
        barcode_info = cr_constants.ProcessedRead(raw_bc_seq,
                                                  processed_bc_seq, bc_qual)

    # UMI tags
    raw_umi_seq = fastq_header.get_tag(cr_constants.RAW_UMI_TAG)
    umi_qual = fastq_header.get_tag(cr_constants.UMI_QUAL_TAG)
    umi_info = None

    if len(raw_umi_seq) > 0:
        processed_umi_seq = reporter.raw_umi_cb(raw_umi_seq, umi_qual)
        tags.extend([
            (cr_constants.RAW_UMI_TAG, raw_umi_seq),
            (cr_constants.UMI_QUAL_TAG, umi_qual),
        ])
        if processed_umi_seq is not None:
            tags.append((cr_constants.PROCESSED_UMI_TAG, processed_umi_seq))
        umi_info = cr_constants.ProcessedRead(raw_umi_seq, processed_umi_seq,
                                              umi_qual)

    # Sample index tags
    si_seq = fastq_header.get_tag(tk_constants.SAMPLE_INDEX_TAG)
    si_qual = fastq_header.get_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG)

    if len(si_seq) > 0:
        tags.extend([
            (tk_constants.SAMPLE_INDEX_TAG, si_seq),
            (tk_constants.SAMPLE_INDEX_QUAL_TAG, si_qual),
        ])

    stripped_qname = fastq_header.fastq_header

    return stripped_qname, tags, barcode_info, umi_info
def main(args, outs):
    """Downsample one chunk of a molecule info file and build its matrices.

    Read pairs of every molecule in the chunk are binomially subsampled using
    the per-library rates in ``args.frac_reads_kept``. Molecules that keep at
    least one read pair contribute to a raw UMI matrix and a read matrix.
    The raw and filtered UMI matrices are saved as H5, and a JSON chunk
    summary holding the read summary and updated molecule metrics is written.
    """
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx in range(len(library_info)):
            lib_metrics = metrics_out[cr_mol_counter.LIBRARIES_METRIC][
                str(lib_idx)]
            lib_metrics[cr_mol_counter.DOWNSAMPLED_READS_METRIC] = \
                total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        # Only molecules that kept at least one read pair are retained.
        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        gem_group_steps = np.diff(np.array(gem_groups, dtype=int))
        assert min(gem_groups) == 1 and np.all(gem_group_steps == 1)

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        # Slot 0 stays zero because gem groups are 1-based.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        ranges_by_gem_group = sorted(
            args.gem_group_barcode_ranges.iteritems(),
            key=lambda kv: int(kv[0]))
        for gg_str, idx_range in ranges_by_gem_group:
            gg = int(gg_str)
            range_start, range_end = idx_range[0], idx_range[1]
            gg_barcode_idx_start[gg] = range_start
            gg_barcode_idx_len[gg] = range_end - range_start

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print('downsampled')
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i
        #  for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's
        # gem group
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        matrix_shape = (num_features, num_bcs)

        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)), shape=matrix_shape)
        print('created umi matrix')
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=matrix_shape)
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print('%d %d' % (len(barcode_seqs), len(gem_groups)))

        print('creating barcode strings')
        LogPerf.mem()

        per_gg_barcodes = []
        for gg in gem_groups:
            first = gg_barcode_idx_start[gg]
            last = first + gg_barcode_idx_len[gg]
            per_gg_barcodes.append(np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[first:last]
            ]))
        barcodes = np.concatenate(per_gg_barcodes)
        barcodes.flags.writeable = False

        print('created barcode strings')
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print('created read matrix')
        LogPerf.mem()

        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print('created filtered umi matrix')
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as summary_file:
            json.dump(tk_safe_json.json_sanitize(summary),
                      summary_file,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
def main(args, outs):
    """Tally per-molecule read statistics from a BAM chunk.

    Reads are grouped by (gem_group, barcode, gene_ids). Within each group,
    every primary read1 with a UMI is assigned to a (UMI, gene) molecule and
    counted as confidently mapped, unmapped or non-confidently mapped, along
    with UMI-corrected, barcode-corrected and unique-position tallies.
    One row per molecule is added to the molecule counter, and the number of
    confidently mapped reads in cell barcodes is stored per gem group.
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    # Position of each data column inside a molecule's tally vector
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = dict(
        (key, idx) for idx, key in enumerate(mol_data_keys))

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # Gene index used for reads not confidently assigned to a gene
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the cell-associated barcodes over all genomes
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, genome_bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(genome_bcs)

    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    grouped_reads = itertools.groupby(in_bam, key=cr_utils.barcode_sort_key)
    for (gem_group, barcode, gene_ids), reads_iter in grouped_reads:
        if barcode is None or gem_group is None:
            continue

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs

        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = \
            cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)

        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)

            # ignore read2 to avoid double-counting. the mapping + annotation
            # should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                map_type = 'reads'

                # A position counts as unique the first time it is seen for
                # this molecule.
                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key = (umi, none_gene_id)
                map_type = 'unmapped_reads'
                uniq_read_pos = False

            else:
                mol_key = (umi, none_gene_id)
                map_type = 'nonconf_mapped_reads'
                uniq_read_pos = False

            tally = molecules[mol_key]
            tally[mol_data_columns[map_type]] += 1
            tally[mol_data_columns['umi_corrected_reads']] += int(
                not raw_umi == umi)
            tally[mol_data_columns['barcode_corrected_reads']] += int(
                not raw_bc == proc_bc)
            tally[mol_data_columns['conf_mapped_uniq_read_pos']] += int(
                uniq_read_pos)

        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)

            data_values = dict(
                (key, molecule[col_idx])
                for key, col_idx in mol_data_columns.iteritems())
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **data_values)

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()
def main(args, outs):
    """Correct the barcode of each read and write one barcode per line.

    For the first ``args.initial_reads`` read pairs of the chunk, the raw
    barcode is taken from the read1 header. Barcodes on the whitelist (or,
    with no whitelist, barcodes without 'N') are accepted as-is; others are
    passed to ``cr_stats.correct_bc_error``. Accepted barcodes are counted,
    tagged with the gem group and written to ``outs.corrected_bcs``; reads
    without a usable barcode produce an empty line.
    """
    # Load barcode whitelist. It stays None when no whitelist is configured,
    # so the no-whitelist path below does not hit an unbound name.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    try:
        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip_longest(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            processed_bc = None

            if raw_bc:
                if (barcode_whitelist_set is not None and
                        raw_bc not in barcode_whitelist_set):
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                else:
                    # Disallow Ns in no-whitelist case
                    if 'N' in raw_bc:
                        processed_bc = None
                    else:
                        processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            out_file.write(
                '%s\n' % (processed_bc if processed_bc is not None else ''))
    finally:
        # Release the input/output handles even if processing fails.
        in_read1_fastq.close()
        if in_read2_fastq:
            in_read2_fastq.close()
        out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Correct read barcodes and write tagged copies of both FASTQ files.

    For the first ``args.initial_reads`` read pairs of the chunk, the raw
    barcode is taken from the read1 header. Barcodes on the whitelist (or,
    with no whitelist, barcodes without 'N') are accepted as-is; others are
    passed to ``cr_stats.correct_bc_error``. Accepted barcodes are counted,
    tagged with the gem group and stored as the processed-barcode tag on
    both read headers. Every read pair is written to the output FASTQs.
    """
    # Load barcode whitelist. It stays None when no whitelist is configured,
    # so the no-whitelist path below does not hit an unbound name.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    # The context managers close all four files even if processing fails.
    with open(args.read1_chunk) as in_read1_fastq, \
            open(args.read2_chunk) as in_read2_fastq, \
            open(outs.corrected_read1s, 'w') as out_read1_fastq, \
            open(outs.corrected_read2s, 'w') as out_read2_fastq:

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
            read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            # Reset per read so a value from an earlier read is never reused.
            processed_bc = None

            if raw_bc:
                if (barcode_whitelist_set is not None and
                        raw_bc not in barcode_whitelist_set):
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                else:
                    # Disallow Ns in no-whitelist case
                    if 'N' in raw_bc:
                        processed_bc = None
                    else:
                        processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)
                    read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)
                    read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            tk_fasta.write_read_fastq(out_read1_fastq,
                                      read1_header.to_string(),
                                      read1[1], read1[2])
            tk_fasta.write_read_fastq(out_read2_fastq,
                                      read2_header.to_string(),
                                      read2[1], read2[2])

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Run every subsampling task on one chunk of the molecule info file.

    For each entry of ``args.subsample_info``, the read pairs of every
    molecule in the chunk are binomially subsampled using that task's
    per-library rates. The function then tallies, per task and genome, the
    UMIs and detected features of each cell-associated barcode, plus the
    total read pairs and UMIs. Results are pickled to ``outs.metrics``
    together with a (library type, genome) table marking which combinations
    have any reads in the chunk.
    """
    np.random.seed(0)

    mc = MoleculeCounter.open(args.molecule_info, 'r')

    # Get cell-associated barcodes
    genomes = sorted(
        set(f.tags.get('genome', '')
            for f in mc.feature_reference.feature_defs))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Load chunk of relevant data from the mol_info
    chunk_start = int(args.chunk_start)
    chunk = slice(chunk_start, chunk_start + int(args.chunk_len))
    mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
    mol_read_pairs = mc.get_column_lazy('count')[chunk]
    mol_gem_group = mc.get_column_lazy('gem_group')[chunk]
    mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk]
    mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk]

    barcodes = mc.get_ref_column('barcodes')

    # Give each cell-associated barcode an integer index
    # NOTE(review): the '' key presumably holds the barcodes of all genomes
    # combined - confirm against get_cell_associated_barcodes.
    cell_bcs = sorted(cell_bcs_by_genome[''])
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    # Give each genome an integer index
    genome_to_int = {g: i for i, g in enumerate(genomes)}
    feature_int_to_genome_int = np.fromiter(
        (genome_to_int[f.tags.get('genome', '')]
         for f in mc.feature_reference.feature_defs),
        dtype=int)
    mol_genome_idx = feature_int_to_genome_int[mol_feature_idx]

    # determine which (library type, genome) pairs have any associated reads
    lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
    lib_type_to_int = {l: i for i, l in enumerate(lib_types)}
    # Builtin int/bool are what the removed np.int/np.bool aliases pointed to.
    lib_idx_to_lib_type_idx = np.fromiter(
        (lib_type_to_int[lib['library_type']] for lib in mc.library_info),
        dtype=int)

    lib_type_genome_any_reads = np.zeros((len(lib_types), len(genomes)),
                                         dtype=bool)
    has_reads = mol_read_pairs > 0
    lib_genome_idx_pairs = set(
        izip(mol_library_idx[has_reads], mol_genome_idx[has_reads]))
    for (lib_idx, genome_idx) in lib_genome_idx_pairs:
        lib_type_idx = lib_idx_to_lib_type_idx[lib_idx]
        lib_type_genome_any_reads[lib_type_idx, genome_idx] = True

    # Run each subsampling task on this chunk of data
    n_tasks = len(args.subsample_info)
    n_genomes = len(genomes)
    n_cells = len(cell_bcs)

    umis_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
    features_det_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
    read_pairs_per_task = np.zeros((n_tasks, n_genomes))
    umis_per_task = np.zeros((n_tasks, n_genomes))

    for task_idx, task in enumerate(args.subsample_info):
        # Per-library subsampling rates
        rates_per_library = np.array(task['library_subsample_rates'],
                                     dtype=float)

        # A task that keeps nothing from any library leaves its tallies at 0.
        if np.count_nonzero(rates_per_library) == 0:
            continue

        mol_rate = rates_per_library[mol_library_idx]

        # Subsampled read pairs per molecule
        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)

        # Compute tallies for each barcode
        group_keys = (mol_gem_group, mol_barcode_idx)
        group_values = (mol_feature_idx, mol_genome_idx, new_read_pairs)
        for (gg, bc_idx), (feature_idx, genome_idx, read_pairs) in \
                cr_utils.numpy_groupby(group_values, group_keys):

            barcode = cr_utils.format_barcode_seq(barcodes[bc_idx], gg)

            # None for barcodes that are not cell-associated; only used below
            # once the barcode is known to be a cell for the genome.
            cell_idx = cell_bc_to_int.get(barcode)

            for this_genome_idx in xrange(len(genomes)):
                in_this_genome = genome_idx == this_genome_idx
                umis = np.flatnonzero((read_pairs > 0) & in_this_genome)
                this_genome_read_pairs = np.sum(read_pairs[in_this_genome])

                # Tally UMIs and median features detected
                if barcode in cell_bcs_by_genome[genomes[this_genome_idx]]:
                    # This is a cell-associated barcode for this genome
                    umis_per_bc[task_idx, this_genome_idx,
                                cell_idx] = len(umis)
                    features_det_per_bc[task_idx, this_genome_idx,
                                        cell_idx] = np.count_nonzero(
                                            np.bincount(feature_idx[umis]))

                # Tally numbers for duplicate fraction
                read_pairs_per_task[task_idx,
                                    this_genome_idx] += this_genome_read_pairs
                umis_per_task[task_idx, this_genome_idx] += len(umis)

    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode.
    with open(outs.metrics, 'wb') as f:
        data = {
            'umis_per_bc': umis_per_bc,
            'features_det_per_bc': features_det_per_bc,
            'read_pairs': read_pairs_per_task,
            'umis': umis_per_task,
            'lib_type_genome_any_reads': lib_type_genome_any_reads,
        }
        cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)