def get_bc_counts(genomes, genes, molecule_counter):
    """Tally per-barcode UMI and read counts for every genome with data.

    Barcode runs are detected by differencing neighbouring entries of the
    'barcode' column, so rows of one barcode must be adjacent within a
    genome for the counts to be merged into a single entry.

    Args:
        genomes: genome names to summarize.
        genes: not used by this function; kept for caller compatibility.
        molecule_counter: object whose ``get_column`` returns the 'genome',
            'reads' and 'barcode' columns as arrays.

    Returns:
        dict mapping genome name to a tuple
        ``(unique_bcs, umis_per_bc, cmb_reads_per_bc)``. Genomes without
        any rows are left out of the dict.
    """
    genome_col = molecule_counter.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)
    reads_col = molecule_counter.get_column('reads')
    barcode_col = molecule_counter.get_column('barcode')

    counts_by_genome = {}
    for genome in genomes:
        row_mask = genome_col == cr_reference.get_genome_id(
            genome, genome_index)
        if not row_mask.any():
            # edge case - there's no data for this genome (e.g. empty
            # sample, false barnyard sample, or nothing confidently mapped)
            continue

        genome_bcs = barcode_col[row_mask]
        genome_reads = reads_col[row_mask]

        # only count UMIs with at least one conf mapped read
        has_conf_read = genome_reads > 0

        # A nonzero difference between neighbours starts a new barcode run;
        # the leading 1 makes the first row always count as a run start.
        deltas = np.concatenate(([1], genome_bcs[1:] - genome_bcs[:-1]))
        run_starts = np.nonzero(deltas)[0]

        counts_by_genome[genome] = (
            genome_bcs[run_starts],
            np.add.reduceat(has_conf_read, run_starts),
            np.add.reduceat(genome_reads, run_starts),
        )
    return counts_by_genome
def write_filtered_molecules(ctr_in, ctr_out, genomes, bcs_per_genome):
    """Copy the molecules of the selected barcodes from ctr_in to ctr_out.

    A molecule is kept when its (genome id, gem group, compressed barcode)
    triple matches one of the barcodes listed in ``bcs_per_genome``.

    Args:
        ctr_in: source molecule counter; read through ``get_all_metrics``,
            ``get_column`` and ``get_ref_column``.
        ctr_out: destination molecule counter; written through
            ``set_all_metrics``, ``add_many`` and ``set_ref_column``.
        genomes: genome names used to build the genome index that turns
            the keys of ``bcs_per_genome`` into genome ids.
        bcs_per_genome: dict mapping genome name to an iterable of formatted
            barcode strings (each is split into sequence and gem group with
            ``cr_utils.split_barcode_seq``).

    Returns:
        dict with the single key 'flt_conf_mapped_per_genome', mapping each
        genome in ctr_out's 'genome_ids' reference column to the sum of the
        'reads' column over the molecules that were kept.
    """
    ctr_out.set_all_metrics(ctr_in.get_all_metrics())

    # Build the set of (genome id, gem group, compressed barcode) to keep.
    filtered_bc_tuples = set()
    genome_ids = ctr_in.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)
    for (genome, formatted_bcs) in bcs_per_genome.iteritems():
        genome_id = cr_reference.get_genome_id(genome, genome_index)
        for formatted_bc in formatted_bcs:
            (bc, gg) = cr_utils.split_barcode_seq(formatted_bc)
            cbc = cr_mol_counter.MoleculeCounter.compress_barcode_seq(bc)
            filtered_bc_tuples.add((genome_id, gg, cbc))

    def keep_molecule(genome_id, gem_group, barcode):
        tup = (genome_id, gem_group, barcode)
        return (tup in filtered_bc_tuples)

    # otypes is given explicitly: without it np.vectorize probes the first
    # element to infer the output type and fails on size-0 inputs, which
    # would crash here whenever ctr_in holds no molecules.
    filter_func = np.vectorize(keep_molecule, otypes=[bool])

    gem_groups = ctr_in.get_column('gem_group')
    barcodes = ctr_in.get_column('barcode')
    filter_index = filter_func(genome_ids, gem_groups, barcodes)

    for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
        data = ctr_in.get_column(col)
        filtered_data = data[filter_index]
        ctr_out.add_many(col, filtered_data)

    for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
        ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

    # summarize filtered data
    genomes = ctr_out.get_ref_column('genome_ids')
    filtered_reads = ctr_out.get_column('reads')
    flt_conf_mapped_per_genome = {}
    if len(genomes) == 1:
        # Single genome: every kept molecule belongs to it.
        genome = genomes[0]
        flt_conf_mapped_per_genome[genome] = filtered_reads.sum()
    else:
        genome_ids = ctr_out.get_column('genome')
        genome_index = cr_reference.get_genome_index(genomes)
        for genome in genomes:
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            flt_conf_mapped_per_genome[genome] = filtered_reads[
                genome_ids == genome_id].sum()

    summary = {'flt_conf_mapped_per_genome': flt_conf_mapped_per_genome}
    return summary
def main(args, outs):
    """Binomially downsample the read counts of one chunk of molecules.

    Reads the chunk ``[chunk_start, chunk_start + chunk_len)`` of
    ``args.molecules``. When ``args.downsample`` is set and
    ``args.downsample_map`` has more than one entry, each molecule's 'reads'
    value is replaced by a binomial draw using the 'frac_reads_kept' of its
    gem group, and the molecule columns, reference columns and updated
    metrics are written to ``outs.out_molecules``. Otherwise the read counts
    are used unchanged and nothing is added to the output counter.

    In both cases a JSON summary with 'raw_conf_mapped_per_genome' (sum of
    the resulting read counts per genome) and 'mol_counter_metrics' is
    written to ``outs.summary``.

    Args:
        args: stage arguments; uses ``molecules``, ``chunk_start``,
            ``chunk_len``, ``downsample`` and ``downsample_map``.
        outs: stage outputs; uses ``out_molecules`` and ``summary``.
    """
    # Fixed seed so the binomial draws are reproducible between runs.
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(
            args.molecules, 'r', start=int(args.chunk_start),
            length=int(args.chunk_len)) as ctr_in:
        with cr_mol_counter.MoleculeCounter.open(outs.out_molecules,
                                                 'w') as ctr_out:
            metrics_in = ctr_in.get_all_metrics()
            # NOTE(review): if the metrics are a plain dict this copy is
            # shallow, so the nested per-gem-group dicts are shared with
            # metrics_in - confirm that is acceptable.
            metrics_out = metrics_in.copy()

            reads = ctr_in.get_column('reads')
            gem_groups = ctr_in.get_column('gem_group')

            if args.downsample and len(args.downsample_map) > 1:
                downsample_func = np.vectorize(
                    lambda gem_group, read_count: np.random.binomial(
                        read_count,
                        args.downsample_map[str(gem_group)][
                            'frac_reads_kept']))

                # downsample metrics
                for gg in metrics_out[cr_mol_counter.GEM_GROUPS_METRIC]:
                    frac_reads_kept = args.downsample_map[str(gg)][
                        'frac_reads_kept']
                    total_reads_in = metrics_in[
                        cr_mol_counter.GEM_GROUPS_METRIC][gg][
                            cr_mol_counter.GG_TOTAL_READS_METRIC]
                    total_reads_out = round(frac_reads_kept * total_reads_in)
                    metrics_out[cr_mol_counter.GEM_GROUPS_METRIC][gg][
                        cr_mol_counter.
                        GG_DOWNSAMPLED_READS_METRIC] = total_reads_out

                ctr_out.set_all_metrics(metrics_out)

                # downsample molecule info
                if len(reads) > 0:
                    subsampled_reads = downsample_func(gem_groups, reads)
                else:
                    # np.vectorize without otypes cannot be called on
                    # size-0 inputs; an empty chunk has nothing to draw.
                    subsampled_reads = reads
                for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
                    if col == 'reads':
                        data = subsampled_reads
                    else:
                        data = ctr_in.get_column(col)
                    ctr_out.add_many(col, data)

                # pass reference info
                for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
                    ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))
            else:
                subsampled_reads = reads

            # collect summary stats
            genomes = ctr_in.get_ref_column('genome_ids')
            raw_conf_mapped_per_genome = {}
            if len(genomes) == 1:
                # Single genome: every molecule in the chunk belongs to it.
                genome = genomes[0]
                raw_conf_mapped_per_genome[genome] = subsampled_reads.sum()
            else:
                genome_ids = ctr_in.get_column('genome')
                genome_index = cr_reference.get_genome_index(genomes)
                for genome in genomes:
                    genome_id = cr_reference.get_genome_id(
                        genome, genome_index)
                    raw_conf_mapped_per_genome[genome] = subsampled_reads[
                        genome_ids == genome_id].sum()

            summary = {
                'raw_conf_mapped_per_genome': raw_conf_mapped_per_genome,
                'mol_counter_metrics': metrics_out
            }

    with open(outs.summary, 'w') as f:
        tk_json.dump_numpy(summary, f, pretty=True)
def main(args, outs):
    """Count reads per molecule in a BAM chunk and save them to a counter.

    Reads are grouped with ``itertools.groupby`` on
    ``cr_utils.barcode_sort_key``, which yields a
    ``(gem_group, barcode, gene_ids)`` key. Because groupby only merges
    consecutive items, reads sharing a key must be adjacent in the input.
    Within a group, reads are bucketed by ``(umi, gene id)`` and each bucket
    is written with ``counter.add`` as one molecule row.

    Args:
        args: stage arguments; uses ``chunk_input``, ``reference_path``,
            ``filtered_barcodes`` and ``align``.
        outs: stage outputs; uses ``output`` (the molecule counter file).
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    # Map each data column name to its slot in the per-molecule count array.
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # Gene id used below for reads that are unmapped or not confidently
    # mapped. Presumably one past the last valid integer gene id - confirm.
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the filtered barcodes over all genomes.
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)

    # Per-gem-group metric dict, created with a zero count on first access.
    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue
        # Membership is tested on the formatted barcode, before gem_group is
        # replaced by its compressed form below.
        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        # (umi, gene id) -> array with one counter per data column.
        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(
            barcode)
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)

        # (umi, gene id) -> set of (tid, pos) already seen for that molecule.
        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key, map_type = (umi, gene_index.gene_id_to_int(
                    gene_ids[0])), 'reads'

                # True only the first time this position is seen for the
                # molecule.
                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'unmapped_reads', False
            else:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'nonconf_mapped_reads', False

            # Bump the counter for this read's mapping category, then the
            # correction and unique-position counters (each adds 0 or 1).
            molecules[mol_key][mol_data_columns[map_type]] += 1
            molecules[mol_key][mol_data_columns['umi_corrected_reads']] += int(
                not raw_umi == umi)
            molecules[mol_key][mol_data_columns[
                'barcode_corrected_reads']] += int(not raw_bc == proc_bc)
            molecules[mol_key][mol_data_columns[
                'conf_mapped_uniq_read_pos']] += int(uniq_read_pos)

        # Keys are unique (umi, gene id) tuples, so the sort order is decided
        # by the keys alone and the count arrays are never compared.
        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{
                    key: molecule[col_idx]
                    for key, col_idx in mol_data_columns.iteritems()
                })

    in_bam.close()

    # Convert the defaultdict to a plain dict before storing it as a metric.
    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()