def main(args, outs):
    """Chunk stage: tally read-pair support per UMI and report quality metrics.

    Streams this chunk's UMI-info rows (assumed grouped by barcode), sums
    read pairs per (barcode, UMI), and records on the reporter:
      - the fraction of read pairs landing on UMIs below the per-gem-group
        read-pair threshold, per chain and for the multi-reference total;
      - the N50 read pairs per UMI for this chunk's gem group.
    """
    np.random.seed(0)

    gem_group_list = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=gem_group_list)

    # Load this chunk's slice of the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']

    # Gem group (as a string) for each barcode index; the
    # min_readpairs_per_umi mapping is keyed by gem-group string.
    gg_of_bc = [str(cr_utils.split_barcode_seq(bc)[1]) for bc in barcodes]

    all_umi_read_pairs = []
    read_pairs_by_chain = defaultdict(int)
    low_support_by_chain = defaultdict(int)

    row_iter = itertools.izip(umi_info['barcode_idx'],
                              umi_info['umi_idx'],
                              umi_info['chain_idx'],
                              umi_info['reads'])

    # Rows are assumed grouped by barcode.
    for bc_idx, rows in itertools.groupby(row_iter, key=lambda row: row[0]):
        per_umi_read_pairs = defaultdict(int)

        for _, umi, chain_idx, reads in rows:
            per_umi_read_pairs[umi] += reads

            chain = chains[chain_idx]
            read_pairs_by_chain[chain] += reads
            read_pairs_by_chain[cr_constants.MULTI_REFS_PREFIX] += reads

            # Read pairs on UMIs below this gem group's support threshold
            if reads < args.min_readpairs_per_umi[gg_of_bc[bc_idx]]:
                low_support_by_chain[chain] += reads
                low_support_by_chain[cr_constants.MULTI_REFS_PREFIX] += reads

        for read_pairs in per_umi_read_pairs.itervalues():
            all_umi_read_pairs.append(read_pairs)

    # N50 read pairs per UMI; NaN when no UMIs were observed
    rppu_n50 = tk_stats.NX(all_umi_read_pairs, 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    # Report fraction of read pairs on low-support UMIs, per chain
    for chain in reporter.vdj_genes:
        reporter._get_metric_attr(
            'vdj_recombinome_low_support_reads_frac', chain).set_value(
                low_support_by_chain.get(chain, 0),
                read_pairs_by_chain.get(chain, 0))

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
def call_cell_barcodes(umi_info_path, gem_group):
    """Call cell barcodes by UMI support.

    Args:
        umi_info_path (str): path to the umi info h5 file
        gem_group (int): gem group to restrict the calling to

    Returns:
        tuple: (bc_support, cell_bcs, rpu_threshold, umi_threshold,
        confidence) where
            bc_support (dict): maps barcode string to UMI count,
            cell_bcs (list of str): called cell barcodes,
            rpu_threshold: read pairs per UMI threshold used,
            umi_threshold: UMI count threshold used,
            confidence: confidence value reported by
                vdj_stats.call_vdj_cells.
    """
    # Get umi info for this gem group only
    bc_str = vdj_umi_info.get_column(umi_info_path, 'barcodes')
    bc_gg = np.array([int(cr_utils.split_barcode_seq(bc)[1]) for bc in bc_str])
    bc_in_gg = bc_gg == gem_group

    umi_info = vdj_umi_info.read_umi_info(umi_info_path)

    umi_barcode_idx = []
    umi_read_pairs = []

    # Rows are assumed grouped by barcode; sum read pairs per (barcode, UMI).
    for bc_idx, data_iter in itertools.groupby(
            itertools.izip(umi_info['barcode_idx'],
                           umi_info['umi_idx'],
                           umi_info['reads']),
            key=lambda x: x[0]):
        if not bc_in_gg[bc_idx]:
            continue

        bc_umi_read_pairs = {}
        for _, umi, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_barcode_idx.append(bc_idx)
            umi_read_pairs.append(r)

    rpu_threshold, umi_threshold, bc_support, confidence = \
        vdj_stats.call_vdj_cells(
            umi_barcode_idx=np.array(
                umi_barcode_idx,
                dtype=vdj_umi_info.get_dtype('barcode_idx')),
            umi_read_pairs=np.array(
                umi_read_pairs,
                dtype=vdj_umi_info.get_dtype('reads')),
            barcodes=bc_str,
            rpu_mix_init_sd=RPU_MIX_INIT_SD,
            umi_mix_init_sd=UMI_MIX_INIT_SD,
            verbosity=1,
        )

    # A cell is any barcode whose UMI support meets the called threshold
    cell_bcs = [bc for bc, umis in bc_support.iteritems()
                if umis >= umi_threshold]

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
def main(args, outs):
    """Chunk stage: report the N50 read pairs per UMI for this gem group."""
    np.random.seed(0)

    reporter = vdj_report.VdjReporter(
        gem_groups=np.unique(args.gem_groups).tolist())

    # Load this chunk's slice of the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # N50 read pairs per UMI; NaN when there are no rows
    n50 = tk_stats.NX(umi_info['reads'], 0.5)
    reporter._get_metric_attr(
        'vdj_recombinome_readpairs_per_umi_n50',
        cr_constants.MULTI_REFS_PREFIX,
        args.gem_group).set_value(float('NaN') if n50 is None else n50)

    reporter.save(outs.chunked_reporter)
def write_barcode_umi_summary(umi_info_filename, reporter, filename,
                              threshold, cell_barcode_set):
    """Write a summary of UMI read-pair counts per (barcode, chain) tuple.

    Writes one CSV row per cell barcode with all-UMI and good-UMI counts
    per chain, and records the per-cell UMI-count metrics on the reporter.

    Args:
        umi_info_filename (str): path to the umi info h5 file
        reporter (VdjReporter): reporter to record per-cell metrics on
        filename (str): output CSV filename
        threshold (int): min read pairs per UMI used in assembly
        cell_barcode_set (set of str): barcodes called as cells
    """
    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(umi_info_filename)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']

    sep = ','

    with open(filename, 'w') as writer:
        field_names = ["bc"]
        field_names += [chain + "_all_umis" for chain in reporter.vdj_genes] + \
                       [chain + "_good_umis" for chain in reporter.vdj_genes]
        writer.write(sep.join(field_names))
        writer.write("\n")

        # Assume sorted by barcode
        for bc_idx, umi_iter in itertools.groupby(
                itertools.izip(umi_info['barcode_idx'],
                               umi_info['chain_idx'],
                               umi_info['reads']),
                key=lambda x: x[0]):
            bc = barcodes[bc_idx]
            if bc not in cell_barcode_set:
                continue

            # Count UMIs per chain. A UMI is "good" if it met the read-pair
            # threshold used in assembly. (Inner loop variable renamed so it
            # no longer shadows the groupby key bc_idx; the unused
            # split_barcode_seq call on each row was removed.)
            chain_counts = defaultdict(int)
            good_chain_counts = defaultdict(int)
            for _, chain_idx, reads in umi_iter:
                chain = chains[chain_idx]
                chain_counts[chain] += 1
                chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

                if reads >= threshold:
                    good_chain_counts[chain] += 1
                    good_chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

            # Report barcode totals
            flds = {}
            flds["bc"] = bc

            num_good_umis = good_chain_counts[cr_constants.MULTI_REFS_PREFIX]
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_distribution').add(
                    num_good_umis)
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_median').add(
                    num_good_umis)

            # Report per-chain totals for this barcode
            for chain in reporter.vdj_genes:
                chain_all_umis = chain_counts[chain]
                chain_good_umis = good_chain_counts[chain]

                flds[chain + "_all_umis"] = chain_all_umis
                flds[chain + "_good_umis"] = chain_good_umis

                reporter._get_metric_attr(
                    'vdj_recombinome_umis_per_cell_distribution',
                    chain).add(chain_good_umis)
                reporter._get_metric_attr(
                    'vdj_recombinome_umis_per_cell_median',
                    chain).add(chain_good_umis)

            writer.write(sep.join([str(flds[name]) for name in field_names]))
            writer.write("\n")
def main(args, outs):
    """Chunk stage: derive this gem group's subsample rate and read-pairs-per-UMI
    assembly threshold.

    Steps (order matters — each step feeds the next):
      1. Per barcode, compute an NX read-pair cutoff and drop UMI rows below it.
      2. From the filtered rows, compute the N50 read pairs per UMI and a
         subsample rate targeting args.target_n50.
      3. Per chain, compute a read-pairs-per-UMI threshold at that subsample
         rate; take the min over chains that make up the N90 of all reads.

    Outputs: outs.min_readpairs_per_umi and outs.subsample_rate (both dicts
    keyed by gem group), plus metrics saved to outs.chunked_reporter.
    """
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info (this chunk's row range only)
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # Compute initial within-barcode thresholds
    # Assumes the fraction of noise-UMI reads is < some fraction (NX)
    barcode_nx = np.zeros(len(umi_info['barcodes']), dtype=int)

    # Assume grouped by barcode
    for bc, bc_reads in itertools.groupby(itertools.izip(
            umi_info['barcode_idx'], umi_info['reads']),
                                          key=lambda x: x[0]):
        bc_reads_arr = np.fromiter((reads for bc, reads in bc_reads),
                                   umi_info['reads'].dtype)
        barcode_nx[bc] = tk_stats.NX(bc_reads_arr, args.intra_barcode_nx)

    # Filter out UMIs below the within-BC threshold (in-place)
    top_in_bc = umi_info['reads'] >= barcode_nx[umi_info['barcode_idx']]
    for col in vdj_umi_info.UMI_INFO_COLS.iterkeys():
        umi_info[col] = np.compress(top_in_bc, umi_info[col])

    # Compute N50 read pairs per UMI for this gem group
    # and use it to subsample to the target N50.
    # NX returns None on empty input; report NaN in that case.
    rppu_n50 = tk_stats.NX(umi_info['reads'], 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')
    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    # No subsampling when the N50 is 0 (no usable UMIs); otherwise scale
    # down toward the target N50, capped at 1.0 (never upsample).
    if rppu_n50 == 0:
        subsample_rate = 1.0
    else:
        subsample_rate = min(1.0,
                             tk_stats.robust_divide(args.target_n50,
                                                    rppu_n50))

    reporter._get_metric_attr('vdj_assembly_subsample_rate',
                              args.gem_group).set_value(subsample_rate, 1.0)
    # Weighted average of subsample rates where weight = sum of readpairs on UMIs for each gem-group
    reporter._get_metric_attr('vdj_assembly_overall_subsample_rate').set_value(
        subsample_rate * sum(umi_info['reads']), sum(umi_info['reads']))

    # Find the global (per-chain) thresholds
    thresholds = {}
    chain_totals = {}

    # Sort the chains alphabetically for determinism in e.g. multi-library vs single-library
    # runs.
    chain_tuples = list(enumerate(umi_info['chains']))
    sorted_chain_tuples = sorted(chain_tuples, key=lambda x: x[1])

    for chain_idx, chain in sorted_chain_tuples:
        chain_reads = umi_info['reads'][umi_info['chain_idx'] == chain_idx]
        chain_totals[chain] = chain_reads.sum()

        # Record the per-chain N50 read pairs per UMI (but don't use it)
        chain_n50 = tk_stats.NX(chain_reads, 0.5)
        if chain_n50 is None:
            chain_n50 = float('NaN')
        reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                                  chain, args.gem_group).set_value(chain_n50)

        print "Computing per-chain threshold for %s" % chain
        thresholds[chain] = vdj_stats.compute_readpairs_per_umi_threshold(
            chain_reads, subsample_rate)
        print " %d" % thresholds[chain]

        reporter._get_metric_attr(
            'vdj_recombinome_readpairs_per_umi_threshold', chain,
            args.gem_group).set_value(thresholds[chain])

    # Take the min threshold among the chains that make up N90 of all reads
    # (ignores minor chains whose low totals would give noisy thresholds)
    chain_n90 = tk_stats.NX(chain_totals.values(), 0.9)
    use_chains = [
        chain for chain in thresholds.iterkeys()
        if chain_totals[chain] >= chain_n90
    ]
    use_thresholds = [thresholds[c] for c in use_chains]
    print "Using thresholds from " + str(use_chains) + ": " + str(
        use_thresholds)

    # Handle case where no chains were detected
    if len(use_chains) == 0:
        threshold = 1
    else:
        threshold = min(use_thresholds)

    outs.min_readpairs_per_umi = {args.gem_group: int(threshold)}
    outs.subsample_rate = {args.gem_group: float(subsample_rate)}

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_threshold',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(threshold)

    reporter.save(outs.chunked_reporter)