def correct_for_saturation(self, summary_json_paths):
    """ Apply saturation correction to each per-genome matrix.

    The summary JSONs at `summary_json_paths` are merged into one dict, and
    each genome's '<genome>_conf_mapped_effective_umi_diversity' entry
    (0 when absent) is passed, as a float, to that matrix's own
    `correct_for_saturation` together with the genome name.
    """
    summary = cr_utils.merge_jsons_as_dict(summary_json_paths)

    for genome_name, genome_matrix in self.matrices.iteritems():
        diversity_key = '%s_conf_mapped_effective_umi_diversity' % genome_name
        diversity = float(summary.get(diversity_key, 0))
        genome_matrix.correct_for_saturation(diversity, genome_name)
def join(args, outs, chunk_defs, chunk_outs):
    """ Combine the per-chunk molecule HDF5 outputs into `outs.output`.

    Metrics are derived from the merged stage summaries, then updated with
    the per-gem-group metric that was accumulated inside the chunks.
    `chunk_defs` is accepted but not used here.
    """
    counter = cr_mol_counter.MoleculeCounter

    summary_paths = [
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ]
    merged_summary = cr_utils.merge_jsons_as_dict(summary_paths)

    unique_gem_groups = sorted(set(args.gem_groups))
    metrics = counter.get_metrics_from_summary(merged_summary,
                                               unique_gem_groups,
                                               args.recovered_cells,
                                               args.force_cells)

    chunk_h5_paths = [chunk.output for chunk in chunk_outs]

    # Fold in the metric that was computed per chunk, summed per gem group.
    metric_name = cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC
    per_group_totals = counter.sum_gem_group_metric(chunk_h5_paths, metric_name)
    for gem_group, total in per_group_totals.iteritems():
        metrics[cr_mol_counter.GEM_GROUPS_METRIC][gem_group][metric_name] = total

    # Sort globally by gem group only: the input is a barcode-sorted BAM, so
    # the data is assumed to already be sorted by barcode.
    counter.concatenate_sort(outs.output,
                             chunk_h5_paths,
                             ['gem_group'],
                             metrics=metrics)
def join(args, outs, chunk_defs, chunk_outs):
    """ Concatenate the per-chunk molecule HDF5 outputs into `outs.output`.

    Builds the metrics stored alongside the molecule data from:
      * the merged stage summaries (extract reads, attach bcs/umis,
        mark duplicates),
      * reference metadata regenerated from `args.reference_path`,
      * library info read from the first input BAM (`args.inputs[0]`),
      * the per-library usable-reads metric summed over the chunk outputs.

    `chunk_defs` is accepted but not used here.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM. The handle is only needed for this lookup,
    # so release it right away (even on failure) instead of leaking it.
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    try:
        library_info = rna_library.get_bam_library_info(in_bam)
    finally:
        in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary,
                                                       library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]

    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output,
                                input_h5_filenames,
                                metrics=metrics)
def filter_barcodes(args, outs):
    """ Call cell-associated barcodes and return the matrix restricted to them.

    Steps, in order:
      1. Seed both RNGs, load the barcode-correction CSV and the raw matrix.
      2. If the antibody library type is present in the correction data,
         remove barcodes with high UMI-corrected reads and write the removed
         barcodes to `outs.aggregate_barcodes`.
      3. Pick the filtering method from the args: manual barcode list, top-N
         (force_cells), or order-of-magnitude + non-ambient (default).
      4. Make initial cell calls per (genome, gem group) on the gene
         expression features only.
      5. For the default method, additionally call non-ambient barcodes per
         gem group and add them to the per-genome barcode lists.
      6. Merge metrics, write the summary JSON to `outs.summary` and the
         filtered barcodes to `outs.filtered_barcodes`.

    Returns the matrix (after step 2) subset to all cell-associated barcodes.
    """
    # Fixed seeds for both the stdlib and numpy random generators.
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)
    # Only strip aggregate barcodes when the antibody library type occurs in
    # the correction data's library_type column.
    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE, correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        # Report all identified aggregate barcodes, together with their reads,
        # umi corrected reads, fraction of corrected reads, and fraction of
        # total reads
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    # Method precedence: explicit barcode list > forced cell count > default.
    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    summary['filter_barcodes_method'] = get_filter_method_name(method)

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    # (the requested totals are split evenly across gem groups and truncated
    # to an int)
    if args.recovered_cells is not None:
        gg_recovered_cells = int(
            float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    # NOTE: gg_force_cells is only bound when force_cells is given; it is only
    # read in the TOP_N_BARCODES branch below, which requires force_cells.
    if args.force_cells is not None:
        gg_force_cells = int(
            float(args.force_cells) / float(len(unique_gem_groups)))

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:

            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method == FilterMethod.ORDMAG or \
               method == FilterMethod.ORDMAG_NONAMBIENT:
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                # NOTE(review): the barcode JSON is re-read for every
                # (genome, gem group) pair although its path never changes.
                with (open(args.cell_barcodes)) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                # Presumably martian.exit does not return; otherwise `msg`
                # below would be unbound on this path - TODO confirm.
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes
            # NOTE(review): reduce() has no initial value here, so it raises
            # if no (gem group, genome) entry exists for this gem group.
            gg_bcs = sorted(
                list(
                    reduce(set.union, [
                        set(bcs)
                        for group, bcs in filtered_bcs_groups.iteritems()
                        if group[0] == gg
                    ])))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                # Skip this gem group; its initial cell calls are kept as-is.
                print 'Failed at attempt to call non-ambient barcodes in GEM group %s' % gg
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = []
            for genome in genomes:
                genome_counts.append(gg_matrix.view() \
                                     .select_features_by_genome(genome) \
                                     .select_barcodes(result.eval_bcs) \
                                     .get_counts_per_bc())
            # One column per genome, one row per evaluated barcode.
            genome_counts = np.column_stack(genome_counts)
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]

            umis_per_bc = gg_matrix.get_counts_per_bc()

            eval_bcs_arrays.append(np.array(gg_matrix.bcs)[result.eval_bcs])
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome) &
                                 (result.is_nonambient)])

        if len(eval_bcs_arrays) > 0:
            nonambient_summary = pd.DataFrame(
                OrderedDict([
                    ('barcode', np.concatenate(eval_bcs_arrays)),
                    ('umis', np.concatenate(umis_per_bc_arrays)),
                    ('ambient_loglk', np.concatenate(loglk_arrays)),
                    ('pvalue', np.concatenate(pvalue_arrays)),
                    ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                    ('nonambient', np.concatenate(nonambient_arrays)),
                    ('genome', np.concatenate(genome_call_arrays)),
                ]))
            # NOTE(review): outs.nonambient_calls was set to None above, so
            # None is passed as the path here. pandas' to_csv returns the CSV
            # text instead of writing a file when the path is None - confirm
            # whether a file on disk was intended.
            nonambient_summary.to_csv(outs.nonambient_calls)

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [
            v for k, v in filtered_metrics_groups.iteritems() if k[1] == genome
        ]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome, key, get_filter_method_name(method))): txome_summary[key] \
            for (key,_) in txome_summary.iteritems()})

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' % genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    # (the split result is reversed to form the sort key)
    barcode_sort_key = lambda x: cr_utils.split_barcode_seq(x)[::-1]

    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(list(set(bcs)), key=barcode_sort_key)
    filtered_bcs = sorted(list(set(filtered_bcs)), key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict(
        [args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json
    # (entries in `summary` override same-named entries in matrix_summary)
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
def _report_genome_agnostic_metrics(self, summary_json_paths, barcode_summary_h5, recovered_cells, cell_bc_seqs):
    """ Report metrics that are computed across all barcodes and all genomes

    Args:
        summary_json_paths: summary JSON paths, merged with
            cr_utils.merge_jsons_as_dict; the merged dict must contain
            'total_reads'.
        barcode_summary_h5: container that supports `in` and lookup by
            dataset name (presumably an open HDF5 file - verify against the
            caller); must provide 'bc_sequence'.
        recovered_cells: expected number of cells, or None to report the
            deviation metrics as 0.
        cell_bc_seqs: per-genome cell barcodes, passed to
            self.union_barcodes.

    Returns:
        dict mapping metric name to value.
    """
    d = {}

    # Get total_reads and *_conf_mapped_reads_frac
    merged_jsons = cr_utils.merge_jsons_as_dict(summary_json_paths)
    total_reads = int(merged_jsons['total_reads'])
    conf_mapped_metrics = ['_'.join([ref,
                                     cr_constants.TRANSCRIPTOME_REGION,
                                     cr_constants.CONF_MAPPED_READ_TYPE,
                                     'reads_frac'])
                           for ref in self.matrices.keys()]
    total_conf_mapped_reads = sum(float(merged_jsons.get(metric, 0)) * float(total_reads)
                                  for metric in conf_mapped_metrics)

    # Get number of cell bcs across all genomes
    cell_bcs_union = self.union_barcodes(cell_bc_seqs)
    n_cell_bcs_union = len(cell_bcs_union)
    d['filtered_bcs_transcriptome_union'] = n_cell_bcs_union
    d['%s_filtered_bcs' % cr_constants.MULTI_REFS_PREFIX] = n_cell_bcs_union

    # Report reads/cell across all genomes
    d['%s_%s_total_raw_reads_per_filtered_bc' % (cr_constants.MULTI_REFS_PREFIX, cr_constants.TRANSCRIPTOME_REGION)] = tk_stats.robust_divide(total_reads, n_cell_bcs_union)
    d['%s_%s_total_conf_mapped_reads_per_filtered_bc' % (cr_constants.MULTI_REFS_PREFIX, cr_constants.TRANSCRIPTOME_REGION)] = tk_stats.robust_divide(total_conf_mapped_reads, n_cell_bcs_union)

    # Total UMI counts across all matrices and all filtered barcodes
    total_umi_counts = 0
    for mat in self.matrices.values():
        total_umi_counts += mat.select_barcodes_by_seq(cell_bcs_union).m.sum()

    # Deviation from cell load
    if recovered_cells is None:
        d['%s_filtered_bcs_difference_from_recovered_cells' % cr_constants.MULTI_REFS_PREFIX] = 0
        d['%s_filtered_bcs_relative_difference_from_recovered_cells' % cr_constants.MULTI_REFS_PREFIX] = 0
    else:
        d['%s_filtered_bcs_difference_from_recovered_cells' % cr_constants.MULTI_REFS_PREFIX] = int(n_cell_bcs_union) - int(recovered_cells)
        d['%s_filtered_bcs_relative_difference_from_recovered_cells' % cr_constants.MULTI_REFS_PREFIX] = tk_stats.robust_divide(n_cell_bcs_union - recovered_cells, recovered_cells)

    # Read totals per read type do not depend on the genome being reported,
    # so compute them once here instead of once per genome.
    n_reads_by_read_type = []
    for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
        if read_type in cr_constants.MATRIX_USE_MATRIX_FOR_READ_TYPE:
            n_reads = total_umi_counts
        else:
            h5_keys = ['%s_%s_%s_reads' % (txome, cr_constants.TRANSCRIPTOME_REGION, read_type)
                       for txome in self.matrices.keys()]
            h5_keys = [x for x in h5_keys if x in barcode_summary_h5]
            n_reads = sum(np.array(barcode_summary_h5[h5_key]).sum() for h5_key in h5_keys)
        n_reads_by_read_type.append((read_type, n_reads))

    # Duplicate these metrics across genomes for backwards-compat
    for genome in self.matrices.keys():
        d['%s_total_raw_reads_per_filtered_bc' % genome] = tk_stats.robust_divide(total_reads, n_cell_bcs_union)
        d['%s_total_conf_mapped_reads_per_filtered_bc' % genome] = tk_stats.robust_divide(total_conf_mapped_reads, n_cell_bcs_union)

        # Assigned after the two keys above, so a read type that produces the
        # same key still takes precedence, as before.
        for read_type, n_reads in n_reads_by_read_type:
            metric = '%s_total_%s_reads_per_filtered_bc' % (genome, read_type)
            d[metric] = tk_stats.robust_divide(n_reads, n_cell_bcs_union)

    # Report frac reads in cells across all genomes
    total_conf_mapped_reads_in_cells = 0
    total_conf_mapped_barcoded_reads = 0

    for txome, matrix in self.matrices.iteritems():
        h5_key = '%s_%s_%s_reads' % (txome, cr_constants.TRANSCRIPTOME_REGION,
                                     cr_constants.CONF_MAPPED_BC_READ_TYPE)
        # Skip genomes without this dataset, matching how the other loops in
        # this method treat missing keys.
        if h5_key not in barcode_summary_h5:
            continue
        cmb_reads = barcode_summary_h5[h5_key]
        cell_bc_indices = matrix.bcs_to_ints(cell_bcs_union)
        total_conf_mapped_reads_in_cells += cmb_reads[list(cell_bc_indices)].sum() if cell_bc_indices else 0
        total_conf_mapped_barcoded_reads += cmb_reads[()].sum()
    d['multi_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'] = tk_stats.robust_divide(total_conf_mapped_reads_in_cells, total_conf_mapped_barcoded_reads)

    # Compute fraction of reads usable (conf mapped, barcoded, filtered barcode)
    unique_barcodes = set(cell_bcs_union)
    # otypes is given explicitly because np.vectorize cannot infer the output
    # type from a size-0 input and raises instead.
    in_unique_barcodes_vectorized = np.vectorize(lambda x: x in unique_barcodes, otypes=[bool])
    filtered_bc_h5_row = in_unique_barcodes_vectorized(np.array(barcode_summary_h5['bc_sequence']))

    usable_reads = 0
    for txome in self.matrices.keys():
        h5_key = '%s_%s_%s_reads' % (txome, cr_constants.TRANSCRIPTOME_REGION,
                                     cr_constants.CONF_MAPPED_BC_READ_TYPE)
        if h5_key not in barcode_summary_h5:
            continue
        # The boolean mask zeroes out rows of non-cell barcodes.
        usable_reads += (filtered_bc_h5_row * np.array(barcode_summary_h5[h5_key])).sum()

    d['%s_transcriptome_usable_reads_frac' % cr_constants.MULTI_REFS_PREFIX] = tk_stats.robust_divide(usable_reads, total_reads)

    # Compute matrix density across all genomes
    total_nonzero_entries, total_entries = 0, 0
    for matrix in self.matrices.values():
        filtered_mat = matrix.select_barcodes_by_seq(cell_bcs_union)
        total_nonzero_entries += filtered_mat.m.getnnz()
        total_entries += filtered_mat.m.shape[0] * filtered_mat.m.shape[1]
    d['%s_filtered_gene_bc_matrix_density' % cr_constants.MULTI_REFS_PREFIX] = tk_stats.robust_divide(total_nonzero_entries, total_entries)

    return d