def report_genomes(matrix, reads_summary, barcode_summary_h5_path,
                   recovered_cells, cell_bc_seqs):
    """Report on all genomes in this matrix.

    Args:
        matrix: count matrix; must provide get_genomes(), view() and
            feature_ref.feature_defs.
        reads_summary: read-level metrics, consumed through
            _get_total_reads() and _get_conf_mapped_reads().
        barcode_summary_h5_path: path of the barcode summary HDF5 file. It is
            opened read-only and closed again before this function returns.
        recovered_cells: forwarded unchanged to the reporting helpers.
        cell_bc_seqs: mapping indexed by genome name whose values are
            iterables of cell barcodes; must have one entry per genome.

    Returns:
        dict mapping metric name to value. Every key carries the library-type
        metric prefix of the feature type it was computed for.
    """
    metrics = {}

    # Use a context manager so the HDF5 handle is released even when one of
    # the reporting helpers raises.
    with h5.File(barcode_summary_h5_path, 'r') as barcode_summary_h5:
        # The genome list does not depend on the feature type, so fetch it
        # once instead of once per loop iteration.
        genomes = matrix.get_genomes()
        assert len(cell_bc_seqs) == len(genomes)

        feature_types = sorted(
            set(f.feature_type for f in matrix.feature_ref.feature_defs))

        for ftype in feature_types:
            total_reads = _get_total_reads(reads_summary, ftype)
            if total_reads == 0:
                # Nothing was sequenced for this feature type.
                continue

            conf_mapped_reads = _get_conf_mapped_reads(
                reads_summary, genomes, ftype)
            submatrix = matrix.view().select_features_by_type(ftype)
            prefix = rna_library.get_library_type_metric_prefix(ftype)

            # Compute genome-agnostic metrics
            m = _report_genome_agnostic_metrics(
                submatrix, barcode_summary_h5, recovered_cells, cell_bc_seqs,
                total_reads, conf_mapped_reads, prefix)

            if rna_library.has_genomes(ftype):
                for genome in genomes:
                    # Compute genome-specific metrics
                    genome_matrix = matrix.view().select_features_by_genome(
                        genome)
                    genome_summary = _report(
                        genome_matrix, genome, barcode_summary_h5,
                        recovered_cells, cell_bc_seqs[genome], prefix)
                    for key, value in genome_summary.iteritems():
                        m['_'.join([genome, key])] = value
            else:
                # This feature has no genomes: report once, against the union
                # of the cell barcodes of all genomes.
                cell_bcs_union = list(
                    reduce(lambda a, x: a | set(x),
                           cell_bc_seqs.itervalues(), set()))
                genome_summary = _report(
                    submatrix, lib_constants.MULTI_REFS_PREFIX,
                    barcode_summary_h5, recovered_cells, cell_bcs_union,
                    prefix)
                for key, value in genome_summary.iteritems():
                    m['_'.join([lib_constants.MULTI_REFS_PREFIX, key])] = value

            # Prepend feature type to metric keys
            for key, value in m.iteritems():
                metrics[prefix + key] = value

    return metrics
def plot_barcode_rank(chart, sample_properties, sample_data):
    """Generate the RNA counter barcode rank plot.

    Returns None when the sample has no genomes, when the barcode summary or
    the cell barcodes are missing, or when the barcode summary does not hold
    the gene-expression dataset the plot is built from. Otherwise returns the
    result of _plot_counter_barcode_rank().
    """
    genomes = sample_properties.get('genomes')
    if genomes is None:
        return None
    if sample_data.barcode_summary is None:
        return None
    if sample_data.cell_barcodes is None:
        return None
    if len(genomes) == 0:
        return None

    # UMI counts per BC across all genomes present
    if len(genomes) > 1:
        genome = lib_constants.MULTI_REFS_PREFIX
    else:
        genome = genomes[0]

    gex_prefix = rna_library.get_library_type_metric_prefix(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
    key = cr_utils.format_barcode_summary_h5_key(
        gex_prefix,
        genome,
        cr_constants.TRANSCRIPTOME_REGION,
        cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE,
    )

    if key not in sample_data.barcode_summary:
        # Not guaranteed to exist, depending on pipeline
        return None

    counts_per_bc, plot_segments = \
        sample_data.counter_barcode_rank_plot_data(key)
    return _plot_counter_barcode_rank(chart, counts_per_bc, plot_segments)
def main(args, outs):
    """Count confidently mapped, de-duplicated reads of one BAM chunk.

    Reads args.chunk_input grouped by query name, lets the reporter classify
    each group, and adds the qualifying (feature, barcode) pairs to a count
    matrix. The matrix is written to outs.matrices_h5 and the reporter to
    outs.chunked_reporter.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted(
        set(lib['library_type'] for lib in libraries))
    library_prefixes = [
        rna_library.get_library_type_metric_prefix(lib['library_type'])
        for lib in libraries
    ]

    chroms = in_bam.references

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # The detected-barcode table is only loaded when there is no whitelist.
    if barcode_whitelist:
        barcode_summary = None
    else:
        barcode_summary = cr_utils.load_barcode_tsv(args.barcodes_detected)

    # TODO: this is redundant
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))

    high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)
    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=high_conf_mapq,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_summary=barcode_summary,
        gem_groups=args.gem_groups,
        library_types=distinct_library_types,
    )

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    matrix = cr_matrix.CountMatrix.empty(feature_ref, barcode_seqs,
                                         dtype='int32')

    for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        counted = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=cr_chem.has_umis(args.chemistry_def))
        is_conf_mapped_deduped, _genome, feature_id, bc = counted
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def _get_conf_mapped_reads(summary, genomes, library_type):
    """Return the confidently mapped read count summed over `genomes`.

    For every genome the '<prefix><genome>_..._reads_frac' entry of `summary`
    (0 when absent) is multiplied by the total read count of `library_type`,
    and the products are added up.
    """
    prefix = rna_library.get_library_type_metric_prefix(library_type)

    metric_names = []
    for ref in genomes:
        name_parts = [
            ref,
            cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_READ_TYPE,
            'reads_frac',
        ]
        metric_names.append(prefix + '_'.join(name_parts))

    total_reads = _get_total_reads(summary, library_type)

    conf_mapped = 0
    for metric in metric_names:
        conf_mapped += float(summary.get(metric, 0)) * float(total_reads)
    return conf_mapped
def main(args, outs):
    """Call protospacers per cell and write the call tables and metrics.

    Falls back to set_empty(outs) when the molecule info or the filtered
    matrix file is missing, or when the matrix restricted to CRISPR features
    is None/empty according to feature_utils.check_if_none_or_empty().
    """
    have_inputs = (os.path.isfile(args.molecule_info)
                   and os.path.isfile(args.filtered_feature_counts_matrix))
    if not have_inputs:
        set_empty(outs)
        return

    # Start from the counter metrics and extend them further down.
    with open(args.counter_metrics_json) as f:
        protospacer_call_metrics = json.load(f)

    report_prefix = rna_library.get_library_type_metric_prefix(
        rna_library.CRISPR_LIBRARY_TYPE)

    full_matrix = cr_matrix.CountMatrix.load_h5_file(
        args.filtered_feature_counts_matrix)
    guide_matrix = full_matrix.select_features_by_type(
        rna_library.CRISPR_LIBRARY_TYPE)
    num_gex_cbs = len(full_matrix.bcs)

    if feature_utils.check_if_none_or_empty(guide_matrix):
        set_empty(outs)
        return

    # Feature id -> value of the 'sequence' tag (None when the tag is absent).
    feature_map = {}
    for feature_def in guide_matrix.feature_ref.feature_defs:
        feature_map[feature_def.id] = feature_def.tags.get('sequence')

    # Protospacer calling
    calls = protospacer_calling.get_ps_calls_and_summary(
        guide_matrix,
        feature_map,
    )
    (perturbation_calls_table, presence_calls, cells_with_ps,
     ps_calls_summary, umi_thresholds) = calls

    call_metrics = protospacer_calling.get_protospacer_call_metrics(
        ps_calls_summary, num_gex_cbs, report_prefix)
    protospacer_call_metrics.update(call_metrics)

    perturbation_calls_table.to_csv(outs.protospacer_calls_per_cell)
    ps_calls_summary.to_csv(outs.protospacer_calls_summary)

    feature_utils.write_json_from_dict(cells_with_ps,
                                       outs.cells_per_protospacer)
    feature_utils.write_json_from_dict(umi_thresholds,
                                       outs.protospacer_umi_thresholds_json)
    feature_utils.write_csv_from_dict(umi_thresholds,
                                      outs.protospacer_umi_thresholds_csv,
                                      "Protospacer,UMI threshold\n")
    feature_utils.write_json_from_dict(protospacer_call_metrics,
                                       outs.protospacer_call_metrics_json)
def split(args):
    """Build one chunk definition per (read1, read2, corrected-barcode) file.

    Every chunk also carries `chunks_per_gem_group`, a map from GEM group
    (as a string) to the number of buckets needed to reach
    args.readpairs_per_chunk read pairs per bucket, with a minimum of 2.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    if paired_end:
        assert len(args.read1s) == len(args.read2s)
    assert len(args.corrected_bcs) == len(args.read1s)

    with open(args.reads_summary) as f:
        reads_summary = json.load(f)

    # Determine the number of buckets required to achieve
    # the given chunk size.
    chunks_per_gem_group = {}
    for gg in args.gem_groups:
        # Get the libraries w/ this GEM group (should only be one)
        gg_library_ids = []
        for lib in args.library_info:
            if lib['gem_group'] == gg:
                gg_library_ids.append(lib['library_id'])
        assert len(gg_library_ids) == 1

        lib_type_prefix = rna_library.get_library_type_metric_prefix(
            lib_constants.VDJ_LIBRARY_TYPE)
        metric_name = '%s%s_total_read_pairs_per_library' % (
            lib_type_prefix, gg_library_ids[0])
        readpairs = reads_summary[metric_name]

        n_buckets = int(
            math.ceil(float(readpairs) / args.readpairs_per_chunk))
        chunks_per_gem_group[str(gg)] = max(2, n_buckets)

    chunks = [
        {
            'read1s_chunk': fastq1,
            'read2s_chunk': fastq2 if paired_end else None,
            'bcs': bcs,
            'chunks_per_gem_group': chunks_per_gem_group,
            '__mem_gb': 6,
        }
        for fastq1, fastq2, bcs in itertools.izip_longest(
            args.read1s, args.read2s, args.corrected_bcs)
    ]

    return {'chunks': chunks, 'join': {'__mem_gb': 2}}
def get_metrics_from_summary(summary, libraries, total_recovered_cells=None,
                             total_force_cells=None):
    """Extract relevant metrics from a summary dict.

    Args:
        summary: dict of pipeline metrics. It must contain the four version
            keys listed below and, for every library, the key
            '<library-type prefix><library index>_total_read_pairs_per_library'.
        libraries: sequence of dicts with at least 'library_type' and
            'gem_group'. The position in the sequence is the library index.
        total_recovered_cells: expected cell count for the whole run, or None.
        total_force_cells: forced cell count for the whole run, or None.

    Returns:
        dict holding the version metrics, every summary key starting with
        'chemistry', the per-library metrics under LIBRARIES_METRIC and the
        per-GEM-group metrics under GEM_GROUPS_METRIC.

    Raises:
        KeyError: if a required key is missing from `summary`.
    """
    mol_metrics = {}

    version_metrics = ['cellranger_version',
                       'reference_mkref_version',
                       'reference_fasta_hash',
                       'reference_gtf_hash']
    for m in version_metrics:
        mol_metrics[m] = summary[m]

    for m in summary:
        if m.startswith('chemistry'):
            mol_metrics[m] = summary[m]

    # Per-library values
    lib_metrics = {}
    for lib_idx, lib in enumerate(libraries):
        lib_type_prefix = rna_library.get_library_type_metric_prefix(
            lib['library_type'])
        summary_name = '%s%s_total_read_pairs_per_library' % (
            lib_type_prefix, lib_idx)
        lib_metrics[str(lib_idx)] = {
            TOTAL_READS_METRIC: summary[summary_name],
        }

    # Per-gem-group values
    # `libraries` has one entry per library, so a GEM group shared by several
    # libraries would appear several times. De-duplicate so the totals are
    # split across GEM groups, not across libraries.
    gg_metrics = {}
    gem_groups = sorted(set(lib['gem_group'] for lib in libraries))
    for gg in gem_groups:
        # Distribute the toplevel expected and forced cells parameters
        # evenly among the gem groups.
        if total_recovered_cells is not None:
            recovered_cells = total_recovered_cells / len(gem_groups)
        else:
            recovered_cells = None
        if total_force_cells is not None:
            force_cells = total_force_cells / len(gem_groups)
        else:
            force_cells = None
        gg_metrics[str(gg)] = {
            GG_RECOVERED_CELLS_METRIC: recovered_cells,
            GG_FORCE_CELLS_METRIC: force_cells,
        }

    mol_metrics[LIBRARIES_METRIC] = lib_metrics
    mol_metrics[GEM_GROUPS_METRIC] = gg_metrics
    return mol_metrics
def remove_bcs_with_high_umi_corrected_reads(correction_data, matrix):
    """Remove barcodes flagged for an unusually high share of corrected reads.

    `correction_data` is handed to ab_utils.detect_aggregate_bcs(), which
    names the barcodes to drop (probably aggregates). Those barcodes are
    removed from `matrix`.

    Returns:
        (cleaned_matrix, metrics_to_report, removed_bcs_df): the matrix
        without the flagged barcodes, a dict with the number of removed
        barcodes and the reads lost to them (keys carry the antibody
        library-type prefix), and the table returned by the detector.
    """
    detected_bcs, reads_lost, removed_bcs_df = \
        ab_utils.detect_aggregate_bcs(correction_data)

    removed_indices = set()
    for bc in detected_bcs:
        removed_indices.add(matrix.bc_to_int(bc))

    # make sure filtered_bcs is in deterministic order or any later bootstrap
    # sampling will not be deterministic
    kept_indices = []
    for idx in xrange(matrix.bcs_dim):
        if idx not in removed_indices:
            kept_indices.append(idx)
    cleaned_matrix = matrix.select_barcodes(kept_indices)

    # report how many aggregates were found, and the fraction of reads those
    # accounted for
    report_prefix = rna_library.get_library_type_metric_prefix(
        rna_library.ANTIBODY_LIBRARY_TYPE)
    metrics_to_report = {
        report_prefix + 'number_highly_corrected_GEMs': len(removed_indices),
        report_prefix + 'reads_lost_to_highly_corrected_GEMs': reads_lost,
    }

    return cleaned_matrix, metrics_to_report, removed_bcs_df
def summarize_read_matrix(matrix, library_info, barcode_info, barcode_seqs):
    """Summarize matrix of read-pair counts.

    For every library type, and for every genome of it (or once under the
    multi-reference prefix when the type has no genomes), records the matrix
    sum over all barcodes, the sum over the filtered barcodes, and the number
    of filtered barcodes.
    """
    summary = {}
    view = matrix.view()

    for lib_type in sorted(set(lib['library_type'] for lib in library_info)):
        has_genomes = rna_library.has_genomes(lib_type)
        if has_genomes:
            sum_genomes = [str(g) for g in barcode_info.genomes]
        else:
            sum_genomes = [lib_constants.MULTI_REFS_PREFIX]

        for genome in sum_genomes:
            m = view.select_features_by_type(lib_type)
            genome_idx = None
            if has_genomes:
                m = m.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            prefix = '%s%s' % (
                rna_library.get_library_type_metric_prefix(lib_type), genome)

            summary['%s_raw_mapped_reads' % prefix] = m.sum()

            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            filtered_m = m.select_barcodes_by_seq(filtered_bcs)

            summary['%s_flt_mapped_reads' % prefix] = filtered_m.sum()
            summary['%s_filtered_bcs' % prefix] = len(filtered_bcs)

    return summary
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate chunk outputs and merge their counts and reporters.

    Collects the per-chunk file lists into `outs`, exits through
    martian.exit() when no reads were produced, checks that all chunks agree
    on the BAM comments, and writes the merged barcode counts, the summed
    feature counts and the summary JSON.
    """
    reads, read2s, tags = [], [], []
    gem_groups, library_types, library_ids, read_groups = [], [], [], []
    for chunk_out in chunk_outs:
        reads.extend(chunk_out.reads)
        read2s.extend(chunk_out.read2s)
        tags.extend(chunk_out.tags)
        gem_groups.extend(chunk_out.gem_groups)
        library_types.extend(chunk_out.library_types)
        library_ids.extend(chunk_out.library_ids)
        read_groups.extend(chunk_out.read_groups)

    outs.reads, outs.read2s, outs.tags = reads, read2s, tags
    outs.gem_groups = gem_groups
    outs.library_types = library_types
    outs.library_ids = library_ids
    outs.read_groups = read_groups

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )

    # Ensure consistency of BAM comments
    reference_comments = chunk_outs[0].bam_comments
    assert all(chunk_out.bam_comments == chunk_outs[0].bam_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = reference_comments

    # Write barcode counts (merged by library_type)
    chunk_barcode_counts = [co.barcode_counts for co in chunk_outs]
    chunk_library_types = [cd.library_type for cd in chunk_defs]
    bc_counters = BarcodeCounter.merge_by(chunk_barcode_counts,
                                          chunk_library_types,
                                          args.barcode_whitelist,
                                          outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts (summed over the chunks)
    feature_counts = None
    for _chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
        if feature_counts is None:
            feature_counts = chunk_counts
        else:
            feature_counts += chunk_counts

    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for _chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = next(iter(chunk_lib_types))
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        merged_json = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        for key, value in merged_json.iteritems():
            summary[prefix + key] = value

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    """Pass the matrix chunks through and write the normalization summary.

    Args:
        args: stage arguments; `args.molecules` is opened to read the
            library info.
        outs: stage outputs. Receives the raw/filtered matrix chunk lists,
            their summed nnz counts, and the path of the summary JSON.
        chunk_defs: chunk definitions; `frac_reads_kept` and `num_cells` are
            taken from the first one and indexed by library index.
        chunk_outs: chunk outputs; each provides matrix paths, nnz counts and
            a `chunk_summary` JSON with 'read_summary' and 'mol_metrics'.

    The summary holds per-(batch, library type) and per-library-type read
    depth metrics before and after normalization.
    """
    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)
    outs.filtered_nnz = filtered_nnz
    # Earlier revisions only set this misspelled attribute; it is still
    # written so that anything already reading it keeps working.
    outs.filted_nnz = filtered_nnz

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics (summed key-wise over all chunks)
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            d = json.load(f)
            for k, v in d['read_summary'].iteritems():
                read_summary[k] += v
    summary.update(read_summary)

    # Get summary metrics (taken from the first chunk only)
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    for k in mol_metrics:
        if k.startswith('chemistry'):
            summary[k] = mol_metrics[k]
    # Parenthesized single-argument form prints the same under Python 2 and 3.
    print(json.dumps(mol_metrics, indent=4, sort_keys=True))

    # Report normalization metrics
    # Insertion-ordered so batches are listed in first-seen order.
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == lib_type
        ]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            # The gem group is only appended to the batch name when it is > 1.
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[
                cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = lib_metrics[
                cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(
                min_frac_reads_kept[lib_type_idx], frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(
                raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(
                mapped_read_pairs, n_cells)

            # Prefix with batch and library type
            if lib_type.lower().startswith(
                    rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(
                    lib_type)

            p = (batch, lib_prefix)
            summary.update({
                '%s_%sfrac_reads_kept' % p:
                    frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p:
                    pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p:
                    pre_norm_mapped_rppc,
            })

    summary['batches'] = list(all_batches)

    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(
            total_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(
            total_ds_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p:
                total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p:
                total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p:
                total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
                mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
                ds_mean_rppc,
            '%slowest_frac_reads_kept' % p:
                min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4,
                  sort_keys=True)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the matrix chunks, write them out, and extend the summary JSON.

    Writes the merged raw and filtered matrices (HDF5 and MEX), a barcode
    summary HDF5 file with gene-expression counts per barcode, and the
    summary loaded from args.summary extended with per-(library type,
    genome) medians and fractions of reads in filtered barcodes.
    """
    version = martian.get_pipelines_version()

    with open(args.summary) as f:
        summary = json.load(f)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # make attrs for user-added columns in aggr csv
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    # track original library/gem info
    library_map = cr_matrix.make_library_map_aggr(args.gem_group_index)
    extra_attrs.update(library_map)

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    raw_genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as f:
        cr_io.create_hdf5_string_dataset(f, cr_constants.H5_BC_SEQUENCE_COL,
                                         raw_matrix.bcs)
        gex_view = raw_matrix.view().select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
        gex_bc_counts = gex_view.sum(axis=0).astype('uint64')
        if len(raw_genomes) == 1:
            genome_key = raw_genomes[0]
        else:
            genome_key = lib_constants.MULTI_REFS_PREFIX
        f.create_dataset(
            '_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % genome_key,
            data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        if rna_library.has_genomes(lib_type):
            lib_genomes = filt_mat.get_genomes()
        else:
            # A single None entry stands for "no genome" below.
            lib_genomes = [None]

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in lib_genomes:
            if genome is not None:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)
            else:
                mat = mat_lib
                genome_idx = None

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            median_features = np.median(
                mat.count_ge(axis=0,
                             threshold=cr_constants.MIN_COUNTS_PER_GENE))
            median_counts = np.median(mat.sum(axis=0))

            if genome is None:
                genome_prefix = lib_constants.MULTI_REFS_PREFIX
            else:
                genome_prefix = genome
            prefixes = (libtype_prefix, genome_prefix)

            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % prefixes]
                raw_reads = summary['%s%s_raw_mapped_reads' % prefixes]
                frac_reads_in_cells = tk_stats.robust_divide(flt_reads,
                                                             raw_reads)
                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'
                        % prefixes] = frac_reads_in_cells

            summary['%s%s_filtered_bcs_median_counts' % prefixes] = \
                median_counts
            summary['%s%s_filtered_bcs_median_unique_genes_detected'
                    % prefixes] = median_features

        # Compute frac reads in cells across all genomes
        prefixes = [(libtype_prefix, g) for g in lib_genomes if g is not None]
        if len(prefixes) == 0:
            prefixes = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(
            summary['%s%s_flt_mapped_reads' % p] for p in prefixes)
        raw_reads = sum(
            summary['%s%s_raw_mapped_reads' % p] for p in prefixes)

        frac_reads_in_cells = tk_stats.robust_divide(flt_reads, raw_reads)
        multi_key = '%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % (
            libtype_prefix, lib_constants.MULTI_REFS_PREFIX)
        summary[multi_key] = frac_reads_in_cells

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary), f, indent=4,
                  sort_keys=True)
def _get_total_reads(summary, library_type):
    """Return the '<prefix>total_reads' entry of `summary` as an int.

    The prefix is the metric prefix of `library_type`; a missing entry
    counts as 0.
    """
    metric_name = rna_library.get_library_type_metric_prefix(
        library_type) + 'total_reads'
    total_reads = summary.get(metric_name, 0)
    return int(total_reads)
def main(args, outs):
    """Compute VDJ contig/assembly metrics and pick the chain type filter.

    Args:
        args: stage arguments. Reads `annotations` (JSON) and `contigs`
            (FASTA); `contig_summary`, `umi_summary`, `filter_summary`,
            `cell_barcodes`, `assemble_metrics_summary` and `reads_summary`
            are optional and skipped when falsy (the two TSV summaries also
            when the file does not exist). `vdj_reference_path` locates the
            reference.
        outs: stage outputs. `chain_type` is set here and the metric summary
            is written to `summary`.
    """
    reporter = vdj_report.VdjReporter()

    # Set a default value of 0 for number of paired cells so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    reporter._get_metric_attr(
        'vdj_assembly_contig_pair_productive_full_len_bc_count',
        MULTI_REFS_PREFIX).set_value(0)

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    # Opened in a `with` block so the handle is closed once parsing is done.
    with open(args.annotations) as annotations_file:
        for annotation in json.load(annotations_file):
            contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    # The FASTA handle stays open only while the iterator is consumed.
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            # The contig name is the header text up to the first space.
            contig_name = contig_hdr.split(' ')[0]
            if filter_summary is not None and not vdj_utils.is_contig_filtered(
                    filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        lib_type_prefix = rna_library.get_library_type_metric_prefix(
            LIBRARY_TYPE)
        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, '%stotal_read_pairs' % lib_type_prefix)

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.
    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    # Parenthesized single-argument form prints the same under Python 2 and 3.
    print(chain_count)

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [
            ct for ct, count in chain_count.iteritems()
            if tk_stats.robust_divide(count, n_contigs) >=
            MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
def make_metric_name(name, library_type, genome, ss_type, ss_depth):
    """Build '<library-type prefix><genome>_<ss_type>_<ss_depth>_<name>'."""
    lt_prefix = rna_library.get_library_type_metric_prefix(library_type)
    fields = (lt_prefix, genome, ss_type, ss_depth, name)
    return '%s%s_%s_%s_%s' % fields