def join_matrices(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk matrix HDF5 files and write HDF5 and MEX outputs.

    The ``matrices_h5`` entry of every chunk output is passed to
    ``cr_matrix.merge_matrices``. The merged object is saved to
    ``outs.matrices_h5`` together with attributes built from the sample id,
    the gem groups and the chemistry description, and then saved to
    ``outs.matrices_mex``.

    ``chunk_defs`` is accepted but not used by this function.
    """
    merged = cr_matrix.merge_matrices(
        [chunk_out.matrices_h5 for chunk_out in chunk_outs])

    # Attributes stored alongside the merged matrix in the HDF5 output.
    chemistry_description = cr_chem.get_description(args.chemistry_def)
    h5_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, chemistry_description)

    merged.save_h5(outs.matrices_h5, extra_attrs=h5_attrs)
    merged.save_mex(outs.matrices_mex)
def join_matrices(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk matrix HDF5 files and write HDF5 and MEX outputs.

    The ``matrices_h5`` entry of every chunk output is passed to
    ``cr_matrix.merge_matrices``. The merged matrix is written to
    ``outs.matrices_h5`` via ``save_h5_file`` (with attributes built from the
    sample id, gem groups and chemistry description) and to
    ``outs.matrices_mex`` via ``rna_matrix.save_mex``, which is also given the
    pipelines version reported by ``martian``.

    ``chunk_defs`` is accepted but not used by this function.
    """
    merged = cr_matrix.merge_matrices(
        [chunk_out.matrices_h5 for chunk_out in chunk_outs])

    # Attributes stored alongside the merged matrix in the HDF5 output.
    chemistry_description = cr_chem.get_description(args.chemistry_def)
    h5_attrs = cr_matrix.make_matrix_attrs_count(
        args.sample_id, args.gem_groups, chemistry_description)
    merged.save_h5_file(outs.matrices_h5, extra_attrs=h5_attrs)

    pipelines_version = martian.get_pipelines_version()
    rna_matrix.save_mex(merged, outs.matrices_mex, pipelines_version)
def join(args, outs, chunk_defs, chunk_outs):
    """Combine chunk VCFs and merge the chunk allele-barcode matrices.

    Steps, in order:
      1. ``outs.coerce_strings()`` is called.
      2. The ``filtered_variants`` files of all chunks are combined into
         ``outs.filtered_variants`` with ``tk_io.combine_vcfs``.
      3. The raw and the likelihood allele-barcode matrices of all chunks are
         each merged with ``cr_matrix.merge_matrices``.
      4. Each merged matrix is saved as HDF5 and then as MEX (raw first, then
         likelihood).

    ``args`` and ``chunk_defs`` are accepted but not used by this function.
    """
    outs.coerce_strings()

    tk_io.combine_vcfs(
        outs.filtered_variants,
        [chunk_out.filtered_variants for chunk_out in chunk_outs])

    # Both merges happen before anything is written, as in the original flow.
    raw_merged = cr_matrix.merge_matrices(
        [chunk_out.raw_allele_bc_matrices_h5 for chunk_out in chunk_outs])
    likelihood_merged = cr_matrix.merge_matrices(
        [chunk_out.likelihood_allele_bc_matrices_h5
         for chunk_out in chunk_outs])

    destinations = (
        (raw_merged,
         outs.raw_allele_bc_matrices_h5,
         outs.raw_allele_bc_matrices_mex),
        (likelihood_merged,
         outs.likelihood_allele_bc_matrices_h5,
         outs.likelihood_allele_bc_matrices_mex),
    )
    for merged, h5_path, mex_path in destinations:
        merged.save_h5(h5_path)
        merged.save_mex(mex_path)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge subsampled chunk matrices per subsample key and write a summary.

    Chunks are grouped by ``(subsample_type, target_rpc, subsample_rate)``
    taken from each chunk definition's ``subsample_info``. For every group the
    raw and filtered chunk matrices are merged, metrics are computed on the
    merged filtered matrix through the merged reporter, and both merged
    matrices are saved to new paths that are recorded in
    ``outs.subsampled_matrices``. Finally the reporter summary is written to
    ``outs.summary``.

    If there are no chunks, or the first chunk definition has no
    ``subsample_type`` or ``subsample_rate``, ``outs.summary`` is set to
    ``None`` and nothing else is done.

    Args:
        args: Stage arguments (not used by this function).
        outs: Stage outputs; ``summary`` and ``subsampled_matrices`` are set.
        chunk_defs: Chunk definitions, each exposing a ``subsample_info`` dict.
        chunk_outs: Chunk outputs, each exposing ``chunked_reporter`` and a
            ``subsampled_matrices`` dict.

    Raises:
        AssertionError: If chunks in one group disagree on ``all_target_rpc``
            or an output matrix path already exists.
    """
    # Summarize genes and UMI counts.
    # The pairs are materialised into a list because zip() returns a one-shot
    # iterator without len() on Python 3; the emptiness check below and the
    # later sort both need a real sequence.
    chunks = list(zip(chunk_defs, chunk_outs))

    # Check for an empty chunk
    if (not chunks
            or chunk_defs[0].subsample_info.get('subsample_type') is None
            or chunk_defs[0].subsample_info.get('subsample_rate') is None):
        outs.summary = None
        return

    def chunk_key(chunk):
        """Return the grouping key of a (chunk_def, chunk_out) pair."""
        info = chunk[0].subsample_info
        return (info['subsample_type'],
                info['target_rpc'],
                info['subsample_rate'])

    # Merge reporter objects from main
    reporter_file_names = [
        chunk_out.chunked_reporter for chunk_out in chunk_outs
        if os.path.isfile(chunk_out.chunked_reporter)
    ]
    merged_reporter = cr_report.merge_reporters(reporter_file_names)

    outs.subsampled_matrices = []

    # Aggregate the molecule info chunks that belong together.
    # groupby only groups adjacent items, hence the sort by the same key.
    grouped = itertools.groupby(sorted(chunks, key=chunk_key), chunk_key)
    for chunk_group, (subsample_key, chunk_iter) in enumerate(grouped):
        subsample_type, target_rpc, subsample_rate = subsample_key

        if subsample_type is None or subsample_rate is None:
            continue

        # Aggregate information over chunks with same key
        chunk_raw_h5s = []
        chunk_filtered_h5s = []
        all_subsample_types = cr_constants.ALL_SUBSAMPLE_TYPES
        all_target_rpc = None

        for chunk_def, chunk_out in chunk_iter:
            # List of target rpcs should be identical among all chunks
            assert (all_target_rpc is None
                    or all_target_rpc
                    == chunk_def.subsample_info['all_target_rpc'])
            all_target_rpc = chunk_def.subsample_info['all_target_rpc']

            chunk_raw_h5s.append(
                chunk_out.subsampled_matrices['raw_matrices'])
            chunk_filtered_h5s.append(
                chunk_out.subsampled_matrices['filtered_matrices'])

        raw_matrices = cr_matrix.merge_matrices(chunk_raw_h5s)
        filtered_matrices = cr_matrix.merge_matrices(chunk_filtered_h5s)

        # Compute metrics on subsampled matrices
        merged_reporter.summarize_subsampled_matrices_cb(
            filtered_matrices, subsample_type, target_rpc)

        # Write the merged matrices
        outs.subsampled_matrices.append({
            'subsample_type': subsample_type,
            'target_rpc': target_rpc,
            'subsample_rate': subsample_rate,
            'all_subsample_types': all_subsample_types,
            'all_target_rpc': all_target_rpc,
            'raw_matrices': martian.make_path(
                '%s_%s_%s_raw_matrices.h5'
                % (subsample_type, target_rpc, chunk_group)),
            'filtered_matrices': martian.make_path(
                '%s_%s_%s_filtered_matrices.h5'
                % (subsample_type, target_rpc, chunk_group)),
        })

        # The output paths embed the group index, so they must not exist yet.
        assert not os.path.exists(
            outs.subsampled_matrices[-1]['raw_matrices'])
        assert not os.path.exists(
            outs.subsampled_matrices[-1]['filtered_matrices'])

        raw_matrices.save_h5(outs.subsampled_matrices[-1]['raw_matrices'])
        filtered_matrices.save_h5(
            outs.subsampled_matrices[-1]['filtered_matrices'])

    merged_reporter.report_summary_json(filename=outs.summary)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge aggregated matrices, write them out, and extend the summary.

    The function:
      * loads the JSON summary from ``args.summary`` and library/barcode
        information from the molecule file at ``args.molecules``;
      * merges the raw matrices, saves them as HDF5 and MEX, and writes a
        barcode summary HDF5 containing the barcode sequences and the
        per-barcode Gene Expression counts;
      * merges the filtered matrices and saves them as HDF5;
      * for every library type (and genome, where the library type has
        genomes) adds median counts, median detected features and the
        fraction of reads in filtered barcodes to the summary;
      * saves the filtered matrix as MEX and writes the summary JSON to
        ``outs.summary``.

    ``chunk_defs`` and ``chunk_outs`` are accepted but not used.
    """
    version = martian.get_pipelines_version()

    with open(args.summary) as summary_in:
        summary = json.load(summary_in)

    with MoleculeCounter.open(args.molecules, 'r') as mol_counter:
        library_info = mol_counter.get_library_info()
        barcode_info = mol_counter.get_barcode_info()
        barcode_seqs = mol_counter.get_barcodes()

    lib_types = sorted({lib['library_type'] for lib in library_info})

    # make attrs for user-added columns in aggr csv
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    # track original library/gem info
    extra_attrs.update(cr_matrix.make_library_map_aggr(args.gem_group_index))

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    raw_genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as bc_summary:
        cr_io.create_hdf5_string_dataset(
            bc_summary, cr_constants.H5_BC_SEQUENCE_COL, raw_matrix.bcs)

        gex_view = raw_matrix.view().select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
        gex_bc_counts = gex_view.sum(axis=0).astype('uint64')

        # A single genome is keyed by its own name, otherwise by the
        # multi-reference prefix.
        if len(raw_genomes) == 1:
            genome_key = raw_genomes[0]
        else:
            genome_key = lib_constants.MULTI_REFS_PREFIX
        dataset_name = (
            '_%s_transcriptome_conf_mapped_deduped_barcoded_reads'
            % genome_key)
        bc_summary.create_dataset(dataset_name, data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        # Library types without genomes are summarized once, with no genome.
        genomes = (filt_mat.get_genomes()
                   if rna_library.has_genomes(lib_type) else [None])

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in genomes:
            if genome is not None:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)
            else:
                mat = mat_lib
                genome_idx = None

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            median_features = np.median(
                mat.count_ge(axis=0,
                             threshold=cr_constants.MIN_COUNTS_PER_GENE))
            median_counts = np.median(mat.sum(axis=0))

            if genome is not None:
                genome_prefix = genome
            else:
                genome_prefix = lib_constants.MULTI_REFS_PREFIX
            prefixes = (libtype_prefix, genome_prefix)

            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % prefixes]
                raw_reads = summary['%s%s_raw_mapped_reads' % prefixes]
                frac_key = (
                    '%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'
                    % prefixes)
                summary[frac_key] = tk_stats.robust_divide(
                    flt_reads, raw_reads)

            counts_key = '%s%s_filtered_bcs_median_counts' % prefixes
            features_key = (
                '%s%s_filtered_bcs_median_unique_genes_detected' % prefixes)
            summary[counts_key] = median_counts
            summary[features_key] = median_features

        # Compute frac reads in cells across all genomes
        all_prefixes = [(libtype_prefix, g) for g in genomes if g is not None]
        if not all_prefixes:
            all_prefixes = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(
            summary['%s%s_flt_mapped_reads' % p] for p in all_prefixes)
        raw_reads = sum(
            summary['%s%s_raw_mapped_reads' % p] for p in all_prefixes)

        multi_key = (
            '%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'
            % (libtype_prefix, lib_constants.MULTI_REFS_PREFIX))
        summary[multi_key] = tk_stats.robust_divide(flt_reads, raw_reads)

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as summary_out:
        json.dump(tk_safe_json.json_sanitize(summary), summary_out,
                  indent=4, sort_keys=True)