def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    constants = load_h5(args.sc_norm_profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]

    # maximum memory usage is the maximum of these four values:
    # sumbins = sum(len(c) for c in primary_contigs)/window_size
    # maxbins = max(len(c) for c in all_contigs)/window_size
    # X + Q + H = ((2*sizeof(i8) + 2*sizeof(f32)) * ncells * sumbins)
    # occupancy = sizeof(f32) * levels(=6) * (ncells - 1) * sumbins / nchunks(=100)
    # het = X + Q + H + occupancy
    # X + Y + Z = ((2*sizeof(float)) * ncells * maxbins)
    # merged_bed = sc_cnv_calls_bed + internal_cnv_calls_bed
    # unmerged_bed = sc_unmerged_cnv_calls_bed + internal_unmerged_cnv_calls_bed
    # * NOTE: ask for double the matrix sizes to account for intermediate values
    f32sz = 4
    sumbins = sum(ref.contig_lengths[c]/window_size+1
                  for c in ref.primary_contigs())
    maxbins = max(ref.contig_lengths[c]/window_size+1
                  for c in ref.list_all_contigs())
    XQH_mem_gb = float((2 + 2*f32sz) * ncells * sumbins)/1e9
    occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins/100)/1e9
    het_mem_gb = XQH_mem_gb + occ_mem_gb
    XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9
    merged_bed_gb = os.path.getsize(args.sc_cnv_calls)/1e9 + \
        os.path.getsize(args.internal_cnv_calls)/1e9 + 1
    unmerged_bed_gb = os.path.getsize(args.sc_unmerged_cnv_calls)/1e9 + \
        os.path.getsize(args.internal_unmerged_cnv_calls)/1e9 + 1
    mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb, merged_bed_gb,
                             unmerged_bed_gb))) + 3

    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
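# Illustrative sizing only (assumed inputs, not pipeline values): with
# ncells = 1000, window_size = 20000, ~3.1 Gb of primary contigs and a
# ~249 Mb largest contig, the estimate above works out roughly as follows.
#
#   sumbins ~ 3.1e9 / 20000 = 155,000
#   maxbins ~ 2.49e8 / 20000 = 12,450
#   het     ~ (2 + 8) * 1000 * 155000 / 1e9 + 4*6*999*155000/100/1e9 ~ 1.6 GB
#   XYZ     ~ 2 * 4 * 1000 * 12450 / 1e9 ~ 0.1 GB
#
# With small BED inputs (~1 GB each after the +1 padding), the join would
# request mem_gb = ceil(1.6) + 3 = 5 GB.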
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    profiles, gc, mask = load_data(args.raw_profiles, args.tracks, chroms)

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))
    scale = gc_norm_params["scale"]
    linear = gc_norm_params["linear"]
    quadratic = gc_norm_params["quadratic"]

    norm_profiles = gc_normalize(profiles, gc, linear, quadratic, chroms)

    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)
    coverage_matrix.store_matrix(file_name=outs.normalized_profiles,
                                 chroms=chroms,
                                 profiles=norm_profiles,
                                 tracks=None,
                                 window_size=bin_size,
                                 masks=mask,
                                 dtype="float32")

    store = pd.HDFStore(outs.normalized_profiles, "a")
    constants = load_h5(args.raw_profiles, "constants")
    store["constants"] = constants
    store["/gc_params/scale"] = pd.Series(scale)
    store["/gc_params/linear"] = pd.Series(linear)
    store["/gc_params/quadratic"] = pd.Series(quadratic)
    store.close()
def split(args):
    constants = load_h5(args.cluster_data, "constants").to_dict()
    matsize_gb = float(constants["ncells"] * constants["genomebins"]) / 1e9
    return {'chunks': [], 'join': {'__mem_gb': int(np.ceil(4 * matsize_gb + 1))}}
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.list_all_contigs()
    max_chrom_size = max([ref.contig_lengths[chrom] for chrom in chroms])

    constants = load_h5(args.profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]

    max_mat_size_gb = float(2 * ncells * max_chrom_size / window_size) / 1e9 * 4
    mem_gb = int(np.ceil(max_mat_size_gb * 4 + 1))

    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
def split(args):
    constants = load_h5(args.cnv_tracks, "constants").to_dict()
    matsize_gb = float(constants["ncells"] * constants["genomebins"]) / 1e9
    return {'chunks': [], 'join': {'__mem_gb': int(np.ceil(matsize_gb * 12 + 2))}}
def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))
    ncells = len(sc_gc_params['linear'])
    nnodes = 2*ncells - 1
    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])

    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')

    call_files = [[args.sc_cnv_calls, args.internal_cnv_calls],
                  [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]

    for pair, out in zip(call_files, out_calls):
        with open(tmp, 'w') as outf:
            for f in pair:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == pair[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace chrom name with its index for numeric sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(LC_ALL='C')
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024**3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # map the chrom index back to the chrom name and write the outfile
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))
        os.remove(tmp)
        os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values

    constants = load_h5(args.sc_cnv_tracks, "constants")
    sc_ploidy_conf = scale_confidence_score(
        load_h5(args.sc_cnv_tracks, "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(
        load_h5(args.internal_cnv_tracks, "ploidy_conf").values)
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")
    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")

    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()

    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close()
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore(args.tree_data, "r")
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close()

    # Compute the heterogeneity at every *internal* node of the tree.
    # The heterogeneity is zero at every leaf, so don't store a bunch of zeros.
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)
    del Q

    # dump to disk
    store = pd.HDFStore(outs.tree_data, "w")
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close()
    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")

    out_store["/constants"] = sc_store["/constants"]

    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/"+chrom].values
        Y = internal_store["/contigs/"+chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        Z = np.zeros((2*ncells-1, nbins), dtype=X.dtype)
        Z[:ncells, :] = X
        Z[ncells:, :] = Y
        out_store["/contigs/"+chrom] = pd.DataFrame(Z)
        del X, Y, Z

        ## next do the /masks
        out_store["/masks/"+chrom] = sc_store["/masks/"+chrom]

    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/"+key] = pd.concat(
            [sc_store["/gc_params/"+key], internal_store["/gc_params/"+key]],
            ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] = sc_store["/normalization_metrics"].append(
        internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()
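# Note on the node indexing assumed throughout the join above: a binary tree
# over ncells leaves has ncells - 1 internal nodes, so nnodes = 2*ncells - 1;
# rows 0..ncells-1 hold single cells and rows ncells..2*ncells-2 hold internal
# nodes, which is why internal-node ids in the merged BED files are offset by
# ncells. A hypothetical helper (not part of the pipeline) illustrating the
# convention:
def _is_internal_node(node_id, ncells):
    """Illustrative only: internal nodes follow the ncells single cells."""
    return node_id >= ncells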
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    start_cell = args.chunk["start"]
    end_cell = args.chunk["end"]

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles, args.tracks, chroms,
        start_cell=start_cell, end_cell=end_cell,
        integer=False, rounding=False,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = gmask[0:bdy[i]].sum()

    ## load GC info and create GC track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()

    ## if calling on nodes use scale factors from cells
    if args.is_singlecell:
        scale_guess_chunk = [None for _ in xrange(start_cell, end_cell)]
        cell_offset = 0
    else:
        scale_guess = load_h5(args.profiles, "scale_guess")
        scale_guess_chunk = [[s] for s in scale_guess[start_cell:end_cell]]
        ## num cells = num internal nodes + 1
        cell_offset = args.chunk["ncells"] + 1

    ncells = X.shape[0]
    nbins = gmask.sum()

    P = 2 * np.ones((ncells, nbins), dtype="int8")
    S = np.zeros((ncells, nbins), dtype=bool)
    sdfs = []
    scale_factors = np.zeros(ncells)
    pconf = np.zeros(ncells)
    windows = np.zeros(ncells, dtype=int)

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))

    ## initialize parameters
    read_threshold = crdna.constants.BREAKPOINT_READ_THRESHOLD
    heuristics = crdna.constants.BREAKPOINT_CALLER_HEURISTICS

    ## override/augment heuristics by supplied params
    if args.params is not None:
        for k, v in args.params.iteritems():
            heuristics[k] = v

    ## log heuristics used
    martian.log_info("Heuristics used:")
    for k, v in heuristics.iteritems():
        martian.log_info("%s: %s" % (str(k), str(v)))

    debug_out = open(outs.debug, "w")
    if len(ref.list_species()) == 1:
        for i in xrange(ncells):
            debug_out.write("-" * 80 + "\n")
            debug_out.write("Cell %d\n" % (cell_offset + start_cell + i))

            ## GC coefficients
            gc_linear = gc_norm_params["linear"][start_cell + i]
            gc_quadratic = gc_norm_params["quadratic"][start_cell + i]

            ## GC correction track for cell
            xi = parabola(gctrack, crdna.constants.GC_ORIGIN,
                          gc_linear, gc_quadratic)
            xi_low = parabola(crdna.constants.MIN_GC, crdna.constants.GC_ORIGIN,
                              gc_linear, gc_quadratic)
            xi_high = parabola(crdna.constants.MAX_GC, crdna.constants.GC_ORIGIN,
                               gc_linear, gc_quadratic)
            xi[gctrack < crdna.constants.MIN_GC] = xi_low
            xi[gctrack > crdna.constants.MAX_GC] = xi_high

            y = X[i][gmask]

            ## do the CNV calling
            ploidy, S[i], gap, sdf, sf = call_cnvs(
                y, xi, ref, cbdy,
                scale_guess=scale_guess_chunk[i],
                log_func=debug_out.write,
                **heuristics)
            scale_factors[i] = sf
            sdfs.append(sdf)
            P[i] = np.clip(ploidy, 0, np.iinfo("int8").max - 1)
            pconf[i] = gap
            windows[i] = get_segment_window_size(y, read_threshold)
            debug_out.flush()
    debug_out.close()

    out = pd.HDFStore(outs.denoised_profiles, "w")
    out["/quantized"] = pd.DataFrame(P)
    out["/segment_index"] = pd.DataFrame(S)
    out["/scaling_data"] = pd.Series(sdfs)
    out["/scale_factor"] = pd.Series(scale_factors)
    out["/ploidy_conf"] = pd.Series(pconf)
    out["/windows"] = pd.Series(np.clip(windows, 1, None))
    out.close()
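# The GC correction above evaluates `parabola` at each bin's GC fraction and
# clamps bins outside [MIN_GC, MAX_GC] to the curve's endpoint values. A
# minimal sketch of what such a quadratic bias model could look like, assuming
# a curve normalized to 1 at GC_ORIGIN; the actual crdna `parabola`
# implementation may differ:
def _parabola_sketch(gc, origin, linear, quadratic):
    """Illustrative only: expected relative coverage as a quadratic in GC."""
    delta = np.asarray(gc, dtype=float) - origin
    return 1.0 + linear * delta + quadratic * delta ** 2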
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles, args.tracks, chroms,
        integer=False, rounding=False,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)
    nbins = gmask.sum()

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = gmask[0:bdy[i]].sum()

    ## load GC info and create GC track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))

    ## Aggregate data structures from individual chunks
    P = np.zeros((0, nbins), dtype="int8")   # ploidy per cell
    S = np.zeros((0, nbins), dtype=bool)     # segment index per cell
    sdfs = []                                # scaling dataframes per cell
    scale_factors = []                       # scale factor per cell
    pconf = np.zeros((0, ), dtype=float)     # scaling confidence per cell
    windows = np.zeros((0, ), dtype=int)     # segment window size per cell

    logger = sys.stdout.write

    ## add logging info from chunks
    for chunk_out in chunk_outs:
        with open(chunk_out.debug, "r") as debug_in:
            for line in debug_in:
                logger(line)
        logger("\n" + "*" * 80 + "\n")

    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        start_cell = chunk_def.chunk["start"]
        end_cell = chunk_def.chunk["end"]

        chunk_store = pd.HDFStore(chunk_out.denoised_profiles, "r")
        p_chunk = chunk_store["/quantized"].values
        s_chunk = chunk_store["/segment_index"].values
        sf_chunk = chunk_store["/scale_factor"].values
        pc_chunk = chunk_store["/ploidy_conf"].values
        sd_chunk = list(chunk_store["/scaling_data"])
        w_chunk = chunk_store["/windows"].values
        chunk_store.close()

        if P.shape[0] == 0:
            ncells = chunk_def.chunk["ncells"]
            nbins = p_chunk.shape[1]
            P = np.zeros((ncells, nbins), dtype="int8")
            S = np.zeros((ncells, nbins), dtype=bool)
            scale_factors = np.zeros(ncells, dtype=float)
            pconf = np.zeros(ncells, dtype=float)
            windows = np.zeros(ncells, dtype=int)

        P[start_cell:end_cell, :] = p_chunk
        S[start_cell:end_cell, :] = s_chunk
        scale_factors[start_cell:end_cell] = sf_chunk
        sdfs.extend(sd_chunk)
        pconf[start_cell:end_cell] = pc_chunk
        windows[start_cell:end_cell] = w_chunk

    ## Find cells with low scaling confidence
    fix_scaling = np.zeros(0, dtype=int)
    if args.is_singlecell:
        cell_offset = 0
        fix_scaling = np.where((pconf >= 0) & (pconf <= 0.02))[0]
    else:
        cell_offset = X.shape[0] + 1
    good_cells = np.where(np.logical_or(pconf == -2, pconf > 0.10))[0]

    agg_window = int(np.median(windows if len(windows) > 0 else [sum(gmask)]))
    X_agg = aggregate_matrix(X[:, gmask], agg_window)

    ## initialize parameters
    heuristics = crdna.constants.BREAKPOINT_CALLER_HEURISTICS

    ## override/augment heuristics by supplied params
    if args.params is not None:
        for k, v in args.params.iteritems():
            heuristics[k] = v

    logger("%d cells with low ploidy confidence\n" % len(fix_scaling))
    for cell in fix_scaling:
        if len(good_cells) == 0:
            continue
        logger("-" * 80 + "\n")
        logger("Fixing cell %d\n" % (cell + cell_offset))

        ## GC coefficients
        gc_linear = gc_norm_params["linear"][cell]
        gc_quadratic = gc_norm_params["quadratic"][cell]

        ## GC correction track for cell
        xi = parabola(gctrack, crdna.constants.GC_ORIGIN,
                      gc_linear, gc_quadratic)
        xi_low = parabola(crdna.constants.MIN_GC, crdna.constants.GC_ORIGIN,
                          gc_linear, gc_quadratic)
        xi_high = parabola(crdna.constants.MAX_GC, crdna.constants.GC_ORIGIN,
                           gc_linear, gc_quadratic)
        xi[gctrack < crdna.constants.MIN_GC] = xi_low
        xi[gctrack > crdna.constants.MAX_GC] = xi_high

        y = X[cell][gmask]

        ## Find the correlation distance to all cells that were scaled
        ## confidently. Then take all matches with > 90% correlation and
        ## compute the median ploidy over these cells. Find the closest
        ## scaling solution to the median and declare that the answer.
        all_corrs = compute_corr_dist_to_all(X_agg[cell][np.newaxis, :], X_agg)
        good_corrs = all_corrs[good_cells]
        best_matches = good_cells[good_corrs > 0.90]
        if len(best_matches) == 0:
            continue
        best_guess_ploidy = np.median(P[best_matches, :].mean(axis=1))
        best_scaling_soln = np.argmin(
            np.abs(sdfs[cell]["aploidy"].values - best_guess_ploidy))
        lam_best = sdfs[cell].loc[best_scaling_soln]["lam"]

        sindex = S[cell]
        segment_bdy2 = get_segment_bdy_from_index(sindex)
        window = get_segment_window_size(y, heuristics["ll_read_threshold"])
        segment_means2, _ = compute_segment_data(y, xi, segment_bdy2, window)
        ploidy = get_ploidy_vector(y, segment_means2, segment_bdy2, lam_best)

        delta_ploidy = np.abs(P[cell].mean() - ploidy.mean())
        logger("Ploidy: %.2f -> %.2f\n" % (P[cell].mean(), ploidy.mean()))
        P[cell] = np.clip(ploidy, 0, np.iinfo("int8").max - 1).astype("int8")
        if delta_ploidy > 0.1:
            pconf[cell] = -4

    ## Compute read depth
    depth = np.zeros_like(pconf)
    for cell in xrange(len(pconf)):
        depth[cell] = X[cell][gmask].mean()

    ## Write data to h5
    store = pd.HDFStore(outs.denoised_profiles, "w")
    store["/quantized"] = pd.DataFrame(P)
    store["/scale_factor"] = pd.Series(scale_factors)
    store["/reads_per_bin"] = pd.Series(depth)
    store["/segment_index"] = pd.DataFrame(S)
    store["/ploidy_conf"] = pd.Series(pconf)
    store["/scaling_data"] = pd.Series(sdfs)
    store["/windows"] = pd.Series(windows)

    segment_windows = int(np.median(windows)) if len(windows) else 1
    constants = load_h5(args.profiles, "constants").to_dict()
    constants["segment_windows"] = segment_windows
    store["constants"] = pd.Series(constants)
    store.close()
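# The rescue step above treats the return value of `compute_corr_dist_to_all`
# as a similarity (matches require > 0.90), i.e. a correlation of one
# aggregated profile against every row of X_agg. An illustrative stand-in,
# not the package implementation:
def _corr_to_all_sketch(profile, X_agg):
    """Illustrative only: Pearson correlation of `profile` vs each row of X_agg."""
    p = np.ravel(profile).astype(float)
    return np.array([np.corrcoef(p, row)[0, 1]
                     for row in np.asarray(X_agg, dtype=float)])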
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    stats = pd.read_csv(args.barnyard)
    stats = stats[stats['cell_id'] != 'None'].copy()
    ncells = len(stats)
    martian.log_info('Subsetting per-barcode statistics to %d cells' % ncells)

    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = [
        ref.contig_lengths[k]
        for k in ref.primary_contigs(allow_sex_chromosomes=True)
    ]
    tot_ref_len = float(sum(contig_lengths))
    martian.log_info('Reference sequence at %s has %d bp' %
                     (args.reference_path, tot_ref_len))

    #
    # Accumulate per-cell summary stats
    #
    PER_CELL_HEADER = [
        'barcode', 'cell_id', 'total_num_reads', 'num_unmapped_reads',
        'num_lowmapq_reads', 'num_duplicate_reads', 'num_mapped_dedup_reads',
        'frac_mapped_duplicates', 'effective_depth_of_coverage',
        'effective_reads_per_1Mbp', 'raw_mapd', 'normalized_mapd',
        'raw_dimapd', 'normalized_dimapd', 'mean_ploidy', 'ploidy_confidence',
        'is_high_dimapd', 'is_noisy'
    ]

    # unusable reads are reads from non-cell barcodes that are any of
    # mapped, low mapq, or duplicates;
    # no_barcode reads are reads whose barcode is not on the whitelist
    num_dups = stats['dups']
    num_lowmapq = stats['low_mapq_lt_30']
    num_unmapped = stats['no_barcode'] + stats['unusable_read'] + stats['unmapped']
    num_mapped = stats['mapped']
    assert all(num_unmapped + num_dups + num_lowmapq + num_mapped ==
               stats['denominator'])

    per_cell = pd.DataFrame(columns=PER_CELL_HEADER)
    per_cell['barcode'] = stats['BC']
    per_cell['cell_id'] = np.arange(0, ncells)
    per_cell['num_mapped_dedup_reads'] = num_mapped
    per_cell['frac_mapped_duplicates'] = stats['dups_frac']
    per_cell['num_unmapped_reads'] = num_unmapped
    per_cell['num_lowmapq_reads'] = num_lowmapq
    per_cell['num_duplicate_reads'] = num_dups
    per_cell['total_num_reads'] = stats['denominator']
    per_cell['effective_depth_of_coverage'] = \
        stats['num_mapped_bases'].astype(float) / tot_ref_len
    per_cell['effective_reads_per_1Mbp'] = np.round(
        stats['mapped'] / (tot_ref_len / 1e6)).astype(int)

    flat_metrics = load_h5(args.norm_node_profiles, 'normalization_metrics')
    per_cell['raw_mapd'] = flat_metrics['raw_mapd'].iloc[0:ncells].values
    per_cell['normalized_mapd'] = flat_metrics['norm_mapd'].iloc[0:ncells].values
    per_cell['raw_dimapd'] = flat_metrics['raw_dimapd'].iloc[0:ncells].values
    per_cell['normalized_dimapd'] = flat_metrics['norm_dimapd'].iloc[0:ncells].values

    mean_ploidy, num_altevents = process_cnv_metrics(ncells, args.node_cnv_calls,
                                                     DEFAULT_CONFIDENCE)
    per_cell['mean_ploidy'] = mean_ploidy

    # per cell confidence score
    pconf = load_h5(args.node_cnv_tracks, "ploidy_conf").values[0:ncells]
    per_cell['ploidy_confidence'] = pconf

    # is noisy cell flag
    high_dimapd = flat_metrics['is_high_dimapd'].iloc[0:ncells].values
    per_cell['is_high_dimapd'] = high_dimapd
    # cells with low confidence, or cells whose ploidy estimate was
    # overruled using high confidence cells
    low_ploidy_conf = np.logical_or(pconf == -4, (pconf >= 0) & (pconf <= 2))
    per_cell['is_noisy'] = np.logical_or(high_dimapd == 1,
                                         low_ploidy_conf).astype(int)

    with open(outs.per_cell_summary_metrics, 'w') as outfile:
        per_cell.to_csv(outfile, columns=PER_CELL_HEADER, index=False)

    #
    # Accumulate per-analysis summary stats
    #

    # combine mask vectors to calculate genome-wide mappability
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    masks, _ = load_mask_bdy(args.norm_node_profiles, chroms)

    with open(args.report_basic, 'r') as infile:
        report_basic = json.load(infile)
    with open(args.singlecell_summary, 'r') as infile:
        singlecell_summary = json.load(infile)

    per_analysis = {
        'total_num_bases_R1': report_basic['r1_tot_bases'],
        'total_num_bases_R1_Q30': report_basic['r1_q30_bases'],
        'total_num_bases_R2': report_basic['r2_tot_bases'],
        'total_num_bases_R2_Q30': report_basic['r2_q30_bases'],
        'frac_bases_R1_Q30': tk_stats.robust_divide(report_basic['r1_q30_bases'],
                                                    report_basic['r1_tot_bases']),
        'frac_bases_R2_Q30': tk_stats.robust_divide(report_basic['r2_q30_bases'],
                                                    report_basic['r2_tot_bases']),
        'total_num_reads': report_basic['num_reads'],
        'total_num_reads_in_cells': per_cell['total_num_reads'].sum(),
        'total_num_mapped_dedup_reads_in_cells':
            per_cell['num_mapped_dedup_reads'].sum(),
        'mean_mapped_dedup_reads_per_cell':
            per_cell['num_mapped_dedup_reads'].mean(),
        'median_frac_mapped_duplicates_per_cell':
            np.median(per_cell['frac_mapped_duplicates']),
        'num_cells': ncells,
        'median_effective_reads_per_1Mbp':
            np.median(per_cell['effective_reads_per_1Mbp']),
        'frac_mappable_bins': tk_stats.robust_divide(sum(masks), len(masks)),
        'frac_noisy_cells': tk_stats.robust_divide(sum(per_cell['is_noisy']),
                                                   len(per_cell['is_noisy'])),
        'shortest_primary_contig': min(contig_lengths),
        'frac_non_cell_barcode':
            singlecell_summary['frac_waste_non_cell_barcode'],
        'correct_bc_rate': report_basic['correct_bc_rate'],
        'median_unmapped_frac': singlecell_summary['median_unmapped_frac'],
    }

    for prefix in ["normalized", "raw"]:
        for metric in ["mapd", "dimapd"]:
            per_cell_key = "%s_%s" % (prefix, metric)
            for perc in [25, 50, 75]:
                summary_key = "%s_%s_p%d" % (prefix, metric, perc)
                per_analysis[summary_key] = tk_stats.robust_percentile(
                    per_cell[per_cell_key], perc)

    for cutoff in (25, 50, 75):
        k = 'mean_ploidy_p{:.2g}'.format(cutoff)
        per_analysis[k] = tk_stats.robust_percentile(per_cell['mean_ploidy'],
                                                     cutoff)
    per_analysis['median_ploidy'] = per_analysis['mean_ploidy_p50']

    with open(outs.summary, 'w') as outfile:
        outfile.write(tk_json.safe_jsonify(per_analysis, pretty=True) + os.linesep)

    SUMMARY_METRICS = [
        'total_num_reads', 'frac_bases_R1_Q30', 'frac_bases_R2_Q30',
        'correct_bc_rate', 'frac_non_cell_barcode', 'shortest_primary_contig',
        'frac_mappable_bins', 'num_cells', 'total_num_reads_in_cells',
        'total_num_mapped_dedup_reads_in_cells',
        'median_frac_mapped_duplicates_per_cell',
        'mean_mapped_dedup_reads_per_cell', 'median_effective_reads_per_1Mbp',
        'median_unmapped_frac', 'mean_ploidy_p25', 'mean_ploidy_p50',
        'mean_ploidy_p75', 'raw_mapd_p25', 'raw_mapd_p50', 'raw_mapd_p75',
        'normalized_mapd_p25', 'normalized_mapd_p50', 'normalized_mapd_p75',
        'normalized_dimapd_p25', 'normalized_dimapd_p50',
        'normalized_dimapd_p75', 'raw_dimapd_p25', 'raw_dimapd_p50',
        'raw_dimapd_p75', 'frac_noisy_cells'
    ]

    with open(outs.summary_cs, 'w') as outfile:
        values = [per_analysis[key] for key in SUMMARY_METRICS]
        outfile.write(",".join(SUMMARY_METRICS) + "\n")
        outfile.write(",".join(map(str, values)) + "\n")