import sys

from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import scipy.optimize

## Module-level constants (GC_ORIGIN, GC_RES, MIN_GC, MAX_GC, NUM_GC_BINS,
## MIN_POINTS_PER_BIN, MAPPABILITY_THRESHOLD) and helpers (aggregate_counts,
## estimate_window_size, minimize_entropy, gc_curve, get_delta_gc_cv,
## log_prior_ploidy, objective_function, contig_manager) are assumed to be
## defined elsewhere in this package.


def estimate_gc_norm_cell(profile, gc, mask):
    """ Estimate per-cell GC bias coefficients (linear, quadratic) of a
    quadratic GC curve by entropy minimization on the masked, aggregated
    profile."""
    window_size = estimate_window_size(profile[mask])
    profile = aggregate_counts(profile[mask].astype(float), window_size)
    gc = aggregate_counts(gc[mask].astype(float), window_size) / window_size
    _, linear, quadratic = minimize_entropy(profile, gc)
    g = gc - GC_ORIGIN
    g2 = g * g
    parabola = gc_curve(g, g2, linear, quadratic).ravel()
    norm_profile = profile / parabola
    delta_gc_cv = get_delta_gc_cv(profile, norm_profile, gc)
    ## a negative delta CV means normalization made things worse, so fall
    ## back to no GC correction
    if delta_gc_cv < 0:
        linear = 0.0
        quadratic = 0.0
        delta_gc_cv = 0.0
    return linear, quadratic
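## Illustrative helper (not part of the original module): apply the
## coefficients returned by estimate_gc_norm_cell to flatten a raw profile.
## Assumes `profile` and `gc` are arrays at the same bin resolution and
## reuses the gc_curve/GC_ORIGIN conventions above.
def apply_gc_norm_cell(profile, gc, linear, quadratic):
    g = gc - GC_ORIGIN
    parabola = gc_curve(g, g * g, linear, quadratic).ravel()
    return profile / parabola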
def add_cell_plots_to_axes(Y, axes, bdy=None, plot_res=10, plot_args={}):
    """ Plot each cell's mean-normalized profile at plot_res aggregation on
    its own axis, marking chromosome boundaries from bdy if supplied."""
    ncells, nbins = Y.shape
    for i in xrange(ncells):
        z = aggregate_counts(Y[i], plot_res)
        axes[i].plot(z / z.mean(), rasterized=True, **plot_args)
        axes[i].set_ylim(0, 5)
        axes[i].set_yticks(range(1, 5))
        if bdy is not None:
            for x in bdy:
                axes[i].axvline(x / plot_res, color="k")
            if i < ncells - 1:
                axes[i].set_xticks([])
            else:
                axes[i].set_xticks(bdy / plot_res)
                axes[i].set_xticklabels(axes[i].get_xticks().astype(int),
                                        rotation=45)
            # if i else
        # if bdy
        axes[i].set_xlim(0, nbins / plot_res)
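## Example usage (sketch, with hypothetical data): stack all cells into one
## figure. Assumes matplotlib is available and ncells > 1 so plt.subplots
## returns an array of axes.
#
# import matplotlib.pyplot as plt
# fig, axes = plt.subplots(Y.shape[0], 1, sharex=True, figsize=(12, 8))
# add_cell_plots_to_axes(Y, axes, bdy=chrom_bdy, plot_res=10,
#                        plot_args={"color": "C0", "linewidth": 0.5})
# fig.savefig("cell_profiles.png", dpi=150)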
def find_best_scale_v15(y, xi, segment_bdy, segment_means, segment_lengths,
                        ref, cbdy, window, scale_guess, max_ploidy_long=10,
                        zero_ploidy_count=200, max_segment_push_to_zero=200,
                        prior_params={"prior_mean": 2.0, "prior_std": 0.5},
                        min_ploidy=None, max_ploidy=None, verbose=True,
                        log_func=sys.stdout.write):
    """ Find the best scaling factor that turns a segmented profile into an
    integer copy number profile. min_ploidy and max_ploidy override the
    algorithm. See comments for explanation."""

    def print_scaling_solutions(scaling_data, order, best_index=None):
        log_func("Scaling solutions:\n")
        header = ["lam", "aploidy", "dom_pdy", "dom_frac", "lprior",
                  "entropy", "nfit", "dpcv"]
        log_func("|".join([h.rjust(8) for h in header]) + "\n")
        log_func("-" * (len("|".join([h.rjust(8) for h in header]))) + "\n")
        for x in order:
            if scaling_data[x]["fopt"] is None:
                log_func("NO DATA\n")
                continue
            row = []
            for h in header:
                value = ("%.2f" % scaling_data[x][h]
                         if h in scaling_data[x] else "NAN")
                row.append(value.rjust(8))
            log_func(" ".join(row) + " ")
            if best_index is not None and best_index == x:
                log_func("X")
            log_func("\n")

    is_sex_chrom_bin = np.zeros_like(y, dtype=bool)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    for i, chrom in enumerate(chroms):
        if chrom in ref.sex_chromosomes:
            is_sex_chrom_bin[cbdy[i]:cbdy[i+1]] = True

    ## assertions
    assert prior_params["prior_mean"] > 0, "prior mean must be positive"
    assert prior_params["prior_std"] > 0, "prior std. dev. must be positive"

    ## find all the candidate scaling factors
    scaling_data0 = find_scaling_candidates_v4(y, segment_bdy, segment_means,
        segment_lengths, is_sex_chrom_bin, window, scale_guess,
        max_ploidy_long=max_ploidy_long, zero_ploidy_count=zero_ploidy_count,
        max_segment_push_to_zero=max_segment_push_to_zero, verbose=verbose,
        log_func=log_func)
    log_func("INITIAL CANDIDATES\n")
    print_scaling_solutions(scaling_data0, range(len(scaling_data0)), None)

    ## if we only find one solution, there's nothing to do
    if len(scaling_data0) == 1:
        log_func("Only 1 solution, nothing to do.\n")
        if verbose:
            print_scaling_solutions(scaling_data0, [0], 0)
        return scaling_data0[0]["lam"], -3, pd.DataFrame(scaling_data0)

    nbins = len(y)

    ## compute DPCV after compensating for ploidy & GC
    for si, data in enumerate(scaling_data0):
        lam = data["lam"]
        segment_ploidy = np.round(segment_means / lam).astype(int)
        blocks = defaultdict(list)
        for i, p in enumerate(segment_ploidy):
            blocks[p].append(segment_bdy[i])
        blen = []
        bdpcv = []
        for p, segs in blocks.iteritems():
            seg_len = 0
            for s, e in segs:
                seg_len += e - s
            if seg_len < 0.01 * nbins:
                continue
            counts = np.zeros(seg_len)
            pos = 0
            for s, e in segs:
                counts[pos:pos+(e-s)] = y[s:e] / xi[s:e] / max(p, 0.1)
                pos += e - s
            counts_agg = aggregate_counts(counts, window)
            dpcv = (np.sqrt(np.abs(counts_agg.std()**2 - counts_agg.mean())) /
                    counts_agg.mean())
            blen.append(seg_len)
            bdpcv.append(dpcv)
        blen = np.array(blen)
        bdpcv = np.array(bdpcv)
        data["dpcv"] = (blen * bdpcv).sum() / blen.sum()
        data["lprior"] = log_prior_ploidy(data["aploidy"],
                                          mu=prior_params["prior_mean"],
                                          sig=prior_params["prior_std"])
        data["nfit"] = data["fopt"] / segment_lengths.sum()

    ## Throw away candidates with unusually high DPCV. This removes the ploidy
    ## 1 solutions that we see when there is noise. Also remove high ploidy
    ## solutions.
    MIN_PLOIDY = 0.0
    MAX_PLOIDY = 8.0
    ploidies = np.array([data["aploidy"] for data in scaling_data0])
    pfilter = ploidies < MAX_PLOIDY

    ## Filter based on the dpcv of likely good solutions. This is mainly to
    ## eliminate the ploidy 1 case. Take the min of the top 3 solutions, since
    ## very high ploidy solutions will have low dpcv as well, but they are
    ## garbage.
    dpcvs = np.array([data["dpcv"] for data in scaling_data0])
    target_dpcv = np.min(dpcvs[:3])
    entropy = np.array([data["entropy"] for data in scaling_data0])
    target_entropy = np.max(entropy[:3])
    outliers = np.logical_or(dpcvs > target_dpcv + 0.06,
                             entropy < target_entropy - 1.0)
    select = ~outliers & pfilter
    if select.sum() == 0:
        if pfilter.sum() > 0:
            select = pfilter
        else:
            select = np.ones(len(scaling_data0), dtype=bool)
    scaling_data = []
    for i, data in enumerate(scaling_data0):
        if select[i]:
            scaling_data.append(data)

    ## if min_ploidy or max_ploidy is specified, override the decision making:
    ## - find all solutions with ploidy in (min_ploidy, max_ploidy)
    ## - if empty, find the solutions that are closest to the end points
    ## - amongst the restricted solutions as determined above, pick the
    ##   solution with the least objective function value
    if min_ploidy is not None or max_ploidy is not None:
        pmin = min_ploidy if min_ploidy is not None else MIN_PLOIDY
        pmax = max_ploidy if max_ploidy is not None else MAX_PLOIDY
        assert pmin <= pmax, "min_ploidy <= max_ploidy must be true"
        log_func("Applying min ploidy & max ploidy: %d-%d\n" % (pmin, pmax))
        filtered_data = [data for data in scaling_data
                         if (data["aploidy"] > pmin and
                             data["aploidy"] < pmax)]
        if len(filtered_data) == 0:
            ploidies = np.array([data["aploidy"] for data in scaling_data0])
            pmin_nbr = np.argmin(np.abs(ploidies - pmin))
            pmax_nbr = np.argmin(np.abs(ploidies - pmax))
            if scaling_data0[pmin_nbr]["fopt"] < scaling_data0[pmax_nbr]["fopt"]:
                best_index = pmin_nbr
            else:
                best_index = pmax_nbr
            if verbose:
                print_scaling_solutions(scaling_data0, [best_index],
                                        best_index)
            return (scaling_data0[best_index]["lam"], -1,
                    pd.DataFrame(scaling_data0))
        else:
            best_index = np.argmin([data["fopt"] for data in filtered_data])
            if verbose:
                print_scaling_solutions(filtered_data,
                                        range(len(filtered_data)), best_index)
            return (filtered_data[best_index]["lam"], -1,
                    pd.DataFrame(filtered_data))

    ## determine if the solutions are "degenerate", i.e., almost euploid.
    ## if degenerate, then just pick the solution that optimizes the prior;
    ## otherwise, pick the best solution or the second best solution based
    ## on whether the second best solution (in terms of the objective
    ## function value) actually is a better fit in terms of fewer segments
    ## whose ploidies don't nicely match the data
    order = np.argsort([x["nfit"] for x in scaling_data])
    topn = order[:3]
    median_dom_frac = np.median([scaling_data[x]["dom_frac"] for x in topn])
    is_degenerate = (median_dom_frac > 0.90)

    if is_degenerate:
        best_prior = np.argmax([x["lprior"] for x in scaling_data0])
        gap = -2
        ## pick the solution with the best prior
        if verbose:
            log_func("DEGENERATE\n")
            ## the single printed row is at index 0 of the one-element list
            print_scaling_solutions([scaling_data0[best_prior]], [0], 0)
        return (scaling_data0[best_prior]["lam"], gap,
                pd.DataFrame(scaling_data0))

    ## Pick the best fit solution
    best_index = order[0]
    if len(order) > 1:
        gap = (scaling_data[order[1]]["nfit"] -
               scaling_data[best_index]["nfit"])
    else:
        gap = -3
    if verbose:
        print_scaling_solutions(scaling_data, order, best_index)
    return scaling_data[best_index]["lam"], gap, pd.DataFrame(scaling_data0)
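## log_prior_ploidy is called above but defined elsewhere in this package.
## A minimal sketch consistent with how prior_params is used (a Gaussian
## log-prior on average ploidy, up to an additive constant) would be:
#
# def log_prior_ploidy(aploidy, mu=2.0, sig=0.5):
#     return -0.5 * ((aploidy - mu) / sig) ** 2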
def find_scaling_candidates_v4(y, segment_bdy, segment_means, segment_lengths,
                               is_sex_chrom_bin, window, scale_guess,
                               max_ploidy_long=10, zero_ploidy_count=50,
                               max_segment_push_to_zero=50, verbose=True,
                               log_func=sys.stdout.write):
    """ Find a set of possible scaling factors by minimizing an objective
    function. In addition to the scaling factors, we also store other
    information corresponding to the scale. Same function as
    find_scaling_candidates, but output is a list of dicts."""
    data_cats = ["fopt", "lam", "aploidy", "dom_pdy", "dom_frac", "levels",
                 "entropy"]

    ## create initial guesses by setting the average ploidy based on
    ## the read count
    mean_read_count = y.mean() * window
    possible_ploidies = np.linspace(.5, 12, 20)
    guesses = mean_read_count / possible_ploidies

    ## focus on good segments for scaling (based on dpcv^2)
    dpcv2 = -np.ones_like(segment_means)
    for i, (s, e) in enumerate(segment_bdy):
        counts = y[s:e]
        mean = counts.mean()
        dpcv2[i] = (counts.std()**2 - mean) / np.clip(mean, 1e-2, None)
    median_length = np.median(segment_lengths)
    dpcv2_25_long, dpcv2_75_long = np.percentile(
        dpcv2[segment_lengths >= median_length], [25, 75])
    good_segments = dpcv2 < dpcv2_75_long + 1.5 * (dpcv2_75_long -
                                                   dpcv2_25_long)
    if verbose:
        log_func("Using %.2f pct of segments to scale\n" % (
            good_segments.sum() * 100. / len(good_segments)))
    segment_means_filtered = segment_means[good_segments]
    segment_lengths_filtered = segment_lengths[good_segments]

    ## Run a minimization algorithm to determine the candidate scaling
    ## factors.
    data_dict = defaultdict(list)
    for g in guesses:
        lams, fopt, _, _, warnflag = scipy.optimize.fmin(
            lambda lam: objective_function(lam, segment_means_filtered,
                                           segment_lengths_filtered),
            g, full_output=True, disp=False)
        lam = lams[0]
        log_func("Guess %.2f, lam %.2f" % (g, lam))

        ## if the minimization failed then stop
        if (warnflag == 1 or warnflag == 2 or
            (len(guesses) > 1 and
             (lam > guesses.max() or lam < guesses.min()))):
            log_func(" SKIP\n")
            continue
        log_func(" KEEP\n")

        segment_ploidies = np.round(segment_means / lam).astype(int)

        #pushed_to_zero = np.where((segment_ploidies == 0) &
        #                          (segment_means > zero_ploidy_count))[0]
        #if (len(pushed_to_zero) and
        #    segment_lengths[pushed_to_zero].max() > max_segment_push_to_zero):
        #    continue

        pcounter = Counter()
        for p, l in zip(segment_ploidies, segment_lengths):
            pcounter[p] += l
        nploidy_levels = len(pcounter.keys())

        ## compute the information content of the CNV profile
        entropy_counter = Counter()
        entropy_norm = 0
        for (s, e), p in zip(segment_bdy, segment_ploidies):
            l = e - s
            ## skip sex chromosomes so both male and female diploids
            ## will have low entropy
            if is_sex_chrom_bin[s:e].sum() > 0.5 * l:
                continue
            entropy_counter[p] += l
            entropy_norm += l
        if entropy_norm == 0:
            # if no segment is long, then set to arbitrary high value
            entropy = np.log(len(y))
        else:
            entropy = 0.0
            for p, c in entropy_counter.iteritems():
                occupancy = float(c) / entropy_norm
                entropy += -occupancy * np.log(occupancy)

        ## find the dominant ploidy and how much of the genome it covers
        for dom_ploidy, dom_count in pcounter.most_common(2):
            if int(dom_ploidy) != 0:
                break
        avg_ploidy = ((segment_ploidies.astype(float) *
                       segment_lengths).sum() / segment_lengths.sum())
        key = int(np.round(lam))
        dom_frac = dom_count * 1.0 / segment_lengths.sum()
        data_dict[key].append((fopt, lam, avg_ploidy, dom_ploidy, dom_frac,
                               nploidy_levels, entropy))

    ## if we were not able to find a single scale factor, then
    ## assign the initial segment ploidy 2
    if len(data_dict.keys()) == 0:
        if verbose:
            log_func("Optimization failed, setting median to ploidy 2\n")
        y_agg = aggregate_counts(y, window)
        y_agg_med = np.median(y_agg[y_agg > 0])
        lam = y_agg_med / 2.0
        return [dict(zip(data_cats,
                         [None, lam, None, None, None, None, None]))]

    data = []
    for k, v in data_dict.iteritems():
        data.append(dict(zip(data_cats, np.array(v).mean(axis=0))))
    data.sort(key=lambda x: x["fopt"])

    if scale_guess is not None and len(scale_guess) == 1:
        sf = scale_guess[0] * window
        best_match = np.argmin([abs(d["lam"] - sf) for d in data])
        if verbose:
            log_func("Override scaling solution. Pick closest to %.2f\n" % sf)
        return [data[best_match]]

    if verbose:
        log_func("%d minima found\n" % len(data))
    return data
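## objective_function is referenced above but defined elsewhere in this
## package. A minimal sketch of the idea (penalize segment means that fall
## far from an integer multiple of the scale lam, weighted by segment
## length) might be:
#
# def objective_function(lam, segment_means, segment_lengths):
#     ploidy = segment_means / lam
#     residual = np.abs(ploidy - np.round(ploidy))
#     return (residual * segment_lengths).sum() / segment_lengths.sum()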
def estimate_gc_bias(profiles, tracks, reference_path):
    """ Estimate the genome-wide GC bias curve over all cells and summarize
    it as a quadratic fit plus a scalar GC metric."""
    ## load genome tracks and profiles skipping sex chromosomes
    ref = contig_manager.contig_manager(reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=False)

    maptrack = pd.HDFStore(tracks, "r")
    cmask = []
    gctrack = []
    bdy = [0]
    mtrack = []
    for chrom in chroms:
        x = maptrack["/map/" + chrom].values > MAPPABILITY_THRESHOLD
        cmask.extend(x)
        z = bdy[-1] + len(x)
        gctrack.extend(maptrack["/GC/" + chrom].values)
        mtrack.extend(maptrack["/map/" + chrom].values)
        bdy.append(z)
    cmask = np.array(cmask)
    maptrack.close()
    gctrack = np.array(gctrack)
    mtrack = np.array(mtrack)
    bdy = np.array(bdy)
    nbins = bdy[-1]

    pstore = pd.HDFStore(profiles, "r")
    ncells = len(pstore["/barcodes"].values)
    X = np.zeros((ncells, nbins), dtype="int32")
    for ci, chrom in enumerate(chroms):
        X[:, bdy[ci]:bdy[ci+1]] = pstore["/contigs/" + chrom].values
    pstore.close()

    ## genome wide profile of all cells @ GC_RES resolution
    ## restricted to mappable regions
    y = aggregate_counts(X.sum(axis=0)[cmask], GC_RES).astype(float)
    y /= y.mean()

    gc = aggregate_counts(gctrack[cmask], GC_RES) / GC_RES
    gcbins = np.linspace(MIN_GC, MAX_GC, NUM_GC_BINS + 1)
    gc_vals = 0.5 * (gcbins[1:] + gcbins[:-1])
    gc_bin_index = np.searchsorted(gcbins, gc)
    gc0 = np.nanmean(gc_vals)

    ## group data points by GC bins and compute the median
    x_vals = []
    y_vals = []
    for bi in xrange(1, NUM_GC_BINS + 1):
        bin_filter = gc_bin_index == bi
        num_data_points = bin_filter.sum()
        if num_data_points < MIN_POINTS_PER_BIN:
            continue
        bin_gc = gc_vals[bi - 1]
        bin_val = np.median(y[bin_filter])
        x_vals.append(bin_gc)
        y_vals.append(bin_val)
    # for bi
    x_vals = np.array(x_vals) - gc0

    ## fit to ax^2 + bx + c
    a, b, c = np.polyfit(x_vals, y_vals, 2)

    ## GC metric is mean absolute deviation away from 1.0
    gc_metric = np.abs(np.array(y_vals) - 1.0).sum() / len(y_vals)

    ## store gc data in summary
    summary = {}
    summary["GC_content"] = x_vals
    summary["scaled_read_counts"] = y_vals
    summary["quadratic_coefficients"] = [a, b, c]
    summary["gc_cells_only"] = gc_metric
    summary["gc0"] = gc0
    ## writing the summary JSON is left to the calling stage
    #with open(outs.summary, "w") as out:
    #    json.dump(summary, out, indent=4)
    return {'GCMetric': gc_metric, 'Summary': summary}
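## Example (sketch, hypothetical paths): compute the GC bias summary for a
## run and read off the quadratic fit.
#
# result = estimate_gc_bias("profiles.h5", "tracks.h5", "/refs/GRCh38")
# a, b, c = result["Summary"]["quadratic_coefficients"]
# print("GC metric %.3f, curvature %.3f" % (result["GCMetric"], a))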