Code Example #1
## Shared imports assumed by these code examples. Project-specific helpers
## (estimate_window_size, aggregate_counts, minimize_entropy, gc_curve,
## get_delta_gc_cv, log_prior_ploidy, objective_function, contig_manager)
## and constants (GC_ORIGIN, MAPPABILITY_THRESHOLD, GC_RES, MIN_GC, MAX_GC,
## NUM_GC_BINS, MIN_POINTS_PER_BIN) come from the surrounding package and
## are not shown here.
import sys
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import scipy.optimize


def estimate_gc_norm_cell(profile, gc, mask):
    """Fit per-cell linear and quadratic GC bias coefficients on the
    masked-in bins of a read count profile."""
    window_size = estimate_window_size(profile[mask])
    profile = aggregate_counts(profile[mask].astype(float), window_size)
    gc = aggregate_counts(gc[mask].astype(float), window_size) / window_size
    _, linear, quadratic = minimize_entropy(profile, gc)
    g = gc - GC_ORIGIN
    g2 = g * g
    parabola = gc_curve(g, g2, linear, quadratic).ravel()
    norm_profile = profile / parabola

    ## if the GC correction does not improve the coefficient of variation
    ## (negative delta), fall back to no correction
    delta_gc_cv = get_delta_gc_cv(profile, norm_profile, gc)
    if delta_gc_cv < 0:
        linear = 0.0
        quadratic = 0.0
        delta_gc_cv = 0.0

    return linear, quadratic
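
Several of these code examples call aggregate_counts, which is not included in the excerpts. A minimal sketch of the windowed summation it appears to perform is shown below; the name aggregate_counts_sketch, and the handling of a trailing partial window, are assumptions based only on the call sites above.

import numpy as np

def aggregate_counts_sketch(counts, window):
    """Hypothetical stand-in for aggregate_counts: sum counts in
    non-overlapping windows of length `window`, dropping any trailing
    partial window."""
    counts = np.asarray(counts, dtype=float)
    nwin = len(counts) // window
    if nwin == 0:
        ## shorter than one window: return the total as a single value
        return np.array([counts.sum()])
    return counts[:nwin * window].reshape(nwin, window).sum(axis=1)

In Code Example #1, the aggregated GC track is divided by window_size to recover the mean GC per window, which is consistent with a sum-based aggregation.
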
Code Example #2
def add_cell_plots_to_axes(Y, axes, bdy=None, plot_res=10, plot_args={}):
    """Plot each cell's mean-normalized coverage, aggregated into windows of
    plot_res bins, on its own axis; optionally mark chromosome boundaries
    given in bdy."""
    ncells, nbins = Y.shape
    for i in xrange(ncells):
        z = aggregate_counts(Y[i], plot_res)
        axes[i].plot(z / z.mean(), rasterized=True, **plot_args)
        axes[i].set_ylim(0, 5)
        axes[i].set_yticks(range(1, 5))
        if bdy is not None:
            for x in bdy:
                axes[i].axvline(x / plot_res, color="k")
            if i < ncells - 1:
                axes[i].set_xticks([])
            else:
                axes[i].set_xticks(bdy / plot_res)
                axes[i].set_xticklabels(axes[i].get_xticks().astype(int),
                                        rotation=45)
            # if i else
        # if bdy
        axes[i].set_xlim(0, nbins / plot_res)
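
A hypothetical driver for the plotting helper above might look like the following. The Poisson counts and boundary positions are placeholders invented for illustration, aggregate_counts must be importable in the same session (the sketch after Code Example #1 could stand in for it), and the xrange call in the helper assumes the original Python 2 environment.

import numpy as np
import matplotlib
matplotlib.use("Agg")  ## headless backend so the figure can be written to disk
import matplotlib.pyplot as plt

## placeholder data: 4 cells x 2000 bins of Poisson counts plus fake
## chromosome boundaries expressed as bin indices
Y = np.random.poisson(5.0, size=(4, 2000))
bdy = np.array([0, 500, 1200, 2000])

fig, axes = plt.subplots(Y.shape[0], 1, figsize=(12, 2 * Y.shape[0]),
                         sharex=True)
add_cell_plots_to_axes(Y, axes, bdy=bdy, plot_res=10)
fig.savefig("cell_profiles.png", dpi=150)
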
Code Example #3
def find_best_scale_v15(y, xi, segment_bdy, segment_means, segment_lengths,
    ref, cbdy, window, scale_guess, max_ploidy_long=10, zero_ploidy_count=200, 
    max_segment_push_to_zero=200, prior_params = {"prior_mean": 2.0,
    "prior_std": 0.5}, min_ploidy = None, max_ploidy = None, verbose=True,
    log_func=sys.stdout.write):
    """ Find the best scaling factor that turns a segmented profile
        into an integer copy number profile. min_ploidy and max_ploidy override
        the algorithm. See comments for explanation."""

    def print_scaling_solutions(scaling_data, order, best_index=None):
        log_func("Scaling solutions:\n")
        header = ["lam", "aploidy", "dom_pdy",
                  "dom_frac", "lprior", "entropy", "nfit", "dpcv"]
        log_func("|".join([h.rjust(8) for h in header])+"\n")
        log_func("-"*(len("|".join([h.rjust(8) for h in header])))+"\n")
        for x in order:
            if scaling_data[x]["fopt"] is None:
                log_func("NO DATA\n")
                continue
            row = []
            for h in header:
                value = ("%.2f"%scaling_data[x][h] if h in scaling_data[x]
                    else "NAN")
                row.append(value.rjust(8))
            log_func(" ".join(row)+" ")
            if best_index is not None and best_index == x:
                log_func("X")
            log_func("\n")
    
    is_sex_chrom_bin = np.zeros_like(y, dtype=bool)
    
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    for i, chrom in enumerate(chroms):
        if chrom in ref.sex_chromosomes:
            is_sex_chrom_bin[cbdy[i]:cbdy[i+1]] = True


    ## assertions
    assert prior_params["prior_mean"] > 0, "prior mean must be positive"
    assert prior_params["prior_std"] > 0, "prior std. dev. must be positive"
    
    ## find all the candidate scaling factors
    scaling_data0 = find_scaling_candidates_v4(y, segment_bdy, segment_means,
        segment_lengths, is_sex_chrom_bin, window, scale_guess,
        max_ploidy_long=max_ploidy_long, zero_ploidy_count=zero_ploidy_count,
        max_segment_push_to_zero=max_segment_push_to_zero, verbose=verbose,
        log_func=log_func)
    log_func("INITIAL CANDIDATES\n")
    print_scaling_solutions(scaling_data0, range(len(scaling_data0)), None)

    ## if we only find one solution, there's nothing to do
    if len(scaling_data0) == 1:
        log_func("Only 1 solution, nothing to do.\n")
        if verbose:
            print_scaling_solutions(scaling_data0, [0], 0)
        return scaling_data0[0]["lam"], -3, pd.DataFrame(scaling_data0)
    nbins = len(y)
    ## compute DPCV after compensating for ploidy & GC
    for si, data in enumerate(scaling_data0):
        lam = data["lam"]
        
        segment_ploidy = np.round(segment_means/lam).astype(int)

        blocks = defaultdict(list)
        for i, p in enumerate(segment_ploidy):
            blocks[p].append(segment_bdy[i])
        blen = []
        bdpcv = []
        for p, segs in blocks.iteritems():
            seg_len = 0
            for s, e in segs:
                seg_len += e-s
            if seg_len < 0.01*nbins:
                continue
            counts = np.zeros(seg_len)
            pos = 0
            for s, e in segs:
                counts[pos:pos+(e-s)] = y[s:e]/xi[s:e]/max(p, 0.1)
                pos += e-s
            counts_agg = aggregate_counts(counts, window)
            dpcv = np.sqrt(np.abs(counts_agg.std()**2 -
                counts_agg.mean()))/counts_agg.mean()
            blen.append(seg_len)
            bdpcv.append(dpcv)
        blen = np.array(blen)
        bdpcv = np.array(bdpcv)
        data["dpcv"] = (blen*bdpcv).sum()/blen.sum()
        data["lprior"] = log_prior_ploidy(data["aploidy"],
            mu=prior_params["prior_mean"], sig=prior_params["prior_std"])
        data["nfit"] = data["fopt"]/segment_lengths.sum( )
    
    ## Throw away candidates with unusually high DPCV. This removes the ploidy
    ## 1 solutions that we see when there is noise. Also remove high ploidy 
    ## solutions.
    MIN_PLOIDY = 0.0
    MAX_PLOIDY = 8.0
    ploidies = np.array([data["aploidy"] for data in scaling_data0])
    pfilter = ploidies < MAX_PLOIDY

    ## Filter based on the dpcv of likely good solutions. This is mainly to 
    ## eliminate the ploidy 1 case. Take the min of top 3 solutions, since 
    ## very high ploidy solutions will have low dpcv as well, but they are
    ## garbage.
    dpcvs = np.array([data["dpcv"] for data in scaling_data0])
    target_dpcv = np.min(dpcvs[:3])
    entropy = np.array([data["entropy"] for data in scaling_data0])
    target_entropy = np.max(entropy[:3])
    outliers = np.logical_or(dpcvs > target_dpcv + 0.06,
                            entropy < target_entropy - 1.0)
     
    select = ~outliers & pfilter
    if select.sum() == 0:
        if pfilter.sum() > 0:
            select = pfilter
        else:
            select = np.ones(len(scaling_data0), dtype=bool)

    scaling_data = []
    for i, data in enumerate(scaling_data0):
        if select[i]:
            scaling_data.append(data)
           
    ## if min_ploidy and max_ploidy are specified, override the decision making
    ## - find all solutions with ploidy (min_ploidy, max_ploidy)
    ## - if empty find the solutions that are closest to the end points
    ## - amongst the restricted solutions as determined above, pick the solution
    ##   with the least objective function value
    if min_ploidy is not None or max_ploidy is not None:
        pmin = min_ploidy if min_ploidy is not None else MIN_PLOIDY
        pmax = max_ploidy if max_ploidy is not None else MAX_PLOIDY
        assert pmin <= pmax, "min_ploidy <= max_ploidy must be true"

        log_func("Applying min ploidy & max ploidy: %d-%d\n"%(pmin, pmax))

        filtered_data = [data for data in scaling_data
                         if (data["aploidy"] > pmin and
                             data["aploidy"] < pmax)]
        if len(filtered_data) == 0:
            ploidies = np.array([data["aploidy"] for data in scaling_data0])
            pmin_nbr = np.argmin(np.abs(ploidies - pmin))
            pmax_nbr = np.argmin(np.abs(ploidies - pmax))
            if (scaling_data0[pmin_nbr]["fopt"] <
                    scaling_data0[pmax_nbr]["fopt"]):
                best_index = pmin_nbr
            else:
                best_index = pmax_nbr

            if verbose:
                print_scaling_solutions(scaling_data0, [best_index], best_index)
            return (scaling_data0[best_index]["lam"], -1, 
                pd.DataFrame(scaling_data0))
        else:
            best_index = np.argmin([data["fopt"] for data in filtered_data])
            if verbose:
                print_scaling_solutions(filtered_data,
                    range(len(filtered_data)), best_index)
            return (filtered_data[best_index]["lam"], -1, 
                pd.DataFrame(filtered_data))

    ## determine if the solutions are "degenerate", i.e., almost euploid.
    ## if degenerate, then just pick the solution that optimizes the prior
    ## otherwise, pick the best solution or the second best solution based
    ## on whether the second best solution (in terms of the objective
    ## function value) actually is a better fit in terms of fewer segments
    ## whose ploidies don't nicely match the data

    order = np.argsort([x["nfit"] for x in scaling_data])

    topn = order[:3]
    median_dom_frac = np.median([scaling_data[x]["dom_frac"] for x in topn])
    is_degenerate = (median_dom_frac > 0.90)
    if is_degenerate:
        best_prior = np.argmax([x["lprior"] for x in scaling_data0])
        gap = -2
        ## pick the solution with the best prior
        if verbose:
            log_func("DEGENERATE\n")
            print_scaling_solutions([scaling_data0[best_prior]], [0], 
                best_prior)
        return scaling_data0[best_prior]["lam"], gap, pd.DataFrame(scaling_data0)
    
    ## Pick the best fit solution
    best_index = order[0]
    if len(order) > 1:
        gap = scaling_data[order[1]]["nfit"] - scaling_data[best_index]["nfit"]
    else:
        gap = -3

    if verbose:
        print_scaling_solutions(scaling_data, order, best_index)
    
    return scaling_data[best_index]["lam"], gap, pd.DataFrame(scaling_data0)
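
The dpcv value used above to filter scaling candidates is computed inline in the per-candidate loop. The same computation, pulled out as a standalone helper so it can be run on its own (it reuses aggregate_counts_sketch from earlier, so it is a sketch rather than project code), reads:

import numpy as np

def dpcv_sketch(counts, window):
    """Coefficient of variation of windowed counts with the Poisson
    shot-noise variance (equal to the mean) subtracted before taking the
    square root; mirrors the inline computation in find_best_scale_v15."""
    agg = aggregate_counts_sketch(counts, window)
    mean = agg.mean()
    return np.sqrt(np.abs(agg.var() - mean)) / mean

In find_best_scale_v15, candidates whose dpcv exceeds the minimum of the top three solutions by more than 0.06, or whose entropy falls more than 1.0 below the maximum of the top three, are treated as outliers; the survivors (also restricted to average ploidy below 8) go forward unless the filter would eliminate every candidate.
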
Code Example #4
def find_scaling_candidates_v4(y, segment_bdy, segment_means, segment_lengths,
    is_sex_chrom_bin, window, scale_guess, max_ploidy_long=10,
    zero_ploidy_count=50, max_segment_push_to_zero=50, verbose=True,
    log_func=sys.stdout.write):
    """ Find a set of possible scaling factors by minimizing an
        objective function. In addition to the scaling factors, we also
        store other information corresponding to the scale. Same function
        as find_scaling_candidates, but output is a list of dicts."""

    data_cats = ["fopt", "lam", "aploidy", "dom_pdy",
                               "dom_frac", "levels", "entropy"]

    ## create initial guesses by setting the average ploidy based on
    ## the read count
    mean_read_count = y.mean()*window
    possible_ploidies = np.linspace(.5, 12, 20)
    guesses = mean_read_count/possible_ploidies

    ## focus on good segments for scaling (based on dpcv^2)
    dpcv2 = -np.ones_like(segment_means)
    for i, (s,e) in enumerate(segment_bdy):
        counts = y[s:e]
        mean = counts.mean()
        dpcv2[i] = (counts.std()**2 - mean)/np.clip(mean, 1e-2, None)
    median_length = np.median(segment_lengths)
    dpcv2_25_long, dpcv2_75_long = np.percentile(
        dpcv2[segment_lengths >= median_length], [25, 75])
    good_segments = dpcv2 < dpcv2_75_long + 1.5*(dpcv2_75_long-dpcv2_25_long)
    if verbose:
        log_func("Using %.2f pct of segments to scale\n"%(
            good_segments.sum()*100./len(good_segments)))
    segment_means_filtered = segment_means[good_segments]
    segment_lengths_filtered = segment_lengths[good_segments]

    ## Run a minimization algorithm to determine the candidate scaling
    ## factors.
    data_dict = defaultdict(list)
    for g in guesses:
        lams, fopt, _, _, warnflag = scipy.optimize.fmin(lambda lam :
              objective_function(lam, segment_means_filtered,
                                 segment_lengths_filtered),
              g, full_output=True, disp=False)
        lam = lams[0]
        log_func("Guess %.2f, lam %.2f"%(g, lam))

        ## if the minimization failed then stop
        if (warnflag == 1 or warnflag == 2 or (len(guesses) > 1 and 
            (lam > guesses.max() or lam < guesses.min()))):
            log_func(" SKIP\n")
            continue
        log_func(" KEEP\n")
        segment_ploidies = np.round(segment_means/lam).astype(int)

        #pushed_to_zero = np.where((segment_ploidies == 0) &
        #                          (segment_means > zero_ploidy_count))[0]
        #if (len(pushed_to_zero) and
        #    segment_lengths[pushed_to_zero].max( ) > max_segment_push_to_zero):
        #    continue
        
        pcounter = Counter()
        for p, l in zip(segment_ploidies, segment_lengths):
            pcounter[p] += l
        nploidy_levels = len(pcounter.keys( ))

        ## compute the information content of the CNV profile
        entropy_counter = Counter( )
        entropy_norm = 0
        for (s,e), p in zip(segment_bdy, segment_ploidies):
            l = e-s
            ## skip sex chromosomes so both male and female diploids
            ## will have low entropy
            if is_sex_chrom_bin[s:e].sum() > 0.5*l:
                continue
            entropy_counter[p] += l
            entropy_norm += l
        if entropy_norm == 0:
            # if no segment is long, then set to arbitrary high value
            entropy = np.log(len(y)) 
        else:
            entropy = 0.0
            for p, c in entropy_counter.iteritems( ):
                occupancy = float(c)/entropy_norm
                entropy += -occupancy * np.log(occupancy)

        ## find the dominant ploidy, preferring a non-zero level among the two
        ## most common, and how much of the genome it covers
        for dom_ploidy, dom_count in pcounter.most_common(2):
            if int(dom_ploidy) != 0:
                break

        avg_ploidy = (segment_ploidies.astype(float)*
                      segment_lengths).sum( )/segment_lengths.sum( )
        key = int(np.round(lam))
        dom_frac = dom_count*1.0/segment_lengths.sum()
        data_dict[key].append((fopt, lam, avg_ploidy, dom_ploidy,
                               dom_frac, nploidy_levels, entropy))

    ## if we were not able to find a single scale factor, then
    ## assign the initial segment ploidy 2
    if len(data_dict.keys()) == 0:
        if verbose:
            log_func("Optimization failed, setting median to ploidy 2\n")
        y_agg = aggregate_counts(y, window)
        y_agg_med = np.median(y_agg[y_agg > 0])
        lam = y_agg_med/2.0
        return [dict(zip(data_cats, [None, lam, None,
            None, None, None, None]))]

    data = []
    for k, v in data_dict.iteritems( ):
        data.append(dict(zip(data_cats, np.array(v).mean(axis=0))))

    data.sort( key = lambda x: x["fopt"] )

    if scale_guess is not None and len(scale_guess) == 1:
        sf = scale_guess[0]*window
        best_match = np.argmin([abs(d["lam"] - sf) for d in data])
        if verbose:
            log_func("Override scaling solution. Pick closest to %.2f\n"%sf)
        return [data[best_match]]

    if verbose:
        log_func("%d minima found\n"%len(data))
    return data
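
find_scaling_candidates_v4 minimizes objective_function over the candidate scale lam, but that function is not part of these excerpts. One plausible form, written here purely for illustration, penalizes each filtered segment's distance from an integer ploidy weighted by its length; the project's actual objective may differ.

import numpy as np

def objective_function_sketch(lam, segment_means, segment_lengths):
    """Hypothetical objective: length-weighted distance of each implied
    segment ploidy (segment_mean / lam) from its nearest integer."""
    lam = float(np.atleast_1d(lam)[0])
    if lam <= 0:
        return np.inf
    ploidy = segment_means / lam
    residual = np.abs(ploidy - np.round(ploidy))
    return float((residual * segment_lengths).sum())

## the call pattern in find_scaling_candidates_v4 would then be, for a guess g:
## lams, fopt, _, _, warnflag = scipy.optimize.fmin(
##     lambda lam: objective_function_sketch(lam, segment_means_filtered,
##                                           segment_lengths_filtered),
##     g, full_output=True, disp=False)

A penalty of this shape can be driven down by degenerate scales (for example, a very large lam pushes every segment toward ploidy zero), which is presumably why the surrounding code seeds many guesses, rejects minima outside the guess range, and carries parameters such as zero_ploidy_count and max_segment_push_to_zero.
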
Code Example #5
def estimate_gc_bias(profiles, tracks, reference_path):
    """Fit a genome-wide quadratic GC bias curve to the pooled profile of all
    cells and report a GC metric (mean absolute deviation of the binned,
    scaled counts from 1.0)."""
    ## load genome tracks and profiles, skipping sex chromosomes
    
    ref = contig_manager.contig_manager(reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=False)

    maptrack = pd.HDFStore(tracks, "r")
    cmask = []
    gctrack = []
    bdy = [0]
    mtrack = []
    for chrom in chroms:
        x = maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD
        cmask.extend(x)
        z = bdy[-1] + len(x)
        gctrack.extend(maptrack["/GC/"+chrom].values)
        mtrack.extend(maptrack["/map/"+chrom].values)
        bdy.append(z)
    cmask = np.array(cmask)
    maptrack.close( )
    gctrack = np.array(gctrack)
    mtrack = np.array(mtrack)
    bdy = np.array(bdy)

    nbins = bdy[-1]
    pstore = pd.HDFStore(profiles, "r")
    ncells = len(pstore["/barcodes"].values)
    X = np.zeros((ncells, nbins), dtype="int32")
    for ci, chrom in enumerate(chroms):
        X[:, bdy[ci]:bdy[ci+1]] = pstore["/contigs/"+chrom].values
    pstore.close( )

    ## genome wide profile of all cells @ GC_RES resolution
    ## restricted to mappable regions
    y = aggregate_counts(X.sum( axis=0 )[cmask], GC_RES).astype(float)
    y /= y.mean( )
    gc = aggregate_counts(gctrack[cmask], GC_RES)/GC_RES

    gcbins = np.linspace(MIN_GC, MAX_GC, NUM_GC_BINS+1)
    gc_vals = 0.5 * (gcbins[1:] + gcbins[:-1])
    gc_bin_index = np.searchsorted(gcbins, gc)
    gc0 = np.nanmean(gc_vals)

    ## group data points by GC bins and compute the median
    x_vals = []
    y_vals = []
    for bi in xrange(1, NUM_GC_BINS+1):
        bin_filter = gc_bin_index == bi
        num_data_points = bin_filter.sum( )
        if num_data_points < MIN_POINTS_PER_BIN:
            continue
        bin_gc = gc_vals[bi-1]
        bin_val = np.median(y[bin_filter])
        x_vals.append(bin_gc)
        y_vals.append(bin_val)
    # for bi
    x_vals = np.array(x_vals) - gc0
    
    ## fit to ax^2 + bx + c
    a, b, c = np.polyfit(x_vals, y_vals, 2)
    
    ## GC metric is mean absolute deviation away from 1.0
    gc_metric = np.abs(np.array(y_vals) - 1.0).sum( ) / len(y_vals)

    ## store gc data in summary
    summary = {}
    summary["GC_content"] = x_vals
    summary["scaled_read_counts"] = y_vals
    summary["quadratic_coefficients"] = [a, b, c]
    summary["gc_cells_only"] = gc_metric
    summary["gc0"] = gc0
   
    #with open(outs.summary, "w") as out:
    #    json.dump(summary, out, indent=4)
    #
    return {"GCMetric": gc_metric, "Summary": summary}
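
estimate_gc_bias stops at fitting the curve and reporting the metric. How the returned coefficients might be applied downstream is not shown in the excerpt; a small illustrative sketch, assuming the stored quadratic_coefficients and gc0 from the summary, could be:

import numpy as np

def apply_gc_fit_sketch(y, gc, coefficients, gc0):
    """Divide out the fitted quadratic a*(gc - gc0)**2 + b*(gc - gc0) + c.
    Illustrative only; the excerpt above does not perform this step."""
    a, b, c = coefficients
    g = np.asarray(gc, dtype=float) - gc0
    expected = a * g * g + b * g + c
    ## guard against vanishing or negative fitted values at extreme GC
    expected = np.clip(expected, 1e-3, None)
    return np.asarray(y, dtype=float) / expected

Here coefficients corresponds to summary["quadratic_coefficients"] and gc0 to summary["gc0"] from the function above.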