Example #1
0
def get_scores_from_fits(fits, use_correlations):
    """Collect LOO scores from fits, keeping only scores above -1.

    Returns a pair (basic, multi) of numpy arrays. Without correlations,
    multi is None and basic holds every fit.LOO_score above -1. With
    correlations, a fit contributes only when both fit.LOO_score and
    fit.with_correlations.LOO_score are above -1, so the two arrays stay
    aligned entry by entry.
    """
    if not use_correlations:
        basic = np.array([fit.LOO_score for fit in iterate_fits(fits) if fit.LOO_score>-1])
        return basic, None

    kept_basic = []
    kept_multi = []
    for fit in iterate_fits(fits):
        plain_score = fit.LOO_score
        corr_score = fit.with_correlations.LOO_score
        if plain_score > -1 and corr_score > -1:
            kept_basic.append(plain_score)
            kept_multi.append(corr_score)
    return np.array(kept_basic), np.array(kept_multi)
Example #2
0
def get_scores_from_fits(fits, use_correlations):
    """Return (basic, multi) arrays of LOO scores that are above -1.

    When use_correlations is true, only fits whose plain score and
    with_correlations score are both above -1 are kept, and multi holds the
    with_correlations scores. Otherwise multi is None and basic holds all
    plain scores above -1.
    """
    if use_correlations:
        all_pairs = ((fit.LOO_score, fit.with_correlations.LOO_score)
                     for fit in iterate_fits(fits))
        valid = [pair for pair in all_pairs if pair[0] > -1 and pair[1] > -1]
        basic = np.array([pair[0] for pair in valid])
        multi = np.array([pair[1] for pair in valid])
        return basic, multi

    basic = np.array([
        fit.LOO_score for fit in iterate_fits(fits) if fit.LOO_score > -1
    ])
    return basic, None
Example #3
0
def plot_bootstrap_onset_variance(data, fits):
    """Plot the bootstrap std of the third theta parameter against its fitted value.

    For each fit, x is the third entry of fit.theta and y is the standard
    deviation of the third entry over the columns of fit.theta_samples.
    The x axis is labelled with the scaled development stages and a dashed
    vertical line is drawn at age_scaler.scale(0). The data argument is not
    used. Returns the matplotlib figure.
    """
    points = []
    for fit in iterate_fits(fits):
        _, _, onset, _ = fit.theta

        n_params, n_samples = fit.theta_samples.shape
        onset_samples = np.empty(n_samples)
        for k in range(n_samples):
            # column k of theta_samples is unpacked like theta itself
            _, _, onset_k, _ = fit.theta_samples[:,k]
            onset_samples[k] = onset_k
        points.append((onset, np.std(onset_samples)))

    mu, mu_std = zip(*points)

    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])
    ax.plot(mu, mu_std, 'bx')
    ax.set_ylabel('onset time bootstrap std', fontsize=cfg.fontsize)

    # set the development stages as x labels
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    tick_ages = [stage.central_age for stage in stages]
    tick_names = [stage.short_name for stage in stages]
    ax.set_xticks(tick_ages)
    ax.set_xticklabels(tick_names, fontsize=cfg.fontsize, fontstretch='condensed', rotation=90)

    # keep only the first and last of the automatically chosen y ticks
    auto_ticks = ax.get_yticks()
    yticks = [auto_ticks[0], auto_ticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:.1g}'.format(t) for t in yticks], fontsize=cfg.fontsize)

    # mark birth time with a vertical line
    y_span = list(ax.get_ylim())
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], y_span, '--', color='0.85')
    return fig
def add_change_distributions(data, fitter, fits, age_range=None, n_bins=50):
    """ Attach a "strength of transition" histogram over age to every fit.

        The bins come from get_bins(data, age_range, n_bins) and are stored on
        fits.change_distribution_params. Each fit object then receives
        change_distribution_weights, change_distribution_spread and
        change_distribution_mean_std, all derived from its theta_samples.
        Only sigmoid/sigslope shapes are supported (enforced by an assert): the
        computation uses the h parameter explicitly and relies on monotonicity.
    """
    shape = fitter.shape
    # the function currently works only for sigmoid/sigslope fits
    supported_shapes = ['sigmoid', 'sigslope']
    assert shape.cache_name() in supported_shapes

    edges, centers = get_bins(data, age_range, n_bins)
    fits.change_distribution_params = Bunch(bin_edges=edges, bin_centers=centers)

    for _dsname, _gene, _region, fit in iterate_fits(fits, return_keys=True):
        hist = calc_bootstrap_change_distribution(shape, fit.theta_samples, edges)
        fit.change_distribution_weights = hist
        fit.change_distribution_spread = change_distribution_spread_cumsum(centers, hist)
        fit.change_distribution_mean_std = change_distribution_mean_and_std(centers, hist)
Example #5
0
def plot_comparison_bar(data, shapes, all_fits, threshold_percentile=None):
    """Draw a bar chart of the mean LOO score per shape, highest mean first.

    Args:
        data: not used by this function.
        shapes: shape objects; display_name() of each one labels its bar.
        all_fits: one fits collection per shape, in the same order as shapes.
        threshold_percentile: if not None, only scores strictly above this
            percentile (as interpreted by np.percentile, i.e. 0-100) of the
            shape's own scores enter the mean and the standard error.

    Returns:
        The matplotlib figure holding the bar chart. Error bars show
        scipy.stats.sem of the scores.
    """
    nShapes = len(shapes)

    mu = np.empty(nShapes)
    se = np.empty(nShapes)
    for i, fits in enumerate(all_fits):
        scores = np.array([f.LOO_score for f in iterate_fits(fits, R2_threshold=-1)])
        if threshold_percentile is not None:
            # use the requested percentile (previously hard-coded to 50)
            threshold_score = np.percentile(scores, threshold_percentile)
            scores = scores[scores > threshold_score]
        mu[i] = np.mean(scores)
        se[i] = scipy.stats.sem(scores)

    # reorder by mean score
    idx = np.argsort(mu)[::-1]
    mu = mu[idx]
    se = se[idx]
    shapes = [shapes[i] for i in idx]

    index = np.arange(nShapes)
    bar_width = 0.8
    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])
    ax.bar(index, mu, yerr=se, width=bar_width, color='b', error_kw = {'ecolor': '0.3', 'linewidth': 2})
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(index + bar_width/2)
    ax.set_xticklabels([s.display_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
Example #6
0
def plot_change_width_scatter(data, fitter, fits):
    """Scatter the change-distribution width of the single fit against the bootstrap.

    For each fit, the width (change_distribution_width_cumsum) is computed
    once from the distribution of fit.theta and once from the stored
    fit.change_distribution_weights. Both axes share integer limits and
    ticks; a dashed diagonal and a red 'x' at the mean of both widths are
    drawn. The data argument is not used. Returns the matplotlib figure.
    """
    params = fits.change_distribution_params
    bin_edges = params.bin_edges
    bin_centers = params.bin_centers
    shape = fitter.shape

    width_pairs = []
    for fit in iterate_fits(fits):
        single_weights = calc_change_distribution(shape, fit.theta, bin_edges)
        from_single = change_distribution_width_cumsum(bin_centers, single_weights)
        from_bootstrap = change_distribution_width_cumsum(bin_centers, fit.change_distribution_weights)
        width_pairs.append((from_single, from_bootstrap))
    width_single, width_bootstrap = zip(*width_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])

    # integer bounds that enclose every width on either axis
    largest = max(max(width_single), max(width_bootstrap))
    smallest = min(min(width_single), min(width_bootstrap))
    maxw = int(math.ceil(largest))
    minw = int(math.floor(smallest))

    ax.scatter(width_single, width_bootstrap, alpha=0.8)
    ax.plot(np.mean(width_single), np.mean(width_bootstrap), 'rx', markersize=8, markeredgewidth=2, label='mean')
    ax.plot([minw, maxw], [minw, maxw], 'k--')
    ax.set_xlim(minw, maxw)
    ax.set_ylim(minw, maxw)

    ticks = range(minw, maxw+1)
    tick_labels = [str(t) for t in ticks]
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels, fontsize=cfg.fontsize)
    ax.set_yticklabels(list(tick_labels), fontsize=cfg.fontsize)
    ax.set_xlabel('width of single fit', fontsize=cfg.fontsize)
    ax.set_ylabel('width by bootstrap', fontsize=cfg.fontsize)
    ax.set_title('change distribution of single fit vs. bootstrap', fontsize=cfg.fontsize)

    return fig
Example #7
0
def plot_onset_times(all_data, data, fitter, fits, dct_pathways, R2_threshold,
                     b_unique):
    """Plot the change distribution of each pathway next to the whole genome.

    Args:
        all_data: data object whose datasets give the whole-genome fit count
            and which is passed to get_change_distribution_for_whole_genome.
        data: not used by this function.
        fitter: fitter whose shape is used to compute change distributions.
        fits: fits collection; restricted to each pathway's genes in turn.
        dct_pathways: mapping of pathway name to its genes; pathways are
            handled in sorted order and skipped when no fit passes the
            threshold.
        R2_threshold: passed to iterate_fits when collecting pathway fits.
        b_unique: not used by this function.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)

    n_fits = sum(
        len(ds.gene_names) * len(ds.region_names) for ds in all_data.datasets)
    bin_edges, change_vals = get_change_distribution_for_whole_genome(
        all_data, fitter)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    ax.plot(bin_centers,
            change_vals,
            linewidth=5,
            label='whole genome ({} fits)'.format(n_fits))

    # The line style changes every 7 entries of dct_pathways. The index wraps
    # around so that 21 or more pathways no longer raise IndexError.
    linestyles = ['-', '--', '-.']
    for i, (pathway_name, genes) in enumerate(sorted(dct_pathways.items())):
        pathway_fits = restrict_genes(fits, genes)
        thetas = [(g, r, fit.theta) for dsname, g, r, fit in iterate_fits(
            pathway_fits, R2_threshold=R2_threshold, return_keys=True)]
        if not thetas:
            print('Skipping {}. No fits left'.format(pathway_name))
            continue

        bin_edges, change_vals = compute_change_distribution(fitter.shape,
                                                             thetas,
                                                             low,
                                                             high,
                                                             n_bins=50)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        style = linestyles[(i // 7) % len(linestyles)]
        label = '{} ({} fits)'.format(pathway_name, len(thetas))
        ax.plot(bin_centers, change_vals, style, linewidth=3, label=label)
    ax.legend(loc='best', fontsize=18, frameon=False)

    ax.set_ylabel('expression change magnitude', fontsize=fontsize)

    # set the development stages as x labels (stages computed above)
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages],
                       fontsize=fontsize,
                       fontstretch='condensed',
                       rotation=90)
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')

    return fig
Example #8
0
def main():
    """Return the (g, r) keys of fits selected by a condition on theta.

    Fits are taken from get_fits() and iterated with R2_threshold=0.5. A fit
    is selected when the product of the h and w entries of its theta is not
    positive and abs(w) is below 0.5.
    """
    def is_selected(fit):
        _a, h, _mu, w = fit.theta
        return not (h*w > 0) and abs(w) < 0.5

    fits = get_fits()
    selected = []
    for _dsname, g, r, fit in iterate_fits(fits, R2_threshold=0.5, return_keys=True):
        if is_selected(fit):
            selected.append((g, r))
    return selected
Example #9
0
def plot_comparison_bar(data, shapes, all_fits):
    """Bar chart of the mean LOO score of exactly two shapes, plus a paired test.

    Scores are paired through iterate_fits over both fits collections with
    R2_threshold=-1. Half of the p-value returned by scipy.stats.wilcoxon is
    printed as the one sided p-value. Bars are ordered by descending mean and
    labelled with cache_name(); error bars show scipy.stats.sem. The data
    argument is not used. Returns the matplotlib figure.
    """
    n = len(shapes)
    assert len(all_fits) == n
    assert n == 2

    paired_fits = iterate_fits(all_fits[0], all_fits[1], R2_threshold=-1)
    scores1, scores2 = zip(*[(fa.LOO_score, fb.LOO_score)
                             for fa, fb in paired_fits])

    _, two_sided = scipy.stats.wilcoxon(scores1, scores2)
    pval = two_sided / 2  # one sided p-value
    # single-argument print: same output as the print statement
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    mu = np.empty(n)
    se = np.empty(n)
    for pos, shape_scores in enumerate([scores1, scores2]):
        mu[pos] = np.mean(shape_scores)
        se[pos] = scipy.stats.sem(shape_scores)

    # reorder by mean score
    order = np.argsort(mu)[::-1]
    mu, se = mu[order], se[order]
    shapes = [shapes[k] for k in order]

    bar_width = 0.8
    index = np.arange(n)
    error_style = {'ecolor': '0.3', 'linewidth': 2}
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.bar(index, mu, yerr=se, width=bar_width, color='b',
           error_kw=error_style)
    ax.set_xlabel('shape', fontsize=fontsize)
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels([s.cache_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
def analyze_variant(theta,sigma):
    """Summarize the LOO scores of one prior configuration of the Sigslope fit.

    theta and sigma are flags: a true theta uses priors_name as the Sigslope
    priors, a true sigma uses the 'normal' sigma prior; otherwise None is
    passed. Fits are read with allow_new_computation=False. Returns a Bunch
    with the two flags, the non-None LOO scores, and the (mu, sem) pair
    returned by bootstrap(LOO_scores, np.mean).
    """
    fitter = Fitter(
        Sigslope(priors_name if theta else None),
        'normal' if sigma else None,
    )
    fits = get_all_fits(data, fitter, allow_new_computation=False)

    LOO_scores = []
    for f in iterate_fits(fits):
        if f.LOO_score is not None:
            LOO_scores.append(f.LOO_score)

    mu, sem = bootstrap(LOO_scores, np.mean)
    return Bunch(theta=theta, sigma=sigma, LOO_scores=LOO_scores, mu=mu, sem=sem)
Example #11
0
def plot_comparison_scatter(data1, fitter1, fits1, data2, fitter2, fits2):
    """Scatter the LOO scores of two fits collections against each other.

    Pairs come from iterate_fits(fits1, fits2). Both axes are fixed to
    [-1, 1] with a dashed diagonal. The title names both fitter shapes and
    data1's name and pathway; data2 is not used. Returns the matplotlib
    figure.
    """
    paired = iterate_fits(fits1, fits2)
    scores1, scores2 = zip(*[(a.LOO_score, b.LOO_score) for a, b in paired])

    fig = plt.figure()
    plt.scatter(scores1, scores2, alpha=0.5)
    diagonal = [-1, 1]
    plt.plot(diagonal, list(diagonal), 'k--')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)

    shape_line = r'Comparison of scores using {} vs. {}'.format(fitter1.shape, fitter2.shape)
    data_line = r'{}, {}'.format(data1.name, data1.pathway)
    plt.title(shape_line + '\n' + data_line, fontsize=cfg.fontsize)
    plt.xlabel('R2 for {}'.format(fitter1.shape), fontsize=cfg.fontsize)
    plt.ylabel('R2 for {}'.format(fitter2.shape), fontsize=cfg.fontsize)
    return fig
Example #12
0
def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Compare Sigslope LOO scores without priors and with priors.

    Prints half of the p-value returned by scipy.stats.wilcoxon on the paired
    scores (labelled as the one sided p-value), then lists the gene/region
    pairs with the largest score improvement when priors are used.

    Args:
        n_best: number of top improvements to print. Previously this
            argument was ignored and 10 entries were always printed.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')

    nFits = get_all_fits(data, nFitter, allow_new_computation=False)
    yFits = get_all_fits(data, yFitter, allow_new_computation=False)

    score_pairs = [(f1.LOO_score, f2.LOO_score)
                   for f1, f2 in iterate_fits(nFits, yFits)]
    nScores, yScores = zip(*score_pairs)

    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [(f2.LOO_score - f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r)
             for dsname, g, r, f1, f2 in iterate_fits(
                 nFits, yFits, R2_threshold=-1, return_keys=True)]
    diffs.sort(reverse=True)
    print('Gene/Regions for which priors produce best R2 improvement:')
    for i, (delta, R2_without, R2_with, g, r) in enumerate(diffs[:n_best]):
        # explicit keyword arguments instead of **locals()
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta, R2_without=R2_without, R2_with=R2_with))
Example #13
0
def analyze_variant(theta, sigma):
    """Collect LOO scores for a Sigslope fit with optional theta/sigma priors.

    A true theta selects priors_name as the shape priors and a true sigma
    selects the 'normal' sigma prior; a false flag passes None instead. Only
    already computed fits are read (allow_new_computation=False). The
    returned Bunch holds both flags, the LOO scores that are not None, and
    the mu/sem values returned by bootstrap(LOO_scores, np.mean).
    """
    if theta:
        shape = Sigslope(priors_name)
    else:
        shape = Sigslope(None)

    if sigma:
        fitter = Fitter(shape, 'normal')
    else:
        fitter = Fitter(shape, None)

    fits = get_all_fits(data, fitter, allow_new_computation=False)
    all_scores = (f.LOO_score for f in iterate_fits(fits))
    LOO_scores = [score for score in all_scores if score is not None]
    mu, sem = bootstrap(LOO_scores, np.mean)

    summary = dict(theta=theta, sigma=sigma, LOO_scores=LOO_scores, mu=mu, sem=sem)
    return Bunch(**summary)
Example #14
0
def plot_onset_times(all_data, data, fitter, fits, dct_pathways, R2_threshold, b_unique):
    """Plot the change distribution of each pathway next to the whole genome.

    Args:
        all_data: data object whose datasets give the whole-genome fit count
            and which is passed to get_change_distribution_for_whole_genome.
        data: not used by this function.
        fitter: fitter whose shape is used to compute change distributions.
        fits: fits collection; restricted to each pathway's genes in turn.
        dct_pathways: mapping of pathway name to its genes; pathways are
            handled in sorted order and skipped when no fit passes the
            threshold.
        R2_threshold: passed to iterate_fits when collecting pathway fits.
        b_unique: not used by this function.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])

    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)

    n_fits = sum(len(ds.gene_names) * len(ds.region_names) for ds in all_data.datasets)
    bin_edges, change_vals = get_change_distribution_for_whole_genome(all_data,fitter)
    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
    ax.plot(bin_centers, change_vals, linewidth=5, label='whole genome ({} fits)'.format(n_fits))

    # The line style changes every 7 entries of dct_pathways. The index wraps
    # around so that 21 or more pathways no longer raise IndexError.
    linestyles = ['-', '--', '-.']
    for i,(pathway_name, genes) in enumerate(sorted(dct_pathways.items())):
        pathway_fits = restrict_genes(fits,genes)
        thetas = [(g,r,fit.theta) for dsname,g,r,fit in iterate_fits(pathway_fits, R2_threshold=R2_threshold, return_keys=True)]
        if not thetas:
            print('Skipping {}. No fits left'.format(pathway_name))
            continue

        bin_edges, change_vals = compute_change_distribution(fitter.shape, thetas, low, high, n_bins=50)
        bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
        style = linestyles[(i // 7) % len(linestyles)]
        label = '{} ({} fits)'.format(pathway_name,len(thetas))
        ax.plot(bin_centers, change_vals, style, linewidth=3, label=label)
    ax.legend(loc='best', fontsize=18, frameon=False)

    ax.set_ylabel('expression change magnitude', fontsize=fontsize)

    # set the development stages as x labels (stages computed above)
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages], fontsize=fontsize, fontstretch='condensed', rotation=90)
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')

    return fig
Example #15
0
def plot_comparison_scatter(data, shape1, fits1, shape2, fits2):
    """Scatter paired LOO scores of two fits collections on [-1, 1] axes.

    Pairs come from iterate_fits(fits1, fits2); a dashed diagonal is drawn
    and only the ticks -1 and 1 are shown. shape1 and shape2 are formatted
    into the axis labels. The data argument is not used. Returns the
    matplotlib figure.
    """
    paired = iterate_fits(fits1, fits2)
    scores1, scores2 = zip(*[(a.LOO_score, b.LOO_score) for a, b in paired])

    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])
    ax.scatter(scores1, scores2, alpha=0.3)
    ax.plot([-1, 1], [-1, 1], 'k--')
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)

    ticks = [-1, 1]
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    for set_labels in (ax.set_xticklabels, ax.set_yticklabels):
        set_labels([str(t) for t in ticks], fontsize=fontsize)

    ax.set_xlabel('$R^2$ for {}'.format(shape1), fontsize=fontsize)
    ax.set_ylabel('$R^2$ for {}'.format(shape2), fontsize=fontsize)
    return fig
Example #16
0
def plot_comparison_scatter(data, shape1, fits1, shape2, fits2):
    """Plot LOO scores from fits1 (x) against fits2 (y) for paired fits.

    The pairing is whatever iterate_fits(fits1, fits2) yields. Axes are fixed
    to [-1, 1] with ticks only at the ends and a dashed diagonal. The shapes
    appear in the axis labels; the data argument is not used. Returns the
    matplotlib figure.
    """
    score_pairs = []
    for fit_x, fit_y in iterate_fits(fits1, fits2):
        score_pairs.append((fit_x.LOO_score, fit_y.LOO_score))
    scores1, scores2 = zip(*score_pairs)

    lo, hi = -1, 1
    ticks = [lo, hi]
    tick_text = [str(t) for t in ticks]

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.scatter(scores1, scores2, alpha=0.3)
    ax.plot([lo, hi], [lo, hi], 'k--')
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_text, fontsize=fontsize)
    ax.set_yticklabels(list(tick_text), fontsize=fontsize)
    ax.set_xlabel('$R^2$ for {}'.format(shape1), fontsize=fontsize)
    ax.set_ylabel('$R^2$ for {}'.format(shape2), fontsize=fontsize)
    return fig
Example #17
0
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    """Return (bin_edges, change_vals) of the change distribution, using a pickle cache.

    Args:
        data: data set passed to get_all_fits and used to build the cache path.
        fitter: fitter whose shape is used; also part of the cache path.
        R2_threshold: passed to iterate_fits when collecting thetas.
        b_force: when true, recompute and overwrite the cache even if the
            cache file exists. Previously this argument was ignored.

    Returns:
        The (bin_edges, change_vals) pair, either loaded from the cache file
        or freshly computed over the range of the scaled development stages.
    """
    # NOTE(review): the cache path does not depend on R2_threshold, so a file
    # written with one threshold is reused for any other - confirm intent.
    filename = join(cache_dir(),fit_results_relative_path(data,fitter) + '.pkl')
    if isfile(filename) and not b_force:
        print('Loading onset distribution from {}'.format(filename))
        # NOTE(review): the file is opened in text mode, as when it is written
        # below; kept unchanged so existing cache files stay readable.
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
        return bin_edges, change_vals

    print('Computing...')
    fits = get_all_fits(data, fitter)
    thetas = [fit.theta for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)
    bin_edges, change_vals = compute_change_distribution(fitter.shape, thetas, low, high, n_bins=50)

    print('Saving result to {}'.format(filename))
    ensure_dir(dirname(filename))
    with open(filename,'w') as f:
        pickle.dump((bin_edges,change_vals),f)
    return bin_edges, change_vals
def add_change_distributions(data, fitter, fits, age_range=None, n_bins=50):
    """ Add a per-fit histogram of "strength of transition" at different ages.

        Bin edges and centers come from get_bins and are saved on
        fits.change_distribution_params. Every fit yielded by iterate_fits gets
        three new attributes computed from its theta_samples: the histogram
        weights, their spread and their mean/std.
        Works only for sigmoid and sigslope shapes (asserted), because the h
        parameter is used explicitly and monotonicity is assumed.
    """
    shape = fitter.shape
    shape_name = shape.cache_name()
    # the function currently works only for sigmoid/sigslope fits
    assert shape_name in ['sigmoid','sigslope']

    bin_edges, bin_centers = get_bins(data, age_range, n_bins)
    params = Bunch(bin_edges=bin_edges, bin_centers=bin_centers)
    fits.change_distribution_params = params

    for keyed_fit in iterate_fits(fits, return_keys=True):
        _dsname, _g, _r, fit = keyed_fit
        weights = calc_bootstrap_change_distribution(shape, fit.theta_samples, bin_edges)
        spread = change_distribution_spread_cumsum(bin_centers, weights)
        mean_std = change_distribution_mean_and_std(bin_centers, weights)
        fit.change_distribution_weights = weights
        fit.change_distribution_spread = spread
        fit.change_distribution_mean_std = mean_std
def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Compare Sigslope LOO scores without priors and with priors.

    Prints half of the p-value returned by scipy.stats.wilcoxon on the paired
    scores (labelled as the one sided p-value), then lists the gene/region
    pairs with the largest score improvement when priors are used.

    Args:
        n_best: number of top improvements to print. Previously this
            argument was ignored and 10 entries were always printed.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')

    nFits = get_all_fits(data,nFitter,allow_new_computation=False)
    yFits = get_all_fits(data,yFitter,allow_new_computation=False)

    score_pairs = [(f1.LOO_score, f2.LOO_score) for f1,f2 in iterate_fits(nFits, yFits)]
    nScores, yScores = zip(*score_pairs)

    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval/2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [(f2.LOO_score-f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r) for dsname,g,r,f1,f2 in iterate_fits(nFits, yFits, R2_threshold=-1, return_keys=True)]
    diffs.sort(reverse=True)
    print('Gene/Regions for which priors produce best R2 improvement:')
    for i,(delta,R2_without, R2_with, g,r) in enumerate(diffs[:n_best]):
        # explicit keyword arguments instead of **locals()
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta, R2_without=R2_without, R2_with=R2_with))
Example #20
0
def plot_change_width_scatter(data, fitter, fits):
    """Plot per-fit change-distribution width: single fit (x) vs. bootstrap (y).

    Widths are computed with change_distribution_width_cumsum, once from the
    distribution of fit.theta and once from fit.change_distribution_weights.
    The plot uses equal integer limits on both axes, a dashed diagonal and a
    red 'x' at the mean widths. The data argument is not used. Returns the
    matplotlib figure.
    """
    bin_edges = fits.change_distribution_params.bin_edges
    bin_centers = fits.change_distribution_params.bin_centers
    shape = fitter.shape

    def width_of(weights):
        return change_distribution_width_cumsum(bin_centers, weights)

    width_pairs = [
        (width_of(calc_change_distribution(shape, fit.theta, bin_edges)),
         width_of(fit.change_distribution_weights))
        for fit in iterate_fits(fits)
    ]
    width_single, width_bootstrap = zip(*width_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    # integer axis bounds covering both sets of widths
    upper = max(max(width_single), max(width_bootstrap))
    lower = min(min(width_single), min(width_bootstrap))
    maxw = int(math.ceil(upper))
    minw = int(math.floor(lower))

    ax.scatter(width_single, width_bootstrap, alpha=0.8)
    mean_marker = dict(markersize=8, markeredgewidth=2, label='mean')
    ax.plot(np.mean(width_single), np.mean(width_bootstrap), 'rx',
            **mean_marker)
    ax.plot([minw, maxw], [minw, maxw], 'k--')
    ax.set_xlim(minw, maxw)
    ax.set_ylim(minw, maxw)

    ticks = range(minw, maxw + 1)
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels([str(t) for t in ticks], fontsize=cfg.fontsize)
    ax.set_yticklabels([str(t) for t in ticks], fontsize=cfg.fontsize)
    ax.set_xlabel('width of single fit', fontsize=cfg.fontsize)
    ax.set_ylabel('width by bootstrap', fontsize=cfg.fontsize)
    ax.set_title('change distribution of single fit vs. bootstrap',
                 fontsize=cfg.fontsize)

    return fig
def plot_theta_diff_scatter(show_title=False):
    """Scatter the LOO score gain from priors against the score without priors.

    Fits are loaded for a Sigslope fitter with priors (priors_name, 'normal'
    sigma prior) and for one without. Pairs in which either LOO score is None
    are dropped. A dashed horizontal line marks zero gain.

    Args:
        show_title: when true, a title is added to the axes.

    Returns:
        The matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name),'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data,yFitter)
    nFits = get_all_fits(data,nFitter)
    pairs = [(nFit.LOO_score,yFit.LOO_score) for nFit,yFit in iterate_fits(nFits,yFits)]
    diff_pairs = [(n,y-n) for n,y in pairs if n is not None and y is not None]
    n,d = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15,0.12,0.8,0.8])
    ax.scatter(n, d, alpha=0.5)
    # draw the zero line across the current x range without changing it
    xlims = ax.get_xlim()
    ax.plot(xlims,[0, 0],'k--')
    ax.set_xlim(*xlims)
    if show_title:
        # ax.title is a Text object, not a method; set_title is the setter
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$', fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
Example #22
0
def plot_comparison_over_R2_score(data, shapes, all_fits, zoom=None, nbins=50):
    """Plot the LOO score density of each shape as a line.

    Scores below -0.999 are raised to -0.999 before the histogram is built
    with nbins bins and density=True. The x axis is limited to zoom, which
    defaults to (-1, 1), and the y axis runs from 0 to 1.1 times the highest
    density inside the zoom range. Lines are labelled with cache_name().
    The data argument is not used. Returns the matplotlib figure.
    """
    zoom = (-1, 1) if zoom is None else zoom

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    zoom_max = 0
    for shape, fits in zip(shapes, all_fits):
        scores = np.array([f.LOO_score for f in iterate_fits(fits)])
        too_low = scores < -0.999
        scores[too_low] = -0.999
        density, edges = np.histogram(scores, bins=nbins, density=True)
        centers = (edges[:-1] + edges[1:]) / 2
        inside_zoom = (centers >= zoom[0]) & (centers <= zoom[1])
        zoom_max = max(max(density[inside_zoom]), zoom_max)
        ax.plot(centers, density, linewidth=3, label=shape.cache_name())

    ax.set_xlim(*zoom)
    ax.set_ylim(0, zoom_max * 1.1)
    ax.legend(loc='best', fontsize=fontsize, frameon=False)
    ax.set_xlabel('test $R^2$ score', fontsize=fontsize)
    ax.set_ylabel("probability density", fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
Example #23
0
def plot_comparison_over_R2_score(data, shapes, all_fits, zoom=None, nbins=50):
    """Draw one density curve of LOO scores per shape.

    For every (shape, fits) pair the scores are floored at -0.999, binned
    into nbins bins with density=True, and plotted at the bin centers under
    the label shape.cache_name(). x limits are zoom (default (-1, 1)); the
    y limit is 1.1 times the largest density found inside zoom. The data
    argument is not used. Returns the matplotlib figure.
    """
    if zoom is None:
        zoom = (-1,1)
    zoom_low, zoom_high = zoom[0], zoom[1]

    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])

    zoom_max = 0
    for curve_shape, curve_fits in zip(shapes, all_fits):
        scores = np.array([f.LOO_score for f in iterate_fits(curve_fits)])
        scores[scores < -0.999] = -0.999
        h, bins = np.histogram(scores, bins=nbins, density=True)
        xpos = (bins[:-1] + bins[1:])/2
        visible = h[(xpos >= zoom_low) & (xpos <= zoom_high)]
        zoom_max = max(max(visible), zoom_max)
        ax.plot(xpos, h, linewidth=3, label=curve_shape.cache_name())

    ax.set_xlim(*zoom)
    ax.set_ylim(0, zoom_max*1.1)
    ax.legend(loc='best', fontsize=fontsize, frameon=False)
    ax.set_xlabel('test $R^2$ score', fontsize=fontsize)
    ax.set_ylabel("probability density", fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
Example #24
0
def plot_bootstrap_onset_variance(data, fits):
    """Plot, per fit, the bootstrap std of the third theta entry vs. its fitted value.

    The std is taken over the columns of fit.theta_samples. Development
    stages (scaled with age_scaler) label the x axis, only the first and last
    automatic y ticks are kept, and a dashed vertical line is drawn at
    age_scaler.scale(0). The data argument is not used. Returns the
    matplotlib figure.
    """
    def bootstrap_std(theta_samples):
        # third entry of every column, unpacked the same way as theta
        n_params, n_samples = theta_samples.shape
        values = np.empty(n_samples)
        for col in range(n_samples):
            _, _, third, _ = theta_samples[:, col]
            values[col] = third
        return np.std(values)

    mu_and_std = []
    for fit in iterate_fits(fits):
        _, _, mu_global, _ = fit.theta
        mu_and_std.append((mu_global, bootstrap_std(fit.theta_samples)))

    mu, mu_std = zip(*mu_and_std)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.plot(mu, mu_std, 'bx')
    ax.set_ylabel('onset time bootstrap std', fontsize=cfg.fontsize)

    # set the development stages as x labels
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    ax.set_xticks([stage.central_age for stage in stages])
    label_style = dict(fontsize=cfg.fontsize, fontstretch='condensed',
                       rotation=90)
    ax.set_xticklabels([stage.short_name for stage in stages], **label_style)

    all_yticks = ax.get_yticks()
    yticks = [all_yticks[0], all_yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:.1g}'.format(t) for t in yticks],
                       fontsize=cfg.fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age] * 2, [ymin, ymax], '--', color='0.85')
    return fig
Example #25
0
def plot_comparison_bar(data, shapes, all_fits):
    """Compare the mean LOO score of two shapes with bars and a paired test.

    Exactly two shapes are required (asserted). Paired scores come from
    iterate_fits over both fits collections with R2_threshold=-1; half of the
    scipy.stats.wilcoxon p-value is printed as the one sided p-value. Bars are
    sorted by descending mean, carry scipy.stats.sem error bars and are
    labelled with cache_name(). The data argument is not used. Returns the
    matplotlib figure.
    """
    n = len(shapes)
    assert len(all_fits) == n
    assert n == 2

    first_fits, second_fits = all_fits[0], all_fits[1]
    score_pairs = []
    for f1, f2 in iterate_fits(first_fits, second_fits, R2_threshold=-1):
        score_pairs.append((f1.LOO_score, f2.LOO_score))
    scores1, scores2 = zip(*score_pairs)

    _, pval = scipy.stats.wilcoxon(scores1, scores2)
    pval = pval/2  # one sided p-value
    # single-argument print: same output as the print statement
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    mu = np.empty(n)
    se = np.empty(n)
    mu[0], se[0] = np.mean(scores1), scipy.stats.sem(scores1)
    mu[1], se[1] = np.mean(scores2), scipy.stats.sem(scores2)

    # reorder by mean score
    idx = np.argsort(mu)[::-1]
    mu = mu[idx]
    se = se[idx]
    shapes = [shapes[i] for i in idx]

    index = np.arange(n)
    bar_width = 0.8
    fig = plt.figure()
    ax = fig.add_axes([0.12,0.12,0.8,0.8])
    ax.bar(index, mu, yerr=se, width=bar_width, color='b',
           error_kw={'ecolor': '0.3', 'linewidth': 2})
    ax.set_xlabel('shape', fontsize=fontsize)
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(index + bar_width/2)
    ax.set_xticklabels([s.cache_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
Example #26
0
def plot_theta_diff_scatter(show_title=False):
    """Scatter the LOO score gain from priors against the score without priors.

    Fits are loaded for a Sigslope fitter with priors (priors_name, 'normal'
    sigma prior) and for one without. Pairs in which either LOO score is None
    are dropped. A dashed horizontal line marks zero gain.

    Args:
        show_title: when true, a title is added to the axes.

    Returns:
        The matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data, yFitter)
    nFits = get_all_fits(data, nFitter)
    pairs = [(nFit.LOO_score, yFit.LOO_score)
             for nFit, yFit in iterate_fits(nFits, yFits)]
    diff_pairs = [(n, y - n) for n, y in pairs
                  if n is not None and y is not None]
    n, d = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15, 0.12, 0.8, 0.8])
    ax.scatter(n, d, alpha=0.5)
    # draw the zero line across the current x range without changing it
    xlims = ax.get_xlim()
    ax.plot(xlims, [0, 0], 'k--')
    ax.set_xlim(*xlims)
    if show_title:
        # ax.title is a Text object, not a method; set_title is the setter
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$',
                     fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
Example #27
0
# Script: compare LOO scores of real data against shuffled data for one pathway.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = '17full'
# Same load/restrict/scale chain for both data sets; the second is shuffled.
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages(
    age_scaler).shuffle()

shape = Sigmoid('sigmoid_wide')
fitter = Fitter(shape, sigma_prior='normal')
# NOTE(review): allow_new_computation=False presumably limits get_all_fits to
# already computed results - confirm against get_all_fits.
fits = get_all_fits(data, fitter, allow_new_computation=False)
fits_shuffled = get_all_fits(data_shuffled,
                             fitter,
                             allow_new_computation=False)
# Pair each fit with its shuffled counterpart so both score arrays stay aligned.
R2_pairs = [(fit.LOO_score, fit2.LOO_score)
            for fit, fit2 in iterate_fits(fits, fits_shuffled)]
R2 = np.array([r for r, r_shuffled in R2_pairs])
R2_shuffled = np.array([r_shuffled for r, r_shuffled in R2_pairs])

name = '{}-{}'.format(data.pathway, shape.cache_name())
fig = plot_score_distribution(R2, R2_shuffled)
save_figure(fig,
            'RP/R2-distribution-{}.png'.format(name),
            under_results=True,
            b_close=True)

# z-scores of the real scores relative to the mean/std of the shuffled scores
mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
z_scores = (R2 - mu_shuffled) / std_shuffled
fig = plot_z_scores(z_scores)
save_figure(fig,
            ttl = '\n'.join([ttl, ttl_fit])
        plt.title(ttl)
    return vals

# Script setup: load one pathway, scale ages with LogScaler and read its
# Sigmoid fits.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = 'serotonin'
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
shape = Sigmoid()
fitter = Fitter(shape)
# NOTE(review): allow_new_computation=False presumably limits get_all_fits to
# already computed results - confirm against get_all_fits.
fits = get_all_fits(data,fitter, allow_new_computation=False)

def translate(g,r,fit):
    """Return a Bunch with the fit's theta and sigma on the priors scale.

    The series for (g, r) is read from the module-level data, and the
    conversion is done by fitter.translate_parameters_to_priors_scale.
    When h is negative, theta is replaced by an equivalent sigmoid whose h
    is positive.
    """
    series = data.get_one_series(g,r)
    scaled_theta, scaled_sigma = fitter.translate_parameters_to_priors_scale(
        series.ages, series.single_expression, fit.theta, fit.sigma)
    a, h, mu, w = scaled_theta
    # (a+h, -h, mu, -w) is an equivalent sigmoid, with h now positive
    result_theta = (a+h, -h, mu, -w) if h < 0 else scaled_theta
    return Bunch(theta=result_theta, sigma=scaled_sigma)
    
# One translated Bunch per fit; the dataset name yielded by iterate_fits is not used.
flat_fits = [translate(g,r,fit) for dsname,g,r,fit in iterate_fits(fits, return_keys=True)]

# This script is meant to be run as a setup, then run commands interactively, e.g.:
create_hist(flat_fits, 'a', -2, 1)
create_hist(flat_fits, 'h', -1, 3)
create_hist(flat_fits, 'w', -0.5, 1)
create_hist(flat_fits, 'mu', -2, 2)
create_hist(flat_fits, 'p', 0, 10)
Example #29
0
# Script setup: load the pathway data with scaled ages and read its Sigmoid fits.
# pathway and age_scaler must already be defined when this runs.
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
shape = Sigmoid()
fitter = Fitter(shape)
# NOTE(review): allow_new_computation=False presumably limits get_all_fits to
# already computed results - confirm against get_all_fits.
fits = get_all_fits(data, fitter, allow_new_computation=False)


def translate(g, r, fit):
    """Convert one fit's parameters to the priors scale and normalize the sign of h.

    Uses the module-level data to fetch the (g, r) series and the
    module-level fitter to translate fit.theta and fit.sigma. If the
    translated h is negative, the returned theta describes the equivalent
    sigmoid with positive h. Returns a Bunch with theta and sigma.
    """
    series = data.get_one_series(g, r)
    translated = fitter.translate_parameters_to_priors_scale(
        series.ages, series.single_expression, fit.theta, fit.sigma)
    theta, sigma = translated
    a, h, mu, w = theta
    if not h < 0:
        return Bunch(theta=theta, sigma=sigma)
    # this is an equivalent sigmoid, with h now positive
    mirrored = (a + h, -h, mu, -w)
    return Bunch(theta=mirrored, sigma=sigma)


# One translated Bunch per fit; the dataset name yielded by iterate_fits is not used.
flat_fits = [
    translate(g, r, fit)
    for dsname, g, r, fit in iterate_fits(fits, return_keys=True)
]

# This script is meant to be run as a setup, then run commands interactively, e.g.:
create_hist(flat_fits, 'a', -2, 1)
create_hist(flat_fits, 'h', -1, 3)
create_hist(flat_fits, 'w', -0.5, 1)
create_hist(flat_fits, 'mu', -2, 2)
create_hist(flat_fits, 'p', 0, 10)
Example #30
0
import setup
import config as cfg
from load_data import GeneData
from shapes.sigmoid import Sigmoid
from fitter import Fitter
from all_fits import get_all_fits, iterate_fits
from scalers import LogScaler

# Script: list (gene, region) keys whose fit has an extreme first theta entry.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = 'serotonin'
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
fitter = Fitter(Sigmoid(priors=None))
fits = get_all_fits(data, fitter)

# Among fits iterated with R2_threshold=0.5, keep those with |theta[0]| > 100.
extreme = [(g, r) for dsname, g, r, fit in iterate_fits(
    fits, R2_threshold=0.5, return_keys=True) if abs(fit.theta[0]) > 100]
Example #31
0
def print_diff_points(data1, fitter1, fits1, data2, fitter2, fits2, n):
    """Print the fits with the largest LOO score differences in each direction.

    Args:
        data1, data2: not used by this function.
        fitter1, fitter2: fitters whose shape is printed to name each side.
        fits1, fits2: fits collections paired through iterate_fits.
        n: number of fits to print for each direction. With n <= 0 no fits
            are listed (previously n == 0 listed every fit in the first
            section because diffs[-0:] is the whole list).
    """
    diffs = [(fit1.LOO_score-fit2.LOO_score, g,r, fit1.LOO_score, fit2.LOO_score) for dsname,g,r,fit1,fit2 in iterate_fits(fits1,fits2, return_keys=True)]
    diffs.sort()

    # diffs is sorted ascending: the tail favours fitter1, the head fitter2
    count = max(n, 0)
    largest = diffs[-count:] if count else []
    smallest = diffs[:count]

    line_format = '\t{}@{}: diff={:.2g}, {}={:.2g}, {}={:.2g}'

    print('Top {} fits where {} > {}:'.format(n, fitter1.shape, fitter2.shape))
    for diff,g,r,score1,score2 in largest:
        print(line_format.format(g,r,diff,fitter1.shape,score1,fitter2.shape,score2))

    print('Top {} fits where {} < {}:'.format(n, fitter1.shape, fitter2.shape))
    for diff,g,r,score1,score2 in smallest:
        print(line_format.format(g,r,diff,fitter1.shape,score1,fitter2.shape,score2))
import setup
import config as cfg
from load_data import GeneData
from shapes.sigmoid import Sigmoid
from fitter import Fitter
from all_fits import get_all_fits, iterate_fits
from scalers import LogScaler

# Script: list (gene, region) keys whose fit has an extreme first theta entry.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = 'serotonin'
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
fitter = Fitter(Sigmoid(priors=None))
fits = get_all_fits(data,fitter)

# Among fits iterated with R2_threshold=0.5, keep those with |theta[0]| > 100.
extreme = [(g,r) for dsname,g,r,fit in iterate_fits(fits, R2_threshold=0.5, return_keys=True) if abs(fit.theta[0]) > 100]
Example #33
0
    ax.set_xlabel('z score', fontsize=fontsize)
    ax.set_ylabel('probability', fontsize=fontsize)   
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig

# Script: compare LOO scores of real data against shuffled data for one pathway.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = '17full'
# Same load/restrict/scale chain for both data sets; the second is shuffled.
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler).shuffle()

shape = Sigmoid('sigmoid_wide')
fitter = Fitter(shape,sigma_prior='normal')
# NOTE(review): allow_new_computation=False presumably limits get_all_fits to
# already computed results - confirm against get_all_fits.
fits = get_all_fits(data,fitter,allow_new_computation=False)
fits_shuffled = get_all_fits(data_shuffled,fitter,allow_new_computation=False)
# Pair each fit with its shuffled counterpart so both score arrays stay aligned.
R2_pairs = [(fit.LOO_score,fit2.LOO_score) for fit,fit2 in iterate_fits(fits,fits_shuffled)]
R2 = np.array([r for r,r_shuffled in R2_pairs])
R2_shuffled = np.array([r_shuffled for r,r_shuffled in R2_pairs])

name = '{}-{}'.format(data.pathway,shape.cache_name())
fig = plot_score_distribution(R2,R2_shuffled)
save_figure(fig,'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True)

# z-scores of the real scores relative to the mean/std of the shuffled scores
mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
z_scores = (R2-mu_shuffled)/std_shuffled
fig = plot_z_scores(z_scores)
save_figure(fig,'RP/R2-z-scores-{}.png'.format(name), under_results=True, b_close=True)

# paired test of real vs. shuffled scores, and the best shuffled score
T, signed_rank_p_value = wilcoxon(R2, R2_shuffled)
maxShuffled = R2_shuffled.max()