Example #1
0
def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations, correlations_k_of_n, allow_new_computation):
    def arg_mapper(gr,f_proxy):
        g,r = gr
        series = dataset.get_one_series(g,r)
        return f_proxy(series,fitter)
        
    # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard
    # this also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes 
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name = 'fits',
        f = _compute_fit,
        arg_mapper = arg_mapper,
        all_keys = list(product(dataset.gene_names,dataset.region_names)),
        all_sharding_keys = data.gene_names,
        f_sharding_key = lambda gr: gr[0],
        k_of_n = k_of_n,
        base_filename = fit_results_relative_path(dataset,fitter),
        allow_new_computation = allow_new_computation,
    )
    
    if n_correlation_iterations > 0:
        # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region
        # which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level" 
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits, n_correlation_iterations, correlations_k_of_n, allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores... ',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'
    
    return dataset_fits
Example #2
0
def get_change_distribution_for_whole_genome(all_data, fitter):
    # NOTE: the distribution for all genes should be precomputed by running onset_times_whole_genome.py
    filename = join(cache_dir(),fit_results_relative_path(all_data,fitter) + '.pkl')
    print 'Loading whole genome onset distribution from {}'.format(filename)
    with open(filename) as f:
        bin_edges, change_vals = pickle.load(f)
    return bin_edges, change_vals
Example #3
0
def create_top_correlations_html(data, fitter, fits, scores, regions, n_top=None):
    """Write the 'top-gradual-maturation' HTML page for the leading scores.

    `scores` holds (gene, r, p-value, lst_R2) tuples. The list is sorted in
    place by r (ascending) and its first n_top entries (all of them when
    n_top is None) supply the genes and the extra 'r' / 'p-value' columns
    passed to create_html.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def correlation_of(score):
        _gene, r, _pval, _lst_R2 = score
        return r

    # NOTE: sorts the caller's list in place.
    scores.sort(key=correlation_of)

    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, r, pval, _lst_R2 in scores[:n_top]:
        top_genes.append(gene)
        top_scores[gene] = r
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # theta is unpacked as four values; the third one is the scaled onset.
        _a, _h, mu, _rest = fit.theta
        onset_age = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(onset_age), ''

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
Example #4
0
def get_change_distribution_for_whole_genome(all_data, fitter):
    # NOTE: the distribution for all genes should be precomputed by running onset_times_whole_genome.py
    filename = join(cache_dir(),
                    fit_results_relative_path(all_data, fitter) + '.pkl')
    print 'Loading whole genome onset distribution from {}'.format(filename)
    with open(filename) as f:
        bin_edges, change_vals = pickle.load(f)
    return bin_edges, change_vals
def create_top_genes_html(data, fitter, fits, scores, regions, n_top=None, filename_suffix=''):
    """Write the 'gradual-maturation-t-test' HTML page for the top genes.

    Args:
        data, fitter, fits: forwarded to create_html; data and fitter also
            determine the output directory under results_dir().
        scores: list of (gene, p-value, q-value) tuples. It is sorted IN
            PLACE by p-value (ascending).
        regions: region names; regions[0] and regions[1] are quoted in the
            page header, so at least two are required.
        n_top: number of leading genes to show; None means all of them.
        filename_suffix: appended to the output file name.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval

    # In-place sort: callers may rely on `scores` being ordered afterwards.
    scores.sort(key=key_func)
    top = scores[:n_top]
    top_genes = [g for g, pval, qval in top]
    top_pvals = {g: pval for g, pval, qval in top}
    top_qvals = {g: qval for g, pval, qval in top}

    # The summary counts are taken over all scores, not only the top ones.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n01=n01, n=n)

    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        # Return (text, css class) like get_onset_dist below, so every
        # callback in extra_fields_per_fit has the same return shape.
        return txt, cls

    def get_onset_dist(fit):
        # Row 2 of theta_samples is used as the resampled onset values (it
        # is the same index as mu in fit.theta above).
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit=[get_onset_time, get_onset_dist],
        b_inline_images=True,
        inline_image_size='30%',
        b_R2_dist=False,
        ttl='Fit for genes with top t-test scores',
        top_text=top_text,
        filename='gradual-maturation-t-test' + filename_suffix,
    )
def export_timing_info_for_all_fits(data, fitter, fits):
    """Save the timing info of all fits, plus a README, to a .mat file.

    The values come from compute_timing_info_for_all_fits and are written to
    <cache_dir>/<fit_results_relative_path(data, fitter)>-change-dist.mat.
    """
    timing = compute_timing_info_for_all_fits(data, fitter, fits)
    readme_text = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # Keys are the variable names under which the values are stored.
    mdict = {
        'README_CHANGE_DISTRIBUTIONS': readme_text,
        'genes': list_of_strings_to_matlab_cell_array(timing.genes),
        'regions': list_of_strings_to_matlab_cell_array(timing.regions),
        'age_scaler': scalers.unify(timing.age_scaler).cache_name(),
        'mu': timing.mu,
        'std': timing.std,
        'bin_edges': timing.bin_edges,
        'bin_centers': timing.bin_centers,
        'weights': timing.weights,
    }
    cache_root = cache_dir()
    mat_name = fit_results_relative_path(data, fitter) + '-change-dist.mat'
    save_matfile(mdict, join(cache_root, mat_name))
Example #7
0
def create_top_correlations_html(data,
                                 fitter,
                                 fits,
                                 scores,
                                 regions,
                                 n_top=None):
    """Create the 'top-gradual-maturation' HTML page.

    `scores` is a list of (gene, r, p-value, lst_R2) tuples; it is sorted in
    place by r (ascending). The first n_top entries (all when n_top is None)
    define the genes shown and the extra 'r' and 'p-value' columns handed to
    create_html.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def by_correlation(score):
        _gene, r, _pval, _lst_R2 = score
        return r

    # NOTE: the caller's list is reordered.
    scores.sort(key=by_correlation)

    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, r, pval, _lst_R2 in scores[:n_top]:
        top_genes.append(gene)
        top_scores[gene] = r
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # theta is unpacked as four values; the third one is the scaled onset.
        _a, _h, mu, _rest = fit.theta
        onset_age = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(onset_age), ''

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
def export_timing_info_for_all_fits(data, fitter, fits):
    """Export the timing info computed for all fits to a .mat file.

    The result of compute_timing_info_for_all_fits is saved, together with a
    README string, to
    <cache_dir>/<fit_results_relative_path(data, fitter)>-change-dist.mat.
    """
    timing = compute_timing_info_for_all_fits(data, fitter, fits)
    readme_text = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # Keys are the variable names under which the values are stored.
    mdict = {
        'README_CHANGE_DISTRIBUTIONS': readme_text,
        'genes': list_of_strings_to_matlab_cell_array(timing.genes),
        'regions': list_of_strings_to_matlab_cell_array(timing.regions),
        'age_scaler': scalers.unify(timing.age_scaler).cache_name(),
        'mu': timing.mu,
        'std': timing.std,
        'bin_edges': timing.bin_edges,
        'bin_centers': timing.bin_centers,
        'weights': timing.weights,
    }
    cache_root = cache_dir()
    mat_name = fit_results_relative_path(data, fitter) + '-change-dist.mat'
    save_matfile(mdict, join(cache_root, mat_name))
Example #9
0
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, 
                              do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False,
                              k_of_n=None, 
                              use_correlations=False, correlations=None,
                              show_change_distributions=False,
                              html_kw=None,
                              figure_kw=None):
    if fits is None:
        fits = get_all_fits(data,fitter,k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data,fitter))
        if use_correlations:
            basedir = join(basedir,'with-correlations')
    if html_kw is None:
        html_kw = {}
    if figure_kw is None:
        figure_kw = {}
    print 'Writing HTML under {}'.format(basedir)
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'
    if do_genes and not only_main_html: # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(data, fitter, fits, join(basedir,gene_dir), show_change_distributions)
    if do_series and not only_main_html:
        plot_and_save_all_series(data, fitter, fits, join(basedir,series_dir), use_correlations, show_change_distributions, figure_kw)
    if do_hist and k_of_n is None and not only_main_html:
        create_score_distribution_html(fits, use_correlations, join(basedir,scores_dir))
    if do_html and k_of_n is None:
        link_to_correlation_plots = use_correlations and correlations is not None
        if link_to_correlation_plots and not only_main_html:
            plot_and_save_all_gene_correlations(data, correlations, join(basedir,correlations_dir))
        dct_pathways = load_17_pathways_breakdown()
        pathway_genes = set.union(*dct_pathways.values())
        data_genes = set(data.gene_names)
        missing = pathway_genes - data_genes
        b_pathways = len(missing) < len(pathway_genes)/2 # simple heuristic to create pathways only if we have most of the genes (currently 61 genes are missing)
        create_html(
            data, fitter, fits, basedir, gene_dir, series_dir, scores_dir, correlations_dir=correlations_dir,
            use_correlations=use_correlations, link_to_correlation_plots=link_to_correlation_plots, 
            b_pathways=b_pathways, **html_kw
        )
Example #10
0
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    filename = join(cache_dir(),fit_results_relative_path(data,fitter) + '.pkl')
    if isfile(filename):
        print 'Loading onset distribution from {}'.format(filename)
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
    else:
        print 'Computing...'
        fits = get_all_fits(data, fitter)        
        thetas = [fit.theta for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
        stages = [stage.scaled(age_scaler) for stage in dev_stages]
        low = min(stage.from_age for stage in stages)
        high = max(stage.to_age for stage in stages) 
        bin_edges, change_vals = compute_change_distribution(fitter.shape, thetas, low, high, n_bins=50)    

        print 'Saving result to {}'.format(filename)
        ensure_dir(dirname(filename))   
        with open(filename,'w') as f:
            pickle.dump((bin_edges,change_vals),f)
    return bin_edges, change_vals
Example #11
0
def save_theta_text_files(data, fitter, fits):
    assert fitter.shape.cache_name() == 'spline', "save to text is only supported for splines at the moment"
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset,fitter) + '.txt')
        dataset_fits = fits[dataset.name]    
        print 'Saving text file to {}'.format(filename)
        with open(filename, 'w') as f:
            for (g,r),fit in dataset_fits.iteritems():
                if fit.theta is None:
                    continue
                knots, coeffs, degree = fit.theta[0]
                knots = list(knots)
                coeffs = list(coeffs)
                gr_text = """\
Gene symbol: {g}
Region: {r}
Spline knots: {knots}
Spline coefficients: {coeffs}
Spline degree: {degree}
""".format(**locals())
                print >>f, gr_text
Example #12
0
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations,
                                  k_of_n, allow_new_computation):
    """Compute the correlation fits of a dataset and merge them into ds_fits.

    One job key is created per region index (with None as the leave-one-out
    point) and one per (region index, (ix, iy)) leave-one-out point. The
    results of job_splitting.compute are handed to
    _add_dataset_correlation_fits_from_results_dictionary.
    """
    def arg_mapper(key, f_proxy):
        region_index, loo_point = key
        region = dataset.region_names[region_index]
        series = dataset.get_several_series(dataset.gene_names, region)
        basic_theta = [
            ds_fits[(gene, region)].theta for gene in dataset.gene_names
        ]
        return f_proxy(series, fitter, basic_theta, loo_point, n_iterations)

    def f_sharding_key(key):
        # keep all x points in the same shard for same r,iy
        region_index, loo_point = key
        if loo_point is None:
            return (region_index, None)
        _ix, iy = loo_point
        return (region_index, iy)

    all_keys = []
    for region_index, region in enumerate(dataset.region_names):
        all_keys.append((region_index, None))
        series = dataset.get_several_series(dataset.gene_names, region)
        all_keys.extend(
            (region_index, (ix, iy))
            for iy, _gene in enumerate(dataset.gene_names)
            for ix in xrange(len(series.ages))
        )

    base_filename = (fit_results_relative_path(dataset, fitter) +
                     '-correlations-{}'.format(n_iterations))
    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=base_filename,
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(
        dataset, ds_fits, dct_results)
Example #13
0
def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations,
                      correlations_k_of_n, allow_new_computation):
    def arg_mapper(gr, f_proxy):
        g, r = gr
        series = dataset.get_one_series(g, r)
        return f_proxy(series, fitter)

    # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard
    # this also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name='fits',
        f=_compute_fit,
        arg_mapper=arg_mapper,
        all_keys=list(product(dataset.gene_names, dataset.region_names)),
        all_sharding_keys=data.gene_names,
        f_sharding_key=lambda gr: gr[0],
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter),
        allow_new_computation=allow_new_computation,
    )

    if n_correlation_iterations > 0:
        # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region
        # which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level"
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits,
                                      n_correlation_iterations,
                                      correlations_k_of_n,
                                      allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores... ',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'

    return dataset_fits
Example #14
0
def save_theta_text_files(data, fitter, fits):
    assert fitter.shape.cache_name(
    ) == 'spline', "save to text is only supported for splines at the moment"
    for dataset in data.datasets:
        filename = join(cache_dir(),
                        fit_results_relative_path(dataset, fitter) + '.txt')
        dataset_fits = fits[dataset.name]
        print 'Saving text file to {}'.format(filename)
        with open(filename, 'w') as f:
            for (g, r), fit in dataset_fits.iteritems():
                if fit.theta is None:
                    continue
                knots, coeffs, degree = fit.theta[0]
                knots = list(knots)
                coeffs = list(coeffs)
                gr_text = """\
Gene symbol: {g}
Region: {r}
Spline knots: {knots}
Spline coefficients: {coeffs}
Spline degree: {degree}
""".format(**locals())
                print >> f, gr_text
Example #15
0
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations, k_of_n, allow_new_computation):
    """Compute the correlation fits of a dataset and merge them into ds_fits.

    One job key is created per region index (with None as the leave-one-out
    point) and one per (region index, (ix, iy)) leave-one-out point. The
    results of job_splitting.compute are handed to
    _add_dataset_correlation_fits_from_results_dictionary.
    """
    def arg_mapper(key, f_proxy):
        region_index, loo_point = key
        region = dataset.region_names[region_index]
        series = dataset.get_several_series(dataset.gene_names, region)
        basic_theta = [
            ds_fits[(gene, region)].theta for gene in dataset.gene_names
        ]
        return f_proxy(series, fitter, basic_theta, loo_point, n_iterations)

    def f_sharding_key(key):
        # keep all x points in the same shard for same r,iy
        region_index, loo_point = key
        if loo_point is None:
            return (region_index, None)
        _ix, iy = loo_point
        return (region_index, iy)

    all_keys = []
    for region_index, region in enumerate(dataset.region_names):
        all_keys.append((region_index, None))
        series = dataset.get_several_series(dataset.gene_names, region)
        all_keys.extend(
            (region_index, (ix, iy))
            for iy, _gene in enumerate(dataset.gene_names)
            for ix in xrange(len(series.ages))
        )

    base_filename = (fit_results_relative_path(dataset, fitter) +
                     '-correlations-{}'.format(n_iterations))
    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=base_filename,
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(dataset, ds_fits, dct_results)
Example #16
0
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    for dataset in data.datasets:
        filename = join(cache_dir(),
                        fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]

        print 'Saving mat file to {}'.format(filename)
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.NaN, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.NaN

        fit_scores = init_array(np.NaN, n_genes, n_regions)
        LOO_scores = init_array(np.NaN, n_genes, n_regions)
        fit_predictions = init_array(np.NaN, *dataset.expression.shape)
        LOO_predictions = init_array(np.NaN, *dataset.expression.shape)
        high_res_predictions = init_array(np.NaN, cfg.n_curve_points_to_plot,
                                          n_genes, n_regions)
        scaled_high_res_ages = np.linspace(dataset.ages.min(),
                                           dataset.ages.max(),
                                           cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(
            dataset.age_scaler).unscale(scaled_high_res_ages)
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.NaN, n_bins, n_genes,
                                                     n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []
        for (g, r), fit in dataset_fits.iteritems():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig,
                                ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig,
                                ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig,
                                     ir] = shape.f(fit.theta,
                                                   scaled_high_res_ages)
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights
        mdict = dict(
            gene_names=list_of_strings_to_matlab_cell_array(gene_names),
            region_names=list_of_strings_to_matlab_cell_array(region_names),
            theta=theta,
            fit_scores=fit_scores,
            LOO_scores=LOO_scores,
            fit_predictions=fit_predictions,
            LOO_predictions=LOO_predictions,
            high_res_predictions=high_res_predictions,
            high_res_ages=original_high_res_ages,
            change_distribution_bin_centers=change_distribution_bin_centers,
            change_distribution_weights=change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')
def create_top_genes_html(data,
                          fitter,
                          fits,
                          scores,
                          regions,
                          n_top=None,
                          filename_suffix=''):
    """Write the 'gradual-maturation-t-test' HTML page for the top genes.

    Args:
        data, fitter, fits: forwarded to create_html; data and fitter also
            determine the output directory under results_dir().
        scores: list of (gene, p-value, q-value) tuples. It is sorted IN
            PLACE by p-value (ascending).
        regions: region names; regions[0] and regions[1] are quoted in the
            page header, so at least two are required.
        n_top: number of leading genes to show; None means all of them.
        filename_suffix: appended to the output file name.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval

    # In-place sort: callers may rely on `scores` being ordered afterwards.
    scores.sort(key=key_func)
    top = scores[:n_top]
    top_genes = [g for g, pval, qval in top]
    top_pvals = {g: pval for g, pval, qval in top}
    top_qvals = {g: qval for g, pval, qval in top}

    # The summary counts are taken over all scores, not only the top ones.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n01=n01, n=n)

    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        # Return (text, css class) like get_onset_dist below, so every
        # callback in extra_fields_per_fit has the same return shape.
        return txt, cls

    def get_onset_dist(fit):
        # Row 2 of theta_samples is used as the resampled onset values (it
        # is the same index as mu in fit.theta above).
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(
            mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data,
        fitter,
        fits,
        basedir,
        gene_dir,
        series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit=[get_onset_time, get_onset_dist],
        b_inline_images=True,
        inline_image_size='30%',
        b_R2_dist=False,
        ttl='Fit for genes with top t-test scores',
        top_text=top_text,
        filename='gradual-maturation-t-test' + filename_suffix,
    )
Example #18
0
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset,fitter) + '.mat')
        dataset_fits = fits[dataset.name]
    
        print 'Saving mat file to {}'.format(filename)
        shape = fitter.shape
        
        gene_names = dataset.gene_names
        gene_idx = {g:i for i,g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r:i for i,r in enumerate(region_names)}
        n_regions = len(region_names)
        
        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.NaN, shape.n_params(), n_genes,n_regions)
        else:
            theta = np.NaN
        
        fit_scores = init_array(np.NaN, n_genes,n_regions)
        LOO_scores = init_array(np.NaN, n_genes,n_regions)
        fit_predictions = init_array(np.NaN, *dataset.expression.shape)
        LOO_predictions = init_array(np.NaN, *dataset.expression.shape)
        high_res_predictions = init_array(np.NaN, cfg.n_curve_points_to_plot, n_genes, n_regions)
        scaled_high_res_ages = np.linspace(dataset.ages.min(), dataset.ages.max(), cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(dataset.age_scaler).unscale(scaled_high_res_ages)
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.NaN, n_bins, n_genes, n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []
        for (g,r),fit in dataset_fits.iteritems():
            series = dataset.get_one_series(g,r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig,ir] = fit.fit_score
            LOO_scores[ig,ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:,ig,ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds,ig,ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds,ig,ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:,ig,ir] = shape.f(fit.theta, scaled_high_res_ages)
            change_weights = getattr(fit,'change_distribution_weights',None)
            if change_weights is not None:
                change_distribution_weights[:,ig,ir] = change_weights
        mdict = dict(
            gene_names = list_of_strings_to_matlab_cell_array(gene_names),
            region_names = list_of_strings_to_matlab_cell_array(region_names),
            theta = theta,
            fit_scores = fit_scores,
            LOO_scores = LOO_scores,
            fit_predictions = fit_predictions,
            LOO_predictions = LOO_predictions,
            high_res_predictions = high_res_predictions,
            high_res_ages = original_high_res_ages,
            change_distribution_bin_centers = change_distribution_bin_centers,
            change_distribution_weights = change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')
Example #19
0
def save_fits_and_create_html(data,
                              fitter,
                              fits=None,
                              basedir=None,
                              do_genes=True,
                              do_series=True,
                              do_hist=True,
                              do_html=True,
                              only_main_html=False,
                              k_of_n=None,
                              use_correlations=False,
                              correlations=None,
                              show_change_distributions=False,
                              exons_layout=False,
                              html_kw=None,
                              figure_kw=None):
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    if html_kw is None:
        html_kw = {}
    if figure_kw is None:
        figure_kw = {}
    print 'Writing HTML under {}'.format(basedir)
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    exons_dir = 'exons_subplot_series' if cfg.exons_plots_from_series else 'exons_subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'
    if do_genes and not only_main_html:  # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(data, fitter, fits, join(basedir, gene_dir),
                                show_change_distributions)
    if do_series and not only_main_html:
        plot_and_save_all_series(data, fitter, fits, join(basedir, series_dir),
                                 use_correlations, show_change_distributions,
                                 exons_layout, figure_kw)
    if exons_layout and not only_main_html:
        if cfg.exons_plots_from_series:
            plot_and_save_all_exons_from_series(fits, join(basedir, exons_dir),
                                                join(basedir, series_dir))
        else:
            plot_and_save_all_exons(data, fitter, fits,
                                    join(basedir, exons_dir))
    if do_hist and k_of_n is None and not only_main_html:
        create_score_distribution_html(fits, use_correlations,
                                       join(basedir, scores_dir))
    if do_html and k_of_n is None:
        link_to_correlation_plots = use_correlations and correlations is not None
        if link_to_correlation_plots and not only_main_html:
            plot_and_save_all_gene_correlations(
                data, correlations, join(basedir, correlations_dir))
        dct_pathways = load_17_pathways_breakdown()
        pathway_genes = set.union(*dct_pathways.values())
        data_genes = set(data.gene_names)
        missing = pathway_genes - data_genes
        b_pathways = len(missing) < len(
            pathway_genes
        ) / 2  # simple heuristic to create pathways only if we have most of the genes (currently 61 genes are missing)
        create_html(data,
                    fitter,
                    fits,
                    basedir,
                    gene_dir,
                    exons_dir,
                    series_dir,
                    scores_dir,
                    correlations_dir=correlations_dir,
                    use_correlations=use_correlations,
                    link_to_correlation_plots=link_to_correlation_plots,
                    b_pathways=b_pathways,
                    exons_layout=exons_layout,
                    **html_kw)

def calc_bootstrap_change_distribution(shape, theta_samples, bin_edges):
    """Average the change distribution over all parameter samples.

    NOTE(review): a function with this name and the same logic is defined a
    second time further down in this file; the later definition rebinds the
    name.

    Args:
        shape: passed unchanged to calc_change_distribution.
        theta_samples: 2-D array laid out as (n_params, n_samples); each
            column theta_samples[:, i] is handed to calc_change_distribution
            as one parameter vector.
        bin_edges: passed to bin_edges_to_centers and to
            calc_change_distribution.

    Returns:
        Array with the shape of bin_edges_to_centers(bin_edges): the
        element-wise sum of calc_change_distribution over all samples,
        divided by the number of samples.
    """
    centers = bin_edges_to_centers(bin_edges)
    # Unpacking into exactly two names also rejects input that is not 2-D.
    _, sample_count = theta_samples.shape
    mean_weights = np.zeros(centers.shape)
    for sample_idx in xrange(sample_count):
        theta = theta_samples[:, sample_idx]
        mean_weights += calc_change_distribution(shape, theta, bin_edges)
    # now values are in fraction of total change (doesn't have to sum up to 1
    # if ages don't cover the whole transition range)
    mean_weights /= sample_count
    return mean_weights


@cache(lambda data, fitter, fits: join(
    cache_dir(),
    fit_results_relative_path(data, fitter) + '-dprime-cube.pkl'))
def compute_dprime_measures_for_all_pairs(data, fitter, fits):
    """Attach change-distribution statistics to every fit in `fits`.

    NOTE(review): this copy looks like a merge/concatenation leftover --
    confirm and clean up:
      * A second module-level definition with the same name and the same
        @cache key function appears later in this file and rebinds the name.
      * d_mu, std and get_mu_std are set up here but never used in this copy.
      * The loop below reads `shape`, `bin_edges` and `bin_centers`, none of
        which are assigned inside this function. Unless they exist as
        module-level names, the first loop iteration raises NameError.
      * There is no return statement, so the undecorated function returns
        None, despite the '-dprime-cube.pkl' path built for @cache.

    Args:
        data: provides gene_names, region_names and region_to_dataset().
        fitter: only used by the @cache key function (via
            fit_results_relative_path), not by the body.
        fits: passed to iterate_fits; also indexed by dataset name inside
            get_mu_std.
    """
    genes = data.gene_names
    regions = data.region_names
    r2ds = data.region_to_dataset()
    cube_shape = (len(genes), len(regions), len(regions))
    # np.empty leaves both arrays uninitialised; nothing in this copy fills them.
    d_mu = np.empty(cube_shape)  # mu2-mu1 for all genes and region pairs
    std = np.empty(cube_shape)  # std (combined) for all genes and region pairs

    def get_mu_std(g, r):
        """Return fit.change_distribution_mean_std for (g, r), or (nan, nan)
        when the dataset's fits have no entry for that key."""
        # r2ds[r] selects which entry of `fits` is searched for region r.
        dsfits = fits[r2ds[r]]
        fit = dsfits.get((g, r))
        if fit is None:
            return np.nan, np.nan
        else:
            return fit.change_distribution_mean_std
    # dsname, g and r are unpacked but unused; only `fit` is mutated.
    for dsname,g,r,fit in iterate_fits(fits, return_keys=True):
        # NOTE(review): `shape` and `bin_edges` are free names here -- see docstring.
        weights = calc_bootstrap_change_distribution(shape, fit.theta_samples, bin_edges)
        fit.change_distribution_weights = weights
        # NOTE(review): `bin_centers` is likewise not defined in this function.
        fit.change_distribution_spread = change_distribution_spread_cumsum(bin_centers, weights)
        fit.change_distribution_mean_std = change_distribution_mean_and_std(bin_centers, weights)

def calc_bootstrap_change_distribution(shape, theta_samples, bin_edges):
    """Return the mean of calc_change_distribution across parameter samples.

    NOTE(review): an earlier definition with this name and the same logic
    exists above in this file; being later, this one is the binding that
    remains after the module is loaded.

    Args:
        shape: forwarded as-is to calc_change_distribution.
        theta_samples: 2-D array of shape (n_params, n_samples); column i is
            used as the parameter vector of sample i.
        bin_edges: forwarded to bin_edges_to_centers and
            calc_change_distribution.

    Returns:
        Array shaped like bin_edges_to_centers(bin_edges) containing the
        accumulated per-sample results divided by n_samples.
    """
    bin_mids = bin_edges_to_centers(bin_edges)
    # The two-name unpack doubles as a check that theta_samples is 2-D.
    _n_params, n_draws = theta_samples.shape
    total = np.zeros(bin_mids.shape)
    for draw in xrange(n_draws):
        total += calc_change_distribution(
            shape, theta_samples[:, draw], bin_edges)
    # now values are in fraction of total change (doesn't have to sum up to 1
    # if ages don't cover the whole transition range)
    total /= n_draws
    return total

@cache(lambda data, fitter, fits: join(cache_dir(), fit_results_relative_path(data,fitter) + '-dprime-cube.pkl'))
def compute_dprime_measures_for_all_pairs(data, fitter, fits):
    genes = data.gene_names
    regions = data.region_names 
    r2ds = data.region_to_dataset()        
    cube_shape = (len(genes), len(regions), len(regions))
    d_mu = np.empty(cube_shape) # mu2-mu1 for all genes and region pairs
    std = np.empty(cube_shape) # std (combined) for all genes and region pairs
    def get_mu_std(g,r):
        dsfits = fits[r2ds[r]]
        fit = dsfits.get((g,r))
        if fit is None:
            return np.nan, np.nan
        else:
            return fit.change_distribution_mean_std
    for ig,g in enumerate(genes):