Exemple #1
0
def do_gene_fits(data, gene, fitter, filename, b_show):
    """Plot one gene, save the figure and optionally display it.

    The figure comes from ``plot_gene(data, gene)``. When ``filename`` is None
    it is written to ``fit.png`` under ``results_dir()`` (created if needed).
    ``fitter`` is part of the signature but is not used in this function.
    If ``b_show`` is true a blocking matplotlib window is opened afterwards.
    """
    figure = plot_gene(data, gene)

    if filename is not None:
        target = filename
    else:
        ensure_dir(results_dir())
        target = join(results_dir(), 'fit.png')

    print('Saving figure to {}'.format(target))
    save_figure(figure, target)

    if not b_show:
        return
    plt.show(block=True)
Exemple #2
0
def do_one_fit(series, fitter, loo, filename, b_show):
    """Plot a single series, with its fit when a fitter is supplied, and save it.

    With a fitter, ``fitter.fit`` is run on ``series.ages`` and
    ``series.single_expression`` (``loo`` is forwarded), and the fitter's shape,
    theta and leave-one-out predictions are handed to ``plot_one_series``.
    Without one, only the series itself is plotted.

    The figure is saved to ``filename`` (``fit.png`` under ``results_dir()``
    when None) and shown in a blocking window if ``b_show`` is true.
    """
    if fitter is None:
        figure = plot_one_series(series)
    else:
        fit_output = fitter.fit(series.ages, series.single_expression, loo=loo)
        theta, _sigma, loo_predictions, _ = fit_output
        figure = plot_one_series(series, fitter.shape, theta, loo_predictions)

    target = filename
    if target is None:
        ensure_dir(results_dir())
        target = join(results_dir(), 'fit.png')
    save_figure(figure, target, print_filename=True)

    if not b_show:
        return
    plt.show(block=True)
def fit_serveral_genes(series, fitter, loo, filename, b_show):
    """Plot a multi-gene series, optionally with its fit, and save the figure.

    When ``fitter`` is given it is fit on ``series.ages`` / ``series.expression``
    (``loo`` is forwarded), the second value it returns is printed as ``L``, and
    theta plus the leave-one-out predictions are passed to ``plot_series``.

    The figure is saved to ``filename`` (``fits.png`` under ``results_dir()``
    when None) and shown in a blocking window if ``b_show`` is true.
    """
    if fitter is None:
        figure = plot_series(series)
    else:
        theta, L, loo_predictions, _ = fitter.fit(series.ages, series.expression, loo=loo)
        print('L = {}'.format(L))
        figure = plot_series(series, fitter.shape, theta, loo_predictions)

    target = filename
    if target is None:
        ensure_dir(results_dir())
        target = join(results_dir(), 'fits.png')
    print('Saving figure to {}'.format(target))
    save_figure(figure, target)

    if b_show:
        plt.show(block=True)
def create_top_correlations_html(data, fitter, fits, scores, regions, n_top=None):
    """Build the 'top-gradual-maturation' HTML page for the first ``n_top`` genes.

    ``scores`` is a list of ``(gene, r, pval, lst_R2)`` tuples. It is sorted
    IN PLACE by ascending ``r``, and the first ``n_top`` entries (all of them
    when ``n_top`` is None) provide the gene list plus the 'r' and 'p-value'
    columns handed to ``create_html``.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)

    def correlation_of(entry):
        _gene, rho, _pval, _r2s = entry
        return rho

    scores.sort(key=correlation_of)

    # One pass over the leading entries fills the ordered gene list and both lookup tables.
    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, rho, pval, _r2s in scores[:n_top]:
        top_genes.append(gene)
        top_scores[gene] = rho
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # theta unpacks to four values; the third (mu) is un-scaled with the
        # module-level age_scaler and reported in years.
        _a, _h, mu, _ = fit.theta
        onset_age = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(onset_age), ''

    html_options = dict(
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
    create_html(data, fitter, fits, basedir, 'gene-subplot', 'gene-region-fits', **html_options)
def export_cube():
    """Export the pickled region-pair timing cube to ``export/cube.mat``.

    Loads the cube from ``RegionPairTiming.cube_filename`` and saves its
    ``d_mu`` and ``std`` arrays, their element-wise ratio, the gene and region
    names, the age-scaler cache name and an embedded README into a .mat file
    under ``results_dir()``.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # The README is stored inside the .mat file, so its section headings must
    # match the variable names saved below (the d' array is saved as "scores").
    README = """\
d_mu:
mu(r2)-mu(r1) for every gene and region pair. 
Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: 
The combined standard deviation of the two change distributions.
std = sqrt(0.5*(std1^2 + std2^2))
Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
The d' for the two change distributions. Equal to d_mu ./ combined_std.
Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE = README,
        genes = list_of_strings_to_matlab_cell_array(cube.genes),
        regions = list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler = scalers.unify(cube.age_scaler).cache_name(),
        d_mu = cube.d_mu,
        combined_std = cube.std,
        scores = cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
Exemple #6
0
def export_cube():
    """Write the region-pair timing cube to a MATLAB file.

    The cube is unpickled from ``RegionPairTiming.cube_filename``. The output
    file ``export/cube.mat`` (under ``results_dir()``) holds ``d_mu``,
    ``combined_std`` (the cube's ``std``), ``scores`` (``d_mu / std``), the
    gene and region names, the age-scaler cache name and a README string
    describing each variable.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # Keep the README headings in sync with the keys of mdict: the d' array
    # is exported as "scores", so that is the name documented here.
    README = """\
d_mu:
mu(r2)-mu(r1) for every gene and region pair. 
Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: 
The combined standard deviation of the two change distributions.
std = sqrt(0.5*(std1^2 + std2^2))
Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
The d' for the two change distributions. Equal to d_mu ./ combined_std.
Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
Exemple #7
0
def fit_serveral_genes(series, fitter, loo, filename, b_show):
    """Fit (optionally) and plot a series holding several genes, then save the plot.

    If ``fitter`` is not None, ``fitter.fit(series.ages, series.expression,
    loo=loo)`` is called, its second return value is printed as ``L``, and the
    fitter's shape, theta and leave-one-out predictions are added to the
    ``plot_series`` arguments. The figure goes to ``filename``, defaulting to
    ``fits.png`` under ``results_dir()``; ``b_show`` opens a blocking window.
    """
    # Positional arguments for plot_series; extended only when a fit is made.
    plot_args = [series]
    if fitter is not None:
        fit_output = fitter.fit(series.ages, series.expression, loo=loo)
        theta, L, loo_predictions, _ = fit_output
        print('L = {}'.format(L))
        plot_args += [fitter.shape, theta, loo_predictions]
    figure = plot_series(*plot_args)

    if filename is None:
        ensure_dir(results_dir())
        filename = join(results_dir(), 'fits.png')
    print('Saving figure to {}'.format(filename))
    save_figure(figure, filename)

    if not b_show:
        return
    plt.show(block=True)
def create_top_genes_html(data, fitter, fits, scores, regions, n_top=None, filename_suffix=''):
    """Write the HTML report for the genes with the lowest t-test p-values.

    Args:
        data, fitter, fits: forwarded to ``create_html``; ``data`` and
            ``fitter`` also determine the output directory under ``results_dir()``.
        scores: list of ``(gene, pval, qval)`` tuples. NOTE: sorted in place
            by ascending p-value.
        regions: region names; the first two appear in the summary text and
            the whole list is passed to ``create_html``.
        n_top: number of leading genes to include (all of them when None).
        filename_suffix: appended to the base name 'gradual-maturation-t-test'.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval
    scores.sort(key=key_func)
    top = scores[:n_top]
    top_genes = [g for g, pval, qval in top]
    top_pvals = {g: pval for g, pval, qval in top}
    top_qvals = {g: qval for g, pval, qval in top}

    # The summary counts cover ALL scored genes, not just the top ones.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n=n, n01=n01)

    def get_onset_time(fit):
        # Return (text, css_class) like get_onset_dist below, so every entry
        # of extra_fields_per_fit has the same shape (this used to return a
        # bare string).
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        return txt, cls

    def get_onset_dist(fit):
        # Row 2 of theta_samples holds the sampled mu values; report their
        # mean and 20th/80th percentiles after un-scaling.
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names = top_genes,
        region_names = regions,
        extra_columns = [('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit = [get_onset_time, get_onset_dist],
        b_inline_images = True,
        inline_image_size = '30%',
        b_R2_dist = False,
        ttl = 'Fit for genes with top t-test scores',
        top_text = top_text,
        filename = 'gradual-maturation-t-test' + filename_suffix,
    )
def export_cytoscape(timing, pval_cutoff):
    """Write Cytoscape input files for the pathway results passing the cutoff.

    Results come from ``timing.analyze_all_pathways()`` with the "PFC" region
    excluded. An entry is kept when ``-log10(pval) > pval_cutoff``. Two files
    are written under ``results_dir()/cytoscape``: ``regions.sif`` with one
    "r1 pathway r2" line per kept entry, and ``edge_weights.attrs`` with an
    "EdgeWeights" header followed by one weight line per kept entry.
    """
    analysis = timing.analyze_all_pathways().filter_regions(exclude=["PFC"])

    def to_token(pathway_name):
        # Collapse every whitespace run to a dash so the name is one token.
        return re.sub(r"\s+", "-", pathway_name)

    def weight_for(p_value):
        # Capped at 200.
        return min(200, int(-50 / np.log10(p_value)))

    edges = []
    for item in analysis.res:
        if -np.log10(item.pval) > pval_cutoff:
            edges.append((item.r1, to_token(item.pathway), item.r2, weight_for(item.pval)))

    sif_lines = []
    for src, name, dst, _weight in edges:
        sif_lines.append("{} {} {}".format(src, name, dst))
    save_file(join(results_dir(), "cytoscape", "regions.sif"), sif_lines)

    attr_lines = ["EdgeWeights"]
    for src, name, dst, weight in edges:
        attr_lines.append("{} ({}) {} = {}".format(src, name, dst, weight))
    save_file(join(results_dir(), "cytoscape", "edge_weights.attrs"), attr_lines)
def export_cytoscape(timing, pval_cutoff):
    """Export significant region-pair pathway results in Cytoscape formats.

    Takes all pathway results from ``timing`` (region 'PFC' excluded), keeps
    those whose ``-log10(pval)`` exceeds ``pval_cutoff`` and writes
    ``regions.sif`` and ``edge_weights.attrs`` into the ``cytoscape`` folder
    under ``results_dir()``.
    """
    def out_path(basename):
        return join(results_dir(), 'cytoscape', basename)

    def is_significant(entry):
        return -np.log10(entry.pval) > pval_cutoff

    def as_edge(entry):
        # (region 1, whitespace-free pathway name, region 2, weight capped at 200)
        name = re.sub(r'\s+', '-', entry.pathway)
        weight = min(200, int(-50 / np.log10(entry.pval)))
        return (entry.r1, name, entry.r2, weight)

    filtered = timing.analyze_all_pathways().filter_regions(exclude=['PFC'])
    edges = [as_edge(entry) for entry in filtered.res if is_significant(entry)]

    network = ['{} {} {}'.format(a, name, b) for a, name, b, _w in edges]
    save_file(out_path('regions.sif'), network)

    weights = ['{} ({}) {} = {}'.format(a, name, b, w) for a, name, b, w in edges]
    save_file(out_path('edge_weights.attrs'), ['EdgeWeights'] + weights)
def export_pathways():
    """Export pathway membership and pathway lists to ``export/pathways.mat``.

    Gene indices refer to the gene order of the change distribution pickled at
    ``SingleRegion.change_dist_filename``; pathway indices refer to the order
    of ``pathway_names``. All indices are one-based for MATLAB.

    Raises:
        KeyError: if a pathway gene is missing from the change distribution's
            genes, or a listed pathway is not among the known pathways.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}  # NOTE that matlab is one based

    pathways = pathway_lists.read_all_pathways()
    pathway_names = list(pathways.keys())  # make sure the order stays fixed

    # Fill pre-allocated 1-D object arrays element by element.
    # np.array([...], dtype=object) would collapse into a 2-D array whenever
    # all pathways have the same number of genes (or there is only one
    # pathway), and the result would no longer be one cell per pathway.
    n_pathways = len(pathway_names)
    pathway_genes_names = np.empty(n_pathways, dtype=object)
    pathway_genes_idx = np.empty(n_pathways, dtype=object)
    for i, p in enumerate(pathway_names):
        genes = pathways[p]
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(genes)
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in genes])

    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}  # NOTE matlab indexing is one based
    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]
    README = """\
pathway_names:
Cell array of all pathway names. The name in cell number k is the name of the
pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
Cell array (size <n-pathways>). Each cell contains a cell array of strings which 
are the gene symbols of the genes in that pathway.

pathway_genes_idx:
Same as pathway_genes_names, but each cell in the outer cell array is now an 
array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat.
Hopefully this should be easier to use in matlab.

list_names:
Names of pathway lists prepared by Noa

list_pathway_names:
Cell array. One item per list. Each item is a cell array of strings which are 
the names of the pathways belonging to that list.

list_pathway_idx:
Same as list_pathway_names, but instead of listing the pathways by name, they 
are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS = README,
        pathway_names = list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names = pathway_genes_names,
        pathway_genes_idx = pathway_genes_idx,
        list_names = list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names = list_pathway_names,
        list_pathway_idx = list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def save_scores(singles, scores, order):
    """Write the per-pathway Spearman ordering scores to a fixed-width text file.

    The file is ``pathway-spearman-<regions joined by '-'>.txt`` under
    ``results_dir()``. It starts with the region order and a header row, then
    has one row per ``(logpval, pval, sr, pathway)`` entry of ``scores``.
    Pathway names longer than 55 characters are cut and suffixed with '...';
    the gene count is the size of ``singles.pathways[pathway]``.
    """
    filename = join(results_dir(), 'pathway-spearman-{}.txt'.format('-'.join(order)))
    print('Saving ordering results to {}'.format(filename))

    row_format = '{pathway:<60}{pathway_size:<7}{logpval:<15.3g}{pval:<10.3g}{sr:<15.3g}'
    with open(filename, 'w') as f:
        f.write('Region Order: {}'.format(' '.join(order)) + '\n')
        header = '{:<60}{:<7}{:<15}{:<10}{:<15}'.format('pathway', 'nGenes', '-log10(pval)', 'pval', 'Spearman rho')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for logpval, pval, sr, pathway in scores:
            n_genes = len(singles.pathways[pathway])
            # Truncate long names so they fit the 60-character column.
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = row_format.format(pathway=label, pathway_size=n_genes,
                                    logpval=logpval, pval=pval, sr=sr)
            f.write(row + '\n')
Exemple #13
0
 def save_top_results(self, n=50):
     """Write the first ``n`` entries of ``self.res`` to a fixed-width text file.

     The file is ``dprime-top-results-<self.filename_suffix>.txt`` under
     ``results_dir()``. Each row shows the pathway name (cut to 55 characters
     plus '...' when longer), its size, the two regions, -log10 of the
     p-value, and the score, delta, weighted delta and mu values of the entry.
     """
     filename = join(results_dir(), 'dprime-top-results-{}.txt'.format(self.filename_suffix))
     print('Saving top {} results to {}'.format(n, filename))

     column_titles = ('pathway', 'nGenes', 'r1', 'r2', '-log10(pval)', 'score', 'delta', 'w-delta', 'mu1 yrs', 'mu2 yrs')
     row_format = '{pathway:<60}{x.pathway_size:<7}{x.r1:<5}{x.r2:<5}{logpval:<15.3g}{x.score:<10.3g}{x.delta:<10.3g}{x.weighted_delta:<10.3g}{x.mu1_years:<10.3g}{x.mu2_years:<10.3g}'
     with open(filename, 'w') as f:
         header = '{:<60}{:<7}{:<5}{:<5}{:<15}{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*column_titles)
         f.write(header + '\n')
         f.write('-' * len(header) + '\n')
         for entry in self.res[:n]:
             log_p = -np.log10(entry.pval)
             name = entry.pathway
             if len(name) > 55:
                 name = name[:55] + '...'
             f.write(row_format.format(pathway=name, x=entry, logpval=log_p) + '\n')
Exemple #14
0
def save_figure(fig, filename, b_close=False, b_square=True, show_frame=False, under_results=False, print_filename=False):
    """Resize a matplotlib figure to the configured size and save it.

    Args:
        fig: figure to save.
        filename: target path; taken relative to ``results_dir()`` when
            ``under_results`` is true (the parent directory is then created).
        b_close: close the figure after saving.
        b_square: use ``cfg.default_figure_size_x_square`` as the width
            instead of ``cfg.default_figure_size_x``.
        show_frame: use ``cfg.default_figure_facecolor`` as the face colour
            instead of white.
        under_results: see ``filename``.
        print_filename: print the target path even when ``cfg.verbosity`` is
            below 1.
    """
    if under_results:
        filename = join(results_dir(), filename)
        ensure_dir(os.path.dirname(filename))

    if cfg.verbosity >= 1 or print_filename:
        print('Saving figure to {}'.format(filename))

    if b_square:
        width = cfg.default_figure_size_x_square
    else:
        width = cfg.default_figure_size_x
    fig.set_size_inches(width, cfg.default_figure_size_y)

    facecolor = cfg.default_figure_facecolor if show_frame else 'white'
    fig.savefig(filename, facecolor=facecolor, dpi=cfg.default_figure_dpi)

    if not b_close:
        return
    plt.close(fig)
def save_scores(singles, scores, order):
    """Dump pathway Spearman scores for a given region order as a text table.

    Output goes to ``pathway-spearman-<order joined by '-'>.txt`` under
    ``results_dir()``: a 'Region Order' line, a header, a dashed rule, then
    one line per ``(logpval, pval, sr, pathway)`` tuple in ``scores``. Each
    line also shows the number of genes in ``singles.pathways[pathway]``.
    """
    filename = join(results_dir(),
                    'pathway-spearman-{}.txt'.format('-'.join(order)))
    print('Saving ordering results to {}'.format(filename))

    def shorten(name):
        # Names over 55 characters are cut and marked with an ellipsis.
        if len(name) > 55:
            return name[:55] + '...'
        return name

    with open(filename, 'w') as f:
        def emit(text):
            f.write(text + '\n')

        emit('Region Order: {}'.format(' '.join(order)))
        header = '{:<60}{:<7}{:<15}{:<10}{:<15}'.format(
            'pathway', 'nGenes', '-log10(pval)', 'pval', 'Spearman rho')
        emit(header)
        emit('-' * len(header))
        for logpval, pval, sr, pathway in scores:
            size = len(singles.pathways[pathway])
            emit('{pathway:<60}{pathway_size:<7}{logpval:<15.3g}{pval:<10.3g}{sr:<15.3g}'.format(
                pathway=shorten(pathway), pathway_size=size,
                logpval=logpval, pval=pval, sr=sr))
Exemple #16
0
def export_singles():
    """Export the per-region change distributions to a MATLAB file.

    The change-distribution object is unpickled from
    ``SingleRegion.change_dist_filename``. Its mu/std arrays, histogram
    weights, bin edges and centers, gene and region names, the age-scaler
    cache name and a README string are saved to
    ``export/change-distributions.mat`` under ``results_dir()``.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    readme_text = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    gene_cells = list_of_strings_to_matlab_cell_array(change_dist.genes)
    region_cells = list_of_strings_to_matlab_cell_array(change_dist.regions)
    scaler_name = scalers.unify(change_dist.age_scaler).cache_name()

    mdict = {
        'README_CHANGE_DISTRIBUTIONS': readme_text,
        'genes': gene_cells,
        'regions': region_cells,
        'age_scaler': scaler_name,
        'mu': change_dist.mu,
        'std': change_dist.std,
        'bin_edges': change_dist.bin_edges,
        'bin_centers': change_dist.bin_centers,
        'weights': change_dist.weights,
    }
    target = join(results_dir(), 'export', 'change-distributions.mat')
    save_matfile(mdict, target)
Exemple #17
0
def create_top_correlations_html(data,
                                 fitter,
                                 fits,
                                 scores,
                                 regions,
                                 n_top=None):
    """Generate the HTML page listing fits for the leading correlation scores.

    ``scores`` holds ``(gene, r, pval, lst_R2)`` tuples and is sorted in place
    by ascending ``r``. The first ``n_top`` of them (default: all) are shown,
    with 'r' and 'p-value' as extra columns and an onset-time field per fit.
    Output is written by ``create_html`` into the fit-results directory for
    ``data`` and ``fitter`` under ``results_dir()``.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def by_correlation(entry):
        _g, rho, _p, _r2 = entry
        return rho

    scores.sort(key=by_correlation)
    leading = scores[:n_top]

    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, rho, pval, _r2 in leading:
        top_genes.append(gene)
        top_scores[gene] = rho
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # The third element of theta (mu) is converted back with the
        # module-level age_scaler; the css class is left empty.
        _a, _h, mu, _ = fit.theta
        return 'onset = {:.3g} years'.format(age_scaler.unscale(mu)), ''

    extra_columns = [('r', top_scores), ('p-value', top_pvals)]
    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=extra_columns,
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
def export_singles():
    """Save the single-region change distributions as ``change-distributions.mat``.

    Reads the pickled object at ``SingleRegion.change_dist_filename`` and
    stores its arrays, names, age-scaler cache name and a README describing
    each variable in the ``export`` folder under ``results_dir()``.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # Assemble the MATLAB variables one by one: text first, then names, then arrays.
    mdict = {}
    mdict['README_CHANGE_DISTRIBUTIONS'] = README
    mdict['genes'] = list_of_strings_to_matlab_cell_array(change_dist.genes)
    mdict['regions'] = list_of_strings_to_matlab_cell_array(change_dist.regions)
    mdict['age_scaler'] = scalers.unify(change_dist.age_scaler).cache_name()
    for array_name in ('mu', 'std', 'bin_edges', 'bin_centers', 'weights'):
        mdict[array_name] = getattr(change_dist, array_name)

    save_matfile(mdict, join(results_dir(), 'export', 'change-distributions.mat'))
Exemple #19
0
 def save_top_results(self, n=50):
     """Save a text table of the first ``n`` results in ``self.res``.

     Written to ``dprime-top-results-<self.filename_suffix>.txt`` under
     ``results_dir()``: a header, a dashed rule, then one fixed-width row per
     result. Pathway names above 55 characters are shortened with '...'.
     """
     filename = join(
         results_dir(),
         'dprime-top-results-{}.txt'.format(self.filename_suffix))
     print('Saving top {} results to {}'.format(n, filename))

     def shorten(name):
         return name[:55] + '...' if len(name) > 55 else name

     with open(filename, 'w') as f:
         header = '{:<60}{:<7}{:<5}{:<5}{:<15}{:<10}{:<10}{:<10}{:<10}{:<10}'.format(
             'pathway', 'nGenes', 'r1', 'r2', '-log10(pval)', 'score',
             'delta', 'w-delta', 'mu1 yrs', 'mu2 yrs')
         rows = [header, '-' * len(header)]
         for result in self.res[:n]:
             # The row template reads most fields straight off the result as `x`.
             rows.append(
                 '{pathway:<60}{x.pathway_size:<7}{x.r1:<5}{x.r2:<5}{logpval:<15.3g}{x.score:<10.3g}{x.delta:<10.3g}{x.weighted_delta:<10.3g}{x.mu1_years:<10.3g}{x.mu2_years:<10.3g}'.format(
                     pathway=shorten(result.pathway),
                     x=result,
                     logpval=-np.log10(result.pval)))
         for row in rows:
             f.write(row + '\n')
Exemple #20
0
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, 
                              do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False,
                              k_of_n=None, 
                              use_correlations=False, correlations=None,
                              show_change_distributions=False,
                              html_kw=None,
                              figure_kw=None):
    """Render the plots for a set of fits and build the HTML pages around them.

    Args:
        data: dataset; ``data.gene_names`` is compared with the pathway genes.
        fitter: fitter the fits belong to; also selects the default ``basedir``.
        fits: precomputed fits; ``get_all_fits(data, fitter, k_of_n)`` is
            called when None.
        basedir: output directory. Defaults to the fit-results path under
            ``results_dir()``, plus 'with-correlations' when
            ``use_correlations`` is set.
        do_genes, do_series, do_hist, do_html: switches for the per-gene
            plots, per-series plots, score-distribution page and main HTML.
        only_main_html: skip every plotting step and only write the main HTML.
        k_of_n: forwarded to ``get_all_fits``; when not None the
            score-distribution page and the main HTML are skipped.
        use_correlations: forwarded to the series plots, score page and HTML.
        correlations: when given together with ``use_correlations``, gene
            correlation plots are produced and linked from the HTML.
        show_change_distributions: forwarded to the gene and series plotters.
        html_kw: extra keyword arguments for ``create_html`` (default: none).
        figure_kw: passed to ``plot_and_save_all_series`` (default: empty dict).
    """
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    if html_kw is None:
        html_kw = {}
    if figure_kw is None:
        figure_kw = {}
    print('Writing HTML under {}'.format(basedir))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'
    if do_genes and not only_main_html:  # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(data, fitter, fits, join(basedir, gene_dir), show_change_distributions)
    if do_series and not only_main_html:
        plot_and_save_all_series(data, fitter, fits, join(basedir, series_dir), use_correlations, show_change_distributions, figure_kw)
    if do_hist and k_of_n is None and not only_main_html:
        create_score_distribution_html(fits, use_correlations, join(basedir, scores_dir))
    if do_html and k_of_n is None:
        link_to_correlation_plots = use_correlations and correlations is not None
        if link_to_correlation_plots and not only_main_html:
            plot_and_save_all_gene_correlations(data, correlations, join(basedir, correlations_dir))
        dct_pathways = load_17_pathways_breakdown()
        # set().union(...) also works for an empty pathway dict, where
        # set.union(*[]) would raise TypeError.
        pathway_genes = set().union(*dct_pathways.values())
        data_genes = set(data.gene_names)
        missing = pathway_genes - data_genes
        # simple heuristic to create pathways only if we have most of the genes (currently 61 genes are missing)
        # (// keeps the integer division the Python 2 "/" performed here)
        b_pathways = len(missing) < len(pathway_genes) // 2
        create_html(
            data, fitter, fits, basedir, gene_dir, series_dir, scores_dir, correlations_dir=correlations_dir,
            use_correlations=use_correlations, link_to_correlation_plots=link_to_correlation_plots, 
            b_pathways=b_pathways, **html_kw
        )
def compute_region_ordering(singles):
    """Write, for every pathway, its regions ordered from earliest to latest mu.

    Timings come from ``singles.region_timings_per_pathway()`` (pathway ->
    {region -> mu}). Regions are ordered by ascending ``(mu, region)``. The
    table goes to ``dprime-region-ordering-<singles.listname>.txt`` under
    ``results_dir()``, one row per pathway with its gene count.
    """
    timings = singles.region_timings_per_pathway()  # pathway -> { r -> mu }

    sorted_timings = {}  # pathway -> list of regions (sorted by mu)
    for pathway, region_to_mu in timings.iteritems():
        pairs = [(mu, region) for region, mu in region_to_mu.iteritems()]
        pairs.sort()
        sorted_timings[pathway] = [region for _mu, region in pairs]

    filename = join(results_dir(), 'dprime-region-ordering-{}.txt'.format(singles.listname))
    print('Saving ordering results to {}'.format(filename))
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{}'.format('pathway', 'nGenes', 'Regions (early to late)')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for pathway, regions_in_order in sorted_timings.iteritems():
            n_genes = len(singles.pathways[pathway])
            # Long pathway names are cut to fit the 60-character column.
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = '{pathway:<60}{pathway_size:<7}{ordered_regions}'.format(
                pathway=label, pathway_size=n_genes, ordered_regions=' '.join(regions_in_order))
            f.write(row + '\n')
Exemple #22
0
def save_figure(fig,
                filename,
                b_close=False,
                b_square=True,
                show_frame=False,
                under_results=False,
                print_filename=False):
    """Save ``fig`` to ``filename`` with the configured size, colour and dpi.

    With ``under_results`` the filename is joined onto ``results_dir()`` and
    its directory is created first. The path is printed when
    ``cfg.verbosity >= 1`` or ``print_filename`` is set. ``b_square`` selects
    the square width from ``cfg``; ``show_frame`` selects the configured face
    colour over white; ``b_close`` closes the figure afterwards.
    """
    target = filename
    if under_results:
        target = join(results_dir(), target)
        ensure_dir(os.path.dirname(target))

    if cfg.verbosity >= 1 or print_filename:
        print('Saving figure to {}'.format(target))

    fig.set_size_inches(
        cfg.default_figure_size_x_square if b_square else cfg.default_figure_size_x,
        cfg.default_figure_size_y)

    facecolor = 'white'
    if show_frame:
        facecolor = cfg.default_figure_facecolor
    fig.savefig(target, facecolor=facecolor, dpi=cfg.default_figure_dpi)

    if b_close:
        plt.close(fig)
def compute_region_ordering(singles):
    """Save the early-to-late region order of every pathway as a text table.

    For each pathway in ``singles.region_timings_per_pathway()`` the regions
    are sorted by ``(mu, region)`` ascending. Rows are written to
    ``dprime-region-ordering-<singles.listname>.txt`` under ``results_dir()``
    with the pathway name (cut to 55 characters plus '...' when longer), the
    size of ``singles.pathways[pathway]`` and the space-separated regions.
    """
    def regions_by_time(region_to_mu):
        timed = sorted((mu, region) for region, mu in region_to_mu.iteritems())
        return [region for _mu, region in timed]

    timings = singles.region_timings_per_pathway()  # pathway -> { r -> mu }
    sorted_timings = {}  # pathway -> list of regions (sorted by mu)
    for pathway, region_to_mu in timings.iteritems():
        sorted_timings[pathway] = regions_by_time(region_to_mu)

    filename = join(results_dir(),
                    'dprime-region-ordering-{}.txt'.format(singles.listname))
    print('Saving ordering results to {}'.format(filename))

    row_template = '{pathway:<60}{pathway_size:<7}{ordered_regions}'
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{}'.format('pathway', 'nGenes',
                                        'Regions (early to late)')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for pathway, region_list in sorted_timings.iteritems():
            size = len(singles.pathways[pathway])
            shown_name = pathway
            if len(shown_name) > 55:
                shown_name = shown_name[:55] + '...'
            f.write(row_template.format(pathway=shown_name,
                                        pathway_size=size,
                                        ordered_regions=' '.join(region_list)) + '\n')
def create_top_genes_html(data,
                          fitter,
                          fits,
                          scores,
                          regions,
                          n_top=None,
                          filename_suffix=''):
    """Create the t-test HTML report for the most significant genes.

    Args:
        data, fitter, fits: handed to ``create_html``; ``data`` and ``fitter``
            also pick the output directory under ``results_dir()``.
        scores: list of ``(gene, pval, qval)`` tuples; sorted IN PLACE by
            ascending p-value.
        regions: region names; ``regions[0]`` and ``regions[1]`` are named in
            the summary text at the top of the page.
        n_top: how many leading genes to show (all when None).
        filename_suffix: appended to 'gradual-maturation-t-test'.
    """
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval

    scores.sort(key=key_func)
    top_genes = [g for g, pval, qval in scores[:n_top]]
    top_pvals = {g: pval for g, pval, qval in scores[:n_top]}
    top_qvals = {g: qval for g, pval, qval in scores[:n_top]}

    # Significance counts are taken over every scored gene.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n=n, n01=n01)

    def get_onset_time(fit):
        # Every per-fit field callback yields a (text, css_class) pair;
        # previously this one returned only the text, unlike get_onset_dist.
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        return txt, cls

    def get_onset_dist(fit):
        # Mean and 20th/80th percentiles of the sampled mu values (row 2 of
        # theta_samples), converted back with age_scaler.
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(
            mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data,
        fitter,
        fits,
        basedir,
        gene_dir,
        series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit=[get_onset_time, get_onset_dist],
        b_inline_images=True,
        inline_image_size='30%',
        b_R2_dist=False,
        ttl='Fit for genes with top t-test scores',
        top_text=top_text,
        filename='gradual-maturation-t-test' + filename_suffix,
    )
Exemple #25
0
def export_pathways():
    """Write pathway gene membership and pathway lists to ``export/pathways.mat``.

    Gene indices follow the gene order of the change distribution loaded from
    ``SingleRegion.change_dist_filename``. Pathway indices follow the order of
    the exported ``pathway_names``. Both are one-based, for MATLAB.

    Raises:
        KeyError: when a pathway gene is not among the change distribution's
            genes, or a pathway named in a list is not a known pathway.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    # NOTE that matlab is one based
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}

    pathways = pathway_lists.read_all_pathways()
    pathway_names = list(pathways.keys())  # make sure the order stays fixed

    # One cell per pathway, assigned individually. Building these with
    # np.array([...], dtype=object) yields a 2-D array as soon as every
    # pathway has the same number of genes, which breaks the per-pathway
    # cell layout the README describes.
    pathway_genes_names = np.empty(len(pathway_names), dtype=object)
    pathway_genes_idx = np.empty(len(pathway_names), dtype=object)
    for i, p in enumerate(pathway_names):
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(
            pathways[p])
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in pathways[p]])

    # NOTE matlab indexing is one based
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}
    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(
            pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]
    README = """\
pathway_names:
Cell array of all pathway names. The name in cell number k is the name of the
pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
Cell array (size <n-pathways>). Each cell contains a cell array of strings which 
are the gene symbols of the genes in that pathway.

pathway_genes_idx:
Same as pathway_genes_names, but each cell in the outer cell array is now an 
array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat.
Hopefully this should be easier to use in matlab.

list_names:
Names of pathway lists prepared by Noa

list_pathway_names:
Cell array. One item per list. Each item is a cell array of strings which are 
the names of the pathways belonging to that list.

list_pathway_idx:
Same as list_pathway_names, but instead of listing the pathways by name, they 
are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
Exemple #26
0
def save_fits_and_create_html(data,
                              fitter,
                              fits=None,
                              basedir=None,
                              do_genes=True,
                              do_series=True,
                              do_hist=True,
                              do_html=True,
                              only_main_html=False,
                              k_of_n=None,
                              use_correlations=False,
                              correlations=None,
                              show_change_distributions=False,
                              exons_layout=False,
                              html_kw=None,
                              figure_kw=None):
    """Render all figures for a set of fits and build the HTML pages around them.

    Parameters:
        data: dataset object; its ``gene_names`` attribute is read here and the
            object is forwarded to the plotting/HTML helpers.
        fitter: forwarded to the helpers and used to derive the default output
            directory.
        fits: precomputed fits. When None they are obtained with
            ``get_all_fits(data, fitter, k_of_n)``.
        basedir: output directory. When None it defaults to
            ``results_dir()/fit_results_relative_path(data, fitter)``, with a
            ``with-correlations`` subdirectory appended if ``use_correlations``
            is set. An explicit ``basedir`` is used as given.
        do_genes, do_series, do_hist, do_html: switches for the per-gene plots,
            the per-series plots, the score distribution page and the main
            HTML respectively.
        only_main_html: when True, every plotting step is skipped and only the
            main HTML is (re)created.
        k_of_n: forwarded to ``get_all_fits``. When it is not None the score
            distribution page and the main HTML are skipped.
        use_correlations, correlations: correlation plots are produced and
            linked only when ``use_correlations`` is true and ``correlations``
            is not None.
        show_change_distributions, exons_layout: forwarded to the plotting
            helpers; ``exons_layout`` also enables the exon plots.
        html_kw: extra keyword arguments for ``create_html`` (default: none).
        figure_kw: forwarded to ``plot_and_save_all_series`` (default: empty).
    """
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    if html_kw is None:
        html_kw = {}
    if figure_kw is None:
        figure_kw = {}
    # Single-argument call form: prints the same text whether or not
    # print_function is in effect.
    print('Writing HTML under {}'.format(basedir))
    ensure_dir(basedir)

    # Sub-directory names (relative to basedir) shared by the plots and the HTML.
    gene_dir = 'gene-subplot'
    exons_dir = 'exons_subplot_series' if cfg.exons_plots_from_series else 'exons_subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'

    make_plots = not only_main_html
    if do_genes and make_plots:  # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(data, fitter, fits, join(basedir, gene_dir),
                                show_change_distributions)
    if do_series and make_plots:
        plot_and_save_all_series(data, fitter, fits, join(basedir, series_dir),
                                 use_correlations, show_change_distributions,
                                 exons_layout, figure_kw)
    if exons_layout and make_plots:
        if cfg.exons_plots_from_series:
            plot_and_save_all_exons_from_series(fits, join(basedir, exons_dir),
                                                join(basedir, series_dir))
        else:
            plot_and_save_all_exons(data, fitter, fits,
                                    join(basedir, exons_dir))
    if do_hist and k_of_n is None and make_plots:
        create_score_distribution_html(fits, use_correlations,
                                       join(basedir, scores_dir))
    if do_html and k_of_n is None:
        link_to_correlation_plots = use_correlations and correlations is not None
        if link_to_correlation_plots and make_plots:
            plot_and_save_all_gene_correlations(
                data, correlations, join(basedir, correlations_dir))
        dct_pathways = load_17_pathways_breakdown()
        # set().union(...) rather than set.union(...): the unbound form raises
        # TypeError when there are no pathways at all (or when the first value
        # is not a plain set); this form yields an empty set instead.
        pathway_genes = set().union(*dct_pathways.values())
        data_genes = set(data.gene_names)
        missing = pathway_genes - data_genes
        # simple heuristic to create pathways only if we have most of the genes
        # (currently 61 genes are missing)
        b_pathways = len(missing) < len(pathway_genes) / 2
        create_html(data,
                    fitter,
                    fits,
                    basedir,
                    gene_dir,
                    exons_dir,
                    series_dir,
                    scores_dir,
                    correlations_dir=correlations_dir,
                    use_correlations=use_correlations,
                    link_to_correlation_plots=link_to_correlation_plots,
                    b_pathways=b_pathways,
                    exons_layout=exons_layout,
                    **html_kw)
Exemple #27
0
if __name__ == '__main__':
    # Command-line entry point: compare the fits of two shapes on one dataset.
    # The first shape comes from the common parser options, the second from
    # the "...2" options added below.
    disable_all_warnings()
    parser = get_common_parser()
    parser.add_argument('--shape2', required=True, help='The shape to compare against', choices=allowed_shape_names())
    parser.add_argument('--scaling2', help='The scaling used when fitting shape2. Default: none', choices=allowed_scaler_names())
    parser.add_argument('--sigma_prior2', help='Prior to use for 1/sigma when fitting shape2. Default: None', choices=get_allowed_priors(is_sigma=True))
    parser.add_argument('--priors2', help='The priors used for theta when fitting shape2. Default: None', choices=get_allowed_priors())
    # The help text names the file that is actually written when --filename is
    # omitted (see the default chosen further down).
    parser.add_argument('--filename', help='Where to save the figure. Default: results/shape_comparison.png')
    parser.add_argument('--show', help='Show figure and wait before exiting', action='store_true')
    parser.add_argument('--ndiffs', type=int, default=5, help='Number of top diffs to show. Default=5.')
    args = parser.parse_args()

    # Second dataset is built from the same dataset/pathway/from_age/shuffle
    # arguments, but with the scaling chosen for shape2.
    data1, fitter1 = process_common_inputs(args)
    data2 = get_data_from_args(args.dataset, args.pathway, args.from_age, args.scaling2, args.shuffle)
    fitter2 = get_fitter_from_args(args.shape2, args.priors2, args.sigma_prior2)

    fits1 = get_all_fits(data1, fitter1)
    fits2 = get_all_fits(data2, fitter2)

    print_diff_points(data1, fitter1, fits1, data2, fitter2, fits2, args.ndiffs)

    fig = plot_comparison_scatter(data1, fitter1, fits1, data2, fitter2, fits2)

    filename = args.filename
    if filename is None:
        # No explicit target: fall back to shape_comparison.png in results_dir().
        ensure_dir(results_dir())
        filename = join(results_dir(), 'shape_comparison.png')
    save_figure(fig, filename)

    if args.show:
        plt.show(block=True)
Exemple #28
0
from __future__ import print_function

import setup
from os.path import join
from dev_stages import dev_stages
from scalers import LogScaler
from project_dirs import results_dir

# Write a fixed-width text table of the development stages to
# dev-stages.txt in results_dir(): one row per stage with its name, short
# label, central age, and the central age after applying a LogScaler.
filename = join(results_dir(), 'dev-stages.txt')
with open(filename, 'w') as f:
    scaler = LogScaler()
    column_titles = ('Full Name', 'Label', 'Age', 'Log Scale')
    header = '{:<30} {:<8} {:<10} {:<10}'.format(*column_titles)
    f.write(header + '\n')
    # Underline the header with a dash per header character.
    f.write('-' * len(header) + '\n')
    row_template = '{:<30} {:<8} {:<10.3g} {:<10.3g}'
    for stage in dev_stages:
        row = row_template.format(
            stage.name,
            stage.short_name,
            stage.central_age,
            stage.scaled(scaler).central_age,
        )
        f.write(row + '\n')
Exemple #29
0
# Compare the real R2 scores with the shuffled baseline: plot the z-scores,
# then write a short text report of the comparison.
mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
# z-score of each real score relative to the shuffled mean/std
z_scores = (R2 - mu_shuffled) / std_shuffled
fig = plot_z_scores(z_scores)
z_scores_png = 'RP/R2-z-scores-{}.png'.format(name)
save_figure(fig, z_scores_png, under_results=True, b_close=True)

T, signed_rank_p_value = wilcoxon(R2, R2_shuffled)
maxShuffled = R2_shuffled.max()
nTotal = len(R2)
nAbove = np.count_nonzero(R2 > maxShuffled)
pct = 100.0 * nAbove / nTotal
filename = join(results_dir(), 'RP/R2-distribution-{}.txt'.format(name))
with open(filename, 'w') as f:
    f.write('shuffled = {:.2g} +/- {:.2g}'.format(mu_shuffled, std_shuffled) + '\n')
    f.write('maximal shuffled score: {:.2g}'.format(maxShuffled) + '\n')
    f.write('{:.2g}% ({}/{}) of scores are above maximal shuffled score'.format(
        pct, nAbove, nTotal) + '\n')
    # Share of real scores whose z-score exceeds each integer threshold 1..5.
    for z_threshold in range(1, 6):
        nAbove = np.count_nonzero(z_scores > z_threshold)
        pct = 100.0 * nAbove / nTotal
        f.write('{:.2g}% ({}/{}) of z-scores are above {}'.format(
            pct, nAbove, nTotal, z_threshold) + '\n')
    f.write('wilxocon signed-rank p-value = {:.2g}'.format(signed_rank_p_value) + '\n')
Exemple #30
0
# Fetch the fits for the real and the shuffled data (new computation is not
# allowed here) and pair up their leave-one-out scores.
fits = get_all_fits(data, fitter, allow_new_computation=False)
fits_shuffled = get_all_fits(data_shuffled, fitter, allow_new_computation=False)
R2_pairs = [
    (fit.LOO_score, fit2.LOO_score)
    for fit, fit2 in iterate_fits(fits, fits_shuffled)
]
R2 = np.array([pair[0] for pair in R2_pairs])
R2_shuffled = np.array([pair[1] for pair in R2_pairs])

name = '{}-{}'.format(data.pathway, shape.cache_name())
fig = plot_score_distribution(R2, R2_shuffled)
distribution_png = 'RP/R2-distribution-{}.png'.format(name)
save_figure(fig, distribution_png, under_results=True, b_close=True)

# z-score of each real score relative to the shuffled mean/std
mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
z_scores = (R2 - mu_shuffled) / std_shuffled
fig = plot_z_scores(z_scores)
z_scores_png = 'RP/R2-z-scores-{}.png'.format(name)
save_figure(fig, z_scores_png, under_results=True, b_close=True)

# Text report of how the real scores compare with the shuffled ones.
T, signed_rank_p_value = wilcoxon(R2, R2_shuffled)
maxShuffled = R2_shuffled.max()
nTotal = len(R2)
nAbove = np.count_nonzero(R2 > maxShuffled)
pct = 100.0 * nAbove / nTotal
filename = join(results_dir(), 'RP/R2-distribution-{}.txt'.format(name))
with open(filename, 'w') as f:
    f.write('shuffled = {:.2g} +/- {:.2g}'.format(mu_shuffled, std_shuffled) + '\n')
    f.write('maximal shuffled score: {:.2g}'.format(maxShuffled) + '\n')
    f.write('{:.2g}% ({}/{}) of scores are above maximal shuffled score'.format(
        pct, nAbove, nTotal) + '\n')
    # Share of real scores whose z-score exceeds each integer threshold 1..5.
    for z_threshold in range(1, 6):
        nAbove = np.count_nonzero(z_scores > z_threshold)
        pct = 100.0 * nAbove / nTotal
        f.write('{:.2g}% ({}/{}) of z-scores are above {}'.format(
            pct, nAbove, nTotal, z_threshold) + '\n')
    f.write('wilxocon signed-rank p-value = {:.2g}'.format(signed_rank_p_value) + '\n')
from __future__ import print_function

import setup
from os.path import join
from dev_stages import dev_stages
from scalers import LogScaler
from project_dirs import results_dir


# Write one fixed-width row per development stage (name, short label, central
# age, central age under a LogScaler) to dev-stages.txt in results_dir().
filename = join(results_dir(), 'dev-stages.txt')
with open(filename, 'w') as f:
    scaler = LogScaler()
    header = '{:<30} {:<8} {:<10} {:<10}'.format(
        'Full Name', 'Label', 'Age', 'Log Scale')
    underline = '-' * len(header)
    print(header, file=f)
    print(underline, file=f)
    for stage in dev_stages:
        fields = (
            stage.name,
            stage.short_name,
            stage.central_age,
            stage.scaled(scaler).central_age,
        )
        print('{:<30} {:<8} {:<10.3g} {:<10.3g}'.format(*fields), file=f)