def do_gene_fits(data, gene, fitter, filename, b_show):
    """Plot one gene and save the figure to disk.

    Args:
        data: Dataset handed to ``plot_gene``.
        gene: The gene to plot (passed through to ``plot_gene``).
        fitter: Not used by this function.
        filename: Destination path. When None, the figure is written to
            ``fit.png`` under the results directory.
        b_show: When true, show the figure and block until it is closed.
    """
    fig = plot_gene(data, gene)
    if filename is None:
        ensure_dir(results_dir())
        filename = join(results_dir(), 'fit.png')
    # Let save_figure announce the destination (as do_one_fit does) instead
    # of printing here as well, which produced the message twice whenever
    # save_figure's own verbosity check fired.
    save_figure(fig, filename, print_filename=True)
    if b_show:
        plt.show(block=True)
def do_one_fit(series, fitter, loo, filename, b_show):
    """Plot a single series, with its fit when a fitter is given, and save it.

    Args:
        series: Object providing ``ages`` and ``single_expression``.
        fitter: Fitter to apply, or None to plot the raw series only.
        loo: Forwarded to ``fitter.fit`` as its ``loo`` argument.
        filename: Destination path; None means ``fit.png`` under the
            results directory.
        b_show: When true, show the figure and block until it is closed.
    """
    if fitter is None:
        fig = plot_one_series(series)
    else:
        # Only theta and the LOO predictions are plotted; the second and
        # fourth items of the fit result are not used here.
        theta, _sigma, loo_predictions, _ = fitter.fit(
            series.ages, series.single_expression, loo=loo)
        fig = plot_one_series(series, fitter.shape, theta, loo_predictions)

    target = filename
    if target is None:
        ensure_dir(results_dir())
        target = join(results_dir(), 'fit.png')
    save_figure(fig, target, print_filename=True)

    if b_show:
        plt.show(block=True)
def fit_serveral_genes(series, fitter, loo, filename, b_show):
    """Plot a multi-gene series, with its fit when a fitter is given, and save it.

    Args:
        series: Object providing ``ages`` and ``expression``.
        fitter: Fitter to apply, or None to plot the raw series only.
        loo: Forwarded to ``fitter.fit`` as its ``loo`` argument.
        filename: Destination path; None means ``fits.png`` under the
            results directory.
        b_show: When true, show the figure and block until it is closed.
    """
    if fitter is not None:
        theta, L, LOO_predictions, _ = fitter.fit(
            series.ages, series.expression, loo=loo)
        print('L = {}'.format(L))
        fig = plot_series(series, fitter.shape, theta, LOO_predictions)
    else:
        fig = plot_series(series)
    if filename is None:
        ensure_dir(results_dir())
        filename = join(results_dir(), 'fits.png')
    # Let save_figure announce the destination (as do_one_fit does) instead
    # of printing here as well, which produced the message twice whenever
    # save_figure's own verbosity check fired.
    save_figure(fig, filename, print_filename=True)
    if b_show:
        plt.show(block=True)
def create_top_correlations_html(data, fitter, fits, scores, regions, n_top=None):
    """Build the HTML report for genes ranked by their correlation score.

    ``scores`` holds ``(gene, r, pval, lst_R2)`` tuples. The list is sorted
    in place by ascending ``r`` and the first ``n_top`` entries (all of them
    when ``n_top`` is None) are passed on to ``create_html``.
    """
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'
    if n_top is None:
        n_top = len(scores)
    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)

    def correlation_of(entry):
        _gene, corr, _pval, _lst_r2 = entry
        return corr

    # In-place sort: the caller's list is reordered as a side effect.
    scores.sort(key=correlation_of)

    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, corr, pval, _lst_r2 in scores[:n_top]:
        top_genes.append(gene)
        top_scores[gene] = corr
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # NOTE(review): age_scaler is not bound anywhere in this function;
        # presumably a module-level scaler -- confirm.
        _a, _h, mu, _ = fit.theta
        onset_years = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(onset_years), ''

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
def export_cube():
    """Export the pickled region-pair timing cube to ``export/cube.mat``.

    Loads the cube from ``RegionPairTiming.cube_filename`` and saves its
    arrays, the gene and region names, the age scaler name and a README
    string through ``save_matfile`` under the results directory.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # The README entry is called "scores" so that it matches the key that is
    # actually written to the mat-file below (it used to say "score").
    README = """\
d_mu: mu(r2)-mu(r1) for every gene and region pair.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: The combined standard deviation of the two change distributions.
    std = sqrt(0.5*(std1^2 + std2^2))
    Dimensions: <n-genes> X <n-regions> X <n-regions>

scores: The d' for the two change distributions. Equal to d_mu ./ combined_std.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: Gene names for the genes represented in other arrays

regions: Region names for the regions represented in other arrays

age_scaler: The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        # Element-wise ratio; entries where cube.std is zero are not guarded.
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def export_cube():
    """Export the pickled region-pair timing cube to ``export/cube.mat``.

    Loads the cube from ``RegionPairTiming.cube_filename`` and saves its
    arrays, the gene and region names, the age scaler name and a README
    string through ``save_matfile`` under the results directory.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # The README entry is called "scores" so that it matches the key that is
    # actually written to the mat-file below (it used to say "score").
    README = """\
d_mu: mu(r2)-mu(r1) for every gene and region pair.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: The combined standard deviation of the two change distributions.
    std = sqrt(0.5*(std1^2 + std2^2))
    Dimensions: <n-genes> X <n-regions> X <n-regions>

scores: The d' for the two change distributions. Equal to d_mu ./ combined_std.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: Gene names for the genes represented in other arrays

regions: Region names for the regions represented in other arrays

age_scaler: The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        # Element-wise ratio; entries where cube.std is zero are not guarded.
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def fit_serveral_genes(series, fitter, loo, filename, b_show):
    """Plot a multi-gene series, with its fit when a fitter is given, and save it.

    Args:
        series: Object providing ``ages`` and ``expression``.
        fitter: Fitter to apply, or None to plot the raw series only.
        loo: Forwarded to ``fitter.fit`` as its ``loo`` argument.
        filename: Destination path; None means ``fits.png`` under the
            results directory.
        b_show: When true, show the figure and block until it is closed.
    """
    if fitter is not None:
        theta, L, LOO_predictions, _ = fitter.fit(
            series.ages, series.expression, loo=loo)
        print('L = {}'.format(L))
        fig = plot_series(series, fitter.shape, theta, LOO_predictions)
    else:
        fig = plot_series(series)
    if filename is None:
        ensure_dir(results_dir())
        filename = join(results_dir(), 'fits.png')
    # Let save_figure announce the destination (as do_one_fit does) instead
    # of printing here as well, which produced the message twice whenever
    # save_figure's own verbosity check fired.
    save_figure(fig, filename, print_filename=True)
    if b_show:
        plt.show(block=True)
def create_top_genes_html(data, fitter, fits, scores, regions, n_top=None, filename_suffix=''):
    """Build the HTML report for the genes with the smallest t-test p-values.

    Args:
        data, fitter, fits: Forwarded to ``create_html``; ``data`` and
            ``fitter`` also select the output directory.
        scores: List of ``(gene, pval, qval)`` tuples. Sorted in place by
            ascending p-value, so the caller's list is reordered.
        regions: Region names; the first two appear in the page header and
            the whole sequence is passed on as ``region_names``.
        n_top: Number of leading genes to include; None means all of them.
        filename_suffix: Appended to the output file name
            ``gradual-maturation-t-test``.
    """
    if n_top is None:
        n_top = len(scores)
    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval

    scores.sort(key=key_func)
    top = scores[:n_top]
    top_genes = [g for g, pval, qval in top]
    top_pvals = {g: pval for g, pval, qval in top}
    top_qvals = {g: qval for g, pval, qval in top}

    # The significance counts cover every score, not only the top n_top.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n=n, n01=n01)

    # NOTE(review): age_scaler is not bound anywhere in this function;
    # presumably a module-level scaler -- confirm.
    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        # Return (text, css class) like get_onset_dist below, so that every
        # callback in extra_fields_per_fit has the same return shape.
        return txt, cls

    def get_onset_dist(fit):
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(
            mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit=[get_onset_time, get_onset_dist],
        b_inline_images=True,
        inline_image_size='30%',
        b_R2_dist=False,
        ttl='Fit for genes with top t-test scores',
        top_text=top_text,
        filename='gradual-maturation-t-test' + filename_suffix,
    )
def export_cytoscape(timing, pval_cutoff):
    """Write Cytoscape input files for the pathway timing results.

    Runs ``timing.analyze_all_pathways()`` with the "PFC" region excluded,
    keeps the results whose ``-log10(pval)`` exceeds ``pval_cutoff`` and
    writes ``regions.sif`` and ``edge_weights.attrs`` into the ``cytoscape``
    folder under the results directory.
    """
    res = timing.analyze_all_pathways().filter_regions(exclude=["PFC"])

    edges = []
    for x in res.res:
        if not (-np.log10(x.pval) > pval_cutoff):
            continue
        # Edge weight is -50/log10(pval), truncated to int and capped at 200.
        weight = min(200, int(-50 / np.log10(x.pval)))
        # Whitespace runs in the pathway name are replaced by a single dash.
        node_name = re.sub(r"\s+", "-", x.pathway)
        edges.append((x.r1, node_name, x.r2, weight))

    sif_lines = []
    for r1, pathway, r2, _w in edges:
        sif_lines.append("{} {} {}".format(r1, pathway, r2))
    save_file(join(results_dir(), "cytoscape", "regions.sif"), sif_lines)

    attr_lines = ["EdgeWeights"]
    for r1, pathway, r2, w in edges:
        attr_lines.append("{} ({}) {} = {}".format(r1, pathway, r2, w))
    save_file(join(results_dir(), "cytoscape", "edge_weights.attrs"), attr_lines)
def export_cytoscape(timing, pval_cutoff):
    """Write Cytoscape input files for the pathway timing results.

    Runs ``timing.analyze_all_pathways()`` with the 'PFC' region excluded,
    keeps the results whose ``-log10(pval)`` exceeds ``pval_cutoff`` and
    writes ``regions.sif`` and ``edge_weights.attrs`` into the ``cytoscape``
    folder under the results directory.
    """
    res = timing.analyze_all_pathways().filter_regions(exclude=['PFC'])

    edges = []
    for x in res.res:
        if not (-np.log10(x.pval) > pval_cutoff):
            continue
        # Edge weight is -50/log10(pval), truncated to int and capped at 200.
        weight = min(200, int(-50 / np.log10(x.pval)))
        # Whitespace runs in the pathway name are replaced by a single dash.
        node_name = re.sub(r'\s+', '-', x.pathway)
        edges.append((x.r1, node_name, x.r2, weight))

    sif_lines = []
    for r1, pathway, r2, _w in edges:
        sif_lines.append('{} {} {}'.format(r1, pathway, r2))
    save_file(join(results_dir(), 'cytoscape', 'regions.sif'), sif_lines)

    attr_lines = ['EdgeWeights']
    for r1, pathway, r2, w in edges:
        attr_lines.append('{} ({}) {} = {}'.format(r1, pathway, r2, w))
    save_file(join(results_dir(), 'cytoscape', 'edge_weights.attrs'), attr_lines)
def export_pathways():
    """Export pathway membership tables to ``export/pathways.mat``.

    Gene indices refer to positions in the pickled change distributions
    (``SingleRegion.change_dist_filename``) and, like the pathway indices,
    are one based for use from MATLAB.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)

    # gene name -> one-based position
    matlab_g2i = {}
    for pos, gene in enumerate(change_dist.genes):
        matlab_g2i[gene] = pos + 1

    pathways = pathway_lists.read_all_pathways()
    # Taken once and reused everywhere below so that the order stays fixed.
    pathway_names = pathways.keys()

    name_cells = []
    for p in pathway_names:
        name_cells.append(list_of_strings_to_matlab_cell_array(pathways[p]))
    pathway_genes_names = np.array(name_cells, dtype=object)

    index_arrays = []
    for p in pathway_names:
        index_arrays.append(np.array([matlab_g2i[g] for g in pathways[p]]))
    pathway_genes_idx = np.array(index_arrays, dtype=object)

    # pathway name -> one-based position
    matlab_p2i = dict((p, pos + 1) for pos, p in enumerate(pathway_names))

    list_names = pathway_lists.all_pathway_lists()
    n_lists = len(list_names)
    list_pathway_names = np.empty(n_lists, dtype=object)
    list_pathway_idx = np.empty(n_lists, dtype=object)
    for i, listname in enumerate(list_names):
        members = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(members)
        list_pathway_idx[i] = [matlab_p2i[p] for p in members]

    README = """\
pathway_names:
    Cell array of all pathway names. The name in cell number k is the name of the pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
    Cell array (size <n-pathways>). Each cell contains a cell array of strings which are the gene symbols of the genes in that pathway.

pathway_genes_idx:
    Same as pathway_genes_names, but each cell in the outer cell array is now an array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat. Hopefully this should be easier to use in matlab.

list_names:
    Names of pathway lists prepared by Noa

list_pathway_names:
    Call array. One item per list. Each item is a cell array of strings which are the names of the pathways belonging to that list.

list_pathway_idx:
    Same as list_pathway_names, but instead of listing the pathways by name, they are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def save_scores(singles, scores, order):
    """Write the per-pathway Spearman results as a fixed-width text table.

    ``scores`` yields ``(logpval, pval, sr, pathway)`` tuples and ``order``
    is the region order, used both in the file name and in the first line.
    Pathway names longer than 55 characters are cut and suffixed with '...'.
    """
    filename = join(results_dir(), 'pathway-spearman-{}.txt'.format('-'.join(order)))
    print('Saving ordering results to {}'.format(filename))
    row_fmt = '{pathway:<60}{pathway_size:<7}{logpval:<15.3g}{pval:<10.3g}{sr:<15.3g}'
    with open(filename, 'w') as f:
        f.write('Region Order: {}'.format(' '.join(order)) + '\n')
        header = '{:<60}{:<7}{:<15}{:<10}{:<15}'.format(
            'pathway', 'nGenes', '-log10(pval)', 'pval', 'Spearman rho')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for logpval, pval, sr, pathway in scores:
            # The size lookup must use the full name, before truncation.
            pathway_size = len(singles.pathways[pathway])
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = row_fmt.format(
                pathway=label,
                pathway_size=pathway_size,
                logpval=logpval,
                pval=pval,
                sr=sr,
            )
            f.write(row + '\n')
def save_top_results(self, n=50):
    """Write the first ``n`` entries of ``self.res`` as a fixed-width table.

    The file is ``dprime-top-results-<self.filename_suffix>.txt`` under the
    results directory. Pathway names longer than 55 characters are cut and
    suffixed with '...'.
    """
    filename = join(results_dir(), 'dprime-top-results-{}.txt'.format(self.filename_suffix))
    print('Saving top {} results to {}'.format(n, filename))
    row_fmt = (
        '{pathway:<60}{x.pathway_size:<7}{x.r1:<5}{x.r2:<5}{logpval:<15.3g}'
        '{x.score:<10.3g}{x.delta:<10.3g}{x.weighted_delta:<10.3g}'
        '{x.mu1_years:<10.3g}{x.mu2_years:<10.3g}'
    )
    columns = ('pathway', 'nGenes', 'r1', 'r2', '-log10(pval)', 'score',
               'delta', 'w-delta', 'mu1 yrs', 'mu2 yrs')
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{:<5}{:<5}{:<15}{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*columns)
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for x in self.res[:n]:
            logpval = -np.log10(x.pval)
            pathway = x.pathway
            if len(pathway) > 55:
                pathway = pathway[:55] + '...'
            f.write(row_fmt.format(pathway=pathway, x=x, logpval=logpval) + '\n')
def save_figure(fig, filename, b_close=False, b_square=True, show_frame=False, under_results=False, print_filename=False):
    """Resize a figure to the configured default size and save it.

    Args:
        fig: Figure object; must support ``set_size_inches`` and ``savefig``.
        filename: Destination path.
        b_close: Close the figure through ``plt.close`` after saving.
        b_square: Use the "square" default width from ``cfg`` instead of the
            regular default width.
        show_frame: Use the configured default face colour; otherwise the
            figure is saved with a white face colour.
        under_results: Treat ``filename`` as relative to the results directory.
        print_filename: Print the destination even when ``cfg.verbosity`` is
            below 1.
    """
    if under_results:
        filename = join(results_dir(), filename)
    ensure_dir(os.path.dirname(filename))

    if cfg.verbosity >= 1 or print_filename:
        print('Saving figure to {}'.format(filename))

    if b_square:
        width = cfg.default_figure_size_x_square
    else:
        width = cfg.default_figure_size_x
    fig.set_size_inches(width, cfg.default_figure_size_y)

    facecolor = cfg.default_figure_facecolor if show_frame else 'white'
    fig.savefig(filename, facecolor=facecolor, dpi=cfg.default_figure_dpi)

    if b_close:
        plt.close(fig)
def save_scores(singles, scores, order):
    """Write the per-pathway Spearman results as a fixed-width text table.

    ``scores`` yields ``(logpval, pval, sr, pathway)`` tuples and ``order``
    is the region order, used both in the file name and in the first line.
    Pathway names longer than 55 characters are cut and suffixed with '...'.
    """
    filename = join(results_dir(), 'pathway-spearman-{}.txt'.format('-'.join(order)))
    print('Saving ordering results to {}'.format(filename))
    row_fmt = '{pathway:<60}{pathway_size:<7}{logpval:<15.3g}{pval:<10.3g}{sr:<15.3g}'
    with open(filename, 'w') as f:
        f.write('Region Order: {}'.format(' '.join(order)) + '\n')
        header = '{:<60}{:<7}{:<15}{:<10}{:<15}'.format(
            'pathway', 'nGenes', '-log10(pval)', 'pval', 'Spearman rho')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for logpval, pval, sr, pathway in scores:
            # The size lookup must use the full name, before truncation.
            pathway_size = len(singles.pathways[pathway])
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = row_fmt.format(
                pathway=label,
                pathway_size=pathway_size,
                logpval=logpval,
                pval=pval,
                sr=sr,
            )
            f.write(row + '\n')
def export_singles():
    """Export the pickled change distributions to ``export/change-distributions.mat``.

    Loads the object stored at ``SingleRegion.change_dist_filename`` and
    saves its arrays, names, age scaler name and a README string through
    ``save_matfile`` under the results directory.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    README = """\
mu: The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std: The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes: Gene names for the genes represented in other arrays

weights: The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers: The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges: The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions: Region names for the regions represented in other arrays

age_scaler: The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    gene_cells = list_of_strings_to_matlab_cell_array(change_dist.genes)
    region_cells = list_of_strings_to_matlab_cell_array(change_dist.regions)
    scaler_name = scalers.unify(change_dist.age_scaler).cache_name()
    mdict = {
        'README_CHANGE_DISTRIBUTIONS': README,
        'genes': gene_cells,
        'regions': region_cells,
        'age_scaler': scaler_name,
        'mu': change_dist.mu,
        'std': change_dist.std,
        'bin_edges': change_dist.bin_edges,
        'bin_centers': change_dist.bin_centers,
        'weights': change_dist.weights,
    }
    save_matfile(mdict, join(results_dir(), 'export', 'change-distributions.mat'))
def create_top_correlations_html(data, fitter, fits, scores, regions, n_top=None):
    """Build the HTML report for genes ranked by their correlation score.

    ``scores`` holds ``(gene, r, pval, lst_R2)`` tuples. The list is sorted
    in place by ascending ``r`` and the first ``n_top`` entries (all of them
    when ``n_top`` is None) are passed on to ``create_html``.
    """
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'
    if n_top is None:
        n_top = len(scores)
    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)

    def correlation_of(entry):
        _gene, corr, _pval, _lst_r2 = entry
        return corr

    # In-place sort: the caller's list is reordered as a side effect.
    scores.sort(key=correlation_of)

    top_genes = []
    top_scores = {}
    top_pvals = {}
    for gene, corr, pval, _lst_r2 in scores[:n_top]:
        top_genes.append(gene)
        top_scores[gene] = corr
        top_pvals[gene] = pval

    def get_onset_time(fit):
        # NOTE(review): age_scaler is not bound anywhere in this function;
        # presumably a module-level scaler -- confirm.
        _a, _h, mu, _ = fit.theta
        onset_years = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(onset_years), ''

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit=[get_onset_time],
        b_inline_images=True,
        b_R2_dist=False,
        ttl='Fit for genes with top Spearman correlations',
        filename='top-gradual-maturation',
    )
def export_singles():
    """Export the pickled change distributions to ``export/change-distributions.mat``.

    Loads the object stored at ``SingleRegion.change_dist_filename`` and
    saves its arrays, names, age scaler name and a README string through
    ``save_matfile`` under the results directory.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    README = """\
mu: The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std: The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes: Gene names for the genes represented in other arrays

weights: The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers: The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges: The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions: Region names for the regions represented in other arrays

age_scaler: The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    gene_cells = list_of_strings_to_matlab_cell_array(change_dist.genes)
    region_cells = list_of_strings_to_matlab_cell_array(change_dist.regions)
    scaler_name = scalers.unify(change_dist.age_scaler).cache_name()
    mdict = {
        'README_CHANGE_DISTRIBUTIONS': README,
        'genes': gene_cells,
        'regions': region_cells,
        'age_scaler': scaler_name,
        'mu': change_dist.mu,
        'std': change_dist.std,
        'bin_edges': change_dist.bin_edges,
        'bin_centers': change_dist.bin_centers,
        'weights': change_dist.weights,
    }
    save_matfile(mdict, join(results_dir(), 'export', 'change-distributions.mat'))
def save_top_results(self, n=50):
    """Write the first ``n`` entries of ``self.res`` as a fixed-width table.

    The file is ``dprime-top-results-<self.filename_suffix>.txt`` under the
    results directory. Pathway names longer than 55 characters are cut and
    suffixed with '...'.
    """
    filename = join(results_dir(), 'dprime-top-results-{}.txt'.format(self.filename_suffix))
    print('Saving top {} results to {}'.format(n, filename))
    row_fmt = (
        '{pathway:<60}{x.pathway_size:<7}{x.r1:<5}{x.r2:<5}{logpval:<15.3g}'
        '{x.score:<10.3g}{x.delta:<10.3g}{x.weighted_delta:<10.3g}'
        '{x.mu1_years:<10.3g}{x.mu2_years:<10.3g}'
    )
    columns = ('pathway', 'nGenes', 'r1', 'r2', '-log10(pval)', 'score',
               'delta', 'w-delta', 'mu1 yrs', 'mu2 yrs')
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{:<5}{:<5}{:<15}{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*columns)
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for x in self.res[:n]:
            logpval = -np.log10(x.pval)
            pathway = x.pathway
            if len(pathway) > 55:
                pathway = pathway[:55] + '...'
            f.write(row_fmt.format(pathway=pathway, x=x, logpval=logpval) + '\n')
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False, k_of_n=None, use_correlations=False, correlations=None, show_change_distributions=False, html_kw=None, figure_kw=None):
    """Generate the figures for a set of fits and the HTML pages linking them.

    Fits are computed with ``get_all_fits`` when not supplied. The ``do_*``
    flags select which outputs are produced; ``only_main_html`` suppresses
    all figure generation. The score histograms and the HTML are produced
    only when ``k_of_n`` is None. ``html_kw`` is expanded into the
    ``create_html`` call and ``figure_kw`` is handed to
    ``plot_and_save_all_series``.
    """
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    html_kw = {} if html_kw is None else html_kw
    figure_kw = {} if figure_kw is None else figure_kw

    print('Writing HTML under {}'.format(basedir))
    ensure_dir(basedir)

    # Sub-directory names, relative to basedir.
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'

    make_figures = not only_main_html
    whole_run = k_of_n is None

    if do_genes and make_figures:
        # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(
            data, fitter, fits, join(basedir, gene_dir), show_change_distributions)
    if do_series and make_figures:
        plot_and_save_all_series(
            data, fitter, fits, join(basedir, series_dir),
            use_correlations, show_change_distributions, figure_kw)
    if do_hist and whole_run and make_figures:
        create_score_distribution_html(fits, use_correlations, join(basedir, scores_dir))

    if not (do_html and whole_run):
        return

    link_to_correlation_plots = use_correlations and correlations is not None
    if link_to_correlation_plots and make_figures:
        plot_and_save_all_gene_correlations(
            data, correlations, join(basedir, correlations_dir))

    dct_pathways = load_17_pathways_breakdown()
    pathway_genes = set.union(*dct_pathways.values())
    missing = pathway_genes - set(data.gene_names)
    # simple heuristic to create pathways only if we have most of the genes
    # (currently 61 genes are missing)
    b_pathways = len(missing) < len(pathway_genes)/2

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir, scores_dir,
        correlations_dir=correlations_dir,
        use_correlations=use_correlations,
        link_to_correlation_plots=link_to_correlation_plots,
        b_pathways=b_pathways,
        **html_kw
    )
def compute_region_ordering(singles):
    """Write, for every pathway, its regions ordered by timing (early to late).

    Timings come from ``singles.region_timings_per_pathway()`` (pathway ->
    {region -> mu}). Regions are sorted by ``(mu, region)`` and the table is
    saved to ``dprime-region-ordering-<singles.listname>.txt`` under the
    results directory.
    """
    timings = singles.region_timings_per_pathway()

    # pathway -> list of regions (sorted by mu, ties broken by region)
    sorted_timings = {}
    for pathway, dct in timings.iteritems():
        by_time = sorted(dct.iteritems(), key=lambda item: (item[1], item[0]))
        sorted_timings[pathway] = [region for region, _mu in by_time]

    filename = join(results_dir(), 'dprime-region-ordering-{}.txt'.format(singles.listname))
    print('Saving ordering results to {}'.format(filename))
    row_fmt = '{pathway:<60}{pathway_size:<7}{ordered_regions}'
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{}'.format('pathway', 'nGenes', 'Regions (early to late)')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for pathway, ordered_regions in sorted_timings.iteritems():
            # The size lookup must use the full name, before truncation.
            pathway_size = len(singles.pathways[pathway])
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = row_fmt.format(
                pathway=label,
                pathway_size=pathway_size,
                ordered_regions=' '.join(ordered_regions),
            )
            f.write(row + '\n')
def save_figure(fig, filename, b_close=False, b_square=True, show_frame=False, under_results=False, print_filename=False):
    """Resize a figure to the configured default size and save it.

    Args:
        fig: Figure object; must support ``set_size_inches`` and ``savefig``.
        filename: Destination path.
        b_close: Close the figure through ``plt.close`` after saving.
        b_square: Use the "square" default width from ``cfg`` instead of the
            regular default width.
        show_frame: Use the configured default face colour; otherwise the
            figure is saved with a white face colour.
        under_results: Treat ``filename`` as relative to the results directory.
        print_filename: Print the destination even when ``cfg.verbosity`` is
            below 1.
    """
    if under_results:
        filename = join(results_dir(), filename)
    ensure_dir(os.path.dirname(filename))

    if cfg.verbosity >= 1 or print_filename:
        print('Saving figure to {}'.format(filename))

    if b_square:
        width = cfg.default_figure_size_x_square
    else:
        width = cfg.default_figure_size_x
    fig.set_size_inches(width, cfg.default_figure_size_y)

    facecolor = cfg.default_figure_facecolor if show_frame else 'white'
    fig.savefig(filename, facecolor=facecolor, dpi=cfg.default_figure_dpi)

    if b_close:
        plt.close(fig)
def compute_region_ordering(singles):
    """Write, for every pathway, its regions ordered by timing (early to late).

    Timings come from ``singles.region_timings_per_pathway()`` (pathway ->
    {region -> mu}). Regions are sorted by ``(mu, region)`` and the table is
    saved to ``dprime-region-ordering-<singles.listname>.txt`` under the
    results directory.
    """
    timings = singles.region_timings_per_pathway()

    # pathway -> list of regions (sorted by mu, ties broken by region)
    sorted_timings = {}
    for pathway, dct in timings.iteritems():
        by_time = sorted(dct.iteritems(), key=lambda item: (item[1], item[0]))
        sorted_timings[pathway] = [region for region, _mu in by_time]

    filename = join(results_dir(), 'dprime-region-ordering-{}.txt'.format(singles.listname))
    print('Saving ordering results to {}'.format(filename))
    row_fmt = '{pathway:<60}{pathway_size:<7}{ordered_regions}'
    with open(filename, 'w') as f:
        header = '{:<60}{:<7}{}'.format('pathway', 'nGenes', 'Regions (early to late)')
        f.write(header + '\n')
        f.write('-' * len(header) + '\n')
        for pathway, ordered_regions in sorted_timings.iteritems():
            # The size lookup must use the full name, before truncation.
            pathway_size = len(singles.pathways[pathway])
            label = pathway[:55] + '...' if len(pathway) > 55 else pathway
            row = row_fmt.format(
                pathway=label,
                pathway_size=pathway_size,
                ordered_regions=' '.join(ordered_regions),
            )
            f.write(row + '\n')
def create_top_genes_html(data, fitter, fits, scores, regions, n_top=None, filename_suffix=''):
    """Build the HTML report for the genes with the smallest t-test p-values.

    Args:
        data, fitter, fits: Forwarded to ``create_html``; ``data`` and
            ``fitter`` also select the output directory.
        scores: List of ``(gene, pval, qval)`` tuples. Sorted in place by
            ascending p-value, so the caller's list is reordered.
        regions: Region names; the first two appear in the page header and
            the whole sequence is passed on as ``region_names``.
        n_top: Number of leading genes to include; None means all of them.
        filename_suffix: Appended to the output file name
            ``gradual-maturation-t-test``.
    """
    if n_top is None:
        n_top = len(scores)
    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval

    scores.sort(key=key_func)
    top = scores[:n_top]
    top_genes = [g for g, pval, qval in top]
    top_pvals = {g: pval for g, pval, qval in top}
    top_qvals = {g: qval for g, pval, qval in top}

    # The significance counts cover every score, not only the top n_top.
    n = len(scores)
    n05 = sum(1 for g, pval, qval in scores if qval < 0.05)
    n01 = sum(1 for g, pval, qval in scores if qval < 0.01)
    top_text = """\
<pre>
one sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q_values < 0.01
</pre>
""".format(regions=regions, n05=n05, n=n, n01=n01)

    # NOTE(review): age_scaler is not bound anywhere in this function;
    # presumably a module-level scaler -- confirm.
    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        # Return (text, css class) like get_onset_dist below, so that every
        # callback in extra_fields_per_fit has the same return shape.
        return txt, cls

    def get_onset_dist(fit):
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset reestimate (mean [20%, 80%]) = {:.3g} [{:.3g},{:.3g}]'.format(
            mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names=top_genes,
        region_names=regions,
        extra_columns=[('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit=[get_onset_time, get_onset_dist],
        b_inline_images=True,
        inline_image_size='30%',
        b_R2_dist=False,
        ttl='Fit for genes with top t-test scores',
        top_text=top_text,
        filename='gradual-maturation-t-test' + filename_suffix,
    )
def export_pathways():
    """Export pathway membership tables to ``export/pathways.mat``.

    Gene indices refer to positions in the pickled change distributions
    (``SingleRegion.change_dist_filename``) and, like the pathway indices,
    are one based for use from MATLAB.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)

    # gene name -> one-based position
    matlab_g2i = {}
    for pos, gene in enumerate(change_dist.genes):
        matlab_g2i[gene] = pos + 1

    pathways = pathway_lists.read_all_pathways()
    # Taken once and reused everywhere below so that the order stays fixed.
    pathway_names = pathways.keys()

    name_cells = []
    for p in pathway_names:
        name_cells.append(list_of_strings_to_matlab_cell_array(pathways[p]))
    pathway_genes_names = np.array(name_cells, dtype=object)

    index_arrays = []
    for p in pathway_names:
        index_arrays.append(np.array([matlab_g2i[g] for g in pathways[p]]))
    pathway_genes_idx = np.array(index_arrays, dtype=object)

    # pathway name -> one-based position
    matlab_p2i = dict((p, pos + 1) for pos, p in enumerate(pathway_names))

    list_names = pathway_lists.all_pathway_lists()
    n_lists = len(list_names)
    list_pathway_names = np.empty(n_lists, dtype=object)
    list_pathway_idx = np.empty(n_lists, dtype=object)
    for i, listname in enumerate(list_names):
        members = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(members)
        list_pathway_idx[i] = [matlab_p2i[p] for p in members]

    README = """\
pathway_names:
    Cell array of all pathway names. The name in cell number k is the name of the pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
    Cell array (size <n-pathways>). Each cell contains a cell array of strings which are the gene symbols of the genes in that pathway.

pathway_genes_idx:
    Same as pathway_genes_names, but each cell in the outer cell array is now an array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat. Hopefully this should be easier to use in matlab.

list_names:
    Names of pathway lists prepared by Noa

list_pathway_names:
    Call array. One item per list. Each item is a cell array of strings which are the names of the pathways belonging to that list.

list_pathway_idx:
    Same as list_pathway_names, but instead of listing the pathways by name, they are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False, k_of_n=None, use_correlations=False, correlations=None, show_change_distributions=False, exons_layout=False, html_kw=None, figure_kw=None):
    """Generate the figures for a set of fits and the HTML pages linking them.

    Fits are computed with ``get_all_fits`` when not supplied. The ``do_*``
    flags and ``exons_layout`` select which outputs are produced;
    ``only_main_html`` suppresses all figure generation. The score
    histograms and the HTML are produced only when ``k_of_n`` is None.
    ``html_kw`` is expanded into the ``create_html`` call and ``figure_kw``
    is handed to ``plot_and_save_all_series``.
    """
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    html_kw = {} if html_kw is None else html_kw
    figure_kw = {} if figure_kw is None else figure_kw

    print('Writing HTML under {}'.format(basedir))
    ensure_dir(basedir)

    # Sub-directory names, relative to basedir.
    gene_dir = 'gene-subplot'
    if cfg.exons_plots_from_series:
        exons_dir = 'exons_subplot_series'
    else:
        exons_dir = 'exons_subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'

    make_figures = not only_main_html
    whole_run = k_of_n is None

    if do_genes and make_figures:
        # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(
            data, fitter, fits, join(basedir, gene_dir), show_change_distributions)
    if do_series and make_figures:
        plot_and_save_all_series(
            data, fitter, fits, join(basedir, series_dir),
            use_correlations, show_change_distributions, exons_layout, figure_kw)
    if exons_layout and make_figures:
        if cfg.exons_plots_from_series:
            plot_and_save_all_exons_from_series(
                fits, join(basedir, exons_dir), join(basedir, series_dir))
        else:
            plot_and_save_all_exons(data, fitter, fits, join(basedir, exons_dir))
    if do_hist and whole_run and make_figures:
        create_score_distribution_html(fits, use_correlations, join(basedir, scores_dir))

    if not (do_html and whole_run):
        return

    link_to_correlation_plots = use_correlations and correlations is not None
    if link_to_correlation_plots and make_figures:
        plot_and_save_all_gene_correlations(
            data, correlations, join(basedir, correlations_dir))

    dct_pathways = load_17_pathways_breakdown()
    pathway_genes = set.union(*dct_pathways.values())
    missing = pathway_genes - set(data.gene_names)
    # simple heuristic to create pathways only if we have most of the genes
    # (currently 61 genes are missing)
    b_pathways = len(missing) < len(pathway_genes) / 2

    create_html(
        data, fitter, fits, basedir, gene_dir, exons_dir, series_dir, scores_dir,
        correlations_dir=correlations_dir,
        use_correlations=use_correlations,
        link_to_correlation_plots=link_to_correlation_plots,
        b_pathways=b_pathways,
        exons_layout=exons_layout,
        **html_kw
    )
if __name__ == '__main__':
    # Script entry point: load the fits for the shape selected by the common
    # arguments and for a second shape (--shape2), report the differences
    # through print_diff_points and save a comparison scatter plot.
    disable_all_warnings()
    parser = get_common_parser()
    parser.add_argument('--shape2', required=True, help='The shape to compare against', choices=allowed_shape_names())
    parser.add_argument('--scaling2', help='The scaling used when fitting shape2. Default: none', choices=allowed_scaler_names())
    parser.add_argument('--sigma_prior2', help='Prior to use for 1/sigma when fitting shape2. Default: None', choices=get_allowed_priors(is_sigma=True))
    parser.add_argument('--priors2', help='The priors used for theta when fitting shape2. Default: None', choices=get_allowed_priors())
    # The help text names the file that is actually used as the default
    # below (it used to advertise "results/comparison.png").
    parser.add_argument('--filename', help='Where to save the figure. Default: shape_comparison.png in the results directory')
    parser.add_argument('--show', help='Show figure and wait before exiting', action='store_true')
    parser.add_argument('--ndiffs', type=int, default=5, help='Number of top diffs to show. Default=5.')
    args = parser.parse_args()

    # Both data sets come from the same dataset/pathway/age arguments; only
    # the scaling, shape and priors differ between the two sides.
    data1, fitter1 = process_common_inputs(args)
    data2 = get_data_from_args(args.dataset, args.pathway, args.from_age, args.scaling2, args.shuffle)
    fitter2 = get_fitter_from_args(args.shape2, args.priors2, args.sigma_prior2)
    fits1 = get_all_fits(data1, fitter1)
    fits2 = get_all_fits(data2, fitter2)

    print_diff_points(data1, fitter1, fits1, data2, fitter2, fits2, args.ndiffs)
    fig = plot_comparison_scatter(data1, fitter1, fits1, data2, fitter2, fits2)

    filename = args.filename
    if filename is None:
        ensure_dir(results_dir())
        filename = join(results_dir(), 'shape_comparison.png')
    save_figure(fig, filename)
    if args.show:
        plt.show(block=True)
from __future__ import print_function import setup from os.path import join from dev_stages import dev_stages from scalers import LogScaler from project_dirs import results_dir filename = join(results_dir(), 'dev-stages.txt') with open(filename, 'w') as f: scaler = LogScaler() header = '{:<30} {:<8} {:<10} {:<10}'.format('Full Name', 'Label', 'Age', 'Log Scale') print(header, file=f) print(len(header) * '-', file=f) for stage in dev_stages: name = stage.name short_name = stage.short_name age = stage.central_age log_age = stage.scaled(scaler).central_age print('{:<30} {:<8} {:<10.3g} {:<10.3g}'.format( name, short_name, age, log_age), file=f)
# Compare the R2 scores with the shuffled-data scores: plot z-scores relative
# to the shuffled distribution and write summary statistics to a text file.
# NOTE(review): R2, R2_shuffled and name are defined before this fragment.
mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
z_scores = (R2 - mu_shuffled) / std_shuffled
fig = plot_z_scores(z_scores)
save_figure(fig, 'RP/R2-z-scores-{}.png'.format(name), under_results=True, b_close=True)

T, signed_rank_p_value = wilcoxon(R2, R2_shuffled)
maxShuffled = R2_shuffled.max()
nAbove = np.count_nonzero(R2 > maxShuffled)
nTotal = len(R2)
pct = 100.0 * nAbove / nTotal

filename = join(results_dir(), 'RP/R2-distribution-{}.txt'.format(name))
with open(filename, 'w') as f:
    f.write('shuffled = {:.2g} +/- {:.2g}'.format(mu_shuffled, std_shuffled) + '\n')
    f.write('maximal shuffled score: {:.2g}'.format(maxShuffled) + '\n')
    f.write('{:.2g}% ({}/{}) of scores are above maximal shuffled score'.format(
        pct, nAbove, nTotal) + '\n')
    for z_threshold in (1, 2, 3, 4, 5):
        nAbove = np.count_nonzero(z_scores > z_threshold)
        pct = 100.0 * nAbove / nTotal
        f.write('{:.2g}% ({}/{}) of z-scores are above {}'.format(
            pct, nAbove, nTotal, z_threshold) + '\n')
    f.write('wilxocon signed-rank p-value = {:.2g}'.format(signed_rank_p_value) + '\n')
# Load the cached fits for the real and the shuffled data, pair up their LOO
# scores, plot both distributions and write summary statistics to a text file.
# NOTE(review): data, data_shuffled, fitter and shape are defined before this
# fragment.
fits = get_all_fits(data, fitter, allow_new_computation=False)
fits_shuffled = get_all_fits(data_shuffled, fitter, allow_new_computation=False)
R2_pairs = [
    (fit.LOO_score, fit2.LOO_score)
    for fit, fit2 in iterate_fits(fits, fits_shuffled)
]
R2 = np.array([pair[0] for pair in R2_pairs])
R2_shuffled = np.array([pair[1] for pair in R2_pairs])
name = '{}-{}'.format(data.pathway, shape.cache_name())

fig = plot_score_distribution(R2, R2_shuffled)
save_figure(fig, 'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True)

mu_shuffled = np.mean(R2_shuffled)
std_shuffled = np.std(R2_shuffled)
z_scores = (R2 - mu_shuffled) / std_shuffled
fig = plot_z_scores(z_scores)
save_figure(fig, 'RP/R2-z-scores-{}.png'.format(name), under_results=True, b_close=True)

T, signed_rank_p_value = wilcoxon(R2, R2_shuffled)
maxShuffled = R2_shuffled.max()
nAbove = np.count_nonzero(R2 > maxShuffled)
nTotal = len(R2)
pct = 100.0 * nAbove / nTotal

filename = join(results_dir(), 'RP/R2-distribution-{}.txt'.format(name))
with open(filename, 'w') as f:
    f.write('shuffled = {:.2g} +/- {:.2g}'.format(mu_shuffled, std_shuffled) + '\n')
    f.write('maximal shuffled score: {:.2g}'.format(maxShuffled) + '\n')
    f.write('{:.2g}% ({}/{}) of scores are above maximal shuffled score'.format(
        pct, nAbove, nTotal) + '\n')
    for z_threshold in (1, 2, 3, 4, 5):
        nAbove = np.count_nonzero(z_scores > z_threshold)
        pct = 100.0 * nAbove / nTotal
        f.write('{:.2g}% ({}/{}) of z-scores are above {}'.format(
            pct, nAbove, nTotal, z_threshold) + '\n')
    f.write('wilxocon signed-rank p-value = {:.2g}'.format(signed_rank_p_value) + '\n')
from __future__ import print_function import setup from os.path import join from dev_stages import dev_stages from scalers import LogScaler from project_dirs import results_dir filename = join(results_dir(),'dev-stages.txt') with open(filename,'w') as f: scaler = LogScaler() header = '{:<30} {:<8} {:<10} {:<10}'.format('Full Name', 'Label', 'Age', 'Log Scale') print(header, file=f) print(len(header)*'-', file=f) for stage in dev_stages: name = stage.name short_name = stage.short_name age = stage.central_age log_age = stage.scaled(scaler).central_age print('{:<30} {:<8} {:<10.3g} {:<10.3g}'.format(name, short_name, age, log_age), file=f)