def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Compare LOO scores of Sigslope fits made without and with priors.

    Prints the (halved, i.e. one sided) Wilcoxon signed-rank p-value over the
    paired scores, followed by the gene/region pairs whose score improved the
    most when priors were used.

    n_best -- number of top improvements to print (default 10).

    Reads the module-level `data` and `priors_name`. Fits are requested with
    allow_new_computation=False. Returns None; results are only printed.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFits = get_all_fits(data, nFitter, allow_new_computation=False)
    yFits = get_all_fits(data, yFitter, allow_new_computation=False)

    # Keep only pairs where both scores exist. plot_theta_diff_scatter applies
    # the same filter; a None score would make the subtraction inside wilcoxon fail.
    score_pairs = [
        (f1.LOO_score, f2.LOO_score)
        for f1, f2 in iterate_fits(nFits, yFits)
        if f1.LOO_score is not None and f2.LOO_score is not None
    ]
    nScores, yScores = zip(*score_pairs)
    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [
        (f2.LOO_score - f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r)
        for dsname, g, r, f1, f2 in iterate_fits(nFits, yFits, R2_threshold=-1, return_keys=True)
    ]
    diffs.sort(reverse=True)  # largest improvement first
    print('Gene/Regions for which priors produce best R2 improvement:')
    # Honor n_best; it used to be ignored in favor of a hard-coded 10.
    for i, (delta, R2_without, R2_with, g, r) in enumerate(diffs[:n_best]):
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta, R2_without=R2_without, R2_with=R2_with))
def do_fits(data, fitter, k_of_n, add_correlations, correlations_k_of_n):
    """Print a banner naming the fitter, then compute and return all fits.

    When add_correlations is truthy, get_all_fits is asked for 4 correlation
    iterations, otherwise for 0. k_of_n and correlations_k_of_n are passed
    through to get_all_fits unchanged.
    """
    banner = """ ============================================================================================== ============================================================================================== ==== Computing Fits with {} ============================================================================================== ============================================================================================== """
    print(banner.format(fitter))

    if add_correlations:
        n_iterations = 4
    else:
        n_iterations = 0

    return get_all_fits(
        data,
        fitter,
        k_of_n,
        n_correlation_iterations=n_iterations,
        correlations_k_of_n=correlations_k_of_n,
    )
def plot_theta_diff_scatter(show_title=False):
    """Scatter the LOO score gain from priors against the no-priors score.

    For every pair of fits (Sigslope without priors vs. Sigslope with
    `priors_name` and a 'normal' sigma prior) where both LOO scores exist,
    plots the no-priors score on x and (with-priors - no-priors) on y, plus a
    dashed horizontal line at zero.

    show_title -- when True, add a title to the axes.

    Reads the module-level `data`, `priors_name` and `fontsize`.
    Returns the matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data, yFitter)
    nFits = get_all_fits(data, nFitter)

    pairs = [(nFit.LOO_score, yFit.LOO_score) for nFit, yFit in iterate_fits(nFits, yFits)]
    # Skip pairs where either fit has no score.
    diff_pairs = [(n, y - n) for n, y in pairs if n is not None and y is not None]
    baseline, improvement = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15, 0.12, 0.8, 0.8])
    ax.scatter(baseline, improvement, alpha=0.5)
    # Draw the zero line across the current x range, then restore the limits
    # so that the line itself does not widen the axes.
    xlims = ax.get_xlim()
    ax.plot(xlims, [0, 0], 'k--')
    ax.set_xlim(*xlims)
    if show_title:
        # Axes.title is a Text object and cannot be called; set_title is the setter.
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$', fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
def analyze_variant(theta, sigma):
    """Collect LOO scores for one combination of theta/sigma priors.

    theta -- truthy to build the Sigslope shape with `priors_name`, else no priors.
    sigma -- truthy to give the Fitter a 'normal' sigma prior, else none.

    Fits are requested with allow_new_computation=False. Returns a Bunch with
    the two flags, the list of non-None LOO scores, and the two values returned
    by bootstrap(scores, np.mean), stored as `mu` and `sem`.
    """
    shape = Sigslope(priors_name if theta else None)
    fitter = Fitter(shape, 'normal' if sigma else None)
    fits = get_all_fits(data, fitter, allow_new_computation=False)

    scores = []
    for fit in iterate_fits(fits):
        if fit.LOO_score is None:
            continue  # fits without a score are left out
        scores.append(fit.LOO_score)

    mean_score, mean_sem = bootstrap(scores, np.mean)
    return Bunch(
        theta=theta,
        sigma=sigma,
        LOO_scores=scores,
        mu=mean_score,
        sem=mean_sem,
    )
def plot_theta_diff_scatter(show_title=False):
    """Scatter the LOO score gain from priors against the no-priors score.

    For every pair of fits (Sigslope without priors vs. Sigslope with
    `priors_name` and a 'normal' sigma prior) where both LOO scores exist,
    plots the no-priors score on x and (with-priors - no-priors) on y, plus a
    dashed horizontal line at zero.

    show_title -- when True, add a title to the axes.

    Reads the module-level `data`, `priors_name` and `fontsize`.
    Returns the matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data, yFitter)
    nFits = get_all_fits(data, nFitter)

    pairs = [(nFit.LOO_score, yFit.LOO_score)
             for nFit, yFit in iterate_fits(nFits, yFits)]
    # Skip pairs where either fit has no score.
    diff_pairs = [(n, y - n) for n, y in pairs
                  if n is not None and y is not None]
    baseline, improvement = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15, 0.12, 0.8, 0.8])
    ax.scatter(baseline, improvement, alpha=0.5)
    # Draw the zero line across the current x range, then restore the limits
    # so that the line itself does not widen the axes.
    xlims = ax.get_xlim()
    ax.plot(xlims, [0, 0], 'k--')
    ax.set_xlim(*xlims)
    if show_title:
        # Axes.title is a Text object and cannot be called; set_title is the setter.
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$',
                     fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Compare LOO scores of Sigslope fits made without and with priors.

    Prints the (halved, i.e. one sided) Wilcoxon signed-rank p-value over the
    paired scores, followed by the gene/region pairs whose score improved the
    most when priors were used.

    n_best -- number of top improvements to print (default 10).

    Reads the module-level `data` and `priors_name`. Fits are requested with
    allow_new_computation=False. Returns None; results are only printed.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFits = get_all_fits(data, nFitter, allow_new_computation=False)
    yFits = get_all_fits(data, yFitter, allow_new_computation=False)

    # Keep only pairs where both scores exist. plot_theta_diff_scatter applies
    # the same filter; a None score would make the subtraction inside wilcoxon fail.
    score_pairs = [
        (f1.LOO_score, f2.LOO_score)
        for f1, f2 in iterate_fits(nFits, yFits)
        if f1.LOO_score is not None and f2.LOO_score is not None
    ]
    nScores, yScores = zip(*score_pairs)
    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [
        (f2.LOO_score - f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r)
        for dsname, g, r, f1, f2 in iterate_fits(
            nFits, yFits, R2_threshold=-1, return_keys=True)
    ]
    diffs.sort(reverse=True)  # largest improvement first
    print('Gene/Regions for which priors produce best R2 improvement:')
    # Honor n_best; it used to be ignored in favor of a hard-coded 10.
    for i, (delta, R2_without, R2_with, g, r) in enumerate(diffs[:n_best]):
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta,
            R2_without=R2_without, R2_with=R2_with))
def analyze_variant(theta, sigma):
    """Collect LOO scores for one combination of theta/sigma priors.

    theta -- truthy to build the Sigslope shape with `priors_name`, else no priors.
    sigma -- truthy to give the Fitter a 'normal' sigma prior, else none.

    Fits are requested with allow_new_computation=False. Returns a Bunch with
    the two flags, the list of non-None LOO scores, and the two values returned
    by bootstrap(scores, np.mean), stored as `mu` and `sem`.
    """
    shape = Sigslope(priors_name if theta else None)
    fitter = Fitter(shape, 'normal' if sigma else None)
    fits = get_all_fits(data, fitter, allow_new_computation=False)

    scores = []
    for fit in iterate_fits(fits):
        if fit.LOO_score is None:
            continue  # fits without a score are left out
        scores.append(fit.LOO_score)

    mean_score, mean_sem = bootstrap(scores, np.mean)
    return Bunch(
        theta=theta,
        sigma=sigma,
        LOO_scores=scores,
        mu=mean_score,
        sem=mean_sem,
    )
# save_fits_and_create_html: write the plots and HTML pages for a set of fits.
#
# Inputs and defaults, as read from the code below:
#   fits      - when None, obtained from get_all_fits(data, fitter, k_of_n).
#   basedir   - when None, join(results_dir(), fit_results_relative_path(data, fitter)).
#               'with-correlations' is appended when use_correlations is set.
#               NOTE(review): this definition is collapsed onto one line, so it is
#               not visible whether that append happens only when basedir was
#               defaulted - confirm against the properly indented original.
#   html_kw / figure_kw - default to empty dicts; html_kw is expanded into
#               create_html, figure_kw is passed to plot_and_save_all_series.
#
# Outputs go to fixed sub-directory names under basedir: 'gene-subplot',
# 'gene-region-fits', 'gene-correlations' and 'score_distributions'.
#
# Stage switches:
#   do_genes  -> plot_and_save_all_genes       (skipped when only_main_html)
#   do_series -> plot_and_save_all_series      (skipped when only_main_html)
#   do_hist   -> create_score_distribution_html (also requires k_of_n is None)
#   do_html   -> create_html                   (requires k_of_n is None)
# Gene correlation plots are written only when use_correlations is set,
# `correlations` is not None, and only_main_html is False.
#
# b_pathways is True when fewer than half of the genes returned by
# load_17_pathways_breakdown() are absent from data.gene_names.
# No return statement is visible, so the function returns None.
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False, k_of_n=None, use_correlations=False, correlations=None, show_change_distributions=False, html_kw=None, figure_kw=None): if fits is None: fits = get_all_fits(data,fitter,k_of_n) if basedir is None: basedir = join(results_dir(), fit_results_relative_path(data,fitter)) if use_correlations: basedir = join(basedir,'with-correlations') if html_kw is None: html_kw = {} if figure_kw is None: figure_kw = {} print 'Writing HTML under {}'.format(basedir) ensure_dir(basedir) gene_dir = 'gene-subplot' series_dir = 'gene-region-fits' correlations_dir = 'gene-correlations' scores_dir = 'score_distributions' if do_genes and not only_main_html: # relies on the sharding of the fits respecting gene boundaries plot_and_save_all_genes(data, fitter, fits, join(basedir,gene_dir), show_change_distributions) if do_series and not only_main_html: plot_and_save_all_series(data, fitter, fits, join(basedir,series_dir), use_correlations, show_change_distributions, figure_kw) if do_hist and k_of_n is None and not only_main_html: create_score_distribution_html(fits, use_correlations, join(basedir,scores_dir)) if do_html and k_of_n is None: link_to_correlation_plots = use_correlations and correlations is not None if link_to_correlation_plots and not only_main_html: plot_and_save_all_gene_correlations(data, correlations, join(basedir,correlations_dir)) dct_pathways = load_17_pathways_breakdown() pathway_genes = set.union(*dct_pathways.values()) data_genes = set(data.gene_names) missing = pathway_genes - data_genes b_pathways = len(missing) < len(pathway_genes)/2 # simple heuristic to create pathways only if we have most of the genes (currently 61 genes are missing) create_html( data, fitter, fits, basedir, gene_dir, series_dir, scores_dir, correlations_dir=correlations_dir, use_correlations=use_correlations, 
link_to_correlation_plots=link_to_correlation_plots, b_pathways=b_pathways, **html_kw )
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    """Return (bin_edges, change_vals) of the change distribution, with caching.

    The result is cached as a pickle under cache_dir(), in a file named after
    fit_results_relative_path(data, fitter). When that file exists it is
    loaded instead of recomputing, unless b_force is set.

    data, fitter -- identify the fits (and the cache file) to use.
    R2_threshold -- passed to iterate_fits to select which fits contribute
                    their theta.
    b_force      -- when True, ignore any cached file, recompute and
                    overwrite it. It was previously accepted but ignored.

    Reads the module-level `age_scaler` and `dev_stages` to derive the age
    range handed to compute_change_distribution.

    NOTE(review): the cache file name does not include R2_threshold, so a
    cached result computed with a different threshold is returned as-is.
    Pass b_force=True to refresh it.
    """
    filename = join(cache_dir(), fit_results_relative_path(data, fitter) + '.pkl')

    if isfile(filename) and not b_force:
        print('Loading onset distribution from {}'.format(filename))
        # NOTE(review): the file is read and written in text mode, matching how
        # existing cache files were produced. Confirm the pickle protocol in use
        # before switching to binary mode.
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
        return bin_edges, change_vals

    print('Computing...')
    fits = get_all_fits(data, fitter)
    thetas = [fit.theta for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
    # The histogram spans from the earliest to the latest scaled stage age.
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)
    bin_edges, change_vals = compute_change_distribution(fitter.shape, thetas, low, high, n_bins=50)

    print('Saving result to {}'.format(filename))
    ensure_dir(dirname(filename))
    with open(filename, 'w') as f:
        pickle.dump((bin_edges, change_vals), f)
    return bin_edges, change_vals
return mu,fit.LOO_score lst_mu_R2 = [get_onset_time(r) for r in regions] onset_times, lst_R2 = zip(*lst_mu_R2) r,pval = spearmanr(onset_times, range(len(regions))) return r,pval,lst_R2 lst_pathways = [ 'serotonin', 'dopamine', ] for pathway in lst_pathways: data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) shape = Sigmoid(priors='sigmoid_wide') fitter = Fitter(shape, sigma_prior='normal') fits = get_all_fits(data, fitter, allow_new_computation=False) # R2_threshold = 0.5 YYY problem - we might be using bad fits. regions = ['OFC', 'M1C', 'S1C', 'IPC', 'V1C'] scores = [] for g in data.gene_names: r,pval,lst_R2 = get_gene_correlation(fits,g,regions) scores.append( (g,r,pval,lst_R2) ) fig = plot_correlation_histogram(scores,pathway) save_figure(fig,'{}/gradual-maturation-hist.png'.format(pathway,pathway), under_results=True, b_close=True) for fR2 in [np.mean]: #[min,max,np.mean]: fig = plot_scatter(scores, pathway, fR2) save_figure(fig,'{}/gradual-maturation-scatter-{}.png'.format(pathway,fR2.__name__), under_results=True, b_close=True)
onset_times, lst_R2 = zip(*lst_mu_R2) r, pval = spearmanr(onset_times, range(len(regions))) return r, pval, lst_R2 lst_pathways = [ 'serotonin', 'dopamine', ] for pathway in lst_pathways: data = GeneData.load('both').restrict_pathway(pathway).scale_ages( age_scaler) shape = Sigmoid(priors='sigmoid_wide') fitter = Fitter(shape, sigma_prior='normal') fits = get_all_fits(data, fitter, allow_new_computation=False) # R2_threshold = 0.5 YYY problem - we might be using bad fits. regions = ['OFC', 'M1C', 'S1C', 'IPC', 'V1C'] scores = [] for g in data.gene_names: r, pval, lst_R2 = get_gene_correlation(fits, g, regions) scores.append((g, r, pval, lst_R2)) fig = plot_correlation_histogram(scores, pathway) save_figure(fig, '{}/gradual-maturation-hist.png'.format(pathway, pathway), under_results=True, b_close=True)
ax.set_xlabel('z score', fontsize=fontsize) ax.set_ylabel('probability', fontsize=fontsize) ax.tick_params(axis='both', labelsize=fontsize) return fig cfg.verbosity = 1 age_scaler = LogScaler() pathway = '17full' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages( age_scaler).shuffle() shape = Sigmoid('sigmoid_wide') fitter = Fitter(shape, sigma_prior='normal') fits = get_all_fits(data, fitter, allow_new_computation=False) fits_shuffled = get_all_fits(data_shuffled, fitter, allow_new_computation=False) R2_pairs = [(fit.LOO_score, fit2.LOO_score) for fit, fit2 in iterate_fits(fits, fits_shuffled)] R2 = np.array([r for r, r_shuffled in R2_pairs]) R2_shuffled = np.array([r_shuffled for r, r_shuffled in R2_pairs]) name = '{}-{}'.format(data.pathway, shape.cache_name()) fig = plot_score_distribution(R2, R2_shuffled) save_figure(fig, 'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True)
if __name__ == '__main__':
    # Command-line entry point: compare the fits of two shape/prior
    # configurations, print the largest differences and save a scatter plot.
    disable_all_warnings()

    # The first configuration comes from the common parser arguments; the
    # "...2" arguments below describe the configuration to compare against.
    parser = get_common_parser()
    parser.add_argument('--shape2', required=True, help='The shape to compare against', choices=allowed_shape_names())
    parser.add_argument('--scaling2', help='The scaling used when fitting shape2. Default: none', choices=allowed_scaler_names())
    parser.add_argument('--sigma_prior2', help='Prior to use for 1/sigma when fitting shape2. Default: None', choices=get_allowed_priors(is_sigma=True))
    parser.add_argument('--priors2', help='The priors used for theta when fitting shape2. Default: None', choices=get_allowed_priors())
    # The help text now names the file that is actually written by default
    # (see the fallback below); it used to say results/comparison.png.
    parser.add_argument('--filename', help='Where to save the figure. Default: results/shape_comparison.png')
    parser.add_argument('--show', help='Show figure and wait before exiting', action='store_true')
    parser.add_argument('--ndiffs', type=int, default=5, help='Number of top diffs to show. Default=5.')
    args = parser.parse_args()

    data1, fitter1 = process_common_inputs(args)
    # The second data set shares dataset/pathway/from_age/shuffle with the
    # first and differs only in the age scaling.
    data2 = get_data_from_args(args.dataset, args.pathway, args.from_age, args.scaling2, args.shuffle)
    fitter2 = get_fitter_from_args(args.shape2, args.priors2, args.sigma_prior2)
    fits1 = get_all_fits(data1, fitter1)
    fits2 = get_all_fits(data2, fitter2)

    print_diff_points(data1, fitter1, fits1, data2, fitter2, fits2, args.ndiffs)
    fig = plot_comparison_scatter(data1, fitter1, fits1, data2, fitter2, fits2)

    filename = args.filename
    if filename is None:
        # Default location: <results_dir>/shape_comparison.png
        ensure_dir(results_dir())
        filename = join(results_dir(), 'shape_comparison.png')
    save_figure(fig, filename)

    if args.show:
        plt.show(block=True)
# Script section: example fits for selected gene/region pairs, followed by
# whole-pathway comparison figures.
#
# Part 1: for each (gene, region, yrange) entry in GRs and each fitter in
# `fitters`, fit the single series with fitter.fit, plot it with
# plot_one_series and save it as RP/fit-examples-<shape cache name>-<g>-<r>.png.
# Part 2: restrict `data` to pathway '17full', get the fits for every fitter
# with allow_new_computation=False, then save a bar comparison, a
# comparison-vs-R2 plot and a scatter of shapes[0] against shapes[1].
# All figures are closed at the end.
#
# NOTE(review): `data`, `fitters` and `shapes` are not defined in this
# section; presumably they are set earlier in the script - confirm.
# NOTE(review): this section is collapsed onto one physical line, so the
# leading banner comment swallows the statements after it as written here.
##################################################### GRs = [ ('ABHD4','STC', (5, 8)), ] for g,r,yrange in GRs: for fitter in fitters: print 'Doing {}@{}...'.format(g,r) series = data.get_one_series(g,r) theta,_,_,_ = fitter.fit(series.ages, series.single_expression) fig = plot_one_series(series, fitter.shape, theta, yrange) save_figure(fig,'RP/fit-examples-{}-{}-{}.png'.format(fitter.shape.cache_name(), g,r), under_results=True) ##################################################### # Comparison for whole pathway ##################################################### pathway = '17full' data = data.restrict_pathway(pathway) fits = [get_all_fits(data,fitter,allow_new_computation=False) for fitter in fitters] fig = plot_comparison_bar(data, shapes, fits) save_figure(fig,'RP/sigslope-comparison-bar-{}.png'.format(data.pathway), under_results=True) fig = plot_comparison_over_R2_score(data, shapes, fits) save_figure(fig,'RP/sigslope-comparison-vs-R2-{}.png'.format(data.pathway), under_results=True) fig = plot_comparison_scatter(data,shapes[0],fits[0],shapes[1],fits[1]) save_figure(fig,'RP/scatter-{}-{}-{}.png'.format(shapes[0],shapes[1],pathway), under_results=True) plt.close('all')
save_figure(fig, filename, b_close=True, under_results=True) dct_tuples.update(dct_region_tuples) print_best_improvements(dct_tuples) tuples = dct_tuples.values() pairs = [(x[0],x[1]) for x in tuples] fig = plot_comparison_scatter(pairs,pathway) filename = join('RP','correlation-diff-scatter-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) fig = plot_comparison_bar(tuples) filename = join('RP','correlation-diff-bar-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) fig = plot_comparison_bar(tuples, several_levels=True) filename = join('RP','correlation-diff-bar-several-levels-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) disable_all_warnings() cfg.verbosity = 1 age_scaler = LogScaler() shape = Sigslope('sigslope80') fitter = Fitter(shape, sigma_prior='normal') pathways = ['cannabinoids', 'serotonin'] for pathway in pathways: data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) fits = get_all_fits(data, fitter, n_correlation_iterations=4, allow_new_computation=False) analyze_pathway(pathway, data, fitter, fits)
def get_fits():
    """Return all fits for the '17pathways' subset of the 'both' data set.

    Ages are scaled with the module-level `age_scaler`. The fitter is a
    Sigmoid shape with 'sigmoid_wide' priors and a 'normal' sigma prior.
    """
    all_data = GeneData.load('both')
    pathway_data = all_data.restrict_pathway('17pathways')
    scaled_data = pathway_data.scale_ages(age_scaler)
    sigmoid_fitter = Fitter(Sigmoid(priors='sigmoid_wide'), sigma_prior='normal')
    return get_all_fits(scaled_data, sigmoid_fitter)
tuples = dct_tuples.values() pairs = [(x[0], x[1]) for x in tuples] fig = plot_comparison_scatter(pairs, pathway) filename = join('RP', 'correlation-diff-scatter-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) fig = plot_comparison_bar(tuples) filename = join('RP', 'correlation-diff-bar-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) fig = plot_comparison_bar(tuples, several_levels=True) filename = join( 'RP', 'correlation-diff-bar-several-levels-{}.png'.format(pathway)) save_figure(fig, filename, b_close=True, under_results=True) disable_all_warnings() cfg.verbosity = 1 age_scaler = LogScaler() shape = Sigslope('sigslope80') fitter = Fitter(shape, sigma_prior='normal') pathways = ['cannabinoids', 'serotonin'] for pathway in pathways: data = GeneData.load('both').restrict_pathway(pathway).scale_ages( age_scaler) fits = get_all_fits(data, fitter, n_correlation_iterations=4, allow_new_computation=False) analyze_pathway(pathway, data, fitter, fits)
# Script: list the (g, r) keys of serotonin-pathway fits whose first theta
# parameter is extreme.
#
# Loads the 'both' data set, restricts it to the 'serotonin' pathway and
# scales ages with LogScaler. Fits a Sigmoid shape with no priors, then
# collects into `extreme` every (g, r) yielded by iterate_fits with
# R2_threshold=0.5 for which abs(fit.theta[0]) > 100.
#
# NOTE(review): g and r look like gene and region names - confirm against
# iterate_fits. `import setup` comes first, presumably for its import-time
# side effects - confirm before reordering imports.
import setup import config as cfg from load_data import GeneData from shapes.sigmoid import Sigmoid from fitter import Fitter from all_fits import get_all_fits, iterate_fits from scalers import LogScaler cfg.verbosity = 1 age_scaler = LogScaler() pathway = 'serotonin' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) fitter = Fitter(Sigmoid(priors=None)) fits = get_all_fits(data, fitter) extreme = [(g, r) for dsname, g, r, fit in iterate_fits( fits, R2_threshold=0.5, return_keys=True) if abs(fit.theta[0]) > 100]
# save_fits_and_create_html: write the plots and HTML pages for a set of fits
# (variant with optional exon-level plots).
#
# Inputs and defaults, as read from the code below:
#   fits      - when None, obtained from get_all_fits(data, fitter, k_of_n).
#   basedir   - when None, join(results_dir(), fit_results_relative_path(data, fitter)).
#               'with-correlations' is appended when use_correlations is set.
#               NOTE(review): this definition is collapsed onto one line, so it is
#               not visible whether that append happens only when basedir was
#               defaulted - confirm against the properly indented original.
#   html_kw / figure_kw - default to empty dicts; html_kw is expanded into
#               create_html, figure_kw is passed to plot_and_save_all_series.
#
# Outputs go to fixed sub-directory names under basedir: 'gene-subplot',
# 'gene-region-fits', 'gene-correlations', 'score_distributions', and an exons
# directory named 'exons_subplot_series' when cfg.exons_plots_from_series is
# truthy, otherwise 'exons_subplot'.
#
# Stage switches:
#   do_genes     -> plot_and_save_all_genes       (skipped when only_main_html)
#   do_series    -> plot_and_save_all_series      (skipped when only_main_html)
#   exons_layout -> plot_and_save_all_exons_from_series or plot_and_save_all_exons,
#                   chosen by cfg.exons_plots_from_series (skipped when only_main_html)
#   do_hist      -> create_score_distribution_html (also requires k_of_n is None)
#   do_html      -> create_html                   (requires k_of_n is None)
# Gene correlation plots are written only when use_correlations is set,
# `correlations` is not None, and only_main_html is False.
#
# b_pathways is True when fewer than half of the genes returned by
# load_17_pathways_breakdown() are absent from data.gene_names.
# No return statement is visible, so the function returns None.
def save_fits_and_create_html(data, fitter, fits=None, basedir=None, do_genes=True, do_series=True, do_hist=True, do_html=True, only_main_html=False, k_of_n=None, use_correlations=False, correlations=None, show_change_distributions=False, exons_layout=False, html_kw=None, figure_kw=None): if fits is None: fits = get_all_fits(data, fitter, k_of_n) if basedir is None: basedir = join(results_dir(), fit_results_relative_path(data, fitter)) if use_correlations: basedir = join(basedir, 'with-correlations') if html_kw is None: html_kw = {} if figure_kw is None: figure_kw = {} print 'Writing HTML under {}'.format(basedir) ensure_dir(basedir) gene_dir = 'gene-subplot' exons_dir = 'exons_subplot_series' if cfg.exons_plots_from_series else 'exons_subplot' series_dir = 'gene-region-fits' correlations_dir = 'gene-correlations' scores_dir = 'score_distributions' if do_genes and not only_main_html: # relies on the sharding of the fits respecting gene boundaries plot_and_save_all_genes(data, fitter, fits, join(basedir, gene_dir), show_change_distributions) if do_series and not only_main_html: plot_and_save_all_series(data, fitter, fits, join(basedir, series_dir), use_correlations, show_change_distributions, exons_layout, figure_kw) if exons_layout and not only_main_html: if cfg.exons_plots_from_series: plot_and_save_all_exons_from_series(fits, join(basedir, exons_dir), join(basedir, series_dir)) else: plot_and_save_all_exons(data, fitter, fits, join(basedir, exons_dir)) if do_hist and k_of_n is None and not only_main_html: create_score_distribution_html(fits, use_correlations, join(basedir, scores_dir)) if do_html and k_of_n is None: link_to_correlation_plots = use_correlations and correlations is not None if link_to_correlation_plots and not only_main_html: plot_and_save_all_gene_correlations( data, correlations, join(basedir, correlations_dir)) dct_pathways = load_17_pathways_breakdown() pathway_genes = set.union(*dct_pathways.values()) data_genes = set(data.gene_names) missing 
= pathway_genes - data_genes b_pathways = len(missing) < len( pathway_genes ) / 2 # simple heuristic to create pathways only if we have most of the genes (currently 61 genes are missing) create_html(data, fitter, fits, basedir, gene_dir, exons_dir, series_dir, scores_dir, correlations_dir=correlations_dir, use_correlations=use_correlations, link_to_correlation_plots=link_to_correlation_plots, b_pathways=b_pathways, exons_layout=exons_layout, **html_kw)
# Script: list the (g, r) keys of serotonin-pathway fits whose first theta
# parameter is extreme.
#
# Loads the 'both' data set, restricts it to the 'serotonin' pathway and
# scales ages with LogScaler. Fits a Sigmoid shape with no priors, then
# collects into `extreme` every (g, r) yielded by iterate_fits with
# R2_threshold=0.5 for which abs(fit.theta[0]) > 100.
#
# NOTE(review): g and r look like gene and region names - confirm against
# iterate_fits. `import setup` comes first, presumably for its import-time
# side effects - confirm before reordering imports.
import setup import config as cfg from load_data import GeneData from shapes.sigmoid import Sigmoid from fitter import Fitter from all_fits import get_all_fits, iterate_fits from scalers import LogScaler cfg.verbosity = 1 age_scaler = LogScaler() pathway = 'serotonin' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) fitter = Fitter(Sigmoid(priors=None)) fits = get_all_fits(data,fitter) extreme = [(g,r) for dsname,g,r,fit in iterate_fits(fits, R2_threshold=0.5, return_keys=True) if abs(fit.theta[0]) > 100]
ax.bar(bin_edges[:-1], probs, width=width, color='b') ax.set_xlabel('z score', fontsize=fontsize) ax.set_ylabel('probability', fontsize=fontsize) ax.tick_params(axis='both', labelsize=fontsize) return fig cfg.verbosity = 1 age_scaler = LogScaler() pathway = '17full' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler).shuffle() shape = Sigmoid('sigmoid_wide') fitter = Fitter(shape,sigma_prior='normal') fits = get_all_fits(data,fitter,allow_new_computation=False) fits_shuffled = get_all_fits(data_shuffled,fitter,allow_new_computation=False) R2_pairs = [(fit.LOO_score,fit2.LOO_score) for fit,fit2 in iterate_fits(fits,fits_shuffled)] R2 = np.array([r for r,r_shuffled in R2_pairs]) R2_shuffled = np.array([r_shuffled for r,r_shuffled in R2_pairs]) name = '{}-{}'.format(data.pathway,shape.cache_name()) fig = plot_score_distribution(R2,R2_shuffled) save_figure(fig,'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True) mu_shuffled = np.mean(R2_shuffled) std_shuffled = np.std(R2_shuffled) z_scores = (R2-mu_shuffled)/std_shuffled fig = plot_z_scores(z_scores) save_figure(fig,'RP/R2-z-scores-{}.png'.format(name), under_results=True, b_close=True)