def get_change_distribution_for_whole_genome(all_data, fitter):
    """Load the cached whole-genome change distribution from its pickle file.

    NOTE: the distribution for all genes is not computed here; it should be
    precomputed by running onset_times_whole_genome.py.

    Returns:
        The (bin_edges, change_vals) pair stored in the cache file.
    """
    cache_root = cache_dir()
    filename = join(cache_root, fit_results_relative_path(all_data, fitter) + '.pkl')
    print('Loading whole genome onset distribution from {}'.format(filename))
    with open(filename) as f:
        cached = pickle.load(f)
    bin_edges, change_vals = cached
    return bin_edges, change_vals
def get_change_distribution_for_whole_genome(all_data, fitter):
    """Read the whole-genome onset distribution back from the cache directory.

    NOTE: nothing is computed in this function. The pickle is expected to
    exist already, precomputed by running onset_times_whole_genome.py.

    Returns:
        Tuple (bin_edges, change_vals) exactly as unpickled.
    """
    pickle_path = join(
        cache_dir(),
        fit_results_relative_path(all_data, fitter) + '.pkl',
    )
    message = 'Loading whole genome onset distribution from {}'
    print(message.format(pickle_path))
    with open(pickle_path) as f:
        bin_edges, change_vals = pickle.load(f)
    return bin_edges, change_vals
def export_timing_info_for_all_fits(data, fitter, fits):
    """Compute the timing info for all fits and save it as a .mat file.

    The change distribution object returned by
    compute_timing_info_for_all_fits is flattened into a dictionary (plus a
    README string describing each field) and written with save_matfile to
    '<cache_dir>/<fit results path>-change-dist.mat'.
    """
    change_dist = compute_timing_info_for_all_fits(data, fitter, fits)

    # NOTE(review): the line breaks inside this literal were reconstructed
    # from a whitespace-collapsed source - confirm against the original file.
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes:
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions:
Region names for the regions represented in other arrays

age_scaler:
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""

    # String-valued fields are converted first, in the same order as before.
    gene_cells = list_of_strings_to_matlab_cell_array(change_dist.genes)
    region_cells = list_of_strings_to_matlab_cell_array(change_dist.regions)
    scaler_name = scalers.unify(change_dist.age_scaler).cache_name()

    mdict = {
        'README_CHANGE_DISTRIBUTIONS': README,
        'genes': gene_cells,
        'regions': region_cells,
        'age_scaler': scaler_name,
        'mu': change_dist.mu,
        'std': change_dist.std,
        'bin_edges': change_dist.bin_edges,
        'bin_centers': change_dist.bin_centers,
        'weights': change_dist.weights,
    }

    cache_root = cache_dir()
    filename = join(cache_root, fit_results_relative_path(data, fitter) + '-change-dist.mat')
    save_matfile(mdict, filename)
def save_to_mat(self):
    """Write every entry of self.res to a MATLAB file, one column per field.

    The output path is
    '<cache_dir>/both/dprime-all-pathways-and-regions-<suffix>.mat', where
    <suffix> is self._filename_suffix. String fields are converted with
    list_of_strings_to_matlab_cell_array; the remaining fields become numpy
    arrays.
    """
    filename = join(
        cache_dir(),
        'both',
        'dprime-all-pathways-and-regions-{}.mat'.format(self._filename_suffix),
    )

    def text_column(field):
        # One MATLAB cell-array column holding `field` of every result.
        return list_of_strings_to_matlab_cell_array(
            [getattr(x, field) for x in self.res])

    def numeric_column(field):
        # One numpy column holding `field` of every result.
        return np.array([getattr(x, field) for x in self.res])

    mdict = {
        'pathway': text_column('pathway'),
        'r1': text_column('r1'),
        'r2': text_column('r2'),
        'score': numeric_column('score'),
        'delta': numeric_column('delta'),
        'weighted_delta': numeric_column('weighted_delta'),
        'mu1_years': numeric_column('mu1_years'),
        'mu2_years': numeric_column('mu2_years'),
        'pval': numeric_column('pval'),
        'pathway_size': numeric_column('pathway_size'),
    }
    print('Saving results to {}'.format(filename))
    savemat(filename, mdict, oned_as='column')
def export_timing_info_for_all_fits(data, fitter, fits):
    """Save the change-distribution timing info of all fits to a .mat file.

    Calls compute_timing_info_for_all_fits, packs the fields of its result
    together with a README describing them, and hands the dictionary to
    save_matfile. The file is written next to the fit results in the cache
    directory, with the suffix '-change-dist.mat'.
    """
    # NOTE(review): line breaks in the README literal below were
    # reconstructed from a whitespace-collapsed source - confirm against the
    # original file before relying on the exact text layout.
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes:
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions:
Region names for the regions represented in other arrays

age_scaler:
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    dist = compute_timing_info_for_all_fits(data, fitter, fits)

    mdict = {}
    mdict['README_CHANGE_DISTRIBUTIONS'] = README
    mdict['genes'] = list_of_strings_to_matlab_cell_array(dist.genes)
    mdict['regions'] = list_of_strings_to_matlab_cell_array(dist.regions)
    mdict['age_scaler'] = scalers.unify(dist.age_scaler).cache_name()
    # The numeric arrays are exported as-is.
    mdict['mu'] = dist.mu
    mdict['std'] = dist.std
    mdict['bin_edges'] = dist.bin_edges
    mdict['bin_centers'] = dist.bin_centers
    mdict['weights'] = dist.weights

    filename = join(
        cache_dir(),
        fit_results_relative_path(data, fitter) + '-change-dist.mat',
    )
    save_matfile(mdict, filename)
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    """Return the change ("onset") distribution over all fits, with pickle caching.

    Args:
        data: passed to fit_results_relative_path (cache key) and get_all_fits.
        fitter: fitter whose ``shape`` is handed to compute_change_distribution.
        R2_threshold: forwarded to iterate_fits to select the contributing fits.
        b_force: when True, ignore an existing cache file, recompute the
            distribution and overwrite the cache. Previously this flag was
            accepted but never consulted.

    Returns:
        Tuple (bin_edges, change_vals), either unpickled from the cache file
        or freshly produced by compute_change_distribution.
    """
    filename = join(cache_dir(), fit_results_relative_path(data, fitter) + '.pkl')

    # NOTE(review): the cache file name does not depend on R2_threshold, so a
    # result cached for one threshold is returned for any other threshold
    # unless b_force is set.
    if isfile(filename) and not b_force:
        print('Loading onset distribution from {}'.format(filename))
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
        return bin_edges, change_vals

    print('Computing...')
    fits = get_all_fits(data, fitter)
    thetas = [fit.theta for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
    # NOTE(review): age_scaler and dev_stages are not defined in this
    # function; they are presumably module-level names - confirm.
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)
    bin_edges, change_vals = compute_change_distribution(
        fitter.shape, thetas, low, high, n_bins=50)

    print('Saving result to {}'.format(filename))
    ensure_dir(dirname(filename))
    # Text mode is kept on purpose so the file stays readable by the existing
    # text-mode loaders of this cache.
    with open(filename, 'w') as f:
        pickle.dump((bin_edges, change_vals), f)
    return bin_edges, change_vals
def save_to_mat(self):
    """Export self.res to '<cache_dir>/both/dprime-all-pathways-and-regions-<suffix>.mat'.

    Each attribute of the result objects becomes one entry of the saved
    dictionary: the three string attributes are converted to MATLAB cell
    arrays, the others to numpy arrays. <suffix> is self._filename_suffix.
    """
    basename = 'dprime-all-pathways-and-regions-{}.mat'.format(self._filename_suffix)
    filename = join(cache_dir(), 'both', basename)

    mdict = {}
    # String-valued columns.
    for field in ('pathway', 'r1', 'r2'):
        values = [getattr(x, field) for x in self.res]
        mdict[field] = list_of_strings_to_matlab_cell_array(values)
    # Numeric columns.
    for field in ('score', 'delta', 'weighted_delta', 'mu1_years',
                  'mu2_years', 'pval', 'pathway_size'):
        mdict[field] = np.array([getattr(x, field) for x in self.res])

    print('Saving results to {}'.format(filename))
    savemat(filename, mdict, oned_as='column')
def save_theta_text_files(data, fitter, fits):
    """Dump the spline parameters of every fit to one text file per dataset.

    Only fitters whose shape reports the cache name 'spline' are accepted.
    For each dataset a '<fit results path>.txt' file is written under the
    cache directory, with one record per (gene, region) fit that has a theta.
    Fits whose theta is None are skipped.
    """
    assert fitter.shape.cache_name() == 'spline', "save to text is only supported for splines at the moment"

    # NOTE(review): line breaks inside this template were reconstructed from
    # a whitespace-collapsed source - confirm against the original file.
    record_template = """\
Gene symbol: {g}
Region: {r}
Spline knots: {knots}
Spline coefficients: {coeffs}
Spline degree: {degree}
"""

    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.txt')
        dataset_fits = fits[dataset.name]
        print('Saving text file to {}'.format(filename))
        with open(filename, 'w') as f:
            for (g, r), fit in dataset_fits.iteritems():
                if fit.theta is None:
                    continue
                knots, coeffs, degree = fit.theta[0]
                gr_text = record_template.format(
                    g=g,
                    r=r,
                    knots=list(knots),
                    coeffs=list(coeffs),
                    degree=degree,
                )
                # Same output as "print >>f, gr_text": the text plus a newline.
                f.write(gr_text + '\n')
class SingleRegion(object):
    """Change-distribution timing data for all genes and regions, plus pathways.

    On construction the pathway lists for `listname` are read and the pickled
    change distribution is loaded; its fields are mirrored as attributes of
    this object together with gene->index and region->index lookup tables.
    """

    # Location of the pickled change distribution (evaluated at class creation).
    change_dist_filename = join(
        cache_dir(),
        'both',
        'fits-log-all-sigslope-theta-sigslope80-sigma-normal-change-dist.pkl',
    )

    def __init__(self, listname='all'):
        self.listname = listname
        self.pathways = pathway_lists.read_all_pathways(listname)
        self.change_dist = load_pickle(
            SingleRegion.change_dist_filename,
            'change distribution for all genes and regions',
        )
        dist = self.change_dist
        self.genes = dist.genes
        self.regions = dist.regions
        # Name -> position lookup tables used to index mu/std.
        self.g2i = dict((g, i) for i, g in enumerate(self.genes))
        self.r2i = dict((r, i) for i, r in enumerate(self.regions))
        self.age_scaler = dist.age_scaler
        self.mu = dist.mu
        self.std = dist.std
        self.bin_edges = dist.bin_edges
        self.bin_centers = dist.bin_centers
        self.weights = dist.weights

    def region_timings_per_pathway(self):
        """Return {pathway -> {region -> age}} for every pathway and region.

        The age is the mean of the per-gene `mu` values of the pathway's
        genes in that region, weighted by 1/std, and then passed through
        age_scaler.unscale.
        """
        def mean_age(pathway_genes, r):
            gene_rows = [self.g2i[g] for g in pathway_genes]
            region_col = self.r2i[r]
            ages = self.mu[gene_rows, region_col]
            inv_std = 1 / self.std[gene_rows, region_col]
            scaled_age = np.dot(inv_std, ages) / sum(inv_std)
            return self.age_scaler.unscale(scaled_age)

        res = {}  # pathway -> { r -> mu }
        for pathway in self.pathways.iterkeys():
            pathway_genes = self.pathways[pathway]
            per_region = {}
            for r in self.regions:
                per_region[r] = mean_age(pathway_genes, r)
            res[pathway] = per_region
        return res
def save_theta_text_files(data, fitter, fits):
    """Write spline knots/coefficients/degree of all fits to text files.

    One '.txt' file per dataset is created in the cache directory (named
    after the dataset's fit results path). Every (gene, region) fit with a
    non-None theta contributes one record; the first element of theta is
    unpacked as (knots, coeffs, degree). Asserts that the fitter's shape is a
    spline, since no other shape is supported here.
    """
    is_spline = fitter.shape.cache_name() == 'spline'
    assert is_spline, "save to text is only supported for splines at the moment"

    def format_record(g, r, theta):
        # Render one gene/region record.
        # NOTE(review): line breaks in the literal below were reconstructed
        # from a whitespace-collapsed source - confirm against the original.
        knots, coeffs, degree = theta[0]
        knots = list(knots)
        coeffs = list(coeffs)
        return """\
Gene symbol: {g}
Region: {r}
Spline knots: {knots}
Spline coefficients: {coeffs}
Spline degree: {degree}
""".format(g=g, r=r, knots=knots, coeffs=coeffs, degree=degree)

    for dataset in data.datasets:
        txt_path = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.txt')
        dataset_fits = fits[dataset.name]
        print('Saving text file to {}'.format(txt_path))
        with open(txt_path, 'w') as f:
            for (g, r), fit in dataset_fits.iteritems():
                if fit.theta is not None:
                    # Equivalent to "print >> f, text": text plus a newline.
                    f.write(format_record(g, r, fit.theta) + '\n')
def _cache_filename(base_filename, k_of_n):
    """Build the cache pickle path for `base_filename`.

    With k_of_n=None the result is '<cache_dir>/<base_filename>.pkl'.
    Otherwise k_of_n is unpacked as (k, n) and '.<k>-of-<n>' is appended to
    that path.
    """
    whole_path = join(cache_dir(), base_filename + '.pkl')
    if k_of_n is None:
        return whole_path
    k, n = k_of_n
    return '{}.{}-of-{}'.format(whole_path, k, n)
def _batch_dir(base_filename):
    """Return '<cache_dir>/<base_filename>-batches'."""
    cache_root = cache_dir()
    dir_name = base_filename + '-batches'
    return join(cache_root, dir_name)
import setup
from os.path import join
import project_dirs
from all_fits import Bunch, convert_format


def f_convert(fit):
    "added fitter and shape params"
    # The docstring above is kept verbatim: convert_format receives this
    # function and may read its __doc__ (not visible from here).
    copied_fields = (
        'fitter',
        'seed',
        'theta',
        'sigma',
        'fit_predictions',
        'LOO_predictions',
    )
    # Copy each listed attribute of the old fit onto a new Bunch.
    return Bunch(**dict((name, getattr(fit, name)) for name in copied_fields))


# One-off conversion of a single cached fits file.
filename = join(project_dirs.cache_dir(), 'kang2011', 'fits-serotonin-poly1-t0-s0.pkl')
convert_format(filename, f_convert)
import setup
from os.path import join
import project_dirs
from all_fits import Bunch, convert_format


def f_convert(fit):
    "added fitter and shape params"
    # Docstring left untouched on purpose - it is passed along with the
    # function to convert_format, which may use it.
    converted = {}
    for field in ('fitter', 'seed', 'theta', 'sigma',
                  'fit_predictions', 'LOO_predictions'):
        converted[field] = getattr(fit, field)
    return Bunch(**converted)


# Script body: convert one cached fits pickle using f_convert.
cache_root = project_dirs.cache_dir()
filename = join(cache_root, 'kang2011', 'fits-serotonin-poly1-t0-s0.pkl')
convert_format(filename, f_convert)
class RegionPairTiming(object):
    """Timing comparison between pairs of regions, per pathway.

    Combines the single-region data (SingleRegion) with a pickled "cube" of
    per-gene, per-region-pair values (d_mu and std). The per-gene score for a
    region pair is d_mu / std. A sampled baseline distribution of mean scores
    is built for every region pair on construction and used to turn pathway
    scores into z-scores and p-values.
    """

    # Location of the pickled d-prime cube (evaluated at class creation).
    cube_filename = join(
        cache_dir(),
        'both',
        'fits-log-all-sigslope-theta-sigslope80-sigma-normal-dprime-cube.pkl',
    )

    def __init__(self, listname='all'):
        self.listname = listname
        self.single = SingleRegion(listname)
        single = self.single
        self.pathways = single.pathways
        self.genes = single.genes
        self.regions = single.regions
        self.g2i = single.g2i
        self.r2i = single.r2i
        self.age_scaler = single.age_scaler
        self.mu = single.mu
        self.single_std = single.std

        cube = load_pickle(
            RegionPairTiming.cube_filename,
            name='timing d-prime info for all genes and region pairs',
        )
        self.d_mu = cube.d_mu
        self.pair_std = cube.std
        self.scores = self.d_mu / self.pair_std
        self.baseline = self.baseline_distribution_all_pairs(100, 10000)

    @cache(lambda self: join(
        cache_dir(),
        'both',
        'dprime-all-pathways-and-regions-{}.pkl'.format(self.listname)))
    def analyze_all_pathways(self):
        """Analyze every pathway for every region pair with r1 < r2.

        Returns the TimingResults built from a dictionary keyed by
        (pathway, r1, r2).
        """
        res = {}  # (pathway,r1,r2) -> timing results
        for pathway in self.pathways.iterkeys():
            print('Analyzing region pairs for pathway {}'.format(pathway))
            pathway_genes = self.pathways[pathway]
            for r1 in self.regions:
                for r2 in self.regions:
                    # keep only results "above the diagonal"
                    # (r1 < r2 lexicographically)
                    if r1 < r2:
                        res[(pathway, r1, r2)] = self.analyze_pathway_and_region_pair(
                            pathway_genes, r1, r2)
        return TimingResults.fromResultsDct(res, self.listname, self.pathways)

    def analyze_pathway_and_region_pair(self, pathway_genes, r1, r2):
        """Compute the timing statistics of one pathway for one region pair.

        Returns a Bunch with score, delta, weighted_delta, mu1_years,
        mu2_years, pval and pathway_size. If more than 5% of the pathway's
        d_mu values are NaN, every field except pathway_size is NaN.
        """
        ir1 = self.r2i[r1]
        ir2 = self.r2i[r2]
        pathway_ig = [self.g2i[g] for g in pathway_genes]

        # Pathway score vs. the sampled baseline for this region pair.
        score = nanmean(self.scores[pathway_ig, ir1, ir2])
        base_mu, base_sigma = self.baseline[(r1, r2)]
        base_sigma = base_sigma / np.sqrt(len(pathway_ig))
        z = (score - base_mu) / base_sigma
        pval = z_score_to_p_value(z)

        pathway_d_mu = self.d_mu[pathway_ig, ir1, ir2]
        pair_weights = 1 / self.pair_std[pathway_ig, ir1, ir2]
        # needed for the PFC region from colantuoni which doesn't contain all genes
        valid = ~np.isnan(pathway_d_mu)
        pair_weights = pair_weights[valid]
        pathway_d_mu = pathway_d_mu[valid]
        weighted_delta = np.dot(pair_weights, pathway_d_mu) / sum(pair_weights)
        delta = np.mean(pathway_d_mu)

        too_many_nans = False
        if not valid.all():
            assert r1 == 'PFC' or r2 == 'PFC', "r1={}, r2={}".format(r1, r2)
            n_genes = len(valid)
            n_non_valid = n_genes - np.count_nonzero(valid)
            too_many_nans = float(n_non_valid) / n_genes > 0.05

        def mean_age(ir):
            # 1/std weighted mean age of the pathway genes in region index ir.
            if too_many_nans:
                return np.nan
            ages = self.mu[pathway_ig, ir]
            inv_std = 1 / self.single_std[pathway_ig, ir]
            keep = ~np.isnan(inv_std)
            inv_std, ages = inv_std[keep], ages[keep]
            scaled_age = np.dot(inv_std, ages) / sum(inv_std)
            return self.age_scaler.unscale(scaled_age)

        if too_many_nans:
            score = delta = weighted_delta = pval = np.nan

        return Bunch(
            score=score,
            delta=delta,
            weighted_delta=weighted_delta,
            mu1_years=mean_age(ir1),
            mu2_years=mean_age(ir2),
            pval=pval,
            pathway_size=len(pathway_genes),
        )

    @cache(filename=join(cache_dir(), 'both', 'dprime-baseline.pkl'))
    def baseline_distribution_all_pairs(self, sample_size, n_samples):
        """Return {(r1, r2) -> (mu, sigma)} of the baseline for all region pairs.

        A pair whose mirror (r2, r1) was already sampled reuses that result
        with the sign of mu flipped instead of being sampled again.
        """
        res = {}
        for r1 in self.regions:
            print('Sampling baseline distribution of {} vs. all other regions'.format(r1))
            for r2 in self.regions:
                mirrored = (r2, r1)
                if mirrored in res:
                    mu, sigma = res[mirrored]
                    res[(r1, r2)] = -mu, sigma
                    continue
                res[(r1, r2)] = self.baseline_distribution_one_pair(
                    r1, r2, sample_size, n_samples)
        return res

    def baseline_distribution_one_pair(self, r1, r2, sample_size, n_samples):
        """Sample the baseline (mu, sigma) of mean scores for one region pair.

        Draws n_samples random subsets (with replacement) of sample_size
        genes and records the nanmean of their scores. mu is the mean of
        those values; sigma is their std multiplied by sqrt(sample_size).
        """
        ir1 = self.r2i[r1]
        ir2 = self.r2i[r2]
        pair_scores = self.scores[:, ir1, ir2]
        last_index = len(pair_scores) - 1
        x = np.empty(n_samples)
        for i in xrange(n_samples):
            inds = np.random.random_integers(0, last_index, sample_size)
            x[i] = nanmean(pair_scores[inds])
        return x.mean(), x.std() * np.sqrt(sample_size)
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Export the fits of every dataset in `data` to a .mat file.

    For each dataset, arrays indexed by gene and region are filled from the
    fits (scores, predictions, high-resolution curves and - when the shape
    supports it - theta), then saved with savemat to
    '<cache_dir>/<fit results path>.mat'. Entries with no fit stay NaN.

    When has_change_distributions is true the bin centers are read from
    fits.change_distribution_params and per-fit weights are exported too;
    otherwise both change-distribution fields are empty lists.
    """
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]
        print('Saving mat file to {}'.format(filename))

        shape = fitter.shape
        gene_names = dataset.gene_names
        gene_idx = dict((g, i) for i, g in enumerate(gene_names))
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = dict((r, i) for i, r in enumerate(region_names))
        n_regions = len(region_names)

        def nan_array(*dims):
            # NaN-initialised array of the given dimensions.
            return init_array(np.nan, *dims)

        write_theta = shape.can_export_params_to_matlab()
        theta = nan_array(shape.n_params(), n_genes, n_regions) if write_theta else np.nan

        fit_scores = nan_array(n_genes, n_regions)
        LOO_scores = nan_array(n_genes, n_regions)
        fit_predictions = nan_array(*dataset.expression.shape)
        LOO_predictions = nan_array(*dataset.expression.shape)

        n_points = cfg.n_curve_points_to_plot
        high_res_predictions = nan_array(n_points, n_genes, n_regions)
        scaled_high_res_ages = np.linspace(dataset.ages.min(), dataset.ages.max(), n_points)
        original_high_res_ages = scalers.unify(dataset.age_scaler).unscale(scaled_high_res_ages)

        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = nan_array(n_bins, n_genes, n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []

        for (g, r), fit in dataset_fits.iteritems():
            series = dataset.get_one_series(g, r)
            ig, ir = gene_idx[g], region_idx[r]
            has_theta = fit.theta is not None

            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and has_theta:
                theta[:, ig, ir] = fit.theta
            # Predictions only cover the samples of this series.
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig, ir] = fit.LOO_predictions
            if has_theta:
                high_res_predictions[:, ig, ir] = shape.f(fit.theta, scaled_high_res_ages)
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights

        mdict = {
            'gene_names': list_of_strings_to_matlab_cell_array(gene_names),
            'region_names': list_of_strings_to_matlab_cell_array(region_names),
            'theta': theta,
            'fit_scores': fit_scores,
            'LOO_scores': LOO_scores,
            'fit_predictions': fit_predictions,
            'LOO_predictions': LOO_predictions,
            'high_res_predictions': high_res_predictions,
            'high_res_ages': original_high_res_ages,
            'change_distribution_bin_centers': change_distribution_bin_centers,
            'change_distribution_weights': change_distribution_weights,
        }
        savemat(filename, mdict, oned_as='column')
def _batch_dir(base_filename):
    """Path of the batches directory: `base_filename` + '-batches' under cache_dir()."""
    batches_path = join(cache_dir(), '{}-batches'.format(base_filename))
    return batches_path
def _cache_filename(base_filename, k_of_n):
    """Return the pickle path in the cache directory for `base_filename`.

    When k_of_n is given it is unpacked as (k, n) and the suffix
    '.<k>-of-<n>' is appended after the '.pkl' extension; with None the
    plain '<base_filename>.pkl' path is returned.
    """
    cache_root = cache_dir()
    filename = join(cache_root, base_filename + '.pkl')
    if k_of_n is not None:
        part, total = k_of_n
        filename = '{}.{}-of-{}'.format(filename, part, total)
    return filename
for dsname,g,r,fit in iterate_fits(fits, return_keys=True): weights = calc_bootstrap_change_distribution(shape, fit.theta_samples, bin_edges) fit.change_distribution_weights = weights fit.change_distribution_spread = change_distribution_spread_cumsum(bin_centers, weights) fit.change_distribution_mean_std = change_distribution_mean_and_std(bin_centers, weights) def calc_bootstrap_change_distribution(shape, theta_samples, bin_edges): bin_centers = bin_edges_to_centers(bin_edges) n_params, n_samples = theta_samples.shape weights = np.zeros(bin_centers.shape) for i in xrange(n_samples): weights += calc_change_distribution(shape, theta_samples[:,i], bin_edges) weights /= n_samples # now values are in fraction of total change (doesn't have to sum up to 1 if ages don't cover the whole transition range) return weights @cache(lambda data, fitter, fits: join(cache_dir(), fit_results_relative_path(data,fitter) + '-dprime-cube.pkl')) def compute_dprime_measures_for_all_pairs(data, fitter, fits): genes = data.gene_names regions = data.region_names r2ds = data.region_to_dataset() cube_shape = (len(genes), len(regions), len(regions)) d_mu = np.empty(cube_shape) # mu2-mu1 for all genes and region pairs std = np.empty(cube_shape) # std (combined) for all genes and region pairs def get_mu_std(g,r): dsfits = fits[r2ds[r]] fit = dsfits.get((g,r)) if fit is None: return np.nan, np.nan else: return fit.change_distribution_mean_std for ig,g in enumerate(genes):
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Save one .mat file per dataset with the contents of its fits.

    Each file ('<cache_dir>/<fit results path>.mat') holds gene and region
    names plus NaN-initialised arrays filled from the fits: fit and LOO
    scores, fit and LOO predictions, curves evaluated on an evenly spaced age
    grid between the dataset's minimum and maximum age, and theta when
    shape.can_export_params_to_matlab() is true (a scalar NaN otherwise).

    Change-distribution bin centers and weights are included only when
    has_change_distributions is true; otherwise they are empty lists.
    """
    for dataset in data.datasets:
        mat_path = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]
        print('Saving mat file to {}'.format(mat_path))

        shape = fitter.shape
        gene_names = dataset.gene_names
        gene_idx = {}
        for i, g in enumerate(gene_names):
            gene_idx[g] = i
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {}
        for i, r in enumerate(region_names):
            region_idx[r] = i
        n_regions = len(region_names)
        per_fit_dims = (n_genes, n_regions)

        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.nan, shape.n_params(), *per_fit_dims)
        else:
            theta = np.nan

        fit_scores = init_array(np.nan, *per_fit_dims)
        LOO_scores = init_array(np.nan, *per_fit_dims)
        fit_predictions = init_array(np.nan, *dataset.expression.shape)
        LOO_predictions = init_array(np.nan, *dataset.expression.shape)
        high_res_predictions = init_array(
            np.nan, cfg.n_curve_points_to_plot, *per_fit_dims)
        # Age grid in scaled units, and the same grid in original units.
        scaled_high_res_ages = np.linspace(
            dataset.ages.min(), dataset.ages.max(), cfg.n_curve_points_to_plot)
        unified_scaler = scalers.unify(dataset.age_scaler)
        original_high_res_ages = unified_scaler.unscale(scaled_high_res_ages)

        change_distribution_bin_centers = []
        change_distribution_weights = []
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.nan, n_bins, *per_fit_dims)

        for (g, r), fit in dataset_fits.iteritems():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            sample_inds = series.original_inds
            if fit.fit_predictions is not None:
                fit_predictions[sample_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[sample_inds, ig, ir] = fit.LOO_predictions
            if fit.theta is not None:
                curve = shape.f(fit.theta, scaled_high_res_ages)
                high_res_predictions[:, ig, ir] = curve
            # Fits without bootstrap change weights simply lack the attribute.
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights

        mdict = {}
        mdict['gene_names'] = list_of_strings_to_matlab_cell_array(gene_names)
        mdict['region_names'] = list_of_strings_to_matlab_cell_array(region_names)
        mdict['theta'] = theta
        mdict['fit_scores'] = fit_scores
        mdict['LOO_scores'] = LOO_scores
        mdict['fit_predictions'] = fit_predictions
        mdict['LOO_predictions'] = LOO_predictions
        mdict['high_res_predictions'] = high_res_predictions
        mdict['high_res_ages'] = original_high_res_ages
        mdict['change_distribution_bin_centers'] = change_distribution_bin_centers
        mdict['change_distribution_weights'] = change_distribution_weights
        savemat(mat_path, mdict, oned_as='column')
bin_centers, weights) def calc_bootstrap_change_distribution(shape, theta_samples, bin_edges): bin_centers = bin_edges_to_centers(bin_edges) n_params, n_samples = theta_samples.shape weights = np.zeros(bin_centers.shape) for i in xrange(n_samples): weights += calc_change_distribution(shape, theta_samples[:, i], bin_edges) weights /= n_samples # now values are in fraction of total change (doesn't have to sum up to 1 if ages don't cover the whole transition range) return weights @cache(lambda data, fitter, fits: join( cache_dir(), fit_results_relative_path(data, fitter) + '-dprime-cube.pkl')) def compute_dprime_measures_for_all_pairs(data, fitter, fits): genes = data.gene_names regions = data.region_names r2ds = data.region_to_dataset() cube_shape = (len(genes), len(regions), len(regions)) d_mu = np.empty(cube_shape) # mu2-mu1 for all genes and region pairs std = np.empty(cube_shape) # std (combined) for all genes and region pairs def get_mu_std(g, r): dsfits = fits[r2ds[r]] fit = dsfits.get((g, r)) if fit is None: return np.nan, np.nan else: