Esempio n. 1
0
def export_cube():
    """Export the region-pair timing cube to ``cube.mat``.

    Loads the cube object from ``RegionPairTiming.cube_filename`` and saves
    its arrays, plus a README string describing each variable, to
    ``<results_dir()>/export/cube.mat`` using ``save_matfile``.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # The README travels inside the .mat file, so every heading has to name
    # the variable exactly as it is keyed in ``mdict`` below (e.g. ``scores``).
    README = """\
d_mu:
mu(r2)-mu(r1) for every gene and region pair. 
Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: 
The combined standard deviation of the two change distributions.
std = sqrt(0.5*(std1^2 + std2^2))
Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
The d' for the two change distributions. Equal to d_mu ./ combined_std.
Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        # d' score: difference of means over the combined standard deviation.
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def export_cube():
    """Write the region-pair timing cube and its README to ``cube.mat``.

    The cube is read with ``load_pickle`` from
    ``RegionPairTiming.cube_filename``; the output goes to
    ``<results_dir()>/export/cube.mat`` through ``save_matfile``.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)
    # README headings must match the keys of ``mdict`` one-to-one, because the
    # text is the only description of the variables that ships with the file.
    README = """\
d_mu:
mu(r2)-mu(r1) for every gene and region pair. 
Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: 
The combined standard deviation of the two change distributions.
std = sqrt(0.5*(std1^2 + std2^2))
Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
The d' for the two change distributions. Equal to d_mu ./ combined_std.
Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE = README,
        genes = list_of_strings_to_matlab_cell_array(cube.genes),
        regions = list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler = scalers.unify(cube.age_scaler).cache_name(),
        d_mu = cube.d_mu,
        combined_std = cube.std,
        # Saved under the key ``scores``: d_mu divided by the combined std.
        scores = cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def export_pathways():
    """Export pathway and pathway-list membership tables to ``pathways.mat``.

    Gene positions are taken from the change-distribution object loaded from
    ``SingleRegion.change_dist_filename``; pathways and pathway lists come
    from ``pathway_lists``. The result, plus a README string describing each
    variable, is saved to ``<results_dir()>/export/pathways.mat``.

    Raises:
        KeyError: if a pathway contains a gene that is not in
            ``change_dist.genes``, or a list references an unknown pathway.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    # NOTE that matlab is one based
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}

    pathways = pathway_lists.read_all_pathways()
    # Snapshot the names into a list so every structure below shares one
    # fixed order (and so the value is a real list on Python 3 as well).
    pathway_names = list(pathways.keys())
    n_pathways = len(pathway_names)

    # Fill pre-allocated 1-D object arrays element by element. Building them
    # with np.array([...], dtype=object) would collapse into a 2-D array when
    # all pathways happen to have the same number of genes, which would break
    # the "cell array of cell arrays" layout documented in the README.
    pathway_genes_names = np.empty(n_pathways, dtype=object)
    pathway_genes_idx = np.empty(n_pathways, dtype=object)
    for i, p in enumerate(pathway_names):
        genes_in_pathway = pathways[p]
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(genes_in_pathway)
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in genes_in_pathway])

    # NOTE matlab indexing is one based
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}
    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]
    README = """\
pathway_names:
Cell array of all pathway names. The name in cell number k is the name of the
pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
Cell array (size <n-pathways>). Each cell contains a cell array of strings which 
are the gene symbols of the genes in that pathway.

pathway_genes_idx:
Same as pathway_genes_names, but each cell in the outer cell array is now an 
array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat.
Hopefully this should be easier to use in matlab.

list_names:
Names of pathway lists prepared by Noa

list_pathway_names:
Cell array. One item per list. Each item is a cell array of strings which are 
the names of the pathways belonging to that list.

list_pathway_idx:
Same as list_pathway_names, but instead of listing the pathways by name, they 
are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS = README,
        pathway_names = list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names = pathway_genes_names,
        pathway_genes_idx = pathway_genes_idx,
        list_names = list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names = list_pathway_names,
        list_pathway_idx = list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def export_timing_info_for_all_fits(data, fitter, fits):
    """Compute timing info for all fits and save it to a ``-change-dist.mat`` file.

    The object returned by ``compute_timing_info_for_all_fits(data, fitter, fits)``
    is written, together with a README string describing each variable, to
    ``<cache_dir()>/<fit_results_relative_path(data, fitter)>-change-dist.mat``.
    """
    timing = compute_timing_info_for_all_fits(data, fitter, fits)
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # String-valued entries are converted up front; numeric arrays pass through.
    gene_cells = list_of_strings_to_matlab_cell_array(timing.genes)
    region_cells = list_of_strings_to_matlab_cell_array(timing.regions)
    scaler_name = scalers.unify(timing.age_scaler).cache_name()
    payload = {
        'README_CHANGE_DISTRIBUTIONS': README,
        'genes': gene_cells,
        'regions': region_cells,
        'age_scaler': scaler_name,
        'mu': timing.mu,
        'std': timing.std,
        'bin_edges': timing.bin_edges,
        'bin_centers': timing.bin_centers,
        'weights': timing.weights,
    }
    out_dir = cache_dir()
    relative_name = fit_results_relative_path(data, fitter) + '-change-dist.mat'
    save_matfile(payload, join(out_dir, relative_name))
Esempio n. 5
0
 def save_to_mat(self):
     """Save the per-result fields of ``self.res`` as column arrays in a .mat file.

     The file is written with ``savemat`` to
     ``<cache_dir()>/both/dprime-all-pathways-and-regions-<suffix>.mat`` where
     ``<suffix>`` is ``self._filename_suffix``.
     """
     def column(attr):
         # One value per entry of self.res, in result order.
         return [getattr(item, attr) for item in self.res]

     root = cache_dir()
     basename = 'dprime-all-pathways-and-regions-{}.mat'.format(self._filename_suffix)
     filename = join(root, 'both', basename)
     mdict = {
         'pathway': list_of_strings_to_matlab_cell_array(column('pathway')),
         'r1': list_of_strings_to_matlab_cell_array(column('r1')),
         'r2': list_of_strings_to_matlab_cell_array(column('r2')),
         'score': np.array(column('score')),
         'delta': np.array(column('delta')),
         'weighted_delta': np.array(column('weighted_delta')),
         'mu1_years': np.array(column('mu1_years')),
         'mu2_years': np.array(column('mu2_years')),
         'pval': np.array(column('pval')),
         'pathway_size': np.array(column('pathway_size')),
     }
     print('Saving results to {}'.format(filename))
     savemat(filename, mdict, oned_as='column')
def export_timing_info_for_all_fits(data, fitter, fits):
    """Save the change-distribution timing info of all fits as a .mat file.

    Calls ``compute_timing_info_for_all_fits(data, fitter, fits)`` and stores
    the resulting fields, along with a README string describing them, in
    ``<cache_dir()>/<fit_results_relative_path(data, fitter)>-change-dist.mat``
    via ``save_matfile``.
    """
    dist = compute_timing_info_for_all_fits(data, fitter, fits)
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # Build the MATLAB variable table entry by entry, in the documented order.
    variables = {}
    variables['README_CHANGE_DISTRIBUTIONS'] = README
    variables['genes'] = list_of_strings_to_matlab_cell_array(dist.genes)
    variables['regions'] = list_of_strings_to_matlab_cell_array(dist.regions)
    variables['age_scaler'] = scalers.unify(dist.age_scaler).cache_name()
    # Numeric fields are copied over unchanged under their own names.
    for field in ('mu', 'std', 'bin_edges', 'bin_centers', 'weights'):
        variables[field] = getattr(dist, field)

    base_dir = cache_dir()
    mat_name = fit_results_relative_path(data, fitter) + '-change-dist.mat'
    save_matfile(variables, join(base_dir, mat_name))
Esempio n. 7
0
 def save_to_mat(self):
     """Dump the fields of every entry in ``self.res`` to a .mat file.

     String fields are converted with ``list_of_strings_to_matlab_cell_array``
     and the remaining fields with ``np.array``; the file is written by
     ``savemat`` under ``<cache_dir()>/both/`` with a name derived from
     ``self._filename_suffix``.
     """
     string_fields = ('pathway', 'r1', 'r2')
     array_fields = ('score', 'delta', 'weighted_delta', 'mu1_years',
                     'mu2_years', 'pval', 'pathway_size')

     out_root = cache_dir()
     mat_name = 'dprime-all-pathways-and-regions-{}.mat'.format(
         self._filename_suffix)
     filename = join(out_root, 'both', mat_name)

     mdict = {}
     for field in string_fields:
         values = [getattr(entry, field) for entry in self.res]
         mdict[field] = list_of_strings_to_matlab_cell_array(values)
     for field in array_fields:
         mdict[field] = np.array([getattr(entry, field) for entry in self.res])

     print('Saving results to {}'.format(filename))
     savemat(filename, mdict, oned_as='column')
Esempio n. 8
0
def export_pathways():
    """Write pathway / pathway-list membership tables to ``pathways.mat``.

    Gene indices refer to the gene order of the change-distribution object
    loaded from ``SingleRegion.change_dist_filename``. Pathways and pathway
    lists are read through ``pathway_lists``. The output, including a README
    string describing each variable, is saved with ``save_matfile`` to
    ``<results_dir()>/export/pathways.mat``.

    Raises:
        KeyError: if a pathway gene is missing from ``change_dist.genes`` or
            a list names a pathway that ``read_all_pathways`` did not return.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    # NOTE that matlab is one based
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}

    pathways = pathway_lists.read_all_pathways()
    # Materialise the key order once; all per-pathway outputs index into it.
    pathway_names = list(pathways.keys())

    # Use the same pre-allocate-and-fill pattern as the list_* arrays below.
    # np.array([...], dtype=object) would produce a 2-D array whenever every
    # pathway has the same number of genes, instead of one cell per pathway.
    pathway_genes_names = np.empty(len(pathway_names), dtype=object)
    pathway_genes_idx = np.empty(len(pathway_names), dtype=object)
    for i, p in enumerate(pathway_names):
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(
            pathways[p])
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in pathways[p]])

    # NOTE matlab indexing is one based
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}
    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(
            pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]
    README = """\
pathway_names:
Cell array of all pathway names. The name in cell number k is the name of the
pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
Cell array (size <n-pathways>). Each cell contains a cell array of strings which 
are the gene symbols of the genes in that pathway.

pathway_genes_idx:
Same as pathway_genes_names, but each cell in the outer cell array is now an 
array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat.
Hopefully this should be easier to use in matlab.

list_names:
Names of pathway lists prepared by Noa

list_pathway_names:
Cell array. One item per list. Each item is a cell array of strings which are 
the names of the pathways belonging to that list.

list_pathway_idx:
Same as list_pathway_names, but instead of listing the pathways by name, they 
are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
Esempio n. 9
0
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Save the fit results of every dataset in ``data`` to its own .mat file.

    For each dataset, the fits in ``fits[dataset.name]`` (keyed by
    ``(gene, region)``) are scattered into arrays indexed by the dataset's
    gene and region order, then written with ``savemat`` to
    ``<cache_dir()>/<fit_results_relative_path(dataset, fitter)>.mat``.
    Entries with no fit keep the fill value passed to ``init_array``.

    Args:
        data: Object whose ``datasets`` attribute is iterated; each dataset
            provides ``name``, ``gene_names``, ``region_names``,
            ``expression``, ``ages``, ``age_scaler`` and ``get_one_series``.
        fitter: Fitter whose ``shape`` is used to evaluate fitted curves.
        fits: Indexable by dataset name, yielding a mapping from
            ``(gene, region)`` to a fit object. When
            ``has_change_distributions`` is true it must also expose
            ``change_distribution_params.bin_centers``.
        has_change_distributions: Whether change-distribution weights are
            exported. When false, empty placeholders are saved and any
            weights attached to individual fits are ignored.
    """
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]

        # Parenthesised single-argument form prints identically on Python 2 and 3.
        print('Saving mat file to {}'.format(filename))
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        # np.nan is the spelling available in every NumPy release
        # (the np.NaN alias was removed in NumPy 2.0).
        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.nan, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.nan

        fit_scores = init_array(np.nan, n_genes, n_regions)
        LOO_scores = init_array(np.nan, n_genes, n_regions)
        fit_predictions = init_array(np.nan, *dataset.expression.shape)
        LOO_predictions = init_array(np.nan, *dataset.expression.shape)
        high_res_predictions = init_array(np.nan, cfg.n_curve_points_to_plot, n_genes, n_regions)
        # Curves are evaluated on a dense grid in scaled age; the grid is
        # exported in unscaled ages.
        scaled_high_res_ages = np.linspace(dataset.ages.min(), dataset.ages.max(), cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(dataset.age_scaler).unscale(scaled_high_res_ages)
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.nan, n_bins, n_genes, n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []

        # items() is available on both Python 2 and Python 3 mappings.
        for (g, r), fit in dataset_fits.items():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig, ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig, ir] = shape.f(fit.theta, scaled_high_res_ages)
            change_weights = getattr(fit, 'change_distribution_weights', None)
            # Without the flag the target is a plain empty list, which cannot
            # be indexed with [:, ig, ir]; only store weights when an array
            # was allocated for them.
            if has_change_distributions and change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights
        mdict = dict(
            gene_names = list_of_strings_to_matlab_cell_array(gene_names),
            region_names = list_of_strings_to_matlab_cell_array(region_names),
            theta = theta,
            fit_scores = fit_scores,
            LOO_scores = LOO_scores,
            fit_predictions = fit_predictions,
            LOO_predictions = LOO_predictions,
            high_res_predictions = high_res_predictions,
            high_res_ages = original_high_res_ages,
            change_distribution_bin_centers = change_distribution_bin_centers,
            change_distribution_weights = change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')
Esempio n. 10
0
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Write one .mat file of fit results per dataset in ``data.datasets``.

    The fits of a dataset (``fits[dataset.name]``, keyed by
    ``(gene, region)``) are placed into arrays laid out by the dataset's gene
    and region order and saved with ``savemat`` to
    ``<cache_dir()>/<fit_results_relative_path(dataset, fitter)>.mat``.
    Positions without a fit keep the fill value given to ``init_array``.

    Args:
        data: Object exposing ``datasets``; each dataset provides ``name``,
            ``gene_names``, ``region_names``, ``expression``, ``ages``,
            ``age_scaler`` and ``get_one_series``.
        fitter: Fitter whose ``shape`` evaluates the fitted curves.
        fits: Indexable by dataset name, giving a mapping from
            ``(gene, region)`` to a fit object; must expose
            ``change_distribution_params.bin_centers`` when
            ``has_change_distributions`` is true.
        has_change_distributions: If true, change-distribution weights are
            exported; otherwise empty placeholders are saved and per-fit
            weights are skipped.
    """
    for dataset in data.datasets:
        filename = join(cache_dir(),
                        fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]

        # Single-argument call form: same output on Python 2 and Python 3.
        print('Saving mat file to {}'.format(filename))
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.nan, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.nan

        fit_scores = init_array(np.nan, n_genes, n_regions)
        LOO_scores = init_array(np.nan, n_genes, n_regions)
        fit_predictions = init_array(np.nan, *dataset.expression.shape)
        LOO_predictions = init_array(np.nan, *dataset.expression.shape)
        high_res_predictions = init_array(np.nan, cfg.n_curve_points_to_plot,
                                          n_genes, n_regions)
        # Dense grid over the scaled age range; exported after unscaling.
        scaled_high_res_ages = np.linspace(dataset.ages.min(),
                                           dataset.ages.max(),
                                           cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(
            dataset.age_scaler).unscale(scaled_high_res_ages)
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.nan, n_bins, n_genes,
                                                     n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []

        # items() exists on both Python 2 and Python 3 mappings.
        for (g, r), fit in dataset_fits.items():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig,
                                ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig,
                                ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig, ir] = shape.f(
                    fit.theta, scaled_high_res_ages)
            change_weights = getattr(fit, 'change_distribution_weights', None)
            # The weights array only exists when the flag is set; in the
            # other branch the placeholder is a list and slicing it with
            # [:, ig, ir] would raise TypeError.
            if has_change_distributions and change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights
        mdict = dict(
            gene_names=list_of_strings_to_matlab_cell_array(gene_names),
            region_names=list_of_strings_to_matlab_cell_array(region_names),
            theta=theta,
            fit_scores=fit_scores,
            LOO_scores=LOO_scores,
            fit_predictions=fit_predictions,
            LOO_predictions=LOO_predictions,
            high_res_predictions=high_res_predictions,
            high_res_ages=original_high_res_ages,
            change_distribution_bin_centers=change_distribution_bin_centers,
            change_distribution_weights=change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')