def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations, correlations_k_of_n, allow_new_computation):
    """Compute (or load from cache via job_splitting) the basic fit for every
    (gene, region) pair in `dataset`, then optionally add correlation fits and
    fit scores.

    Parameters:
        data -- the full data object; only its `gene_names` are used here, as
            the global sharding-key universe (see comment below).
        dataset -- provides `gene_names`, `region_names` and `get_one_series`.
        fitter -- passed through to `_compute_fit` for each series.
        k_of_n -- shard selector forwarded to `job_splitting.compute`; must be
            None if correlation fits are requested (asserted below).
        n_correlation_iterations -- if > 0, correlation fits are also computed.
        correlations_k_of_n -- shard selector for the correlation computation.
        allow_new_computation -- forwarded to `job_splitting.compute`; presumably
            gates whether missing results may be computed vs. loaded only.

    Returns:
        dataset_fits -- dict keyed by (gene, region), as produced by
            `job_splitting.compute` and augmented by `_add_scores`.
    """
    def arg_mapper(gr, f_proxy):
        # Translate a (gene, region) key into the actual call on the proxy.
        g, r = gr
        series = dataset.get_one_series(g, r)
        return f_proxy(series, fitter)

    # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard
    # this also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name = 'fits',
        f = _compute_fit,
        arg_mapper = arg_mapper,
        all_keys = list(product(dataset.gene_names, dataset.region_names)),
        all_sharding_keys = data.gene_names,
        f_sharding_key = lambda gr: gr[0],  # shard by gene only
        k_of_n = k_of_n,
        base_filename = fit_results_relative_path(dataset, fitter),
        allow_new_computation = allow_new_computation,
    )

    if n_correlation_iterations > 0:
        # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region
        # which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level"
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits, n_correlation_iterations, correlations_k_of_n, allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores... ',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'

    return dataset_fits
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations, k_of_n, allow_new_computation):
    """Compute correlation fits per region (plus leave-one-out variants) and
    merge the results into `ds_fits`.

    Keys are (ir, loo_point) where `ir` indexes `dataset.region_names` and
    `loo_point` is either None (fit on all points) or an (ix, iy) pair naming
    the left-out sample: ix indexes `series.ages`, iy indexes
    `dataset.gene_names`.

    Parameters:
        dataset -- provides `gene_names`, `region_names`, `get_several_series`.
        fitter -- forwarded to `_compute_fit_with_correlations`.
        ds_fits -- basic fits dict keyed by (gene, region); each value must
            already have a `.theta` (hence the no-sharding assert upstream).
            Updated in place via the results-dictionary helper at the end.
        n_iterations -- iteration count, also baked into the cache filename so
            different iteration counts don't collide.
        k_of_n -- shard selector for this (correlation) stage.
        allow_new_computation -- forwarded to `job_splitting.compute`.
    """
    def arg_mapper(key, f_proxy):
        # Translate an (ir, loo_point) key into the proxy call, gathering the
        # basic thetas for all genes in the region as the starting point.
        ir, loo_point = key
        r = dataset.region_names[ir]
        series = dataset.get_several_series(dataset.gene_names, r)
        basic_theta = [ds_fits[(g, r)].theta for g in dataset.gene_names]
        return f_proxy(series, fitter, basic_theta, loo_point, n_iterations)

    # Build the full key list: one "no leave-out" key per region, plus one key
    # per (region, gene, age-point) combination.
    all_keys = []
    for ir, r in enumerate(dataset.region_names):
        all_keys.append((ir, None))
        series = dataset.get_several_series(dataset.gene_names, r)
        for iy, g in enumerate(dataset.gene_names):
            for ix in xrange(len(series.ages)):
                loo_point = (ix, iy)
                all_keys.append((ir, loo_point))

    def f_sharding_key(key):
        # keep all x points in the same shard for same r,iy
        # (note: `r` here is actually the region *index* from the key)
        r, loo_point = key
        if loo_point is None:
            return (r, None)
        else:
            ix, iy = loo_point
            return (r, iy)

    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter) + '-correlations-{}'.format(n_iterations),
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(
        dataset, ds_fits, dct_results)
def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations, correlations_k_of_n, allow_new_computation): def arg_mapper(gr, f_proxy): g, r = gr series = dataset.get_one_series(g, r) return f_proxy(series, fitter) # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard # this also requires that the list of all genes be taken from the whole data # and not from each dataset. Otherwise we can get a mismatch between the genes # in the shard for different datasets. dataset_fits = job_splitting.compute( name='fits', f=_compute_fit, arg_mapper=arg_mapper, all_keys=list(product(dataset.gene_names, dataset.region_names)), all_sharding_keys=data.gene_names, f_sharding_key=lambda gr: gr[0], k_of_n=k_of_n, base_filename=fit_results_relative_path(dataset, fitter), allow_new_computation=allow_new_computation, ) if n_correlation_iterations > 0: # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region # which is necessary for computing correlations in that region. assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level" _add_dataset_correlation_fits(dataset, fitter, dataset_fits, n_correlation_iterations, correlations_k_of_n, allow_new_computation) if cfg.verbosity > 0: print 'Adding fit scores... ', _add_scores(dataset, dataset_fits) if cfg.verbosity > 0: print 'done!' return dataset_fits
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations, k_of_n, allow_new_computation):
    """Compute per-region correlation fits (including every leave-one-out
    variant) via job_splitting and fold the results back into `ds_fits`.

    A work key is (region_index, loo_point): loo_point is None for the fit on
    all points, or (ix, iy) naming the left-out sample where ix indexes the
    series ages and iy indexes `dataset.gene_names`. The basic thetas in
    `ds_fits` seed each correlation fit.
    """
    gene_names = dataset.gene_names

    def _key_to_args(key, f_proxy):
        # Resolve the region index to its multi-gene series and collect the
        # basic-fit thetas that seed the correlation computation.
        region_idx, loo_point = key
        region = dataset.region_names[region_idx]
        multi_series = dataset.get_several_series(gene_names, region)
        seed_thetas = [ds_fits[(g, region)].theta for g in gene_names]
        return f_proxy(multi_series, fitter, seed_thetas, loo_point, n_iterations)

    # One no-leave-out key per region, then one key per (gene, age) sample.
    all_keys = []
    for region_idx, region in enumerate(dataset.region_names):
        all_keys.append((region_idx, None))
        n_ages = len(dataset.get_several_series(gene_names, region).ages)
        for gene_idx in xrange(len(gene_names)):
            for age_idx in xrange(n_ages):
                all_keys.append((region_idx, (age_idx, gene_idx)))

    def _shard_of(key):
        # Keep all left-out x points for the same (region, gene) in one shard.
        region_idx, loo_point = key
        if loo_point is None:
            return (region_idx, None)
        return (region_idx, loo_point[1])

    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=_key_to_args,
        all_keys=all_keys,
        f_sharding_key=_shard_of,
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter) + '-correlations-{}'.format(n_iterations),
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(dataset, ds_fits, dct_results)