def regress(self): """ Execute StARS :return: list Returns a list of regression results that base_regression's pileup_data can process """ if MPControl.is_dask(): from inferelator.distributed.dask_functions import lasso_stars_regress_dask return lasso_stars_regress_dask(self.X, self.Y, self.alphas, self.num_subsamples, self.random_seed, self.method, self.params, self.G, self.genes) def regression_maker(j): level = 0 if j % 100 == 0 else 2 utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G), level=level) data = stars_model_select(self.X.values, utils.scale_vector(self.Y.get_gene_data(j, force_dense=True, flatten=True)), self.alphas, method=self.method, num_subsamples=self.num_subsamples, random_seed=self.random_seed, **self.params) data['ind'] = j return data return MPControl.map(regression_maker, range(self.G), tell_children=False)
def regress(self): """ Execute Elastic Net :return: list Returns a list of regression results that base_regression's pileup_data can process """ if MPControl.is_dask(): from inferelator.distributed.dask_functions import sklearn_regress_dask return sklearn_regress_dask(self.X, self.Y, self.model, self.G, self.genes, self.min_coef) def regression_maker(j): level = 0 if j % 100 == 0 else 2 utils.Debug.allprint(base_regression.PROGRESS_STR.format( gn=self.genes[j], i=j, total=self.G), level=level) data = sklearn_gene(self.X.values, utils.scale_vector( self.Y.get_gene_data(j, force_dense=True, flatten=True)), copy.copy(self.model), min_coef=self.min_coef) data['ind'] = j return data return MPControl.map(regression_maker, range(self.G), tell_children=False)
def regress(self): """ Execute BBSR :return: pd.DataFrame [G x K], pd.DataFrame [G x K] Returns the regression betas and beta error reductions for all threads if this is the master thread (rank 0) Returns None, None if it's a subordinate thread """ if MPControl.is_dask(): from inferelator.distributed.dask_functions import bbsr_regress_dask return bbsr_regress_dask(self.X, self.Y, self.pp, self.weights_mat, self.G, self.genes, self.nS) def regression_maker(j): level = 0 if j % 100 == 0 else 2 utils.Debug.allprint(base_regression.PROGRESS_STR.format( gn=self.genes[j], i=j, total=self.G), level=level) data = bayes_stats.bbsr( self.X.values, utils.scale_vector( self.Y.get_gene_data(j, force_dense=True, flatten=True)), self.pp.iloc[j, :].values.flatten(), self.weights_mat.iloc[j, :].values.flatten(), self.nS, ordinary_least_squares=self.ols_only) data['ind'] = j return data return MPControl.map(regression_maker, range(self.G), tell_children=False)
def regress(self): """ Execute Elastic Net :return: list Returns a list of regression results that base_regression's pileup_data can process """ if MPControl.client.name() == "dask": from inferelator.distributed.dask_functions import elasticnet_regress_dask return elasticnet_regress_dask(self.X, self.Y, self.params, self.G, self.genes) def regression_maker(j): level = 0 if j % 100 == 0 else 2 utils.Debug.allprint(base_regression.PROGRESS_STR.format( gn=self.genes[j], i=j, total=self.G), level=level) data = elastic_net(self.X.values, self.Y.iloc[j, :].values, self.params) data['ind'] = j return data return MPControl.map(regression_maker, range(self.G), tell_children=False)
def _sim_ints(prob_dist, n_per_row, sparse=False, random_seed=42):

    if not np.isclose(np.sum(prob_dist), 1.):
        raise ValueError("Probability distribution does not sum to 1")

    ncols = len(prob_dist)

    def _sim_rows(n_vec, seed):
        row_data = np.zeros((len(n_vec), ncols), dtype=np.int32)

        rng = np.random.default_rng(seed=seed)
        col_ids = np.arange(ncols)

        for i, n in enumerate(n_vec):
            row_data[i, :] = np.bincount(rng.choice(col_ids, size=n, p=prob_dist),
                                         minlength=ncols)

        return _sparse.csr_matrix(row_data) if sparse else row_data

    ss = np.random.SeedSequence(random_seed)

    sim_data = MPControl.map(_sim_rows, _row_gen(n_per_row), _ss_gen(ss))

    return _sparse.vstack(sim_data) if sparse else np.vstack(sim_data)
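# The generators _row_gen and _ss_gen are defined elsewhere in this module.
# A minimal sketch of what they plausibly look like, assuming chunked batching
# and NumPy SeedSequence spawning (the chunksize and exact signatures here are
# assumptions, not the library's actual code):
def _row_gen(n_per_row, chunksize=2000):
    # Yield the per-row count vector in fixed-size chunks so each
    # MPControl.map task simulates a block of rows
    for i in range(0, len(n_per_row), chunksize):
        yield n_per_row[i:i + chunksize]


def _ss_gen(seed_sequence):
    # Yield a stream of independent child seeds from one SeedSequence so
    # every worker gets a reproducible, non-overlapping RNG state; zip
    # semantics in map() truncate this to the number of row chunks
    while True:
        yield seed_sequence.spawn(1)[0]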
def _sim_float(gene_centers, gene_sds, nrows, random_seed=42):

    ncols = len(gene_centers)
    assert ncols == len(gene_sds)

    def _sim_cols(cents, sds, seed):
        rng = np.random.default_rng(seed=seed)
        return rng.normal(loc=cents, scale=sds, size=(nrows, len(cents)))

    ss = np.random.SeedSequence(random_seed)

    return np.hstack(MPControl.map(_sim_cols,
                                   _col_gen(gene_centers),
                                   _col_gen(gene_sds),
                                   _ss_gen(ss)))
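# _col_gen is the column-wise counterpart of _row_gen above; a plausible
# sketch under the same chunking assumption (chunksize is illustrative):
def _col_gen(values, chunksize=200):
    # Yield column parameters (means or SDs) in fixed-size chunks; each
    # chunk becomes one normal-sampling task in MPControl.map
    for i in range(0, len(values), chunksize):
        yield values[i:i + chunksize]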
def regress(self): """ Execute multitask (AMUSR) :return: list Returns a list of regression results that the amusr_regression pileup_data can process """ if MPControl.client.name() == "dask": from inferelator.distributed.dask_functions import amusr_regress_dask return amusr_regress_dask( self.X, self.Y, self.priors, self.prior_weight, self.n_tasks, self.genes, self.tfs, self.G, remove_autoregulation=self.remove_autoregulation) def regression_maker(j): level = 0 if j % 100 == 0 else 2 utils.Debug.allprint(base_regression.PROGRESS_STR.format( gn=self.genes[j], i=j, total=self.G), level=level) gene = self.genes[j] x, y, tasks = [], [], [] if self.remove_autoregulation: tfs = [t for t in self.tfs if t != gene] else: tfs = self.tfs for k in range(self.n_tasks): if gene in self.Y[k]: x.append(self.X[k].loc[:, tfs].values) # list([N, K]) y.append(self.Y[k].loc[:, gene].values.reshape( -1, 1)) # list([N, 1]) tasks.append(k) # [T,] prior = format_prior(self.priors, gene, tasks, self.prior_weight) return run_regression_EBIC(x, y, tfs, tasks, gene, prior) return MPControl.map(regression_maker, range(self.G))
def build_mi_array(X, Y, bins, logtype=DEFAULT_LOG_TYPE, temp_dir=None):
    """
    Calculate MI into an array

    :param X: np.ndarray (n x m1)
        Discrete array of bins
    :param Y: np.ndarray (n x m2)
        Discrete array of bins
    :param bins: int
        The total number of bins that were used to make the arrays discrete
    :param logtype: np.log func
        Which log function to use (log2 gives bits, ln gives nats)
    :param temp_dir: path
        Path to write temp files for multiprocessing
    :return mi: np.ndarray (m1 x m2)
        Returns the mutual information array
    """

    m1, m2 = X.shape[1], Y.shape[1]

    # Define the function which calculates MI for each variable in X
    # against every variable in Y
    def mi_make(i):
        level = 2 if i % 1000 == 0 else 3
        Debug.allprint("Mutual Information Calculation [{i} / {total}]".format(i=i, total=m1),
                       level=level)

        discrete_X = _make_discrete(X[:, i].A.flatten() if sps.isspmatrix(X) else X[:, i].flatten(),
                                    bins)

        return [_calc_mi(_make_table(discrete_X, Y[:, j], bins), logtype=logtype)
                for j in range(m2)]

    # Send the MI build to the multiprocessing controller
    mi_list = MPControl.map(mi_make, range(m1), tmp_file_path=temp_dir)

    # Convert the list of lists to an array
    mi = np.array(mi_list)

    assert (m1, m2) == mi.shape, "Array {sh} produced [({m1}, {m2}) expected]".format(sh=mi.shape,
                                                                                      m1=m1,
                                                                                      m2=m2)

    return mi
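# For reference, _calc_mi reduces a joint contingency table to a mutual
# information value. A minimal sketch of that computation, using the standard
# formula I(X;Y) = sum_xy p(x,y) * log(p(x,y) / (p(x) * p(y))); the name
# _calc_mi_sketch and the exact normalization are illustrative, not the
# library's implementation:
import numpy as np

def _calc_mi_sketch(table, logtype=np.log2):
    # table: (bins x bins) array of joint counts; assumed non-empty
    total = table.sum()
    pxy = table / total                    # joint distribution p(x,y)
    px = pxy.sum(axis=1, keepdims=True)    # marginal p(x), shape (bins, 1)
    py = pxy.sum(axis=0, keepdims=True)    # marginal p(y), shape (1, bins)
    nonzero = pxy > 0                      # 0 * log(0) is taken as 0
    return np.sum(pxy[nonzero] * logtype(pxy[nonzero] / (px @ py)[nonzero]))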
def regress(self, regression_function=None):
    """
    Execute multitask (AMUSR)

    :return: list
        Returns a list of regression results that the amusr_regression
        pileup_data can process
    """

    regression_function = self.regression_function if regression_function is None else regression_function

    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import amusr_regress_dask
        return amusr_regress_dask(self.X, self.Y, self.priors,
                                  self.prior_weight, self.n_tasks, self.genes,
                                  self.tfs, self.G,
                                  remove_autoregulation=self.remove_autoregulation,
                                  regression_function=regression_function,
                                  tol=self.tol, rel_tol=self.rel_tol,
                                  use_numba=self.use_numba)

    def regression_maker(j):
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=self.genes[j],
                                                                 i=j,
                                                                 total=self.G),
                             level=level)

        gene = self.genes[j]
        x, y, tasks = [], [], []

        if self.remove_autoregulation:
            tfs = [t for t in self.tfs if t != gene]
        else:
            tfs = self.tfs

        for k in range(self.n_tasks):
            if gene in self.Y[k].gene_names:
                x.append(self.X[k].get_gene_data(tfs))  # list([N, K])
                y.append(self.Y[k].get_gene_data(gene, force_dense=True).reshape(-1, 1))  # list([N, 1])
                tasks.append(k)  # [T,]

        prior = format_prior(self.priors, gene, tasks, self.prior_weight,
                             tfs=tfs)

        return regression_function(x, y, tfs, tasks, gene, prior,
                                   Cs=self.Cs, Ss=self.Ss,
                                   lambda_Bs=self.lambda_Bs,
                                   lambda_Ss=self.lambda_Ss,
                                   tol=self.tol, rel_tol=self.rel_tol,
                                   use_numba=self.use_numba)

    return MPControl.map(regression_maker, range(self.G))
def test_local_map(self):
    test_result = MPControl.map(math_function, *self.map_test_data)
    self.assertListEqual(test_result, self.map_test_expect)
def test_map(self):
    with self.assertRaises(RuntimeError):
        MPControl.map(math_function, *self.map_test_data)
def test_dask_cluster_map(self):
    with self.assertRaises(NotImplementedError):
        MPControl.map(math_function, *self.map_test_data)
def test_kvs_map_by_file(self):
    test_result = MPControl.map(math_function, *self.map_test_data,
                                tell_children=True,
                                tmp_file_path=self.temp_dir)
    self.assertListEqual(test_result, self.map_test_expect)
def test_kvs_map_distribute(self):
    test_result = MPControl.map(math_function, *self.map_test_data,
                                tell_children=True)
    self.assertListEqual(test_result, self.map_test_expect)
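# The fixtures math_function, map_test_data, and map_test_expect are defined
# elsewhere in the test suite. A minimal sketch of what they could look like;
# the exact function and data below are assumptions, chosen only to make the
# zip-and-apply behavior of MPControl.map checkable:
def math_function(x, y, z):
    # Any pure function of the zipped arguments works for these tests
    return x + y * z


map_test_data = [list(range(10)),
                 list(range(10, 20)),
                 list(range(20, 30))]

# Expected output: math_function applied elementwise across the zipped data
map_test_expect = [math_function(*args) for args in zip(*map_test_data)]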