def _build_pp_matrix(self):
    """
    From priors and context likelihood of relatedness, determine which predictors should be included in the model

    :return pp: pd.DataFrame [G x K]
        Boolean matrix indicating which predictor variables should be included in BBSR for each response variable
    """

    # Create a predictor boolean array from priors
    pp = np.logical_or(self.prior_mat != 0, self.weights_mat != self.no_prior_weight)

    pp_idx = pp.index
    pp_col = pp.columns

    if self.filter_priors_for_clr:
        # Set priors which have a CLR of 0 to FALSE
        pp = np.logical_and(pp, self.clr_mat != 0).values
    else:
        pp = pp.values

    # Mark the nS predictors with the highest CLR true (do not include anything with a CLR of 0)
    mask = np.logical_or(self.clr_mat == 0, ~np.isfinite(self.clr_mat)).values
    masked_clr = np.ma.array(self.clr_mat.values, mask=mask)
    for i in range(self.G):
        n_to_keep = min(self.nS, self.K, mask.shape[1] - np.sum(mask[i, :]))
        if n_to_keep == 0:
            continue
        clrs = np.ma.argsort(masked_clr[i, :], endwith=False)[-1 * n_to_keep:]
        pp[i, clrs] = True

    # Rebuild into a DataFrame and set autoregulation to 0
    pp = pd.DataFrame(pp, index=pp_idx, columns=pp_col, dtype=np.dtype(bool))
    pp = utils.df_set_diag(pp, False)

    return pp
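# A minimal, self-contained sketch (toy data, hypothetical value for nS) of the
# top-CLR selection above: zero and non-finite CLR entries are masked out, then
# the highest remaining scores in each row are marked True in the predictor matrix.
import numpy as np

clr = np.array([[0.0, 2.5, 1.0, np.nan],
                [3.0, 0.0, 0.5, 4.0]])
n_s = 2

mask = np.logical_or(clr == 0, ~np.isfinite(clr))
masked_clr = np.ma.array(clr, mask=mask)

pp = np.zeros(clr.shape, dtype=bool)
for i in range(clr.shape[0]):
    n_to_keep = min(n_s, mask.shape[1] - np.sum(mask[i, :]))
    if n_to_keep == 0:
        continue
    # endwith=False sorts masked entries first, so the tail slice keeps only real scores
    top = np.ma.argsort(masked_clr[i, :], endwith=False)[-n_to_keep:]
    pp[i, top] = True

print(pp)
# [[False  True  True False]
#  [ True False False  True]]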
def test_dataframe_set_diag(self):

    def check_diag(df, val):
        for match in df.index.intersection(df.columns):
            if df.loc[match, match] != val:
                return False
        return True

    data_frame = pd.DataFrame(np.ones((20, 10)), index=list(range(20)), columns=list(range(10)))

    data_frame2 = utils.df_set_diag(data_frame, 0, copy=True)
    self.assertTrue(check_diag(data_frame2, 0))
    self.assertEqual(data_frame2.sum().sum(), 190)
    self.assertTrue(check_diag(data_frame, 1))
    self.assertEqual(data_frame.sum().sum(), 200)

    utils.df_set_diag(data_frame, 0, copy=False)
    self.assertTrue(check_diag(data_frame, 0))
    self.assertEqual(data_frame.sum().sum(), 190)
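# A short sketch (an assumption inferred from the test above, not the package's
# implementation) of the behavior `utils.df_set_diag` is expected to have: set
# every cell whose label appears in both the index and the columns to `val`,
# either on a copy or in place.
import pandas as pd


def df_set_diag_sketch(df, val, copy=True):
    # Operate on a copy unless the caller asks for in-place modification
    if copy:
        df = df.copy()
    for label in df.index.intersection(df.columns):
        df.loc[label, label] = val
    return df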
def run(self, x_df, y_df, bins=None, logtype=DEFAULT_LOG_TYPE):
    """
    Wrapper to calculate the CLR and MI for two data sets that have common condition columns

    :param x_df: pd.DataFrame
    :param y_df: pd.DataFrame
    :param bins: int
        Number of bins for discretizing continuous variables
    :param logtype: np.log func
    :return clr, mi: pd.DataFrame, pd.DataFrame
        CLR and MI DataFrames
    """

    assert check.argument_integer(bins, allow_none=True)
    assert check.indexes_align((x_df.columns, y_df.columns))
    assert x_df.shape[0] > 0
    assert x_df.shape[1] > 0
    assert y_df.shape[0] > 0
    assert y_df.shape[1] > 0

    if bins is not None:
        self.bins = bins

    mi = mutual_information(y_df, x_df, self.bins, temp_dir=self.temp_dir, logtype=logtype)
    mi_bg = mutual_information(x_df, x_df, self.bins, temp_dir=self.temp_dir, logtype=logtype)
    clr = calc_mixed_clr(utils.df_set_diag(mi, 0), utils.df_set_diag(mi_bg, 0))

    MPControl.sync_processes(pref=SYNC_CLR_KEY)

    return clr, mi
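# Illustrative only: a self-contained sketch of mutual information between two
# continuous vectors after equal-width binning, mirroring the idea behind
# `mutual_information` above (discretize, then score the joint histogram).
# `toy_mutual_information` is a hypothetical helper, not this module's function.
import numpy as np


def toy_mutual_information(x, y, bins=10, logtype=np.log):
    # Joint histogram over equal-width bins for each variable
    joint, _, _ = np.histogram2d(x, y, bins=bins)
    p_xy = joint / joint.sum()
    p_x = p_xy.sum(axis=1, keepdims=True)
    p_y = p_xy.sum(axis=0, keepdims=True)
    # Sum p(x,y) * log(p(x,y) / (p(x) * p(y))) over non-empty cells
    nonzero = p_xy > 0
    return np.sum(p_xy[nonzero] * logtype(p_xy[nonzero] / (p_x @ p_y)[nonzero]))


rng = np.random.default_rng(0)
a = rng.normal(size=500)
b = a + rng.normal(scale=0.5, size=500)   # dependent on a
c = rng.normal(size=500)                  # independent of a

print(toy_mutual_information(a, b))       # substantially greater than 0
print(toy_mutual_information(a, c))       # close to 0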
def _check_prior(self, prior, expression_data, keep_self=False):

    if not keep_self:
        prior = utils.df_set_diag(prior, 0)

    activity_tfs, expr_tfs, drop_tfs = self._determine_tf_status(prior, expression_data)

    if len(drop_tfs) > 0:
        msg = "{n} TFs are removed from activity (no expression or prior exists)".format(n=len(drop_tfs))
        utils.Debug.vprint(msg, level=0)
        utils.Debug.vprint(" ".join(drop_tfs), level=1)
        prior = prior.drop(drop_tfs, axis=1)

    return prior, activity_tfs, expr_tfs
def compute_transcription_factor_activity(prior, expression_data, expression_data_halftau=None, keep_self=False):
    """
    Calculate TFA from a prior and expression data object

    :param prior: pd.DataFrame [G x K]
    :param expression_data: InferelatorData [N x G]
    :param expression_data_halftau: InferelatorData [N x G]
    :param keep_self: bool
    :return: InferelatorData [N x K]
    """

    if not keep_self:
        prior = utils.df_set_diag(prior, 0)

    activity_tfs, expr_tfs, drop_tfs = TFA._determine_tf_status(prior, expression_data)

    if len(drop_tfs) > 0:
        msg = "{n} TFs are removed from activity (no expression or prior exists)".format(n=len(drop_tfs))
        utils.Debug.vprint(msg, level=0)
        utils.Debug.vprint(" ".join(drop_tfs), level=1)
        prior = prior.drop(drop_tfs, axis=1)

    activity = np.zeros((expression_data.shape[0], prior.shape[1]), dtype=np.float64)

    if len(activity_tfs) > 0:
        a_cols = prior.columns.isin(activity_tfs)
        expr = expression_data_halftau if expression_data_halftau is not None else expression_data
        activity[:, a_cols] = TFA._calculate_activity(prior.loc[:, activity_tfs].values, expr)

    if len(expr_tfs) > 0:
        activity[:, prior.columns.isin(expr_tfs)] = expression_data.get_gene_data(expr_tfs, force_dense=True)

    return utils.InferelatorData(activity,
                                 gene_names=prior.columns,
                                 sample_names=expression_data.sample_names,
                                 meta_data=expression_data.meta_data)
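# Hedged sketch of what `TFA._calculate_activity` does for the activity TFs:
# Inferelator-style TFA is typically the expression matrix mapped through the
# pseudoinverse of the prior (activity ~= expression @ pinv(prior).T). This toy
# example demonstrates that shape logic only; it is not necessarily the exact
# implementation used by this module.
import numpy as np

rng = np.random.default_rng(1)
n_samples, n_genes, n_tfs = 10, 6, 3

prior = rng.integers(-1, 2, size=(n_genes, n_tfs)).astype(float)  # [G x K] signed prior
expression = rng.normal(size=(n_samples, n_genes))                # [N x G]

activity = expression @ np.linalg.pinv(prior).T                   # [N x K]
print(activity.shape)  # (10, 3)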