def _do_tests(self, dec_reg, hit_reg, iter):
    """
    Update the decision register using a two-step multiple testing correction.

    Parameters
    ----------
    - dec_reg : array
        Decision register. Entries >= 0 take part in the tests; entries equal
        to 0 may be changed to 1 (accept) or -1 (reject) in place.
    - hit_reg : array
        Hit counts, indexed the same way as dec_reg.
    - iter : int
        Number of trials passed to the binomial distribution; it is also the
        divisor of the Bonferroni threshold.

    Returns
    -------
    - dec_reg : the same array object, updated in place.
    """
    # only features that have not been rejected so far are tested
    candidates = np.nonzero(dec_reg >= 0)[0]
    hit_counts = hit_reg[candidates]

    # uncorrected binomial p-values with success probability .5:
    # sf(k - 1) is P(X >= k), cdf(k) is P(X <= k)
    pvals_accept = sp.stats.binom.sf(hit_counts - 1, iter, .5).flatten()
    pvals_reject = sp.stats.binom.cdf(hit_counts, iter, .5).flatten()

    # step one: FDR correction across the features tested in this round
    fdr_accept = multicor(pvals_accept, alpha=self.multi_alpha,
                          method='fdr_bh')[0]
    fdr_reject = multicor(pvals_reject, alpha=self.multi_alpha,
                          method='fdr_bh')[0]

    # step two: Bonferroni correction for testing the same feature repeatedly
    per_iter_alpha = self.multi_alpha / float(iter)
    bonf_accept = pvals_accept <= per_iter_alpha
    bonf_reject = pvals_reject <= per_iter_alpha

    # a decision is taken only if both corrections agree and the feature is
    # still undecided (0); both index sets are computed before dec_reg changes
    undecided = dec_reg[candidates] == 0
    newly_accepted = np.nonzero(
        np.logical_and(undecided, np.logical_and(fdr_accept, bonf_accept)))[0]
    newly_rejected = np.nonzero(
        np.logical_and(undecided, np.logical_and(fdr_reject, bonf_reject)))[0]

    # write the decisions back, accept first and reject second
    dec_reg[candidates[newly_accepted]] = 1
    dec_reg[candidates[newly_rejected]] = -1
    return dec_reg
def _do_tests(self, dec_reg, hit_reg, iter):
    """
    Update the decision register from multiple-testing corrected p-values.

    Parameters
    ----------
    - dec_reg : array
        Decision register. Entries equal to 0 may be changed to 1 (accept) or
        -1 (reject) in place; other entries are left untouched.
    - hit_reg : array
        Hit counts, indexed the same way as dec_reg.
    - iter : int
        Number of trials passed to the binomial distribution.

    Returns
    -------
    - dec_reg : the same array object, updated in place.
    """
    # uncorrected binomial p-values with success probability 0.5
    pvals_accept = sp.stats.binom.sf(hit_reg - 1, iter, 0.5).flatten()
    pvals_reject = sp.stats.binom.cdf(hit_reg, iter, 0.5).flatten()

    # correct for multiple testing, keep the indices flagged by the correction
    accept_flags = multicor(pvals_accept, alpha=self.multi_alpha,
                            method=self.multi_corr_method)[0]
    accept_idx = np.nonzero(accept_flags)[0]
    reject_flags = multicor(pvals_reject, alpha=self.multi_alpha,
                            method=self.multi_corr_method)[0]
    reject_idx = np.nonzero(reject_flags)[0]

    # only undecided (0) entries are overwritten; rejection is evaluated after
    # acceptance, so a feature accepted here cannot be rejected in this call
    still_open = dec_reg[accept_idx] == 0
    dec_reg[accept_idx[still_open]] = 1
    still_open = dec_reg[reject_idx] == 0
    dec_reg[reject_idx[still_open]] = -1
    return dec_reg
def _do_tests(self, dec_reg, hit_reg, iter):
    """
    Accept or reject undecided features based on corrected binomial tests.

    Parameters
    ----------
    - dec_reg : array
        Decision register. Entries equal to 0 may be set to 1 (accept) or
        -1 (reject) in place; other entries are never modified.
    - hit_reg : array
        Hit counts, indexed the same way as dec_reg.
    - iter : int
        Number of trials passed to the binomial distribution.

    Returns
    -------
    - dec_reg : the same array object, updated in place.
    """
    # uncorrected p-values: sf(k - 1) is P(X >= k), cdf(k) is P(X <= k)
    upper_tail = sp.stats.binom.sf(hit_reg - 1, iter, .5).flatten()
    lower_tail = sp.stats.binom.cdf(hit_reg, iter, .5).flatten()

    # multiple testing correction with the configured method and alpha
    accepted = np.nonzero(multicor(upper_tail, alpha=self.multi_alpha,
                                   method=self.multi_corr_method)[0])[0]
    rejected = np.nonzero(multicor(lower_tail, alpha=self.multi_alpha,
                                   method=self.multi_corr_method)[0])[0]

    # apply the verdicts in order (accept, then reject); each one only
    # touches entries that are still 0 at the time it is applied
    for selected, verdict in ((accepted, 1), (rejected, -1)):
        undecided = selected[dec_reg[selected] == 0]
        dec_reg[undecided] = verdict
    return dec_reg
def check_pvals(p_vals, params):
    """
    Corrects for multiple testing using the user specified method and alpha
    value.

    Only the cells strictly below the diagonal are corrected; the results are
    then passed through mirror_lower_triangle.

    Parameters
    ----------
    - p_vals : 2D array
        Uncorrected p-values. It is indexed with a [p, p] boolean mask built
        from its second dimension, so it is expected to be square.
    - params : dict
        Must hold 'multi_corr_method' (name of the correction method) and
        'alpha_val' (alpha level, converted with float()).

    Returns
    -------
    Tuple with two matrices:
    - p_vals : array of shape [p, p]
        Corrected p-values; cells that were not corrected are initialised
        to 1.
    - p_mask : boolean array of shape [p, p]
        Rejection flags returned by the correction; can be used to filter
        the resulting heatmap.
    """
    # basic params; unpacking two values still enforces a 2D input
    _, p = p_vals.shape
    mc_method = params['multi_corr_method']
    mc_alpha = float(params['alpha_val'])

    # corr matrix is symmetric, we only need to correct the lower half.
    # The builtin bool is used because the np.bool alias was removed in
    # NumPy 1.24.
    ps_to_check = np.tril(np.ones((p, p), dtype=bool), k=-1)
    ps = multicor(p_vals[ps_to_check], method=mc_method, alpha=mc_alpha)

    # make Boolean and corrected p-val matrices
    p_mask = np.zeros((p, p), dtype=bool)
    p_mask[ps_to_check] = ps[0]
    p_mask = mirror_lower_triangle(p_mask)

    corrected = np.ones((p, p))
    corrected[ps_to_check] = ps[1]
    corrected = mirror_lower_triangle(corrected)
    return corrected, p_mask
def gpd_spearman(rX, perm_num=10000, prec=None, mc_method='fdr_bh',
                 mc_alpha=0.05):
    """
    This is the main function of the module. The empirical correlations are
    calculated in matrix form from the column ranked data. Permuted
    correlations are computed in parallel (1e4 permutations by default) and
    handed, cell by cell, to perm_gpd to obtain a p-value for each
    correlation (presumably via a Generalized Pareto Distribution
    approximation, as the helper's name suggests).

    Then the p-values are corrected for multiple testing using the
    Benjamini Hochberg FDR with .05 alpha (by default). For this, only the
    lower triangle of the correlation matrix is used, to avoid including each
    value twice.

    Parameters
    ----------
    - rX : array of shape [n sample, p features]
        Column ranked X matrix of observations and features. Used to calculate
        Spearman correlations and their p-values.
    - perm_num : int, default = 10000
        Number of permutations to use to estimate each correlation's p-value.
    - prec : array of shape [p features, p features], default = None
        Estimated precision matrix by some Graph Lasso algorithm. If provided,
        only those correlations will be considered whose precision is not 0.
    - mc_method : string, default = 'fdr_bh'
        Method for correction for multiple testing.
    - mc_alpha : float, default = 0.05
        Threshold for q-values after FDR correction. A value >= 1 switches
        the correction off: the uncorrected p-values are returned and the
        mask is all True.

    Returns
    -------
    Tuple with three matrices, in this order:
    - rs : array of shape [p features, p features]
        Holds empirical Spearman correlation values.
    - p_vals : array of shape [p features, p features]
        Corrected p-values (uncorrected if mc_alpha >= 1). Cells that were
        not tested are initialised to 1.
    - p_mask : array of shape [p features, p features]
        Boolean mask for those p-values which passed the correction with
        alpha=mc_alpha.
    """
    # unpacking two values enforces a 2D input; only p is needed below
    _, p = rX.shape

    # little trick to avoid having many ifs everywhere in code.
    # The builtin bool is used because the np.bool alias was removed in
    # NumPy 1.24.
    if prec is not None:
        prec_mask = prec != 0
    else:
        prec_mask = np.ones((p, p), dtype=bool)

    # calculate empirical Spearman from data
    rs = np.corrcoef(rX, rowvar=0)

    # get permuted Spearman values, in parallel
    # (range instead of xrange so this runs on Python 2 and Python 3)
    r_perms = np.array(
        Parallel(n_jobs=-1, backend='threading')(
            delayed(perm_r)(rX, i) for i in range(perm_num)))

    # GPD APPROXIMATION OF P-VALUES IN PARALLEL
    pGPDs = np.ones((p, p))
    # we should only use each value once, i.e. the cells below the diagonal
    lower_tri_ind = np.tril_indices(p, k=-1)
    lower_tri_mask = np.tril(np.ones((p, p), dtype=bool), k=-1)
    rows, cols = lower_tri_ind
    pGPDs[lower_tri_ind] = np.array(
        Parallel(n_jobs=-1, backend='threading')(
            delayed(perm_gpd)(prec_mask[r, c], rs[r, c], r_perms[:, r, c])
            for r, c in zip(rows, cols)))

    # correction for multiple testing, make sure that we only include each
    # p-value once not twice (symmetric rs), by combining two numpy mask
    # arrays:
    # - the lower triangular rs matrix
    # - prec_mask, i.e. p-vals whose precision is not 0
    ps_to_check = lower_tri_mask & prec_mask
    p_vals = np.ones((p, p))
    if float(mc_alpha) < 1:
        p_mask = np.zeros((p, p), dtype=bool)
        ps = multicor(pGPDs[ps_to_check], method=mc_method,
                      alpha=float(mc_alpha))
        p_mask[ps_to_check] = ps[0]
        p_vals[ps_to_check] = ps[1]
    else:
        # user doesn't want correction for multiple testing
        p_mask = np.ones((p, p), dtype=bool)
        p_vals[ps_to_check] = pGPDs[ps_to_check]

    # only the lower triangle was filled in, mirror it before returning
    p_mask = mirror_lower_triangle(p_mask)
    p_vals = mirror_lower_triangle(p_vals)
    return rs, p_vals, p_mask