def fisher_exact(n1, d1, n2, d2, **kwargs):
    """Run Fisher's exact test on many 2x2 tables at once.

    Element ``i`` of the four inputs forms one table; the counts are handed
    to ``fisher.pvalue_npy`` in the order ``n1, d1, n2, d2``.

    Parameters
    ----------
    n1, d1, n2, d2 : array-like of non-negative int
        Cell counts. They are converted to ``np.uint`` arrays.
    **kwargs
        pseudocount : number, default 0
            Added to every cell before the odds ratio is formed, so that a
            value > 0 keeps the ratio finite.
        alternative : {"twosided", "left", "right"}, default "twosided"
            Which of the three p-values returned by ``pvalue_npy`` to report.

    Returns
    -------
    pandas.DataFrame
        Columns ``OR`` (odds ratio) and ``P`` (p-value for ``alternative``).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the accepted names. The check runs
        before any other work.

    Notes
    -----
    The odds ratio is the cross-product ratio of the table
    ``[[n1, d1], [n2, d2]]``:
    ``((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))``.

    If the ``fisher`` package cannot be imported, an installation hint is
    printed and the interpreter exits with status -1.
    """
    fe_type = kwargs.get("alternative", "twosided")
    if fe_type not in ("twosided", "left", "right"):
        # Fail fast: no point importing or computing for an unusable request.
        raise ValueError("alternative must be twosided, left or right")
    pseudocount = kwargs.get("pseudocount", 0)

    try:
        from fisher import pvalue_npy
    except ImportError:
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    n1 = np.array(n1, dtype=np.uint)
    n2 = np.array(n2, dtype=np.uint)
    d1 = np.array(d1, dtype=np.uint)
    d2 = np.array(d2, dtype=np.uint)

    left, right, twosided = pvalue_npy(n1, d1, n2, d2)
    p_vals = {"twosided": twosided, "left": left, "right": right}[fe_type]

    # Odds of the first row divided by odds of the second row.
    OR = ((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))

    return pd.DataFrame({"OR": OR, "P": p_vals})
def _get_significance(self, a, b, expected_freq):
    '''
    Right-tail Fisher p-value for the null hypothesis that ``b`` arose
    through error alone.

    For each position the observed pair ``(a, b)`` is compared against an
    expected pair with the same total ``a + b``, in which the ``b`` share is
    ``expected_freq`` of the total (rounded with ``np.around``).

    a, b : numpy arrays of counts
    expected_freq : expected fraction of ``a + b`` attributed to ``b``
    '''
    total = a + b
    exp_b = np.around(total * expected_freq)
    exp_a = total - exp_b

    # The Fisher routine is fed unsigned-integer arrays, so cast all four.
    obs_a, obs_b, exp_a, exp_b = (
        np.asarray(counts, dtype=np.uint) for counts in (a, b, exp_a, exp_b)
    )

    _, right_tail, _ = pvalue_npy(exp_a, exp_b, obs_a, obs_b)
    return right_tail
def fishers_vec(a, b, c, d, alternative='two-sided'):
    """Vectorised Fisher's exact test over 2x2 tables ``[[a, b], [c, d]]``.

    Parameters
    ----------
    a, b, c, d : scalar or array-like of counts
        Inputs are flattened; all four must have the same length.
    alternative : str
        'two-sided' / 'two-tailed', 'less' / 'left-tailed', or
        'greater' / 'right-tailed'.

    Returns
    -------
    (OR, p) : tuple
        Odds ratios ``(a*d)/(b*c)`` and the p-values for ``alternative``.
        Arrays of shape (n,), or two scalars when all four inputs were
        scalars. For an unrecognised ``alternative`` a hint is printed and
        the p-values are NaN; the Fisher routine is not called in that case.
    """
    scalar = np.isscalar(a) and np.isscalar(b) and np.isscalar(c) and np.isscalar(d)

    a = np.asarray(a).ravel()
    b = np.asarray(b).ravel()
    c = np.asarray(c).ravel()
    d = np.asarray(d).ravel()

    assert len(a) == len(b)
    assert len(a) == len(c)
    assert len(a) == len(d)

    # Zero cells make the ratio inf/nan; that is expected, so mute the warnings.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        OR = (a * d) / (b * c)

    # Position of the requested tail in the (left, right, two-sided) result.
    if alternative in ('two-sided', 'two-tailed'):
        tail = 2
    elif alternative in ('less', 'left-tailed'):
        tail = 0
    elif alternative in ('greater', 'right-tailed'):
        tail = 1
    else:
        tail = None

    if tail is None:
        print('Please specify an alternative: two-sided, less, or greater')
        # Same shape as a real p-value vector so callers can index it alike.
        p = np.full(len(a), np.nan)
    else:
        res = fisher.pvalue_npy(a.astype(np.uint),
                                b.astype(np.uint),
                                c.astype(np.uint),
                                d.astype(np.uint))
        p = res[tail]

    if scalar:
        return (OR[0], p[0])
    return (OR, p)
def fisherTestVec(a,b,c,d,alternative='two-sided'):
    """Vectorized Fisher's exact test performs n tests
    on 4 length n numpy vectors a, b, c, and d representing
    the 4 elements of a 2x2 contingency table.

    Wrapper around fisher.pvalue_npy found in:
    Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
    https://pypi.python.org/pypi/fisher/

    Parameters
    ----------
    a,b,c,d : shape (n,) array-like
        Vectors of counts (cast to np.uint for the operation)
    alternative : string
        Specifies the alternative hypothesis
        Options: 'two-sided', 'less', 'greater'

    Returns
    -------
    p : shape (n,) ndarray
        Vector of p-values associated with each test and the
        alternative hypothesis. No odds ratio is returned.

    Raises
    ------
    ValueError
        If alternative is not one of the three accepted options."""
    # Index of each alternative within the tuple returned by pvalue_npy.
    tail_index = {'less': 0, 'greater': 1, 'two-sided': 2}
    if alternative not in tail_index:
        raise ValueError(
            "alternative must be 'two-sided', 'less' or 'greater', got %r" % (alternative,))

    a, b, c, d = (np.asarray(v) for v in (a, b, c, d))
    res = fisher.pvalue_npy(a.astype(np.uint), b.astype(np.uint), c.astype(np.uint), d.astype(np.uint))
    return res[tail_index[alternative]]
def smThresholds_sw(DF):
    """Simulate the fraction of significant SNPs and return its percentiles.

    Repeats ``rep`` times: draw binomial counts for both groups (trial
    counts from the ``fb_LD`` / ``sb_LD`` columns of ``DF``, probabilities
    ``fb_Freq`` / ``sb_Freq``), run the two-sided Fisher test per row, and
    record the share of rows with p < ``smAlpha``.

    Returns the 0.5, 99.5, 2.5, 97.5, 5.0 and 95.0 percentiles (in that
    order) of the recorded shares.

    NOTE(review): ``fb_LD``, ``sb_LD``, ``fb_Freq``, ``sb_Freq``, ``rep``,
    ``smAlpha`` and ``pvalue_npy`` come from the enclosing module scope.
    """
    fb_depth = DF[fb_LD].to_numpy().astype(np.uint)
    sb_depth = DF[sb_LD].to_numpy().astype(np.uint)

    ratios = []
    for _ in range(rep):
        # Draw order (first group, then second) is kept so the random
        # stream is consumed exactly as before.
        fb_alt = np.random.binomial(DF[fb_LD], fb_Freq).astype(np.uint)
        fb_ref = fb_depth - fb_alt
        sb_alt = np.random.binomial(DF[sb_LD], sb_Freq).astype(np.uint)
        sb_ref = sb_depth - sb_alt

        p_two_sided = pvalue_npy(fb_alt, fb_ref, sb_alt, sb_ref)[2]

        is_significant = np.where(p_two_sided < smAlpha, 1, 0)
        ratios.append(np.mean(is_significant))

    return np.percentile(ratios, [0.5, 99.5, 2.5, 97.5, 5.0, 95.0])
def smThresholds_gw(DF):
    """Simulate the genome-wide sSNP/totalSNP ratio and return its percentiles.

    Repeats ``rep`` times: sample ``snpPerSW`` rows of ``DF`` with
    replacement, draw binomial counts for both groups from the sampled
    ``fb_LD`` / ``sb_LD`` columns, run the two-sided Fisher test per row,
    and record the share of rows with p < ``smAlpha``.

    Side effects: prints progress messages and appends a labelled copy of
    the percentiles to the module-level ``misc`` list.

    Returns the 0.5, 99.5, 2.5, 97.5, 5.0 and 95.0 percentiles (in that
    order) of the recorded shares.

    NOTE(review): ``rep``, ``snpPerSW``, ``fb_LD``, ``sb_LD``, ``fb_Freq``,
    ``sb_Freq``, ``smAlpha``, ``misc``, ``t0`` and ``pvalue_npy`` come from
    the enclosing module scope.
    """
    print('Calculate the threshold of sSNPs/totalSNPs.')
    cutoffs = [0.5, 99.5, 2.5, 97.5, 5.0, 95.0]

    ratios = []
    for _ in range(rep):
        picked = DF.sample(snpPerSW, replace=True)

        fb_alt = np.random.binomial(picked[fb_LD], fb_Freq).astype(np.uint)
        fb_ref = picked[fb_LD].to_numpy().astype(np.uint) - fb_alt
        sb_alt = np.random.binomial(picked[sb_LD], sb_Freq).astype(np.uint)
        sb_ref = picked[sb_LD].to_numpy().astype(np.uint) - sb_alt

        p_two_sided = pvalue_npy(fb_alt, fb_ref, sb_alt, sb_ref)[2]

        is_significant = np.where(p_two_sided < smAlpha, 1, 0)
        ratios.append(np.mean(is_significant))

    misc.append(['Genome-wide sSNP/totalSNP ratio threshold', np.percentile(ratios, cutoffs)])
    print(f'Threshold calculation completed, time elapsed: {(time.time()-t0)/60} minutes')

    # Recomputed rather than reused so the caller gets its own array.
    return np.percentile(ratios, cutoffs)
def calc_fisher(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    cnt_vec: List[int],
    verbose: bool,
) -> pd.DataFrame:
    """Calculate Fisher's exact test for one cluster.

    The CSR matrix is rebuilt from ``data`` / ``indices`` / ``indptr`` /
    ``shape``. For each column, the number of rows with a stored entry is
    counted in two groups and the two-sided Fisher p-value is computed,
    followed by ``fdr`` to obtain q-values.

    * ``cond_labels is None``: rows of cluster ``clust_id`` versus all
      other rows; ``cnt_vec`` supplies the per-column count over all rows.
    * otherwise: within the cluster, rows whose condition equals
      ``cond_labels.categories[0]`` versus the remaining cluster rows.

    Returns a DataFrame indexed by ``gene_names`` with float32 columns
    ``fisher_pval:<clust_id>`` and ``fisher_qval:<clust_id>``.

    NOTE(review): despite the ``List`` annotations, ``cluster_labels`` is
    compared element-wise, ``cnt_vec`` needs ``.astype`` and ``cond_labels``
    needs ``.categories``, so array/categorical inputs are expected.
    """
    import fisher

    # Rebuild the sparse matrix from its CSR components.
    full_mat = csr_matrix((data, indices, indptr), shape=shape)
    in_cluster = cluster_labels == clust_id
    clust_mat = full_mat[in_cluster]

    if cond_labels is not None:
        first_cond = cond_labels.categories[0]
        is_first = cond_labels[in_cluster] == first_cond
        grp_a = clust_mat[is_first]
        grp_b = clust_mat[~is_first]
        n_a = grp_a.shape[0]
        n_b = grp_b.shape[0]
        a_pos = grp_a.getnnz(axis=0).astype(np.uint)
        b_pos = grp_b.getnnz(axis=0).astype(np.uint)
    else:
        n_a = clust_mat.shape[0]
        n_b = shape[0] - n_a
        a_pos = clust_mat.getnnz(axis=0).astype(np.uint)
        b_pos = cnt_vec.astype(np.uint) - a_pos

    # Rows without a stored entry = group size minus rows with one.
    a_neg = n_a - a_pos
    b_neg = n_b - b_pos

    pvals = fisher.pvalue_npy(a_pos, a_neg, b_pos, b_neg)[2]
    _, qvals = fdr(pvals)

    columns = {
        "fisher_pval:{0}".format(clust_id): pvals.astype(np.float32),
        "fisher_qval:{0}".format(clust_id): qvals.astype(np.float32),
    }
    df = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_fisher finished for cluster {0}.".format(clust_id))

    return df
def calc_fisher(i, clust_label, gene_names, ct, total):
    """Two-sided Fisher's exact test of cluster ``i`` against everything else.

    ``ct[:, i, 0]`` and ``ct[:, i, 1]`` give the two counts for cluster
    ``i``; the comparison group is ``total - ct[:, i, :]``. The p-values are
    passed through ``fdr`` to obtain q-values.

    Returns a DataFrame indexed by ``gene_names`` with columns
    ``fisher_pval_<clust_label>`` and ``fisher_qval_<clust_label>``, and
    prints a progress line.
    """
    # Counts left over once cluster i is removed from the totals.
    complement = total - ct[:, i, :]

    tails = fisher.pvalue_npy(
        ct[:, i, 0],
        ct[:, i, 1],
        complement[:, 0],
        complement[:, 1],
    )
    pvals = tails[2]
    _, qvals = fdr(pvals)

    columns = {
        "fisher_pval_{0}".format(clust_label): pvals,
        "fisher_qval_{0}".format(clust_label): qvals,
    }
    df = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))
    return df
def fisherTestVec(a, b, c, d, alternative='two-sided'):
    """Vectorized Fisher's exact test performs n tests
    on 4 length n numpy vectors a, b, c, and d representing
    the 4 elements of a 2x2 contingency table.

    Wrapper around fisher.pvalue_npy found in:
    Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
    https://pypi.python.org/pypi/fisher/

    Parameters
    ----------
    a,b,c,d : shape (n,) array-like
        Vectors of counts (cast to np.uint for the operation)
    alternative : string
        Specifies the alternative hypothesis
        Options: 'two-sided', 'less', 'greater'

    Returns
    -------
    OR : shape (n,) ndarray
        Vector of odds-ratios ``(a * d) / (b * c)`` for each 2 x 2 table
    p : shape (n,) ndarray
        Vector of p-values associated with each test and the
        alternative hypothesis

    Raises
    ------
    ValueError
        If alternative is not one of the three accepted options."""
    # Index of each alternative within the tuple returned by pvalue_npy.
    tail_index = {'less': 0, 'greater': 1, 'two-sided': 2}
    if alternative not in tail_index:
        raise ValueError(
            "alternative must be 'two-sided', 'less' or 'greater', got %r" % (alternative,))

    a, b, c, d = (np.asarray(v) for v in (a, b, c, d))
    res = fisher.pvalue_npy(a.astype(np.uint), b.astype(np.uint), c.astype(np.uint), d.astype(np.uint))
    OR = (a * d) / (b * c)
    return (OR, res[tail_index[alternative]])
bsaSNPs[sb_AF] = bsaSNPs[sb_AD_ALT]/bsaSNPs[sb_LD] bsaSNPs['Delta.AF'] = bsaSNPs[sb_AF] - bsaSNPs[fb_AF] # Calculate G-statistic bsaSNPs['G_S'] = gStatistic_Array(bsaSNPs[fb_AD_REF], bsaSNPs[fb_AD_ALT], bsaSNPs[sb_AD_REF], bsaSNPs[sb_AD_ALT]) try: from fisher import pvalue_npy # Create new columns for Fisher's exact test P-values and simulated P-values print('Perform Fisher\'s exact test.') fb_AD_ALT_Arr = bsaSNPs[fb_AD_ALT].to_numpy(dtype=np.uint) fb_AD_REF_Arr = bsaSNPs[fb_AD_REF].to_numpy(dtype=np.uint) sb_AD_ALT_Arr = bsaSNPs[sb_AD_ALT].to_numpy(dtype=np.uint) sb_AD_REF_Arr = bsaSNPs[sb_AD_REF].to_numpy(dtype=np.uint) __, __, bsaSNPs['FE_P'] = pvalue_npy(fb_AD_ALT_Arr, fb_AD_REF_Arr, sb_AD_ALT_Arr, sb_AD_REF_Arr) # bsaSNPs['FE_OR'] = (fb_AD_ALT_Arr * sb_AD_REF_Arr) / (fb_AD_REF_Arr * sb_AD_ALT_Arr) sm_fb_AD_ALT_Arr = bsaSNPs[sm_fb_AD_ALT].to_numpy(dtype=np.uint) sm_fb_AD_REF_Arr = bsaSNPs[sm_fb_AD_REF].to_numpy(dtype=np.uint) sm_sb_AD_ALT_Arr = bsaSNPs[sm_sb_AD_ALT].to_numpy(dtype=np.uint) sm_sb_AD_REF_Arr = bsaSNPs[sm_sb_AD_REF].to_numpy(dtype=np.uint) __, __, bsaSNPs['sm_FE_P'] = pvalue_npy(sm_fb_AD_ALT_Arr, sm_fb_AD_REF_Arr, sm_sb_AD_ALT_Arr, sm_sb_AD_REF_Arr) # bsaSNPs['sm_FE_OR'] = (sm_fb_AD_ALT_Arr * sm_sb_AD_REF_Arr) / (sm_fb_AD_REF_Arr * sm_sb_AD_ALT_Arr) print(f'Fisher\'s exact test completed, time elapsed: {(time.time()-t0)/60} minutes.') print('Calculate thresholds of \u0394(allele frequency) and G-statistic.') bsaSNPs['STAT'] = bsaSNPs.apply(statistics, axis=1)
def fisher_exact(tp, fp, fn, tn, pseudocount=0):
    """Fisher's exact for contingency tables.

    Computes the hypotheses two-sided, less and greater at the same time.

    Parameters
    ----------
    tp : array-like of int
        Top left square of contingency table (true positives).

    fp : array-like of int
        Top right square of contingency table (false positives).

    fn : array-like of int
        Bottom left square of contingency table (false negatives).

    tn : array-like of int
        Bottom right square of contingency table (true negatives).

    pseudocount : float, default 0
        Values > 0 allow Odds Ratio to always be a finite number.

    Notes
    -----

    The odds-ratio is computed thusly:

    ``((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))``

    If the ``fisher`` package cannot be imported, an installation hint is
    printed and the interpreter exits with status -1.

    Returns
    -------
    pandas.DataFrame

        DataFrame with columns OR and P, PLeft and PRight.

    See Also
    --------

    pr.stats.fdr : correct for multiple testing

    Examples
    --------

    Note that the call below passes the ``TN`` column in the ``fn`` position
    and the ``FN`` column in the ``tn`` position; the output shown
    corresponds to exactly that call.

    >>> d = {"TP": [12, 0], "FP": [5, 12], "TN": [29, 10], "FN": [2, 2]}
    >>> df = pd.DataFrame(d)
    >>> df
       TP  FP  TN  FN
    0  12   5  29   2
    1   0  12  10   2

    >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
             OR         P     PLeft    PRight
    0  0.165517  0.080269  0.044555  0.994525
    1  0.000000  0.000067  0.000034  1.000000
    """
    try:
        from fisher import pvalue_npy
    except ImportError:
        # Only a missing package is handled here; anything else propagates.
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    tp = np.array(tp, dtype=np.uint)
    fp = np.array(fp, dtype=np.uint)
    fn = np.array(fn, dtype=np.uint)
    tn = np.array(tn, dtype=np.uint)

    left, right, twosided = pvalue_npy(tp, fp, fn, tn)

    # Odds of the top row divided by odds of the bottom row.
    OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))

    df = pd.DataFrame({
        "OR": OR,
        "P": twosided,
        "PLeft": left,
        "PRight": right
    })

    return df
def fisher_exact(n1, d1, n2, d2, pseudocount=0):
    """Fisher's exact for contingency tables.

    Computes the hypotheses two-sided, less and greater at the same time.
    The counts are handed to ``fisher.pvalue_npy`` in the order
    ``n1, d1, n2, d2``.

    Parameters
    ----------
    n1 : array-like of int
        Top left square of contingency table.

    d1 : array-like of int
        Bottom left square of contingency table.

    n2 : array-like of int
        Top right square of contingency table.

    d2 : array-like of int
        Bottom right square of contingency table.

    pseudocount : float, default 0
        Values > 0 allow Odds Ratio to always be a finite number.

    Notes
    -----

    The odds-ratio is the cross-product ratio of the table:

    ``((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))``

    so an odds-ratio above 1 goes together with a small ``PRight`` and one
    below 1 with a small ``PLeft``.

    If the ``fisher`` package cannot be imported, an installation hint is
    printed and the interpreter exits with status -1.

    Returns
    -------
    pandas.DataFrame

        DataFrame with columns OR and P, PLeft and PRight.

    See Also
    --------

    pr.stats.fdr : correct for multiple testing

    Examples
    --------

    >>> d = {"TP": [1, 0, 8], "FP": [11, 12, 1], "TN": [9, 10, 2], "FN": [3, 2, 5]}
    >>> df = pd.DataFrame(d)
    >>> df
       TP  FP  TN  FN
    0   1  11   9   3
    1   0  12  10   2
    2   8   1   2   5

    >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
              OR         P     PLeft    PRight
    0   0.030303  0.002759  0.001380  0.999966
    1   0.000000  0.000067  0.000034  1.000000
    2  20.000000  0.034965  0.999126  0.024476
    """
    try:
        from fisher import pvalue_npy
    except ImportError:
        # Only a missing package is handled here; anything else propagates.
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    n1 = np.array(n1, dtype=np.uint)
    n2 = np.array(n2, dtype=np.uint)
    d1 = np.array(d1, dtype=np.uint)
    d2 = np.array(d2, dtype=np.uint)

    left, right, twosided = pvalue_npy(n1, d1, n2, d2)

    # Cross-product ratio: (n1 * d2) / (d1 * n2), with the pseudocount
    # added to every cell first.
    OR = ((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))

    df = pd.DataFrame({
        "OR": OR,
        "P": twosided,
        "PLeft": left,
        "PRight": right
    })

    return df
# Calculate simulated ALT reads for each SNP under null hypothesis snpDF[sm_fb_AD_ALT] = np.random.binomial(snpDF[fb_LD], fb_Freq) snpDF[sm_fb_AD_REF] = snpDF[fb_LD] - snpDF[sm_fb_AD_ALT] snpDF[sm_sb_AD_ALT] = np.random.binomial(snpDF[sb_LD], sb_Freq) snpDF[sm_sb_AD_REF] = snpDF[sb_LD] - snpDF[sm_sb_AD_ALT] try: from fisher import pvalue_npy # Create new columns for Fisher's exact test P-values and simulated P-values print('Perform Fisher\'s exact test.') fb_AD_ALT_Arr = snpDF[fb_AD_ALT].to_numpy(dtype=np.uint) fb_AD_REF_Arr = snpDF[fb_AD_REF].to_numpy(dtype=np.uint) sb_AD_ALT_Arr = snpDF[sb_AD_ALT].to_numpy(dtype=np.uint) sb_AD_REF_Arr = snpDF[sb_AD_REF].to_numpy(dtype=np.uint) __, __, snpDF['FE_P'] = pvalue_npy(fb_AD_ALT_Arr, fb_AD_REF_Arr, sb_AD_ALT_Arr, sb_AD_REF_Arr) # snpDF['FE_OR'] = (fb_AD_ALT_Arr * sb_AD_REF_Arr) / (fb_AD_REF_Arr * sb_AD_ALT_Arr) sm_fb_AD_ALT_Arr = snpDF[sm_fb_AD_ALT].to_numpy(dtype=np.uint) sm_fb_AD_REF_Arr = snpDF[sm_fb_AD_REF].to_numpy(dtype=np.uint) sm_sb_AD_ALT_Arr = snpDF[sm_sb_AD_ALT].to_numpy(dtype=np.uint) sm_sb_AD_REF_Arr = snpDF[sm_sb_AD_REF].to_numpy(dtype=np.uint) __, __, snpDF['sm_FE_P'] = pvalue_npy(sm_fb_AD_ALT_Arr, sm_fb_AD_REF_Arr, sm_sb_AD_ALT_Arr, sm_sb_AD_REF_Arr) # snpDF['sm_FE_OR'] = (sm_fb_AD_ALT_Arr * sm_sb_AD_REF_Arr) / (sm_fb_AD_REF_Arr * sm_sb_AD_ALT_Arr) except ImportError: from scipy.stats import fisher_exact