Beispiel #1
0
def fisher_exact(n1, d1, n2, d2, **kwargs):
    """Run Fisher's exact test on many 2x2 contingency tables at once.

    Position ``i`` of the four inputs describes one table. The counts are
    handed to ``fisher.pvalue_npy`` in the order ``(n1, d1, n2, d2)``, i.e.
    ``(n1, d1)`` form one group and ``(n2, d2)`` the other.

    Parameters
    ----------
    n1, d1, n2, d2 : array-like of non-negative int
        Cell counts; converted to ``np.uint`` arrays.
    **kwargs
        pseudocount : number, default 0
            Added to every cell before the odds ratio is computed (the
            p-values are always computed on the raw counts).
        alternative : {"twosided", "left", "right"}, default "twosided"
            Which of the three p-values to report in column ``P``.

    Returns
    -------
    pandas.DataFrame
        Columns ``OR`` (odds ratio) and ``P`` (selected p-value).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the accepted values.

    Notes
    -----
    The odds ratio is
    ``((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))``,
    which matches the table that is actually tested.

    If the ``fisher`` package cannot be imported, a hint is printed and the
    process exits with status -1.
    """
    pseudocount = kwargs.get("pseudocount", 0)
    fe_type = kwargs.get("alternative", "twosided")

    # Reject a bad `alternative` up front instead of after all the work.
    # ValueError is still an Exception, so existing handlers keep working.
    if fe_type not in ("twosided", "left", "right"):
        raise ValueError("alternative must be twosided, left or right")

    try:
        from fisher import pvalue_npy
    except ImportError:
        # Only a missing package should trigger the install hint; a bare
        # except would also swallow KeyboardInterrupt and real bugs.
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    n1 = np.array(n1, dtype=np.uint)
    n2 = np.array(n2, dtype=np.uint)
    d1 = np.array(d1, dtype=np.uint)
    d2 = np.array(d2, dtype=np.uint)

    left, right, twosided = pvalue_npy(n1, d1, n2, d2)
    p_vals = {"twosided": twosided, "left": left, "right": right}[fe_type]

    # Odds of group 1 (n1 vs d1) over odds of group 2 (n2 vs d2).
    OR = ((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) /
                                                      (d2 + pseudocount))

    return pd.DataFrame({"OR": OR, "P": p_vals})
 def _get_significance(self, a, b, expected_freq):
     """Return right-tail Fisher p-values of observed vs. expected counts.

     For every position the total ``a + b`` is split into expected counts
     using ``expected_freq`` (``expected_b = round(total * expected_freq)``),
     and the 2x2 table (expected_a, expected_b, a, b) is passed to
     ``pvalue_npy``. The intended null hypothesis is that ``b`` arose from
     error alone.

     a, b : numpy arrays of counts
     expected_freq : expected frequency of ``b`` among ``a + b``
     """
     total = a + b
     exp_b = np.around(total * expected_freq)
     exp_a = total - exp_b

     # The Fisher routine is fed unsigned-integer arrays.
     obs_a, obs_b, exp_a, exp_b = (
         np.asarray(counts, dtype=np.uint)
         for counts in (a, b, exp_a, exp_b)
     )

     _left, right_tail, _two = pvalue_npy(exp_a, exp_b, obs_a, obs_b)
     return right_tail
Beispiel #3
0
    def fishers_vec(a, b, c, d, alternative='two-sided'):
        """Vectorized Fisher's exact test over 2x2 tables (a, b, c, d).

        Parameters
        ----------
        a, b, c, d : scalar or array-like of counts
            The four cells of each table; arrays are flattened and must all
            have the same length.
        alternative : str
            'two-sided'/'two-tailed', 'less'/'left-tailed' or
            'greater'/'right-tailed'.

        Returns
        -------
        (OR, p) : tuple
            Odds ratios ``(a*d) / (b*c)`` and the p-values for the requested
            alternative. Both are scalars if all four inputs were scalars,
            otherwise 1-d arrays. For an unrecognised ``alternative`` a
            message is printed and ``p`` is all NaN.
        """
        scalar = np.isscalar(a) and np.isscalar(b) and np.isscalar(c) and np.isscalar(d)
        a = np.asarray(a).ravel()
        b = np.asarray(b).ravel()
        c = np.asarray(c).ravel()
        d = np.asarray(d).ravel()

        assert len(a) == len(b)
        assert len(a) == len(c)
        assert len(a) == len(d)
        with warnings.catch_warnings():
            # Zero cells give inf/nan odds ratios; silence the numpy warnings.
            warnings.simplefilter('ignore')
            OR = (a*d) / (b*c)

        # Index into the tuple returned by fisher.pvalue_npy.
        if alternative in ['two-sided', 'two-tailed']:
            tail = 2
        elif alternative in ['less', 'left-tailed']:
            tail = 0
        elif alternative in ['greater', 'right-tailed']:
            tail = 1
        else:
            tail = None

        if tail is None:
            # Best-effort fallback: report the problem and return NaN p-values
            # with the same 1-d shape as the normal result. The test itself is
            # skipped because its output would be discarded anyway.
            print('Please specify an alternative: two-sided, less, or greater')
            p = np.full(len(a), np.nan)
        else:
            res = fisher.pvalue_npy(a.astype(np.uint), b.astype(np.uint), c.astype(np.uint), d.astype(np.uint))
            p = res[tail]

        if scalar:
            return (OR[0], p[0])
        return (OR, p)
    def fisherTestVec(a, b, c, d, alternative='two-sided'):
        """Vectorized Fisher's exact test returning p-values only.

        Runs n tests on four length-n numpy vectors a, b, c and d that hold
        the four cells of each 2x2 contingency table.

        Thin wrapper around fisher.pvalue_npy from:
        Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
        https://pypi.python.org/pypi/fisher/

        Parameters
        ----------
        a,b,c,d : shape (n,) ndarrays
            Vectors of counts (cast with ``astype(np.uint)`` before the call)
        alternative : string
            Alternative hypothesis: 'two-sided', 'less' or 'greater'

        Returns
        -------
        p : shape (n,) ndarray
            p-values for the chosen alternative. ``None`` is returned when
            ``alternative`` is not one of the three options. No odds ratio
            is computed by this function."""

        cells = (a, b, c, d)
        tails = fisher.pvalue_npy(*(cell.astype(np.uint) for cell in cells))

        # Position of each alternative inside the result of pvalue_npy.
        for label, position in (('two-sided', 2), ('less', 0), ('greater', 1)):
            if alternative == label:
                return tails[position]
        return None
Beispiel #5
0
def smThresholds_sw(DF):
    """Simulate percentile thresholds of the significant-SNP ratio for DF.

    For each of ``rep`` repetitions, ALT counts are drawn for both bulks
    from binomial distributions (depth columns ``fb_LD``/``sb_LD`` and
    frequencies ``fb_Freq``/``sb_Freq``), a two-sided Fisher's exact test is
    run per row, and the fraction of rows with P < ``smAlpha`` is recorded.

    ``fb_LD``, ``sb_LD``, ``fb_Freq``, ``sb_Freq``, ``rep``, ``smAlpha`` and
    ``pvalue_npy`` are taken from the enclosing module.
    NOTE(review): "sw" presumably stands for sliding window - confirm.

    Returns
    -------
    numpy.ndarray
        The 0.5, 99.5, 2.5, 97.5, 5.0 and 95.0 percentiles of the ratios.
    """
    fb_depth = DF[fb_LD].to_numpy().astype(np.uint)
    sb_depth = DF[sb_LD].to_numpy().astype(np.uint)

    ratios = []
    for _ in range(rep):
        # Draw order (first bulk, then second bulk) fixes the RNG stream.
        fb_alt = np.random.binomial(DF[fb_LD], fb_Freq).astype(np.uint)
        sb_alt = np.random.binomial(DF[sb_LD], sb_Freq).astype(np.uint)

        # REF counts are whatever is left of the depth.
        fb_ref = fb_depth - fb_alt
        sb_ref = sb_depth - sb_alt

        _left, _right, p_two_sided = pvalue_npy(fb_alt, fb_ref, sb_alt, sb_ref)

        is_significant = np.where(p_two_sided < smAlpha, 1, 0)
        ratios.append(np.mean(is_significant))

    return np.percentile(ratios, [0.5, 99.5, 2.5, 97.5, 5.0, 95.0])
Beispiel #6
0
def smThresholds_gw(DF):
    """Simulate genome-wide percentile thresholds of the sSNP/totalSNP ratio.

    In each of ``rep`` repetitions, ``snpPerSW`` rows are sampled from DF
    with replacement, ALT counts for both bulks are drawn from binomial
    distributions, a two-sided Fisher's exact test is run per row, and the
    fraction of rows with P < ``smAlpha`` is recorded.

    Side effects: prints progress messages and appends a labelled copy of the
    thresholds to the module-level ``misc`` list. ``fb_LD``, ``sb_LD``,
    ``fb_Freq``, ``sb_Freq``, ``rep``, ``snpPerSW``, ``smAlpha``, ``t0``,
    ``misc`` and ``pvalue_npy`` come from the enclosing module.

    Returns
    -------
    numpy.ndarray
        The 0.5, 99.5, 2.5, 97.5, 5.0 and 95.0 percentiles of the ratios.
    """
    print('Calculate the threshold of sSNPs/totalSNPs.')

    percentiles = [0.5, 99.5, 2.5, 97.5, 5.0, 95.0]
    ratios = []
    for _ in range(rep):
        sample = DF.sample(snpPerSW, replace=True)
        fb_depth = sample[fb_LD]
        sb_depth = sample[sb_LD]

        # Draw order (first bulk, then second bulk) fixes the RNG stream.
        fb_alt = np.random.binomial(fb_depth, fb_Freq).astype(np.uint)
        fb_ref = fb_depth.to_numpy().astype(np.uint) - fb_alt
        sb_alt = np.random.binomial(sb_depth, sb_Freq).astype(np.uint)
        sb_ref = sb_depth.to_numpy().astype(np.uint) - sb_alt

        _left, _right, p_two_sided = pvalue_npy(fb_alt, fb_ref, sb_alt, sb_ref)

        is_significant = np.where(p_two_sided < smAlpha, 1, 0)
        ratios.append(np.mean(is_significant))

    thresholds = np.percentile(ratios, percentiles)
    misc.append(['Genome-wide sSNP/totalSNP ratio threshold', thresholds])

    elapsed_minutes = (time.time()-t0)/60
    print(
        f'Threshold calculation completed, time elapsed: {elapsed_minutes} minutes'
    )

    # Hand back an independent array so the entry stored in misc is not aliased.
    return thresholds.copy()
Beispiel #7
0
def calc_fisher(
    clust_id: str,
    data: List[float],
    indices: List[int],
    indptr: List[int],
    shape: Tuple[int, int],
    cluster_labels: List[str],
    cond_labels: List[str],
    gene_names: List[str],
    cnt_vec: List[int],
    verbose: bool,
) -> pd.DataFrame:
    """Calculate Fisher's exact test for one cluster.

    The sparse matrix is rebuilt from ``data``/``indices``/``indptr`` and,
    per column, the number of stored (non-zero) entries is compared between
    two groups of rows:

    * ``cond_labels is None``: rows of cluster ``clust_id`` vs. all other
      rows (the latter derived from ``cnt_vec`` minus the cluster counts).
    * otherwise: within the cluster, rows whose condition equals
      ``cond_labels.categories[0]`` vs. the remaining rows.

    NOTE(review): the label/count arguments are annotated as lists but are
    used with array operations (``==`` masks, ``.astype``, ``.categories``) -
    presumably numpy arrays / pandas Categoricals; confirm against callers.

    Returns a DataFrame indexed by ``gene_names`` with float32 columns
    ``fisher_pval:<clust_id>`` and ``fisher_qval:<clust_id>`` (two-sided
    p-values and the second output of ``fdr``).
    """
    import fisher

    # Rebuild the CSR matrix from its raw components.
    full_mat = csr_matrix((data, indices, indptr), shape=shape)
    in_cluster = cluster_labels == clust_id
    clust_mat = full_mat[in_cluster]

    if cond_labels is not None:
        # Two conditions inside the cluster: first category vs. the rest.
        first_cond = cond_labels.categories[0]
        is_first = cond_labels[in_cluster] == first_cond

        group_a = clust_mat[is_first]
        group_b = clust_mat[~is_first]
        size_a = group_a.shape[0]
        size_b = group_b.shape[0]

        a_pos = group_a.getnnz(axis=0).astype(np.uint)
        b_pos = group_b.getnnz(axis=0).astype(np.uint)
    else:
        # Cluster vs. everything outside the cluster.
        size_a = clust_mat.shape[0]
        size_b = shape[0] - size_a

        a_pos = clust_mat.getnnz(axis=0).astype(np.uint)
        b_pos = cnt_vec.astype(np.uint) - a_pos

    a_neg = size_a - a_pos
    b_neg = size_b - b_pos

    two_sided = fisher.pvalue_npy(a_pos, a_neg, b_pos, b_neg)[2]
    _passed, adjusted = fdr(two_sided)

    columns = {
        "fisher_pval:{0}".format(clust_id): two_sided.astype(np.float32),
        "fisher_qval:{0}".format(clust_id): adjusted.astype(np.float32),
    }
    result = pd.DataFrame(columns, index=gene_names)

    if verbose:
        logger.info("calc_fisher finished for cluster {0}.".format(clust_id))

    return result
Beispiel #8
0
def calc_fisher(i, clust_label, gene_names, ct, total):
    """Fisher's exact test of cluster ``i`` against the remaining counts.

    ``ct[:, i, 0]`` and ``ct[:, i, 1]`` give the two cells for the cluster;
    the complementary cells are ``total - ct[:, i, :]``. The two-sided
    p-values and the second output of ``fdr`` are returned as a DataFrame
    indexed by ``gene_names`` with columns ``fisher_pval_<clust_label>`` and
    ``fisher_qval_<clust_label>``. A progress line is printed when done.
    """
    complement = total - ct[:, i, :]
    tails = fisher.pvalue_npy(
        ct[:, i, 0],
        ct[:, i, 1],
        complement[:, 0],
        complement[:, 1],
    )
    two_sided = tails[2]
    _passed, adjusted = fdr(two_sided)

    columns = {
        "fisher_pval_{0}".format(clust_label): two_sided,
        "fisher_qval_{0}".format(clust_label): adjusted,
    }
    result = pd.DataFrame(columns, index=gene_names)

    print("Cluster {0} is processed.".format(clust_label))

    return result
Beispiel #9
0
    def fisherTestVec(a, b, c, d, alternative='two-sided'):
        """Vectorized Fisher's exact test with odds ratios.

        Runs n tests on four length-n numpy vectors a, b, c and d that hold
        the four cells of each 2x2 contingency table.

        Thin wrapper around fisher.pvalue_npy from:
        Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
        https://pypi.python.org/pypi/fisher/

        Parameters
        ----------
        a,b,c,d : shape (n,) ndarrays
            Vectors of counts (cast with ``astype(np.uint)`` for the test)
        alternative : string
            Alternative hypothesis: 'two-sided', 'less' or 'greater'

        Returns
        -------
        OR : shape (n,) ndarray
            Odds ratios ``(a * d) / (b * c)``, computed on the inputs as given
        p : shape (n,) ndarray
            p-values for the chosen alternative

        ``None`` is returned instead of the tuple when ``alternative`` is not
        one of the three options."""

        cells = (a, b, c, d)
        tails = fisher.pvalue_npy(*(cell.astype(np.uint) for cell in cells))
        odds_ratio = (a * d) / (b * c)

        # Position of each alternative inside the result of pvalue_npy.
        for label, position in (('two-sided', 2), ('less', 0), ('greater', 1)):
            if alternative == label:
                return (odds_ratio, tails[position])
        return None
Beispiel #10
0
    bsaSNPs[sb_AF] = bsaSNPs[sb_AD_ALT]/bsaSNPs[sb_LD]
    bsaSNPs['Delta.AF'] = bsaSNPs[sb_AF] - bsaSNPs[fb_AF]

    # Calculate G-statistic
    bsaSNPs['G_S'] = gStatistic_Array(bsaSNPs[fb_AD_REF], bsaSNPs[fb_AD_ALT], bsaSNPs[sb_AD_REF], bsaSNPs[sb_AD_ALT])

    try:
        from fisher import pvalue_npy
        # Create new columns for Fisher's exact test P-values and simulated P-values
        print('Perform Fisher\'s exact test.')
        fb_AD_ALT_Arr = bsaSNPs[fb_AD_ALT].to_numpy(dtype=np.uint)
        fb_AD_REF_Arr = bsaSNPs[fb_AD_REF].to_numpy(dtype=np.uint)
        sb_AD_ALT_Arr = bsaSNPs[sb_AD_ALT].to_numpy(dtype=np.uint)
        sb_AD_REF_Arr = bsaSNPs[sb_AD_REF].to_numpy(dtype=np.uint)

        __, __, bsaSNPs['FE_P'] = pvalue_npy(fb_AD_ALT_Arr, fb_AD_REF_Arr, sb_AD_ALT_Arr, sb_AD_REF_Arr)
        # bsaSNPs['FE_OR'] = (fb_AD_ALT_Arr * sb_AD_REF_Arr) / (fb_AD_REF_Arr * sb_AD_ALT_Arr)

        sm_fb_AD_ALT_Arr = bsaSNPs[sm_fb_AD_ALT].to_numpy(dtype=np.uint)
        sm_fb_AD_REF_Arr = bsaSNPs[sm_fb_AD_REF].to_numpy(dtype=np.uint)
        sm_sb_AD_ALT_Arr = bsaSNPs[sm_sb_AD_ALT].to_numpy(dtype=np.uint)
        sm_sb_AD_REF_Arr = bsaSNPs[sm_sb_AD_REF].to_numpy(dtype=np.uint)

        __, __, bsaSNPs['sm_FE_P'] = pvalue_npy(sm_fb_AD_ALT_Arr, sm_fb_AD_REF_Arr, sm_sb_AD_ALT_Arr, sm_sb_AD_REF_Arr)
        # bsaSNPs['sm_FE_OR'] = (sm_fb_AD_ALT_Arr * sm_sb_AD_REF_Arr) / (sm_fb_AD_REF_Arr * sm_sb_AD_ALT_Arr)

        print(f'Fisher\'s exact test completed, time elapsed: {(time.time()-t0)/60} minutes.')

        print('Calculate thresholds of \u0394(allele frequency) and G-statistic.')
        bsaSNPs['STAT'] = bsaSNPs.apply(statistics, axis=1)
Beispiel #11
0
def fisher_exact(tp, fp, fn, tn, pseudocount=0):
    """Fisher's exact for contingency tables.

    Computes the hypotheses two-sided, less and greater at the same time.

    The odds-ratio is the odds of the top row (tp vs fp) divided by the odds
    of the bottom row (fn vs tn); see Notes for the exact expression.

    Parameters
    ----------
    tp : array-like of int

        Top left square of contingency table (true positives).

    fp : array-like of int

        Top right square of contingency table (false positives).

    fn : array-like of int

        Bottom left square of contingency table (false negatives).

    tn : array-like of int

        Bottom right square of contingency table (true negatives).

    pseudocount : float, default 0

        Values > 0 allow Odds Ratio to always be a finite number.

    Notes
    -----

    The odds-ratio is computed thusly:

    ``((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))``

    The pseudocount only affects the odds-ratio; p-values are computed on the
    raw counts.

    If the ``fisher`` package cannot be imported, an installation hint is
    printed and the process exits with status -1.

    Returns
    -------
    pandas.DataFrame

        DataFrame with columns OR and P, PLeft and PRight.

    See Also
    --------

    pr.stats.fdr : correct for multiple testing

    Examples
    --------

    Arguments are positional: in the call below the ``TN`` column is passed
    in the ``fn`` position and the ``FN`` column in the ``tn`` position.

    >>> d = {"TP": [12, 0], "FP": [5, 12], "TN": [29, 10], "FN": [2, 2]}
    >>> df = pd.DataFrame(d)
    >>> df
       TP  FP  TN  FN
    0  12   5  29   2
    1   0  12  10   2

    >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
             OR         P     PLeft    PRight
    0  0.165517  0.080269  0.044555  0.994525
    1  0.000000  0.000067  0.000034  1.000000
    """

    try:
        from fisher import pvalue_npy
    except ImportError:
        # Only a missing package should trigger the install hint; a bare
        # except would also swallow KeyboardInterrupt and unrelated errors.
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    # pvalue_npy is fed unsigned-integer arrays.
    tp = np.array(tp, dtype=np.uint)
    fp = np.array(fp, dtype=np.uint)
    fn = np.array(fn, dtype=np.uint)
    tn = np.array(tn, dtype=np.uint)

    left, right, twosided = pvalue_npy(tp, fp, fn, tn)

    OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) /
                                                      (tn + pseudocount))

    df = pd.DataFrame({
        "OR": OR,
        "P": twosided,
        "PLeft": left,
        "PRight": right
    })

    return df
Beispiel #12
0
def fisher_exact(n1, d1, n2, d2, pseudocount=0):
    """Fisher's exact for contingency tables.

    Computes the hypotheses two-sided, less and greater at the same time.

    The odds-ratio is the odds of the first group (n1 vs d1) divided by the
    odds of the second group (n2 vs d2); see Notes for the exact expression.

    Parameters
    ----------
    n1 : array-like of int

        Top left square of contingency table.

    d1 : array-like of int

        Bottom left square of contingency table.

    n2 : array-like of int

        Top right square of contingency table.

    d2 : array-like of int

        Bottom right square of contingency table.

    pseudocount : float, default 0

        Values > 0 allow Odds Ratio to always be a finite number.

    Notes
    -----

    The odds-ratio is computed thusly:

    ``((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) / (d2 + pseudocount))``

    so that it describes the same table the p-values are computed for
    (``n1`` is paired with ``d1`` and ``n2`` with ``d2``). The pseudocount
    only affects the odds-ratio, not the p-values.

    If the ``fisher`` package cannot be imported, an installation hint is
    printed and the process exits with status -1.

    Returns
    -------
    pandas.DataFrame

        DataFrame with columns OR and P, PLeft and PRight.

    See Also
    --------

    pr.stats.fdr : correct for multiple testing

    Examples
    --------

    >>> d = {"TP": [1, 0, 8], "FP": [11, 12, 1], "TN": [9, 10, 2], "FN": [3, 2, 5]}
    >>> df = pd.DataFrame(d)
    >>> df
       TP  FP  TN  FN
    0   1  11   9   3
    1   0  12  10   2
    2   8   1   2   5

    >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
              OR         P     PLeft    PRight
    0   0.030303  0.002759  0.001380  0.999966
    1   0.000000  0.000067  0.000034  1.000000
    2  20.000000  0.034965  0.999126  0.024476
    """

    try:
        from fisher import pvalue_npy
    except ImportError:
        # Only a missing package should trigger the install hint; a bare
        # except would also swallow KeyboardInterrupt and unrelated errors.
        import sys
        print(
            "fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher."
        )
        sys.exit(-1)

    # pvalue_npy is fed unsigned-integer arrays.
    n1 = np.array(n1, dtype=np.uint)
    n2 = np.array(n2, dtype=np.uint)
    d1 = np.array(d1, dtype=np.uint)
    d2 = np.array(d2, dtype=np.uint)

    left, right, twosided = pvalue_npy(n1, d1, n2, d2)

    # Pair each numerator with its own denominator; mixing n1 with d2 (and n2
    # with d1) yields a ratio whose direction can contradict the one-sided
    # p-values reported next to it.
    OR = ((n1 + pseudocount) / (d1 + pseudocount)) / ((n2 + pseudocount) /
                                                      (d2 + pseudocount))

    df = pd.DataFrame({
        "OR": OR,
        "P": twosided,
        "PLeft": left,
        "PRight": right
    })

    return df
Beispiel #13
0
    # Calculate simulated ALT reads for each SNP under null hypothesis
    snpDF[sm_fb_AD_ALT] = np.random.binomial(snpDF[fb_LD], fb_Freq)
    snpDF[sm_fb_AD_REF] = snpDF[fb_LD] - snpDF[sm_fb_AD_ALT]
    snpDF[sm_sb_AD_ALT] = np.random.binomial(snpDF[sb_LD], sb_Freq)
    snpDF[sm_sb_AD_REF] = snpDF[sb_LD] - snpDF[sm_sb_AD_ALT]

    try:
        from fisher import pvalue_npy
        # Create new columns for Fisher's exact test P-values and simulated P-values
        print('Perform Fisher\'s exact test.')
        fb_AD_ALT_Arr = snpDF[fb_AD_ALT].to_numpy(dtype=np.uint)
        fb_AD_REF_Arr = snpDF[fb_AD_REF].to_numpy(dtype=np.uint)
        sb_AD_ALT_Arr = snpDF[sb_AD_ALT].to_numpy(dtype=np.uint)
        sb_AD_REF_Arr = snpDF[sb_AD_REF].to_numpy(dtype=np.uint)

        __, __, snpDF['FE_P'] = pvalue_npy(fb_AD_ALT_Arr, fb_AD_REF_Arr,
                                           sb_AD_ALT_Arr, sb_AD_REF_Arr)
        # snpDF['FE_OR'] = (fb_AD_ALT_Arr * sb_AD_REF_Arr) / (fb_AD_REF_Arr * sb_AD_ALT_Arr)

        sm_fb_AD_ALT_Arr = snpDF[sm_fb_AD_ALT].to_numpy(dtype=np.uint)
        sm_fb_AD_REF_Arr = snpDF[sm_fb_AD_REF].to_numpy(dtype=np.uint)
        sm_sb_AD_ALT_Arr = snpDF[sm_sb_AD_ALT].to_numpy(dtype=np.uint)
        sm_sb_AD_REF_Arr = snpDF[sm_sb_AD_REF].to_numpy(dtype=np.uint)

        __, __, snpDF['sm_FE_P'] = pvalue_npy(sm_fb_AD_ALT_Arr,
                                              sm_fb_AD_REF_Arr,
                                              sm_sb_AD_ALT_Arr,
                                              sm_sb_AD_REF_Arr)
        # snpDF['sm_FE_OR'] = (sm_fb_AD_ALT_Arr * sm_sb_AD_REF_Arr) / (sm_fb_AD_REF_Arr * sm_sb_AD_ALT_Arr)

    except ImportError:
        from scipy.stats import fisher_exact