Code Example #1
def HC_sim(c1,
           c2,
           gamma=0.15,
           randomize=False,
           pval_thresh=1.1,
           HCtype='HCstar'):
    """
    Higher-Criticism (HC) similarity of two discrete samples

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : HC parameter (lower fraction of P-values to consider)
    randomize : whether to use randomized P-values
    pval_thresh : only use P-values below this value. Has no effect
                  if pval_thresh > 1.

    Returns: 
    -------
    HC statistic (HCstar or original HC, depending on HCtype) of the
    binomial allocation P-values of the two lists
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    if HCtype == 'HCstar':
        hc, _ = HC(pvals_red).HCstar(gamma=gamma)
    elif HCtype == 'original':
        hc, _ = HC(pvals_red).HC(gamma=gamma)
    else:
        raise ValueError(f"{HCtype} is not a valid value for HCtype")
    return hc
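
A minimal usage sketch for HC_sim. It assumes numpy is imported as np and that the two_sample_pvals helper and the HC class used above are in scope; the count vectors are made up for illustration.

# Hypothetical word counts of two documents over a shared 6-term vocabulary
c1 = [12, 0, 3, 7, 0, 5]
c2 = [9, 2, 0, 8, 1, 4]

sim_star = HC_sim(c1, c2, gamma=0.15)           # default HC* variant
sim_orig = HC_sim(c1, c2, HCtype='original')    # original HC variant
print(sim_star, sim_orig)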
Code Example #2
def evaluate_iteration(n=10, N=10, ep=.1, mu=1, xi=0, r=1, metric='Hellinger'):
    # r is needed only for the 'proportional' and 'power' metrics
    # (its default here is arbitrary).
    logging.debug(f"Evaluating with: n={n}, N={N}, ep={ep}, mu={mu}, xi={xi}")
    P = power_law(N, xi)

    if metric == 'Hellinger':
        QP = (np.sqrt(P) + np.sqrt(mu))**2
    elif metric == 'ChiSq':
        QP = P + 2 * np.sqrt(P * mu)
    elif metric == 'proportional':
        QP = P * (1 + r * np.log(N))
    elif metric == 'power':
        QP = P * (np.log(N) ** r)
    else:
        raise ValueError(f"{metric} is not a valid value for metric")

    smp1 = sample_from_mixture(n*P, n*QP, ep)
    smp2 = sample_from_mixture(n*P, n*QP, ep)

    stbl = False
    gamma = 0.25

    pv = two_sample_pvals(smp1, smp2, randomize=True, sym=True)
    pv = pv[(smp1 == 0) | (smp2 == 0)]

    if len(pv) > 0 :
        hc, _ = HC(pv[pv < 1], stbl=stbl).HC(gamma=gamma)
        MinPv = -np.log(pv.min())
    else :
        print("empty")
        hc = np.nan
        MinPv = np.nan

    pv_NR = two_sample_pvals(smp1, smp2, randomize=False)
    pv_NR = pv_NR[(smp1 == 0) | (smp2 == 0)]
    
    if len(pv_NR) > 0 :
        hc_NR, _ = HC(pv_NR[pv_NR < 1], stbl=stbl).HC(gamma=gamma)
        MinPvNR = -np.log(pv_NR.min())
    else :
        print("empty")
        hc_NR = np.nan
        MinPvNR = np.nan

    return {'HC_NR' : hc_NR, 'minPv_NR' : MinPvNR,
            'HC' : hc, 'minPv' : MinPv}
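
A sketch of how the iteration above might be driven, assuming pandas is imported as pd and the helper functions used above (power_law, sample_from_mixture, two_sample_pvals, HC) are in scope; the parameter values are illustrative only.

# Collect several Monte-Carlo repetitions into one DataFrame
results = [evaluate_iteration(n=1000, N=500, ep=0.05, mu=0.01, metric='Hellinger')
           for _ in range(20)]
res_df = pd.DataFrame(results)
print(res_df[['HC', 'HC_NR', 'minPv', 'minPv_NR']].mean())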
Code Example #3
    def test_doc(self, doc, of_cls=None, **kwrgs):
        """
        Test a new document against existing documents by combining
        binomial allocation P-values from each document. 
        
        Params:
        :doc:     dataframe representing terms in the tested doc
        :of_cls:  use this to indicate that the tested document is already
                represented by one of the classes in the model
        :stbl:    type of HC statistic to use
        :gamma:   parameter of HC statistic
        """

        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        dfi = self.count_words(doc)
        logging.debug(f"Doc contains {dfi.n.sum()} terms.")
        df = self.counts_df
        assert (
            len(df) == len(dfi)), "count_words must use the same vocabulary"

        dfi['tested:T'] = dfi.n.sum()
        dfi = dfi.rename(columns={'n': 'tested:n'})
        df = df.join(dfi, how='left')

        for cls in self.cls_names:
            cnt1 = df['tested:n'].astype(int)
            cnt2 = df[f'{cls}:n'].astype(int)
            if of_cls == cls:  # if tested document is already represented in
                # corpus, remove its counts to get a meaningful
                # comparison.
                logging.debug(
                    f"Doc is of {of_cls}. Evaluating in a Leave-out manner.")
                cnt2 -= cnt1
                assert (np.all(cnt2 >= 0))

            if cnt1.sum() + cnt2.sum() > 0:
                pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)
            else:
                pv, p = cnt1 * np.nan, cnt1 * np.nan

            df[f'{cls}:pval'] = pv
            df[f'{cls}:score'] = -2 * np.log(df[f'{cls}:pval'])
            df[f'{cls}:Fisher'] = df[f'{cls}:score'].mean()
            df[f'{cls}:HC'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
            df[f'{cls}:chisq'] = two_sample_chi_square(cnt1, cnt2)[0]
            more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
            thresh = pv < pth
            df[f'{cls}:affinity'] = more * thresh

        return df
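
A usage sketch for test_doc. Here `model` stands for a fitted instance of the surrounding class (a hypothetical name) whose counts_df and cls_names are already populated, and `doc` is a DataFrame accepted by count_words.

res = model.test_doc(doc, gamma=0.2, stbl=True)
for cls in model.cls_names:
    # per-class HC and Fisher scores of the tested document
    print(cls, res[f'{cls}:HC'].iloc[0], res[f'{cls}:Fisher'].iloc[0])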
Code Example #4
    def two_sample_pvals_loc(c1,
                             c2,
                             randomize=False,
                             min_cnt=0,
                             pval_type='cell',
                             max_m=-1):
        """
        Compute two-sample P-values of the requested type:
        'stripe' -- stripe P-values via binom_var_test;
        'cell'   -- exact cell P-values via two_sample_pvals, keeping only
                    cells with c1 + c2 >= min_cnt;
        any other value -- both kinds, concatenated.
        """

        if pval_type == 'stripe':
            logging.debug('Computing stripe P-values.')
            return binom_var_test(c1, c2, max_m=max_m).values

        if pval_type == 'cell':
            logging.debug('Computing cell P-values.')
            pv_exact = two_sample_pvals(c1, c2, randomize=randomize)
            return pv_exact[c1 + c2 >= min_cnt]

        logging.debug('Computing cell and stripe P-values.')
        pv_bin_var = binom_var_test(c1, c2).values
        pv_exact = two_sample_pvals(c1, c2, randomize=randomize)
        pv_exact = pv_exact[c1 + c2 >= min_cnt]

        pv_all = np.concatenate([pv_bin_var, pv_exact])
        return pv_all
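
A usage sketch, assuming the function is in scope, numpy is imported as np, and c1, c2 are integer count arrays of equal length (made-up values below).

c1 = np.array([5, 0, 3, 11, 2])
c2 = np.array([4, 1, 0, 13, 2])

pv_cell = two_sample_pvals_loc(c1, c2, pval_type='cell', min_cnt=2)
pv_stripe = two_sample_pvals_loc(c1, c2, pval_type='stripe', max_m=3)
pv_both = two_sample_pvals_loc(c1, c2, pval_type='both')  # any other value returns both kinds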
Code Example #5
    def get_pvals(self):
        if self.num_of_cls < 2:
            logging.error("Not enough columns.")
            return np.nan
        df = self.counts_df.copy()
        if self.num_of_cls > 2:
            logging.info("Using multinomial tests. May be slow.")

            df['x'] = df.filter(regex=r":n$").to_records(index=False).tolist()
            df['p'] = df.filter(regex=r":T$").to_records(index=False).tolist()
            pv = df.apply(lambda r: multinomial_test(r['x'], r['p']), axis=1)

        else:  # self.num_of_cls == 2
            logging.info("Using binomial tests.")
            pv = two_sample_pvals(df[f"{self.cls_names[0]}:n"],
                                  df[f"{self.cls_names[1]}:n"])

        df['pval'] = pv
        return df
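
A usage sketch, with `model` standing for an instance of the surrounding class (a hypothetical name) that has at least two classes in counts_df.

df_pv = model.get_pvals()
print(df_pv['pval'].sort_values().head(10))  # smallest per-feature P-values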
Code Example #6
    def test_cls(self, cls_name, gamma=.2, stbl=True):
        """
        HC Test of one class against the rest. Returns HC value 
        and indicates if feature is selected by HCT

        """
        col_name_n = f"n ({cls_name})"
        col_name_T = f"T ({cls_name})"
        df1 = self.counts_df.filter([col_name_n, col_name_T])
        df1['n (rest)'] = self.counts_df['n'] - df1[col_name_n]
        df1['T (rest)'] = self.counts_df["T"] - df1[col_name_T]

        df1['pval'] = two_sample_pvals(df1[col_name_n], df1["n (rest)"])

        hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
        df1['HC'] = hc
        df1['thresh'] = df1['pval'] < thr
        df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T] \
                                - df1['n (rest)'] / df1['T (rest)'])
        return df1
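
A usage sketch, again with `model` as a hypothetical instance of the surrounding class; 'cls1' is a made-up class name.

res = model.test_cls('cls1', gamma=0.2, stbl=True)
# features passing the HC threshold and over-represented in 'cls1'
selected = res[res['thresh'] & (res['more'] > 0)]
print(res['HC'].iloc[0], len(selected))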
Code Example #7
    def test_doc(self, doc, of_cls=None, stbl=True, gamma=.2):
        """
        Test a new document against existing documents by combining
        binomial allocation P-values from each document. 

        """

        dfi = self.count_words(doc)

        logging.debug(f"Doc contains {dfi.n.sum()} terms.")

        df = self.counts_df

        dfi['T (test)'] = dfi.n.sum()
        dfi = dfi.rename(columns={'n': 'n (test)'})
        df = df.join(dfi, how='left')

        for cls in self.cls_names:
            cnt1 = df['n (test)'].astype(int)
            cnt2 = df[f'n ({cls})'].astype(int)
            if of_cls == cls:  # if tested document is already represented in
                # corpus, remove its counts to get a meaningful
                # comparison.
                logging.debug(
                    f"Doc is of {of_cls}. Evaluating in a Leave-out manner.")
                cnt2 -= cnt1

            pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)

            df[f'pval ({cls})'] = pv
            df[f'score ({cls})'] = -2 * np.log(df[f'pval ({cls})'])
            df[f'HC ({cls})'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
            more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
            thresh = pv < pth
            df[f'affinity ({cls})'] = more * thresh

        return df
Code Example #8
    def test_cls(self, cls_name, **kwrgs):
        """
        HC Test of one class against the rest. Returns HC value 
        and indicates if feature is selected by HCT

        """
        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        col_name_n = f"{cls_name}:n"
        col_name_T = f"{cls_name}:T"
        df1 = self.counts_df.filter([col_name_n, col_name_T])
        df1['rest:n'] = self.counts_df['n'] - df1[col_name_n]
        df1['rest:T'] = self.counts_df["T"] - df1[col_name_T]

        df1['pval'] = two_sample_pvals(df1[col_name_n], df1["rest:n"])

        hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
        df1['HC'] = hc
        df1['thresh'] = df1['pval'] < thr
        df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T] \
                              - df1['rest:n'] / df1['rest:T'])
        return df1
Code Example #9
def BJ_sim(c1, c2, gamma=0.1, randomize=False, pval_thresh=1.1):
    """
    Berk-Jones (BJ) similarity of two discrete samples

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : lower fraction of P-values to consider
    randomize : whether to use randomized P-values
    pval_thresh : only use P-values below this value. Has no effect
                  if pval_thresh > 1.

    Returns:
    -------
    BJ statistic of the binomial allocation P-values of the two lists
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    bj, _ = HC(pvals_red).BJ(gamma=gamma)
    return bj
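
A usage sketch comparing BJ_sim with HC_sim (from the first example) on the same made-up counts, assuming both functions and their helpers are in scope.

c1 = [12, 0, 3, 7, 0, 5]
c2 = [9, 2, 0, 8, 1, 4]

print("HC:", HC_sim(c1, c2, gamma=0.15))
print("BJ:", BJ_sim(c1, c2, gamma=0.1))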
Code Example #10
def evaluate_iteration(a, xi, r, be, n, nMonte=10, metric='Hellinger'):
    N = int(n ** (1/a))
    P = power_law(N, xi)
    print(f"r = {r}, beta = {be}, a = {a}, xi = {xi}, n = {n}")
    ep = N ** (-be)
    mu = r * np.log(N) / n / 2
    
    df = pd.DataFrame()
    for iM in range(nMonte) :
        
        TH1 = np.random.rand(N) < ep/2
        TH2 = np.random.rand(N) < ep/2

        if metric == 'Hellinger':
            QP = (np.sqrt(P) + np.sqrt(mu))**2
        elif metric == 'ChiSq':
            QP = P + 2 * np.sqrt(P * mu)
        elif metric == 'proportional':
            QP = P * (1 + r * np.log(N))
        elif metric == 'power':
            QP = P * (np.log(N) ** r)
        else:
            raise ValueError(f"{metric} is not a valid value for metric")

        Q1 = P.copy()
        Q1[TH1] = QP[TH1]
        Q1 = Q1 / Q1.sum()

        Q2 = P.copy()
        Q2[TH2] = QP[TH2]
        Q2 = Q2 / Q2.sum()

        smp_P = np.random.multinomial(n, Q1)
        smp_Q = np.random.multinomial(n, Q2)

        stbl = False
        gamma = 0.25

        pv = two_sample_pvals(smp_Q, smp_P, randomize=True, sym=True)
        pv = pv[(smp_Q == 0) | (smp_P == 0)]  # keep cells where one of the samples is zero
        hc, p_th = hc_vals(pv[pv < 1], gamma=gamma, stbl=stbl, minPv=0)

        pv_NR = two_sample_pvals(smp_Q, smp_P, randomize=False)
        hc_NR, _ = hc_vals(pv_NR[pv_NR < 1], gamma=gamma, stbl=stbl, minPv=0)

        MinPv = -np.log(pv.min())
        MinPvNR = -np.log(pv_NR.min())

        dfr = pd.DataFrame({'r': [r], 'beta': [be], 'a': [a],
                            'xi': [xi], 'N': [N], 'n': [n],
                            'metric': metric,
                            'nMonte': nMonte,
                            'HC_NR': hc_NR,
                            'minPv_NR': MinPvNR,
                            'HC': hc,
                            'minPv': MinPv})
        df = pd.concat([df, dfr], ignore_index=True)

    return df