Exemple #1
def HC_sim(c1, c2, gamma=0.2, randomize=False, pval_thresh=1.1,
           HCtype='HCstar'):
    """
    Higher-Criticism (HC) similarity of two discrete samples.

    NOTE(review): the signature of this function was truncated after
    ``c1,`` in the file. The parameter list below was rebuilt from the
    names used in the body and from the sibling ``BJ_sim``; the default
    values are assumptions -- confirm against callers.

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : HC parameter
    randomize : randomized P-values or normalization
    pval_thresh : only use P-values below this value. Has no effect
                  if pval_thresh > 1.
    HCtype : which statistic to compute: 'HCstar' or 'original'

    Returns:
    -------
    HC statistic of the binomial allocation P-values of the two lists,
    or np.nan when no P-value falls below ``pval_thresh``.

    Raises:
    ------
    ValueError : if ``HCtype`` is neither 'HCstar' nor 'original'
                 (only checked when at least one P-value is retained).
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    if HCtype == 'HCstar':
        hc, _ = HC(pvals_red).HCstar(gamma=gamma)
    elif HCtype == 'original':
        hc, _ = HC(pvals_red).HC(gamma=gamma)
    else:
        # Without this ``else`` the error was raised for the valid
        # 'original' type as well.
        raise ValueError(f"{HCtype} is not a valid value for HCtype")
    return hc
Exemple #2
def visualize_HCT(pvals, stbl=True, gamma=.3) :
    """
    Plot the leading part of the HC curve of `pvals` and mark the
    location of the HC threshold.

    Args:
    -----
    pvals : P-values handed to ``HC``
    stbl : forwarded to ``HC`` as ``stbl``
    gamma : accepted for interface compatibility.
            NOTE(review): not used anywhere in this function.

    Draws ``_zz`` against ``_uu`` of the HC object on the current
    matplotlib axes and adds a dashed red vertical line at
    ``_uu[_istar]``. Returns nothing.
    """
    hc_obj = HC(pvals, stbl=stbl)
    i_star = hc_obj._istar

    # Show at most five times the threshold index, never more than _N points.
    upper = min(int(5 * i_star), hc_obj._N)
    xs = hc_obj._uu[:upper]
    ys = hc_obj._zz[:upper]

    plt.plot(xs, ys)
    plt.vlines(x=hc_obj._uu[i_star], ymin=0, ymax=max(ys),
               linestyles='dashed', color='red')
Exemple #3
    def __compute_HC(self, pvals):
        """
        Compute the HCstar statistic of the usable P-values in `pvals`.

        NaN entries and entries not below ``self._pval_thresh`` are
        discarded before the statistic is computed.

        Args:
        -----
        pvals : array of P-values (must support boolean-mask indexing)

        Returns:
        -------
        The pair returned by ``HC(...).HCstar(gamma=self._gamma)``, or
        ``(np.nan, np.nan)`` when no P-value survives the filtering.
        """
        # ``np.warnings`` was only an alias of the standard ``warnings``
        # module and is gone from recent NumPy releases; use the stdlib
        # module directly. As before, this silences warnings process-wide
        # (numpy warns when several P-values are NaN).
        import warnings
        warnings.filterwarnings('ignore')

        pv = pvals[~np.isnan(pvals)]
        pv = pv[pv < self._pval_thresh]

        if len(pv) > 0:
            hc = HC(pv, stbl=self._stbl)
            return hc.HCstar(gamma=self._gamma)

        # Fallback branch: previously unreachable because its ``else:``
        # was missing, so empty input silently returned None.
        logging.warning("Did not find any P-values.")
        return np.nan, np.nan
Exemple #4
    def test_cls_Poiss(self, cls_name, **kwrgs):
        """
        HC Test of one class against the rest. Returns HC value
        and indicates if feature is selected by HCT.

        Args:
        -----
        cls_name : name of the class; selects the columns
                   ``"<cls_name>:n"`` and ``"<cls_name>:T"`` of
                   ``self.counts_df``
        stbl, gamma : optional keyword overrides of ``self.stbl`` and
                      ``self.gamma``

        Returns:
        -------
        DataFrame with the two class columns plus:
          frequency : ``counts_df['n'] / counts_df['T']``
          pval      : result of ``binom_test_two_sided`` on the class
                      counts against ``frequency``
          HC        : HCstar statistic (same value in every row)
          thresh    : True where ``pval`` is below the HC threshold
          more      : sign of (class count - class total * frequency)
        """
        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        col_name_n = f"{cls_name}:n"
        col_name_T = f"{cls_name}:T"
        df1 = self.counts_df.filter([col_name_n, col_name_T])

        # observed feature frequency
        df1['frequency'] = self.counts_df['n'] / self.counts_df["T"]

        df1['pval'] = binom_test_two_sided(
            df1[col_name_n], df1[col_name_T],
            df1["frequency"])  # can appx by Poisson

        hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
        df1['HC'] = hc
        df1['thresh'] = df1['pval'] < thr
        df1['more'] = np.sign(df1[col_name_n] -
                              df1[col_name_T] * df1["frequency"])
        return df1
Exemple #5
def _hc_and_min_pval(pv, stbl, gamma):
    """Return (HC statistic, -log(min P-value)) of `pv`, or (nan, nan) if `pv` is empty."""
    if len(pv) == 0:
        return np.nan, np.nan
    # Only P-values strictly below 1 enter the HC statistic.
    hc, _ = HC(pv[pv < 1], stbl=stbl).HC(gamma=gamma)
    return hc, -np.log(pv.min())


def evaluate_iteration(n = 10, N = 10, ep = .1, mu = 1, xi = 0, metric = 'Hellinger') :
    """
    Run one simulation iteration and evaluate HC and minimal-P-value
    statistics, with and without randomized P-values.

    Args:
    -----
    n : multiplier applied to both distributions before sampling
    N : passed to ``power_law`` and used in the 'proportional' and
        'power' perturbations
    ep : passed to ``sample_from_mixture`` as its third argument
    mu : perturbation size for the 'Hellinger' and 'ChiSq' metrics
    xi : second argument of ``power_law``
    metric : one of 'Hellinger', 'ChiSq', 'proportional', 'power'

    Returns:
    -------
    dict with keys 'HC_NR', 'minPv_NR' (non-randomized P-values) and
    'HC', 'minPv' (randomized P-values). Entries are np.nan when no
    coordinate has a zero count in either sample.

    Raises:
    ------
    ValueError : if `metric` is not one of the supported names
                 (previously this surfaced later as an unbound-variable
                 error on ``QP``).
    """
    logging.debug(f"Evaluating with: n={n}, N={N}, ep={ep}, mu={mu}, xi={xi}")
    P = power_law(N, xi)

    if metric == 'Hellinger' :
        QP = (np.sqrt(P) + np.sqrt(mu))**2
    elif metric == 'ChiSq' :
        QP = P + 2 * np.sqrt(P * mu)
    elif metric == 'proportional' :
        # NOTE(review): `r` is not defined in this function; presumably a
        # module-level name -- confirm.
        QP = P *( 1 + r * np.log(N))
    elif metric == 'power' :
        # NOTE(review): same undefined-in-scope `r` as above.
        QP = P * (np.log(N) ** r)
    else :
        raise ValueError(f"{metric} is not a valid value for metric")

    smp1 = sample_from_mixture(n*P, n*QP, ep)
    smp2 = sample_from_mixture(n*P, n*QP, ep)

    stbl = False
    gamma = 0.25

    # Keep only coordinates where at least one of the samples is zero.
    zero_in_either = (smp1 == 0) | (smp2 == 0)

    pv = two_sample_pvals(smp1, smp2, randomize=True, sym=True)
    hc, MinPv = _hc_and_min_pval(pv[zero_in_either], stbl, gamma)

    pv_NR = two_sample_pvals(smp1, smp2, randomize=False)
    hc_NR, MinPvNR = _hc_and_min_pval(pv_NR[zero_in_either], stbl, gamma)

    return {'HC_NR' : hc_NR, 'minPv_NR' : MinPvNR,
            'HC' : hc, 'minPv' : MinPv}
    def __compute_HC(self, pvals):
        """
        Compute the HC statistic selected by ``self._HCtype`` from the
        usable P-values in `pvals`.

        NaN entries and entries not below ``self._pval_thresh`` are
        discarded before the statistic is computed.

        Args:
        -----
        pvals : array of P-values (must support boolean-mask indexing)

        Returns:
        -------
        The pair returned by ``HCstar`` (``_HCtype == 'HCstar'``) or by
        ``HC`` (``_HCtype == 'original'``) of the ``HC`` object, or
        ``(np.nan, np.nan)`` when no P-value survives the filtering.

        Raises:
        ------
        ValueError : if ``self._HCtype`` is not a supported value and at
                     least one P-value survives the filtering.
        """
        # ``np.warnings`` was only an alias of the standard ``warnings``
        # module and is gone from recent NumPy releases; use the stdlib
        # module directly. As before, this silences warnings process-wide
        # (numpy warns when several P-values are NaN).
        import warnings
        warnings.filterwarnings('ignore')

        pv = pvals[~np.isnan(pvals)]
        pv = pv[pv < self._pval_thresh]

        if len(pv) == 0:
            # Previously unreachable: its ``else:`` was missing, so empty
            # input silently returned None.
            logging.warning("Did not find any P-values.")
            return np.nan, np.nan

        if self._HCtype == 'HCstar':
            return HC(pv, stbl=self._stbl).HCstar(gamma=self._gamma)
        if self._HCtype == 'original':
            return HC(pv, stbl=self._stbl).HC(gamma=self._gamma)

        # The message used the undefined name ``HCtype``, which would have
        # raised NameError instead of the intended ValueError.
        raise ValueError(
            f"{self._HCtype} is not a valid value for HCtype")
Exemple #7
    def test_doc(self, doc, of_cls=None, **kwrgs):
        """
        Test a new document against existing documents by combining
        binomial allocation P-values from each document.

        Args:
        -----
        :doc:     dataframe representing terms in the tested doc
        :of_cls:  use this to indicate that the tested document is already
                  represented by one of the classes in the model
        :stbl:    type of HC statistic to use (defaults to ``self.stbl``)
        :gamma:   parameter of HC statistic (defaults to ``self.gamma``)

        Returns:
        -------
        ``self.counts_df`` joined with the tested document's counts
        ('tested:n', 'tested:T') and, for every class ``cls`` in
        ``self.cls_names``, the columns '<cls>:pval', '<cls>:score',
        '<cls>:Fisher', '<cls>:HC', '<cls>:chisq' and '<cls>:affinity'.
        """
        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        dfi = self.count_words(doc)
        logging.debug(f"Doc contains {dfi.n.sum()} terms.")
        df = self.counts_df
        assert (
            len(df) == len(dfi)), "count_words must use the same vocabulary"

        dfi['tested:T'] = dfi.n.sum()
        dfi = dfi.rename(columns={'n': 'tested:n'})
        df = df.join(dfi, how='left')

        for cls in self.cls_names:
            cnt1 = df['tested:n'].astype(int)
            cnt2 = df[f'{cls}:n'].astype(int)
            if of_cls == cls:  # if tested document is already represented in
                # corpus, remove its counts to get a meaningful
                # comparison.
                # NOTE(review): the call wrapping this message was lost in
                # the file; ``logging.debug`` is assumed.
                logging.debug(
                    f"Doc is of {of_cls}. Evaluating in a Leave-out manner.")
                cnt2 -= cnt1
                assert (np.all(cnt2 >= 0))

            if cnt1.sum() + cnt2.sum() > 0:
                pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)
            else:
                # No counts at all: nothing to test, propagate NaN.
                # (Without the ``else`` the real P-values were always
                # overwritten by NaN.)
                pv, p = cnt1 * np.nan, cnt1 * np.nan

            df[f'{cls}:pval'] = pv
            df[f'{cls}:score'] = -2 * np.log(df[f'{cls}:pval'])
            df[f'{cls}:Fisher'] = df[f'{cls}:score'].mean()
            df[f'{cls}:HC'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
            df[f'{cls}:chisq'] = two_sample_chi_square(cnt1, cnt2)[0]
            # Negated sign of (tested count - pooled count * p), kept only
            # for features whose P-value is below the HC threshold.
            more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
            thresh = pv < pth
            df[f'{cls}:affinity'] = more * thresh

        return df
    def HCT(self, gamma=.2, stbl=True):
        """
        Return results after applying HC threshold to fitted data.
        Report whether a feature is selected by HC threshold.

        Args:
        -----
        gamma : parameter of the HC statistic, forwarded to ``HCstar``
        stbl : forwarded to ``HC`` as ``stbl``

        Returns:
        -------
        The DataFrame from ``self.get_pvals()`` with two added columns:
          HC     : HCstar statistic of the 'pval' column (same value in
                   every row)
          thresh : True where 'pval' is below the HC threshold
        """
        df = self.get_pvals()

        hc, thr = HC(df['pval'], stbl=stbl).HCstar(gamma=gamma)
        df['HC'] = hc
        df['thresh'] = df['pval'] < thr
        return df
Exemple #9
    def HCT(self, **kwrgs):
        """
        Apply HC threshold to fitted data.
        Report whether a feature is selected by HC threshold.

        Args:
        -----
        stbl, gamma : optional keyword overrides of ``self.stbl`` and
                      ``self.gamma``; other keywords are ignored

        Returns:
        -------
        The DataFrame from ``self.get_pvals()`` with two added columns:
          HC     : HCstar statistic of the 'pval' column (same value in
                   every row)
          thresh : True where 'pval' is below the HC threshold
        """
        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        df = self.get_pvals()

        hc, thr = HC(df['pval'], stbl=stbl).HCstar(gamma=gamma)
        df['HC'] = hc
        df['thresh'] = df['pval'] < thr
        return df
    def test_cls(self, cls_name, gamma=.2, stbl=True):
        """
        HC Test of one class against the rest. Returns HC value
        and indicates if feature is selected by HCT.

        Args:
        -----
        cls_name : name of the class; selects the columns
                   ``"n (<cls_name>)"`` and ``"T (<cls_name>)"`` of
                   ``self.counts_df``
        gamma : parameter of the HC statistic, forwarded to ``HCstar``
        stbl : forwarded to ``HC`` as ``stbl``

        Returns:
        -------
        DataFrame with the two class columns plus:
          n (rest), T (rest) : overall 'n' / 'T' minus the class columns
          pval   : ``two_sample_pvals`` of class counts vs. rest counts
          HC     : HCstar statistic (same value in every row)
          thresh : True where 'pval' is below the HC threshold
          more   : sign of (class rate n/T - rest rate n/T)
        """
        col_name_n = f"n ({cls_name})"
        col_name_T = f"T ({cls_name})"
        df1 = self.counts_df.filter([col_name_n, col_name_T])
        df1['n (rest)'] = self.counts_df['n'] - df1[col_name_n]
        df1['T (rest)'] = self.counts_df["T"] - df1[col_name_T]

        df1['pval'] = two_sample_pvals(df1[col_name_n], df1["n (rest)"])

        hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
        df1['HC'] = hc
        df1['thresh'] = df1['pval'] < thr
        df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T]
                              - df1['n (rest)'] / df1['T (rest)'])
        return df1
    def test_doc(self, doc, of_cls=None, stbl=True, gamma=.2):
        """
        Test a new document against existing documents by combining
        binomial allocation P-values from each document.

        Args:
        -----
        doc : document handed to ``self.count_words``
        of_cls : name of the class the tested document already belongs
                 to, if any; its counts are then removed from that class
                 before the comparison
        stbl : forwarded to ``HC`` as ``stbl``
        gamma : parameter of the HC statistic, forwarded to ``HCstar``

        Returns:
        -------
        ``self.counts_df`` joined with the tested document's counts
        ('n (test)', 'T (test)') and, for every class ``cls`` in
        ``self.cls_names``, the columns 'pval (<cls>)', 'score (<cls>)',
        'HC (<cls>)' and 'affinity (<cls>)'.
        """
        dfi = self.count_words(doc)

        logging.debug(f"Doc contains {dfi.n.sum()} terms.")

        df = self.counts_df

        dfi['T (test)'] = dfi.n.sum()
        dfi = dfi.rename(columns={'n': 'n (test)'})
        df = df.join(dfi, how='left')

        for cls in self.cls_names:
            cnt1 = df['n (test)'].astype(int)
            cnt2 = df[f'n ({cls})'].astype(int)
            if of_cls == cls:  # if tested document is already represented in
                # corpus, remove its counts to get a meaningful
                # comparison.
                # NOTE(review): the call wrapping this message was lost in
                # the file; ``logging.debug`` is assumed.
                logging.debug(
                    f"Doc is of {of_cls}. Evaluating in Leave-our manner.")
                print(f"Doc is of {of_cls}. Evaluating in Leave-our manner.")
                cnt2 -= cnt1

            pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)

            df[f'pval ({cls})'] = pv
            df[f'score ({cls})'] = -2 * np.log(df[f'pval ({cls})'])
            df[f'HC ({cls})'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
            # Negated sign of (tested count - pooled count * p), kept only
            # for features whose P-value is below the HC threshold.
            more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
            thresh = pv < pth
            df[f'affinity ({cls})'] = more * thresh

        return df
Exemple #12
    def test_cls(self, cls_name, **kwrgs):
        """
        HC Test of one class against the rest. Returns HC value
        and indicates if feature is selected by HCT.

        Args:
        -----
        cls_name : name of the class; selects the columns
                   ``"<cls_name>:n"`` and ``"<cls_name>:T"`` of
                   ``self.counts_df``
        stbl, gamma : optional keyword overrides of ``self.stbl`` and
                      ``self.gamma``

        Returns:
        -------
        DataFrame with the two class columns plus:
          rest:n, rest:T : overall 'n' / 'T' minus the class columns
          pval   : ``two_sample_pvals`` of class counts vs. rest counts
          HC     : HCstar statistic (same value in every row)
          thresh : True where 'pval' is below the HC threshold
          more   : sign of (class rate n/T - rest rate n/T)
        """
        stbl = kwrgs.get('stbl', self.stbl)
        gamma = kwrgs.get('gamma', self.gamma)

        col_name_n = f"{cls_name}:n"
        col_name_T = f"{cls_name}:T"
        df1 = self.counts_df.filter([col_name_n, col_name_T])
        df1['rest:n'] = self.counts_df['n'] - df1[col_name_n]
        df1['rest:T'] = self.counts_df["T"] - df1[col_name_T]

        df1['pval'] = two_sample_pvals(df1[col_name_n], df1["rest:n"])

        hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
        df1['HC'] = hc
        df1['thresh'] = df1['pval'] < thr
        df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T]
                              - df1['rest:n'] / df1['rest:T'])
        return df1
Exemple #13
def BJ_sim(c1, c2, gamma=0.1, randomize=False, pval_thresh=1.1):
    """
    Berk-Jones (BJ) similarity of two discrete samples.

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : lower fraction of P-values
    randomize : randomized P-values or normalization
    pval_thresh : only use P-values below this value. Has no effect
                  if pval_thresh > 1.

    Returns:
    -------
    BJ statistic of the binomial allocation P-values of the two lists,
    or np.nan when no P-value falls below ``pval_thresh``.
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    bj, _ = HC(pvals_red).BJ(gamma=gamma)
    return bj