def HC_sim(c1, c2, gamma=0.15, randomize=False, pval_thresh=1.1,
           HCtype='HCstar'):
    """
    Higher-Criticism (HC) similarity of two discrete samples

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : HC parameter
    randomize : use randomized P-values
    pval_thresh : only use P-values below this value.
                  Has no effect if pval_thresh > 1.
    HCtype : 'HCstar' (default) or 'original'

    Returns:
    -------
    HC statistic of the binomial allocation P-values of the two lists
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    if HCtype == 'HCstar':
        hc, _ = HC(pvals_red).HCstar(gamma=gamma)
    elif HCtype == 'original':
        hc, _ = HC(pvals_red).HC(gamma=gamma)
    else:
        raise ValueError(f"{HCtype} is not a valid value for HCtype")
    return hc
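# Usage sketch (illustrative): compare two toy count vectors over the same
# vocabulary. The numbers below are made-up; real inputs would be word counts
# of two documents aligned to a shared vocabulary.
def _demo_HC_sim():
    c1 = [12, 0, 4, 3, 1, 0, 9, 2]
    c2 = [10, 2, 0, 3, 2, 1, 8, 0]
    print("HC similarity:", HC_sim(c1, c2, gamma=0.15))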
def visualize_HCT(pvals, stbl=True, gamma=.3):
    """
    Plot the standardized P-value process used by HC and mark the
    location of the HC threshold with a dashed red line.
    """
    hc = HC(pvals, stbl=stbl)
    hc.HCstar(gamma=gamma)
    n_max = min(int(5 * hc._istar), hc._N)
    plt.plot(hc._uu[:n_max], hc._zz[:n_max])
    plt.vlines(x=hc._uu[hc._istar], ymin=0, ymax=max(hc._zz[:n_max]),
               linestyles='dashed', color='red')
    plt.show()
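# Usage sketch (illustrative): visualize the HC threshold on synthetic
# P-values, mostly uniform with a few planted small values.
def _demo_visualize_HCT():
    rng = np.random.default_rng(42)
    pvals = rng.uniform(size=1000)
    pvals[:20] = rng.uniform(size=20) * 1e-4  # plant a few strong effects
    visualize_HCT(pvals, stbl=True, gamma=.3)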
def __compute_HC(self, pvals):
    # numpy emits a warning when more than one P-value is np.nan;
    # temporarily suppress it while filtering.
    np.warnings.filterwarnings('ignore')
    pv = pvals[~np.isnan(pvals)]
    pv = pv[pv < self._pval_thresh]
    np.warnings.filterwarnings('always')

    if len(pv) > 0:
        hc = HC(pv, stbl=self._stbl)
        return hc.HCstar(gamma=self._gamma)
    else:
        logging.warning("Did not find any P-values.")
        return np.nan, np.nan
def test_cls_Poiss(self, cls_name, **kwrgs):
    """
    HC test of one class against the pooled corpus frequency.
    Returns HC value and indicates whether each feature is selected
    by the HC threshold.
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    col_name_n = f"{cls_name}:n"
    col_name_T = f"{cls_name}:T"
    df1 = self.counts_df.filter([col_name_n, col_name_T])

    # observed feature frequency in the entire corpus
    df1['frequency'] = self.counts_df['n'] / self.counts_df["T"]
    # exact binomial P-values (well approximated by Poisson)
    df1['pval'] = binom_test_two_sided(df1[col_name_n],
                                       df1[col_name_T],
                                       df1["frequency"])

    hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
    df1['HC'] = hc
    df1['thresh'] = df1['pval'] < thr
    df1['more'] = np.sign(df1[col_name_n]
                          - df1[col_name_T] * df1["frequency"])
    return df1
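# Usage sketch (hypothetical names): `model` stands for a fitted instance of
# the enclosing class with a populated counts_df; 'author1' is a made-up
# class name.
#
#   res = model.test_cls_Poiss('author1', gamma=.2)
#   selected = res[res['thresh']]  # features flagged by the HC threshold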
def evaluate_iteration(n=10, N=10, ep=.1, mu=1, xi=0, metric='Hellinger'):
    logging.debug(f"Evaluating with: n={n}, N={N}, ep={ep}, mu={mu}, xi={xi}")

    P = power_law(N, xi)
    # perturbed frequencies under the chosen discrepancy metric
    if metric == 'Hellinger':
        QP = (np.sqrt(P) + np.sqrt(mu)) ** 2
    elif metric == 'ChiSq':
        QP = P + 2 * np.sqrt(P * mu)
    elif metric == 'proportional':
        QP = P * (1 + mu * np.log(N))
    elif metric == 'power':
        QP = P * (np.log(N) ** mu)
    else:
        raise ValueError(f"{metric} is not a valid value for metric")

    smp1 = sample_from_mixture(n * P, n * QP, ep)
    smp2 = sample_from_mixture(n * P, n * QP, ep)

    stbl = False
    gamma = 0.25

    pv = two_sample_pvals(smp1, smp2, randomize=True, sym=True)
    pv = pv[(smp1 == 0) | (smp2 == 0)]
    if len(pv) > 0:
        hc, _ = HC(pv[pv < 1], stbl=stbl).HC(gamma=gamma)
        MinPv = -np.log(pv.min())
    else:
        logging.warning("No P-values from randomized test.")
        hc = np.nan
        MinPv = np.nan

    pv_NR = two_sample_pvals(smp1, smp2, randomize=False)
    pv_NR = pv_NR[(smp1 == 0) | (smp2 == 0)]
    if len(pv_NR) > 0:
        hc_NR, _ = HC(pv_NR[pv_NR < 1], stbl=stbl).HC(gamma=gamma)
        MinPvNR = -np.log(pv_NR.min())
    else:
        logging.warning("No P-values from non-randomized test.")
        hc_NR = np.nan
        MinPvNR = np.nan

    return {'HC_NR': hc_NR, 'minPv_NR': MinPvNR,
            'HC': hc, 'minPv': MinPv}
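# Usage sketch (illustrative parameters): run one simulated two-sample
# iteration per mixture proportion and collect the statistics for inspection.
def _demo_evaluate_iteration():
    results = [evaluate_iteration(n=100, N=1000, ep=ep, mu=1,
                                  metric='Hellinger')
               for ep in (.01, .05, .1)]
    print(pd.DataFrame(results))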
def __compute_HC(self, pvals):
    # numpy emits a warning when more than one P-value is np.nan;
    # temporarily suppress it while filtering.
    np.warnings.filterwarnings('ignore')
    pv = pvals[~np.isnan(pvals)]
    pv = pv[pv < self._pval_thresh]
    np.warnings.filterwarnings('always')

    if len(pv) > 0:
        if self._HCtype == 'HCstar':
            return HC(pv, stbl=self._stbl).HCstar(gamma=self._gamma)
        elif self._HCtype == 'original':
            return HC(pv, stbl=self._stbl).HC(gamma=self._gamma)
        else:
            raise ValueError(
                f"{self._HCtype} is not a valid value for HCtype")
    else:
        logging.warning("Did not find any P-values.")
        return np.nan, np.nan
def test_doc(self, doc, of_cls=None, **kwrgs):
    """
    Test a new document against the model by combining the binomial
    allocation P-values of each class.

    Params:
    :doc:     dataframe representing terms in the tested doc
    :of_cls:  indicates that the tested document is already
              represented by one of the classes in the model
    :stbl:    type of HC statistic to use
    :gamma:   parameter of HC statistic
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    dfi = self.count_words(doc)
    logging.debug(f"Doc contains {dfi.n.sum()} terms.")
    df = self.counts_df
    assert len(df) == len(dfi), "count_words must use the same vocabulary"

    dfi['tested:T'] = dfi.n.sum()
    dfi = dfi.rename(columns={'n': 'tested:n'})
    df = df.join(dfi, how='left')

    for cls in self.cls_names:
        cnt1 = df['tested:n'].astype(int)
        cnt2 = df[f'{cls}:n'].astype(int)
        if of_cls == cls:
            # if the tested document is already represented in the
            # corpus, remove its counts to get a meaningful comparison
            logging.debug(
                f"Doc is of {of_cls}. Evaluating in a leave-out manner.")
            cnt2 -= cnt1
            assert np.all(cnt2 >= 0)
        if cnt1.sum() + cnt2.sum() > 0:
            pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)
        else:
            pv, p = cnt1 * np.nan, cnt1 * np.nan

        df[f'{cls}:pval'] = pv
        df[f'{cls}:score'] = -2 * np.log(df[f'{cls}:pval'])
        df[f'{cls}:Fisher'] = df[f'{cls}:score'].mean()
        df[f'{cls}:HC'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
        df[f'{cls}:chisq'] = two_sample_chi_square(cnt1, cnt2)[0]
        more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
        thresh = pv < pth
        df[f'{cls}:affinity'] = more * thresh
    return df
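# Usage sketch (hypothetical names): `model` stands for a fitted instance of
# the enclosing class and `new_doc` for a term dataframe produced the same
# way as the training documents. Smaller HC means smaller discrepancy.
#
#   res = model.test_doc(new_doc)
#   hc_cols = [f'{cls}:HC' for cls in model.cls_names]
#   closest = res[hc_cols].iloc[0].idxmin()  # class with the smallest HC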
def HCT(self, gamma=.2, stbl=True):
    """
    Return results after applying the HC threshold to fitted data.
    Reports whether each feature is selected by the HC threshold.
    """
    df = self.get_pvals()
    hc, thr = HC(df['pval'], stbl=stbl).HCstar(gamma=gamma)
    df['HC'] = hc
    df['thresh'] = df['pval'] < thr
    return df
def HCT(self, **kwrgs):
    """
    Apply the HC threshold to fitted data.
    Reports whether each feature is selected by the HC threshold.
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    df = self.get_pvals()
    hc, thr = HC(df['pval'], stbl=stbl).HCstar(gamma=gamma)
    df['HC'] = hc
    df['thresh'] = df['pval'] < thr
    return df
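# Usage sketch (hypothetical name): `model` stands for a fitted instance of
# the enclosing class.
#
#   res = model.HCT(gamma=.2)
#   print(res.loc[res['thresh']])  # features surviving the HC threshold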
def test_cls(self, cls_name, gamma=.2, stbl=True):
    """
    HC test of one class against the rest.
    Returns HC value and indicates whether each feature is selected
    by the HC threshold.
    """
    col_name_n = f"n ({cls_name})"
    col_name_T = f"T ({cls_name})"
    df1 = self.counts_df.filter([col_name_n, col_name_T])
    df1['n (rest)'] = self.counts_df['n'] - df1[col_name_n]
    df1['T (rest)'] = self.counts_df["T"] - df1[col_name_T]

    df1['pval'] = two_sample_pvals(df1[col_name_n], df1["n (rest)"])
    hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
    df1['HC'] = hc
    df1['thresh'] = df1['pval'] < thr
    df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T]
                          - df1['n (rest)'] / df1['T (rest)'])
    return df1
def test_doc(self, doc, of_cls=None, stbl=True, gamma=.2):
    """
    Test a new document against the model by combining the binomial
    allocation P-values of each class.
    """
    dfi = self.count_words(doc)
    logging.debug(f"Doc contains {dfi.n.sum()} terms.")
    df = self.counts_df
    dfi['T (test)'] = dfi.n.sum()
    dfi = dfi.rename(columns={'n': 'n (test)'})
    df = df.join(dfi, how='left')

    for cls in self.cls_names:
        cnt1 = df['n (test)'].astype(int)
        cnt2 = df[f'n ({cls})'].astype(int)
        if of_cls == cls:
            # if the tested document is already represented in the
            # corpus, remove its counts to get a meaningful comparison
            logging.debug(
                f"Doc is of {of_cls}. Evaluating in a leave-out manner.")
            cnt2 -= cnt1
        pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)
        df[f'pval ({cls})'] = pv
        df[f'score ({cls})'] = -2 * np.log(df[f'pval ({cls})'])
        df[f'HC ({cls})'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
        more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
        thresh = pv < pth
        df[f'affinity ({cls})'] = more * thresh
    return df
def test_cls(self, cls_name, **kwrgs):
    """
    HC test of one class against the rest.
    Returns HC value and indicates whether each feature is selected
    by the HC threshold.
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    col_name_n = f"{cls_name}:n"
    col_name_T = f"{cls_name}:T"
    df1 = self.counts_df.filter([col_name_n, col_name_T])
    df1['rest:n'] = self.counts_df['n'] - df1[col_name_n]
    df1['rest:T'] = self.counts_df["T"] - df1[col_name_T]

    df1['pval'] = two_sample_pvals(df1[col_name_n], df1["rest:n"])
    hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
    df1['HC'] = hc
    df1['thresh'] = df1['pval'] < thr
    df1['more'] = np.sign(df1[col_name_n] / df1[col_name_T]
                          - df1['rest:n'] / df1['rest:T'])
    return df1
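# Usage sketch (hypothetical names): `model` stands for a fitted instance of
# the enclosing class; 'author1' is a made-up class name.
#
#   res = model.test_cls('author1')
#   discriminating = res[res['thresh']]  # features separating the class
#   # res['more'] > 0 marks features over-represented in the class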
def BJ_sim(c1, c2, gamma=0.1, randomize=False, pval_thresh=1.1):
    """
    Berk-Jones (BJ) similarity of two discrete samples

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : lower fraction of P-values
    randomize : use randomized P-values
    pval_thresh : only use P-values below this value.
                  Has no effect if pval_thresh > 1.

    Returns:
    -------
    BJ statistic of the binomial allocation P-values of the two lists
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    bj, _ = HC(pvals_red).BJ(gamma=gamma)
    return bj
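# Usage sketch (illustrative): BJ_sim on the same made-up counts as the
# HC_sim demo above; both statistics grow with the discrepancy between the
# two samples.
def _demo_BJ_sim():
    c1 = [12, 0, 4, 3, 1, 0, 9, 2]
    c2 = [10, 2, 0, 3, 2, 1, 8, 0]
    print("BJ similarity:", BJ_sim(c1, c2, gamma=0.1))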