def test_binom_test():
    """Compare ``smprop.binom_test`` and the ``'beta'`` confidence interval
    against reference values from R's ``binom.test(51, 235, 1/6)``."""

    def r_reference(alternative, p_value, conf_int):
        # All three R runs share count, nobs, estimate and null value; only
        # the alternative, the p-value and the confidence interval differ.
        expected = Holder()
        expected.statistic = 51
        expected.parameter = 235
        expected.p_value = p_value
        expected.conf_int = conf_int
        expected.estimate = 0.2170212765957447
        expected.null_value = 1. / 6
        expected.alternative = alternative
        expected.method = 'Exact binomial test'
        expected.data_name = '51 and 235'
        return expected

    #> bt = binom.test(51,235,(1/6),alternative="less")
    #> cat_items(bt, "binom_test_less.")
    binom_test_less = r_reference(
        'less', 0.982022657605858, [0, 0.2659460862574313])

    #> bt = binom.test(51,235,(1/6),alternative="greater")
    #> cat_items(bt, "binom_test_greater.")
    binom_test_greater = r_reference(
        'greater', 0.02654424571169085, [0.1735252778065201, 1])

    #> bt = binom.test(51,235,(1/6),alternative="t")
    #> cat_items(bt, "binom_test_2sided.")
    binom_test_2sided = r_reference(
        'two.sided', 0.0437479701823997,
        [0.1660633298083073, 0.2752683640289254])

    # statsmodels names for the alternatives paired with the R results
    cases = (
        ('larger', binom_test_greater),
        ('smaller', binom_test_less),
        ('two-sided', binom_test_2sided),
    )
    for sm_alternative, r_res in cases:
        # binom_test returns only the p-value, so the statistic is not
        # compared here
        pvalue = smprop.binom_test(51, 235, prop=1. / 6,
                                   alternative=sm_alternative)
        assert_almost_equal(pvalue, r_res.p_value, decimal=13)

    # R's binom.test reports the Clopper-Pearson interval (method='beta').
    # The one-sided R limits are compared with the limits of the
    # alpha=0.1 two-sided interval.
    two_sided_ci = smprop.proportion_confint(51, 235, alpha=0.05,
                                             method='beta')
    ci_low, ci_upp = smprop.proportion_confint(51, 235, alpha=0.1,
                                               method='beta')
    assert_almost_equal(two_sided_ci, binom_test_2sided.conf_int, decimal=13)
    assert_almost_equal(ci_upp, binom_test_less.conf_int[1], decimal=13)
    assert_almost_equal(ci_low, binom_test_greater.conf_int[0], decimal=13)
def test_binom_rejection_interval():
    """Consistency check of ``binom_test_reject_interval`` with ``binom_test``.

    For each alternative, the count on the boundary of the rejection region
    must have a p-value below ``alpha`` and the neighbouring count just
    inside the acceptance region must have a p-value above ``alpha``.
    """
    # some code duplication with other tests but the limit checks differ
    alpha = 0.05
    nobs = 200
    prop = 12. / 20

    def pvalue_at(count, alternative):
        # p-value of observing `count` successes under the null proportion
        return smprop.binom_test(count, nobs, prop=prop,
                                 alternative=alternative)

    alternative = 'smaller'
    ci_low, ci_upp = smprop.binom_test_reject_interval(
        prop, nobs, alpha=alpha, alternative=alternative)
    # one-sided: the upper limit is not binding
    assert_equal(ci_upp, nobs)
    assert_array_less(pvalue_at(ci_low, alternative), alpha)
    assert_array_less(alpha, pvalue_at(ci_low + 1, alternative))

    alternative = 'larger'
    ci_low, ci_upp = smprop.binom_test_reject_interval(
        prop, nobs, alpha=alpha, alternative=alternative)
    # one-sided: the lower limit is not binding
    assert_equal(ci_low, 0)
    assert_array_less(pvalue_at(ci_upp, alternative), alpha)
    assert_array_less(alpha, pvalue_at(ci_upp - 1, alternative))

    alternative = 'two-sided'
    ci_low, ci_upp = smprop.binom_test_reject_interval(
        prop, nobs, alpha=alpha, alternative=alternative)
    # upper boundary of the two-sided rejection region
    assert_array_less(pvalue_at(ci_upp, alternative), alpha)
    assert_array_less(alpha, pvalue_at(ci_upp - 1, alternative))
    # lower boundary; previously the upper-boundary checks were repeated
    # here and ci_low was never verified for the two-sided case
    assert_array_less(pvalue_at(ci_low, alternative), alpha)
    assert_array_less(alpha, pvalue_at(ci_low + 1, alternative))
def __pvalue_directed(**params):
    """
    Compute the pvalue for the directed edge null model.
    Use a standard binomial test from the statsmodels package.

    The success probability under the null model is
    ``ku_out * kv_in / q**2``; the test is one-sided ("larger").

    All four keywords are required: they are read with ``params.get``, so
    a missing one is ``None`` and the arithmetic below fails.

    @keyword w_uv: Weight of the directed edge.
    @keyword ku_out: Total outgoing weight of the source vertex.
    @keyword kv_in: Total incoming weight of the destination vertex.
    @keyword q: Total sum of all edge weights in the graph.
    @return: Result of ``binom_test`` with ``count=w_uv`` and ``nobs=q``.
    """
    w_uv = params.get("w_uv")
    ku_out = params.get("ku_out")
    kv_in = params.get("kv_in")
    q = params.get("q")

    p = 1.0 * ku_out * kv_in / q / q / 1.0
    # Parenthesised so the line is valid on Python 3 while still printing
    # the same text on Python 2 (the bare print statement is a SyntaxError
    # on Python 3).
    print("p = %f" % p)
    return binom_test(count=w_uv, nobs=q, prop=p, alternative="larger")
def __pvalue_undirected(**params):
    """
    Compute the pvalue for the undirected edge null model.
    Use a standard binomial test from the statsmodels package.

    The success probability under the null model is
    ``ku * kv / (2 * q**2)``; the test is one-sided ("larger").

    @keyword w: weight of the undirected edge.
    @keyword ku: total incident weight (strength) of the first vertex.
    @keyword kv: total incident weight (strength) of the second vertex.
    @keyword q: total incident weight of all vertices divided by two.
        Similar to the total number of edges in the graph.
    @raise ValueError: if any of the four keywords is missing or falsy
        (a value of zero is rejected as well).
    """
    w, ku, kv, q = (params.get(key) for key in ("w", "ku", "kv", "q"))

    # Guard clause: every quantity must be present and non-zero.
    if not w or not ku or not kv or not q:
        raise ValueError

    p = ku * kv * 1.0 / q / q / 2.0
    return binom_test(count=w, nobs=q, prop=p, alternative="larger")
def calculate_enrichment(self, genes, reference=None, evidence_codes=None,
                         aspect=None, use_fdr=True):
    """
    Test each GO term annotated to ``genes`` for over-representation.

    For every term attached to at least one input gene, a one-sided
    ("larger") binomial test compares the number of input genes carrying
    the term with the fraction of reference genes carrying it.

    Parameters
    ----------
    genes : list
        list of genes
    reference : list
        reference list of genes to calculate enrichment against; it is
        restricted to genes present in ``self.gene_to_go``. If empty or
        None, all keys of ``self.gene_to_go`` are used.
    evidence_codes : list
        GO evidence codes. Currently unused.
    aspect : iterable of str
        Each element must be 'P', 'C' or 'F'. Currently only validated;
        it does not filter the terms yet.
    use_fdr : bool
        Correct for multiple hypothesis testing

    Returns
    -------
    dict
        Maps each term to ``(mapped_genes, p_value, n_mapped_ref)`` where
        ``mapped_genes`` is the list of input genes annotated with the
        term, ``p_value`` is the binomial p-value (replaced by the value
        returned by ``fdrcorrection`` when ``use_fdr`` is true) and
        ``n_mapped_ref`` is the number of reference genes with the term.

    Raises
    ------
    ValueError
        If ``aspect`` contains anything other than 'P', 'C' or 'F'.
    """
    # TODO check for alias for genes
    genes = set(genes)

    # TODO add aspects
    if aspect is not None:
        for code in aspect:
            if code not in ('P', 'C', 'F'):
                # Raise instead of print + quit(): library code must not
                # terminate the interpreter (and quit() exits with status 0).
                raise ValueError("Aspects are only 'P', 'C', and 'F'")

    # TODO add reference
    if reference:
        # TODO check for reference alias
        reference = set(reference)
        reference.intersection_update(set(self.gene_to_go.keys()))
    else:
        reference = set(self.gene_to_go.keys())

    # TODO add evidence_codes
    # every term attached to at least one of the input genes
    terms = {
        term
        for gene in genes
        if gene in self.gene_to_go
        for term in self.gene_to_go[gene]
    }

    n_genes = len(genes)
    n_ref = float(len(reference))
    res = {}
    for term in terms:
        all_annotated_genes = set(self.go_to_gene[term])
        mapped_genes = genes.intersection(all_annotated_genes)
        n_mapped_genes = len(mapped_genes)
        n_mapped_ref = len(reference.intersection(all_annotated_genes))
        # share of the reference carrying this term = null probability
        prob = float(n_mapped_ref) / n_ref
        p_value = binom_test(n_mapped_genes, n_genes, prob, 'larger')
        res[term] = (list(mapped_genes), p_value, n_mapped_ref)

    if use_fdr and res:
        # fdrcorrection is told the p-values are sorted, so rank the
        # terms by p-value first and re-attach the corrected values.
        ranked = sorted(res.items(), key=lambda item: item[1][1])
        corrected = fdrcorrection([p for _, (_, p, _) in ranked],
                                  is_sorted=True)[1]
        res = {
            term: (mapped, p_adj, n_hits)
            for (term, (mapped, _, n_hits)), p_adj in zip(ranked, corrected)
        }
    return res
def poisson_twosample(count1, exposure1, count2, exposure2, ratio_null=1,
                      method='score', alternative='two-sided'):
    '''test for ratio of two sample Poisson intensities

    If the two Poisson rates are g1 and g2, then the Null hypothesis is

    H0: g1 / g2 = ratio_null

    against one of the following alternatives

    H1_2-sided: g1 / g2 != ratio_null
    H1_larger: g1 / g2 > ratio_null
    H1_smaller: g1 / g2 < ratio_null

    Parameters
    ----------
    count1: int
        Number of events in first sample
    exposure1: float
        Total exposure (time * subjects) in first sample
    count2: int
        Number of events in second sample
    exposure2: float
        Total exposure (time * subjects) in second sample
    ratio_null: float
        ratio of the two Poisson rates under the Null hypothesis. Default is 1.
    method: string
        Method for the test statistic and the p-value. Defaults to `'score'`.
        Current Methods are based on Gu et. al 2008
        Implemented are 'wald', 'score' and 'sqrt' based asymptotic normal
        distribution, and the exact conditional test 'exact-cond', and its
        mid-point version 'cond-midp', see Notes
    alternative : string
        The alternative hypothesis, H1, has to be one of the following

           'two-sided': H1: ratio of rates is not equal to ratio_null (default)
           'larger' :   H1: ratio of rates is larger than ratio_null
           'smaller' :  H1: ratio of rates is smaller than ratio_null

    Returns
    -------
    stat, pvalue
        For the normal-based methods this is whatever ``_zstat_generic2``
        returns. For 'exact-cond' and 'cond-midp' ``stat`` is None.

    Raises
    ------
    ValueError
        If ``method`` is not one of the implemented methods.

    Notes
    -----
    'wald': method W1A, wald test, variance based on separate estimates
    'score': method W2A, score test, variance based on estimate under Null
    'wald-log': W3A (listed in the reference, not implemented here)
    'score-log' W4A (listed in the reference, not implemented here)
    'sqrt': W5A, based on variance stabilizing square root transformation
    'exact-cond': exact conditional test based on binomial distribution
    'cond-midp': midpoint-pvalue of exact conditional test

    The latter two are only verified for one-sided example.

    References
    ----------
    Gu, Ng, Tang, Schucany 2008: Testing the Ratio of Two Poisson Rates,
    Biometrical Journal 50 (2008) 2, 2008

    '''
    # shortcut names
    y1, n1, y2, n2 = count1, exposure1, count2, exposure2
    d = n2 / n1
    r = ratio_null
    # null ratio rescaled by the exposure ratio
    r_d = r / d

    if method in ['score']:
        stat = (y1 - y2 * r_d) / np.sqrt((y1 + y2) * r_d)
        dist = 'normal'
    elif method in ['wald']:
        stat = (y1 - y2 * r_d) / np.sqrt(y1 + y2 * r_d**2)
        dist = 'normal'
    elif method in ['sqrt']:
        stat = 2 * (np.sqrt(y1 + 3 / 8.) - np.sqrt((y2 + 3 / 8.) * r_d))
        stat /= np.sqrt(1 + r_d)
        dist = 'normal'
    elif method in ['exact-cond', 'cond-midp']:
        from statsmodels.stats import proportion
        # conditional on the total count, y1 is binomial with prob bp
        bp = r_d / (1 + r_d)
        y_total = y1 + y2
        stat = None
        pvalue = proportion.binom_test(y1, y_total, prop=bp,
                                       alternative=alternative)
        if method in ['cond-midp']:
            # not inplace in case we still want binom pvalue
            pvalue = pvalue - 0.5 * stats.binom.pmf(y1, y_total, bp)

        dist = 'binomial'
    else:
        # Without this branch `dist` stayed unbound and the function
        # failed with an UnboundLocalError further down.
        raise ValueError('method not recognized')

    if dist == 'normal':
        return _zstat_generic2(stat, 1, alternative)
    else:
        return stat, pvalue
def sfit_first_order(model, loss, x, y, alpha, beta=None, verbose=1):
    """Run the first-order SFIT procedure to find which variables of x are
    first-order significant for the prediction of y by the given model.

    Parameters
    ----------
    model :
        A predictive model exposing ``model.predict``. When ``beta`` is None
        it must also expose ``get_weights`` / ``set_weights`` (the original
        documentation states this works only for a Keras model).
    loss : function
        Pointwise loss between two numpy arrays of outcomes; first argument
        is the predicted outcomes, second is the true ones.
    x : numpy array of shape (N, p)
        Input data used to perform the tests. The first column must be an
        intercept column of ones.
    y : numpy array of shape (N, )
        True outcomes.
    alpha : float
        Significance level of the test.
    beta : float
        Regularization amount of the test. If None, beta is searched by
        repeatedly multiplying a starting value by 10 until the mean share
        of significant variables under randomly re-initialised weights is
        no larger than alpha.
    verbose : boolean
        Print the summary of the test procedure if true (default).

    Returns
    -------
    s_1 : list
        First-order significant variables (indexed from 1 to p).
    c_1 : dictionary
        Keyed by first-order significant variable; each value holds the test
        statistic value first and its (1 - alpha)% confidence interval second.
    u_1 : list
        First-order non-significant variables (indexed from 1 to p).
    p_values : numpy array of shape (p, )
        p-values associated with each variable.
    second_order_significance : boolean
        True if significant second-order effects are detected, which
        suggests running second-order SFIT.
    opt_beta : float
        ``beta`` if it was passed, otherwise the beta found by the
        randomization procedure.
    """
    n, p = x.shape
    assert np.array_equal(x[:, 0], np.ones(n)), (
        'As an intercept, all the entries of the first column of x '
        'should be equal to 1.')
    errors = compute_errors_first(loss, model, n, p, x, y)

    if beta is None:
        print('Compute optimal beta')
        beta = 1e-7
        share_significant = 1
        trained_weights = model.get_weights()

        def k_eval(placeholder):
            # evaluate a backend tensor in the current session
            return placeholder.eval(session=K.get_session())

        nr_simulations = 20
        # one half is used to tune beta, the other half for the final test
        x_val, x_test, y_val, y_test = train_test_split(x, y, test_size=0.5)
        n_val, n_test = x_val.shape[0], x_test.shape[0]

        while share_significant > alpha:
            beta = beta * 10
            shares = np.zeros(nr_simulations)
            for sim in range(nr_simulations):
                # fresh random weights for every simulation
                random_weights = [
                    k_eval(glorot_normal()(w.shape)) for w in trained_weights
                ]
                model.set_weights(random_weights)
                sim_errors = compute_errors_first(loss, model, n_val, p,
                                                  x_val, y_val)
                _, _, sim_significant, _ = sign_test_first(
                    alpha, beta, sim_errors, n_val, p)
                shares[sim] = len(sim_significant) / p
            share_significant = np.mean(shares)

        opt_beta = beta
        print('Optimal beta found: {0}'.format(opt_beta))
        # restore the fitted model before the actual test
        model.set_weights(trained_weights)
        errors = compute_errors_first(loss, model, n_test, p, x_test, y_test)
        c_1, p_values, s_1, u_1 = sign_test_first(alpha, beta, errors,
                                                  n_test, p)
    else:
        opt_beta = beta
        c_1, p_values, s_1, u_1 = sign_test_first(alpha, beta, errors, n, p)

    if verbose:
        print('Summary of first-order SFIT\n'
              '------------------------------------------------\n'
              'First-order significant variables:')
        for variable in c_1:
            print('- Variable {0}:'.format(variable))
            print('\t Median: {0}'.format(np.round(c_1[variable][0], 3)))
            print('\t {0}% confidence interval: {1}'.format(
                int(100 * (1 - alpha)), np.round(c_1[variable][1], 3)))
        print('------------------------------------------------\n'
              'First-order non-significant variables: {0}'.format(u_1))

    # Test for presence of any second-order significance: zero out the
    # non-significant columns (never the intercept) and compare the losses
    # with those of the model on the full input.
    x_first = np.copy(x)
    dropped_columns = [col for col in range(1, p) if col in u_1]
    x_first[:, dropped_columns] = 0
    model_first_errors = loss(model.predict(x_first), y)
    model_all_errors = loss(model.predict(x), y)
    n_j = np.sum((model_first_errors - model_all_errors) > 0)

    second_order_significance = False
    if binom_test(n_j, n, 0.5, 'larger') < alpha:
        second_order_significance = True

    if verbose:
        if second_order_significance:
            print(
                '------------------------------------------------\n'
                'There are some significant second-order variables: recommended to run second-order SFIT.'
            )
        else:
            print('------------------------------------------------\n'
                  'There are no significant second-order variables.')
        print('------------------------------------------------ \n'
              '------------------------------------------------ \n')

    return s_1, c_1, u_1, p_values, second_order_significance, opt_beta
def test_poisson_2indep(count1, exposure1, count2, exposure2, ratio_null=1,
                        method='score', alternative='two-sided',
                        etest_kwds=None):
    '''test for ratio of two sample Poisson intensities

    If the two Poisson rates are g1 and g2, then the Null hypothesis is

    - H0: g1 / g2 = ratio_null

    against one of the following alternatives

    - H1_2-sided: g1 / g2 != ratio_null
    - H1_larger: g1 / g2 > ratio_null
    - H1_smaller: g1 / g2 < ratio_null

    Parameters
    ----------
    count1 : int
        Number of events in first sample.
    exposure1 : float
        Total exposure (time * subjects) in first sample.
    count2 : int
        Number of events in second sample.
    exposure2 : float
        Total exposure (time * subjects) in second sample.
    ratio_null : float
        ratio of the two Poisson rates under the Null hypothesis. Default is 1.
    method : string
        Method for the test statistic and the p-value. Defaults to `'score'`.
        Current Methods are based on Gu et. al 2008.
        Implemented are 'wald', 'score' and 'sqrt' based asymptotic normal
        distribution, and the exact conditional test 'exact-cond', and its
        mid-point version 'cond-midp'. method='etest' and method='etest-wald'
        provide pvalues from `etest_poisson_2indep` using score or wald
        statistic respectively.
        see Notes.
    alternative : string
        The alternative hypothesis, H1, has to be one of the following

        - 'two-sided': H1: ratio of rates is not equal to ratio_null (default)
        - 'larger' :   H1: ratio of rates is larger than ratio_null
        - 'smaller' :  H1: ratio of rates is smaller than ratio_null
    etest_kwds : dictionary
        Extra keyword arguments forwarded to `etest_poisson_2indep`; only
        used by the 'etest' methods.

    Returns
    -------
    results : Results instance
        The resulting test statistics and p-values are available as
        attributes.

    Raises
    ------
    ValueError
        If ``method`` is not recognized.

    Notes
    -----
    - 'wald': method W1A, wald test, variance based on separate estimates
    - 'score': method W2A, score test, variance based on estimate under Null
    - 'wald-log': W3A
    - 'score-log' W4A
    - 'sqrt': W5A, based on variance stabilizing square root transformation
    - 'exact-cond': exact conditional test based on binomial distribution
    - 'cond-midp': midpoint-pvalue of exact conditional test
    - 'etest': etest with score test statistic
    - 'etest-wald': etest with wald test statistic

    References
    ----------
    Gu, Ng, Tang, Schucany 2008: Testing the Ratio of Two Poisson Rates,
    Biometrical Journal 50 (2008) 2, 2008

    '''
    # null ratio rescaled by the ratio of the two exposures
    exposure_ratio = exposure2 / exposure1
    null_scaled = ratio_null / exposure_ratio

    if method == 'score':
        dist = 'normal'
        stat = ((count1 - count2 * null_scaled)
                / np.sqrt((count1 + count2) * null_scaled))
    elif method == 'wald':
        dist = 'normal'
        stat = ((count1 - count2 * null_scaled)
                / np.sqrt(count1 + count2 * null_scaled**2))
    elif method == 'sqrt':
        dist = 'normal'
        stat = 2 * (np.sqrt(count1 + 3 / 8.)
                    - np.sqrt((count2 + 3 / 8.) * null_scaled))
        stat /= np.sqrt(1 + null_scaled)
    elif method in ('exact-cond', 'cond-midp'):
        from statsmodels.stats import proportion
        dist = 'binomial'
        stat = None
        # conditional on the total, count1 is binomial with this probability
        cond_prob = null_scaled / (1 + null_scaled)
        total = count1 + count2
        # TODO: why y2 in here and not y1, check definition of H1 "larger"
        pvalue = proportion.binom_test(count1, total, prop=cond_prob,
                                       alternative=alternative)
        if method == 'cond-midp':
            # not inplace in case we still want binom pvalue
            pvalue = pvalue - 0.5 * stats.binom.pmf(count1, total, cond_prob)
    elif method.startswith('etest'):
        dist = 'poisson'
        method_etest = 'wald' if method.endswith('wald') else 'score'
        if etest_kwds is None:
            etest_kwds = {}

        stat, pvalue = etest_poisson_2indep(
            count1, exposure1, count2, exposure2, ratio_null=ratio_null,
            method=method_etest, alternative=alternative, **etest_kwds)
    else:
        raise ValueError('method not recognized')

    if dist == 'normal':
        stat, pvalue = _zstat_generic2(stat, 1, alternative)

    rate1 = count1 / exposure1
    rate2 = count2 / exposure2
    return HolderTuple(statistic=stat,
                       pvalue=pvalue,
                       distribution=dist,
                       method=method,
                       alternative=alternative,
                       rates=(rate1, rate2),
                       ratio=rate1 / rate2,
                       ratio_null=ratio_null)
def poisson_test(count1, count2, exposure1=1, exposure2=1, ratio_null=1,
                 method='score', alternative='2-sided'):
    '''Test for ratio of two sample Poisson intensities

    If the two Poisson rates are g1 and g2, then the Null hypothesis is

    H0: g1 / g2 = ratio_null

    against one of the following alternatives

    H1_2-sided: g1 / g2 != ratio_null
    H1_larger: g1 / g2 > ratio_null
    H1_smaller: g1 / g2 < ratio_null

    Args:
        count1: int
            Number of events in first sample
        count2: int
            Number of events in second sample
        exposure1: float
            Total exposure (time * subjects) in first sample
        exposure2: float
            Total exposure (time * subjects) in second sample
        ratio_null: float
            ratio of the two Poisson rates under the Null hypothesis.
            Default is 1.
        method: string
            Method for the test statistic and the p-value. Defaults to
            `'score'`. Current Methods are based on Gu et. al 2008
            Implemented are 'wald', 'score' and 'sqrt' based asymptotic
            normal distribution, and the exact conditional test
            'exact-cond', and its mid-point version 'cond-midp', see Notes
        alternative : string
            The alternative hypothesis, H1, has to be one of the following

            'two-sided' (aliases '2-sided', '2s'): H1: ratio of rates is
                not equal to ratio_null (default)
            'larger' (alias 'l'): H1: ratio of rates is larger than
                ratio_null
            'smaller' (alias 's'): H1: ratio of rates is smaller than
                ratio_null

    Returns:
        pvalue only; the test statistic is not returned.

    Raises:
        ValueError: if `alternative` or `method` is not recognized.

    Notes
    -----
    'wald': method W1A, wald test, variance based on separate estimates
    'score': method W2A, score test, variance based on estimate under Null
    'wald-log': W3A (listed in the reference, not implemented here)
    'score-log' W4A (listed in the reference, not implemented here)
    'sqrt': W5A, based on variance stabilizing square root transformation
    'exact-cond': exact conditional test based on binomial distribution
    'cond-midp': midpoint-pvalue of exact conditional test

    The latter two are only verified for one-sided example.

    References
    ----------
    Gu, Ng, Tang, Schucany 2008: Testing the Ratio of Two Poisson Rates,
    Biometrical Journal 50 (2008) 2, 2008

    Author: Josef Perktold
    License: BSD-3

    destination statsmodels

    From: https://stackoverflow.com/questions/33944914/implementation-of-e-test-for-poisson-in-python

    Date: 2020feb24
    '''
    # Normalise the alias once. The default '2-sided' is understood by the
    # z-test below but not by proportion.binom_test, so the exact methods
    # previously failed when called with the default alternative.
    alternative_aliases = {
        'two-sided': 'two-sided', '2-sided': 'two-sided', '2s': 'two-sided',
        'larger': 'larger', 'l': 'larger',
        'smaller': 'smaller', 's': 'smaller',
    }
    try:
        alternative = alternative_aliases[alternative]
    except (KeyError, TypeError):
        raise ValueError('invalid alternative')

    # Adapted from statsmodels.stats.weightstats
    def zstat_generic2(value, std_diff, alternative):
        '''generic (normal) z-test returning only the p-value

        `alternative` must already be one of the canonical names.
        '''
        zstat = value / std_diff
        if alternative == 'two-sided':
            return sps.norm.sf(np.abs(zstat)) * 2
        if alternative == 'larger':
            return sps.norm.sf(zstat)
        return sps.norm.cdf(zstat)

    # shortcut names
    y1, n1, y2, n2 = count1, exposure1, count2, exposure2
    d = n2 / n1
    r = ratio_null
    # null ratio rescaled by the exposure ratio
    r_d = r / d

    if method in ['score']:
        stat = (y1 - y2 * r_d) / np.sqrt((y1 + y2) * r_d)
        dist = 'normal'
    elif method in ['wald']:
        stat = (y1 - y2 * r_d) / np.sqrt(y1 + y2 * r_d**2)
        dist = 'normal'
    elif method in ['sqrt']:
        stat = 2 * (np.sqrt(y1 + 3 / 8.) - np.sqrt((y2 + 3 / 8.) * r_d))
        stat /= np.sqrt(1 + r_d)
        dist = 'normal'
    elif method in ['exact-cond', 'cond-midp']:
        from statsmodels.stats import proportion
        # conditional on the total count, y1 is binomial with prob bp
        bp = r_d / (1 + r_d)
        y_total = y1 + y2
        pvalue = proportion.binom_test(y1, y_total, prop=bp,
                                       alternative=alternative)
        if method in ['cond-midp']:
            # not inplace in case we still want binom pvalue
            pvalue = pvalue - 0.5 * sps.binom.pmf(y1, y_total, bp)

        dist = 'binomial'
    else:
        # Previously fell through with `dist` unbound (UnboundLocalError).
        raise ValueError('method not recognized')

    if dist == 'normal':
        return zstat_generic2(stat, 1, alternative)
    else:
        return pvalue