def calc_coverage_threshold(cov_dict):
    '''
    Compute a lower-tail coverage cutoff for every key of cov_dict.

    Each value of cov_dict is indexed with 'mean' and 'variance'; both are
    passed through float(). The pair is converted to scipy's (n, p) negative
    binomial parameters (see the end of the 'alternative parameterization'
    section of the Negative binomial page and the scipy negative binomial
    documentation for the derivation).

    Returns a dict with the same keys, each mapping to
    {'threshold': str, 'threshold_p': str}, where 'threshold' is the 5%
    quantile of the fitted distribution and 'threshold_p' is the cdf
    evaluated at that quantile.
    '''
    thresholds = {}
    for genome, stats in cov_dict.items():
        mu = float(stats['mean'])
        sigma2 = float(stats['variance'])

        ## moment matching: overdispersion (sigma2 - mu) drives both n and p.
        excess = sigma2 - mu
        size = mu**2 / excess
        prob = 1 - excess / sigma2

        ## sanity check: the fitted distribution reproduces the input moments.
        assert isclose(nbinom.mean(size, prob), mu)
        assert isclose(nbinom.var(size, prob), sigma2)

        ## cutoff leaving ~5% of the distribution on the left hand side,
        ## together with the probability mass actually at or below it.
        cutoff = nbinom.ppf(0.05, size, prob)
        thresholds[genome] = {
            'threshold': str(cutoff),
            'threshold_p': str(nbinom.cdf(cutoff, size, prob)),
        }
    return thresholds
def test_mran_var_p2(self):
    """Compare nbinom moments with zinegbin moments for arguments (7, 1, 2).

    The (n, p) pair comes from zinegbin.convert_params(7, 1, 2); the
    reference values are zinegbin.mean / zinegbin.var evaluated at
    (7, 1, 2, 0). Both moments must agree to rtol=1e-10.
    """
    zinegbin = sm.distributions.zinegbin
    size, prob = zinegbin.convert_params(7, 1, 2)

    # (value under test, reference value) for the mean, then the variance.
    moment_pairs = (
        (nbinom.mean(size, prob), zinegbin.mean(7, 1, 2, 0)),
        (nbinom.var(size, prob), zinegbin.var(7, 1, 2, 0)),
    )
    for from_nbinom, from_zinb in moment_pairs:
        assert_allclose(from_nbinom, from_zinb, rtol=1e-10)
def test_mean_var(self):
    """Compare nbinom moments with zinegbin._mean / _var for several means.

    Runs once with the scalar 9 and once with the array [1, 5, 10] as the
    first argument. The (n, p) pair comes from
    zinegbin.convert_params(m, 1, 1); the reference values are
    zinegbin._mean / zinegbin._var evaluated at (m, 1, 1, 0). Both moments
    must agree to rtol=1e-10.
    """
    zinegbin = sm.distributions.zinegbin
    cases = (9, np.array([1, 5, 10]))
    for m in cases:
        size, prob = zinegbin.convert_params(m, 1, 1)

        # (value under test, reference value) for the mean, then the variance.
        moment_pairs = (
            (nbinom.mean(size, prob), zinegbin._mean(m, 1, 1, 0)),
            (nbinom.var(size, prob), zinegbin._var(m, 1, 1, 0)),
        )
        for from_nbinom, from_zinb in moment_pairs:
            assert_allclose(from_nbinom, from_zinb, rtol=1e-10)
def calc_2X_coverage_threshold(cov_dict):
    '''
    calculate coverage threshold for each key in cov_dict, based on a
    likelihood ratio between the empirical Nbinom(mu,disp) 1X coverage
    distribution, and a theoretical Poisson(2*mu) 2X coverage distribution.

    see end of 'alternative parameterization' section of Negative binomial
    page and scipy negative binomial documentation for details of the
    (mean, variance) -> (n, p) conversion.

    The threshold is the first integer x in [int(mean), int(2*mean)) for which
    poisson.pmf(x, 2*mean) / nbinom.pmf(x, n, p) > 10. Note that the
    comparison is made on the raw ratio, not on its logarithm.

    cov_dict: mapping from sample ID to a mapping with 'mean' and 'variance'
        entries (anything float() accepts). Every sample ID must be one of
        the IDs known to the internal ID -> REL name table.

    Returns a dict keyed by REL name; each value holds the strings
    'threshold', 'threshold_p0' (negative binomial pmf at the threshold),
    'threshold_p1' (Poisson pmf at the threshold) and 'lratio' (p1/p0).

    Raises KeyError for an unknown sample ID and ValueError when no x in the
    scanned range reaches a likelihood ratio above 10 (previously this case
    either crashed with an unbound local or silently re-used the threshold of
    the preceding sample).
    '''

    ## to convert my IDs to REL IDs: RM3-130-1 .. RM3-130-24 correspond to
    ## REL11734 .. REL11757; the remaining REL strains keep their own names.
    rel_name = {'RM3-130-%d' % i: 'REL%d' % (11733 + i) for i in range(1, 25)}
    for own_name in ('REL4397', 'REL4398',
                     'REL288', 'REL291', 'REL296', 'REL298'):
        rel_name[own_name] = own_name

    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])
        q = (var-mean)/var
        n = mean**2/(var-mean)
        p = 1 - q

        ## assert that I did the math correctly.
        assert(isclose(nbinom.mean(n,p), mean))
        assert(isclose(nbinom.var(n,p), var))

        ## scan upward from the 1X mean towards the 2X mean and stop at the
        ## first coverage value that is >10 times more likely under the 2X
        ## Poisson model than under the 1X negative binomial model.
        ## 'found' is reset for every sample so that results never leak from
        ## one sample to the next.
        found = None
        for x in range(int(mean),int(2*mean)):
            p0 = nbinom.pmf(x,n,p)
            p1 = poisson.pmf(x,2*mean)
            lratio = p1/p0
            if lratio > 10:
                found = (x, p0, p1, lratio)
                break

        if found is None:
            raise ValueError(
                "no coverage threshold with likelihood ratio > 10 found for "
                "%r in range [%d, %d)" % (g, int(mean), int(2*mean)))

        my_threshold, my_threshold_p0, my_threshold_p1, my_lratio = found
        threshold_dict[rel_name[g]] = {'threshold':str(my_threshold),
                                       'threshold_p0':str(my_threshold_p0),
                                       'threshold_p1':str(my_threshold_p1),
                                       'lratio':str(my_lratio)}
    return threshold_dict
def ComputeNBMeanVar(ExprPar):
    '''
    Return the (mean, variance) pair of a negative binomial distribution.

    ExprPar is indexed positionally: ExprPar[0] is used as p and ExprPar[1]
    as n; both are handed to nbinom.mean / nbinom.var as (n, p).
    '''
    n, p = ExprPar[1], ExprPar[0]
    return nbinom.mean(n, p), nbinom.var(n, p)
def plot_pascal_distr(p, n):
    """Plot the pmf of nbinom(n, p) on the current axes.

    Points k = 0, 1, 2, ... are collected until their accumulated probability
    is no longer below 0.99. A red vertical line is drawn at the distribution
    mean, and the collected (k, pmf) pairs are drawn as purple circle markers
    labelled with the value of p.
    """
    support = []
    masses = []
    covered = 0
    while covered < 0.99:
        # the next support point is simply the number of points seen so far
        k = len(support)
        mass = nbinom.pmf(k, n, p)
        support.append(k)
        masses.append(mass)
        covered += mass

    axes = plt.gca()
    axes.axvline(x=nbinom.mean(n, p), color="red")
    plt.plot(
        support,
        masses,
        label="p={}".format(p),
        marker="o",
        linestyle="None",
        color="purple",
    )
def analytical_MPVS(
        infection_ts: pd.DataFrame,
        smoothing: Callable,
        alpha: float = 3.0,                # shape
        beta:  float = 2.0,                # rate
        CI:    float = 0.95,               # confidence interval
        infectious_period: int = 5*days,   # inf period = 1/gamma,
        variance_shift: float = 0.99,      # how much to scale variance parameters by when anomaly detected
        totals: bool = True                # are these case totals or daily new cases?
    ):
    """Estimates Rt ~ Gamma(alpha, 1/beta), and implements an analytical expression for a mean-preserving variance increase whenever case counts fall outside the CI defined by a negative binomial distribution

    Args:
        infection_ts: case counts; its ``.index`` is used as the date axis.
            When ``totals`` is true it is clipped at 0, differenced, clipped
            at 0 again and its first row dropped to obtain daily cases.
        smoothing: callable applied to ``np.squeeze(daily_cases)``; its output
            is cumulatively summed to give the smoothed running total.
        alpha: initial shape of the Gamma distribution; each step adds that
            step's new cases.
        beta: initial rate of the Gamma distribution; each step adds the
            previous step's new cases.
        CI: quantile level; upper bounds use the ``CI`` quantile and lower
            bounds the ``1 - CI`` quantile.
        infectious_period: multiplier applied to the log of the Gamma
            mean/quantiles when forming the reproductive-rate estimates.
        variance_shift: factor by which the negative binomial ``p`` is shrunk
            on every annealing iteration.
        totals: whether ``infection_ts`` holds cumulative totals (True) or
            daily new cases (False).

    Returns:
        Tuple ``(dates[2:], RR_pred, RR_CI_upper, RR_CI_lower, T_pred,
        T_CI_upper, T_CI_lower, total_cases, new_cases_ts, anomalies,
        anomaly_dates)``.

    Raises:
        ValueError: if widening the interval for a single step takes 10000
            iterations without enclosing the observed case count.
    """
    # infection_ts = infection_ts.copy(deep = True)
    dates = infection_ts.index
    if totals:
        # daily_cases = np.diff(infection_ts.clip(lower = 0)).clip(min = 0) # infection_ts clipped because COVID19India API does weird stuff
        daily_cases = infection_ts.clip(lower = 0).diff().clip(lower = 0).iloc[1:]
    else:
        daily_cases = infection_ts
    total_cases = np.cumsum(smoothing(np.squeeze(daily_cases)))
    # NOTE(review): with totals=True the first row is dropped above, so
    # total_cases[i] may line up with dates[i+1] rather than dates[i] (this
    # would affect anomaly_dates and the returned dates[2:]) -- confirm
    # against what `smoothing` returns before relying on exact date alignment.

    v_alpha, v_beta = [], []
    RR_pred, RR_CI_upper, RR_CI_lower = [], [], []
    T_pred, T_CI_upper, T_CI_lower = [], [], []

    new_cases_ts = []
    anomalies     = []
    anomaly_dates = []

    for i in range(2, len(total_cases)):
        new_cases     = max(0, total_cases[i]   - total_cases[i-1])
        old_new_cases = max(0, total_cases[i-1] - total_cases[i-2])

        # conjugate-style update of the Gamma parameters with the latest counts
        alpha += new_cases
        beta  += old_new_cases
        v_alpha.append(alpha)
        v_beta.append(beta)

        RR_est   = max(0, 1 + infectious_period*np.log(Gamma.mean(     a = alpha, scale = 1/beta)))
        RR_upper = max(0, 1 + infectious_period*np.log(Gamma.ppf(CI,   a = alpha, scale = 1/beta)))
        RR_lower = max(0, 1 + infectious_period*np.log(Gamma.ppf(1-CI, a = alpha, scale = 1/beta)))
        RR_pred.append(RR_est)
        RR_CI_upper.append(RR_upper)
        RR_CI_lower.append(RR_lower)

        if (new_cases == 0 or old_new_cases == 0):
            if new_cases == 0:
                logger.debug("new_cases at time %s: 0", i)
            if old_new_cases == 0:
                logger.debug("old_new_cases at time %s: 0", i)
            T_pred.append(0)
            T_CI_upper.append(10) # <- where does this come from?
            T_CI_lower.append(0)
            new_cases_ts.append(0)

        if (new_cases > 0 and old_new_cases > 0):
            new_cases_ts.append(new_cases)

            r, p = alpha, beta/(old_new_cases + beta)
            T_pred.append(nbinom.mean(r, p))
            T_upper = nbinom.ppf(CI,   r, p)
            T_lower = nbinom.ppf(1-CI, r, p)
            T_CI_upper.append(T_upper)
            T_CI_lower.append(T_lower)

            _np = p
            _nr = r
            anomaly_noted = False
            counter = 0
            # anneal: keep widening the negative binomial until the observed
            # count falls strictly inside (T_lower, T_upper)
            while not (T_lower < new_cases < T_upper):
                if not anomaly_noted:
                    # record each anomalous observation only once
                    anomalies.append(new_cases)
                    anomaly_dates.append(dates[i])

                # logger.debug("anomaly identified at time %s: %s < %s < %s, r: %s, p: %s, annealing iteration: %s", i, T_lower, new_cases, T_upper, _nr, _np, counter+1)
                # nnp = 0.95 *_np # <- where does this come from
                _nr = variance_shift * _nr * ((1-_np)/(1-variance_shift*_np) )
                _np = variance_shift * _np
                T_upper = nbinom.ppf(CI,   _nr, _np)
                T_lower = nbinom.ppf(1-CI, _nr, _np)
                T_lower, T_upper = sorted((T_lower, T_upper))
                if T_lower == T_upper == 0:
                    T_upper = 1
                    logger.debug("CI collapse, setting T_upper -> 1")

                anomaly_noted = True
                counter += 1
                if counter >= 10000:
                    raise ValueError("Number of iterations exceeded")

            # The loop has no `break`, so this runs whenever the loop finishes
            # without raising (it was previously spelled as `while ... else`).
            if anomaly_noted:
                alpha = _nr # update distribution on R with new parameters that enclose the anomaly
                beta = _np/(1-_np) * old_new_cases

                T_pred[-1] = nbinom.mean(_nr, _np)
                # lower bound takes the (1 - CI) quantile and upper bound the
                # CI quantile, matching the initial append above (these two
                # were previously swapped)
                T_CI_lower[-1] = nbinom.ppf(1-CI, _nr, _np)
                T_CI_upper[-1] = nbinom.ppf(CI,   _nr, _np)

                # annealing leaves the RR mean unchanged, but we need to adjust its widened CI
                RR_upper = max(0, 1 + infectious_period * np.log(Gamma.ppf(CI    , a = alpha, scale = 1/beta)))
                RR_lower = max(0, 1 + infectious_period * np.log(Gamma.ppf(1 - CI, a = alpha, scale = 1/beta)))

                # replace latest CI time series entries with adjusted CI
                RR_CI_upper[-1] = RR_upper
                RR_CI_lower[-1] = RR_lower
    return (
        dates[2:],
        RR_pred, RR_CI_upper, RR_CI_lower,
        T_pred, T_CI_upper, T_CI_lower,
        total_cases, new_cases_ts,
        anomalies, anomaly_dates
    )
def mean(self, n, p):
    # Forward to nbinom.mean, passing this instance explicitly as the first
    # argument, and hand back whatever it returns.
    # NOTE(review): this reads like an explicit parent-class call, which is
    # only meaningful if `nbinom` is a class here; if it is the
    # scipy.stats.nbinom instance, `self` would be taken as a distribution
    # argument -- confirm against the enclosing class and imports.
    return nbinom.mean(self, n, p)