def make_negative_binom_density(r, p, w, size_of_counts, left_most, for_plot=False):
    negative_binom_density_array = np.zeros(size_of_counts + 1, dtype=np.float128)
    dist1 = st.nbinom(r, p)
    f1 = dist1.pmf
    cdf1 = dist1.cdf
    dist2 = st.nbinom(r, 1 - p)
    f2 = dist2.pmf
    cdf2 = dist2.cdf
    negative_binom_norm = (cdf1(size_of_counts) - cdf1(left_most - 1)) * w + \
                          (cdf2(size_of_counts) - cdf2(left_most - 1)) * (1 - w)
    plot_norm = (cdf1(size_of_counts) - cdf1(4)) * w + \
                (cdf2(size_of_counts) - cdf2(4)) * (1 - w)
    for k in range(5, size_of_counts + 1):
        if for_plot:
            negative_binom_density_array[k] = (w * f1(k) + (1 - w) * f2(k)) / plot_norm
        else:
            negative_binom_density_array[k] = (w * f1(k) + (1 - w) * f2(k)) / negative_binom_norm
    return negative_binom_density_array
def bag_size_gen(self, num_bags, random_state, max_pts=None):
    if self.size_type == 'uniform':
        lo, hi = self.bag_sizes
        sizes = random_state.randint(low=lo, high=hi + 1, size=num_bags)
    elif self.size_type == 'neg-binom':
        # Do a negative binomial + 1 (in Wikipedia's notation),
        # so that sizes are a distribution on the positive integers.
        # mean = p r / (1 - p) + 1; var = (mean - 1) / (1 - p)
        mean, std = self.bag_sizes
        p = 1 - (mean - 1) / (std * std)
        assert 0 < p < 1
        r = (mean - 1) * (1 - p) / p
        assert r > 0
        # scipy swaps p and 1-p
        sizes = []
        if max_pts is not None:
            while max_pts > 0:
                size_bag = stats.nbinom(r, 1 - p).rvs(
                    size=1, random_state=random_state)[0] + 1
                max_pts = max_pts - size_bag
                if max_pts >= 0:
                    sizes.append(size_bag)
                else:
                    sizes.append(max_pts + size_bag)
        else:
            sizes = stats.nbinom(r, 1 - p).rvs(
                size=num_bags, random_state=random_state) + 1
    else:
        raise ValueError("unknown size_type {}".format(self.size_type))
    return sizes
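# Quick sanity check (illustrative, not part of the snippet above) of the
# (mean, std) -> (r, p) conversion, assuming only scipy.stats.nbinom and that
# mean > 1 and std**2 > mean - 1.
import scipy.stats as stats

mean, std = 10.0, 5.0                 # hypothetical target moments for bag size = NB + 1
p = 1 - (mean - 1) / (std * std)
r = (mean - 1) * (1 - p) / p
dist = stats.nbinom(r, 1 - p)         # scipy swaps p and 1 - p
print(dist.mean() + 1)                # ~10.0, the requested mean
print(dist.var() ** 0.5)              # ~5.0, the requested standard deviation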
def neg_binomial_3(Neg_Binom_Params):
    u = Neg_Binom_Params.valuesdict()
    model_value = (u['coef1']) * scs.nbinom(u['n1'], u['p1']).pmf(vals) + (
        u['coef2']) * scs.nbinom(u['n2'], u['p2']).pmf(vals) + (
        1 - u['coef1'] - u['coef2']) * scs.nbinom(u['n3'], u['p3']).pmf(vals)
    residuals = model_value - actual
    return residuals
def computeSumOfDensities(self, pBackgroundModel, pArgs, pXfoldMaxValue=None):
    background_nbinom = {}
    background_sum_of_densities_dict = {}
    max_value = 0
    fixateRange = int(pArgs.fixateRange)
    for distance in pBackgroundModel:
        max_value_distance = int(pBackgroundModel[distance][2])
        if max_value < int(pBackgroundModel[distance][2]):
            max_value = int(pBackgroundModel[distance][2])
        if pXfoldMaxValue is not None:
            max_value_distance *= pXfoldMaxValue
        if -int(pArgs.fixateRange) < distance and int(pArgs.fixateRange) > distance:
            background_nbinom[distance] = nbinom(pBackgroundModel[distance][0],
                                                 pBackgroundModel[distance][1])
            sum_of_densities = np.zeros(max_value_distance)
            for j in range(max_value_distance):
                if j >= 1:
                    sum_of_densities[j] += sum_of_densities[j - 1]
                sum_of_densities[j] += background_nbinom[distance].pmf(j)
            background_sum_of_densities_dict[distance] = sum_of_densities

    background_nbinom[fixateRange] = nbinom(pBackgroundModel[fixateRange][0],
                                            pBackgroundModel[fixateRange][1])
    sum_of_densities = np.zeros(max_value)
    for j in range(max_value):
        if j >= 1:
            sum_of_densities[j] += sum_of_densities[j - 1]
        sum_of_densities[j] += background_nbinom[fixateRange].pmf(j)
    background_sum_of_densities_dict[fixateRange] = sum_of_densities

    background_nbinom[-fixateRange] = nbinom(pBackgroundModel[-fixateRange][0],
                                             pBackgroundModel[-fixateRange][1])
    sum_of_densities = np.zeros(max_value)
    for j in range(max_value):
        if j >= 1:
            sum_of_densities[j] += sum_of_densities[j - 1]
        sum_of_densities[j] += background_nbinom[-fixateRange].pmf(j)
    background_sum_of_densities_dict[-fixateRange] = sum_of_densities

    min_key = min(background_sum_of_densities_dict)
    max_key = max(background_sum_of_densities_dict)
    for key in pBackgroundModel.keys():
        if key in background_sum_of_densities_dict:
            continue
        if key < min_key:
            background_sum_of_densities_dict[key] = background_sum_of_densities_dict[min_key]
        elif key > max_key:
            background_sum_of_densities_dict[key] = background_sum_of_densities_dict[max_key]
    return background_sum_of_densities_dict
def get_probability_density_func(self):
    """Calculates the probabilities for the NegativeBinomial x_values."""
    dist = nbinom(n=self.r, p=self.p)
    self.probabilities = dist.pmf(self.x_values)
def _reset_distribution(self):
    """
    https://stackoverflow.com/questions/40846992/
    alternative-parametrization-of-the-negative-binomial-in-scipy
    #comment109394209_47406400
    """
    self._distribution: rv_discrete = nbinom(self._r, 1 - self._p)
def print_stats(seqs):
    lens = get_lengths(seqs)
    m, v, p, r = dist_parameters(lens)
    print("mean\tvariance\tmedian\tr\tp")
    print("\t".join(map(str, [m, v, scipy.median(lens), r, p])))
    return lens, stats.nbinom(r, 1 - p).pmf
def N_test_neg_binom(
    num_obs_events: int,
    rupture_rate: float,
    prob_success: float,
    r_dispersion: float,
    conf_interval: float,
) -> dict:
    if r_dispersion < 1:
        logging.warning("Earthquake production temporally underdispersed, \n"
                        "switching to Poisson N-Test")
        return N_test_poisson(num_obs_events, rupture_rate, conf_interval)

    conf_min, conf_max = nbinom(r_dispersion, prob_success).interval(conf_interval)
    test_pass = conf_min <= num_obs_events <= conf_max
    test_res = "Pass" if test_pass else "Fail"
    logging.info(f"N-Test: {test_res}")

    test_result = {
        "conf_interval_pct": conf_interval,
        "conf_interval": (conf_min, conf_max),
        "inv_time_rate": rupture_rate,
        "n_obs_earthquakes": num_obs_events,
        "test_res": test_res,
        "test_pass": bool(test_pass),
    }
    return test_result
def test_zig_cdf():
    np.random.seed(0)
    x = st.nbinom(n=10, p=.1).rvs(size=100)
    Fx = scmodes.benchmark.gof._zig_cdf(x, size=1, log_mu=-5, log_phi=-1, logodds=-3)
    assert Fx.shape == x.shape
    assert (Fx >= 0).all()
    assert (Fx <= 1).all()
def _shannon_entropy(col, alpha, beta):
    if not isinstance(col, np.ndarray):
        col = np.array(col, dtype=float)
    if len(alpha) != len(beta):
        raise ValueError("alpha and beta must have the same length")
    priors = []
    for i in range(len(alpha)):
        priors.append(sp.nbinom(alpha[i], beta[i]))
    col = np.sort(np.around(col))
    if col.shape[0] == 1:
        return 0.0
    else:
        # apply prior scaling
        weights = np.zeros_like(col, dtype=float)
        for i in range(col.shape[0]):
            weights[i] = np.max([p.pmf(col[i]) for p in priors])
        freq = col * weights
        freq = freq / np.sum(freq)
        H = -1 * freq * np.log2(freq)
        return np.nansum(H)
def nb_iter(n, p):
    yield 0.0
    nb = nbinom(n, p)
    for i in count():
        pr = nb.pmf(i)
        if pr < 1e-5:
            break
        yield pr
def plot_negbinomial_fit(data, fit_results, title=None, x_label=None, x_range=None,
                         y_range=None, fig_size=(6, 5), bin_width=1, filename=None):
    """
    :param data: (numpy.array) observations
    :param fit_results: dictionary with keys "n", "p" and "loc"
    :param title: title of the figure
    :param x_label: label to show on the x-axis of the histogram
    :param x_range: (tuple) x range
    :param y_range: (tuple) y range (the histogram shows the probability density,
        so the upper value of y_range should be 1)
    :param fig_size: (tuple) figure size
    :param bin_width: bin width
    :param filename: filename to save the figure as
    """
    plot_fit_discrete(data=data,
                      dist=stat.nbinom(n=fit_results['n'], p=fit_results['p'], loc=fit_results['loc']),
                      label='Negative Binomial',
                      bin_width=bin_width,
                      title=title,
                      x_label=x_label,
                      x_range=x_range,
                      y_range=y_range,
                      fig_size=fig_size,
                      filename=filename)
def rpp(x, log_mu, log_phi, logodds, size, onehot, n_samples=1):
    # Important: these are n x 1
    n = onehot.dot(np.exp(-log_phi))
    pi0 = onehot.dot(sp.expit(-logodds))
    p = 1 / (1 + (size * onehot.dot(np.exp(log_mu + log_phi))))
    cdf = st.nbinom(n=n, p=p).cdf(x - 1)
    # Important: this excludes the right endpoint, so we need to special case x = 0
    cdf = np.where(x > 0, pi0 + (1 - pi0) * cdf, cdf)
    pmf = st.nbinom(n=n, p=p).pmf(x)
    pmf *= (1 - pi0)
    pmf[x == 0] += pi0[x == 0]
    u = np.random.uniform(size=(n_samples, x.shape[0]))
    # cdf and pmf are n x 1
    rpp = cdf.ravel() + u * pmf.ravel()
    return rpp
def predict(self, size=100):
    b, alpha, phi = self.theta_opt
    a = b * self.mu
    p = b / (1 + b)
    rv = nbinom(a, p).rvs(size=size)
    y50 = np.mean(rv)
    y25 = np.quantile(rv, 0.25)
    y90 = np.quantile(rv, 0.9)
    return y50, y25, y90
def choose(self):
    self.name = "Neg-binomial"
    if self.user_class == 'HF':
        peak_hours_for_number_of_requests_hf = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
        if self.hour in peak_hours_for_number_of_requests_hf:
            nbinom_n_size, nbinom_mu_mean = 0.470368548315641, 34.7861725808564
        else:
            nbinom_n_size, nbinom_mu_mean = 0.143761308534382, 14.158264589062
    elif self.user_class == 'HO':
        peak_hours_for_number_of_requests_ho = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
        if self.hour in peak_hours_for_number_of_requests_ho:
            nbinom_n_size, nbinom_mu_mean = 0.113993444740046, 1.04026982546095
        else:
            nbinom_n_size, nbinom_mu_mean = 0.0448640346452827, 0.366034837767499
    elif self.user_class == 'MF':
        peak_hours_for_number_of_requests_mf = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
        if self.hour in peak_hours_for_number_of_requests_mf:
            nbinom_n_size, nbinom_mu_mean = 0.758889839349924, 4.83390315655562
        else:
            nbinom_n_size, nbinom_mu_mean = 0.314653746175354, 3.22861572712093
    elif self.user_class == 'MO':
        peak_hours_for_number_of_requests_mo = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
        if self.hour in peak_hours_for_number_of_requests_mo:
            nbinom_n_size, nbinom_mu_mean = 0.177211316065872, 0.406726610288464
        else:
            nbinom_n_size, nbinom_mu_mean = 0.0536955764781434, 0.124289074773539
    elif self.user_class == 'LF':
        peak_hours_for_number_of_requests_lf = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
        if self.hour in peak_hours_for_number_of_requests_lf:
            nbinom_n_size, nbinom_mu_mean = 0.480203280455517, 0.978733578849008
        else:
            nbinom_n_size, nbinom_mu_mean = 0.240591506072217, 0.487956906502501
    elif self.user_class == 'LO':
        peak_hours_for_number_of_requests_lo = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
        if self.hour in peak_hours_for_number_of_requests_lo:
            nbinom_n_size, nbinom_mu_mean = 0.188551092877969, 0.111187768162793
        else:
            nbinom_n_size, nbinom_mu_mean = 0.0810585648991726, 0.0405013083716073
    else:
        raise Exception('The user class %s does not exist' % self.user_class)
    # From R's documentation: An alternative parametrization (often used in ecology) is by the
    # _mean_ 'mu', and 'size', the _dispersion parameter_, where 'prob' = 'size/(size+mu)'
    nbinom_prob = nbinom_n_size / (nbinom_n_size + nbinom_mu_mean)
    return nbinom(nbinom_n_size, nbinom_prob)
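# The comment above quotes R's ecology-style parametrization; a short check
# (illustrative, not from the original) that prob = size / (size + mu) reproduces
# the requested mean in scipy.
from scipy.stats import nbinom

size, mu = 0.470368548315641, 34.7861725808564   # e.g. the HF peak-hour parameters
prob = size / (size + mu)
dist = nbinom(size, prob)
print(dist.mean())   # ~34.786, i.e. mu
print(dist.var())    # mu + mu**2 / size, the usual NB mean-variance relation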
def generate_onset_to_reporting_distribution_brauner():
    """Build onset-to-reporting distribution"""
    # Distribution used by [Brauner et al., 2020]
    mu = 5.25
    alpha = 1.57
    distrb = nbinom(n=1 / alpha, p=1 - alpha * mu / (1 + alpha * mu))
    x = range(int(distrb.ppf(1 - 1e-6)))
    return distrb.pmf(x)
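# Illustrative check (not in the original) that the (mu, alpha) parametrization
# above recovers the intended mean of 5.25 days.
from scipy.stats import nbinom

mu, alpha = 5.25, 1.57
distrb = nbinom(n=1 / alpha, p=1 - alpha * mu / (1 + alpha * mu))
print(distrb.mean())   # ~5.25
print(distrb.var())    # mu * (1 + alpha * mu), overdispersed relative to Poisson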
def test_rvs(self):
    vals = stats.nbinom.rvs(10, 0.75, size=(2, 50))
    assert_(numpy.all(vals >= 0))
    assert_(numpy.shape(vals) == (2, 50))
    assert_(vals.dtype.char in typecodes['AllInteger'])
    val = stats.nbinom.rvs(10, 0.75)
    assert_(isinstance(val, int))
    val = stats.nbinom(10, 0.75).rvs(3)
    assert_(isinstance(val, numpy.ndarray))
    assert_(val.dtype.char in typecodes['AllInteger'])
def test_rvs(self):
    vals = stats.nbinom.rvs(10, 0.75, size=(2, 50))
    assert numpy.all(vals >= 0)
    assert numpy.shape(vals) == (2, 50)
    assert vals.dtype.char in typecodes["AllInteger"]
    val = stats.nbinom.rvs(10, 0.75)
    assert isinstance(val, int)
    val = stats.nbinom(10, 0.75).rvs(3)
    assert isinstance(val, numpy.ndarray)
    assert val.dtype.char in typecodes["AllInteger"]
def _ebpm_point_gamma_update(theta, x, s):
    logodds, a, b = theta
    p = sp.expit(logodds)
    nb_lik = st.nbinom(n=a, p=1 / (1 + s / b)).pmf(x)
    z = np.where(x < 1, p * nb_lik / (1 - p + p * nb_lik), 1)
    pm = (x + a) / (s + b)
    plm = sp.digamma(x + a) - np.log(s + b)
    logodds = np.log(z.sum()) - np.log((1 - z).sum() + 1e-16)
    b = a * z.sum() / (z * pm).sum()
    a = _ebpm_point_gamma_update_a(a, z, plm, b)
    return np.array([logodds, a, b])
def nbinSim(*ps):
    '''
    Test negative binomial model.
    ps[0] - mean of nbinom distribution
    ps[1] - aggregation k factor.
    '''
    p = ps[0] / (ps[1] + ps[0])
    if p == 0:
        return np.zeros(1000)
    else:
        return stats.nbinom(n=ps[0], p=p).rvs(size=1000)
def test_Geometric_to_NBinom(self):
    exp_list, obs_list = [], []
    X = Geometric(p=0.8)
    sims = X.sim(Nsim)
    simulated = sims.tabulate()
    for k in range(1, 10):
        expected = Nsim * stats.nbinom(n=1, p=0.8).pmf(k - 1)
        if expected > 5:
            exp_list.append(expected)
            obs_list.append(simulated[k])
    pval = stats.chisquare(obs_list, exp_list).pvalue
    self.assertTrue(pval > 0.01)
def test_NBinom_Pascal_additive(self):
    exp_list, obs_list = [], []
    X, Y = RV(Pascal(r=4, p=0.6) * Pascal(r=6, p=0.6))
    sims = (X + Y).sim(Nsim)
    simulated = sims.tabulate()
    for k in range(10, 35):
        expected = Nsim * stats.nbinom(n=10, p=0.6).pmf(k)
        if expected > 5:
            exp_list.append(expected)
            obs_list.append(simulated[k])
    pval = stats.chisquare(obs_list, exp_list).pvalue
    self.assertTrue(pval > .01)
def simulate_nb_gamma():
    np.random.seed(1)
    n = 100
    p = 5
    s = 1e5 * np.ones((n, 1))
    theta = 0.2
    log_mu = np.random.uniform(-12, -6, size=(1, p))
    log_phi = np.random.uniform(-6, 0, size=(1, p))
    G = st.gamma(a=np.exp(-log_phi), scale=np.exp(log_mu + log_phi))
    lam = G.rvs(size=(n, p))
    x = st.nbinom(n=1 / theta, p=1 / (1 + s * lam * theta)).rvs()
    return x, s, log_mu, log_phi, theta
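# Note (illustrative, not part of the simulation code above): with n = 1/theta and
# p = 1/(1 + s*lam*theta), the negative binomial draw has marginal mean s*lam, as a
# direct check with hypothetical scalar values confirms.
import scipy.stats as st

s, lam, theta = 1e5, 1e-4, 0.2
d = st.nbinom(n=1 / theta, p=1 / (1 + s * lam * theta))
print(d.mean())   # s * lam = 10.0
print(d.var())    # s * lam * (1 + s * lam * theta) = 30.0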
def demand_quantile(self, my_percentile=0):
    if my_percentile == 0:
        my_percentile = self.percentile
    for sku_id in self.sku_list:
        for dc_id in self.dc_list:
            row = self.distribution.loc[
                (self.distribution.item_sku_id == sku_id)
                & (self.distribution.dc_id == dc_id)]
            dist_type = row.dist_type.iloc[0]
            para1 = row.para1.astype(float).iloc[0]
            para2 = row.para2.astype(float).iloc[0]
            if dist_type == 'N':
                ng_bi = sp.nbinom(para1, para2)
                if dc_id == 0:
                    self.dR_it[sku_id - 1] = np.ceil(ng_bi.ppf(my_percentile))
                else:
                    self.d_ijt[sku_id - 1, dc_id - 1] = np.ceil(ng_bi.ppf(my_percentile))
            elif dist_type == 'G':
                g = sp.gamma(para1, scale=para2)
                if dc_id == 0:
                    self.dR_it[sku_id - 1] = np.ceil(g.ppf(my_percentile))
                else:
                    self.d_ijt[sku_id - 1, dc_id - 1] = np.ceil(g.ppf(my_percentile))
    # An alternative, commented out in the original, assigns deterministic demand from the
    # distribution means instead: para1 * (1 - para2) / para2 for 'N' and para1 * para2 for 'G'.
    return (self.dR_it, self.d_ijt)
def _calc_negbinom(self, domain_matrix):
    sigs = []
    means = [np.mean(self.hicmap.diagonal(i)) for i in range(self.hicmap.shape[0])]
    lens = [len(self.hicmap.diagonal(i)) for i in range(self.hicmap.shape[0])]

    def sum_mean(i, j):
        """Counts the mean across several consecutive diagonals in the hicmap"""
        s = sum([m * l for (m, l) in list(zip(means, lens))[i:j]])
        l = sum(lens[i:j])
        try:
            return s / l
        except ZeroDivisionError:
            return 0

    def sum_var(i, j, m):
        """Counts the variance in several consecutive diagonals given their mean"""
        mses = [np.mean((self.hicmap.diagonal(d) - m) ** 2) for d in range(i, j)]
        s = sum([mse * l for (mse, l) in zip(mses, lens[i:j])])
        l = sum(lens[i:j])
        try:
            return s / l
        except ZeroDivisionError:
            return 0

    pvalue_matrix = np.ones(domain_matrix.shape)
    for i in range(domain_matrix.shape[0]):
        li = self.domains[i][1] - self.domains[i][0] + 1
        for j in range(i + 1, domain_matrix.shape[1]):
            lj = self.domains[j][1] - self.domains[j][0] + 1
            dist = self.domains[j][0] - self.domains[i][1]
            span = self.domains[j][1] - self.domains[i][0] + 1
            expected = sum_mean(dist, span)
            var = sum_var(dist, span, expected)
            mean = expected * li * lj
            if var < mean:
                var = mean + 1
            k = domain_matrix[i][j]
            r = mean ** 2 / (var - mean) if (var - mean) != 0 else np.nan
            p = (var - mean) / var if var != 0 else np.nan
            model = ss.nbinom(n=r, p=1 - p)
            if expected and k:
                pval = model.sf(k)
                pvalue_matrix[i, j] = pval
                if pval < self.threshold:
                    sigs.append((i, j, pval))
    return sigs, self._fdr_correct(pvalue_matrix, domain_matrix.shape)
def update_umi(attr, old, new):
    selected = ind_data.selected['1d']['indices']
    with sqlite3.connect(db) as conn:
        if selected:
            ind = ind_data.data['ind'][selected[0]]
            print("Selected {}, {}".format(ind, gene))
            umi = pd.read_sql(
                """select umi.value, annotation.size from annotation, umi
                   where umi.gene == ? and annotation.chip_id == ?
                   and umi.sample == annotation.sample""",
                con=conn, params=(gene, ind))
            keep = umi['value'] < 19
            edges = np.arange(20)
            counts, _ = np.histogram(umi['value'].values, bins=edges)
            umi_data.data = bokeh.models.ColumnDataSource.from_df(
                pd.DataFrame({'left': edges[:-1], 'right': edges[1:], 'count': counts}))
            params = pd.read_sql(
                'select log_mean, log_disp, logodds from params where gene == ? and ind == ?',
                con=conn, params=(gene, ind))
            n = np.exp(params['log_disp'])
            p = 1 / (1 + np.outer(umi['size'], np.exp(params['log_mean'] - params['log_disp'])))
            assert (n > 0).all(), 'n must be non-negative'
            assert (p >= 0).all(), 'p must be non-negative'
            assert (p <= 1).all(), 'p must be <= 1'
            G = st.nbinom(n=n.values.ravel(), p=p.ravel()).pmf
            grid = np.arange(19)
            pmf = np.array([G(x).mean() for x in grid])
            if params.iloc[0]['logodds'] is not None:
                pmf *= sp.expit(-params['logodds']).values
                pmf[0] += sp.expit(params['logodds']).values
            exp_count = umi.shape[0] * pmf
            dist_data.data = bokeh.models.ColumnDataSource.from_df(
                pd.DataFrame({'x': .5 + grid, 'y': exp_count}))
        else:
            umi_data.data = bokeh.models.ColumnDataSource.from_df(
                pd.DataFrame(columns=['left', 'right', 'count']))
            dist_data.data = bokeh.models.ColumnDataSource.from_df(
                pd.DataFrame(columns=['x', 'y']))
def _consultations(self):
    """
    Calculates the expected number of consultations each day
    """
    self.E = np.zeros(self.Y.size[0])
    self.Cdist = np.empty(self.Y.size[0], dtype=nbinom)
    for i in range(1, self.Y.size[0]):
        # incident cases Z_i = S(i-1) - S(i)
        # expected consultations for Covid-19: E_i = prob * Z_i
        self.E[i] = (self.Y[i - 1, 0] - self.Y[i, 0]) * self.care_probability
        r = pow(self.E[i], self.delta)
        self.Cdist[i] = nbinom(n=r, p=self.E[i] / (r + self.E[i]))
def simulate_point_gamma():
    x, s, log_mu, log_phi, _ = _simulate_gamma()
    n, p = x.shape
    logodds = np.random.uniform(-3, -1, size=(1, p))
    pi0 = sp.expit(logodds)
    z = np.random.uniform(size=x.shape) < pi0
    y = np.where(z, 0, x)
    F = st.nbinom(n=np.exp(-log_phi), p=1 / (1 + s.dot(np.exp(log_mu + log_phi))))
    llik_nonzero = np.log(1 - pi0) + F.logpmf(y)
    llik = np.where(y < 1, np.log(pi0 + np.exp(llik_nonzero)), llik_nonzero).sum()
    return y, s, log_mu, log_phi, logodds, llik
def test_ebpm_gamma_extrapolate(simulate_gamma):
    x, s, log_mu, log_phi, _ = simulate_gamma
    # Important: log_mu, log_phi are [1, p]. We want oracle log likelihood for
    # only gene 0
    oracle_llik = st.nbinom(
        n=np.exp(-log_phi[0, 0]),
        p=1 / (1 + s.dot(np.exp(log_mu[0, 0] + log_phi[0, 0])))).logpmf(x[:, 0]).sum()
    log_mu_hat, neg_log_phi_hat, llik = scmodes.ebpm.ebpm_gamma(
        x[:, 0], s.ravel(), extrapolate=True)
    assert np.isfinite(log_mu_hat)
    assert np.isfinite(neg_log_phi_hat)
    assert llik > oracle_llik
def find_high_lim(self):
    """
    Finds the high interval to use in calculations for the variable basis
    and univariate norm squared values.
    """
    low_percent = 8e-17
    high_percent = 1 - low_percent
    stand_dist = nbinom(n=self.r, p=self.p)
    high = np.ceil(stand_dist.ppf(high_percent))
    low = np.floor(stand_dist.ppf(low_percent))
    self.x_values = np.arange(low, high + 1)
def gridOptimFlanks(fitparams, quantiles=(.05, .5), toquantiles=(.5, .99), thres=.1, p=[0, 1], n=[0, 500]):
    # The central distribution:
    fit = nbinom(fitparams[0], fitparams[1])

    # The squared difference between the quantiles:
    def ssq(n, p, quantiles, toquantiles):
        this_fit = nbinom(n, p)
        return np.sum([(this_fit.ppf(quantiles[i]) - fit.ppf(toquantiles[i])) ** 2
                       for i in range(len(quantiles))])

    previous, this = (np.mean(n), np.mean(p)), (0.001, 0.001)
    N = np.linspace(n[0], n[1], 100)
    P = np.linspace(p[0], p[1], 100)
    iter = 0
    while abs(ssq(previous[0], previous[1], quantiles, toquantiles)
              - ssq(this[0], this[1], quantiles, toquantiles)) > thres:
        iter += 1
        print(str("Iteration # %s" % str(iter)).ljust(15, " ") + "|", end="")
        previous = this[:]
        dist = np.full((100, 100), np.nan)
        for i, ni in enumerate(N):
            for j, pj in enumerate(P):
                dist[i, j] = ssq(ni, pj, quantiles, toquantiles)
        nId = np.where(dist == np.nanmin(dist))[0]
        pId = np.where(dist == np.nanmin(dist))[1]
        this = (np.mean(N[nId]), np.mean(P[pId]))
        nMin = N[nId[0] - 10 if nId[0] - 10 > 0 else 0]
        nMax = N[nId[-1] + 10 if nId[-1] + 10 < 100 else 99]
        pMin = P[pId[0] - 10 if pId[0] - 10 > 0 else 0]
        pMax = P[pId[-1] + 10 if pId[-1] + 10 < 100 else 99]
        # Adjust the edges:
        if pMin == min(P):
            pMin = min(P) * 0.8
        if pMin == 0:
            pMin = 0.0001
        if pMax == max(P):
            pMax = max(P) * 1.5
        if pMax > 1:
            pMax = 1
        if nMin == min(N):
            nMin = min(N) * 0.8
        if nMax == max(N):
            nMax = max(N) * 1.2
        N = np.linspace(nMin, nMax, 100)
        P = np.linspace(pMin, pMax, 100)
        print(str(" Current parameter state %s" % str(round(this[0], 3))).ljust(34, " ") + ";", end="")
        print(str(" %s" % str(round(this[1], 3))).ljust(8, " ") + "|", end="")
        print(str(" SSE = %s" % str(round(np.nanmin(dist), 3))).ljust(14, " ") + "|", end="\n")
    return this
def density_nb(expanded, r, mean, strand):
    '''
    expanded is the list of values.
    r, p follows the description in
    http://en.wikipedia.org/wiki/Negative_binomial_distribution
    mean = p*r/(1-p)
    mode = floor( p(r-1)/(1-p) )
    '''
    mean = float(mean)
    r = float(r)
    p = mean / (mean + r)
    mode = np.floor(p * (r - 1) / (1 - p))
    p = 1 - p  # conform to the scipy definition
    nbinom = scist.nbinom(r, p)
    modep = nbinom.pmf(mode)
    factor = 1 / modep
    leftwin = mode
    rightwin = mean * 3
    if strand == '-':
        leftwin, rightwin = rightwin, leftwin
    out = np.zeros_like(expanded)
    for i in range(expanded.shape[0]):
        count = expanded[i]
        if count > 0:
            start = max(0, i - leftwin)
            end = min(expanded.shape[0], i + rightwin)
            for j in range(int(start), int(end)):
                k = j - i + mode
                if strand == '-':
                    k = mode - j + i
                out[j] += factor * count * nbinom.pmf(k)
    expanded.resize(100000, refcheck=False)
    expanded.resize(0, refcheck=False)
    return out
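# Illustrative check (not from the original) of the docstring's mode formula,
# mode = floor(p * (r - 1) / (1 - p)), using scipy's parametrization as above.
import numpy as np
import scipy.stats as scist

r, mean = 5.0, 10.0
p = mean / (mean + r)
mode = np.floor(p * (r - 1) / (1 - p))          # 8.0 for these values
dist = scist.nbinom(r, 1 - p)
print(dist.pmf(mode) >= dist.pmf(mode + 1))     # True: no later value is more probable
print(dist.pmf([mode - 1, mode, mode + 1]))     # pmf peaks at (or ties at) the mode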
def likelihood(self, event_mark, dt):
    r = np.array(event_mark * self.r * event_mark.T)
    p = np.array(event_mark * self.p * event_mark.T)
    return stats.nbinom(r, 1. - p).pmf(dt.astype(int))
def dist(self, i, j):
    p = self.p[i, j]
    r = self.r[i, j]
    return stats.nbinom(r, 1. - p)
# Poisson-distribution expected frequencies :
px = poisson(num, mu)          # compute probabilities
ps_expfreq = px * len(ants)    # computes expected frequencies

#===============================================================================
# MLE estimation for negative-binomial distribution :

# Starting values for (r, p)
r0 = (mu + mu**2) / sigma**2
p0 = r0 / (mu + r0)

out = minimize(negbinlike, [r0, p0], args=(ants,), method='L-BFGS-B')
r, p = out['x']                # MLE

nbin = nbinom(r, p)            # n-bin object
bx = nbin.pmf(num)             # probabilities
nb_expfreq = bx * len(ants)    # expected frequency

#===============================================================================
# plotting :

fig = figure()
ax = fig.add_subplot(111)

ax.hist(ants, max(ants), color='0.4', histtype='stepfilled')
ax.plot(num + .5, nb_expfreq, 'ko', label='Neg-Bin expected freq')
ax.plot(num + .5, ps_expfreq, 'rs', label='Poisson expected freq')
ax.set_xlabel('# fireants per 50-meter square plot')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Fire-Ant Hill Counts')
ax.legend(loc='center right')
def make_nbinom(mu, sigmasq):
    p = 1.0 - mu / sigmasq
    r = mu * (1.0 - p) / p
    return nbinom(r, 1 - p)
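# Brief check (illustrative) that make_nbinom matches the requested moments,
# valid whenever sigmasq > mu.
from scipy.stats import nbinom

def make_nbinom(mu, sigmasq):         # copied from above for a self-contained example
    p = 1.0 - mu / sigmasq
    r = mu * (1.0 - p) / p
    return nbinom(r, 1 - p)

d = make_nbinom(4.0, 10.0)
print(d.mean())   # 4.0
print(d.var())    # 10.0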
dist0 = stats.nbinom(n, p)
y = dist0.rvs(size=nobs)
x = np.ones(nobs)
"""
y = len_list
x = np.ones(len(len_list))

loglike_method = 'nb1'  # or use 'nb2'
res = sm.NegativeBinomial(y, x, loglike_method=loglike_method).fit(start_params=[0.1, 0.1])

# print(dist0.mean())
print(res.params)

mu = res.predict()              # use this for mean if not constant
mu = np.exp(res.params[0])      # shortcut, we just regress on a constant
alpha = res.params[1]

if loglike_method == 'nb1':
    Q = 1
elif loglike_method == 'nb2':
    Q = 0

size = 1. / alpha * mu**Q
prob = size / (size + mu)
# print('data generating parameters', n, p)
print('estimated params ', size, prob)

# estimated distribution
dist_est = stats.nbinom(size, prob)
classifier.likelihood('bad anchortext', bad_anchortext, p_ifyes=0.005, p_ifno=0.3)

def good_linkcontext(doc):
    pat = re.compile(r'penultimate|draft|forthcoming')
    return pat.search(doc.link.context.lower())

classifier.likelihood('good link context', good_linkcontext, p_ifyes=0.2, p_ifno=0.05)

def course_words(doc):
    # note: 'course' is also common in 'of course', 'in the course of',
    # 'essay' is common in discussions of Locke
    pat = re.compile(r'seminar|schedule|readings|textbook|students|handout|\bweek|hours/', re.I)
    # normalize all measures to 10000 word documents (i.e., here we
    # return the number of matches per 10000 words):
    return int(len(pat.findall(doc.content)) * 10000 / doc.numwords)

classifier.likelihood('course note words', course_words,
                      p_ifyes=nbinom(1, 0.8), p_ifno=nbinom(2, 0.2))

def paper_words(doc):
    pat = re.compile(r'in section|finally,', re.I)
    return int(len(pat.findall(doc.content)) * 10000 / doc.numwords)

classifier.likelihood('typical paper words', paper_words,
                      p_ifyes=nbinom(2, 0.3), p_ifno=nbinom(1, 0.6))

def interview_words(doc):
    pat = re.compile(r'interview|do you', re.I)
    return int(len(pat.findall(doc.content)) * 10000 / doc.numwords)

classifier.likelihood('interview words', interview_words,
                      p_ifyes=nbinom(1, 0.8), p_ifno=nbinom(1, 0.2))

def verbs(doc):
    # bibliographies and other lists don't contain many verbs
def in_beginning(regex):
    reg = re.compile(regex, re.I)
    def check(doc):
        if not doc.content:
            return Ellipsis
        beginning = doc.content[:5000]
        return reg.search(beginning)
    return check

# =========================================================================

bookfilter = BinaryNaiveBayes(prior_yes=0.2)

bookfilter.likelihood('numwords', length,
                      p_ifyes=nbinom(7, 0.0001), p_ifno=nbinom(1, 0.0001))

# TODO: add more features? "Acknowledgements" section? Occurrences of
# "this book"? TOC? Index? ...

# =========================================================================

chapterfilter = BinaryNaiveBayes(prior_yes=0.2)

chapterfilter.likelihood('numwords', length,
                         p_ifyes=nbinom(2, 0.0002), p_ifno=nbinom(3, 0.0002))

chapterfilter.likelihood('"chapter" occurs in link context', in_context('chapter'),
                         p_ifyes=0.7, p_ifno=0.05)

# TODO: add features?