def fit_beta_mixture(Y, gamma, n_itr=100):
    Y = Y / gamma
    Y = Y[(Y > 0) & (Y < 1)]
    #plt.figure(figsize=[12,5])
    #sns.distplot(Y)
    #plt.show()
    theta = np.array([1, 10, 2, 5, 0.5])
    w = np.zeros([Y.shape[0], 2], dtype=float)
    if_converge = False
    for i in range(n_itr):
        # copy so the convergence check compares against the previous iterate
        theta_old = theta.copy()
        # E step
        w[:, 0] = beta.pdf(Y, theta[0], theta[1]) * theta[4]
        w[:, 1] = beta.pdf(Y, theta[2], theta[3]) * (1 - theta[4])
        w = (w.T / w.sum(axis=1)).T
        # M step
        rand_idx = np.random.binomial(1, w[:, 0], size=Y.shape[0])
        theta[4] = w[:, 0].mean()
        print(np.sum(rand_idx == 1))
        theta[0], theta[1], _, _ = beta.fit(Y[rand_idx == 1], loc=0, scale=1)
        theta[2], theta[3], _, _ = beta.fit(Y[rand_idx == 0], loc=0, scale=1)
        if np.linalg.norm(theta - theta_old) < 1e-8:
            if_converge = True
            break
    return theta, if_converge

def get_outliers(data, filter, plotting):
    if plotting:
        for x, r in [("x1", (0, 1)), ("x2", (0, 30)), ("x3", (0, 1))]:
            plt.violinplot(data[x], vert=False)
            plt.xlim(r)
            plt.savefig("plots/violin/%s.png" % x)
            plt.clf()

    if filter:
        data_fl = data[data["class"] == 0]
    else:
        data_fl = data

    pdf = pd.DataFrame({})
    a, b, loc, scale = beta.fit(data_fl["x1"])
    pdf["x1"] = beta.logpdf(data["x1"], a, b, loc=loc, scale=scale)
    a, loc, scale = gamma.fit(data_fl["x2"])
    pdf["x2"] = gamma.logpdf(data["x2"], a, loc=loc, scale=scale)
    a, b, loc, scale = beta.fit(data_fl["x3"])
    pdf["x3"] = beta.logpdf(data["x3"], a, b, loc=loc, scale=scale)
    pdfs = pdf["x1"] + pdf["x2"] + pdf["x3"]

    if plotting:
        sns.boxplot(y=pdfs, x="class", data=data)
        plt.savefig("plots/boxplot.png")
        plt.clf()

    if plotting:
        plt.plot(np.sort(pdfs))
        # candidate threshold lines at a few order statistics of the joint log-density
        splits = [(60, 'k-'), (50, 'k.'), (45, 'k--'), (40, 'k--')]
        for idx, style in splits:
            split = np.sort(pdfs)[idx]
            plt.plot((0, 1000), (split, split), style, lw=0.5)
        plt.savefig("plots/thresholds.png")
        plt.clf()

    outliers = np.argsort(pdfs)
    final = []
    for outlier in outliers:
        if data["class"][outlier] == -1:
            final.append(outlier)
    return np.array(final[:100])

def organize_data():
    df = pd.read_csv('2016stats.csv')
    df = df[df['3PA'] > 20]
    df['3P%'] = df['3P%'] / 100
    # fit the beta prior once; with loc and scale fixed, only a and b are estimated
    a, b, _, _ = beta.fit(list(df['3P%']), floc=0, fscale=1)
    df['3PEstimate'] = (df['3PM'] + a) / (df['3PA'] + a + b)
    df['a'] = df['3PM'] + a
    df['b'] = df['3PA'] - df['3PM'] + b
    print('alpha: ' + str(a))
    print('beta: ' + str(b))
    return (df, a, b)

def phase1(state):
    """
    if state['pulls-left'] % 100 == 0:
        print(state['pulls-left'])
    """
    if state['pulls-left'] == 9000:
        mikeys_ducks_info['alph-beta-scal'] = []
        for i in range(100):
            alph, beta, _, scal = beta_mod.fit(mikeys_ducks_info['payoffs'][i])
            mikeys_ducks_info['alph-beta-scal'].append(
                ((scal * alph) / (alph + beta), alph, beta, scal, i))
    elif state['pulls-left'] == 10000:
        mikeys_ducks_info['costs'] = [0 for i in range(100)]
        mikeys_ducks_info['metadata'] = ['00000000' for i in range(100)]
        mikeys_ducks_info['payoffs'] = [[] for i in range(100)]

    if check_key(state, 'last-cost'):
        last_pull = mikeys_ducks_info['last-pull']
        mikeys_ducks_info['utility'] += (state['last-payoff'] - state['last-cost'])
        mikeys_ducks_info['costs'][last_pull] = state['last-cost']
        mikeys_ducks_info['metadata'][last_pull] = state['last-metadata']
        mikeys_ducks_info['payoffs'][last_pull].append(state['last-payoff'])
        if len(mikeys_ducks_info['payoffs'][last_pull]) >= 1000:
            mikeys_ducks_info['machines-done'].add(last_pull)
        if check_key(mikeys_ducks_info, 'alph-beta-scal') and state['pulls-left'] % 10 == 0:
            alph, beta, _, scal = beta_mod.fit(
                mikeys_ducks_info['payoffs'][last_pull])
            mikeys_ducks_info['alph-beta-scal'][last_pull] = (
                scal * alph / (alph + beta), alph, beta, scal, last_pull)

    move = {}
    move['team-code'] = state['team-code']
    move['game'] = 'phase_1'
    if state['pulls-left'] > 9000:
        move['pull'] = int((10000 - state['pulls-left']) / 10)
    else:
        best_prof, best_ind = get_best_profit_index()
        move['pull'] = best_ind
    mikeys_ducks_info['last-pull'] = move['pull']

    if state['pulls-left'] == 1:
        mikeys_ducks_info['machines-done'].add(move['pull'])
        mikeys_ducks_info['auctions'] = sorted(
            list(mikeys_ducks_info['machines-done']),
            key=lambda x: -1 * (mikeys_ducks_info['alph-beta-scal'][x][0] -
                                mikeys_ducks_info['costs'][x]))
    return move

def organize_data(season):
    df = pd.read_csv('Data/Seasons_Stats.csv')
    df = df[df['Year'] == season]
    df = remove_duplicate_players(df)
    df = df[['Player', '3P', '3PA', '3P%', 'Tm', 'Year']]
    df = df[df['3PA'] > 20]
    # fit the beta prior once; with loc and scale fixed, only a and b are estimated
    a, b, _, _ = beta.fit(list(df['3P%']), floc=0, fscale=1)
    df['3PEstimate'] = (df['3P'] + a) / (df['3PA'] + a + b)
    df['a'] = df['3P'] + a
    df['b'] = df['3PA'] - df['3P'] + b
    print('alpha: ' + str(a))
    print('beta: ' + str(b))
    return (df, a, b)

def test_draw_samples(self, dtype, a_shape, a_is_samples, b_shape, b_is_samples,
                      rv_shape, num_samples):
    # Note: Tests above have been commented as they are very slow to run.
    # Note: Moved random number generation to here as the seed wasn't set if used above
    a = np.random.uniform(0.5, 2, size=a_shape)
    b = np.random.uniform(0.5, 2, size=b_shape)

    n_dim = 1 + len(rv_shape)
    a_np = numpy_array_reshape(a, a_is_samples, n_dim)
    b_np = numpy_array_reshape(b, b_is_samples, n_dim)
    rv_samples_np = np.random.beta(a_np, b_np, size=(num_samples,) + rv_shape)

    var = Beta.define_variable(shape=rv_shape, dtype=dtype, rand_gen=None).factor
    a_mx = mx.nd.array(a, dtype=dtype)
    if not a_is_samples:
        a_mx = add_sample_dimension(mx.nd, a_mx)
    b_mx = mx.nd.array(b, dtype=dtype)
    if not b_is_samples:
        b_mx = add_sample_dimension(mx.nd, b_mx)
    variables = {var.a.uuid: a_mx, var.b.uuid: b_mx}
    rv_samples_rt = var.draw_samples(F=mx.nd, variables=variables, num_samples=num_samples)

    assert np.issubdtype(rv_samples_rt.dtype, dtype)
    assert is_sampled_array(mx.nd, rv_samples_rt)
    assert get_num_samples(mx.nd, rv_samples_rt) == num_samples

    rtol, atol = 1e-1, 1e-1
    from itertools import product
    fits_np = [beta.fit(rv_samples_np[:, i, j])[0:2]
               for i, j in product(*map(range, rv_shape))]
    fits_rt = [beta.fit(rv_samples_rt.asnumpy()[:, i, j])[0:2]
               for i, j in product(*map(range, rv_shape))]
    assert np.allclose(fits_np, fits_rt, rtol=rtol, atol=atol)

def get_surprisal_continuous(infile):
    """
    Generates a continuous probability distribution for the surprisal metrics.

    :param infile: str; path to file containing sequences of discrete values.
    :return:
    """
    with open(infile, "r", encoding="utf8") as F:
        sims = np.array([
            j for i in tqdm(F.readlines(), unit_scale=True,
                            desc=os.path.basename(infile))
            for j in np.fromstring(i.replace("nan", "").strip(), sep=" ",
                                   dtype=np.float32)
            if i.strip()
        ])
    sims = np.clip(sims, 0, 1)
    print(sims.shape)
    print("Fitting beta distribution to data.")
    sims = np.random.choice(sims, size=20_000_000, replace=False)
    B = beta.fit(sims)
    print(B)
    with open(f"../out/{os.path.basename(infile)}", "w", encoding="utf8") as F:
        F.write(f"beta distribution\n{B}")
    return 0

def _fit_beta(self, X):
    """Fit the beta parameters to the data.
    """
    self.loc = np.min(X)
    self.scale = np.max(X) - self.loc
    # loc and scale are passed as starting guesses (not fixed) for the MLE
    self.a, self.b, _, _ = beta.fit(X, loc=self.loc, scale=self.scale)
    self.model = self._get_model()

def make_first_set_of_plots():
    N = 1000
    x = zeros(shape=(N, ), dtype=float)
    t = None
    tmax = 10
    axis([0, tmax, 0, 1])
    for i in range(N):
        t, y = random_walk(0.25, tmax, 0.01, t)
        x[i] = y[-1]
        if (i < 3):
            plot(t, (y + 1) / 2.0)
    xlabel("time")
    ylabel("CTR")
    savefig("random_walk.png")
    clf()

    subplot(211)
    hist((x + 1) / 2, bins=50)
    ylabel("Monte carlo results")

    subplot(212)
    best_fit = beta.fit((x + 1) / 2, floc=0, fscale=1)
    print(best_fit)
    ctr = arange(0, 1, 0.001)
    plot(ctr, beta(1, 4).pdf(ctr), label="Invariant distribution, beta(1,4)")
    plot(ctr, beta(best_fit[0], best_fit[1]).pdf(ctr),
         label="Best fit, beta(" + str(best_fit[0]) + "," + str(best_fit[1]) + ")")
    xlabel("CTR at t=" + str(tmax))
    ylabel("pdf")
    legend()
    savefig("long_term_random_walk_result.png")

def vcf_graph(rows, pop, binsize, title, filename):
    expx = rows["ac_%s" % pop].divide(rows["an_%s" % pop])
    expx = expx[((expx > 0) & (expx < 1))]
    alphax, betax, _, _ = beta.fit(expx, floc=0, fscale=1)
    x = np.arange(0, 1, binsize)
    binx = [(x[i + 1] + x[i]) / 2 for i in range(len(x) - 1)]
    y = [btdtr(alphax, betax, x[i + 1]) - btdtr(alphax, betax, x[i])
         for i in range(len(x) - 1)]
    fig = go.Figure()
    fig.add_trace(
        go.Histogram(x=expx, histnorm='probability', name="Experimental",
                     autobinx=False, xbins=dict(start=0, end=1, size=binsize),
                     opacity=.9))
    fig.add_trace(go.Bar(x=binx, y=y, name="Theory", opacity=.9))
    fig.update_layout(autosize=False, width=800, height=600,
                      yaxis=go.layout.YAxis(title_text="P(x)", range=[0, 1]),
                      xaxis=go.layout.XAxis(title_text="x", range=[0, 1]),
                      title_text=title, legend_orientation="h")
    #fig.write_image(filename)
    ksexp = kstest(expx, 'beta', args=(alphax, betax))
    ksneut = kstest('beta', False, args=(alphax, betax), N=expx.size)
    return (alphax, betax, ksexp.statistic, ksexp.pvalue,
            ksneut.statistic, ksneut.pvalue, expx.size)

def ebb_fit_prior(x, n, method='mm', start=(0.5, 0.5)):
    p = x / n
    if method == 'mm':
        # method of moments: invert the beta mean/variance relations
        mu, sig = np.mean(p), np.var(p)
        a = ((1 - mu) / sig - 1 / mu) * mu**2
        b = a * (1 / mu - 1)
        fitted_prior = Beta(a, b)
    elif method == 'mle':
        # starting value
        # if (np.isnan(start)):
        #     mm_est = ebb_fit_prior(x, n, 'mm')
        #     start = (mm_est.alpha, mm_est.beta)
        #     #print(start)

        # negative log-likelihood f(a, b); kept for reference, scipy's fit() is used instead
        def likelihood(pars):
            return -np.sum(beta_dist.logpdf(p, pars[0], pars[1]))

        # optimization function: over a series of params, optimise likelihood
        # outp = minimize(likelihood, x0=start, method='BFGS')
        # fitted_prior = Beta(outp.x[0], outp.x[1])
        a, b, *ls = beta_dist.fit(p)
        fitted_prior = Beta(a, b)
    else:
        raise ValueError('Method should be MM or MLE')
    return fitted_prior

def generate_image(self):
    '''Provides histogram(s) with PDF curve(s)'''
    # Setup plots
    fig, ax = plt.subplots(figsize=(16, 6))
    plt.subplots_adjust(bottom=.2)
    ax.axes.set_title('Risk Distribution', fontsize=20)
    # Format X axis
    ax.axes.xaxis.set_major_formatter(StrMethodFormatter('${x:,.0f}'))
    ax.axes.xaxis.set_tick_params(rotation=-45)
    ax.set_ylabel('Frequency Histogram')
    for tick in ax.axes.xaxis.get_major_ticks():
        tick.label.set_horizontalalignment('left')
    # Draw a histogram for each model
    legend_labels = []
    for name, model in self._input.items():
        legend_labels.append(name)
        plt.hist([model.export_results()['Risk']], bins=25, alpha=.3)
    ax.legend(legend_labels, frameon=False)
    # Min and max after graphing
    xmin, xmax = ax.get_xlim()
    # Now draw and style the twin axis
    tyax = plt.twinx(ax)
    tyax.set_ylabel('PDF')
    tyax.set_yticks([])
    # Plot for each
    for name, model in self._input.items():
        risk = model.export_results()['Risk']
        # Catch warnings as we're "fitting" with known shape parameters.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            beta_curve = beta(*beta.fit(risk))
        space = np.linspace(0, xmax, 1000)
        tyax.plot(space, beta_curve.pdf(space))
    plt.margins(0)
    return (fig, ax)

def maximum_likelihood_fit(data, loc=0, scale=1):
    """Estimate parameters from samples.

    This is a wrapper around scipy's maximum likelihood estimator to
    estimate the parameters of a beta distribution from samples.

    Parameters
    ----------
    data : array-like, shape=[..., n_samples]
        Data to estimate parameters from. Arrays of
        different length may be passed.
    loc : float
        Location parameter of the distribution to estimate parameters
        from. It is kept fixed during optimization.
        Optional, default: 0.
    scale : float
        Scale parameter of the distribution to estimate parameters
        from. It is kept fixed during optimization.
        Optional, default: 1.

    Returns
    -------
    parameter : array-like, shape=[..., 2]
        Estimate of parameter obtained by maximum likelihood.
    """
    data = gs.cast(data, gs.float32)
    data = gs.to_ndarray(gs.where(data == 1., 1. - EPSILON, data), to_ndim=2)
    parameters = []
    for sample in data:
        param_a, param_b, _, _ = beta.fit(sample, floc=loc, fscale=scale)
        parameters.append(gs.array([param_a, param_b]))
    return parameters[0] if len(data) == 1 else gs.stack(parameters)

def pvalue(name, data):
    p_value = []
    total_number = len(name)
    for i in range(len(name)):
        sys.stdout.write('Sample tested: %d/' % (i) + str(total_number) + ' \r')
        sys.stdout.flush()
        ppvp = []
        d1 = np.where(data[i] > 0, data[i], 0.0000000001)
        d1 = np.where(d1 < 1, d1, 0.9999999999)
        rl = []
        for j in range(len(d1)):
            if name[j].split('-')[0] == name[i].split('-')[0]:
                rl.append(j)
        d2 = []
        for k in range(len(d1)):
            if k in rl:
                continue
            else:
                d2.append(d1[k])
        try:
            param = beta.fit(d2, floc=0, fscale=1)
        except Exception:
            print(d2)
        rv = beta(param[0], param[1], 0, 1)
        for j in range(len(d1)):
            if j != i:
                ppvp.append(1 - rv.cdf(d1[j]))
            else:
                ppvp.append(1)
        p_value.append(ppvp)
    if save_label_files == 1:
        np.savetxt(path + '/p_values.txt', np.array(p_value))
    return np.array(p_value)

def fit(data: FloatIterable,
        a: Optional[float] = None,
        b: Optional[float] = None,
        c: Optional[float] = None) -> 'PERT':
    """
    Fit a PERT distribution to the data.

    :param data: Iterable of data to fit to.
    :param a: Optional fixed value for a.
    :param b: Optional fixed value for b.
    :param c: Optional fixed value for c.
    """
    kwargs = {}
    if a is not None:
        kwargs['floc'] = a
    if a is not None and c is not None:
        kwargs['fscale'] = c - a
    alpha, beta, loc, scale = beta_dist.fit(data=data, **kwargs)
    a = a if a is not None else loc
    c = c if c is not None else loc + scale
    if b is None:
        b_est_1 = a + (alpha * (c - a) - 1) / 4
        b_est_2 = c - (beta * (c - a) - 1) / 4
        b = (b_est_1 + b_est_2) / 2
    return PERT(a=a, b=b, c=c)

def get_histogram(self):
    # Fit a normal distribution to the data:
    mu, std = norm.fit(self.data)
    # Fit a beta distribution to the data
    betaparams = beta.fit(self.data)
    fig = plt.figure()
    # plt.subplot(211)
    # Plot the histogram ("density" replaces the removed "normed" argument).
    plt.hist(self.data, bins=2 * len(self.data), density=True, alpha=0.6, color='g')
    # Plot the PDF.
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2)
    b = beta.pdf(x, *betaparams)
    plt.plot(x, b, 'r', linewidth=2)
    title = "Normal fit: mu = %.2f, std = %.2f" % (mu, std)
    plt.suptitle(title)
    subtitle = "black: normal, red: beta"
    plt.title(subtitle)
    return fig

def confirmed_prior(save=False, name='data/distr/confirmedratio.csv'):
    """Get ratio of confirmed cases.

    Args:
        save (bool, optional): Whether to save the result, defaults to False.
        name (str, optional): Path to save the result to.
    """
    try:
        return pd.read_csv(name)
    except Exception:
        pass
    # get data
    pop = population.countries()
    df = src.get_data()
    tests = testing.tests()
    # group
    iso3_iso2 = {'CZE': 'CZ', 'SWE': 'SE', 'POL': 'PL', 'ITA': 'IT'}
    for country3 in df.iso_alpha_3.unique():
        # get country population
        country2 = iso3_iso2[country3]
        country_pop = float(pop.population[pop.region == country2])
        # normalize confirmed by tests
        country_confirmed = df[df.iso_alpha_3 == country3].confirmed.apply(lambda c: c if c > 0 else 1)
        country_tests = tests[tests.country == country3].tests.apply(lambda t: t if t > 0 else 1)
        df.loc[df.iso_alpha_3 == country3, 'ratio'] = (country_confirmed / country_tests).fillna(0)
        df['ratio'] = df.ratio.apply(lambda r: r if r < 1 else 1 - 1e-6)
        df['ratio'] = df.ratio.apply(lambda r: r if r > 0 else 1e-6)
        # note: chained indexing like this assigns to a copy and may not update df
        df[df.iso_alpha_3 == country3]['tests'] = country_tests
        df[df.iso_alpha_3 == country3]['confirmed'] = country_confirmed
    df = df[['iso_alpha_3', 'date', 'confirmed', 'tests', 'ratio']]
    confirmed_fit = beta.fit(df.ratio, floc=0, fscale=1)
    # save
    if save:
        df.to_csv(name, index=False)
    return df

def model_df(p_df, ax, title=''):
    # construct prior
    cov_cutoff = 100
    hi_df = p_df\
        .query('(sense + antisense) > %i' % cov_cutoff)\
        .assign(percentage_sense=lambda d: d.sense / (d.sense + d.antisense))
    fitted_params = beta.fit(data=hi_df.percentage_sense.values, floc=0, fscale=1)
    print(fitted_params)
    alpha0, beta0, loc, scale = fitted_params

    hist = True
    bins = 10
    sns.distplot(hi_df.percentage_sense,
                 label='High count repeats (>%i fragments)' % cov_cutoff,
                 ax=ax,
                 hist_kws={'alpha': 0.5},
                 bins=bins,
                 hist=hist)
    ls = np.linspace(0, 1, 1000)
    ax.plot(ls, beta.pdf(ls, alpha0, beta0, loc, scale),
            label='Fitted beta-binomial')
    #sns.distplot(np.random.beta(alpha0, beta0, size=hi_df.shape[0]))
    ax.legend(frameon=False, bbox_to_anchor=(0.6, 1.1))
    ax.set_xlabel('Proportion of sense strand fragments')
    ax.set_ylabel('Density')
    ax.set_xlim(0, 1)
    ax.set_title(title, fontsize=15)
    sns.despine()
    return alpha0, beta0

def get_beta_percentile_confidence(self, conf=0.60):
    '''
    "I am conf percent sure that I will be there in :return: minutes or less."
    Invert conf for "minutes or more", e.g. conf=0.10 == 90 percent sure it'll take at least this long.
    '''
    betaparams = beta.fit(self.data)
    return beta.ppf(conf, *betaparams)

def define_correction_function(top_pvalues_perm, cis_mode):
    # Always try to use the MLE estimator; the new default is 10 permutations.
    # If the MLE estimator fails we fall back to the cruder estimation of the beta distribution.
    offset = (np.finfo(np.double).tiny * 100)
    # Replace zero values with the smallest number that is not 0.
    top_pvalues_perm[top_pvalues_perm == 0] = offset
    # Replace the highest value with the largest number that is not 1.
    top_pvalues_perm[top_pvalues_perm == 1] = 1 - offset
    try:
        alpha_para, beta_para, loc, fscale = beta.fit(top_pvalues_perm, floc=0, fscale=1)
    except (scipy.stats._continuous_distns.FitSolverError,
            scipy.stats._continuous_distns.FitDataError):
        alpha_para, beta_para = estimate_beta_function_paras(top_pvalues_perm)

    if cis_mode:
        if (alpha_para < BETA_SHAPE1_MIN or alpha_para > BETA_SHAPE1_MAX
                or alpha_para < BETA_SHAPE2_MIN_CIS or alpha_para > BETA_SHAPE2_MAX_CIS):
            alpha_para, beta_para = estimate_beta_function_paras(top_pvalues_perm)
        # If p-values become more significant after multiple testing correction we put them
        # back to the original test p-value in a separate step.
    else:
        if (alpha_para < BETA_SHAPE1_MIN or alpha_para > BETA_SHAPE1_MAX
                or alpha_para < BETA_SHAPE2_MIN_TRANS or alpha_para > BETA_SHAPE2_MAX_TRANS):
            alpha_para, beta_para = estimate_beta_function_paras(top_pvalues_perm)

    beta_dist = scipy.stats.beta(alpha_para, beta_para)
    correction_function = lambda x: beta_dist.cdf(x)
    # Would be good to replace 0 with the minimal double value of python.
    return [correction_function, alpha_para, beta_para]

def test_draw_samples_non_mock(self, plot=False):
    # Also make sure the non-mock sampler works
    dtype = np.float32
    num_samples = 100000

    a = np.array([2])
    b = np.array([5])

    rv_shape = (1,)

    a_mx = add_sample_dimension(mx.nd, mx.nd.array(a, dtype=dtype))
    b_mx = add_sample_dimension(mx.nd, mx.nd.array(b, dtype=dtype))

    rand_gen = None
    var = Beta.define_variable(shape=rv_shape, rand_gen=rand_gen, dtype=dtype).factor
    variables = {var.alpha.uuid: a_mx, var.beta.uuid: b_mx}
    rv_samples_rt = var.draw_samples(F=mx.nd, variables=variables, num_samples=num_samples)

    assert array_has_samples(mx.nd, rv_samples_rt)
    assert get_num_samples(mx.nd, rv_samples_rt) == num_samples
    assert rv_samples_rt.dtype == dtype

    if plot:
        plot_univariate(samples=rv_samples_rt, dist=beta, a=a[0], b=b[0])

    a_est, b_est, _, _ = beta.fit(rv_samples_rt.asnumpy().ravel())
    a_tol = 0.2
    b_tol = 0.2
    assert np.abs(a[0] - a_est) < a_tol
    assert np.abs(b[0] - b_est) < b_tol

def maximum_likelihood_fit(self, data, loc=0, scale=1):
    """Estimate parameters from samples.

    This is a wrapper around scipy's maximum likelihood estimator to
    estimate the parameters of a beta distribution from samples.

    Parameters
    ----------
    data : array-like, shape=[n_distributions, n_samples]
        the data to estimate parameters from. Arrays of
        different length may be passed.
    loc : float, optional
        the location parameter of the distribution to estimate parameters
        from. It is kept fixed during optimization.
        default: 0
    scale : float, optional
        the scale parameter of the distribution to estimate parameters
        from. It is kept fixed during optimization.
        default: 1

    Returns
    -------
    parameter : array-like, shape=[n_samples, 2]
    """
    data = gs.to_ndarray(
        gs.where(data == 1., 1 - EPSILON, data), to_ndim=2)
    parameters = []
    for sample in data:
        param_a, param_b, _, _ = beta.fit(sample, floc=loc, fscale=scale)
        parameters.append(gs.array([param_a, param_b]))
    return parameters[0] if len(data) == 1 else gs.stack(parameters)

def reference_sim(self, A, classified, labels):
    num_centers = len(set(labels))
    small = .0000000000001
    ideal_A = np.zeros([A.shape[0], A.shape[1]])
    for i in range(0, len(labels)):
        for j in range(0, i + 1):
            if labels[i] == labels[j]:
                ideal_A[i, j] = 1
                ideal_A[j, i] = 1
    pred_pos = A[ideal_A == 1]
    pred_neg = A[ideal_A == 0]
    pos_a, pos_b, pos_loc, pos_scale = beta.fit(pred_pos)
    neg_a, neg_b, neg_loc, neg_scale = beta.fit(pred_neg)

    fits = []
    # Fit comparison with more than 1 cluster
    for sim in range(0, 50):
        simulated_mat = np.ones([A.shape[0], A.shape[1]])
        for i in range(0, len(labels)):
            for j in range(0, i):
                if ideal_A[i, j] == 0:
                    simulated_mat[i, j] = simulated_mat[j, i] = beta.rvs(
                        max(neg_a, small), max(small, neg_b),
                        loc=neg_loc, scale=neg_scale)
                else:
                    simulated_mat[i, j] = simulated_mat[j, i] = beta.rvs(
                        max(pos_a, small), max(small, pos_b),
                        loc=pos_loc, scale=pos_scale)
        self.one_clust_test = False
        whereAreNaNs = np.isnan(simulated_mat)
        simulated_mat[whereAreNaNs] = 0
        self.fit(simulated_mat)
        #print(simulated_mat)
        fits.append(self.gap_stat_)
    multi_fit = np.mean(fits)

    fits_one = []
    pos_a, pos_b, pos_loc, pos_scale = beta.fit(A)
    for sim in range(0, 50):
        simulated_mat = np.ones([A.shape[0], A.shape[1]])
        for i in range(0, len(labels)):
            for j in range(0, i):
                simulated_mat[i, j] = simulated_mat[j, i] = beta.rvs(
                    max(small, pos_a), max(small, pos_b),
                    loc=pos_loc, scale=pos_scale)
        whereAreNaNs = np.isnan(simulated_mat)
        simulated_mat[whereAreNaNs] = 0
        e_vals, e_vecs = np.linalg.eigh(simulated_mat)
        # 2. Get reverse sorted order - largest to smallest
        e_order = np.argsort(e_vals)[::-1]
        self.one_clust_fit_alt(e_vecs, e_order)
        fits_one.append(self.gap_stat_)
    one_fit = np.mean(fits_one)
    return multi_fit, one_fit

def fit_beta(data):
    EPSILON = 1e-3
    # make sure no data points EQUAL one of the bounds
    data = [EPSILON if d == 0 else 1 - EPSILON if d == 1 else d for d in data]
    if len(set(data)) == 1:  # make sure not all data points are identical
        data = [d + random.normalvariate(0, 1e-6) for d in data]
    params = beta.fit(data, floc=0, fscale=1)
    return params[0], params[1]

def gen_beta(data):
    # Param 0 is alpha
    # Param 1 is beta
    # Param 2 is loc
    data = data[~np.isnan(data)]
    data = np.array([.01 if i == 0 else .99 if i == 1 else i for i in data])
    beta_params = beta.fit(data, floc=0., fscale=1.)
    return beta_params[0], beta_params[1]

def one_vote(N, threshold=0.5, ab=False, forecasts=False, normal=False, p=False,
             diagnostic=False):
    import numpy as np

    if sum(map(bool, [ab, forecasts, normal, p])) != 1:
        raise ValueError("Please specify one and only one of the 'ab', 'forecasts', "
                         "'normal', or 'p' options.")

    if ab:
        a, b = ab
    elif forecasts:
        from scipy.stats import beta
        a, b, _, _ = beta.fit(forecasts, floc=0, fscale=1)
    elif normal:
        from functions import fit_beta_to_normal
        m, s = normal
        a, b = fit_beta_to_normal(m, s)
    else:
        pass

    if p:
        from functions import mp_binom
        victory_pr = mp_binom(np.ceil(N * threshold), N, p)
        if (N * threshold).is_integer():
            # tie probability in case N*threshold is whole:
            tie_pr = mp_binom(N * (1 - threshold) + 1, N, p)
        elif not (N * threshold).is_integer() and threshold != 0.5:
            # tie probability in case N*threshold is not whole:
            tie_pr = mp_binom(N * (1 - threshold), N, p)
        else:
            tie_pr = 0
    elif a and b:
        from functions import beta_binomial
        victory_pr = beta_binomial(np.ceil(N * threshold), N, a, b, multi_precission=True)
        if (N * threshold).is_integer():
            # tie probability in case N*threshold is whole:
            tie_pr = beta_binomial(N * (1 - threshold) + 1, N, a, b, multi_precission=True)
        elif not (N * threshold).is_integer() and threshold != 0.5:
            # tie probability in case N*threshold is not whole:
            tie_pr = beta_binomial(N * (1 - threshold), N, a, b, multi_precission=True)
        else:
            tie_pr = 0
    else:
        pass

    if diagnostic:
        import matplotlib.pyplot as plt
        x = np.linspace(0, 1, 1000)
        try:
            plt.style.use('http://chymera.eu/matplotlib/styles/chymeric-gnome.mplstyle')
        except ValueError:
            plt.style.use('ggplot')
        plt.axvline(x=threshold, color="#fbb4b9", linewidth=1)
        plt.legend(['percentage\n threshold'], loc='upper right')
        plt.plot(x, beta.pdf(x, a, b))
        plt.xlabel('Reference Candidate Vote Share')
        plt.ylabel('PDF')
        plt.show()

    total_pr = victory_pr + tie_pr
    return total_pr, victory_pr, tie_pr

def beta_fit(data, col):
    plt.hist(data[col], bins=10000, density=True, alpha=0.6, color='g')
    a, b, loc, scale = beta.fit(data[col])
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 10000)
    # plt.title("Beta fitting of " + str(col))
    # plt.plot(x, p, 'r', linewidth=2)
    # plt.show()
    return x, a, b, loc, scale

def run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset):
    '''
    from scipy import stats
    import numpy as np
    import matplotlib.pylab as plt

    # create some normal random noisy data
    ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20

    # plot normed histogram
    plt.hist(ser, normed=True)

    # find minimum and maximum of xticks, so we know
    # where we should compute theoretical distribution
    xt = plt.xticks()[0]
    xmin, xmax = min(xt), max(xt)
    lnspc = np.linspace(xmin, xmax, len(ser))

    ab, bb, cb, db = stats.beta.fit(ser)
    pdf_beta = stats.beta.pdf(lnspc, ab, bb, cb, db)
    plt.plot(lnspc, pdf_beta, label="Beta")
    plt.show()
    '''
    cadd_trset_param = {}
    for aaconv in cadd_trset.keys():
        a, b, loc2, scale2 = beta.fit(cadd_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    mnp_cadd_trset_param = {}
    for aaconv in mnp_cadd_trset.keys():
        a, b, loc2, scale2 = beta.fit(mnp_cadd_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        mnp_cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    gerp_trset_param = {}
    for aaconv in gerp_trset.keys():
        a, b, loc2, scale2 = beta.fit(gerp_trset[aaconv])
        mean2 = beta.mean(a, b, loc2, scale2)
        gerp_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

    return cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param

def beta_dist_estimator(data):
    if not data.empty:
        x = data.Age.value_counts(dropna=False).sort_index()
        z = x / np.sum(x)
        a1, b1, loc1, scale1 = beta.fit(z, floc=0, fscale=1)
        return a1, b1, loc1, scale1

def plot():
    CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
    # Final model with exponential hyperprior
    MODEL_PATH = os.path.join(CURRENT_PATH, "..", "bayes_cv_prune", "stan_models",
                              "exp_new.stan")
    model = BayesStanPruner(MODEL_PATH, seed=0).load()

    # Simulated sets of accuracy values
    A0 = [0.4, 0.5]
    A1 = [0.80, 0.82]
    post_sample0 = model.fit_predict(A0)
    post_sample1 = model.fit_predict(A1)

    x = np.linspace(0, 1, 250)
    a, b, _, _ = beta.fit(A0, floc=0, fscale=1)
    ml_estimate0 = beta.pdf(x, a, b)
    a, b, _, _ = beta.fit(A1, floc=0, fscale=1)
    ml_estimate1 = beta.pdf(x, a, b)

    # Plot
    sns.set(context="paper", style="whitegrid", font="STIXGeneral", font_scale=1.25)
    bins = np.linspace(0, 1, 41)
    f, axes = plt.subplots(1, 2, figsize=(6.5, 3))
    axes[0].hist(post_sample0, bins=bins, density=True, label="Post. pred.")
    axes[0].plot(x, ml_estimate0, '-k', label="Beta ML fit")
    axes[0].set_title("A = {0.4, 0.5}")
    axes[0].legend()
    axes[1].hist(post_sample1, bins=bins, density=True, label="Post. pred.")
    axes[1].plot(x, ml_estimate1, '-k', label="Beta ML fit")
    axes[1].set_title("A = {0.80, 0.82}")
    axes[1].legend()
    plt.subplots_adjust(left=0.065, bottom=0.095, top=0.9, right=0.975)
    plt.show()

def EI():
    """Get distributions for parameter c, connection E-I."""
    # seed
    np.random.seed(seed=12345)
    # draw from incubation period
    pars = incubation.continuous()['gamma']
    draws = gamma.rvs(*pars, size=1000000, random_state=12345)
    # fit beta to 1/draw
    samples = 1 / draws
    samples = samples[(samples > 0) & (samples < 1)]
    return {'x': samples,
            'beta': beta.fit(samples),
            'gamma': gamma.fit(samples, loc=.2, scale=10)}

def fit_beta(table, xlims=(-2, 0.5)):
    """Returns fit of Beta Distribution to a given [Fe/H] table

    See: fit_gaussian()
    """
    z_sort = np.sort(table['feh'])
    i_0 = np.searchsorted(z_sort, xlims[0])
    i_1 = np.searchsorted(z_sort, xlims[1])
    loc = xlims[0]
    scale = xlims[1] - xlims[0]
    a, b, loc, scale = beta.fit(z_sort[i_0:i_1], floc=loc, fscale=scale)
    return a, b, loc, scale

plt.figure(figsize=(10, 7))
plt.axvline(mle, linestyle="--")
line1, = plt.plot(possible_thetas, likelihoods)

bins = [x / 100 for x in range(100)]
counts, bins = np.histogram(infections_rates, bins=bins)
counts = counts / counts.sum()
line2, = plt.plot(bins[:-1], counts)

plt.xlabel("Theta")
plt.title("Evidence vs Historical Infection Rates")
plt.legend((line1, line2),
           ('Likelihood of Theta with new evidence',
            'Frequency of Theta in last 100 months'),
           loc='upper left')
plt.show()

# Model the data with a beta function
prior_a, prior_b = beta.fit(infections_rates, floc=0, fscale=1)[0:2]  # Fit data to find a & b for the beta dist.
prior = beta(prior_a, prior_b)
prior_samples = prior.rvs(10000)  # Sample from the prior
beta_sample_counts, bins = np.histogram(prior_samples, bins)
total = beta_sample_counts.sum()
beta_sample_counts = [x / total for x in beta_sample_counts]

plt.figure(figsize=(10, 7))
line1, = plt.plot(bins[:-1], beta_sample_counts)

hist_rates, bins = np.histogram(infections_rates, bins)
total = hist_rates.sum()
hist_rates = [x / total for x in hist_rates]
line2, = plt.plot(bins[:-1], hist_rates)

numpy.set_printoptions(linewidth=1000000)

probs = []
pvals = []
pyplot.figure(figsize=(15, 10))
for i, data in enumerate(bin_data):
    print(i)

    tdata = numpy.array(data)
    params = beta_dist.fit(tdata, floc=0)
    (alpha, beta, floc, fshape) = params
    print(alpha, beta)

    vals = numpy.arange(0, strand_length + 1)
    fit_hist = numpy.array([beta_binom(val, strand_length, alpha, beta) for val in vals])

    #data_bin_edges = numpy.linspace(0.0, 1.0, num=strand_length + 1, endpoint=True)
    data_bin_edges = numpy.linspace(-0.5, strand_length + 0.5,
                                    num=strand_length + 2, endpoint=True) / float(strand_length)
    #print(data_bin_edges)

# Copyright 2015, Chen Sun (bbsunchen at outlook.com)
from sys import argv
from scipy.stats import beta

frequence_list = []
with open(argv[1]) as input_file:
    for line in input_file:
        line = line.strip()
        if line.startswith('rs#'):
            continue
        columns = line.split(' ')
        #print(line)
        ref_freq = float(columns[11])
        oth_freq = float(columns[14])
        if ref_freq >= 0.05 and ref_freq <= 0.95:
            frequence_list.append(ref_freq)
        if oth_freq >= 0.05 and oth_freq <= 0.95:
            frequence_list.append(oth_freq)

# direct fit Beta distribution
#print(beta.fit(frequence_list))

# fit Beta distribution with frequency [0.05, 0.95], more precise
a, b, l, s = beta.fit(frequence_list, floc=0.04999999999999999, fscale=0.9000000000000000)
#print(str(a) + '\t' + str(b))
print('{}\t{}'.format(a, b))

# fit Beta distribution with some infer
#print(beta.fit(frequence_list, floc=0, fscale=1))

# analysis of McCarthy's data
import numpy as np
import csv
import pylab as plt
import pymc as pm
from sklearn import mixture
import sys
from scipy.stats import beta


def gaussian(x, mu, sig):
    return (1 / (sig * np.sqrt(2 * np.pi))) * np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))


# raw string so the backslashes in the Windows path are not treated as escapes
filename = r"C:\Users\leon\Documents\Data\McCarthys.dat"
raw_data = list(csv.reader(open(filename, 'rb')))
data = np.zeros((256, 2))
for idx, line in enumerate(raw_data):
    data[idx, :] = line[0].split()

master_data = np.array([])
for line in data:
    new_data = np.repeat(line[0], line[1])
    master_data = np.append(master_data, new_data)
master_data = master_data[:, None]

#clf = mixture.GMM(n_components=2, )
#clf.fit(master_data)
#plt.hist(master_data, 100, normed = True)
#plt.plot(gaussian(np.linspace(0,50,100), clf.means_[0], clf.covars_[0]))
#plt.plot(gaussian(np.linspace(0,50,100), clf.means_[1], clf.covars_[1]))

sys.exit()

[alpha, beta, loc, scale] = beta.fit(master_data)

smoothMeth = regionDict[key]["SMOOTHED"] rawScores = [] smoothScores = [] for r,s in zip(rawMeth, smoothMeth): rawScores.append(float(r['score'])/100) smScore = float(s['score'])/100 if smScore == 1: smScore = 0.9999999999 if smScore == 0: smScore = 0.0000000001 smoothScores.append(smScore) print(smoothScores) if np.var(smoothScores) == 0: break else: fit = beta.fit(smoothScores, floc=0, fscale=1) alphas.append(fit[0]) betas.append(fit[1]) statuses.append(status) fig = plt.figure() cols = {1:'b', -1:'r', 0:'k'} for a,b,s in zip(alphas, betas, statuses): plt.plot(a,b, cols[s]+'o',markersize = 2) plt.savefig('beta_plot.pdf', bbox_inches = 'tight')
def vplik(old, imps, cclass):
    # p = multiprocessing.Pool(multiprocessing.cpu_count())
    preimage = [(old, imps, random.randint(0, 1000000)) for i in range(PLIK_REPS)]
    # print(preimage)
    image = np.array(P.map(rboot, preimage))
    return beta(*beta.fit(image[:, 0]))

bin_step = 100.0 / n_bins
bin_edges = numpy.arange(0.0, 100.0 + bin_step, bin_step)
print(bin_edges)

bin_data = [[] for _ in range(len(bin_edges) - 1)]
for i in range(len(x)):
    n_hb = y[i]
    val = x[i]
    for j in range(len(bin_data)):
        if (val <= bin_edges[j + 1]) and (val > bin_edges[j]):
            bin_data[j].append(n_hb)

for i, data in enumerate(bin_data):
    tdata = numpy.array(data)
    params = beta.fit(tdata, fscale=1)
    print(i, params[0], params[1], params[2], params[3])
    #print(tdata)

if correct < 50:
    continue

if (incorrect > 0) and (correct > 0):
    print(name, len(times[name]))
    print(correct, incorrect)
    print(np.mean(correct_times), np.mean(incorrect_times))
    #print(sum([1 for c in correct_times if c >= min(incorrect_times)]) / float(len(correct_times)))
    #print()
    max_time = max(max(correct_times), max(incorrect_times))
    min_time = min(min(correct_times), min(incorrect_times))

    data = correct_times
    data = [(t - min_time) / float(max_time - min_time) for t in data]
    #print(data)
    a, b, lower, scale = beta.fit(data)
    #print(a, b, lower, scale)
    #print()
    #print(beta.cdf(0.8, a, b))

    #----------------Fit using moments----------------
    mean = np.mean(data)
    var = np.var(data, ddof=1)
    alpha1 = mean**2 * (1 - mean) / var - mean
    beta1 = alpha1 * (1 - mean) / mean
    print(beta.cdf((incorrect_times[-1] - min_time) / (max_time - min_time), alpha1, beta1))
    print()
    #break
    #print(correct_times)

import numpy as np
from numpy import random as rnd
from scipy.stats import t
from scipy.stats import beta
import seaborn as sns
import matplotlib.pyplot as plt

### Parameters ###
nu = 3     # arbitrary choice
sigma = 2  # arbitrary choice

### Simulate Values ###
draws1 = rnd.standard_t(nu, size=1_000_000)  # draw one million Y values (size must be an integer)
draws2 = t.cdf(draws1, df=nu)                # classic PIT (F_Y(Y) is standard uniform)
draws3 = t.cdf(draws1 / sigma, df=nu)        # compute one million X values based on Y values
alpha_fit, beta_fit, loc_fit, scale_fit = beta.fit(draws3)  # determine best-fitted beta

### Plot KDEs and PDF of Best-Fitted Beta ###
kdeFig = plt.figure()  # start figure
sns.set("talk")        # set seaborn style to "talk" --> increase font size

# add all KDEs with plot labels in LaTeX (hence the use of raw strings)
sns.kdeplot(draws1, shade=True, clip=(-3, 3), label=r'KDE for $Y \sim t_3$')
sns.kdeplot(draws2, shade=True, clip=(-3, 3), label=r'KDE for $F_Y(Y)$')
sns.kdeplot(draws3, shade=True, clip=(-3, 3), label=r'KDE for $X$ with $\sigma = 2$')

# add pdf for best-fitted beta with plot labels in LaTeX (hence the use of raw strings)
x = np.linspace(-3, 3, num=1000)  # create 1000 values between -3 and +3
y = beta.pdf(x, a=alpha_fit, b=beta_fit, loc=loc_fit, scale=scale_fit)  # f(x)
plt.plot(x, y, label=r'PDF for Best-Fitted $B(\alpha, \beta)$')

# title & legend
plt.title('Kernel Density Estimation')
plt.legend(loc='upper left')

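
# A minimal, self-contained sketch (not taken from any snippet above) of the
# beta.fit pattern most of them share: keep the data strictly inside (0, 1),
# fix loc and scale with floc/fscale so only the two shape parameters are
# estimated, then reuse the fitted distribution. Names and parameter values
# here are illustrative only.
import numpy as np
from scipy.stats import beta

rng = np.random.default_rng(0)
samples = rng.beta(2.0, 5.0, size=10_000)   # synthetic data from Beta(2, 5)

eps = 1e-9
samples = np.clip(samples, eps, 1 - eps)    # avoid the MLE blowing up at 0 or 1

a_hat, b_hat, loc, scale = beta.fit(samples, floc=0, fscale=1)
print(a_hat, b_hat)                         # should be close to 2 and 5

x = np.linspace(0, 1, 101)
pdf_values = beta.pdf(x, a_hat, b_hat, loc=loc, scale=scale)
tail_prob = 1 - beta.cdf(0.8, a_hat, b_hat)  # e.g. an upper-tail probability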