Example 1
def fit_beta_mixture(Y, gamma, n_itr=100):
    Y = Y / gamma
    Y = Y[(Y > 0) & (Y < 1)]
    #plt.figure(figsize=[12,5])
    #sns.distplot(Y)
    #plt.show()

    theta = np.array([1, 10, 2, 5, 0.5])
    w = np.zeros([Y.shape[0], 2], dtype=float)

    if_converge = False
    for i in range(n_itr):
        theta_old = theta.copy()  # copy: theta is updated in place below
        # E step
        w[:, 0] = beta.pdf(Y, theta[0], theta[1]) * theta[4]
        w[:, 1] = beta.pdf(Y, theta[2], theta[3]) * (1 - theta[4])
        w = (w.T / w.sum(axis=1)).T

        # M step
        rand_idx = np.random.binomial(1, w[:, 0], size=Y.shape[0])
        theta[4] = w[:, 0].mean()
        print(np.sum(rand_idx == 1))
        theta[0], theta[1], _, _ = beta.fit(Y[rand_idx == 1], floc=0, fscale=1)
        theta[2], theta[3], _, _ = beta.fit(Y[rand_idx == 0], floc=0, fscale=1)

        if np.linalg.norm(theta - theta_old) < 1e-8:
            if_converge = True
            break

    return theta, if_converge
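
A minimal usage sketch for the routine above, on synthetic two-component data (the component parameters and gamma=1.0 are made up for illustration):

import numpy as np
from scipy.stats import beta

np.random.seed(0)
# Two-component beta mixture on (0, 1); gamma = 1.0 means no rescaling.
Y = np.concatenate([beta.rvs(1, 10, size=700), beta.rvs(2, 5, size=300)])
theta, converged = fit_beta_mixture(Y, gamma=1.0)
print(theta, converged)  # [a1, b1, a2, b2, mixture weight], convergence flag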
Example 2
def get_outliers(data, filter, plotting):
    if plotting:
        for x, r in [("x1", (0, 1)), ("x2", (0, 30)), ("x3", (0, 1))]:
            plt.violinplot(data[x], vert=False)
            plt.xlim(r)
            plt.savefig("plots/violin/%s.png" % x)
            plt.clf()

    if filter:
        data_fl = data[data["class"] == 0]
    else:
        data_fl = data

    pdf = pd.DataFrame({})

    a, b, loc, scale = beta.fit(data_fl["x1"])
    pdf["x1"] = beta.logpdf(data["x1"], a, b, loc=loc, scale=scale)

    a, loc, scale = gamma.fit(data_fl["x2"])
    pdf["x2"] = gamma.logpdf(data["x2"], a, loc=loc, scale=scale)

    a, b, loc, scale = beta.fit(data_fl["x3"])
    pdf["x3"] = beta.logpdf(data["x3"], a, b, loc=loc, scale=scale)

    pdfs = pdf["x1"] + pdf["x2"] + pdf["x3"]

    if plotting:
        sns.boxplot(y=pdfs, x="class", data=data)
        plt.savefig("plots/boxplot.png")
        plt.clf()

    if plotting:
        sorted_pdfs = np.sort(pdfs)
        plt.plot(sorted_pdfs)
        # Mark each candidate threshold rank with a horizontal line.
        for split, style in [(60, 'k-'), (50, 'k.'), (45, 'k--'), (40, 'k--')]:
            threshold = sorted_pdfs[split]
            plt.plot((0, 1000), (threshold, threshold), style, lw=0.5)

        plt.savefig("plots/thresholds.png")
        plt.clf()

    outliers = np.argsort(pdfs)

    final = []
    for outlier in outliers:
        if data["class"][outlier] == -1:
            final.append(outlier)

    return np.array(final[:100])
Example 3
def organize_data():
    df = pd.read_csv('2016stats.csv')
    df = df[df['3PA'] > 20]
    df['3P%'] = df['3P%'] / 100
    a, b, _, _ = beta.fit(list(df['3P%']), floc=0, fscale=1)
    df['3PEstimate'] = (df['3PM'] + a) / (df['3PA'] + a + b)
    df['a'] = df['3PM'] + a
    df['b'] = df['3PA'] - df['3PM'] + b
    print('alpha: ' + str(a))
    print('beta: ' + str(b))
    return (df, a, b)
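
The '3PEstimate' column is standard empirical-Bayes shrinkage: with a Beta(a, b) prior and m makes in n attempts, the posterior mean is (m + a) / (n + a + b). A self-contained check with made-up numbers:

# Hypothetical prior, e.g. from a league-wide fit: a = 50, b = 100 (prior mean = 1/3).
a, b = 50.0, 100.0
makes, attempts = 10, 20                    # a hot streak on few attempts
raw = makes / attempts                      # 0.50
shrunk = (makes + a) / (attempts + a + b)   # pulled toward the prior mean
print(raw, shrunk, a / (a + b))             # 0.5  0.3529...  0.3333...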
Example 4
def phase1(state):
    """
    if state['pulls-left'] % 100 == 0:
        print(state['pulls-left'])
    """

    if state['pulls-left'] == 9000:
        mikeys_ducks_info['alph-beta-scal'] = []
        for i in range(100):
            alph, beta, _, scal = beta_mod.fit(mikeys_ducks_info['payoffs'][i])
            mikeys_ducks_info['alph-beta-scal'].append(
                ((scal * alph) / (alph + beta), alph, beta, scal, i))
    elif state['pulls-left'] == 10000:
        mikeys_ducks_info['costs'] = [0 for i in range(100)]
        mikeys_ducks_info['metadata'] = ['00000000' for i in range(100)]
        mikeys_ducks_info['payoffs'] = [[] for i in range(100)]

    if check_key(state, 'last-cost'):
        last_pull = mikeys_ducks_info['last-pull']
        mikeys_ducks_info['utility'] += (state['last-payoff'] -
                                         state['last-cost'])
        mikeys_ducks_info['costs'][last_pull] = state['last-cost']
        mikeys_ducks_info['metadata'][last_pull] = state['last-metadata']
        mikeys_ducks_info['payoffs'][last_pull].append(state['last-payoff'])
        if len(mikeys_ducks_info['payoffs'][last_pull]) >= 1000:
            mikeys_ducks_info['machines-done'].add(last_pull)
        if check_key(mikeys_ducks_info,
                     'alph-beta-scal') and state['pulls-left'] % 10 == 0:
            alph, beta, _, scal = beta_mod.fit(
                mikeys_ducks_info['payoffs'][last_pull])
            mikeys_ducks_info['alph-beta-scal'][last_pull] = (scal * alph /
                                                              (alph + beta),
                                                              alph, beta, scal,
                                                              last_pull)

    move = {}
    move['team-code'] = state['team-code']
    move['game'] = 'phase_1'
    if state['pulls-left'] > 9000:
        move['pull'] = int((10000 - state['pulls-left']) / 10)
    else:
        best_prof, best_ind = get_best_profit_index()
        move['pull'] = best_ind
    mikeys_ducks_info['last-pull'] = move['pull']

    if state['pulls-left'] == 1:
        mikeys_ducks_info['machines-done'].add(move['pull'])
        mikeys_ducks_info['auctions'] = sorted(
            list(mikeys_ducks_info['machines-done']),
            key=lambda x: -1 * (mikeys_ducks_info['alph-beta-scal'][x][0] -
                                mikeys_ducks_info['costs'][x]))
    return move
Example 5
def organize_data(season):
    df = pd.read_csv('Data/Seasons_Stats.csv')
    df = df[df['Year'] == season]
    df = remove_duplicate_players(df)
    df = df[['Player', '3P', '3PA', '3P%', 'Tm', 'Year']]
    df = df[df['3PA'] > 20]
    a, b, _, _ = beta.fit(list(df['3P%']), floc=0, fscale=1)
    df['3PEstimate'] = (df['3P'] + a) / (df['3PA'] + a + b)
    df['a'] = df['3P'] + a
    df['b'] = df['3PA'] - df['3P'] + b
    print('alpha: ' + str(a))
    print('beta: ' + str(b))
    return (df, a, b)
Example 6
    def test_draw_samples(self, dtype, a_shape, a_is_samples, b_shape,
                          b_is_samples, rv_shape, num_samples):
        # Note: Tests above have been commented as they are very slow to run.
        # Note: Moved random number generation to here as the seed wasn't set if used above
        a = np.random.uniform(0.5, 2, size=a_shape)
        b = np.random.uniform(0.5, 2, size=b_shape)

        n_dim = 1 + len(rv_shape)
        a_np = numpy_array_reshape(a, a_is_samples, n_dim)
        b_np = numpy_array_reshape(b, b_is_samples, n_dim)

        rv_samples_np = np.random.beta(a_np,
                                       b_np,
                                       size=(num_samples, ) + rv_shape)

        var = Beta.define_variable(shape=rv_shape, dtype=dtype,
                                   rand_gen=None).factor

        a_mx = mx.nd.array(a, dtype=dtype)
        if not a_is_samples:
            a_mx = add_sample_dimension(mx.nd, a_mx)

        b_mx = mx.nd.array(b, dtype=dtype)
        if not b_is_samples:
            b_mx = add_sample_dimension(mx.nd, b_mx)

        variables = {var.a.uuid: a_mx, var.b.uuid: b_mx}
        rv_samples_rt = var.draw_samples(F=mx.nd,
                                         variables=variables,
                                         num_samples=num_samples)

        assert np.issubdtype(rv_samples_rt.dtype, dtype)
        assert is_sampled_array(mx.nd, rv_samples_rt)
        assert get_num_samples(mx.nd, rv_samples_rt) == num_samples

        rtol, atol = 1e-1, 1e-1

        from itertools import product
        fits_np = [
            beta.fit(rv_samples_np[:, i, j])[0:2]
            for i, j in (product(*map(range, rv_shape)))
        ]
        fits_rt = [
            beta.fit(rv_samples_rt.asnumpy()[:, i, j])[0:2]
            for i, j in (product(*map(range, rv_shape)))
        ]

        assert np.allclose(fits_np, fits_rt, rtol=rtol, atol=atol)
Example 7
def make_first_set_of_plots():
    N = 1000
    x = zeros(shape=(N,), dtype=float)
    t = None
    tmax = 10
    axis([0,tmax,0,1])
    for i in range(N):
        t, y = random_walk(0.25, tmax, 0.01, t)
        x[i] = y[-1]
        if (i < 3):
            plot(t, (y+1)/2.0)

    xlabel("time")
    ylabel("CTR")
    savefig("random_walk.png")

    clf()
    subplot(211)
    hist((x+1)/2, bins=50)
    ylabel("Monte carlo results")

    subplot(212)
    best_fit = beta.fit((x+1)/2, floc=0, fscale=1)

    print(best_fit)
    ctr = arange(0,1,0.001)
    plot(ctr, beta(1,4).pdf(ctr), label="Invariant distribution, beta(1,4)")
    plot(ctr, beta(best_fit[0],best_fit[1]).pdf(ctr), label="Best fit, beta("+str(best_fit[0]) + "," + str(best_fit[1]) + ")")
    xlabel("CTR at t="+str(tmax))
    ylabel("pdf")
    legend()
    savefig("long_term_random_walk_result.png")
Example 8
def get_surprisal_continuous(infile):
    """
    Generates a continuous probability distribution
    for the surprisal metrics.

    :param infile: str; path to file containing sequences
     of discrete values.
    :return:
    """
    with open(infile, "r", encoding="utf8") as F:
        sims = np.array([
            j
            for i in tqdm(F.readlines(), unit_scale=True, desc=os.path.basename(infile))
            for j in np.fromstring(i.replace("nan", "").strip(), sep=" ", dtype=np.float32)
            if i.strip()
        ])
        sims = np.clip(sims, 0, 1)
        print(sims.shape)
        print("Fitting beta distribution to data.")
        sims = np.random.choice(sims, size=20_000_000, replace=False)
        B = beta.fit(sims)
        print(B)
        with open(f"../out/{os.path.basename(infile)}", "w", encoding="utf8") as F:
            F.write(f"beta distribution\n{B}")

    return 0
Example 9
 def _fit_beta(self, X):
     """Fit the beta parameters to the data.
     """
     self.loc = np.min(X)
     self.scale = np.max(X) - self.loc
     self.a, self.b, _, _ = beta.fit(X, loc=self.loc, scale=self.scale)
     self.model = self._get_model()
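
One scipy subtlety worth noting here: loc= and scale= in beta.fit are only starting guesses for the optimizer, whereas floc= and fscale= actually pin those parameters. A quick sketch contrasting the two on synthetic data:

import numpy as np
from scipy.stats import beta

x = beta.rvs(2, 5, size=5000, random_state=0)

a1, b1, loc1, scale1 = beta.fit(x, loc=0, scale=1)    # guesses only; loc/scale still fitted
a2, b2, loc2, scale2 = beta.fit(x, floc=0, fscale=1)  # fixed support [0, 1]
print(loc1, scale1)  # generally not exactly (0, 1)
print(loc2, scale2)  # exactly (0.0, 1.0)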
Example 10
def make_first_set_of_plots():
    N = 1000
    x = zeros(shape=(N, ), dtype=float)
    t = None
    tmax = 10
    axis([0, tmax, 0, 1])
    for i in range(N):
        t, y = random_walk(0.25, tmax, 0.01, t)
        x[i] = y[-1]
        if (i < 3):
            plot(t, (y + 1) / 2.0)

    xlabel("time")
    ylabel("CTR")
    savefig("random_walk.png")

    clf()
    subplot(211)
    hist((x + 1) / 2, bins=50)
    ylabel("Monte carlo results")

    subplot(212)
    best_fit = beta.fit((x + 1) / 2, floc=0, fscale=1)

    print(best_fit)
    ctr = arange(0, 1, 0.001)
    plot(ctr, beta(1, 4).pdf(ctr), label="Invariant distribution, beta(1,4)")
    plot(ctr,
         beta(best_fit[0], best_fit[1]).pdf(ctr),
         label="Best fit, beta(" + str(best_fit[0]) + "," + str(best_fit[1]) +
         ")")
    xlabel("CTR at t=" + str(tmax))
    ylabel("pdf")
    legend()
    savefig("long_term_random_walk_result.png")
Example 11
def vcf_graph(rows, pop, binsize, title, filename):
    expx = rows["ac_%s" % pop].divide(rows["an_%s" % pop])
    expx = expx[((expx > 0) & (expx < 1))]
    alphax, betax, _, _ = beta.fit(expx, floc=0, fscale=1)
    x = np.arange(0, 1, binsize)
    binx = [(x[i + 1] + x[i]) / 2 for i in range(len(x) - 1)]
    y = [
        btdtr(alphax, betax, x[i + 1]) - btdtr(alphax, betax, x[i])
        for i in range(len(x) - 1)
    ]
    fig = go.Figure()
    fig.add_trace(
        go.Histogram(x=expx,
                     histnorm='probability',
                     name="Experimental",
                     autobinx=False,
                     xbins=dict(start=0, end=1, size=binsize),
                     opacity=.9))
    fig.add_trace(go.Bar(x=binx, y=y, name="Theory", opacity=.9))
    fig.update_layout(autosize=False,
                      width=800,
                      height=600,
                      yaxis=go.layout.YAxis(title_text="P(x)", range=[0, 1]),
                      xaxis=go.layout.XAxis(title_text="x", range=[0, 1]),
                      title_text=title,
                      legend_orientation="h")
    #fig.write_image(filename)
    ksexp = kstest(expx, 'beta', args=(alphax, betax))
    ksneut = kstest('beta', False, args=(alphax, betax), N=expx.size)
    return (alphax, betax, ksexp.statistic, ksexp.pvalue, ksneut.statistic,
            ksneut.pvalue, expx.size)
Example 12
def ebb_fit_prior(x, n, method='mm', start=(0.5, 0.5)):
    p = x / n
    if (method == 'mm'):
        mu, sig = np.mean(p), np.var(p)
        a = ((1 - mu) / sig - 1 / mu) * mu**2
        b = a * (1 / mu - 1)

        fitted_prior = Beta(a, b)
    elif (method == 'mle'):

        # starting value
        # if (np.isnan(start)):
        #     mm_est = ebb_fit_prior(x, n, 'mm')
        #     start = (mm_est.alpha, mm_est.beta)
        #     #print(start)

        # negative log-likelihood of p under Beta(a, b)
        def likelihood(pars):
            return -np.sum(beta_dist.logpdf(p, pars[0], pars[1]))

        # optimization function: over a series of params, optimise likelihood
        # outp = minimize(likelihood, x0 = start, method = 'BFGS')
        # fitted_prior = Beta(outp.x[0], outp.x[1])

        a, b, *_ = beta_dist.fit(p)
        fitted_prior = Beta(a, b)
    else:
        raise ValueError("method should be 'mm' or 'mle'")

    return fitted_prior
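
The method-of-moments branch above inverts the beta moment relations mu = a / (a + b) and var = a * b / ((a + b)**2 * (a + b + 1)); a quick sanity check of those formulas on simulated proportions (numpy and scipy only):

import numpy as np
from scipy.stats import beta as beta_dist

p = beta_dist.rvs(3, 7, size=100_000, random_state=0)
mu, sig = np.mean(p), np.var(p)
a = ((1 - mu) / sig - 1 / mu) * mu**2
b = a * (1 / mu - 1)
print(a, b)  # should land near (3, 7)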
Example 13
 def generate_image(self):
     '''Provides histogram(s) with PDF curve(s)'''
     # Setup plots
     fig, ax = plt.subplots(figsize=(16, 6))
     plt.subplots_adjust(bottom=.2)
     ax.axes.set_title('Risk Distribution', fontsize=20)
     # Format X axis
     ax.axes.xaxis.set_major_formatter(StrMethodFormatter('${x:,.0f}'))
     ax.axes.xaxis.set_tick_params(rotation=-45)
     ax.set_ylabel('Frequency Histogram')
     for tick in ax.axes.xaxis.get_major_ticks():
         tick.label.set_horizontalalignment('left')
     # Draw a histogram for each model
     legend_labels = []
     for name, model in self._input.items():
         legend_labels.append(name)
         plt.hist([model.export_results()['Risk']], bins=25, alpha=.3)
     ax.legend(legend_labels, frameon=False)
     # Min and Max post graphing
     xmin, xmax = ax.get_xlim()
     # Now draw twin axis and style
     tyax = plt.twinx(ax)
     tyax.set_ylabel('PDF')
     tyax.set_yticks([])
     # Plot for each
     for name, model in self._input.items():
         risk = model.export_results()['Risk']
         # Catch warnings as we're "fitting" with known shape parameters.
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             beta_curve = beta(*beta.fit(risk))
         space = np.linspace(0, xmax, 1000)
         tyax.plot(space, beta_curve.pdf(space))
     plt.margins(0)
     return (fig, ax)
Example 14
    def maximum_likelihood_fit(data, loc=0, scale=1):
        """Estimate parameters from samples.

        This is a wrapper around scipy's maximum likelihood estimator to
        estimate the parameters of a beta distribution from samples.

        Parameters
        ----------
        data : array-like, shape=[..., n_samples]
            Data to estimate parameters from. Arrays of
            different length may be passed.
        loc : float
            Location parameter of the distribution to estimate parameters
            from. It is kept fixed during optimization.
            Optional, default: 0.
        scale : float
            Scale parameter of the distribution to estimate parameters
            from. It is kept fixed during optimization.
            Optional, default: 1.

        Returns
        -------
        parameter : array-like, shape=[..., 2]
            Estimate of parameter obtained by maximum likelihood.
        """
        data = gs.cast(data, gs.float32)
        data = gs.to_ndarray(gs.where(data == 1., 1. - EPSILON, data),
                             to_ndim=2)
        parameters = []
        for sample in data:
            param_a, param_b, _, _ = beta.fit(sample, floc=loc, fscale=scale)
            parameters.append(gs.array([param_a, param_b]))
        return parameters[0] if len(data) == 1 else gs.stack(parameters)
Example 15
def pvalue(name, data):
    p_value = []
    total_number = len(name)
    for i in range(len(name)):
        sys.stdout.write('Sample tested: %d/' % (i) + str(total_number) +
                         ' \r')
        sys.stdout.flush()
        ppvp = []
        d1 = np.where(data[i] > 0, data[i], 0.0000000001)
        d1 = np.where(d1 < 1, d1, 0.9999999999)
        rl = []
        for j in range(len(d1)):
            if name[j].split('-')[0] == name[i].split('-')[0]:
                rl.append(j)
        d2 = []
        for k in range(len(d1)):
            if k in rl:
                continue
            else:
                d2.append(d1[k])
        try:
            param = beta.fit(d2, floc=0, fscale=1)
        except Exception:
            print(d1)
        rv = beta(param[0], param[1], 0, 1)
        for j in range(len(d1)):
            if j != i:
                ppvp.append(1 - rv.cdf(d1[j]))
            else:
                ppvp.append(1)
        p_value.append(ppvp)
    if save_label_files == 1:
        np.savetxt(path + '/p_values.txt', np.array(p_value))
    return np.array(p_value)
Example 16
    def fit(data: FloatIterable,
            a: Optional[float] = None,
            b: Optional[float] = None,
            c: Optional[float] = None) -> 'PERT':
        """
        Fit a PERT distribution to the data.

        :param data: Iterable of data to fit to.
        :param a: Optional fixed value for a.
        :param b: Optional fixed value for b.
        :param c: Optional fixed value for c.
        """
        kwargs = {}
        if a is not None:
            kwargs['floc'] = a
        if a is not None and c is not None:
            kwargs['fscale'] = c - a
        alpha, beta, loc, scale = beta_dist.fit(data=data, **kwargs)
        a = a if a is not None else loc
        c = c if c is not None else loc + scale
        if b is None:
            b_est_1 = a + (alpha - 1) * (c - a) / 4
            b_est_2 = c - (beta - 1) * (c - a) / 4
            b = (b_est_1 + b_est_2) / 2
        return PERT(a=a, b=b, c=c)
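
The two b estimates invert the usual PERT shape relations alpha = 1 + 4 * (b - a) / (c - a) and beta = 1 + 4 * (c - b) / (c - a); a round-trip check with hypothetical values:

a, b, c = 2.0, 5.0, 14.0
alpha = 1 + 4 * (b - a) / (c - a)     # 2.0
beta = 1 + 4 * (c - b) / (c - a)      # 4.0
b_est_1 = a + (alpha - 1) * (c - a) / 4
b_est_2 = c - (beta - 1) * (c - a) / 4
print(b_est_1, b_est_2)  # both recover b = 5.0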
Example 17
    def get_histogram(self):

        # Fit a normal distribution to the data:
        mu, std = norm.fit(self.data)
        # Fit a beta distribution to the data
        betaparams = beta.fit(self.data)

        fig = plt.figure()
        # plt.subplot(211)

        # Plot the histogram.
        plt.hist(self.data,
                 bins=2 * len(self.data),
                 density=True,
                 alpha=0.6,
                 color='g')

        # Plot the PDF.
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = norm.pdf(x, mu, std)
        plt.plot(x, p, 'k', linewidth=2)
        b = beta.pdf(x, *betaparams)
        plt.plot(x, b, 'r', linewidth=2)
        title = "Normal fit: mu = %.2f,  std = %.2f" % (mu, std)
        plt.suptitle(title)
        subtitle = "black: normal, red: beta"
        plt.title(subtitle)

        return fig
Example 18
def confirmed_prior(save = False, name = 'data/distr/confirmedratio.csv'):
    """Get ratio of confirmed cases.
    
    Args:
        save (bool, optional): Whether to save the data; off by default.
        name (str, optional): Path to save the data to.
    """
    try:
        return pd.read_csv(name)
    except FileNotFoundError:
        pass
    # get data
    pop = population.countries()
    df = src.get_data()
    tests = testing.tests()
    # group
    iso3_iso2 = {'CZE':'CZ','SWE':'SE','POL':'PL','ITA':'IT'}
    for country3 in df.iso_alpha_3.unique():
        # get country population
        country2 = iso3_iso2[country3]
        country_pop = float(pop.population[pop.region == country2])
        # normalize confirmed by tests
        country_confirmed = df[df.iso_alpha_3 == country3].confirmed.apply(lambda c: c if c > 0 else 1)
        country_tests = tests[tests.country == country3].tests.apply(lambda t: t if t > 0 else 1)
        df.loc[df.iso_alpha_3 == country3,'ratio'] = (country_confirmed / country_tests).fillna(0)
        df['ratio'] = df.ratio.apply(lambda r: r if r < 1 else 1 - 1e-6)
        df['ratio'] = df.ratio.apply(lambda r: r if r > 0 else 1e-6)
        df.loc[df.iso_alpha_3 == country3, 'tests'] = country_tests
        df.loc[df.iso_alpha_3 == country3, 'confirmed'] = country_confirmed
    df = df[['iso_alpha_3','date','confirmed','tests','ratio']]
    confirmed_fit = beta.fit(df.ratio, floc=0, fscale=1)  # (a, b, loc, scale); not returned
    # save
    if save: df.to_csv(name, index = False)
    return df
Example 19
def model_df(p_df, ax, title=''):
    # construct prior
    cov_cutoff = 100
    hi_df = p_df\
        .query('(sense + antisense) > %i' %cov_cutoff)\
        .assign(percentage_sense = lambda d: d.sense/(d.sense + d.antisense))
    fitted_params = beta.fit(data=hi_df.percentage_sense.values,
                             floc=0,
                             fscale=1)
    print(fitted_params)
    alpha0, beta0, loc, scale = fitted_params

    hist = True
    bins = 10
    sns.distplot(hi_df.percentage_sense,
                 label='High count repeats (>%i fragments)' % cov_cutoff,
                 ax=ax,
                 hist_kws={'alpha': 0.5},
                 bins=bins,
                 hist=hist)
    ls = np.linspace(0, 1, 1000)
    ax.plot(ls,
            beta.pdf(ls, alpha0, beta0, loc, scale),
            label='Fitted beta')
    #sns.distplot(np.random.beta(alpha0, beta0, size=hi_df.shape[0]))
    ax.legend(frameon=False, bbox_to_anchor=(0.6, 1.1))
    ax.set_xlabel('Proportion of sense strand fragments')
    ax.set_ylabel('Density')
    ax.set_xlim(0, 1)
    ax.set_title(title, fontsize=15)
    sns.despine()
    return alpha0, beta0
Example 20
 def get_beta_percentile_confidence(self, conf=0.60):
     '''
     I am conf percent sure that I will be there in :return: minutes or less.
     Invert conf for "minutes or more", e.g. conf=0.10 means 90 percent sure it'll take at least that long.
     '''
     betaparams = beta.fit(self.data)
     return beta.ppf(conf, *betaparams)
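
An equivalent standalone sketch, with synthetic trip times standing in for self.data:

import numpy as np
from scipy.stats import beta

rng = np.random.default_rng(0)
minutes = rng.beta(2, 5, size=500) * 60   # synthetic trip times in (0, 60)
params = beta.fit(minutes)                # (a, b, loc, scale)
print(beta.ppf(0.60, *params))            # 60% sure the trip takes this long or less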
Example 21
def define_correction_function(top_pvalues_perm, cis_mode):
    #Always try to use the MLE estimator, new default to 10 permutations.
    #If the MLE estimator fails we go back to the cruder estimation of the beta distribution.
    offset = (np.finfo(np.double).tiny * 100)
    ## Replace zeros with the smallest positive offset.
    top_pvalues_perm[top_pvalues_perm == 0] = offset
    ## Replace ones with a value just below 1.
    top_pvalues_perm[top_pvalues_perm == 1] = 1 - offset
    try:
        alpha_para, beta_para, loc, scale = beta.fit(top_pvalues_perm,
                                                     floc=0,
                                                     fscale=1)
    except (scipy.stats._continuous_distns.FitSolverError,
            scipy.stats._continuous_distns.FitDataError):
        alpha_para, beta_para = estimate_beta_function_paras(top_pvalues_perm)
    if (cis_mode):
        if (alpha_para < BETA_SHAPE1_MIN or alpha_para > BETA_SHAPE1_MAX
                or beta_para < BETA_SHAPE2_MIN_CIS
                or beta_para > BETA_SHAPE2_MAX_CIS):
            alpha_para, beta_para = estimate_beta_function_paras(
                top_pvalues_perm)
            ### If p-values become more significant after multiple-testing correction, we put them back to the original test p-value in a separate step.
    else:
        if (alpha_para < BETA_SHAPE1_MIN or alpha_para > BETA_SHAPE1_MAX
                or beta_para < BETA_SHAPE2_MIN_TRANS
                or beta_para > BETA_SHAPE2_MAX_TRANS):
            alpha_para, beta_para = estimate_beta_function_paras(
                top_pvalues_perm)

    beta_dist = scipy.stats.beta(alpha_para, beta_para)
    correction_function = beta_dist.cdf
    #Would be good to replace 0 with minimal double value of python.
    return [correction_function, alpha_para, beta_para]
Example 22
    def test_draw_samples_non_mock(self, plot=False):
        # Also make sure the non-mock sampler works
        dtype = np.float32
        num_samples = 100000

        a = np.array([2])
        b = np.array([5])

        rv_shape = (1, )

        a_mx = add_sample_dimension(mx.nd, mx.nd.array(a, dtype=dtype))
        b_mx = add_sample_dimension(mx.nd, mx.nd.array(b, dtype=dtype))

        rand_gen = None
        var = Beta.define_variable(shape=rv_shape,
                                   rand_gen=rand_gen,
                                   dtype=dtype).factor
        variables = {var.alpha.uuid: a_mx, var.beta.uuid: b_mx}
        rv_samples_rt = var.draw_samples(F=mx.nd,
                                         variables=variables,
                                         num_samples=num_samples)

        assert array_has_samples(mx.nd, rv_samples_rt)
        assert get_num_samples(mx.nd, rv_samples_rt) == num_samples
        assert rv_samples_rt.dtype == dtype

        if plot:
            plot_univariate(samples=rv_samples_rt, dist=beta, a=a[0], b=b[0])

        a_est, b_est, _, _ = beta.fit(rv_samples_rt.asnumpy().ravel())
        a_tol = 0.2
        b_tol = 0.2
        assert np.abs(a[0] - a_est) < a_tol
        assert np.abs(b[0] - b_est) < b_tol
Example 23
    def maximum_likelihood_fit(self, data, loc=0, scale=1):
        """Estimate parameters from samples.

        This is a wrapper around scipy's maximum likelihood estimator to
        estimate the parameters of a beta distribution from samples.

        Parameters
        ----------
        data : array-like, shape=[n_distributions, n_samples]
            the data to estimate parameters from. Arrays of
            different length may be passed.
        loc : float, optional
            the location parameter of the distribution to estimate parameters
            from. It is kept fixed during optimization
            default: 0
        scale : float, optional
            the scale parameter of the distribution to estimate parameters
            from. It is kept fixed during optimization
            default: 1
        Returns
        -------
        parameter : array-like, shape=[n_distributions, 2]
        """
        data = gs.to_ndarray(
            gs.where(data == 1., 1 - EPSILON, data), to_ndim=2)
        parameters = []
        for sample in data:
            param_a, param_b, _, _ = beta.fit(sample, floc=loc, fscale=scale)
            parameters.append(gs.array([param_a, param_b]))
        return parameters[0] if len(data) == 1 else gs.stack(parameters)
Example 24
 def reference_sim(self, A, classified,labels):
     num_centers = len(set(labels))
     small = .0000000000001
     ideal_A = np.zeros([A.shape[0],A.shape[1]])
     for i in range(0,len(labels)):
         for j in range(0,i+1):
             if labels[i] == labels[j]:
                 ideal_A[i,j] = 1
                 ideal_A[j,i] = 1         
     pred_pos = A[ideal_A ==1]
     pred_neg = A[ideal_A ==0]
     pos_a,pos_b,pos_loc, pos_scale= beta.fit(pred_pos)
     neg_a,neg_b,neg_loc, neg_scale= beta.fit(pred_neg)
     fits = []
     # Fit comparison with more than one cluster
     for sim in range(0, 50):
         simulated_mat = np.ones([A.shape[0],A.shape[1]])
         for i in range(0,len(labels)):
             for j in range(0,i):
                 if ideal_A[i,j] ==0:
                     simulated_mat[i,j] = simulated_mat[j,i]= beta.rvs(max(neg_a,small), max(small,neg_b), loc=neg_loc,scale =neg_scale)
                 else:
                     simulated_mat[i, j ] = simulated_mat[j, i ] = beta.rvs(max(pos_a,small), max(small,pos_b), loc=pos_loc,scale =pos_scale)
         self.one_clust_test = False
         whereAreNaNs = np.isnan(simulated_mat)
         simulated_mat[whereAreNaNs] = 0
         self.fit(simulated_mat)
         #print simulated_mat
         fits.append(self.gap_stat_)
     multi_fit = np.mean(fits)
     fits_one = []
     pos_a,pos_b,pos_loc, pos_scale= beta.fit(A)
     for sim in range(0, 50):
         simulated_mat = np.ones([A.shape[0],A.shape[1]])
         for i in range(0,len(labels)):
             for j in range(0,i):
                 simulated_mat[i,j] = simulated_mat[j,i]= beta.rvs(max(small,pos_a), max(small,pos_b), loc=pos_loc,scale =pos_scale)
         whereAreNaNs = np.isnan(simulated_mat)
         simulated_mat[whereAreNaNs] = 0
         e_vals, e_vecs  = np.linalg.eigh(simulated_mat)
         #2. Get Reverse Sorted Order  - largest to smallest
         e_order = np.argsort(e_vals)[::-1]
         self.one_clust_fit_alt(e_vecs,e_order)
         fits_one.append(self.gap_stat_)
     one_fit = np.mean(fits_one)
     return multi_fit, one_fit
Example 25
def fit_beta(data):
    EPSILON = 1e-3
    data = [EPSILON if d == 0 else 1 - EPSILON if d == 1 else d
            for d in data]  # make sure no data points EQUAL one of the bounds
    if len(set(data)) == 1:  # make sure not all data points are identical
        data = [d + random.normalvariate(0, 1e-6) for d in data]
    params = beta.fit(data, floc=0, fscale=1)
    return params[0], params[1]
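
A minimal call sketch for the helper above; note how the exact 0 and 1 are nudged inside the open interval before fitting (assumes the random/scipy imports the function relies on):

import random
from scipy.stats import beta

data = [0.0, 0.1, 0.25, 0.5, 0.5, 0.8, 1.0]
a, b = fit_beta(data)
print(a, b)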
Example 26
def gen_beta(data):
    # Param 0 is alpha
    # Param 1 is beta
    # Param 2 is loc
    data = data[~np.isnan(data)]
    data = np.array([.01 if i == 0 else .99 if i == 1 else i for i in data])
    beta_params = beta.fit(data, floc=0., fscale=1.)
    return beta_params[0], beta_params[1]
Example 27
def one_vote(N, threshold=0.5, ab=False, forecasts=False, normal=False, p=False, diagnostic=False):
	import numpy as np
	if sum(map(bool,[ab, forecasts, normal, p])) != 1:
		raise ValueError("Please specify one and only one of the 'ab', 'forecasts', 'normal', or 'p' options.")

	if ab:
		a, b = ab
	elif forecasts:
		from scipy.stats import beta
		a, b, _, _ = beta.fit(forecasts, floc=0, fscale=1)
	elif normal:
		from functions import fit_beta_to_normal
		m, s = normal
		a, b = fit_beta_to_normal(m,s)
	else:
		pass

	if p:
		from functions import mp_binom
		victory_pr = mp_binom(np.ceil(N*threshold),N,p)
		if (N*threshold).is_integer():
			# tie probability in case N*threshold is whole:
			tie_pr = mp_binom(N*(1-threshold)+1,N,p)
		elif not (N*threshold).is_integer() and threshold != 0.5:
			# tie probability in case N*threshold is not whole:
			tie_pr = mp_binom(N*(1-threshold),N,p)
		else:
			tie_pr = 0
	elif a and b:
		from functions import beta_binomial
		victory_pr = beta_binomial(np.ceil(N*threshold),N,a,b,multi_precission=True)
		if (N*threshold).is_integer():
			# tie probability in case N*threshold is whole:
			tie_pr = beta_binomial(N*(1-threshold)+1,N,a,b,multi_precission=True)
		elif not (N*threshold).is_integer() and threshold != 0.5:
			# tie probability in case N*threshold is not whole:
			tie_pr = beta_binomial(N*(1-threshold),N,a,b,multi_precission=True)
		else:
			tie_pr = 0
	else:
		pass

	if diagnostic:
		import matplotlib.pyplot as plt
		x = np.linspace(0,1,1000)
		try:
			plt.style.use('http://chymera.eu/matplotlib/styles/chymeric-gnome.mplstyle')
		except ValueError:
			plt.style.use('ggplot')
		plt.axvline(x=threshold, color="#fbb4b9", linewidth=1)
		plt.legend(['percentage\n threshold'], loc='upper right')
		plt.plot(x, beta.pdf(x,a,b))
		plt.xlabel('Reference Candidate Vote Share')
		plt.ylabel('PDF')
		plt.show()

	total_pr = victory_pr+tie_pr
	return total_pr, victory_pr, tie_pr
Example 28
def beta_fit(data, col):
    plt.hist(data[col], bins=10000, density=True, alpha=0.6, color='g')
    a, b, loc, scale = beta.fit(data[col])
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 10000)
    # plt.title("Beta fitting of "+str(col))
    # plt.plot(x, p, 'r', linewidth=2)
    # plt.show()
    return x, a, b, loc, scale
Example 29
def run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset):
  '''
from scipy import stats  
import numpy as np  
import matplotlib.pylab as plt

# create some normal random noisy data
ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20

# plot a normalized histogram
plt.hist(ser, density=True)

# find minimum and maximum of xticks, so we know
# where we should compute theoretical distribution
xt = plt.xticks()[0]  
xmin, xmax = min(xt), max(xt)  
lnspc = np.linspace(xmin, xmax, len(ser))

ab,bb,cb,db = stats.beta.fit(ser)  
pdf_beta = stats.beta.pdf(lnspc, ab, bb,cb, db)  
plt.plot(lnspc, pdf_beta, label="Beta")

plt.show()
  '''
  
  cadd_trset_param = {}
  for aaconv in cadd_trset:
    a, b, loc2, scale2 = beta.fit(cadd_trset[aaconv])
    mean2 = beta.mean(a, b, loc2, scale2)
    cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

  mnp_cadd_trset_param = {}
  for aaconv in mnp_cadd_trset:
    a, b, loc2, scale2 = beta.fit(mnp_cadd_trset[aaconv])
    mean2 = beta.mean(a, b, loc2, scale2)
    mnp_cadd_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

  gerp_trset_param = {}
  for aaconv in gerp_trset:
    a, b, loc2, scale2 = beta.fit(gerp_trset[aaconv])
    mean2 = beta.mean(a, b, loc2, scale2)
    gerp_trset_param[aaconv] = [a, b, loc2, scale2, mean2]

  return cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param
Example 30
def beta_dist_estimator(data):

    if not data.empty:

        x = data.Age.value_counts(dropna=False).sort_index()
        z = x / np.sum(x)

        a1, b1, loc1, scale1 = beta.fit(z, floc=0, fscale=1)

        return a1, b1, loc1, scale1
Example 31
def plot():
    CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))

    # Final model with exponential hyperprior
    MODEL_PATH = os.path.join(CURRENT_PATH, "..", "bayes_cv_prune",
                              "stan_models", "exp_new.stan")

    model = BayesStanPruner(MODEL_PATH, seed=0).load()

    # Simulated sets of accuracy values
    A0 = [0.4, 0.5]
    A1 = [0.80, 0.82]
    post_sample0 = model.fit_predict(A0)
    post_sample1 = model.fit_predict(A1)

    x = np.linspace(0, 1, 250)
    a, b, _, _ = beta.fit(A0, floc=0, fscale=1)
    ml_estimate0 = beta.pdf(x, a, b)
    a, b, _, _ = beta.fit(A1, floc=0, fscale=1)
    ml_estimate1 = beta.pdf(x, a, b)

    # Plot
    sns.set(context="paper",
            style="whitegrid",
            font="STIXGeneral",
            font_scale=1.25)

    bins = np.linspace(0, 1, 41)

    f, axes = plt.subplots(1, 2, figsize=(6.5, 3))
    axes[0].hist(post_sample0, bins=bins, density=True, label="Post. pred.")
    axes[0].plot(x, ml_estimate0, '-k', label="Beta ML fit")
    axes[0].set_title("A = {0.4, 0.5}")
    axes[0].legend()
    axes[1].hist(post_sample1, bins=bins, density=True, label="Post. pred.")
    axes[1].plot(x, ml_estimate1, '-k', label="Beta ML fit")
    axes[1].set_title("A = {0.80, 0.82}")
    axes[1].legend()
    plt.subplots_adjust(left=0.065, bottom=0.095, top=0.9, right=0.975)
    plt.show()
Example 32
def EI():
    """Get distributions for parameter c, connection E-I."""
    # seed
    np.random.seed(seed=12345)
    # draw from incubation period
    pars = incubation.continuous()['gamma']
    draws = gamma.rvs(*pars, size = 1000000, random_state = 12345)
    # fit beta to 1/draw
    samples = 1 / draws
    samples = samples[(samples > 0) & (samples < 1)]
    return {'x': samples,
            'beta': beta.fit(samples),
            'gamma': gamma.fit(samples, loc = .2, scale = 10)}
Example 33
def fit_beta(table, xlims=(-2, 0.5)):
    """Returns fit of Beta Distribution to a given [Fe/H] table
    See: fit_gaussian()
    """
    z_sort = np.sort(table['feh'])
    i_0 = np.searchsorted(z_sort, xlims[0])
    i_1 = np.searchsorted(z_sort, xlims[1])

    loc = xlims[0]
    scale = xlims[1] - xlims[0]

    a, b, loc, scale = beta.fit(z_sort[i_0:i_1], floc=loc, fscale=scale)
    return a, b, loc, scale
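
A usage sketch for the function above; a plain dict with a 'feh' array stands in for the table, and the synthetic metallicities are made up:

import numpy as np
from scipy.stats import beta

rng = np.random.default_rng(1)
table = {'feh': rng.normal(-0.5, 0.4, size=2000)}  # synthetic [Fe/H] values
a, b, loc, scale = fit_beta(table, xlims=(-2, 0.5))
print(a, b, loc, scale)  # loc = -2.0, scale = 2.5 by construction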
Example 34
plt.figure(figsize=(10, 7))
plt.axvline(mle, linestyle="--")
line1, = plt.plot(possible_thetas, likelihoods)

bins = [x/100 for x in range(100)]
counts, bins = np.histogram(infections_rates, bins=bins)
counts = counts / counts.sum()
line2, = plt.plot(bins[:-1], counts)
plt.xlabel("Theta")
plt.title("Evidence vs Historical Infection Rates")
plt.legend((line1, line2),
           ('Likelihood of Theta with new evidence',
            'Frequency of Theta in last 100 months'),
           loc='upper left')
plt.show()

# Model the data with a beta function
prior_a, prior_b = beta.fit(infections_rates, floc = 0, fscale = 1)[0:2] # Fit data to find a & b for the beta dist.
prior = beta(prior_a, prior_b)

prior_samples = prior.rvs(10000)  # Sample from the prior
beta_sample_counts, bins = np.histogram(prior_samples, bins)
total = beta_sample_counts.sum()
beta_sample_counts = [x / total for x in beta_sample_counts]

plt.figure(figsize=(10, 7))

line1, = plt.plot(bins[:-1], beta_sample_counts)

hist_rates, bins = np.histogram(infections_rates, bins)
total = hist_rates.sum()
hist_rates = [x/total for x in hist_rates]
line2, = plt.plot(bins[:-1], hist_rates)
Example 35
    numpy.set_printoptions(linewidth=1000000)



    probs = []
    pvals = []

    pyplot.figure(figsize=(15,10))

    for i, data in enumerate(bin_data):

        print(i)
        tdata = numpy.array(data)

        params = beta_dist.fit(tdata, floc=0)

        (alpha, beta, loc, scale) = params

        print(alpha, beta)

        vals = numpy.arange(0, strand_length + 1)
        fit_hist = numpy.array([beta_binom(val, strand_length, alpha, beta) for val in vals])

        #data_bin_edges = numpy.linspace(0.0, 1.0, num=strand_length + 1, endpoint=True)

        data_bin_edges = numpy.linspace(-0.5, strand_length + 0.5, num=strand_length + 2,  endpoint=True) / float(strand_length)

        #print data_bin_edges

Example 36
# Copyright 2015, Chen Sun (bbsunchen at outlook.com)
from sys import argv
from scipy.stats import beta

frequence_list = []

with open(argv[1]) as input_file:
    for line in input_file:
        line = line.strip()
        if line.startswith('rs#'):
            continue
        columns = line.split(' ')
        #print line
        ref_freq = float(columns[11])
        oth_freq = float(columns[14])
        if ref_freq >= 0.05 and ref_freq <= 0.95:
            frequence_list.append(ref_freq)
        if oth_freq >= 0.05 and oth_freq <= 0.95:
            frequence_list.append(oth_freq)
            
# direct fit Beta distribution
#print beta.fit(frequence_list)

# fit Beta distribution with frequency [0.05, 0.95], more precise
a,b,l,s=beta.fit(frequence_list, floc=0.04999999999999999, fscale=0.9000000000000000)
#print str(a)+'\t'+str(b)
print('{}\t{}'.format(a, b))
# fit Beta distribution with some infer
#print beta.fit(frequence_list, floc=0, fscale=1)
Example 37
#analysis of mccarthy's data
import numpy as np
import csv
import pylab as plt
import pymc as pm
from sklearn import mixture
import sys
from scipy.stats import beta
def gaussian(x, mu, sig):
    return (1/(sig*np.sqrt(2*np.pi)))*np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
filename = "C:\Users\leon\Documents\Data\McCarthys.dat"
raw_data = list(csv.reader(open(filename, 'r')))
data = np.zeros((256,2))
for idx,line in enumerate(raw_data):
    data[idx, :] =  line[0].split()
master_data = np.array([])
for line in data:
    new_data = np.repeat(line[0], line[1])
    master_data = np.append(master_data, new_data)
master_data = master_data[:, None]
#clf = mixture.GMM(n_components=2, )
#clf.fit(master_data)
#plt.hist(master_data, 100, normed = True)
#plt.plot(gaussian(np.linspace(0,50,100), clf.means_[0], clf.covars_[0]))
#plt.plot(gaussian(np.linspace(0,50,100), clf.means_[1], clf.covars_[1]))
# sys.exit()  # early exit would make the beta fit below unreachable
alpha_fit, beta_fit, loc, scale = beta.fit(master_data)  # avoid shadowing the beta module
Example 38
			smoothMeth = regionDict[key]["SMOOTHED"]
			rawScores = []
			smoothScores = []
			for r,s in zip(rawMeth, smoothMeth):
				rawScores.append(float(r['score'])/100)
				smScore = float(s['score'])/100
				if smScore == 1:
					smScore = 0.9999999999
				if smScore == 0:
					smScore = 0.0000000001
				smoothScores.append(smScore)
			print(smoothScores)
			if np.var(smoothScores) == 0:
				break
			else:
				fit = beta.fit(smoothScores, floc=0, fscale=1)

			alphas.append(fit[0])
			betas.append(fit[1])
			statuses.append(status)
	
				

fig = plt.figure()

cols = {1:'b', -1:'r', 0:'k'}

for a,b,s in zip(alphas, betas, statuses):
	plt.plot(a,b, cols[s]+'o',markersize = 2)

plt.savefig('beta_plot.pdf', bbox_inches = 'tight')
Example 39
def vplik(old, imps, cclass):
#	p = multiprocessing.Pool(multiprocessing.cpu_count())
	preimage = [(old, imps, random.randint(0, 1000000)) for i in range(PLIK_REPS)]
#	print preimage
	image = np.array(P.map(rboot,preimage))
	return beta(*beta.fit(image[:,0]))
Example 40
bin_step = 100.0/n_bins
bin_edges = numpy.arange(0.0, 100.0 + bin_step, bin_step)

print(bin_edges)

bin_data = [[] for _ in range(len(bin_edges) - 1)]


for i in range(len(x)):

    n_hb = y[i]
    val  = x[i]

    for j in range(len(bin_data)):

        if (val <= bin_edges[j+1]) and (val > bin_edges[j]):
            bin_data[j].append(n_hb)

for i, data in enumerate(bin_data):

    tdata =  numpy.array(data)

    params = beta.fit(tdata, fscale=1)
    print(i, params[0], params[1], params[2], params[3])
    #print tdata
Example 41
    if correct < 50:
        continue

    if (incorrect > 0) and (correct > 0):
        print(name, len(times[name]))
        print(correct, incorrect)
        print(np.mean(correct_times), np.mean(incorrect_times))
        #print sum([1 for c in correct_times if c >= min(incorrect_times)])/float(len(correct_times))
        #print
        max_time = max(max(correct_times),max(incorrect_times))
        min_time = min(min(correct_times),min(incorrect_times))
        data = correct_times
        data = [(t-min_time)/float(max_time-min_time) for t in data]
        #print data
        a,b,lower,scale = beta.fit(data)
        #print a,b,lower,scale
        #print
        #print beta.cdf(0.8,a,b)
        #----------------Fit using moments----------------
        mean = np.mean(data)
        var = np.var(data, ddof=1)
        alpha1 = mean**2 * (1 - mean) / var - mean
        beta1 = alpha1 * (1 - mean) / mean


        print(beta.cdf((incorrect_times[-1] - min_time) / (max_time - min_time), alpha1, beta1))
        print()
        #break

        #print correct_times
Example 42
import numpy as np
from numpy import random as rnd
from scipy.stats import t
from scipy.stats import beta
import seaborn as sns
import matplotlib.pyplot as plt

### Parameters ###
nu = 3 # arbitrary choice
sigma = 2 # arbitrary choice

### Simulate Values ###
draws1 = rnd.standard_t(nu, size = 1_000_000) # draw one million Y values (size must be an int)
draws2 = t.cdf(draws1, df = nu) # classic PIT (F_Y(Y) is standard uniform)
draws3 = t.cdf(draws1 / sigma, df = nu) # compute one million X values based on Y values
alpha_fit, beta_fit, loc_fit, scale_fit = beta.fit(draws3) # determine best-fitted beta

### Plot KDEs and CDF of Best-Fitted Beta PDF ###
kdeFig = plt.figure() # start figure
sns.set("talk") # set seaborn style to "talk" --> increase font size
# add all KDEs with plot labels in LaTeX (hence the use of raw strings)
sns.kdeplot(draws1, shade = True, clip = (-3, 3), label = r'KDE for $Y \sim t_3$')
sns.kdeplot(draws2, shade = True, clip = (-3, 3), label = r'KDE for $F_Y(Y)$')
sns.kdeplot(draws3, shade = True, clip = (-3, 3), label = r'KDE for $X$ with $\sigma = 2$')
# add pdf for best-fitted beta with plot labels in LaTeX (hence the use of raw strings)
x = np.linspace(-3,3, num = 1000) # create 1000 values between -3 and +3
y = beta.pdf(x, a = alpha_fit, b = beta_fit, loc = loc_fit, scale = scale_fit) # f(x)
plt.plot(x, y, label = r'PDF for Best-Fitted $B(\alpha, \beta)$')
# title & legend
plt.title('Kernel Density Estimation')
plt.legend(loc='upper left')