Example #1
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
from matplotlib.gridspec import GridSpec


def energyplot(energies,
               fill_color=("C0", "C1"),
               fill_alpha=(1, 0.5),
               fig=None,
               sp=None):
    # resolve the figure and grid defaults at call time, not definition time
    # (eager defaults like fig=plt.gcf() are captured once, at import)
    if fig is None:
        fig = plt.gcf()
    if sp is None:
        sp = GridSpec(1, 1)[:, :]

    # all chains share a single subplot
    ax = fig.add_subplot(sp)
    for i, energy in enumerate(energies):
        # center the energy and take first differences between draws
        mean_energy, trans_energy = energy - energy.mean(), np.diff(energy)
        pm.kdeplot(mean_energy,
                   label="Marginal Energy",
                   ax=ax,
                   shade=fill_alpha[0],
                   kwargs_shade={"color": fill_color[0]})
        pm.kdeplot(trans_energy,
                   label="Energy Transition",
                   ax=ax,
                   shade=fill_alpha[1],
                   kwargs_shade={"color": fill_color[1]})

        ax.plot([],
                label="chain {:>2} BFMI = {:.2f}".format(
                    i, pm.bfmi({"energy": energy})),
                alpha=0)
    ax.legend()

    ax.set_xticks([])
    ax.set_yticks([])
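
A usage sketch, assuming a NUTS trace from pm.sample (the trace name is illustrative): the sampler records one Hamiltonian energy series per chain in its stats, which map directly onto the energies argument:

energies = [trace.get_sampler_stats('energy', chains=[c]) for c in trace.chains]
energyplot(energies)
plt.show()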
Example #2
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt


def run(df):
    # index teams by integer codes for the model's shape parameters
    teams = df.home_team.unique()
    teams = pd.DataFrame(teams, columns=['team'])
    teams['i'] = teams.index

    df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_home'}).drop(columns='team')
    df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_away'}).drop(columns='team')
    
    observed_home_goals = df.home_score.values
    observed_away_goals = df.away_score.values
    
    home_team = df.i_home.values
    away_team = df.i_away.values
    
    num_teams = len(df.i_home.drop_duplicates())
    num_games = len(home_team)
    
    # empirical starting points for the attack / defence parameters
    # (computed here but not passed to pm.sample in this example)
    g = df.groupby('i_away')
    att_starting_points = np.log(g.away_score.mean())
    g = df.groupby('i_home')
    def_starting_points = -np.log(g.away_score.mean())
    with pm.Model() as model:
        # global model parameters
        home = pm.Flat('home')
        sd_att = pm.HalfStudentT('sd_att', nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT('sd_def', nu=3, sigma=2.5)
        intercept = pm.Flat('intercept')
    
        # team-specific model parameters
        atts_star = pm.Normal("atts_star", mu=0, sigma=sd_att, shape=num_teams)
        defs_star = pm.Normal("defs_star", mu=0, sigma=sd_def, shape=num_teams)
    
        atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
        defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
        home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
        away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
    
        # likelihood of observed data
        home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
        away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
    with model:
        trace = pm.sample(1000, tune=1000, cores=3)

    pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])
    bfmi = pm.bfmi(trace)
    max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    pm.energyplot(trace, legend=False, figsize=(6, 4))
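
A usage sketch, assuming the match results sit in a CSV with home_team, away_team, home_score and away_score columns (the file name is a placeholder):

df = pd.read_csv('matches.csv')  # placeholder path
run(df)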
Example #3
import pickle

import numpy as np
import pandas as pd
import pymc3 as pm
from sklearn.metrics import mean_squared_error

# input_data, do_prediction and calc_accurracy are project helpers defined
# elsewhere in this module


def main(output_trace_path, Xy_training_path, Xy_testing_path, output_path,
         main_cities):
    # loading data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
        (hierarchical_model, hierarchical_trace, scaler, degree_index,
         response_variable, predictor_variables, sector) = (
             data['inference'], data['trace'], data['scaler'],
             data['city_index_df'], data['response_variable'],
             data['predictor_variables'], data['sector'])

    # calculate convergence stats
    bfmi = pm.bfmi(hierarchical_trace).round(2)
    max_gr = max(
        np.max(gr_stats)
        for gr_stats in pm.gelman_rubin(hierarchical_trace).values()).round(2)
    n = pm.diagnostics.effective_n(hierarchical_trace)
    effective_samples_city_beta = n['b1']

    # fields to scale
    fields_to_scale = [response_variable] + predictor_variables

    # load and scale the training and testing data
    Xy_testing, Xy_training, degree_index = input_data(Xy_testing_path,
                                                       Xy_training_path,
                                                       fields_to_scale, scaler,
                                                       sector)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    accuracy_df = pd.DataFrame()
    accuracy_df_2 = pd.DataFrame()
    for i, city in zip(degree_index["CODE"].values,
                       degree_index["CITY"].values):
        # get mean coefficients
        alpha = data['b1__' + str(i)].mean()
        beta = data['b2__' + str(i)].mean()

        # calc accuracy against training set
        Xy_training_city = Xy_training[Xy_training["CITY"] == city]
        Xy_testing_city = Xy_testing[Xy_testing["CITY"] == city]

        if Xy_training_city.empty or Xy_testing_city.empty:
            print(city, sector, "has no data; skipping")
        else:
            # do for the training data set
            y_prediction, y_target, y_prediction_log, y_target_log = do_prediction(
                Xy_training_city, alpha, beta, response_variable,
                predictor_variables, fields_to_scale, scaler)
            n_samples_train = len(y_target)
            MAPE_single_building_train, MAPE_city_scale_train, r2_train = calc_accurracy(
                y_prediction, y_target)
            MSE_log_domain_train = mean_squared_error(y_target_log,
                                                      y_prediction_log)

            # do for the testing data set
            y_prediction, y_target, y_prediction_log, y_target_log = do_prediction(
                Xy_testing_city, alpha, beta, response_variable,
                predictor_variables, fields_to_scale, scaler)
            n_samples_test = len(y_target)
            MAPE_single_building_test, MAPE_city_scale_test, r2_test = calc_accurracy(
                y_prediction, y_target)
            MSE_log_domain_test = mean_squared_error(y_target_log,
                                                     y_prediction_log)

            # pd.DataFrame preserves column order; .from_items was removed in
            # pandas 1.0, and `dict` shadowed the builtin
            results_row = pd.DataFrame({
                "CITY": [city, city],
                "BUILDING_CLASS": [sector, sector],
                "DATASET": ["Training", "Testing"],
                "MAPE_build_EUI_%": [MAPE_single_building_train,
                                     MAPE_single_building_test],
                "PE_mean_EUI_%": [MAPE_city_scale_train, MAPE_city_scale_test],
                "MSE_log_domain": [MSE_log_domain_train, MSE_log_domain_test],
                "n_samples": [n_samples_train, n_samples_test]
            })

            # keep the main cities first in the output
            if city in main_cities:
                accuracy_df = pd.concat([accuracy_df, results_row],
                                        ignore_index=True)
            else:
                accuracy_df_2 = pd.concat([accuracy_df_2, results_row],
                                          ignore_index=True)

    # append both datasets
    accuracy_df = pd.concat([accuracy_df, accuracy_df_2], ignore_index=True)
    accuracy_df.to_csv(output_path, index=False)
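
A usage sketch; every argument below is a placeholder for the pickled trace, the training/testing CSVs, the output file, and the ordered list of main cities:

main(output_trace_path='trace.pkl',
     Xy_training_path='Xy_training.csv',
     Xy_testing_path='Xy_testing.csv',
     output_path='accuracy.csv',
     main_cities=['CityA', 'CityB'])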
Example #4
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from matplotlib import pyplot as plt


def bayesian_ab_test_prob(sample_a_total,
                          sample_a_responses,
                          sample_b_total,
                          sample_b_responses,
                          N_simulations=1000,
                          pct_tune=50,
                          gr_threshold=1.001,
                          N_additional_draws=1000,
                          lpv_height=30):
    ###########################################################################
    # get parameters for model
    # make pct_tune into a proportion
    prop_tune = pct_tune / 100
    # calculate number to tune
    N_tune = round(N_simulations * prop_tune)
    # calculate additional tuning steps
    N_additional_tune = round(N_additional_draws * prop_tune)

    ###########################################################################
    # get data for lower and upper plausible values

    # define a helper for plausible values: with a uniform prior, the posterior
    # for a rate is Beta(1 + upvotes, 1 + downvotes); the plausible values are
    # the posterior mean +/- 1.65 posterior standard deviations
    def plausible_values(total, upvotes):
        # get downvotes
        d = total - upvotes
        # Beta posterior parameters
        a = 1 + upvotes
        b = 1 + d
        # calculate lower plausible value
        lpv = (a / (a + b)) - (1.65 * np.sqrt(
            (a * b) / (((a + b)**2) * (a + b + 1))))
        # calculate upper plausible value
        upv = (a / (a + b)) + (1.65 * np.sqrt(
            (a * b) / (((a + b)**2) * (a + b + 1))))
        # return lpv and upv
        return lpv, upv
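
    # worked example (hypothetical numbers): plausible_values(total=100, upvotes=10)
    # implies a Beta(11, 91) posterior with mean 11/102 ~= 0.108 and sd ~= 0.031,
    # so it returns roughly (0.057, 0.158)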

    # sample A
    # number of non-clicks
    N_nonclicks_A = sample_a_total - sample_a_responses
    # calculate CTR
    observed_p_A = sample_a_responses / sample_a_total
    # lower and upper plausible values in sample A (one call, unpacked)
    sample_A_lpv, sample_A_upv = plausible_values(total=sample_a_total,
                                                  upvotes=sample_a_responses)

    # sample B
    # calculate non-clicks
    N_nonclicks_B = sample_b_total - sample_b_responses
    # calculate CTR
    observed_p_B = sample_b_responses / sample_b_total
    # lower and upper plausible values in sample B
    sample_B_lpv, sample_B_upv = plausible_values(total=sample_b_total,
                                                  upvotes=sample_b_responses)

    # put into df
    df = pd.DataFrame({
        'Variable': ['Sent', 'Yes', 'No', 'Rate', 'LPV', 'UPV'],
        'Sample A': [
            sample_a_total, sample_a_responses, N_nonclicks_A, observed_p_A,
            sample_A_lpv, sample_A_upv
        ],
        'Sample B': [
            sample_b_total, sample_b_responses, N_nonclicks_B, observed_p_B,
            sample_B_lpv, sample_B_upv
        ]
    })
    # create a col that is sample A minus sample B
    df['A - B'] = df['Sample A'] - df['Sample B']

    ###########################################################################
    # plot it
    x = ('Sample A', 'Sample B')
    y = (observed_p_A, observed_p_B)
    # get error values
    # lower
    # sample A
    A_low_err = observed_p_A - sample_A_lpv
    # sample B
    B_low_err = observed_p_B - sample_B_lpv
    # upper
    # sample A
    A_upp_err = sample_A_upv - observed_p_A
    # sample B
    B_upp_err = sample_B_upv - observed_p_B
    # create a programmatic title
    if sample_A_lpv > sample_B_lpv:
        title = 'Sample A has a greater LPV'
    else:
        title = 'Sample B has a greater LPV'
    yerr = np.array([(A_low_err, B_low_err), (A_upp_err, B_upp_err)])
    lpv_plot, axes = plt.subplots()
    axes.errorbar(x, y, yerr, fmt='o')
    axes.set_title(title)

    ###########################################################################
    # place the user input into artificial observations
    # sample A
    # create list for number of zeros (non-clicks)
    observations_A = [0] * N_nonclicks_A
    # create list for number of 1s
    N_clicks_list_A = [1] * sample_a_responses
    # combine lists
    observations_A.extend(N_clicks_list_A)

    # sample B
    # create list for number of zeros
    observations_B = [0] * N_nonclicks_B
    # create list for number of 1s
    N_clicks_list_B = [1] * sample_b_responses
    # combine lists
    observations_B.extend(N_clicks_list_B)

    ###########################################################################
    # set up pymc3 model with uniform priors and a Bernoulli likelihood
    # print a message
    print('\n')
    print('Model being built using {} initial, tuned draws...'.format(
        N_simulations))
    print('\n')
    # instantiate model
    with pm.Model() as model:
        # uniform priors on the click probabilities, since we assume nothing
        # about them up front
        prior_A = pm.Uniform('prior_A', 0, 1)
        prior_B = pm.Uniform('prior_B', 0, 1)
        # Bernoulli likelihood for the 0/1 observations (a Binomial on the
        # aggregated counts would be equivalent)
        posterior_A = pm.Bernoulli('posterior_A',
                                   prior_A,
                                   observed=observations_A)
        posterior_B = pm.Bernoulli('posterior_B',
                                   prior_B,
                                   observed=observations_B)
        # sample from the posterior; note that `draws` in pm.sample already
        # excludes tuning, so this keeps N_simulations + N_tune draws
        trace = pm.sample(draws=N_simulations + N_tune, tune=N_tune)
    # get maximum value of Gelman-Rubin test
    max_gr = max(
        np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    # while the model has not converged, keep drawing additional samples
    while max_gr > gr_threshold:
        # print message
        print('\n')
        print('Gelman-Rubin statistic: {}'.format(max_gr))
        print(
            'Gelman-Rubin statistic is too large, {} additional draws will be taken.'
            .format(N_additional_draws))
        print('\n')
        with model:
            trace = pm.sample(draws=N_additional_draws + N_additional_tune,
                              tune=N_additional_tune)
        # get maximum value of Gelman-Rubin test
        max_gr = max(
            np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
        # add N_additional_draws to N_simulations
        N_simulations += N_additional_draws
    # print message
    print('\n')
    print(
        'Success! Model has converged after {} draws. Final Gelman-Rubin: {}'.
        format(N_simulations, max_gr))

    ###########################################################################
    # display convergence stats
    # calculate the bayesian fraction of missing information
    bfmi = pm.bfmi(trace)
    # get maximum value of Gelman-Rubin test
    max_gr = max(
        np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    # print message
    print('Bayesian fraction of missing information: {}'.format(bfmi))
    print('\n')

    ###########################################################################
    # get distributions
    # sample A
    p_A_samples = trace['prior_A']
    # sample B
    p_B_samples = trace['prior_B']

    ###########################################################################
    # plot the distributions
    sns.set(style='white', palette='muted', color_codes=True)
    # Set up the matplotlib figure
    dist_plot, axes = plt.subplots(figsize=(7, 7), sharex=True)
    dist_plot.suptitle(
        'Posterior distributions of $p_A$ (blue) and $p_B$ (red) after {} draws'
        .format(N_simulations))
    sns.despine(left=True)
    # posterior A
    p1 = sns.distplot(p_A_samples, color='b', label='Posterior of $p_A$')
    p1.vlines(sample_A_lpv,
              0,
              lpv_height,
              colors='b',
              linestyle='--',
              label='Sample A LPV: {0:0.3f}'.format(sample_A_lpv))
    p1.legend(loc='upper left')
    # posterior B
    p2 = sns.distplot(p_B_samples, color='r', label='Posterior of $p_B$')
    p2.vlines(sample_B_lpv,
              0,
              lpv_height,
              colors='r',
              linestyle='--',
              label='Sample B LPV: {0:0.3f}'.format(sample_B_lpv))
    p2.legend(loc='upper left')
    # display plot
    plt.tight_layout()  # fix any overlapping

    ###########################################################################
    # proportion of draws in which p_A exceeds p_B, compared draw by draw
    proportion_A_greater_than_B = np.mean(p_A_samples > p_B_samples)
    # calculate proportion B greater than A
    proportion_B_greater_than_A = 1 - proportion_A_greater_than_B

    ###########################################################################
    # put all of the objects we want inside of a class so they can be returned
    class Attributes:
        def __init__(self, df, lpv_plot, p_A_samples, p_B_samples, bfmi,
                     max_gr, dist_plot, proportion_A_greater_than_B,
                     proportion_B_greater_than_A):
            self.df = df
            self.lpv_plot = lpv_plot
            self.p_A_samples = p_A_samples
            self.p_B_samples = p_B_samples
            self.bfmi = bfmi
            self.max_gr = max_gr
            self.dist_plot = dist_plot
            self.proportion_A_greater_than_B = proportion_A_greater_than_B
            self.proportion_B_greater_than_A = proportion_B_greater_than_A

    x = Attributes(df, lpv_plot, p_A_samples, p_B_samples, bfmi, max_gr,
                   dist_plot, proportion_A_greater_than_B,
                   proportion_B_greater_than_A)
    return x
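
A minimal usage sketch; the counts below are made-up illustration values, not data from the source:

results = bayesian_ab_test_prob(sample_a_total=1000,
                                sample_a_responses=110,
                                sample_b_total=1000,
                                sample_b_responses=125)
print(results.df)
print('P(A > B) = {:.3f}'.format(results.proportion_A_greater_than_B))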
Example #5
import pickle

import numpy as np
import pandas as pd
import pymc3 as pm

# calc_accurracy is a project helper defined elsewhere in this module


def main(output_trace_path, Xy_training_path, Xy_testing_path, output_path,
         main_cities):
    # loading data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
        (hierarchical_model, hierarchical_trace, scaler, degree_index,
         response_variable, predictor_variables) = (
             data['inference'], data['trace'], data['scaler'],
             data['city_index_df'], data['response_variable'],
             data['predictor_variables'])

    # calculate convergence stats
    bfmi = pm.bfmi(hierarchical_trace).round(2)
    max_gr = max(
        np.max(gr_stats)
        for gr_stats in pm.gelman_rubin(hierarchical_trace).values()).round(2)
    n = pm.diagnostics.effective_n(hierarchical_trace)
    effective_samples_city_beta = n['beta']
    effective_samples_global_beta = n['global_b']

    # fields to scale
    fields_to_scale = [response_variable] + predictor_variables

    # load the training and testing data
    Xy_training = pd.read_csv(Xy_training_path)
    Xy_testing = pd.read_csv(Xy_testing_path)

    if scaler is not None:
        Xy_training[fields_to_scale] = pd.DataFrame(
            scaler.transform(Xy_training[fields_to_scale]),
            columns=Xy_training[fields_to_scale].columns)

        Xy_testing[fields_to_scale] = pd.DataFrame(
            scaler.transform(Xy_testing[fields_to_scale]),
            columns=Xy_testing[fields_to_scale].columns)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR ALL CLASSES IN THE MODEL (CITIES)
    # get mean coefficients
    alpha = data['global_a'].mean()
    beta = data['global_b'].mean()
    gamma = data['global_c'].mean()
    # epsilon = data['global_d'].mean()
    # err = data['eps'].mean()
    epsilon = 1

    # calc accuracy against training set
    # get scaled values for the city
    MAPE_single_building_train, MAPE_all_buildings_train, R2_train = calc_accurracy(
        Xy_training, alpha, beta, epsilon, gamma, response_variable,
        predictor_variables, fields_to_scale, scaler)

    # calc accuracy against testing set
    MAPE_single_building_test, MAPE_all_buildings_test, R2_test = calc_accurracy(
        Xy_testing, alpha, beta, epsilon, gamma, response_variable,
        predictor_variables, fields_to_scale, scaler)

    # pd.DataFrame preserves column order; .from_items was removed in pandas 1.0
    accuracy_df = pd.DataFrame({
        "CITY": ["All", ""],
        "DATASET": ["Training", "Testing"],
        "MAPE_building [%]": [MAPE_single_building_train,
                              MAPE_single_building_test],
        "MAPE_city [%]": [MAPE_all_buildings_train, MAPE_all_buildings_test],
        "R2 [-]": [R2_train, R2_test],
        "BFMI [-]": [bfmi, ""],
        "GR [-]": [max_gr, ""],
        "N_eff": [effective_samples_global_beta, ""]
    })
    accuracy_df_2 = pd.DataFrame()

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    for i, city in zip(degree_index["CODE"].values,
                       degree_index["CITY"].values):
        # get mean coefficients
        alpha = data['alpha__' + str(i)].mean()
        beta = data['beta__' + str(i)].mean()
        gamma = data['gamma__' + str(i)].mean()
        # epsilon = data['epsilon__' + str(i)].mean()
        # err = data['eps'].mean()

        # calc accuracy against training set
        Xy_training_city = Xy_training[Xy_training["CITY"] == city]
        MAPE_single_building_train, MAPE_all_buildings_train, R2_train = calc_accurracy(
            Xy_training_city, alpha, beta, epsilon, gamma, response_variable,
            predictor_variables, fields_to_scale, scaler)

        # calc accuracy against testing set
        Xy_testing_city = Xy_testing[Xy_testing["CITY"] == city]
        MAPE_single_building_test, MAPE_all_buildings_test, R2_test = calc_accurracy(
            Xy_testing_city, alpha, beta, epsilon, gamma, response_variable,
            predictor_variables, fields_to_scale, scaler)

        results_row = pd.DataFrame({
            "CITY": [city, ""],
            "DATASET": ["Training", "Testing"],
            "MAPE_building [%]": [MAPE_single_building_train,
                                  MAPE_single_building_test],
            "MAPE_city [%]": [MAPE_all_buildings_train,
                              MAPE_all_buildings_test],
            "R2 [-]": [R2_train, R2_test],
            "BFMI [-]": [bfmi, ""],
            "GR [-]": [max_gr, ""],
            "N_eff": [effective_samples_city_beta[i], ""]
        })

        # keep the main cities first in the output
        if city in main_cities:
            accuracy_df = pd.concat([accuracy_df, results_row],
                                    ignore_index=True)
        else:
            accuracy_df_2 = pd.concat([accuracy_df_2, results_row],
                                      ignore_index=True)

    # append both datasets
    accuracy_df = pd.concat([accuracy_df, accuracy_df_2], ignore_index=True)
    accuracy_df.to_csv(output_path, index=False)
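
The calling convention is the same as in Example #3; a sketch with placeholder arguments:

main('trace.pkl', 'Xy_training.csv', 'Xy_testing.csv', 'accuracy.csv',
     main_cities=['CityA', 'CityB'])  # all paths and city names are placeholders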
Example #6
    # continuation: these lines sit inside the `with pm.Model() as model:`
    # block built as in Example #2
    home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # likelihood of observed data
    home_points = pm.Poisson('home_points',
                             mu=home_theta,
                             observed=observed_home_goals)
    away_points = pm.Poisson('away_points',
                             mu=away_theta,
                             observed=observed_away_goals)

with model:
    trace = pm.sample(1000, tune=1000, cores=3)

pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])

bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())

#print(pm.stats.hpd(trace['atts']))

#print(pm.stats.quantiles(trace['atts'])[50])

df_hpd = pd.DataFrame(pm.stats.hpd(trace['atts']),
                      columns=['hpd_low', 'hpd_high'],
                      index=teams.team.values)
df_median = pd.DataFrame(pm.stats.quantiles(trace['atts'])[50],
                         columns=['hpd_median'],
                         index=teams.team.values)
df_hpd = df_hpd.join(df_median)
df_hpd['relative_lower'] = df_hpd.hpd_median - df_hpd.hpd_low
df_hpd['relative_upper'] = df_hpd.hpd_high - df_hpd.hpd_median
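
The relative_lower/relative_upper columns are shaped for an error-bar ("forest") plot of the attack parameters; a minimal sketch, assuming numpy and matplotlib.pyplot are already imported as np and plt:

fig, ax = plt.subplots(figsize=(6, 8))
ax.errorbar(df_hpd.hpd_median,
            np.arange(len(df_hpd)),
            xerr=[df_hpd.relative_lower, df_hpd.relative_upper],
            fmt='o')
ax.set_yticks(np.arange(len(df_hpd)))
ax.set_yticklabels(df_hpd.index)
ax.set_xlabel('posterior attack strength (median and HPD interval)')
plt.show()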
Example #7
# ================================================================================
# Return a B-spline basis element B(x | t[0], ..., t[k+1])
# xx = np.linspace(1, 15, Num)  # unused; `Num` is not defined in this snippet
b = sp.interpolate.BSpline.basis_element(knots[1:])
print(b)
fig, ax = plt.subplots()
x = np.linspace(0, 12, 200)
ax.plot(x, b(x), 'g', lw=3)
ax.grid(True)
plt.show()

pm.traceplot(trace_1)
plt.show()

ax = pm.energyplot(trace_1)
bfmi = pm.bfmi(trace_1)
ax.set_title(f"BFMI = {bfmi:.2f}")
plt.show()
varnames2 = ['δ', 'δB', 'δC']
tmp0 = pm.summary(trace_1, varnames2)  # pm.df_summary is a deprecated alias of pm.summary
print(tmp0)

# ================================================================================
Bx_.set_value(basis_funcs(xs_yearA.get_value()))
# build a comparison model: swap the first-order regression for a Gaussian random walk
with pm.Model() as model_3:
    # define priors
    alpha3 = pm.HalfCauchy('alpha3', 10., testval=1.15)

    beta0 = pm.GaussianRandomWalk('beta0', sd=1, shape=Num_5)
    beta1 = pm.GaussianRandomWalk('beta1', sd=1, shape=Num_5)