def energyplot(energies, fill_color=("C0", "C1"), fill_alpha=(1, 0.5),
               fig=None, sp=None):
    # Resolve the figure and gridspec at call time: plt.gcf() or GridSpec(...)
    # as default arguments would be evaluated once at import and reused forever.
    if fig is None:
        fig = plt.gcf()
    if sp is None:
        sp = GridSpec(1, 1)[:, :]

    for i, energy in enumerate(energies):
        mean_energy, trans_energy = energy - energy.mean(), np.diff(energy)
        ax = fig.add_subplot(sp)
        pm.kdeplot(mean_energy, label="Marginal Energy", ax=ax,
                   shade=fill_alpha[0], kwargs_shade={"color": fill_color[0]})
        pm.kdeplot(trans_energy, label="Energy Transition", ax=ax,
                   shade=fill_alpha[1], kwargs_shade={"color": fill_color[1]})
        ax.plot([], label="chain {:>2} BFMI = {:.2f}".format(
            i, pm.bfmi({"energy": energy})), alpha=0)
    ax.legend()
    ax.set_xticks([])
    ax.set_yticks([])
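# Usage sketch for energyplot above (assumes an existing NUTS-sampled PyMC3
# trace named `trace`; the per-chain energies live in the sampler stats):
energies = [trace.get_sampler_stats("energy", chains=[c]) for c in trace.chains]
fig = plt.figure(figsize=(6, 4))
energyplot(energies, fig=fig)
plt.show()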
def run(df):
    # df must be passed in (or be global); the original `def run():` raised
    # UnboundLocalError because df is reassigned below before being defined.
    teams = df.home_team.unique()
    teams = pd.DataFrame(teams, columns=['team'])
    teams['i'] = teams.index

    df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
    df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
    df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

    observed_home_goals = df.home_score.values
    observed_away_goals = df.away_score.values
    home_team = df.i_home.values
    away_team = df.i_away.values
    num_teams = len(df.i_home.drop_duplicates())
    num_games = len(home_team)

    g = df.groupby('i_away')
    att_starting_points = np.log(g.away_score.mean())
    g = df.groupby('i_home')
    def_starting_points = -np.log(g.away_score.mean())

    with pm.Model() as model:
        # global model parameters
        home = pm.Flat('home')
        sd_att = pm.HalfStudentT('sd_att', nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT('sd_def', nu=3, sigma=2.5)
        intercept = pm.Flat('intercept')

        # team-specific model parameters
        atts_star = pm.Normal("atts_star", mu=0, sigma=sd_att, shape=num_teams)
        defs_star = pm.Normal("defs_star", mu=0, sigma=sd_def, shape=num_teams)
        atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
        defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
        home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
        away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

        # likelihood of observed data
        home_points = pm.Poisson('home_points', mu=home_theta,
                                 observed=observed_home_goals)
        away_points = pm.Poisson('away_points', mu=away_theta,
                                 observed=observed_away_goals)

        trace = pm.sample(1000, tune=1000, cores=3)

    pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])

    bfmi = pm.bfmi(trace)
    max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
    pm.energyplot(trace, legend=False, figsize=(6, 4))
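# Toy sketch of the team-index mapping used in run() above (hypothetical
# data): each team name gets an integer index so that atts[home_team] and
# defs[away_team] vectorize over games.
games = pd.DataFrame({'home_team': ['Arsenal', 'Chelsea'],
                      'away_team': ['Chelsea', 'Arsenal']})
teams = pd.DataFrame({'team': games.home_team.unique()})
teams['i'] = teams.index
indexed = (games
           .merge(teams, left_on='home_team', right_on='team', how='left')
           .rename(columns={'i': 'i_home'}).drop('team', axis=1)
           .merge(teams, left_on='away_team', right_on='team', how='left')
           .rename(columns={'i': 'i_away'}).drop('team', axis=1))
print(indexed)  # columns: home_team, away_team, i_home, i_away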
def main(output_trace_path, Xy_training_path, Xy_testing_path, output_path, main_cities):
    # load data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
    hierarchical_model, hierarchical_trace, scaler, degree_index, \
        response_variable, predictor_variables, sector = (
            data['inference'], data['trace'], data['scaler'],
            data['city_index_df'], data['response_variable'],
            data['predictor_variables'], data['sector'])

    # calculate convergence stats
    bfmi = pm.bfmi(hierarchical_trace).round(2)
    max_gr = max(np.max(gr_stats)
                 for gr_stats in pm.gelman_rubin(hierarchical_trace).values()).round(2)
    n = pm.diagnostics.effective_n(hierarchical_trace)
    effective_samples_city_beta = n['b1']

    # fields to scale, get data of traces
    fields_to_scale = [response_variable] + predictor_variables
    Xy_testing, Xy_training, degree_index = input_data(Xy_testing_path, Xy_training_path,
                                                       fields_to_scale, scaler, sector)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    accuracy_df = pd.DataFrame()
    accuracy_df_2 = pd.DataFrame()
    for i, city in zip(degree_index["CODE"].values, degree_index["CITY"].values):
        # get mean coefficients
        alpha = data['b1__' + str(i)].mean()
        beta = data['b2__' + str(i)].mean()

        # calculate accuracy against the training and testing sets
        Xy_training_city = Xy_training[Xy_training["CITY"] == city]
        Xy_testing_city = Xy_testing[Xy_testing["CITY"] == city]
        if Xy_training_city.empty or Xy_testing_city.empty:
            print(city, sector, "does not exist, we are skipping it")
        else:
            # training data set
            y_prediction, y_target, y_prediction_log, y_target_log = do_prediction(
                Xy_training_city, alpha, beta, response_variable,
                predictor_variables, fields_to_scale, scaler)
            n_samples_train = len(y_target)
            MAPE_single_building_train, MAPE_city_scale_train, r2_train = calc_accurracy(
                y_prediction, y_target)
            MSE_log_domain_train = mean_squared_error(y_target_log, y_prediction_log)

            # testing data set
            y_prediction, y_target, y_prediction_log, y_target_log = do_prediction(
                Xy_testing_city, alpha, beta, response_variable,
                predictor_variables, fields_to_scale, scaler)
            n_samples_test = len(y_target)
            MAPE_single_building_test, MAPE_city_scale_test, r2_test = calc_accurracy(
                y_prediction, y_target)
            MSE_log_domain_test = mean_squared_error(y_target_log, y_prediction_log)

            # pd.DataFrame.from_items is gone in modern pandas; a plain dict
            # keeps column order in Python 3.7+ (also avoids shadowing the
            # builtin `dict`, as the original did)
            city_df = pd.DataFrame({
                "CITY": [city, city],
                "BUILDING_CLASS": [sector, sector],
                "DATASET": ["Training", "Testing"],
                "MAPE_build_EUI_%": [MAPE_single_building_train, MAPE_single_building_test],
                "PE_mean_EUI_%": [MAPE_city_scale_train, MAPE_city_scale_test],
                "MSE_log_domain": [MSE_log_domain_train, MSE_log_domain_test],
                "n_samples": [n_samples_train, n_samples_test],
            })

            # keep the main cities first in the output
            if city in main_cities:
                accuracy_df = pd.concat([accuracy_df, city_df], ignore_index=True)
            else:
                accuracy_df_2 = pd.concat([accuracy_df_2, city_df], ignore_index=True)

    # append both datasets
    accuracy_df = pd.concat([accuracy_df, accuracy_df_2], ignore_index=True)
    accuracy_df.to_csv(output_path, index=False)
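# Hypothetical sketch of the metrics calc_accurracy appears to return (the
# real implementation lives elsewhere in this project): per-building MAPE,
# percentage error on the city-scale mean, and R^2.
from sklearn.metrics import r2_score

def calc_accuracy_sketch(y_prediction, y_target):
    y_prediction, y_target = np.asarray(y_prediction), np.asarray(y_target)
    # mean absolute percentage error across individual buildings
    mape_single_building = np.mean(np.abs((y_target - y_prediction) / y_target)) * 100
    # percentage error of the city-scale mean EUI
    pe_city_scale = abs(y_target.mean() - y_prediction.mean()) / y_target.mean() * 100
    return mape_single_building, pe_city_scale, r2_score(y_target, y_prediction)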
def bayesian_ab_test_prob(sample_a_total, sample_a_responses, sample_b_total,
                          sample_b_responses, N_simulations=1000, pct_tune=50,
                          gr_threshold=1.001, N_additional_draws=1000, lpv_height=30):
    ###########################################################################
    # get parameters for model
    # make pct_tune into a proportion
    prop_tune = pct_tune / 100
    # calculate number to tune
    N_tune = round(N_simulations * prop_tune)
    # calculate additional tuning steps
    N_additional_tune = round(N_additional_draws * prop_tune)

    ###########################################################################
    # get data for lower and upper plausible values
    def plausible_values(total, upvotes):
        """Lower/upper plausible values of a Beta(1 + upvotes, 1 + downvotes)
        posterior: mean -/+ 1.65 posterior standard deviations."""
        d = total - upvotes
        a = 1 + upvotes
        b = 1 + d
        spread = 1.65 * np.sqrt((a * b) / (((a + b) ** 2) * (a + b + 1)))
        lpv = (a / (a + b)) - spread
        upv = (a / (a + b)) + spread
        return lpv, upv

    # sample A
    N_nonclicks_A = sample_a_total - sample_a_responses
    observed_p_A = sample_a_responses / sample_a_total
    sample_A_lpv, sample_A_upv = plausible_values(total=sample_a_total,
                                                  upvotes=sample_a_responses)

    # sample B
    N_nonclicks_B = sample_b_total - sample_b_responses
    observed_p_B = sample_b_responses / sample_b_total
    sample_B_lpv, sample_B_upv = plausible_values(total=sample_b_total,
                                                  upvotes=sample_b_responses)

    # put into df
    df = pd.DataFrame({
        'Variable': ['Sent', 'Yes', 'No', 'Rate', 'LPV', 'UPV'],
        'Sample A': [sample_a_total, sample_a_responses, N_nonclicks_A,
                     observed_p_A, sample_A_lpv, sample_A_upv],
        'Sample B': [sample_b_total, sample_b_responses, N_nonclicks_B,
                     observed_p_B, sample_B_lpv, sample_B_upv]
    })
    # create a col that is sample A minus sample B
    df['A - B'] = df['Sample A'] - df['Sample B']

    ###########################################################################
    # plot observed rates with plausible-value error bars
    x = ('Sample A', 'Sample B')
    y = (observed_p_A, observed_p_B)
    # lower errors
    A_low_err = observed_p_A - sample_A_lpv
    B_low_err = observed_p_B - sample_B_lpv
    # upper errors
    A_upp_err = sample_A_upv - observed_p_A
    B_upp_err = sample_B_upv - observed_p_B
    # create a programmatic title
    if sample_A_lpv > sample_B_lpv:
        title = 'Sample A has a greater LPV'
    else:
        title = 'Sample B has a greater LPV'
    yerr = np.array([(A_low_err, B_low_err), (A_upp_err, B_upp_err)])
    lpv_plot, axes = plt.subplots()
    axes.errorbar(x, y, yerr, fmt='o')
    axes.set_title(title)

    ###########################################################################
    # expand the user-supplied counts into artificial 0/1 observations
    observations_A = [0] * N_nonclicks_A + [1] * sample_a_responses
    observations_B = [0] * N_nonclicks_B + [1] * sample_b_responses

    ###########################################################################
    # set up the PyMC3 model: Uniform prior, Bernoulli likelihood
    print('\nModel being built using {} initial, tuned draws...\n'.format(N_simulations))
    with pm.Model() as model:
        # Uniform priors on the click probabilities, since we have no prior
        # information (objective prior)
        prior_A = pm.Uniform('prior_A', 0, 1)
        prior_B = pm.Uniform('prior_B', 0, 1)
        # Bernoulli likelihood for the 0/1 observations (a Binomial on the
        # raw counts would be equivalent and faster)
        posterior_A = pm.Bernoulli('posterior_A', prior_A, observed=observations_A)
        posterior_B = pm.Bernoulli('posterior_B', prior_B, observed=observations_B)
        # get samples from the posterior distribution
        trace = pm.sample(draws=N_simulations + N_tune, tune=N_tune)

    # get maximum value of the Gelman-Rubin statistic across parameters
    max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())

    # if the model has not converged, keep drawing
    while max_gr > gr_threshold:
        print('\nGelman-Rubin statistic: {}'.format(max_gr))
        print('Gelman-Rubin statistic is too large, {} additional draws will be taken.\n'
              .format(N_additional_draws))
        with model:
            trace = pm.sample(draws=N_additional_draws + N_additional_tune,
                              tune=N_additional_tune)
        max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())
        N_simulations += N_additional_draws

    print('\nSuccess! Model has converged after {} draws. Final Gelman-Rubin: {}'
          .format(N_simulations, max_gr))

    ###########################################################################
    # display convergence stats
    # calculate the Bayesian fraction of missing information
    bfmi = pm.bfmi(trace)
    print('Bayesian fraction of missing information: {}\n'.format(bfmi))

    ###########################################################################
    # get posterior samples
    p_A_samples = trace['prior_A']
    p_B_samples = trace['prior_B']

    ###########################################################################
    # plot the distributions
    sns.set(style='white', palette='muted', color_codes=True)
    dist_plot, axes = plt.subplots(figsize=(7, 7), sharex=True)
    dist_plot.suptitle('Posterior distributions of $p_A$ (blue) and $p_B$ (red) '
                       'after {} draws'.format(N_simulations))
    sns.despine(left=True)
    # posterior A
    p1 = sns.distplot(p_A_samples, color='b', label='Posterior of $p_A$')
    p1.vlines(sample_A_lpv, 0, lpv_height, colors='b', linestyle='--',
              label='Sample A LPV: {0:0.3f}'.format(sample_A_lpv))
    p1.legend(loc='upper left')
    # posterior B
    p2 = sns.distplot(p_B_samples, color='r', label='Posterior of $p_B$')
    p2.vlines(sample_B_lpv, 0, lpv_height, colors='r', linestyle='--',
              label='Sample B LPV: {0:0.3f}'.format(sample_B_lpv))
    p2.legend(loc='upper left')
    plt.tight_layout()  # fix any overlapping

    ###########################################################################
    # proportion of posterior draws in which A beats B
    proportion_A_greater_than_B = np.mean(p_A_samples > p_B_samples)
    proportion_B_greater_than_A = 1 - proportion_A_greater_than_B

    ###########################################################################
    # bundle everything we want to return into a class
    class Attributes:
        def __init__(self, df, lpv_plot, p_A_samples, p_B_samples, bfmi, max_gr,
                     dist_plot, proportion_A_greater_than_B,
                     proportion_B_greater_than_A):
            self.df = df
            self.lpv_plot = lpv_plot
            self.p_A_samples = p_A_samples
            self.p_B_samples = p_B_samples
            self.bfmi = bfmi
            self.max_gr = max_gr
            self.dist_plot = dist_plot
            self.proportion_A_greater_than_B = proportion_A_greater_than_B
            self.proportion_B_greater_than_A = proportion_B_greater_than_A

    return Attributes(df, lpv_plot, p_A_samples, p_B_samples, bfmi, max_gr,
                      dist_plot, proportion_A_greater_than_B,
                      proportion_B_greater_than_A)
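# Usage sketch with made-up counts (hypothetical data): 1000 sends per
# variant, 110 vs. 125 responses.
result = bayesian_ab_test_prob(sample_a_total=1000, sample_a_responses=110,
                               sample_b_total=1000, sample_b_responses=125)
print(result.df)
print('P(A > B) = {:.3f}'.format(result.proportion_A_greater_than_B))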
def main(output_trace_path, Xy_training_path, Xy_testing_path, output_path, main_cities):
    # load data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
    hierarchical_model, hierarchical_trace, scaler, degree_index, \
        response_variable, predictor_variables = (
            data['inference'], data['trace'], data['scaler'],
            data['city_index_df'], data['response_variable'],
            data['predictor_variables'])

    # calculate convergence stats
    bfmi = pm.bfmi(hierarchical_trace).round(2)
    max_gr = max(np.max(gr_stats)
                 for gr_stats in pm.gelman_rubin(hierarchical_trace).values()).round(2)
    n = pm.diagnostics.effective_n(hierarchical_trace)
    effective_samples_city_beta = n['beta']
    effective_samples_global_beta = n['global_b']

    # fields to scale
    fields_to_scale = [response_variable] + predictor_variables

    # scale the training and testing data with the stored scaler
    Xy_training = pd.read_csv(Xy_training_path)
    Xy_testing = pd.read_csv(Xy_testing_path)
    if scaler is not None:
        Xy_training[fields_to_scale] = pd.DataFrame(
            scaler.transform(Xy_training[fields_to_scale]),
            columns=Xy_training[fields_to_scale].columns)
        Xy_testing[fields_to_scale] = pd.DataFrame(
            scaler.transform(Xy_testing[fields_to_scale]),
            columns=Xy_testing[fields_to_scale].columns)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR ALL CLASSES IN THE MODEL (CITIES)
    # get mean coefficients
    alpha = data['global_a'].mean()
    beta = data['global_b'].mean()
    gamma = data['global_c'].mean()
    # epsilon = data['global_d'].mean()
    # err = data['eps'].mean()
    epsilon = 1

    # calculate accuracy against the training set
    MAPE_single_building_train, MAPE_all_buildings_train, R2_train = calc_accurracy(
        Xy_training, alpha, beta, epsilon, gamma, response_variable,
        predictor_variables, fields_to_scale, scaler)

    # calculate accuracy against the testing set
    MAPE_single_building_test, MAPE_all_buildings_test, R2_test = calc_accurracy(
        Xy_testing, alpha, beta, epsilon, gamma, response_variable,
        predictor_variables, fields_to_scale, scaler)

    # pd.DataFrame.from_items is gone in modern pandas; a plain dict keeps
    # column order in Python 3.7+
    accuracy_df = pd.DataFrame({
        "CITY": ["All", ""],
        "DATASET": ["Training", "Testing"],
        "MAPE_building [%]": [MAPE_single_building_train, MAPE_single_building_test],
        "MAPE_city [%]": [MAPE_all_buildings_train, MAPE_all_buildings_test],
        "R2 [-]": [R2_train, R2_test],
        "BFMI [-]": [bfmi, ""],
        "GR [-]": [max_gr, ""],
        "N_eff": [effective_samples_global_beta, ""],
    })
    accuracy_df_2 = pd.DataFrame()

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    for i, city in zip(degree_index["CODE"].values, degree_index["CITY"].values):
        # get mean coefficients
        alpha = data['alpha__' + str(i)].mean()
        beta = data['beta__' + str(i)].mean()
        gamma = data['gamma__' + str(i)].mean()
        # epsilon = data['epsilon__' + str(i)].mean()
        # err = data['eps'].mean()

        # calculate accuracy against the training set
        Xy_training_city = Xy_training[Xy_training["CITY"] == city]
        MAPE_single_building_train, MAPE_all_buildings_train, R2_train = calc_accurracy(
            Xy_training_city, alpha, beta, epsilon, gamma, response_variable,
            predictor_variables, fields_to_scale, scaler)

        # calculate accuracy against the testing set
        Xy_testing_city = Xy_testing[Xy_testing["CITY"] == city]
        MAPE_single_building_test, MAPE_all_buildings_test, R2_test = calc_accurracy(
            Xy_testing_city, alpha, beta, epsilon, gamma, response_variable,
            predictor_variables, fields_to_scale, scaler)

        city_df = pd.DataFrame({
            "CITY": [city, ""],
            "DATASET": ["Training", "Testing"],
            "MAPE_building [%]": [MAPE_single_building_train, MAPE_single_building_test],
            "MAPE_city [%]": [MAPE_all_buildings_train, MAPE_all_buildings_test],
            "R2 [-]": [R2_train, R2_test],
            "BFMI [-]": [bfmi, ""],
            "GR [-]": [max_gr, ""],
            "N_eff": [effective_samples_city_beta[i], ""],
        })

        # keep the main cities first in the output
        if city in main_cities:
            accuracy_df = pd.concat([accuracy_df, city_df], ignore_index=True)
        else:
            accuracy_df_2 = pd.concat([accuracy_df_2, city_df], ignore_index=True)

    # append both datasets
    accuracy_df = pd.concat([accuracy_df, accuracy_df_2], ignore_index=True)
    accuracy_df.to_csv(output_path, index=False)
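# Rule-of-thumb reading of the two convergence diagnostics computed in
# main() above (assumes `bfmi` and `max_gr` are in scope; the thresholds
# are common conventions, not part of this project):
if bfmi < 0.3:
    print('Low BFMI ({}): the energy distribution is poorly explored; '
          'consider reparameterizing the model.'.format(bfmi))
if max_gr > 1.01:
    print('Gelman-Rubin {} > 1.01: chains have likely not converged; '
          'take more tuning steps or draws.'.format(max_gr))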
    # (continued from the `with pm.Model() as model:` block above)
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # likelihood of observed data
    home_points = pm.Poisson('home_points', mu=home_theta,
                             observed=observed_home_goals)
    away_points = pm.Poisson('away_points', mu=away_theta,
                             observed=observed_away_goals)

with model:
    trace = pm.sample(1000, tune=1000, cores=3)

pm.traceplot(trace, var_names=['intercept', 'home', 'sd_att', 'sd_def'])

bfmi = pm.bfmi(trace)
max_gr = max(np.max(gr_stats) for gr_stats in pm.gelman_rubin(trace).values())

# print(pm.stats.hpd(trace['atts']))
# print(pm.stats.quantiles(trace['atts'])[50])
df_hpd = pd.DataFrame(pm.stats.hpd(trace['atts']),
                      columns=['hpd_low', 'hpd_high'],
                      index=teams.team.values)
df_median = pd.DataFrame(pm.stats.quantiles(trace['atts'])[50],
                         columns=['hpd_median'],
                         index=teams.team.values)
df_hpd = df_hpd.join(df_median)
df_hpd['relative_lower'] = df_hpd.hpd_median - df_hpd.hpd_low
df_hpd['relative_upper'] = df_hpd.hpd_high - df_hpd.hpd_median
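# The relative_lower/relative_upper columns are shaped for matplotlib's
# asymmetric errorbar API; a minimal sketch of the team-strength plot they
# imply (assumes the df_hpd frame built above):
df_plot = df_hpd.sort_values(by='hpd_median')
fig, ax = plt.subplots(figsize=(6, 4))
ax.errorbar(range(len(df_plot)), df_plot.hpd_median,
            yerr=df_plot[['relative_lower', 'relative_upper']].values.T,
            fmt='o')
ax.set_xticks(range(len(df_plot)))
ax.set_xticklabels(df_plot.index, rotation=45)
ax.set_ylabel('Posterior attack strength (atts)')
plt.tight_layout()
plt.show()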
# ================================================================================
# Return a B-spline basis element B(x | t[0], ..., t[k+1])
xx = np.linspace(1, 15, Num)
b = sp.interpolate.BSpline.basis_element(knots[1:])
print(b)

fig, ax = plt.subplots()
x = np.linspace(0, 12, 200)
ax.plot(x, b(x), 'g', lw=3)
ax.grid(True)
plt.show()

pm.traceplot(trace_1)
plt.show()

ax = pm.energyplot(trace_1)
bfmi = pm.bfmi(trace_1)
ax.set_title(f"BFMI = {bfmi:.2f}")
plt.show()

varnames2 = ['δ', 'δB', 'δC']
tmp0 = pm.summary(trace_1, varnames2)  # pm.df_summary is deprecated in favor of pm.summary
print(tmp0)

# ================================================================================
Bx_.set_value(basis_funcs(xs_yearA.get_value()))

# Build a comparison model: replace the first-order regression with a
# Gaussian random walk.
with pm.Model() as model_3:
    # define priors
    alpha3 = pm.HalfCauchy('alpha3', 10., testval=1.15)
    beta0 = pm.GaussianRandomWalk('beta0', sd=1, shape=Num_5)
    beta1 = pm.GaussianRandomWalk('beta1', sd=1, shape=Num_5)
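# basis_funcs is defined elsewhere in this project; a hypothetical sketch of
# what such a helper typically computes, using scipy's BSpline directly:
# evaluate every cubic B-spline basis element over a knot vector to get a
# design matrix for the spline regression.
import scipy.interpolate as si

def bspline_basis(x, knots, degree=3):
    # pad the knot vector so each basis element is well defined at the edges
    t = np.r_[[knots[0]] * degree, knots, [knots[-1]] * degree]
    n_bases = len(t) - degree - 1
    basis = np.empty((len(x), n_bases))
    for j in range(n_bases):
        coeffs = np.zeros(n_bases)
        coeffs[j] = 1.0
        basis[:, j] = si.BSpline(t, coeffs, degree)(x)
    return basis

X = bspline_basis(np.linspace(0, 12, 200), knots=np.linspace(0, 12, 8))
print(X.shape)  # (200, n_bases)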