def main():
    """Sequential Q-OCBA experiment on the 5-state "swimmer" chain MDP.

    After a short random warm start, the sampling policy is re-optimized once
    per episode by solving the Q-OCBA allocation problem with SLSQP, and the
    resulting PCS / future-value statistics are compared against a naive
    uniform random-exploration baseline using the same total data budget.
    All configuration comes from the command line (see the arguments below).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0, help='prior value of reward function')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=bool, default=False, help='Q-OCBA optimization method')
    parser.add_argument('--num_value_iter', nargs="?", type=int, default=200, help='number of value iteration')
    parser.add_argument('--opt_one_step', nargs="?", type=bool, default=False, help='Q-OCBA optimization running only one step')
    args = parser.parse_args()

    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    initial_s_dist = "even"
    Q_approximation = None
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2  # initial state

    # Collect-data configuration.
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2

    # Value-iteration configuration.
    num_iter = 200
    gamma = 0.95

    # True transition kernel p (flattened s-a-s' layout) and reward r (s-a layout).
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s  # uniform initial-state distribution
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # Left boundary state 0: action 0 stays put; action 1 moves right w.p. 0.3.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 moves left surely; action 1 drifts right.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Right boundary state n_s - 1.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    Total_data = args.numdata
    print("total num of data is {}".format(Total_data))
    episode_steps = args.epi_step_num
    numdata_1 = 5  # warm-start steps
    print("warm start steps is {}".format(numdata_1))
    numdata_2 = Total_data
    print("epsisode timestep is {}".format(episode_steps))
    # BUG FIX: floor division -- under Python 3, '/' yields a float and list
    # repetition by a float raises TypeError.
    num_datas = [episode_steps] * (numdata_2 // episode_steps)

    CS_num = 0.
    future_V = np.zeros(num_rep)
    Total_time = []
    # Whether to resample from the Bayesian prior during exploration.
    Bayes_resample = False

    for ii in range(num_rep):
        time_rep = time.time()
        para_cl = parameter_prior(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
        # Warm start with random exploration to seed the posterior.
        data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a,
                                                 right_prop=right_prop, std=r_sd)
        para_cl.update(data, resample=Bayes_resample)
        p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
        var_r_n = r_std ** 2
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)

        for jj, num_data in enumerate(num_datas):
            # Build the covariance terms entering the rate-function approximation.
            TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
            I = np.identity(n_s * n_a)
            I_TM = np.linalg.inv(I - gamma * TM)
            V = np.diag(var_r_n)
            ds = []
            ds_V = []
            for i in range(n_s):
                for j in range(n_a):
                    p_sa = p_n[(i * n_a * n_s + j * n_s): (i * n_a * n_s + (j + 1) * n_s)]
                    dij = inference.cal_cov_p_quad_V(p_sa, V_n, n_s)
                    ds.append(dij)
                    if j == V_n_max_index[i]:
                        ds_V.append(dij)
            D = np.diag(ds)
            cov_V_D = V + D

            # Constants of the pairwise (suboptimal action vs greedy action)
            # constraints of the Q-OCBA program.
            quad_consts = np.zeros((n_s, n_a))
            denom_consts = np.zeros((n_s, n_a, n_s * n_a))
            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        minus_op = np.zeros(n_s * n_a)
                        minus_op[i * n_a + j] = 1
                        minus_op[i * n_a + V_n_max_index[i]] = -1
                        denom_consts[i][j] = np.power(np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                        quad_consts[i][j] = (Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]]) ** 2

            A, b, G, h = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
            AA = np.array(A)

            # x[0] is an auxiliary objective variable, x[1:] the sampling
            # frequencies over (s, a) pairs.
            if opt_ori:
                def fun(x):
                    return -x[0]
            else:
                def fun(x):
                    return x[0]

            constraints = []
            if opt_ori:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            # Skip numerically vanishing constraints.
                            if np.max(denom_consts[i][j]) > 1e-5:
                                constraints.append(
                                    {'type': 'ineq',
                                     'fun': lambda x, up_c, denom_c: up_c / (
                                         np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                                     'args': (quad_consts[i][j], denom_consts[i][j])})
            else:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            if np.max(quad_consts[i][j]) > 1e-5:
                                constraints.append(
                                    {'type': 'ineq',
                                     'fun': lambda x, up_c, denom_c: -(
                                         np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0],
                                     'args': (quad_consts[i][j], denom_consts[i][j])})
            # Flow-balance equality constraints on the frequencies.
            for i in range(AA.shape[0]):
                constraints.append(
                    {'type': 'eq',
                     'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                     'args': (AA[i], b[i])})
            constraints = tuple(constraints)

            bnds = [(0., None)]  # auxiliary variable is only sign-constrained
            for i in range(n_s * n_a):
                bnds.append((optLb, 1))  # keep every frequency bounded away from 0
            bnds = tuple(bnds)

            initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
            initial[0] = 0.1
            if args.opt_one_step:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints, options={'disp': False, 'maxiter': 1})
            else:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds, constraints=constraints)
            x_opt = res.x[1:]

            # Collect the next episode under the optimized frequencies,
            # continuing from the current state of the sampler.
            data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s, n_s, n_a,
                                                     pi_s_a=x_opt, std=r_sd)
            para_cl.update(data, resample=Bayes_resample)
            _, _, freq, _ = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std ** 2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)

        Total_time.append(time.time() - time_rep)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[ii] = np.dot(rou, V_here)
        fS_bool = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num += fS_bool

    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # BUG FIX: np.float was removed in NumPy >= 1.20; the builtin is equivalent.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    print("Seq_Q_OCBA")
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
        fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
    runnung_time_mean = np.mean(Total_time)
    runnung_time_CI = 1.96 * np.std(Total_time) / np.sqrt(num_rep)
    print("average running time of Seq QOCBA is {} with CI length {}".format(runnung_time_mean, runnung_time_CI))

    # Baseline: follow the original uniform random-exploration policy for
    # the whole budget.
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        data = collect_data_swimmer.collect_data(p, r, Total_data, s_0, n_s, n_a, right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        fS_bool_ = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num_naive += fS_bool_

    # BUG FIX: np.float removed in NumPy >= 1.20.
    PCS_naive = float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("follow original")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
        fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    """UCRL experiment on the swimmer MDP, swept over r[0] in {1, 2, 3}.

    For each reward setting: warm-start with random exploration until every
    (s, a) pair has been visited, hand the warm-start statistics to UCRL for
    the remainder of the data budget, then report PCS, future value, and CI
    coverage statistics for Q, V and R across replications.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward ')
    args = parser.parse_args()

    num_iter, gamma, n_s, n_a, delta, num_rep = 200, 0.95, 5, 2, 0.05, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)

    for r0_val in range(1, 4):
        r[0] = float(r0_val)
        r[-1] = 10.
        r_std = args.rstd
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

        # True swimmer transition kernel (flattened s-a-s' layout).
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))

        s_0 = 2
        rou = np.ones(n_s) / n_s
        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s

        # Coverage accumulators and CI-length traces across replications.
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## UCRL
        CS_num = 0.
        num_data = args.numdata
        # BUG FIX: floor division -- these are step counts and must be ints
        # under Python 3 true division.
        num_1 = num_data * 3 // 10
        num_2 = num_data * 7 // 10
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            # Warm start: retry until every (s, a) pair has been visited.
            while True:
                data1 = collect_data_swimmer.collect_data(p, r, num_1, s_0, n_s, n_a,
                                                          right_prop=right_prop, std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                if f_n.all() != 0:
                    break

            pre_collected_stats = get_pre_collected_stats(data1, n_s, n_a)
            UCRL_cl = UCRL(n_s, n_a, 0.05, num_1, s_0, num_data, pre_collected_stats)
            # Run UCRL episodes until the total budget is exhausted.
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_std)
            UCRL_cl.update_point_estimate_and_CIbound()

            Q_estimate = Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0, UCRL_cl.rew,
                                                   num_iter, gamma, n_s, n_a)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)

            # CI coverage check on the pooled warm-start + UCRL data.
            datahere = data1 + UCRL_cl.datas
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=datahere)
            cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                        Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                        V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                        R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        # BUG FIX: np.float was removed in NumPy >= 1.20.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
            fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)
        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
def main():
    """PSPE (posterior-sampling pure-exploration) experiment on the swimmer MDP.

    Supports a two-stage schedule (default) or a fully sequential schedule.
    Each episode, a posterior MDP sample determines the exploration policy;
    with probability ``beta`` the new sample is kept, otherwise samples are
    redrawn until the greedy policy differs (top-two style sampling).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=0.0, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--beta', nargs="?", type=float, default=0.25, help='beta')
    parser.add_argument('--two_stage', nargs="?", type=bool, default=True, help='if run two stage or sequential experiment')
    args = parser.parse_args()
    print("PSPE")

    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # True swimmer transition kernel (flattened s-a-s' layout).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    s_0 = 2

    ## PSPE data schedule
    if not args.two_stage:
        print("sequential implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        episode_steps = args.epi_step_num
        numdata_1 = episode_steps
        numdata_2 = Total_data - numdata_1
        print("epsisode timestep is {}".format(episode_steps))
        # BUG FIX: floor division -- list repetition requires an int under
        # Python 3 true division.
        num_datas = [episode_steps] * (numdata_2 // episode_steps)
    else:
        print("two_stage implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        # BUG FIX: floor division keeps the warm-start / episode step counts
        # integral under Python 3.
        numdata_1 = Total_data * 3 // 10
        numdata_2 = Total_data - numdata_1
        episodes = 100
        num_datas = [numdata_2 // episodes] * episodes

    CS_num = 0.
    beta = args.beta
    rou = np.ones(n_s) / n_s  # uniform initial-state distribution
    future_V = np.zeros(num_rep)

    for i in range(num_rep):
        para_cl = parameter_prior(n_s, n_a, s_0)
        # Warm start: retry until every (s, a) pair has been visited.
        while True:
            data1 = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a,
                                                      right_prop=right_prop, std=r_std)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            if f_n.all() != 0:
                break
        para_cl.update(data1, r_sigma=r_std)
        Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

        for num_data in num_datas:
            data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s_0, n_s, n_a,
                                                     Q=Q_estimate, epsilon=0, std=r_std)
            para_cl.update(data, r_sigma=r_std)
            Q_estimate_1 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            V_n_1, V_n_max_index_1 = inference.get_V_from_Q(Q_estimate_1, n_s, n_a)
            sim = np.random.binomial(1, beta, 1)[0]
            if sim:
                Q_estimate = Q_estimate_1
            else:
                # Top-two resampling: redraw from the posterior until the
                # greedy policy differs from the first sample's.
                while True:
                    Q_estimate_2 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                    V_n_2, V_n_max_index_2 = inference.get_V_from_Q(Q_estimate_2, n_s, n_a)
                    # BUG FIX: an elementwise '!=' on ndarrays is ambiguous in
                    # a bool context; compare the whole policies instead
                    # (equivalent for lists/scalars).
                    if not np.array_equal(V_n_max_index_2, V_n_max_index_1):
                        break
                Q_estimate = Q_estimate_2

        V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
        CS_num += FS_bool

    # BUG FIX: np.float was removed in NumPy >= 1.20.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print(
        "future value func is {} with CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    """PSRL experiment on the swimmer MDP, swept over r[0] in {1, 2, 3}.

    Two-stage schedule: a random warm start until every (s, a) pair is
    visited, then posterior-sampling episodes for the remaining budget.
    Reports PCS, future value, and CI coverage statistics for Q, V and R
    computed on all collected data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--episode', nargs="?", type=int, default=100, help='number of episode')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    args = parser.parse_args()

    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    episodes = args.episode
    Total_data = args.numdata
    right_prop = args.rightprop

    r = np.zeros(n_s * n_a)
    r_vals = range(1, 4)
    r_right = 10.0
    for r0_val in r_vals:
        r[0] = float(r0_val)
        r[-1] = r_right
        r_std = args.rstd
        print("reward standard deviation is {}".format(r_std))

        Q_0 = np.zeros(n_s * n_a)
        V_0 = np.zeros(n_s)
        rou = np.ones(n_s) / n_s
        p = np.zeros(n_s * n_a * n_s)
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

        # True swimmer transition kernel (flattened s-a-s' layout).
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))

        s_0 = 2
        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s

        # Coverage accumulators and CI-length traces across replications.
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## PSRL data parameter specification
        print("total num of data is {}".format(Total_data))
        # BUG FIX: floor division -- these are step counts and must stay
        # integral under Python 3 true division.
        numdata_1 = Total_data * 3 // 10
        seq_if = False
        numdata_2 = Total_data - numdata_1
        print("# of epsisodes is {}".format(episodes))
        num_datas = [numdata_2 // episodes] * episodes
        CS_num = 0.

        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = parameter_prior(n_s, n_a, s_0)
            all_data = []
            if not seq_if:
                # Warm start: retry until every (s, a) pair has been visited.
                while True:
                    data1 = collect_data_swimmer.collect_data(
                        p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_std)
                    p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
                    Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                    V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                    if f_n.all() != 0:
                        break
            else:
                data1 = collect_data_swimmer.collect_data(
                    p, r, numdata_2 // episodes, s_0, n_s, n_a, right_prop=right_prop, std=r_std)
            all_data += data1
            para_cl.update(data1, r_sigma=r_std)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # Posterior-sampling episodes: act greedily w.r.t. the sampled MDP.
            second_stage_data = []
            for num_data in num_datas:
                data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s_0, n_s, n_a,
                                                         Q=Q_estimate, epsilon=0, std=r_std)
                all_data += data
                second_stage_data += data
                para_cl.update(data, r_sigma=r_std)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # Final point estimate uses the empirical statistics of all data.
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(all_data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool

            # CI coverage check on all collected data.
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, Total_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=all_data)
            cov_bool_Q = np.logical_and(
                Q_real <= (Q_n + CI_len_Q + numerical_tol),
                Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(
                V_real <= (V_n + CI_len_V + numerical_tol),
                V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(
                R_real <= (R_n + CI_len_R + numerical_tol),
                R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        # BUG FIX: np.float was removed in NumPy >= 1.20.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print(
            "future value func is {} with CI length {}, real value is {}, diff is {}"
            .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)
        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))