def get_CI( num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, initial_w):
    """Collect a fresh trajectory and build 95% confidence intervals.

    Runs one batch of `num_data` transitions from state `s_0`, estimates the
    MDP empirically, solves for Q/V by value iteration, and returns the point
    estimates together with normal-approximation CI half-widths.

    Returns:
        (Q_hat, ci_Q, V_hat, ci_V, R_hat, ci_R) where each ci_* is the
        1.96 * sigma / sqrt(num_data) half-width of the 95% CI.
    """
    # Fresh data under the default exploration policy.
    trajectory = collect_data_swimmer.collect_data(p, r, num_data, s_0, n_s, n_a)
    # Empirical transition kernel, rewards, visit frequencies, reward variances.
    p_hat, r_hat, freq_hat, r_var_hat = cal_impirical_r_p.cal_impirical_stats(trajectory, n_s, n_a)
    # Plug-in Q and V estimates via value iteration.
    Q_hat = Iterative_Cal_Q.cal_Q_val(p_hat, Q_0, r_hat, num_iter, gamma, n_s, n_a)
    V_hat, greedy_actions = get_V_from_Q(Q_hat, n_s, n_a)
    R_hat = np.dot(initial_w, V_hat)
    # Asymptotic covariance estimates for Q, V and the weighted value R.
    Sigma_Q, Sigma_V, Sigma_R = cal_Sigma_n(p_hat, freq_hat, r_var_hat, V_hat,
                                            gamma, n_s, n_a, greedy_actions, initial_w)
    half_width_Q = 1.96 * np.sqrt(np.diag(Sigma_Q)) / np.sqrt(num_data)
    half_width_V = 1.96 * np.sqrt(np.diag(Sigma_V)) / np.sqrt(num_data)
    half_width_R = 1.96 * np.sqrt(Sigma_R) / np.sqrt(num_data)
    return Q_hat, half_width_Q, V_hat, half_width_V, R_hat, half_width_R
def stage_2_estimation(p, r, num_data_1, s_0, n_s, n_a, Q_0, x_opt, num_iter, gamma, initial_w):
    """Second-stage estimation under the optimized sampling distribution.

    Collects `num_data_1` transitions while following the (state, action)
    frequencies `x_opt`, then re-estimates the MDP and its Q/V functions.

    Returns:
        (p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n) — empirical
        kernel, rewards, visit frequencies, reward variances, Q estimate,
        V estimate, greedy action per state, and initial-weighted value.
    """
    batch = collect_data_swimmer.collect_data(p, r, num_data_1, s_0, n_s, n_a, pi_s_a=x_opt)
    p_hat, r_hat, freq_hat, r_var_hat = cal_impirical_r_p.cal_impirical_stats(batch, n_s, n_a)
    Q_hat = Iterative_Cal_Q.cal_Q_val(p_hat, Q_0, r_hat, num_iter, gamma, n_s, n_a)
    V_hat, greedy_actions = inference.get_V_from_Q(Q_hat, n_s, n_a)
    weighted_value = np.dot(initial_w, V_hat)
    return p_hat, r_hat, freq_hat, r_var_hat, Q_hat, V_hat, greedy_actions, weighted_value
def stage_1_estimation(p, r, num_data_1, s_0, n_s, n_a, Q_0, right_prop, num_iter, gamma, initial_w):
    """Warm-start (stage 1) estimation via random exploration.

    Repeatedly collects batches of `num_data_1` transitions with right-move
    probability `right_prop` until every (state, action) pair has been
    visited at least once, then returns the empirical estimates from the
    last (fully covering) batch.

    NOTE: loops forever if some pair is effectively unreachable under the
    exploration policy for this batch size.

    Returns:
        (p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n)
    """
    while True:
        data = collect_data_swimmer.collect_data(p, r, num_data_1, s_0, n_s, n_a,
                                                 right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        # Accept only if every (state, action) pair has nonzero visit
        # frequency.  (Was `f_n.all() != 0`: comparing the bool result of
        # .all() against 0 is redundant and obscured the intent.)
        if f_n.all():
            break
    R_n = np.dot(initial_w, V_n)
    return p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n
def main():
    """Probe state-space coverage of a discretized CartPole-v0.

    Collects 30000 transitions under a uniform random policy, discretizes
    the 4-d observation into a (4, 4, 4, 4) grid via `pre_process`, and
    prints visit-frequency statistics (total bins, nonzero bins, max/min/
    median of the nonzero frequencies).
    """
    ENV_NAME = "CartPole-v0"
    env = gym.make(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    train_data = []
    num_train_data = 30000
    #print(observation_space, action_space)
    #exit()
    state = env.reset()
    c = 0  # number of completed episodes (debug counter)
    while len(train_data) < num_train_data:
        action = np.random.randint(2)  # uniform random over the 2 actions
        #action =1
        state_next, reward, terminal, info = env.step(action)
        # Penalize episode termination so failures are distinguishable.
        reward = reward if not terminal else -200
        #state_next = np.reshape(state_next, [1, observation_space])
        #dqn_solver.remember(state, action, reward, state_next, terminal)
        train_data.append((state, action, reward, state_next))
        #print(state, action, reward, state_next)
        state = state_next
        if terminal:
            c += 1
            state = env.reset()
    # Bins per observation dimension -- assumed consumed by pre_process to
    # flatten each state into one of prod(dims) discrete states (TODO confirm).
    dims = (4, 4, 4, 4)
    data = pre_process(train_data, dims)
    #print(c)
    #for d in train_data:
    #    print(d)
    #exit()
    n_s = np.prod(dims)
    n_a = 2
    p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
        data, n_s, n_a)
    print(len(f_n))
    # How many discrete (state, action) bins were actually visited.
    nonzero_freq = [f for f in f_n if f != 0]
    print(len(nonzero_freq))
    print(max(f_n), min(nonzero_freq), statistics.median(nonzero_freq))
def main():
    """Sequential Q-OCBA experiment on the 5-state swimmer MDP.

    Per replication: (1) warm-start with a few randomly explored steps;
    (2) for each episode, solve the Q-OCBA allocation program (SLSQP) for
    the (state, action) visit frequencies and collect the next batch under
    the optimized exploration policy; (3) score the final greedy policy.
    Reports PCS (probability of correct selection), policy value, and
    running time, then runs a fixed random-exploration baseline
    ("follow original") on the same budget.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0, help='prior value of reward function')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=bool, default=False, help='Q-OCBA optimization method')
    parser.add_argument('--num_value_iter', nargs="?", type=int, default=200, help='number of value iteration')
    parser.add_argument('--opt_one_step', nargs="?", type=bool, default=False, help='Q-OCBA optimization running only one step')
    args = parser.parse_args()

    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2  # initial state
    # MDP size.
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # True transition kernel p (flattened over [s, a, s']) and reward vector r.
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s  # initial-state distribution used for scoring
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # Swimmer dynamics: action 0 moves left deterministically; action 1 moves
    # right w.p. 0.3, stays w.p. 0.6 (interior states) and slips left w.p. 0.1.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    # Ground-truth Q/V for scoring the learned policies.
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    Total_data = args.numdata
    print("total num of data is {}".format(Total_data))
    episode_steps = args.epi_step_num
    numdata_1 = 5  # warm-start sample size
    print("warm start steps is {}".format(numdata_1))
    numdata_2 = Total_data
    print("epsisode timestep is {}".format(episode_steps))
    # FIX: was `numdata_2 / episode_steps`; on Python 3 `/` yields a float and
    # list * float raises TypeError.  `//` preserves the Python-2 semantics.
    num_datas = [episode_steps] * (numdata_2 // episode_steps)

    CS_num = 0.
    future_V = np.zeros(num_rep)
    Total_time = []
    Bayes_resample = False  # whether to use Bayesian resampling for exploration
    for ii in range(num_rep):
        time_rep = time.time()
        para_cl = parameter_prior(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
        # Warm start: a short randomly-explored trajectory.
        data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a,
                                                 right_prop=right_prop, std=r_sd)
        para_cl.update(data, resample=Bayes_resample)
        p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
        var_r_n = r_std ** 2
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)

        for num_data in num_datas:
            # --- Build the Q-OCBA program coefficients from current estimates.
            TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
            I_TM = np.linalg.inv(np.identity(n_s * n_a) - gamma * TM)
            # Per-(s, a) variance contribution from the transition estimate.
            ds = []
            for i in range(n_s):
                for j in range(n_a):
                    p_sa = p_n[(i * n_a * n_s + j * n_s):(i * n_a * n_s + (j + 1) * n_s)]
                    ds.append(inference.cal_cov_p_quad_V(p_sa, V_n, n_s))
            # Total per-pair noise: reward variance + transition variance.
            cov_V_D = np.diag(var_r_n) + np.diag(ds)

            # quad_consts: squared Q-gap of each non-greedy action;
            # denom_consts: variance weights of the corresponding contrast.
            quad_consts = np.zeros((n_s, n_a))
            denom_consts = np.zeros((n_s, n_a, n_s * n_a))
            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        minus_op = np.zeros(n_s * n_a)
                        minus_op[i * n_a + j] = 1
                        minus_op[i * n_a + V_n_max_index[i]] = -1
                        denom_consts[i][j] = np.power(np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                        quad_consts[i][j] = (Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]]) ** 2

            A, b, G, h = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
            AA = np.array(A)
            # Decision vector x: x[0] is the rate variable z, x[1:] the
            # stationary (state, action) frequencies w.
            if opt_ori:
                def fun(x):
                    return -x[0]  # maximize the worst-case rate directly
            else:
                def fun(x):
                    return x[0]  # minimize z in the reciprocal formulation
            constraints = []
            if opt_ori:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            if np.max(denom_consts[i][j]) > 1e-5:
                                constraints.append({
                                    'type': 'ineq',
                                    'fun': lambda x, up_c, denom_c: up_c / (
                                        np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                                    'args': (quad_consts[i][j], denom_consts[i][j])})
            else:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            if np.max(quad_consts[i][j]) > 1e-5:
                                constraints.append({
                                    'type': 'ineq',
                                    'fun': lambda x, up_c, denom_c: -(
                                        np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0],
                                    'args': (quad_consts[i][j], denom_consts[i][j])})
            # Flow-balance / normalization equality constraints on w.
            for i in range(AA.shape[0]):
                constraints.append({'type': 'eq',
                                    'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                                    'args': (AA[i], b[i])})
            constraints = tuple(constraints)
            bnds = [(0., None)]  # z >= 0
            for i in range(n_s * n_a):
                bnds.append((optLb, 1))  # keep frequencies away from 0
            bnds = tuple(bnds)
            initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
            initial[0] = 0.1
            if args.opt_one_step:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints,
                               options={'disp': False, 'maxiter': 1})
            else:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints)
            x_opt = res.x[1:]

            # Collect the next episode under the optimized exploration
            # frequencies, resuming from the sampler's current state.
            data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s, n_s, n_a,
                                                     pi_s_a=x_opt, std=r_sd)
            para_cl.update(data, resample=Bayes_resample)
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std ** 2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)

        Total_time.append(time.time() - time_rep)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[ii] = np.dot(rou, V_here)
        CS_num += optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)

    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # FIX: was np.float (removed in NumPy 1.24); builtin float is equivalent.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    print("Seq_Q_OCBA")
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
        fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
    runnung_time_mean = np.mean(Total_time)
    runnung_time_CI = 1.96 * np.std(Total_time) / np.sqrt(num_rep)
    print("average running time of Seq QOCBA is {} with CI length {}".format(runnung_time_mean, runnung_time_CI))

    # Baseline: spend the whole budget on fixed random exploration.
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        data = collect_data_swimmer.collect_data(p, r, Total_data, s_0, n_s, n_a,
                                                 right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        CS_num_naive += optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
    PCS_naive = float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("follow original")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
        fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    """Two-stage Q-OCBA experiment on the 5-state swimmer MDP.

    Stage 1 explores randomly (rejecting batches until every (state, action)
    pair is visited) and estimates the MDP.  Stage 2 solves a max-min
    frequency allocation program via SLSQP and spends the remaining 70% of
    the budget under the optimized exploration policy.  Reports the first-
    stage PFS and the final PCS / policy value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=bool, default=False, help='Q-OCBA optimization method')
    args = parser.parse_args()
    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(
        opt_ori))
    two_stage_opt_bool = True
    print("two_stage_opt_bool is {}".format(two_stage_opt_bool))
    two_stage_eps_greedy_bool = True
    print("two_stage_eps_greedy_bool is {}".format(two_stage_eps_greedy_bool))
    num_rep = args.rep
    initial_s_dist = "even"
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2
    # Budget split: 30% stage 1, 70% stage 2.
    num_data = args.numdata
    # FIX: was `num_data * 3 / 10` (Python-2 integer division); `//` keeps
    # integer sample counts on Python 3 as well.
    num_data_1 = num_data * 3 // 10
    num_data_2 = num_data * 7 // 10
    print(
        "num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}"
        .format(num_data_1, num_data_2, right_prop))
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # True MDP parameters (flattened over [s, a, s']).
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # Swimmer dynamics (see module-level description).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
        initial_w = np.ones(n_s) / n_s

    if two_stage_opt_bool or two_stage_eps_greedy_bool:
        x_opts = []
        counts = []
        data1s = []
        PCS_first_stage = 0.
        for rep in range(num_rep):
            # --- Stage 1: random exploration until full (s, a) coverage.
            count = 0
            while True:
                count += 1
                data1 = collect_data_swimmer.collect_data(
                    p, r, num_data_1, s_0, n_s, n_a,
                    right_prop=right_prop, std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                    data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                                n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                # FIX: was `f_n.all() != 0` — redundant bool-vs-0 comparison.
                if f_n.all():
                    break
            counts.append(count)
            data1s.append(data1)
            # 1 if the stage-1 greedy policy already matches the true one.
            # (Equivalent to the original functools.reduce/map equality fold.)
            PCS_first_stage += all(
                real == est for real, est in zip(V_max_index, V_n_max_index))
            I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
                p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index)

            if two_stage_opt_bool:
                # --- Stage-2 allocation program coefficients.
                quad_consts = np.zeros((n_s, n_a))
                denom_consts = np.zeros((n_s, n_a, n_s * n_a))
                for s in range(n_s):
                    for a in range(n_a):
                        if a != V_n_max_index[s]:
                            minus_op = np.zeros(n_s * n_a)
                            minus_op[s * n_a + a] = 1
                            minus_op[s * n_a + V_n_max_index[s]] = -1
                            c1 = np.power(np.dot(minus_op, I_TM), 2)
                            denom_consts[s][a] = c1 * np.diag(cov_V_D)
                            quad_consts[s][a] = (
                                Q_n[s * n_a + a] - Q_n[s * n_a + V_n_max_index[s]])**2
                A, b, G, h = two_stage_inference.construct_contrain_matrix(
                    p_n, n_s, n_a)
                AA = np.array(A)

                # x[0] is the auxiliary variable z, x[1:] the frequencies w;
                # the program maximizes the minimum visit frequency.
                def fun(x):
                    return -x[0]

                constraints = []
                for s in range(n_s):
                    for a in range(n_a):
                        # w[s, a] >= z for every pair.
                        constraints.append({
                            'type': 'ineq',
                            'fun': lambda x, ii, jj: x[1 + ii * n_a + jj] - x[0],
                            'args': (s, a)
                        })
                for k in range(AA.shape[0]):
                    constraints.append({
                        'type': 'eq',
                        'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                        'args': (AA[k], b[k])
                    })
                constraints = tuple(constraints)
                bnds = [(0., None)]
                for k in range(n_s * n_a):
                    bnds.append((optLb, 1))  # keep frequencies away from 0
                bnds = tuple(bnds)
                initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
                initial[0] = 0.1
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints)
                x_opt = res.x[1:]
                x_opts.append(x_opt)

        mean_count = np.mean(counts)
        std_count = np.std(counts)
        print(
            "first stage average # of trials is {} with CI length {}".format(
                mean_count, 1.96 * std_count / np.sqrt(num_rep)))
        PFS_first_stage = 1 - PCS_first_stage / num_rep
        print("PFS after first stage is {} ".format(PFS_first_stage))

    # --- Stage 2: spend the remaining budget under the optimized policy and
    # score the final greedy policy on the true MDP.
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        x_opt = x_opts[i]
        if two_stage_opt_bool:
            data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s,
                                                     n_a, right_prop=right_prop,
                                                     pi_s_a=x_opt)
        else:
            data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s,
                                                     n_a, right_prop=right_prop)
        # Pool stage-1 and stage-2 samples for the final estimate.
        data = data + data1s[i]
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        CS_num_naive += FS_bool(Q_n, V_max_index, n_s, n_a)
    # FIX: was np.float (removed in NumPy 1.24).
    PCS_naive = float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("Exploration_for pure exploration:")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print(
        "future value func is {} with CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    """PSPE (posterior-sampling pure exploration) on the 5-state swimmer MDP.

    Warm-starts with random exploration until every (state, action) pair is
    visited, then repeatedly: samples an MDP from the posterior, follows its
    greedy policy for an episode, and with probability 1 - beta resamples
    until a model with a different greedy policy is drawn (to force
    exploration of competing policies).  Reports PCS and the value of the
    final greedy policy.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=0.0, help='value of r0')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--beta', nargs="?", type=float, default=0.25, help='beta')
    parser.add_argument('--two_stage', nargs="?", type=bool, default=True, help='if run two stage or sequential experiment')
    args = parser.parse_args()
    print("PSPE")
    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # Swimmer dynamics (flattened over [s, a, s']).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    s_0 = 2

    ## PSPE episode schedule
    if not args.two_stage:
        print("sequential implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        episode_steps = args.epi_step_num
        numdata_1 = episode_steps
        numdata_2 = Total_data - numdata_1
        print("epsisode timestep is {}".format(episode_steps))
        # FIX: was `/` — float on Python 3, which breaks list repetition.
        num_datas = [episode_steps] * (numdata_2 // episode_steps)
    else:
        print("two_stage implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        # FIX: `//` keeps integer sample counts (was Python-2 `/`).
        numdata_1 = Total_data * 3 // 10
        numdata_2 = Total_data - numdata_1
        episodes = 100
        num_datas = [numdata_2 // episodes] * episodes

    CS_num = 0.
    beta = args.beta
    rou = np.ones(n_s) / n_s
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        para_cl = parameter_prior(n_s, n_a, s_0)
        # Warm start: random exploration until full (s, a) coverage.
        while True:
            data1 = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s,
                                                      n_a, right_prop=right_prop,
                                                      std=r_std)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data1, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s,
                                            n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            # FIX: was `f_n.all() != 0` — redundant bool-vs-0 comparison.
            if f_n.all():
                break
        para_cl.update(data1, r_sigma=r_std)
        Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
        for num_data in num_datas:
            # Run one episode greedily w.r.t. the currently sampled model.
            data = collect_data_swimmer.collect_data(p, r, num_data,
                                                     para_cl.s_0, n_s, n_a,
                                                     Q=Q_estimate, epsilon=0,
                                                     std=r_std)
            para_cl.update(data, r_sigma=r_std)
            Q_estimate_1 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            V_n_1, V_n_max_index_1 = inference.get_V_from_Q(
                Q_estimate_1, n_s, n_a)
            # With probability beta keep the sampled model; otherwise resample
            # until a model with a DIFFERENT greedy policy is drawn.
            sim = np.random.binomial(1, beta, 1)[0]
            if sim:
                Q_estimate = Q_estimate_1
            else:
                while True:
                    Q_estimate_2 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                    V_n_2, V_n_max_index_2 = inference.get_V_from_Q(
                        Q_estimate_2, n_s, n_a)
                    # FIX: `V_n_max_index_2 != V_n_max_index_1` is elementwise
                    # for array operands and raises on truth-testing; compare
                    # element sequences instead (same result for lists).
                    if list(V_n_max_index_2) != list(V_n_max_index_1):
                        break
                Q_estimate = Q_estimate_2
        V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0,
                                                   num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        CS_num += optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)

    # FIX: was np.float (removed in NumPy 1.24).
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print(
        "future value func is {} with CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
def main():
    """UCRL baseline with CI-coverage evaluation on the swimmer MDP.

    For each r0 in {1, 2, 3}: warm-start with random exploration until every
    (state, action) pair is visited, run UCRL until the data budget is
    exhausted, then report PCS, the learned policy's value, and the empirical
    coverage rates / lengths of the 95% CIs for Q, V and R built from the
    pooled data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward ')
    args = parser.parse_args()
    num_iter, gamma, n_s, n_a, delta, num_rep = 200, 0.95, 5, 2, 0.05, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    # Sweep the reward of (state 0, action 0) over {1, 2, 3}.
    for r0_val in range(1, 4):
        r[0] = float(r0_val)
        r[-1] = 10.
        r_std = args.rstd
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
        # Swimmer dynamics (flattened over [s, a, s']).
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))
        s_0 = 2
        rou = np.ones(n_s) / n_s
        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s
        # Coverage counters, reset for every r0 value.
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## UCRL
        CS_num = 0.
        num_data = args.numdata
        # FIX: was `num_data * 3/10` (Python-2 integer division); `//` keeps
        # an integer warm-start budget on Python 3 as well.
        num_1 = num_data * 3 // 10
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            # Warm start: random exploration until full (s, a) coverage.
            while True:
                data1 = collect_data_swimmer.collect_data(
                    p, r, num_1, s_0, n_s, n_a, right_prop=right_prop, std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                    data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                                n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                # FIX: was `f_n.all() != 0` — redundant bool-vs-0 comparison.
                if f_n.all():
                    break
            pre_collected_stats = get_pre_collected_stats(data1, n_s, n_a)
            UCRL_cl = UCRL(n_s, n_a, 0.05, num_1, s_0, num_data,
                           pre_collected_stats)
            # UCRL loop: refine confidence sets, plan optimistically, collect.
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_std)
            UCRL_cl.update_point_estimate_and_CIbound()
            Q_estimate = Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0,
                                                   UCRL_cl.rew, num_iter, gamma,
                                                   n_s, n_a)
            CS_num += optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a,
                                                       V_0, num_iter, r, p,
                                                       gamma)
            future_V[i] = np.dot(rou, V_here)
            # CIs from the pooled warm-start + UCRL data.
            datahere = data1 + UCRL_cl.datas
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s,
                n_a, r, p, initial_w, right_prop, data=datahere)
            cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                        Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                        V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                        R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        # FIX: was np.float (removed in NumPy 1.24).
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print("future value func is {} with CI length {}, real value is {}, diff is {}".format(
            fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)
        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
def _coverage_flags(Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                    R_n, CI_len_R, tol):
    """Return element-wise booleans: does each true value fall inside its CI?

    `tol` absorbs floating-point error at the interval boundaries.
    """
    in_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + tol),
                          Q_real >= (Q_n - CI_len_Q - tol))
    in_V = np.logical_and(V_real <= (V_n + CI_len_V + tol),
                          V_real >= (V_n - CI_len_V - tol))
    in_R = np.logical_and(R_real <= (R_n + CI_len_R + tol),
                          R_real >= (R_n - CI_len_R - tol))
    return in_Q, in_V, in_R


def _summarize_coverage(cov_bools_Q, cov_bools_V, cov_bools_R,
                        CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if):
    """Aggregate per-replication coverage counts and CI lengths; print a report.

    Returns the mean Q coverage rate (the quantity tracked per method).
    """
    CI_len_Q_mean = np.mean(CI_lens_Q)
    CI_len_V_mean = np.mean(CI_lens_V)
    CI_len_R_mean = np.mean(CI_lens_R)
    CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
    CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
    CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)
    cov_rate_Q = np.mean(np.divide(cov_bools_Q, num_rep))
    cov_rate_V = np.mean(np.divide(cov_bools_V, num_rep))
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    # Normal-approximation CI for the coverage proportion itself.
    cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
    cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
    cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
    if print_if:
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
    return cov_rate_Q


def main():
    """CI-coverage experiment on the 5-state swimmer MDP.

    For each total data budget, runs `--rep` replications of five exploration
    schemes (epsilon-greedy, Q-OCBA, random exploration, UCRL, PSRL), measures
    how often the 95% CIs produced by `inference.get_CI` cover the true
    Q/V/R values, and plots Q coverage versus data budget.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--episode', nargs="?", type=int, default=100, help='number of episode')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100,
                        help='number of episode steps')
    parser.add_argument('--first_stage_data', nargs="?", type=int, default=100,
                        help='number of first stage data')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0,
                        help='prior value of reward function')
    parser.add_argument('--iflog', nargs="?", type=int, default=0,
                        help='whether take logrithm of x-axis')
    args = parser.parse_args()

    num_rep = args.rep
    right_prop = args.rightprop
    print("right prop is {}".format(right_prop))

    # MDP dimensions and value-iteration configuration.
    s_0 = 2
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    num_iter = 200
    gamma = 0.95

    # True model: p is the flattened transition kernel (s, a, s'),
    # r is the flattened reward table (s, a).
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # Action 0 moves left deterministically; action 1 ("right") is noisy.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    # Ground truth used for coverage checks.
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    R_real = np.mean(V_real)

    episode_steps = args.epi_step_num
    numdata_1 = args.first_stage_data
    print("first stage data num is {}".format(numdata_1))
    print("epsisode timestep is {}".format(episode_steps))
    logif = bool(args.iflog)
    print("we print x axis in log is {}".format(logif))
    if not logif:
        if r_sd == 10.0:
            num_datas = list(range(10, 8000, 1000))
        else:
            num_datas = list(range(5, 10010, 1000))
    else:
        num_datas = [10, 100, 1000, 5000, 10000]

    QOCBAs_Q_cov = []
    REs_Q_cov = []
    eps_Q_cov = []
    UCRL_Q_cov = []
    PSRL_Q_cov = []
    Bayes_resample = False
    print_if = True
    epsilon = 0.2
    S_0 = None
    initial_w = np.ones(n_s) / n_s
    numerical_tol = 1e-6
    Q_approximation = None
    print("epsilon is {}".format(epsilon))

    for num_data in num_datas:
        print("numdata is {}".format(num_data))
        # BUG FIX: integer division. With `/` the multiplier is a float and
        # list * float raises TypeError under Python 3.
        stage_datas = [episode_steps] * (num_data // episode_steps)

        # ---------------- epsilon-greedy ----------------
        print("epsilon greedy")
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q, CI_lens_V, CI_lens_R = [], [], []
        for _ in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            all_data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(all_data, resample=False)
            p_n, r_n, r_std = para_cl.get_para(resample=False)
            Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            # Alternate: collect an episode epsilon-greedily w.r.t. the current
            # Q estimate, then re-estimate the model and Q.
            for num_dat in stage_datas:
                stage_data = collect_data_swimmer.collect_data(
                    p, r, num_dat, s_0, n_s, n_a, right_prop=right_prop, Q=Q_here,
                    epsilon=epsilon, print_pro_right=False, std=r_sd)
                para_cl.update(stage_data, resample=Bayes_resample)
                p_n, r_n, r_std = para_cl.get_para(resample=False)
                Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                all_data += stage_data
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=all_data)
            in_Q, in_V, in_R = _coverage_flags(
                Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                R_n, CI_len_R, numerical_tol)
            cov_bools_Q += in_Q
            cov_bools_V += in_V
            cov_bools_R += in_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        eps_Q_cov.append(_summarize_coverage(
            cov_bools_Q, cov_bools_V, cov_bools_R,
            CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if))

        # ---------------- Q-OCBA ----------------
        print("Q-OCBA")
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q, CI_lens_V, CI_lens_R = [], [], []
        for _ in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            all_data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(all_data, resample=Bayes_resample)
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std ** 2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            for stage_size in stage_datas:
                # Asymptotic-variance ingredients under the current greedy
                # policy: (I - gamma * P_pi)^-1 plus reward/transition noise.
                TM_V = inference.P_V(p_n, n_s, n_a, V_n_max_index)
                I_TM_V = np.linalg.inv(np.identity(n_s) - gamma * TM_V)
                var_r_n_V = np.array(
                    [var_r_n[i * n_a + V_n_max_index[i]] for i in range(n_s)])
                ds_V = []
                for i in range(n_s):
                    for j in range(n_a):
                        p_sa = p_n[(i * n_a * n_s + j * n_s):
                                   (i * n_a * n_s + (j + 1) * n_s)]
                        dij = inference.cal_cov_p_quad_V(p_sa, V_n, n_s)
                        if j == V_n_max_index[i]:
                            ds_V.append(dij)
                cov_V_V_D = np.diag(var_r_n_V) + np.diag(ds_V)
                # Per-(s, a) variance coefficient of the weighted value
                # estimate; only each state's greedy action contributes.
                quad_con_vec = (np.power(np.dot(initial_w, I_TM_V), 2)
                                * np.diag(cov_V_V_D))
                quad_con_vec_all = np.zeros(n_s * n_a)
                for i in range(n_s):
                    quad_con_vec_all[i * n_a + V_n_max_index[i]] = quad_con_vec[i]

                def F(x):
                    # Estimator variance as a function of the visiting
                    # frequencies x: sum_i c_i / x_i.
                    return np.sum(np.multiply(quad_con_vec_all, np.divide(1, x)))

                # Flow-balance equality constraints on the stationary
                # state-action distribution.
                A, b, G, h = two_stage_inference.construct_contrain_matrix(
                    p_n, n_s, n_a)
                AA = np.array(A)
                bb = np.asarray(b)
                constraints = tuple(
                    {'type': 'eq',
                     'fun': lambda x, a, b: np.dot(a, x) - b,
                     'args': (AA[k], bb[k])}
                    for k in range(AA.shape[0]))
                bnds = tuple((1e-6, None) for _ in range(n_s * n_a))
                initial = np.ones(n_s * n_a) / (n_s * n_a)
                res = minimize(F, initial, method='SLSQP',
                               bounds=bnds, constraints=constraints)
                x_opt = res.x
                # Sample the next stage from the optimized frequencies,
                # starting from the current state of the sampler.
                data = collect_data_swimmer.collect_data(
                    p, r, stage_size, para_cl.s, n_s, n_a, pi_s_a=x_opt, std=r_sd)
                all_data += data
                para_cl.update(data, resample=Bayes_resample)
                p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
                var_r_n = r_std ** 2
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=all_data)
            in_Q, in_V, in_R = _coverage_flags(
                Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                R_n, CI_len_R, numerical_tol)
            cov_bools_Q += in_Q
            cov_bools_V += in_V
            cov_bools_R += in_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        QOCBAs_Q_cov.append(_summarize_coverage(
            cov_bools_Q, cov_bools_V, cov_bools_R,
            CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if))

        # ---------------- random exploration ----------------
        print("random exploration")
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q, CI_lens_V, CI_lens_R = [], [], []
        for _ in range(num_rep):
            data = collect_data_swimmer.collect_data(
                p, r, num_data + numdata_1, s_0, n_s, n_a,
                right_prop=right_prop, std=r_sd)
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=data)
            in_Q, in_V, in_R = _coverage_flags(
                Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                R_n, CI_len_R, numerical_tol)
            cov_bools_Q += in_Q
            cov_bools_V += in_V
            cov_bools_R += in_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        REs_Q_cov.append(_summarize_coverage(
            cov_bools_Q, cov_bools_V, cov_bools_R,
            CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if))

        # ---------------- UCRL ----------------
        print("UCRL")
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q, CI_lens_V, CI_lens_R = [], [], []
        for _ in range(num_rep):
            all_data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            # BUG FIX: seed UCRL with the warm-start batch just collected.
            # The original passed the stale `data` left over from the
            # random-exploration section above.
            pre_collected_stats = UCRL_2.get_pre_collected_stats(all_data, n_s, n_a)
            UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, numdata_1, s_0, num_data,
                                  pre_collected_stats)
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_sd)
            UCRL_cl.update_point_estimate_and_CIbound()
            all_data = all_data + UCRL_cl.datas
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=all_data)
            in_Q, in_V, in_R = _coverage_flags(
                Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                R_n, CI_len_R, numerical_tol)
            cov_bools_Q += in_Q
            cov_bools_V += in_V
            cov_bools_R += in_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        UCRL_Q_cov.append(_summarize_coverage(
            cov_bools_Q, cov_bools_V, cov_bools_R,
            CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if))

        # ---------------- PSRL ----------------
        print("PSRL")
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q, CI_lens_V, CI_lens_R = [], [], []
        for _ in range(num_rep):
            para_cl = PSRLcls(n_s, n_a, s_0)
            data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(data, r_sigma=r_sd)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            # Each stage acts greedily (epsilon=0) on a posterior-sampled MDP.
            for nd in stage_datas:
                dat = collect_data_swimmer.collect_data(
                    p, r, nd, para_cl.s_0, n_s, n_a, Q=Q_estimate,
                    epsilon=0, std=r_sd)
                data += dat
                para_cl.update(dat, r_sigma=r_sd)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
                r, p, initial_w, right_prop, data=data)
            in_Q, in_V, in_R = _coverage_flags(
                Q_real, V_real, R_real, Q_n, CI_len_Q, V_n, CI_len_V,
                R_n, CI_len_R, numerical_tol)
            cov_bools_Q += in_Q
            cov_bools_V += in_V
            cov_bools_R += in_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        PSRL_Q_cov.append(_summarize_coverage(
            cov_bools_Q, cov_bools_V, cov_bools_R,
            CI_lens_Q, CI_lens_V, CI_lens_R, num_rep, print_if))

    # Final summary and coverage-vs-budget plot.
    print("epsilon greedy")
    print(eps_Q_cov)
    print("QOCBA")
    print(QOCBAs_Q_cov)
    print("REs")
    print(REs_Q_cov)
    print("UCRL ")
    print(UCRL_Q_cov)
    print("PSRL ")
    print(PSRL_Q_cov)
    if logif:
        num_datas = np.log(np.array(num_datas) + 1)
    plt.plot(num_datas, eps_Q_cov, 'g<--', markersize=6, label="epsilon-greedy")
    plt.plot(num_datas, UCRL_Q_cov, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_Q_cov, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, QOCBAs_Q_cov, 'ro--', markersize=6, label="Q-OCBA")
    plt.plot(num_datas, REs_Q_cov, 'b>--', markersize=6,
             label="RE({})".format(right_prop))
    plt.xlabel("total number of data")
    plt.ylabel("CI Coverage")
    plt.axhline(y=0.95)
    plt.legend(loc='lower right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ CI coverage'.format(r_sd, r[0]))
    plt.show()
def get_CI(Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a,
           r, p, initial_w, right_prop, pi_s_a=None, num_sec=10, Q=None,
           epsilon=0, data=None):
    """Point estimates and 95% CI half-widths for Q, V, and R = initial_w . V.

    If ``Q_approximation is None`` the exact tabular estimator is used with an
    asymptotic (delta-method) covariance; if it equals ``"linear_interpolation"``
    the data is split into ``num_sec`` sections and a Student-t CI is built from
    the section estimates (batch means).

    When ``data`` is None a fresh trajectory of ``num_data`` transitions is
    collected; otherwise the supplied transitions are used as-is.

    Returns ``(Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R)``. In the tabular
    branch, if some (s, a) pair was never visited the covariance is undefined
    and all-zero placeholders are returned instead.
    """
    if Q_approximation is None:  # BUG FIX: `== None` breaks on numpy arrays
        if data is None:
            data = collect_data_swimmer.collect_data(
                p, r, num_data, s_0, n_s, n_a, right_prop=right_prop,
                pi_s_a=pi_s_a, Q=Q, epsilon=epsilon)
        # Empirical transition kernel, mean rewards, visit frequencies,
        # and reward variances.
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            data, n_s, n_a, f_n_def=0)
        if not f_n.all():
            # Some (s, a) never visited: the asymptotic covariance is
            # undefined, so signal the caller with zero-length CIs.
            return (np.zeros(n_s * n_a), np.zeros(n_s * n_a),
                    np.zeros(n_s), np.zeros(n_s), 0, 0)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = get_V_from_Q(Q_n, n_s, n_a)
        R_n = np.dot(initial_w, V_n)
        Sigma_n_Q, Sigma_n_V, Sigma_n_R = cal_Sigma_n(
            p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index, initial_w)
        # 95% normal CI half-widths scaled by sqrt(sample size).
        CI_len_Q = 1.96 * np.sqrt(np.diag(Sigma_n_Q)) / np.sqrt(num_data)
        CI_len_V = 1.96 * np.sqrt(np.diag(Sigma_n_V)) / np.sqrt(num_data)
        CI_len_R = 1.96 * np.sqrt(Sigma_n_R) / np.sqrt(num_data)
        return Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R

    if Q_approximation == "linear_interpolation":
        Q_ns = []
        R_ns = []
        V_ns = []
        if data is None:
            data = collect_data_swimmer.collect_data(
                p, r, num_data, s_0, n_s, n_a, right_prop=right_prop,
                pi_s_a=pi_s_a)
        # BUG FIX: integer division — `/` yields a float and float slice
        # indices raise TypeError under Python 3.
        sec_size = num_data // num_sec
        for i in range(num_sec):
            # Estimate from each disjoint section of the trajectory.
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data[i * sec_size:(i + 1) * sec_size], n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val_approx_linear_interpolation(
                p_n, Q_0, r_n, num_iter, gamma, S_0, n_s, n_a)
            V_n, V_n_max_index = get_V_from_Q(Q_n, n_s, n_a)
            R_n = np.dot(initial_w, V_n)
            Q_ns.append(Q_n)
            R_ns.append(R_n)
            V_ns.append(V_n)
        Q_bar = np.mean(Q_ns, 0)
        V_bar = np.mean(V_ns, 0)
        R_bar = np.mean(R_ns)
        Q_std = np.std(Q_ns, 0)
        V_std = np.std(V_ns, 0)
        R_std = np.std(R_ns)
        # Student-t CI across the num_sec section estimates.
        CI_len_Q = t.ppf(0.975, num_sec - 1) * Q_std / np.sqrt(num_sec)
        CI_len_V = t.ppf(0.975, num_sec - 1) * V_std / np.sqrt(num_sec)
        CI_len_R = t.ppf(0.975, num_sec - 1) * R_std / np.sqrt(num_sec)
        return Q_bar, CI_len_Q, V_bar, CI_len_V, R_bar, CI_len_R
def main(): parser = argparse.ArgumentParser() parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions') parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0') parser.add_argument('--optLb', nargs="?", type=float, default=1e-2, help='value of r0') # parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data') parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability') parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward') parser.add_argument('--opt_ori', nargs="?", type=bool, default=False, help='Q-OCBA optimization method') parser.add_argument('--episode', nargs="?", type=int, default=100, help='number of episode') args = parser.parse_args() opt_ori = args.opt_ori print("Q-OCBA optimization method using original formulation is {}".format(opt_ori)) num_rep = args.rep right_prop = args.rightprop optLb = args.optLb s_0 = 2 n_s = 5 print("n_s is {}".format(n_s)) n_a = 2 # value-iteration configuration num_iter = 200 gamma = 0.95 # real p and r p = np.zeros(n_s * n_a * n_s) Q_0 = np.zeros(n_s * n_a) V_0 = np.zeros(n_s) rou = np.ones(n_s) / n_s r = np.zeros(n_s * n_a) r[0] = args.r0 r[-1] = 10. r_std = args.rstd print("reward standard deviation is {}".format(r_std)) # r[0] = 10. # r[-1] = 0.1 print("r[0] and r[-1] are {}, {}".format(r[0], r[-1])) p[0 * n_s * n_a + 0 * n_s + 0] = 1. 
p[0 * n_s * n_a + 1 * n_s + 0] = 0.7 p[0 * n_s * n_a + 1 * n_s + 1] = 0.3 for i in range(1, (n_s - 1)): p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1 p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1 p[i * n_a * n_s + 1 * n_s + i] = 0.6 p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3 p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1 p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7 p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3 Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a) V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a) # print("Q real is {}".format(Q_real)) num_datas = list(range(500, 12500, 2000)) num_datas = list(range(1000, 5000, 2000)) num_datas = [10000] QOCBAs_PCS = [] REs_PCS = [] QOCBAs_fr = [] REs_fr = [] eps_PCSs = [] eps_frs = [] UCRL_PCSs = [] UCRL_frs = [] PSRL_PCSs = [] PSRL_frs = [] for num_data in num_datas: num_data_1 = num_data * 3 / 10 num_data_2 = num_data * 7 / 10 print("num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}".format(num_data_1, num_data_2, right_prop)) if True: Q_ns = [] x_opts = [] counts = [] data1s = [] PCS_first_stage = 0. 
for i in range(num_rep): count = 0 while True: count += 1 data1 = collect_data_swimmer.collect_data(p, r, num_data_1, s_0, n_s, n_a, right_prop=right_prop, std=r_std) p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a) Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a) # print("first stage visiting frequency is {}".format(f_n)) if f_n.all() != 0: break counts.append(count) data1s.append(data1) PCS_first_stage += functools.reduce(lambda i, j: i and j, map(lambda i, j: i == j, V_max_index, V_n_max_index), True) Q_ns.append(Q_n) # print("first stage trial = {}".format(count)) # print("real V_max_index vs estimated V_max_index after first stage is {} and {}".format(V_max_index, V_n_max_index)) # print(Q_n) # test # p_n = p # V_n = V_real # V_n_max_index = V_max_index I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index) # test covariance # cov_V_D = np.diag(np.ones(n_s * n_a)) # print("first stage stationary dist is {}".format(f_n)) # print("real Q is {}".format(Q_real)) # print("Q_n estiamte is {}".format(Q_n)) # Q_n = Q_real if True: quad_consts = np.zeros((n_s, n_a)) denom_consts = np.zeros((n_s, n_a, n_s * n_a)) for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: minus_op = np.zeros(n_s * n_a) minus_op[i * n_a + j] = 1 minus_op[i * n_a + V_n_max_index[i]] = -1 c1 = np.power(np.dot(minus_op, I_TM), 2) denom_consts[i][j] = c1 * np.diag(cov_V_D) # print(I_TM, c1) # exit() quad_consts[i][j] = (Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]]) ** 2 A, b, G, h = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a) AA = np.array(A) # bb = np.asarray(b) if opt_ori: def fun(x): return -x[0] else: def fun(x): return x[0] """ def cons(x, i,j): z = x[0] w = x[1:] return quad_consts[i][j] / (np.sum(np.multiply(denom_consts[i][j], np.reciprocal(w)))) -z def 
eqcons(x,a, b): return np.dot(a,x[1:]) -b """ # print("quardratic coeff of opt is {}".format(quad_consts)) # print("denom consts coef of opt is {}".format(denom_consts)) constraints = [] if opt_ori: for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: # print(denom_consts[i][j]) if np.max(denom_consts[i][j]) > 1e-5: constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: up_c / ( np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0], 'args': (quad_consts[i][j], denom_consts[i][j])}) else: for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: # print(denom_consts[i][j]) if np.max(denom_consts[i][j]) > 1e-5: constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: -( np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0], 'args': (quad_consts[i][j], denom_consts[i][j])}) for i in range(AA.shape[0]): constraints.append( {'type': 'eq', 'fun': lambda x, a, b: np.dot(a, x[1:]) - b, 'args': (AA[i], b[i])}) constraints = tuple(constraints) bnds = [] bnds.append((0., None)) for i in range(n_s * n_a): bnds.append((optLb, 1)) bnds = tuple(bnds) initial = np.ones(n_s * n_a + 1) / (n_s * n_a) initial[0] = 0.1 # print(initial) t_1 = time.time() # print("number of equality constraints is {}".format(len(A))) res = minimize(fun, initial, method='SLSQP', bounds=bnds, constraints=constraints) x_opt = res.x[1:] runnung_t = time.time() - t_1 def func_val(x): vals = [] for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: vals.append(quad_consts[i][j] / (2 * np.sum(np.multiply(denom_consts[i][j], np.reciprocal(x))))) z = np.min(vals) # print (z) # print (vals) # z = 1 return z # print("optimization running time is {}".format(runnung_t)) # ec = np.dot(AA, x_opt) - b # print("last equality constraint coeff is {}, {}".format(AA[-1], b[-1])) # print("verify equality constraints, equality residual is {}".format(ec)) # opt_val = func_val(x_opt) # print(f_n) epsilon = 0.3 tran_M = transition_mat_S_A_epsilon(p_n, 
epsilon, V_n_max_index, n_s, n_a) bench_w = compare_var.solveStationary(tran_M) bench_w = np.array(bench_w).reshape(-1, ) # print(bench_w) # bench_val_1= func_val(bench_w) # bench_val_2 = func_val(f_n) # print("optimal exploration policy has stationary dist {} with sum {}".format(x_opt, np.sum(x_opt))) # print("optimal value is {}".format(res.x[0])) # print("optimal value with optimal solution is {} ".format(opt_val)) # print("benchmark objective value is {} and {}".format(bench_val_1, bench_val_2)) # exit() x_opts.append(x_opt) mean_count = np.mean(counts) std_count = np.std(counts) # print("first stage average # of trials is {} with CI length {}".format(mean_count,1.96 * std_count / np.sqrt(num_rep))) # PFS_first_stage = 1 - PCS_first_stage / num_rep # print("PFS after first stage is {} ".format(PFS_first_stage)) """ w = cp.Variable(n_s * n_a) #z = cp.Variable(1) rate = w[0*n_a + 0] for i in range(n_s): for j in range(n_a): if j!= V_n_max_index[i]: #rates.append(quad_consts[i][j] * cp.inv_pos(cp.sum(cp.multiply(denom_consts[i][j], cp.inv_pos(w))))) rate = cp.min(rate, w[i*n_a + j]) #rates = np.array(rates) problem = cp.Problem(cp.Maximize(rate), [AA * w == bb, w >= 0]) problem.solve() # Print result. 
print("\nThe optimal value is", problem.value) print("A solution w is") print(w.value) exit() """ epsilons = [0.2] for epsilon in epsilons: print("epsilon is {}".format(epsilon)) CS_num_naive = 0 future_V = np.zeros(num_rep) for i in range(num_rep): Q_n = Q_ns[i] data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, Q=Q_n, epsilon=epsilon, print_pro_right=False, std=r_std) data = data + data1s[i] p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a) Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) # print(Q_here) V_here = policy_val_iteration(Q_here, n_s, n_a, V_0, num_iter, r, p, gamma) future_V[i] = np.dot(rou, V_here) FS_bool_ = FS_bool(Q_here, V_max_index, n_s, n_a) CS_num_naive += FS_bool_ # if not FS_bool_: # print(i) # print(f_n) # print(Q_here) # exit() PCS_naive = np.float(CS_num_naive) / num_rep CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep) fv = np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv eps_PCSs.append(PCS_naive) eps_frs.append(diff) print("epsilon--greedy with epsilon {}:".format(epsilon)) print("PCS is {}, with CI length {}".format(PCS_naive, CI_len)) print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt( num_rep), rv, diff)) # exit() CS_num_naive = 0 future_V = np.zeros(num_rep) for i in range(num_rep): x_opt = x_opts[i] data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, pi_s_a=x_opt, std=r_std) data = data + data1s[i] p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a) Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) # print(Q_n) V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma) # print(V_here, V_real) future_V[i] = np.dot(rou, V_here) FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a) CS_num_naive += FS_bool_ # if not 
FS_bool_: # print(i) # print(f_n) # print(Q_n) PCS_naive = np.float(CS_num_naive) / num_rep CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep) fv = np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv # print(CS_num_naive) QOCBAs_PCS.append(PCS_naive) QOCBAs_fr.append(diff) print("Q-OCBA:") print("PCS is {}, with CI length {}".format(PCS_naive, CI_len)) print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt( num_rep), rv, diff)) # exit() # follow original CS_num_naive = 0 future_V = np.zeros(num_rep) for i in range(num_rep): data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, std=r_std) data = data + data1s[i] p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a) Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) # print(Q_n) V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma) # print(V_here, V_real) future_V[i] = np.dot(rou, V_here) FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a) CS_num_naive += FS_bool_ # if not FS_bool_: # print(i) # print(f_n) # print(Q_n) PCS_naive = np.float(CS_num_naive) / num_rep CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep) fv = np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv REs_PCS.append(PCS_naive) REs_fr.append(diff) print("follow original") print("PCS is {}, with CI length {}".format(PCS_naive, CI_len)) print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt( num_rep), rv, diff)) #UCRL #delta = 0.05 CS_num = 0. 
future_V = np.zeros(num_rep) for i in range(num_rep): pre_collected_stats = UCRL_2.get_pre_collected_stats(data1s[i], n_s, n_a) UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, num_data_1, s_0, num_data_2, pre_collected_stats) while UCRL_cl.t < num_data_1 + num_data_2: UCRL_cl.update_point_estimate_and_CIbound() # print("step1 finished") UCRL_cl.Extended_Value_Iter() # print("step2 finished") UCRL_cl.collect_data_and_update(p, r, r_std=r_std) # print("step3 finished") # print(UCRL_cl.t) UCRL_cl.update_point_estimate_and_CIbound() Q_estimate = Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0, UCRL_cl.rew, num_iter, gamma, n_s, n_a) # print(Q_estimate) FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a) CS_num += FS_bool_ V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma) # print(V_here, V_real) future_V[i] = np.dot(rou, V_here) PCS = np.float(CS_num) / num_rep CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep) fv = np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv # print(CS_num_naive) UCRL_PCSs.append(PCS) UCRL_frs.append(diff) print("UCRL") print("PCS is {}, with CI length {}".format(PCS, CI_len)) print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt( num_rep), rv, diff)) episodes = args.episode ## PSRL print("# of epsisodes is {}".format(episodes)) CS_num = 0. 
future_V = np.zeros(num_rep) for i in range(num_rep): all_data = data1s[i] para_cl = PSRLcls(n_s, n_a, s_0) para_cl.update(data1s[i], r_sigma=r_std) Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma) # print(Q_estimate) nds = [num_data_2 / episodes] * episodes for nd in nds: dat = collect_data_swimmer.collect_data(p, r, nd, para_cl.s_0, n_s, n_a, Q=Q_estimate, epsilon=0, std=r_std) all_data += dat para_cl.update(dat, r_sigma=r_std) Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma) # print(para_cl.pprior) # print(para_cl.r_mean) # exit() # print(Q_estimate) # print(para_cl.pprior) # print(para_cl.r_mean) # transition = np.array([1.] * n_s * (n_s * n_a)) # for i in range(n_s): # for j in range(n_a): # transition[ # (i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] = para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] \ # / np.sum(para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)]) # r_n = para_cl.r_mean # print(r_n) # print(transition) # Q_estimate = Iterative_Cal_Q.cal_Q_val(transition, Q_0, r_n, num_iter , gamma, n_s, n_a) p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(all_data, n_s, n_a) Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma) # print(V_here, V_real) future_V[i] = np.dot(rou, V_here) FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a) CS_num += FS_bool_ PCS = np.float(CS_num) / num_rep CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep) fv = np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv PSRL_PCSs.append(PCS) PSRL_frs.append(diff) # print(CS_num_naive) print("PSRL") print("PCS is {}, with CI length {}".format(PCS, CI_len)) print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt( num_rep), rv, diff)) print("epsilon greedy") print(eps_PCSs) print(eps_frs) 
print("QOCBA") print(QOCBAs_PCS) print(QOCBAs_fr) print("REs") print(REs_PCS) print(REs_fr) print("UCRL ") print(UCRL_PCSs) print(UCRL_frs) print("PSRL ") print(PSRL_PCSs) print(PSRL_frs) plt.plot(num_datas, eps_PCSs, 'g<--', markersize=6, label="epsilon-greedy") plt.plot(num_datas, UCRL_PCSs, 'm+--', markersize=6, label="UCRL") plt.plot(num_datas, PSRL_PCSs, 'cx--', markersize=6, label="PSRL") plt.plot(num_datas, QOCBAs_PCS, 'ro--', markersize=6, label="Q-OCBA") # plt.fill_between(xs, np.subtract(y1, CI_1), np.add(y1, CI_1), color='r', alpha=0.4) plt.plot(num_datas, REs_PCS, 'b>--', markersize=6, label="RE(0.6)") # plt.fill_between(xs, np.subtract(y2, CI_2), np.add(y2, CI_2), color='b', alpha=0.4) # plt.axhline(y=0.95) plt.xlabel("total number of data") # plt.ylabel("CR overall coverage") plt.ylabel("PCS") plt.legend(loc='lower right', shadow=True, fontsize='x-small') plt.title(r'$\sigma_R= {}, r_L = {}$ PCS'.format(r_std, r[0])) plt.show() plt.plot(num_datas, eps_frs, 'g<--', markersize=6, label="epsilon-greedy") plt.plot(num_datas, UCRL_frs, 'm+--', markersize=6, label="UCRL") plt.plot(num_datas, PSRL_frs, 'cx--', markersize=6, label="PSRL") plt.plot(num_datas, QOCBAs_fr, 'ro--', markersize=6, label="Q-OCBA") # plt.fill_between(xs, np.subtract(y1, CI_1), np.add(y1, CI_1), color='r', alpha=0.4) plt.plot(num_datas, REs_fr, 'b>--', markersize=6, label="RE(0.6)") # plt.fill_between(xs, np.subtract(y2, CI_2), np.add(y2, CI_2), color='b', alpha=0.4) # plt.axhline(y=0.95) plt.xlabel("total number of data") # plt.ylabel("CR overall coverage") plt.ylabel("future regret") plt.legend(loc='upper right', shadow=True, fontsize='x-small') plt.title(r'$\sigma_R= {}, r_L = {}$ future regret'.format(r_std, r[0])) plt.show()
def main():
    """Compare exploration strategies on a small 'swimmer' chain MDP.

    For each total data budget in ``num_datas``, runs ``--rep`` independent
    replications of five data-collection strategies (epsilon-greedy, Q-OCBA,
    random exploration with two right-probabilities, UCRL, PSRL), estimates
    the MDP from the collected data, and records:
      * PCS  -- probability of correct (greedy-policy) selection, and
      * future regret -- gap between the real optimal value and the value of
        the policy greedy w.r.t. the estimated Q.
    Results are printed and plotted (PCS and future regret vs. data budget).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2, help='value of r0')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0, help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=int, default=0, help='Q-OCBA optimization method')
    parser.add_argument('--episode', nargs="?", type=int, default=100, help='number of episode')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--first_stage_data', nargs="?", type=int, default=3, help='number of first stage data')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0, help='prior value of reward function')
    parser.add_argument('--opt_one_step', nargs="?", type=int, default=0, help='Q-OCBA optimization running only one step')
    parser.add_argument('--iflog', nargs="?", type=int, default=0, help='whether take logrithm of x-axis')
    args = parser.parse_args()

    opt_ori = True if args.opt_ori else False
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    print("right prop is {}".format(right_prop))
    optLb = args.optLb

    # MDP dimensions and starting state.
    s_0 = 2
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95

    # Real transition kernel p and reward vector r (flat layout:
    # p[s * n_a * n_s + a * n_s + s'] and r[s * n_a + a]).
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s  # uniform initial-state distribution
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # Action 0 always moves left; action 1 moves right w.p. 0.3,
    # stays w.p. 0.6, slips left w.p. 0.1 (boundaries absorb the overflow).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    # Ground-truth Q/V used to score every estimated policy below.
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    episode_steps = args.epi_step_num
    numdata_1 = args.first_stage_data
    print("first stage data num is {}".format(numdata_1))
    print("epsisode timestep is {}".format(episode_steps))
    logif = True if args.iflog else False
    print("we print x axis in log is {}".format(logif))
    if not logif:
        if r_sd == 10.0:
            num_datas = list(range(0, 8000, 1000))
        else:
            num_datas = list(range(0, 4000, 500))
    else:
        num_datas = [0, 100, 1000, 5000, 10000]

    # Per-strategy result accumulators (one entry per num_data budget).
    QOCBAs_PCS = []
    REs_PCS = []
    QOCBAs_fr = []
    REs_fr = []
    eps_PCSs = []
    eps_frs = []
    UCRL_PCSs = []
    UCRL_frs = []
    PSRL_PCSs = []
    PSRL_frs = []
    REs_PCS_08 = []
    REs_fr_08 = []
    Bayes_resample = False
    print_if = True
    epsilon = 0.2
    print("epsilon is {}".format(epsilon))

    for num_data in num_datas:
        print("numdata is {}".format(num_data))
        # BUGFIX: use integer (floor) division -- Python 3 "/" returns a
        # float and "list * float" raises TypeError (Python-2 leftover).
        stage_datas = [episode_steps] * (num_data // episode_steps)

        # ---------------- epsilon-greedy ----------------
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(data, resample=False)
            p_n, r_n, r_std = para_cl.get_para(resample=False)
            Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            for num_dat in stage_datas:
                # Collect one episode epsilon-greedily w.r.t. current Q, then refit.
                stage_data = collect_data_swimmer.collect_data(
                    p, r, num_dat, s_0, n_s, n_a, right_prop=right_prop,
                    Q=Q_here, epsilon=epsilon, print_pro_right=False, std=r_sd)
                para_cl.update(stage_data, resample=Bayes_resample)
                p_n, r_n, r_std = para_cl.get_para(resample=False)
                Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_here = policy_val_iteration(Q_here, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_here, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
        # BUGFIX: np.float was removed in NumPy 1.24; builtin float is equivalent.
        PCS_naive = float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        eps_PCSs.append(PCS_naive)
        eps_frs.append(diff)
        if print_if:
            print("epsilon--greedy with epsilon {}:".format(epsilon))
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # ---------------- Q-OCBA ----------------
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for iii in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(data, resample=Bayes_resample)
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std**2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            for jj, stage_data in enumerate(stage_datas):
                # Asymptotic-variance ingredients for the current estimates.
                TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
                I = np.identity(n_s * n_a)
                I_TM = np.linalg.inv(I - gamma * TM)
                V = np.diag(var_r_n)
                ds = []
                ds_V = []
                for i in range(n_s):
                    for j in range(n_a):
                        p_sa = p_n[(i * n_a * n_s + j * n_s):(i * n_a * n_s + (j + 1) * n_s)]
                        dij = inference.cal_cov_p_quad_V(p_sa, V_n, n_s)
                        ds.append(dij)
                        if j == V_n_max_index[i]:
                            ds_V.append(dij)
                D = np.diag(ds)
                cov_V_D = V + D
                # For every non-greedy (s, a): squared Q-gap (numerator) and the
                # variance weights (denominator) of the large-deviations rate.
                quad_consts = np.zeros((n_s, n_a))
                denom_consts = np.zeros((n_s, n_a, n_s * n_a))
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            minus_op = np.zeros(n_s * n_a)
                            minus_op[i * n_a + j] = 1
                            minus_op[i * n_a + V_n_max_index[i]] = -1
                            denom_consts[i][j] = np.power(
                                np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                            quad_consts[i][j] = (
                                Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]])**2
                A, b, G, h = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
                AA = np.array(A)
                # x[0] is the epigraph variable; x[1:] is the exploration
                # distribution over the n_s * n_a state-action pairs.
                if opt_ori:
                    def fun(x):
                        return -x[0]
                else:
                    def fun(x):
                        return x[0]
                constraints = []
                if opt_ori:
                    for i in range(n_s):
                        for j in range(n_a):
                            if j != V_n_max_index[i]:
                                if np.max(denom_consts[i][j]) > 1e-5:
                                    constraints.append({
                                        'type': 'ineq',
                                        'fun': lambda x, up_c, denom_c: up_c / (np.sum(
                                            np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                                        'args': (quad_consts[i][j], denom_consts[i][j])
                                    })
                else:
                    for i in range(n_s):
                        for j in range(n_a):
                            if j != V_n_max_index[i]:
                                # NOTE(review): this branch gates on quad_consts while the
                                # opt_ori branch gates on denom_consts -- looks intentional
                                # (reciprocal formulation) but worth confirming.
                                if np.max(quad_consts[i][j]) > 1e-5:
                                    constraints.append({
                                        'type': 'ineq',
                                        'fun': lambda x, up_c, denom_c: -(np.sum(
                                            np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0],
                                        'args': (quad_consts[i][j], denom_consts[i][j])
                                    })
                # Flow-balance equality constraints on the exploration distribution.
                for i in range(AA.shape[0]):
                    constraints.append({
                        'type': 'eq',
                        'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                        'args': (AA[i], b[i])
                    })
                constraints = tuple(constraints)
                bnds = [(0., None)]
                for i in range(n_s * n_a):
                    bnds.append((optLb, 1))
                bnds = tuple(bnds)
                initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
                initial[0] = 0.1
                if args.opt_one_step:
                    res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                                   constraints=constraints,
                                   options={'disp': False, 'maxiter': 1})
                else:
                    res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                                   constraints=constraints)
                x_opt = res.x[1:]
                # Collect the next stage following the optimized exploration
                # distribution, starting from the sampler's current state.
                data = collect_data_swimmer.collect_data(
                    p, r, stage_data, para_cl.s, n_s, n_a, pi_s_a=x_opt, std=r_sd)
                para_cl.update(data, resample=Bayes_resample)
                _, _, freq, _ = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
                p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
                var_r_n = r_std**2
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[iii] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
        PCS_naive = float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        QOCBAs_PCS.append(PCS_naive)
        QOCBAs_fr.append(diff)
        if print_if:
            print("Q-OCBA:")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # ---------------- random exploration, RE(right_prop) ----------------
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(
                p, r, num_data + numdata_1, s_0, n_s, n_a,
                right_prop=right_prop, std=r_sd)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
        PCS_naive = float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        REs_PCS.append(PCS_naive)
        REs_fr.append(diff)
        if print_if:
            print("follow original")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # ---------------- random exploration, RE(0.8) ----------------
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(
                p, r, num_data + numdata_1, s_0, n_s, n_a,
                right_prop=0.8, std=r_sd)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
        PCS_naive = float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        REs_PCS_08.append(PCS_naive)
        REs_fr_08.append(diff)
        if print_if:
            print("RE(0.8)")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # ---------------- UCRL ----------------
        CS_num = 0.
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            pre_collected_stats = UCRL_2.get_pre_collected_stats(data, n_s, n_a)
            UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, numdata_1, s_0, num_data,
                                  pre_collected_stats)
            # Optimism loop: re-estimate, extended value iteration, collect.
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_sd)
            UCRL_cl.update_point_estimate_and_CIbound()
            # Score on all data seen: warm-start plus UCRL's own trajectory.
            datahere = data + UCRL_cl.datas
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(datahere, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool_
            V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        UCRL_PCSs.append(PCS)
        UCRL_frs.append(diff)
        if print_if:
            print("UCRL")
            print("PCS is {}, with CI length {}".format(PCS, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # ---------------- PSRL ----------------
        CS_num = 0.
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = PSRLcls(n_s, n_a, s_0)
            data = collect_data_swimmer.collect_data(
                p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop, std=r_sd)
            para_cl.update(data, r_sigma=r_sd)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            for nd in stage_datas:
                # Act greedily (epsilon=0) w.r.t. a posterior-sampled MDP's Q.
                dat = collect_data_swimmer.collect_data(
                    p, r, nd, para_cl.s_0, n_s, n_a, Q=Q_estimate,
                    epsilon=0, std=r_sd)
                data += dat
                para_cl.update(dat, r_sigma=r_sd)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            # Final estimate uses empirical stats over all collected data.
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
            V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool_
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        PSRL_PCSs.append(PCS)
        PSRL_frs.append(diff)
        if print_if:
            print("PSRL")
            print("PCS is {}, with CI length {}".format(PCS, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

    # ---------------- summary and plots ----------------
    print("epsilon greedy")
    print(eps_PCSs)
    print(eps_frs)
    print("QOCBA")
    print(QOCBAs_PCS)
    print(QOCBAs_fr)
    print("REs")
    print(REs_PCS)
    print(REs_fr)
    print("RE(0.8)s")
    print(REs_PCS_08)
    print(REs_fr_08)
    print("UCRL ")
    print(UCRL_PCSs)
    print(UCRL_frs)
    print("PSRL ")
    print(PSRL_PCSs)
    print(PSRL_frs)

    if logif:
        # +1 keeps the zero-budget point finite on the log axis.
        num_datas = np.log(np.array(num_datas) + 1)

    plt.plot(num_datas, eps_PCSs, 'k<--', markersize=6, label="epsilon-greedy")
    plt.plot(num_datas, UCRL_PCSs, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_PCSs, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, QOCBAs_PCS, 'ro--', markersize=6, label="Q-OCBA")
    plt.plot(num_datas, REs_PCS, 'b>--', markersize=6, label="RE({})".format(right_prop))
    plt.plot(num_datas, REs_PCS_08, 'g*--', markersize=6, label="RE(0.8)")
    plt.xlabel("total number of data")
    plt.ylabel("PCS")
    plt.legend(loc='lower right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ PCS'.format(r_sd, r[0]))
    plt.show()

    plt.plot(num_datas, eps_frs, 'k<--', markersize=6, label="epsilon-greedy")
    plt.plot(num_datas, UCRL_frs, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_frs, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, REs_fr_08, 'g*--', markersize=6, label="RE(0.8)")
    plt.plot(num_datas, QOCBAs_fr, 'ro--', markersize=6, label="Q-OCBA")
    plt.plot(num_datas, REs_fr, 'b>--', markersize=6, label="RE({})".format(right_prop))
    plt.xlabel("total number of data")
    plt.ylabel("future regret")
    plt.legend(loc='upper right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ future regret'.format(r_sd, r[0]))
    plt.show()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions') parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0') parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data') parser.add_argument('--rightprop', nargs="?", type=float, default=0.6, help='warm start random exploration right probability') args = parser.parse_args() num_rep = args.rep initial_s_dist = "even" Q_approximation = None # Q_approximation = "linear_interpolation" right_prop = args.rightprop # 0.8 s_0 = 2 # collect data configuration num_data = args.numdata num_data_1 = num_data * 3 / 10 num_data_2 = num_data * 7 / 10 print( "num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}" .format(num_data_1, num_data_2, right_prop)) n_s = 5 print("n_s is {}".format(n_s)) n_a = 2 # value-iteration configuration num_iter = 200 gamma = 0.95 # real p and r p = np.zeros(n_s * n_a * n_s) Q_0 = np.zeros(n_s * n_a) r = np.zeros(n_s * n_a) r[0] = args.r0 r[-1] = 10. # r[0] = 10. # r[-1] = 0.1 print(r) print("r[0] and r[-1] are {}, {}".format(r[0], r[-1])) p[0 * n_s * n_a + 0 * n_s + 0] = 1. 
p[0 * n_s * n_a + 1 * n_s + 0] = 0.7 p[0 * n_s * n_a + 1 * n_s + 1] = 0.3 for i in range(1, (n_s - 1)): p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1 p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1 p[i * n_a * n_s + 1 * n_s + i] = 0.6 p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3 p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1 p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7 p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3 # one replication of coverage test # Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a) # V_real = get_V_from_Q(Q_real, n_s, n_a) # Q_n, CI_len, V_n = get_CI(collec_data_bool, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p) # print(Q_real) # print(V_real) # print(Q_n) # print(V_n) # print(CI_len) Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a) V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a) print("Q real is {}".format(Q_real)) if initial_s_dist == "even": R_real = np.mean(V_real) initial_w = np.ones(n_s) / n_s opts = [] datas = [] Q_ns = [] opts_ori = [] for i in range(num_rep): while True: while True: data = collect_data_swimmer.collect_data(p, r, num_data_1, s_0, n_s, n_a, right_prop=right_prop) p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats( data, n_s, n_a) Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a) V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a) # print("first stage visiting frequency is {}".format(f_n)) if f_n.all() != 0: break datas.append(data) Q_ns.append(Q_n) I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp( p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index) # print( np.diag(cov_V_V_D)) # exit() quad_con_vec = np.power(np.dot(initial_w, I_TM_V), 2) * np.diag(cov_V_V_D) # print(quad_con_vec) # if not np.all(f_n): # print(f_n) # print(quad_con_vec) # print("need more data for first stage") # exit() quad_con_vec_all = np.zeros(n_s * n_a) for i in range(n_s): quad_con_vec_all[i * 
n_a + V_n_max_index[i]] = quad_con_vec[i] # print(quad_con_vec) # print(I_TM_V) # print(cov_V_V_D) # print(initial_w) # Create a new model init_v_opt = 1. / (n_a * n_s) quad_con_vec_all = matrix(quad_con_vec_all) array_quad_con_vec = np.array(quad_con_vec_all).transpose()[0] # print(array_quad_con_vec) # exit() def F(x): u = np.divide(1, x) # print(u) uu = np.multiply(array_quad_con_vec, u) # print(quad_con_vec_all) # print(uu) val = np.sum(uu) # print(val) return val A, b, G, h = construct_contrain_matrix(p_n, n_s, n_a) AA = np.array(A) bb = np.asarray(b) constraints = [] for i in range(AA.shape[0]): constraints.append({ 'type': 'eq', 'fun': lambda x, a, b: np.dot(a, x) - b, 'args': (AA[i], bb[i]) }) constraints = tuple(constraints) bnds = [] for i in range(n_s * n_a): bnds.append((1e-6, None)) # bnds.append((0.001, None)) bnds = tuple(bnds) initial = np.ones(n_s * n_a) / (n_s * n_a) # print(initial) res = minimize(F, initial, method='SLSQP', bounds=bnds, constraints=constraints) x_opt = res.x # print(x_opt) opt_val = F(x_opt) # ori-Q-OCBA def fun(x): return x[0] # print("quardratic coeff of opt is {}".format(quad_consts)) # print("denom consts coef of opt is {}".format(denom_consts)) quad_consts = np.zeros((n_s, n_a)) denom_consts = np.zeros((n_s, n_a, n_s * n_a)) for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: minus_op = np.zeros(n_s * n_a) minus_op[i * n_a + j] = 1 minus_op[i * n_a + V_n_max_index[i]] = -1 c1 = np.power(np.dot(minus_op, I_TM), 2) denom_consts[i][j] = c1 * np.diag(cov_V_D) # print(I_TM, c1) # exit() quad_consts[i][j] = ( Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]])**2 constraints = [] for i in range(n_s): for j in range(n_a): if j != V_n_max_index[i]: # print(denom_consts[i][j]) if np.max(denom_consts[i][j]) > 1e-5: constraints.append({ 'type': 'ineq', 'fun': lambda x, up_c, denom_c: -(np.sum( np.multiply(denom_c, np.reciprocal(x[1:]))) ) / up_c + x[0], 'args': (quad_consts[i][j], denom_consts[i][j]) }) for i in 
range(AA.shape[0]): constraints.append({ 'type': 'eq', 'fun': lambda x, a, b: np.dot(a, x[1:]) - b, 'args': (AA[i], b[i]) }) constraints = tuple(constraints) bnds = [] bnds.append((0., None)) for i in range(n_s * n_a): bnds.append((1e-6, 1)) bnds = tuple(bnds) initial = np.ones(n_s * n_a + 1) / (n_s * n_a) initial[0] = 0.1 # print(initial) res = minimize(fun, initial, method='SLSQP', bounds=bnds, constraints=constraints) x_opt_ori = res.x[1:] opts_ori.append(x_opt_ori) bench_val = F(x_opt_ori) if bench_val > opt_val: break else: print(opt_val) print(bench_val) print("#####") # print(opt_val) # print(bench_val) # print("#####") epsilon = 0.3 tran_M = optimize_pfs.transition_mat_S_A_epsilon( p_n, epsilon, V_n_max_index, n_s, n_a) bench_w = compare_var.solveStationary(tran_M) bench_w = np.array(bench_w).reshape(-1, ) # print(bench_w) bench_val = F(bench_w) # print(bench_val) opts.append(x_opt) # exit() Q_approximation = None initial_s_dist = "even" if initial_s_dist == "even": R_real = np.mean(V_real) initial_w = np.ones(n_s) / n_s rou = initial_w cov_bools_Q = np.zeros(n_s * n_a) cov_bools_V = np.zeros(n_s) cov_bools_R = 0. 
# print("Q real is {}".format(Q_real)) # print("V real is {}".format(V_real)) # print("R real is {}".format(R_real)) CI_lens_Q = [] CI_lens_V = [] CI_lens_R = [] numerical_tol = 1e-6 S_0 = None for i in range(num_rep): x_opt = opts[i] second_data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, pi_s_a=x_opt) data = second_data + datas[i] # data = second_data Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI( Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, initial_w, right_prop, data=data) # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len)) cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >= (Q_n - CI_len_Q - numerical_tol)) cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol), V_real >= (V_n - CI_len_V - numerical_tol)) cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol), R_real >= (R_n - CI_len_R - numerical_tol)) # print(cov_bool_Q) # exit() # print(cov_bool) cov_bools_Q += cov_bool_Q cov_bools_V += cov_bool_V cov_bools_R += cov_bool_R CI_lens_Q.append(CI_len_Q) CI_lens_V.append(CI_len_V) CI_lens_R.append(CI_len_R) CI_len_Q_mean = np.mean(CI_lens_Q) CI_len_V_mean = np.mean(CI_lens_V) CI_len_R_mean = np.mean(CI_lens_R) CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep) CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep) CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep) cov_rate_Q = np.divide(cov_bools_Q, num_rep) cov_rate_V = np.divide(cov_bools_V, num_rep) cov_rate_R = np.divide(cov_bools_R, num_rep) cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep) cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep) cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep) print("coverage for Q") print(cov_rate_Q) print(cov_rate_CI_Q) print("mean coverage for Q ") print(np.mean(cov_rate_Q)) print(np.mean(cov_rate_CI_Q)) 
print("coverage for V") print(cov_rate_V) print(cov_rate_CI_V) print("mean coverage for V") print(np.mean(cov_rate_V)) print(np.mean(cov_rate_CI_V)) print("coverage for R") print(cov_rate_R) print(cov_rate_CI_R) print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci)) print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci)) print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci)) # Q-OCBA ori CS_num_naive = 0 cov_bools_Q = np.zeros(n_s * n_a) cov_bools_V = np.zeros(n_s) cov_bools_R = 0. # print("Q real is {}".format(Q_real)) # print("V real is {}".format(V_real)) # print("R real is {}".format(R_real)) CI_lens_Q = [] CI_lens_V = [] CI_lens_R = [] future_V = np.zeros(num_rep) for i in range(num_rep): x_opt = opts_ori[i] # print(x_opt) second_data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, pi_s_a=x_opt) data = second_data + datas[i] p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats( data, n_s, n_a) Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI( Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, initial_w, right_prop, data=data) # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len)) cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >= (Q_n - CI_len_Q - numerical_tol)) cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol), V_real >= (V_n - CI_len_V - numerical_tol)) cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol), R_real >= (R_n - CI_len_R - numerical_tol)) # print(cov_bool_Q) # exit() # print(cov_bool) cov_bools_Q += cov_bool_Q cov_bools_V += cov_bool_V cov_bools_R += cov_bool_R CI_lens_Q.append(CI_len_Q) CI_lens_V.append(CI_len_V) CI_lens_R.append(CI_len_R) # if not FS_bool_: # print(i) # print(f_n) # print(Q_n) PCS_naive = np.float(CS_num_naive) / num_rep CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep) fv = 
np.mean(future_V) fv_std = np.std(future_V) rv = np.dot(rou, V_real) diff = rv - fv # print(CS_num_naive) print("Q-OCBA:") print("PCS is {}, with CI length {}".format(PCS_naive, CI_len)) print( "future value func is {} with CI length {}, real value is {}, diff is {}" .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff)) CI_len_Q_mean = np.mean(CI_lens_Q) CI_len_V_mean = np.mean(CI_lens_V) CI_len_R_mean = np.mean(CI_lens_R) CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep) CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep) CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep) cov_rate_Q = np.divide(cov_bools_Q, num_rep) cov_rate_V = np.divide(cov_bools_V, num_rep) cov_rate_R = np.divide(cov_bools_R, num_rep) cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep) cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep) cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep) print("coverage for Q") print(cov_rate_Q) print(cov_rate_CI_Q) print("mean coverage for Q ") print(np.mean(cov_rate_Q)) print(np.mean(cov_rate_CI_Q)) print("coverage for V") print(cov_rate_V) print(cov_rate_CI_V) print("mean coverage for V") print(np.mean(cov_rate_V)) print(np.mean(cov_rate_CI_V)) print("coverage for R") print(cov_rate_R) print(cov_rate_CI_R) print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci)) print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci)) print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci)) #exit() epsilons = [0.2] for epsilon in epsilons: print("epsilon is {}".format(epsilon)) cov_bools_Q = np.zeros(n_s * n_a) cov_bools_V = np.zeros(n_s) cov_bools_R = 0. 
# print("Q real is {}".format(Q_real)) # print("V real is {}".format(V_real)) # print("R real is {}".format(R_real)) CI_lens_Q = [] CI_lens_V = [] CI_lens_R = [] for i in range(num_rep): Q_n = Q_ns[i] second_data = collect_data_swimmer.collect_data( p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop, Q=Q_n, epsilon=epsilon, print_pro_right=False) data = second_data + datas[i] Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI( Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, initial_w, right_prop, data=data) # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len)) cov_bool_Q = np.logical_and( Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >= (Q_n - CI_len_Q - numerical_tol)) cov_bool_V = np.logical_and( V_real <= (V_n + CI_len_V + numerical_tol), V_real >= (V_n - CI_len_V - numerical_tol)) cov_bool_R = np.logical_and( R_real <= (R_n + CI_len_R + numerical_tol), R_real >= (R_n - CI_len_R - numerical_tol)) # print(cov_bool_Q) # exit() # print(cov_bool) cov_bools_Q += cov_bool_Q cov_bools_V += cov_bool_V cov_bools_R += cov_bool_R CI_lens_Q.append(CI_len_Q) CI_lens_V.append(CI_len_V) CI_lens_R.append(CI_len_R) CI_len_Q_mean = np.mean(CI_lens_Q) CI_len_V_mean = np.mean(CI_lens_V) CI_len_R_mean = np.mean(CI_lens_R) CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep) CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep) CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep) cov_rate_Q = np.divide(cov_bools_Q, num_rep) cov_rate_V = np.divide(cov_bools_V, num_rep) cov_rate_R = np.divide(cov_bools_R, num_rep) cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep) cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep) cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep) print("coverage for Q") print(cov_rate_Q) print(cov_rate_CI_Q) print("mean coverage for Q ") print(np.mean(cov_rate_Q)) print(np.mean(cov_rate_CI_Q)) print("coverage for V") 
print(cov_rate_V) print(cov_rate_CI_V) print("mean coverage for V") print(np.mean(cov_rate_V)) print(np.mean(cov_rate_CI_V)) print("coverage for R") print(cov_rate_R) print(cov_rate_CI_R) print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci)) print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci)) print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci)) print("RE(0.8)") cov_bools_Q = np.zeros(n_s * n_a) cov_bools_V = np.zeros(n_s) cov_bools_R = 0. # print("Q real is {}".format(Q_real)) # print("V real is {}".format(V_real)) # print("R real is {}".format(R_real)) CI_lens_Q = [] CI_lens_V = [] CI_lens_R = [] for i in range(num_rep): second_data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop) data = second_data + datas[i] Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI( Q_approximation, S_0, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, initial_w, right_prop, data=data) # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len)) cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >= (Q_n - CI_len_Q - numerical_tol)) cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol), V_real >= (V_n - CI_len_V - numerical_tol)) cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol), R_real >= (R_n - CI_len_R - numerical_tol)) # print(cov_bool_Q) # exit() # print(cov_bool) cov_bools_Q += cov_bool_Q cov_bools_V += cov_bool_V cov_bools_R += cov_bool_R CI_lens_Q.append(CI_len_Q) CI_lens_V.append(CI_len_V) CI_lens_R.append(CI_len_R) CI_len_Q_mean = np.mean(CI_lens_Q) CI_len_V_mean = np.mean(CI_lens_V) CI_len_R_mean = np.mean(CI_lens_R) CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep) CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep) CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep) cov_rate_Q = np.divide(cov_bools_Q, num_rep) cov_rate_V = np.divide(cov_bools_V, 
num_rep) cov_rate_R = np.divide(cov_bools_R, num_rep) cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep) cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep) cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep) print("coverage for Q") print(cov_rate_Q) print(cov_rate_CI_Q) print("mean coverage for Q ") print(np.mean(cov_rate_Q)) print(np.mean(cov_rate_CI_Q)) print("coverage for V") print(cov_rate_V) print(cov_rate_CI_V) print("mean coverage for V") print(np.mean(cov_rate_V)) print(np.mean(cov_rate_CI_V)) print("coverage for R") print(cov_rate_R) print(cov_rate_CI_R) print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci)) print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci)) print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
def _report_coverage(cov_bools_Q, cov_bools_V, cov_bools_R,
                     CI_lens_Q, CI_lens_V, CI_lens_R, num_rep):
    """Print coverage rates (with 95% CIs) and mean CI lengths for Q, V, R.

    cov_bools_* hold per-entry counts of replications whose CI covered the
    true value; CI_lens_* are per-replication CI half-lengths.
    """
    CI_len_Q_mean = np.mean(CI_lens_Q)
    CI_len_V_mean = np.mean(CI_lens_V)
    CI_len_R_mean = np.mean(CI_lens_R)
    CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
    CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
    CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)
    # Empirical coverage rates and their binomial 95% CIs.
    cov_rate_Q = np.divide(cov_bools_Q, num_rep)
    cov_rate_V = np.divide(cov_bools_V, num_rep)
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
    cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
    cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
    print("coverage for Q")
    print(cov_rate_Q)
    print(cov_rate_CI_Q)
    print("mean coverage for Q ")
    print(np.mean(cov_rate_Q))
    print(np.mean(cov_rate_CI_Q))
    print("coverage for V")
    print(cov_rate_V)
    print(cov_rate_CI_V)
    print("mean coverage for V")
    print(np.mean(cov_rate_V))
    print(np.mean(cov_rate_CI_V))
    print("coverage for R")
    print(cov_rate_R)
    print(cov_rate_CI_R)
    print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
    print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
    print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))


def main():
    """Run the PSRL swimmer experiment.

    For each candidate r[0] value: build the true 5-state swimmer MDP,
    collect data in two stages (warm-start exploration until every (s, a)
    pair is visited, then posterior-sampling-guided episodes), then report
    PCS, the future value function, and CI coverage/length statistics.

    Command-line flags: --rep, --episode, --numdata, --rightprop, --rstd.
    Relies on the project modules collect_data_swimmer, cal_impirical_r_p,
    Iterative_Cal_Q, inference, optimize_pfs and the parameter_prior class.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100,
                        help='number of repetitions')
    parser.add_argument('--episode', nargs="?", type=int, default=100,
                        help='number of episode')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000,
                        help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward')
    args = parser.parse_args()

    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    episodes = args.episode
    Total_data = args.numdata
    right_prop = args.rightprop

    r = np.zeros(n_s * n_a)
    r_vals = range(1, 4)
    r_right = 10.0
    for r0_val in r_vals:
        r[0] = float(r0_val)
        r[-1] = r_right
        r_std = args.rstd
        print("reward standard deviation is {}".format(r_std))
        Q_0 = np.zeros(n_s * n_a)
        V_0 = np.zeros(n_s)
        rou = np.ones(n_s) / n_s  # initial state distribution (uniform)
        p = np.zeros(n_s * n_a * n_s)
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

        # True transition kernel, flattened as p[s * n_a * n_s + a * n_s + s'].
        # Action 0 moves left deterministically; action 1 moves
        # left/stays/right with prob 0.1/0.6/0.3 (clipped at the boundaries).
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

        # Ground-truth value functions under the true MDP.
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma,
                                           n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))

        s_0 = 2
        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s

        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        # PSRL data split: 30% warm start, the remainder spread evenly over
        # the episodes.
        print("total num of data is {}".format(Total_data))
        # BUG FIX: use floor division — under Python 3, '/' yields a float
        # which is then passed downstream as a sample count.
        numdata_1 = Total_data * 3 // 10
        seq_if = False
        numdata_2 = Total_data - numdata_1
        print("# of epsisodes is {}".format(episodes))
        num_datas = [numdata_2 // episodes] * episodes
        CS_num = 0.
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = parameter_prior(n_s, n_a, s_0)
            all_data = []
            if not seq_if:
                # First stage: re-sample until every (s, a) pair has been
                # visited at least once, so empirical stats are well-defined.
                while True:
                    data1 = collect_data_swimmer.collect_data(
                        p, r, numdata_1, s_0, n_s, n_a,
                        right_prop=right_prop, std=r_std)
                    p_n, r_n, f_n, var_r_n = \
                        cal_impirical_r_p.cal_impirical_stats(
                            data1, n_s, n_a)
                    Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                    gamma, n_s, n_a)
                    V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                    if f_n.all() != 0:
                        break
            else:
                data1 = collect_data_swimmer.collect_data(
                    p, r, numdata_2 // episodes, s_0, n_s, n_a,
                    right_prop=right_prop, std=r_std)
            all_data += data1
            para_cl.update(data1, r_sigma=r_std)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # Second stage: greedy (epsilon=0) collection under the sampled
            # posterior MDP, updating the prior after each episode.
            second_stage_data = []
            for num_data in num_datas:
                data = collect_data_swimmer.collect_data(
                    p, r, num_data, para_cl.s_0, n_s, n_a,
                    Q=Q_estimate, epsilon=0, std=r_std)
                all_data += data
                second_stage_data += data
                para_cl.update(data, r_sigma=r_std)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # Re-estimate Q from ALL collected data, then score the greedy
            # policy it induces against the true MDP.
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                all_data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            V_here = optimize_pfs.policy_val_iteration(
                Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool

            # Section 5.3: CI construction and coverage bookkeeping.
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation, S_0, Total_data, s_0, num_iter, gamma, Q_0,
                n_s, n_a, r, p, initial_w, right_prop, data=all_data)
            cov_bool_Q = np.logical_and(
                Q_real <= (Q_n + CI_len_Q + numerical_tol),
                Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(
                V_real <= (V_n + CI_len_V + numerical_tol),
                V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(
                R_real <= (R_n + CI_len_R + numerical_tol),
                R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        # BUG FIX: np.float (deprecated alias) was removed in NumPy 1.24;
        # use the builtin float instead.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print(
            "future value func is {} with CI length {}, real value is {}, diff is {}"
            .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
        _report_coverage(cov_bools_Q, cov_bools_V, cov_bools_R,
                         CI_lens_Q, CI_lens_V, CI_lens_R, num_rep)