Example #1
0
def policy_val_iteration(Q_n, n_s, n_a, V_init, num_iter, r, p, gamma):
    """Evaluate the policy implied by ``Q_n`` with repeated value sweeps.

    The action used in each state is the per-state index returned by
    ``inference.get_V_from_Q`` (presumably the arg-max action of ``Q_n`` --
    confirm against that helper).  The policy is then evaluated on the model
    ``(r, p)`` for ``num_iter`` synchronous sweeps starting from ``V_init``.

    Args:
        Q_n: Q-values handed to ``inference.get_V_from_Q``.
        n_s: Number of states.
        n_a: Number of actions.
        V_init: Starting value vector (copied, never modified).
        num_iter: Number of sweeps to run.
        r: Flat rewards, read as ``r[s * n_a + a]``.
        p: Flat transitions, read as ``p[s * n_s * n_a + a * n_s + s_next]``.
        gamma: Discount factor.

    Returns:
        The value vector after ``num_iter`` sweeps.
    """
    _, chosen_action = inference.get_V_from_Q(Q_n, n_s, n_a)
    values = np.copy(V_init)
    previous = np.copy(V_init)
    for _ in range(num_iter):
        for state in range(n_s):
            action = chosen_action[state]
            # Start of the transition row for (state, action) in the flat p.
            row = state * n_s * n_a + action * n_s
            expected_next = sum(
                p[row + nxt] * previous[nxt] for nxt in range(n_s))
            values[state] = r[state * n_a + action] + gamma * expected_next
        # Every sweep reads only the values of the sweep before it.
        previous = np.copy(values)
    return values
def stage_2_estimation(p, r, num_data_1, s_0, n_s, n_a, Q_0, x_opt, num_iter,
                       gamma, initial_w):
    """Collect data under the allocation ``x_opt`` and re-estimate the model.

    Samples ``num_data_1`` transitions starting from ``s_0`` with
    ``pi_s_a=x_opt``, computes the empirical statistics, runs
    ``Iterative_Cal_Q.cal_Q_val`` on the estimates, and derives the value
    vector and its per-state index from the resulting Q-values.

    Returns:
        ``(p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n)`` where
        ``R_n`` is ``np.dot(initial_w, V_n)``.
    """
    samples = collect_data_swimmer.collect_data(
        p, r, num_data_1, s_0, n_s, n_a, pi_s_a=x_opt)
    empirical = cal_impirical_r_p.cal_impirical_stats(samples, n_s, n_a)
    p_n, r_n, f_n, var_r_n = empirical
    Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
    V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
    # Value of the estimated policy weighted by the initial-state weights.
    weighted_value = np.dot(initial_w, V_n)
    return (p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index,
            weighted_value)
def main():
    """Plot the CI half-width of the value estimate against exploration bias.

    Builds a six-state, two-action chain model, solves it with
    ``Iterative_Cal_Q.cal_Q_val``, and for each exploration probability in
    ``np.linspace(0.8, 0.9, 10, endpoint=False)`` computes
    ``1.96 * sqrt(Sigma_R) / sqrt(num_data)`` from ``inference.cal_Sigma_n``.
    The resulting list is printed and plotted.
    """
    n_s, n_a = 6, 2
    per_state = n_s * n_a  # entries of r (and rows of p) owned by one state

    # Rewards r[s * n_a + a]: only the first and the last entry are non-zero.
    r = np.zeros(per_state)
    r[0] = 0.1
    r[-1] = 10

    # Transitions p[s * n_s * n_a + a * n_s + s_next].
    p = np.zeros(per_state * n_s)
    # State 0: action 0 stays put; action 1 stays (0.7) or moves to state 1.
    p[0] = 1.
    p[n_s] = 0.7
    p[n_s + 1] = 0.3
    # Interior states: action 0 steps back; action 1 spreads over i-1, i, i+1.
    for state in range(1, n_s - 1):
        first_action = state * per_state
        second_action = first_action + n_s
        p[first_action + state - 1] = 1
        p[second_action + state - 1] = 0.1
        p[second_action + state] = 0.6
        p[second_action + state + 1] = 0.3
    # Last state: action 0 steps back; action 1 steps back (0.7) or stays.
    last = (n_s - 1) * per_state
    p[last + n_s - 2] = 1
    p[last + n_s + n_s - 2] = 0.7
    p[last + n_s + n_s - 1] = 0.3

    initial_w = np.ones(n_s)/n_s
    Q_0 = np.zeros(per_state)
    num_iter = 1000
    gamma = 0.95
    num_data = 1000000

    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    right_props = np.linspace(0.8,0.9,10, endpoint = False)
    half_widths = []
    for right_prop in right_props:
        # A fresh zero reward-variance vector is handed over on every pass.
        var_r = np.zeros(per_state)
        chain = transition_mat_S_A (p, right_prop, n_s, n_a)
        f = np.array(solveStationary(chain)).reshape(-1, )
        _, _, Sigma_R = inference.cal_Sigma_n(
            p, f, var_r, V_real, gamma, n_s, n_a, V_max_index, initial_w)
        half_widths.append(1.96 * np.sqrt(Sigma_R) / np.sqrt(num_data))

    print(half_widths)
    plt.plot(right_props, half_widths, 'ro--', markersize=6)
    plt.xlabel("exploration right decision probability")
    plt.ylabel("CI_len")
    plt.title("Compare variance of different exploration strategy")
    plt.show()
Example #4
0
def stage_1_estimation(p, r, num_data_1, s_0, n_s, n_a, Q_0, right_prop,
                       num_iter, gamma, initial_w):
    """Collect first-stage data until every visiting frequency is non-zero.

    Each attempt samples ``num_data_1`` transitions from ``s_0`` with
    ``right_prop=right_prop``, computes the empirical statistics, and solves
    for Q and V on the estimates.  Attempts repeat (with no upper limit)
    until ``f_n`` contains no zero entry.

    Returns:
        ``(p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n)`` from the
        accepted attempt, where ``R_n`` is ``np.dot(initial_w, V_n)``.
    """
    every_entry_visited = False
    while not every_entry_visited:
        batch = collect_data_swimmer.collect_data(
            p, r, num_data_1, s_0, n_s, n_a, right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            batch, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(
            p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        # Accept the batch only when no entry of f_n is zero.
        every_entry_visited = f_n.all() != 0
    weighted_value = np.dot(initial_w, V_n)
    return (p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index,
            weighted_value)
Example #5
0
def main():
    """Run the CartPole Q-OCBA experiment: estimate, optimize, print.

    For each repetition the function collects ``numdata1`` samples with the
    "random" strategy, pre-processes them on the ``dims`` grid, estimates
    reward mean/variance and transitions, solves for Q and V on the
    estimates, and sets up an SLSQP problem over ``x = [z, w_1..w_{n_s*n_a}]``
    whose solution ``x[1:]`` is printed row by row after per-state
    normalization.

    NOTE(review): the unconditional ``exit()`` right after that printout
    stops the program in the first repetition, so the second data
    collection, the test episodes and the summary printouts below it are
    currently unreachable.

    ``train_collect_data``, ``get_q``, ``NUM_EPISODES`` and
    ``MAX_ITERATIONS`` are not defined in this block; they are expected at
    module level.
    """
    rep = 10
    eva_train = []
    eva_test = []
    episodes_trained = []
    num_iter = 200
    gamma = 0.99
    # Grid sizes handed to pre_process; the state count is their product.
    dims = (4, 4, 4, 4)
    n_s = np.prod(dims)
    n_a = 2
    Q_0 = np.zeros(n_s * n_a)
    numdata1 = 1000
    print("first stage data number is {}".format(numdata1))
    for i in range(rep):
        # First stage: gather data with the "random" strategy.
        total_train, episode, train_data, action_model = train_collect_data(
            "random", numdata1)

        data = collect_cartpole_data.pre_process(train_data, dims)
        # print(c)
        # for d in train_data:
        #    print(d)
        # exit()

        # Plug-in estimates of the model, then Q and V on those estimates.
        new_r_n, var_r_n, p_n = estimate_transition.find_rmean_rvar_p_estimate(
            data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, new_r_n, num_iter, gamma,
                                        n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        #print(new_r_n)
        #print(Q_n)
        # An all-ones vector is passed where other callers pass frequencies.
        dummy_f_n = np.ones(n_s * n_a)
        I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
            p_n, dummy_f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index)
        # Per (state, action) constants of the optimization constraints;
        # entries for the selected action of each state stay zero.
        quad_consts = np.zeros((n_s, n_a))
        denom_consts = np.zeros((n_s, n_a, n_s * n_a))

        # NOTE: this loop reuses the name `i` of the repetition loop above.
        for i in range(n_s):
            for j in range(n_a):
                if j != V_n_max_index[i]:
                    # +1 at (i, j), -1 at (i, selected action): picks the
                    # difference between the two Q entries.
                    minus_op = np.zeros(n_s * n_a)
                    minus_op[i * n_a + j] = 1
                    minus_op[i * n_a + V_n_max_index[i]] = -1
                    denom_consts[i][j] = np.power(np.dot(minus_op, I_TM),
                                                  2) * np.diag(cov_V_D)
                    # Squared gap between Q(i, j) and Q(i, selected action).
                    quad_consts[i][j] = (Q_n[i * n_a + j] -
                                         Q_n[i * n_a + V_n_max_index[i]])**2

        A, b, G, h = two_stage_inference.construct_contrain_matrix(
            p_n, n_s, n_a)
        AA = np.array(A)
        bb = np.asarray(b)

        def fun(x):
            """Objective handed to SLSQP: the first decision variable."""
            return x[0]

        # `cons` and `eqcons` are not referenced anywhere in this block; the
        # constraints actually used are the lambdas built below.
        def cons(x, i, j):
            z = x[0]
            w = x[1:]
            return quad_consts[i][j] / (np.sum(
                np.multiply(denom_consts[i][j], np.reciprocal(w)))) - z

        def eqcons(x, a, b):
            return np.dot(a, x[1:]) - b

        # One inequality per non-selected (state, action) pair.
        # NOTE(review): the expression below ends in `+ x[0]`, whereas the
        # unused `cons` helper above subtracts z -- confirm the intended sign.
        constraints = []
        for i in range(n_s):
            for j in range(n_a):
                if j != V_n_max_index[i]:
                    constraints.append({
                        'type':
                        'ineq',
                        'fun':
                        lambda x, up_c, denom_c: up_c /
                        (np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))
                         ) + x[0],
                        'args': (quad_consts[i][j], denom_consts[i][j])
                    })

        # One equality per row of A: dot(A[i], x[1:]) == b[i].
        for i in range(AA.shape[0]):
            constraints.append({
                'type': 'eq',
                'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                'args': (AA[i], b[i])
            })
        constraints = tuple(constraints)
        # Every variable (including x[0]) is bounded below by 1e-5.
        bnds = []
        for i in range(n_s * n_a + 1):
            bnds.append((1e-5, None))
        bnds = tuple(bnds)
        initial = np.ones(n_s * n_a + 1) / (n_s * n_a)

        # `func_val` is defined but not called in this block.
        def func_val(x):
            """Return the smallest quad/(2 * sum(denom / x)) over all pairs."""
            vals = []
            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        vals.append(quad_consts[i][j] / (2 * np.sum(
                            np.multiply(denom_consts[i][j], np.reciprocal(x))))
                                    )
            z = np.min(vals)
            # print (z)
            # print (vals)
            return z

        # NOTE(review): the start value 0 for x[0] lies below its 1e-5 bound.
        initial[0] = 0
        # print(initial)
        t_1 = time.time()
        res = minimize(fun,
                       initial,
                       method='SLSQP',
                       bounds=bnds,
                       constraints=constraints)
        # Drop x[0]; the remaining entries are indexed by i * n_a + j.
        x_opt = res.x[1:]
        runnung_t = time.time() - t_1
        #print("optimization running time is {}".format(runnung_t))
        print("optimal stationary distribution")
        # Print, for each state, its block of x_opt rescaled to sum to one.
        for i in range(n_s):
            prob = x_opt[i * n_a:(i + 1) * n_a]
            prob = prob / np.sum(prob)
            print(prob)
        # NOTE(review): terminates the program here; nothing below this line
        # runs while this call is in place.
        exit()
        total_train, episode, train_data, action_model = train_collect_data(
            "Q-OCBA", 3000, opt=x_opt)
        #exit()

        ave_train = total_train / episode
        episodes_trained.append(episode)
        # test stage
        total_test = 0.
        env = gym.make('CartPole-v0')
        for episode in range(NUM_EPISODES):
            observation = env.reset()
            for iteration in range(MAX_ITERATIONS):
                # Act with the arg-max of the model's q-values.
                q_values = get_q(action_model, observation)
                action = np.argmax(q_values)
                observation, reward, done, info = env.step(action)
                if done:
                    # Only episodes that end within MAX_ITERATIONS add to
                    # the total.
                    total_test += iteration
                    #print 'Episode {}, iterations: {}'.format(episode,iteration)
                    break
        ave_test = total_test / NUM_EPISODES
        print("{}: average train and average test are {} and {}".format(
            "Q-OCBA", ave_train, ave_test))
        eva_train.append(ave_train)
        eva_test.append(ave_test)
    # Mean and standard deviation over repetitions.
    epi_train_mean = np.mean(episodes_trained)
    epi_train_std = np.std(episodes_trained)
    eva_train_mean = np.mean(eva_train)
    eva_train_std = np.std(eva_train)
    eva_test_mean = np.mean(eva_test)
    eva_test_std = np.std(eva_test)

    print(
        "rep: average train and average test are {} and {}, number of episodes trained mean  is {}"
        .format(eva_train_mean, eva_test_mean, epi_train_mean))
    print(
        "rep: average std train and average test are {} and {}, number of episodes trained std  is {}"
        .format(eva_train_std, eva_test_std, epi_train_std))
def main():
    """Run the sequential Q-OCBA experiment and a fixed-exploration baseline.

    A five-state, two-action chain model is built from the command-line
    options and solved to obtain the reference Q-values.  Each of the
    ``--rep`` repetitions then:

    1. warm-starts a ``parameter_prior`` object with a few samples collected
       with ``right_prop``;
    2. for every episode of ``--epi_step_num`` steps, solves an SLSQP problem
       for a sampling allocation ``x_opt`` from the current estimates,
       collects data with ``pi_s_a=x_opt`` and updates the estimates;
    3. evaluates the final estimated policy on the true model.

    The same evaluation is afterwards repeated for a baseline that collects
    all ``--numdata`` samples with the fixed ``right_prop``.  For both
    methods the correct-selection rate (as judged by
    ``optimize_pfs.FS_bool``), the average policy value and 1.96-sigma
    interval half-widths are printed.

    Helpers such as ``parameter_prior``, ``collect_data_swimmer``,
    ``Iterative_Cal_Q``, ``inference``, ``two_stage_inference``,
    ``optimize_pfs`` and ``policy_val_iteration`` are expected at module
    level.
    """

    def _str2bool(text):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including
        "False", into ``True``.  Here the usual false spellings map to
        ``False`` and any other text maps to ``True``.
        """
        return text.strip().lower() not in ("", "0", "false", "f", "no",
                                            "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--r_prior', nargs="?", type=float, default=0.0, help='prior value of reward function')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2,
                        help='lower bound on each optimized sampling proportion')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--epi_step_num', nargs="?", type=int, default=100, help='number of episode steps')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--opt_ori', nargs="?", type=_str2bool, default=False,
                        help='Q-OCBA optimization method')
    parser.add_argument('--num_value_iter', nargs="?", type=int, default=200, help='number of value iteration')
    parser.add_argument('--opt_one_step', nargs="?", type=_str2bool, default=False,
                        help='Q-OCBA optimization running only one step')

    args = parser.parse_args()
    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2
    # collect data configuration
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r; p is flat with p[s * n_s * n_a + a * n_s + s_next] and
    # r is flat with r[s * n_a + a]
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # State 0: action 0 stays; action 1 stays (0.7) or moves to state 1 (0.3).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 steps back; action 1 spreads over i-1, i, i+1.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 steps back; action 1 steps back (0.7) or stays.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)

    Total_data = args.numdata
    print("total num of data is {}".format(Total_data))
    episode_steps = args.epi_step_num
    numdata_1 = 5
    print("warm start steps is {}".format(numdata_1))
    numdata_2 = Total_data
    print("epsisode timestep is {}".format(episode_steps))
    # Floor division: a list can only be repeated an integer number of
    # times.  Samples beyond the last full episode are not collected.
    num_datas = [episode_steps] * (numdata_2 // episode_steps)
    CS_num = 0.
    future_V = np.zeros(num_rep)
    Total_time = []
    # if use Bayesian prior as exploration
    Bayes_resample = False

    for ii in range(num_rep):
        time_rep = time.time()
        # Warm start: a short batch with the fixed exploration probability.
        para_cl = parameter_prior(n_s,n_a, s_0, r_mean_prior =  r_prior_mean)
        data =  collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=right_prop,  std = r_sd)
        para_cl.update(data, resample = Bayes_resample)
        p_n, r_n, r_std = para_cl.get_para( resample = Bayes_resample)
        var_r_n = r_std **2

        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter , gamma, n_s, n_a)
        V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        for jj, num_data in enumerate(num_datas):
            # Constants of the allocation problem from the current estimates.
            TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
            I = np.identity(n_s * n_a)
            I_TM = np.linalg.inv(I - gamma * TM)
            V = np.diag(var_r_n)
            ds = []
            for i in range(n_s):
                for j in range(n_a):
                    # Transition row of the pair (i, j) in the flat p_n.
                    p_sa = p_n[(i * n_a * n_s + j * n_s): (i * n_a * n_s + (j + 1) * n_s)]
                    ds.append(inference.cal_cov_p_quad_V(p_sa, V_n, n_s))
            D = np.diag(ds)
            cov_V_D = V + D
            quad_consts = np.zeros((n_s, n_a))
            denom_consts = np.zeros((n_s, n_a, n_s * n_a))

            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        # +1 at (i, j), -1 at (i, selected action).
                        minus_op = np.zeros(n_s * n_a)
                        minus_op[i * n_a + j] = 1
                        minus_op[i * n_a + V_n_max_index[i]] = -1
                        denom_consts[i][j] = np.power(np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                        quad_consts[i][j] = (Q_n[i * n_a + j] - Q_n[i * n_a + V_n_max_index[i]]) ** 2

            A, b, G, h = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
            AA = np.array(A)

            # Decision vector x = [z, w]; w = x[1:] is the allocation.
            # Original form: maximize z subject to quad / sum(denom / w) >= z.
            # Alternative form: minimize z subject to z >= sum(denom / w) / quad.
            if opt_ori:
                def fun(x):
                    return -x[0]
            else:
                def fun(x):
                    return x[0]
            constraints = []
            if opt_ori:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            # Pairs with negligible denominators are skipped.
                            if np.max(denom_consts[i][j]) > 1e-5:
                                constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: up_c / (
                                    np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                                                    'args': (quad_consts[i][j], denom_consts[i][j])})
            else:
                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            # Pairs with a negligible squared gap are skipped
                            # (they would be divided by below).
                            if np.max(quad_consts[i][j]) > 1e-5:
                                constraints.append({'type': 'ineq', 'fun': lambda x, up_c, denom_c: -(
                                    np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0],
                                                    'args': (quad_consts[i][j], denom_consts[i][j])})

            # One equality per row of A: dot(A[i], w) == b[i].
            for i in range(AA.shape[0]):
                constraints.append(
                    {'type': 'eq', 'fun': lambda x, a, b: np.dot(a, x[1:]) - b, 'args': (AA[i], b[i])})
            constraints = tuple(constraints)
            # z >= 0; every allocation entry lies in [optLb, 1].
            bnds = []
            bnds.append((0., None))
            for i in range(n_s * n_a):
                bnds.append((optLb, 1))

            bnds = tuple(bnds)
            initial = np.ones(n_s * n_a + 1) / (n_s * n_a)

            initial[0] = 0.1
            if args.opt_one_step:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints, options = {'disp':False, 'maxiter':1})
            else:
                res = minimize(fun, initial, method='SLSQP', bounds=bnds,
                               constraints=constraints)
            x_opt = res.x[1:]

            # Continue sampling from para_cl.s (presumably the state the
            # previous batch ended in -- confirm against parameter_prior).
            data = collect_data_swimmer.collect_data(p, r, num_data, para_cl.s, n_s, n_a, pi_s_a=x_opt,  std = r_sd)
            para_cl.update(data, resample = Bayes_resample)
            # freq is only consumed by diagnostics that are currently
            # disabled; the call itself is kept unchanged.
            _, _, freq, _ = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)

            p_n, r_n, r_std = para_cl.get_para(resample = Bayes_resample)
            var_r_n = r_std ** 2
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, args.num_value_iter, gamma, n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
        Total_time.append(time.time() - time_rep)
        # Value of the final estimated policy on the true model.
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[ii] = np.dot(rou, V_here)
        fS_bool = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num += fS_bool
    # Summary statistics are computed once, after all repetitions.
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # Built-in float: the np.float alias no longer exists in current NumPy.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    print("Seq_Q_OCBA")
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
        num_rep), rv, diff))
    runnung_time_mean = np.mean(Total_time)
    runnung_time_CI  = 1.96 * np.std(Total_time)/ np.sqrt(num_rep)
    print("average running time of Seq QOCBA is {} with CI length {}".format(runnung_time_mean, runnung_time_CI))

    # follow original: all samples collected with the fixed right_prop
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        data = collect_data_swimmer.collect_data(p, r, Total_data, s_0, n_s, n_a, right_prop=right_prop)
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        fS_bool_ = optimize_pfs.FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num_naive += fS_bool_
    PCS_naive = float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("follow original")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
        num_rep), rv, diff))
def main():
    """Compare an optimized sampling allocation with fixed exploration rules.

    Builds a five-state, two-action chain model, solves it for the reference
    Q-values, and sets up an SLSQP problem over ``x = [z, w]`` from the true
    model.  The value of ``func_val`` at the optimized allocation ``x[1:]``
    is printed together with ``1 - exp(-n * value)`` for three sample sizes
    ``n``; the same two quantities are then printed for the stationary
    distribution produced by each exploration probability in
    ``right_props``.
    """
    initial_s_dist = "even"
    Q_approximation = None
    right_props = [0.4, 0.6, 0.7, 0.8, 0.99]
    # collect data configuration
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    # p is flat with p[s * n_s * n_a + a * n_s + s_next]; r is flat with
    # r[s * n_a + a].
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    r = np.zeros(n_s * n_a)
    r[0] = 1
    r[-1] = 10.
    # State 0: action 0 stays; action 1 stays (0.7) or moves to state 1 (0.3).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 steps back; action 1 spreads over i-1, i, i+1.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 steps back; action 1 steps back (0.7) or stays.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    print("Q real is {}".format(Q_real))
    # R_real is not used further in this block.
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
    # Per (state, action) constants of the constraints; entries for the
    # selected action of each state stay zero.
    quad_consts = np.zeros((n_s, n_a))
    denom_consts = np.zeros((n_s, n_a, n_s * n_a))
    # All-ones frequencies and zero reward variances are passed to the helper.
    f_n = np.ones(n_s * n_a)
    var_r_n = np.zeros(n_s * n_a)
    I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
        p, f_n, var_r_n, V_real, gamma, n_s, n_a, V_max_index)
    for i in range(n_s):
        for j in range(n_a):
            if j != V_max_index[i]:
                # +1 at (i, j), -1 at (i, selected action): picks the
                # difference between the two Q entries.
                minus_op = np.zeros(n_s * n_a)
                minus_op[i * n_a + j] = 1
                minus_op[i * n_a + V_max_index[i]] = -1
                denom_consts[i][j] = np.power(np.dot(minus_op, I_TM),
                                              2) * np.diag(cov_V_D)
                # Squared gap between Q(i, j) and Q(i, selected action).
                quad_consts[i][j] = (Q_real[i * n_a + j] -
                                     Q_real[i * n_a + V_max_index[i]])**2

    A, b, G, h = two_stage_inference.construct_contrain_matrix(p, n_s, n_a)
    AA = np.array(A)

    # NOTE(review): the objective minimizes x[0] while the inequalities below
    # only bound x[0] from above -- confirm that `-x[0]` was not intended.
    def fun(x):
        """Objective handed to SLSQP: the first decision variable."""
        return x[0]

    # One inequality per non-selected pair:
    # quad / sum(denom / x[1:]) - x[0] >= 0.
    constraints = []
    for i in range(n_s):
        for j in range(n_a):
            if j != V_max_index[i]:
                constraints.append({
                    'type':
                    'ineq',
                    'fun':
                    lambda x, up_c, denom_c: up_c / (np.sum(
                        np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0],
                    'args': (quad_consts[i][j], denom_consts[i][j])
                })

    # One equality per row of A: dot(A[i], x[1:]) == b[i].
    for i in range(AA.shape[0]):
        constraints.append({
            'type': 'eq',
            'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
            'args': (AA[i], b[i])
        })
    constraints = tuple(constraints)
    # Every variable (including x[0]) is bounded below by 1e-6.
    bnds = []
    for i in range(n_s * n_a + 1):
        bnds.append((0.000001, None))
    bnds = tuple(bnds)
    initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
    initial[0] = 1
    # print(initial)
    res = minimize(fun,
                   initial,
                   method='SLSQP',
                   bounds=bnds,
                   constraints=constraints)
    # Drop x[0]; the remaining entries are indexed by i * n_a + j.
    x_opt = res.x[1:]
    print(x_opt)

    def func_val(x):
        """Return the smallest quad/(2 * sum(denom / x)) over all pairs."""
        vals = []
        for i in range(n_s):
            for j in range(n_a):
                if j != V_max_index[i]:
                    vals.append(quad_consts[i][j] / (2 * np.sum(
                        np.multiply(denom_consts[i][j], np.reciprocal(x)))))
        z = np.min(vals)
        # print (z)
        # print (vals)
        return z

    opt_val = func_val(x_opt)
    print(opt_val)
    # Sample sizes at which 1 - exp(-n * value) is reported.
    n = np.array([10000, 50000, 100000])
    print(1 - np.exp(-n * opt_val))
    for right_prop in right_props:
        transition_p = compare_var.transition_mat_S_A(p, right_prop, n_s, n_a)
        # NOTE(review): f_n is passed to func_val as returned; confirm it is
        # a flat vector of length n_s * n_a.
        f_n = compare_var.solveStationary(transition_p)
        print(right_prop)
        #print(f_n)
        bench_val = func_val(f_n)
        print(bench_val)
        print(1 - np.exp(-n * bench_val))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='value of r0')
    parser.add_argument('--optLb',
                        nargs="?",
                        type=float,
                        default=1e-2,
                        help='value of r0')
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--opt_ori',
                        nargs="?",
                        type=bool,
                        default=False,
                        help='Q-OCBA optimization method')
    args = parser.parse_args()
    opt_ori = args.opt_ori
    print("Q-OCBA optimization method using original formulation is {}".format(
        opt_ori))

    two_stage_opt_bool = True
    print("two_stage_opt_bool is {}".format(two_stage_opt_bool))
    two_stage_eps_greedy_bool = True
    print("two_stage_eps_greedy_bool is {}".format(two_stage_eps_greedy_bool))
    num_rep = args.rep
    initial_s_dist = "even"
    Q_approximation = None
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2
    # collect data configuration
    num_data = args.numdata
    num_data_1 = num_data * 3 / 10
    num_data_2 = num_data * 7 / 10
    print(
        "num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}"
        .format(num_data_1, num_data_2, right_prop))
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    # r[0] = 10.
    # r[-1] = 0.1
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    # print("Q real is {}".format(Q_real))
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
    initial_w = np.ones(n_s) / n_s
    if two_stage_opt_bool or two_stage_eps_greedy_bool:
        Q_ns = []
        x_opts = []
        counts = []
        data1s = []
        PCS_first_stage = 0.
        for i in range(num_rep):
            count = 0
            while True:
                count += 1
                data1 = collect_data_swimmer.collect_data(
                    p,
                    r,
                    num_data_1,
                    s_0,
                    n_s,
                    n_a,
                    right_prop=right_prop,
                    std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                    data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                                n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                # print("first stage visiting frequency is {}".format(f_n))
                if f_n.all() != 0:
                    break
            counts.append(count)
            data1s.append(data1)
            PCS_first_stage += functools.reduce(
                lambda i, j: i and j,
                map(lambda i, j: i == j, V_max_index, V_n_max_index), True)
            Q_ns.append(Q_n)
            # print("first stage trial = {}".format(count))
            # print("real V_max_index vs estimated V_max_index after first stage is {} and {}".format(V_max_index, V_n_max_index))
            # print(Q_n)
            # test
            # p_n = p
            # V_n = V_real
            # V_n_max_index = V_max_index
            I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
                p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index)
            # test  covariance
            # cov_V_D = np.diag(np.ones(n_s * n_a))
            # print("first stage stationary dist is {}".format(f_n))
            # print("real Q is {}".format(Q_real))
            # print("Q_n estiamte is {}".format(Q_n))
            # Q_n = Q_real

            if two_stage_opt_bool:
                quad_consts = np.zeros((n_s, n_a))
                denom_consts = np.zeros((n_s, n_a, n_s * n_a))

                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            minus_op = np.zeros(n_s * n_a)
                            minus_op[i * n_a + j] = 1
                            minus_op[i * n_a + V_n_max_index[i]] = -1
                            c1 = np.power(np.dot(minus_op, I_TM), 2)
                            denom_consts[i][j] = c1 * np.diag(cov_V_D)
                            # print(I_TM, c1)
                            # exit()
                            quad_consts[i][j] = (
                                Q_n[i * n_a + j] -
                                Q_n[i * n_a + V_n_max_index[i]])**2

                A, b, G, h = two_stage_inference.construct_contrain_matrix(
                    p_n, n_s, n_a)
                AA = np.array(A)

                # bb = np.asarray(b)
                def fun(x):
                    return -x[0]

                """
                def cons(x, i,j):
                    z = x[0]
                    w = x[1:]
                    return  quad_consts[i][j] / (np.sum(np.multiply(denom_consts[i][j], np.reciprocal(w)))) -z

                def eqcons(x,a, b):
                    return np.dot(a,x[1:]) -b
                """
                # print("quardratic coeff of opt is {}".format(quad_consts))
                # print("denom consts coef of opt is {}".format(denom_consts))

                constraints = []
                for i in range(n_s):
                    for j in range(n_a):
                        constraints.append({
                            'type':
                            'ineq',
                            'fun':
                            lambda x, ii, jj: x[1 + ii * n_a + jj] - x[0],
                            'args': (i, j)
                        })

                for i in range(AA.shape[0]):
                    constraints.append({
                        'type':
                        'eq',
                        'fun':
                        lambda x, a, b: np.dot(a, x[1:]) - b,
                        'args': (AA[i], b[i])
                    })
                constraints = tuple(constraints)
                bnds = []
                bnds.append((0., None))
                for i in range(n_s * n_a):
                    bnds.append((optLb, 1))
                bnds = tuple(bnds)
                initial = np.ones(n_s * n_a + 1) / (n_s * n_a)

                initial[0] = 0.1
                # print(initial)
                t_1 = time.time()
                # print("number of equality constraints is {}".format(len(A)))
                res = minimize(fun,
                               initial,
                               method='SLSQP',
                               bounds=bnds,
                               constraints=constraints)
                x_opt = res.x[1:]
                runnung_t = time.time() - t_1

                def func_val(x):
                    vals = []
                    for i in range(n_s):
                        for j in range(n_a):
                            if j != V_n_max_index[i]:
                                vals.append(quad_consts[i][j] / (2 * np.sum(
                                    np.multiply(denom_consts[i][j],
                                                np.reciprocal(x)))))
                    z = np.min(vals)
                    # print (z)
                    # print (vals)
                    # z = 1
                    return z

                # print("optimization running time is {}".format(runnung_t))

                # ec = np.dot(AA, x_opt) - b
                # print("last equality constraint coeff is {}, {}".format(AA[-1], b[-1]))
                # print("verify equality constraints, equality residual is {}".format(ec))

                # opt_val = func_val(x_opt)
                # print(f_n)

                epsilon = 0.3
                tran_M = transition_mat_S_A_epsilon(p_n, epsilon,
                                                    V_n_max_index, n_s, n_a)
                bench_w = compare_var.solveStationary(tran_M)
                bench_w = np.array(bench_w).reshape(-1, )
                # print(bench_w)
                # bench_val_1=  func_val(bench_w)
                # bench_val_2 =  func_val(f_n)
                #print("optimal exploration policy has stationary dist {} with sum {}".format(x_opt, np.sum(x_opt)))
                #print("optimal value is {}".format(res.x[0]))
                #print("optimal value with optimal solution is {} ".format(opt_val))
                # print("benchmark objective value is {} and {}".format(bench_val_1, bench_val_2))
            # exit()
            x_opts.append(x_opt)
        mean_count = np.mean(counts)
        std_count = np.std(counts)
        print(
            "first stage average # of trials is {} with CI length  {}".format(
                mean_count, 1.96 * std_count / np.sqrt(num_rep)))
        PFS_first_stage = 1 - PCS_first_stage / num_rep
        print("PFS after first stage is {} ".format(PFS_first_stage))
    """
    w = cp.Variable(n_s * n_a)
    #z = cp.Variable(1)
    rate = w[0*n_a + 0]
    for i in range(n_s):
        for j in range(n_a):
            if j!= V_n_max_index[i]:
                #rates.append(quad_consts[i][j] * cp.inv_pos(cp.sum(cp.multiply(denom_consts[i][j], cp.inv_pos(w)))))
                rate = cp.min(rate, w[i*n_a + j])
    #rates = np.array(rates)
    problem = cp.Problem(cp.Maximize(rate), [AA * w == bb, w >= 0])
    problem.solve()
    # Print result.
    print("\nThe optimal value is", problem.value)
    print("A solution w is")
    print(w.value)
    exit()
    """
    CS_num_naive = 0
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        x_opt = x_opts[i]
        if two_stage_opt_bool:
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data_2,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     pi_s_a=x_opt)
        else:
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data_2,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop)
        data = data + data1s[i]
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            data, n_s, n_a)
        Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s,
                                        n_a)
        # print(Q_n)
        V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p,
                                      gamma)
        # print(V_here, V_real)
        future_V[i] = np.dot(rou, V_here)
        FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
        CS_num_naive += FS_bool_
        # if not FS_bool_:
        # print(i)
        # print(f_n)
        # print(Q_n)
    PCS_naive = np.float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # print(CS_num_naive)
    print("Exploration_for pure exploration:")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print(
        "future value func is {} with CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
Example #9
0
def main():
    """Run the PSPE experiment on the chain MDP and print summary statistics.

    Parses the command-line options, builds the flattened transition vector
    ``p`` (indexed as ``s * n_s * n_a + a * n_s + s_next``) and the reward
    vector ``r`` (indexed as ``s * n_a + a``) for ``n_s = 5`` states and
    ``n_a = 2`` actions, and computes the reference ``Q_real`` / ``V_real``.

    Each of the ``--rep`` repetitions then:

    1. collects first-stage data until every entry of ``f_n`` is non-zero;
    2. updates a ``parameter_prior`` object with it and draws a sampled Q;
    3. for every batch size in ``num_datas`` collects data with the current
       sampled Q, updates the prior and draws a new sample; with probability
       ``beta`` that sample is kept, otherwise samples are redrawn until
       their arg-max indices differ from it;
    4. scores the final Q with ``optimize_pfs.FS_bool`` and evaluates it with
       ``optimize_pfs.policy_val_iteration``.

    Finally the averaged score (printed as "PCS") and the mean value under
    the uniform state weighting ``rou`` are printed together with
    ``1.96 * std / sqrt(num_rep)``-style interval lengths.
    """

    def _str2bool(text):
        """Interpret a command-line string as a boolean.

        ``type=bool`` maps every non-empty string (including "False") to
        True, so the flag could not be switched off by name.  Anything that
        is not a recognised "false" spelling still counts as True.
        """
        return str(text).strip().lower() not in ('', '0', 'false', 'f', 'no',
                                                 'n', 'off')

    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=0.0,
                        help='value of r0')
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--epi_step_num',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode steps')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--beta',
                        nargs="?",
                        type=float,
                        default=0.25,
                        help='beta')
    parser.add_argument('--two_stage',
                        nargs="?",
                        type=_str2bool,
                        default=True,
                        help='if run two stage or sequential experiment')
    args = parser.parse_args()
    print("PSPE")
    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    right_prop = args.rightprop

    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # First state: action 0 stays put, action 1 stays (0.7) or advances (0.3).
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 steps back, action 1 moves back/stays/advances.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 steps back, action 1 steps back (0.7) or stays (0.3).
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    s_0 = 2

    ## PSPE
    # Batch sizes are sample counts, so floor division keeps them integral
    # (true division would produce floats and break the list repetition).
    if not args.two_stage:
        print("sequential implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        episode_steps = args.epi_step_num
        numdata_1 = episode_steps
        numdata_2 = Total_data - numdata_1
        print("epsisode timestep is {}".format(episode_steps))
        num_datas = [episode_steps] * (numdata_2 // episode_steps)
    else:
        print("two_stage implementation")
        Total_data = args.numdata
        print("total num of data is {}".format(Total_data))
        numdata_1 = Total_data * 3 // 10
        numdata_2 = Total_data - numdata_1
        episodes = 100
        num_datas = [numdata_2 // episodes] * episodes

    CS_num = 0.
    beta = args.beta
    rou = np.ones(n_s) / n_s
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        para_cl = parameter_prior(n_s, n_a, s_0)
        # Redraw the first-stage sample until every entry of f_n is non-zero.
        while True:
            data1 = collect_data_swimmer.collect_data(p,
                                                      r,
                                                      numdata_1,
                                                      s_0,
                                                      n_s,
                                                      n_a,
                                                      right_prop=right_prop,
                                                      std=r_std)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data1, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                            n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            if f_n.all():
                break
        para_cl.update(data1, r_sigma=r_std)
        Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

        for num_data in num_datas:
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data,
                                                     para_cl.s_0,
                                                     n_s,
                                                     n_a,
                                                     Q=Q_estimate,
                                                     epsilon=0,
                                                     std=r_std)
            para_cl.update(data, r_sigma=r_std)
            Q_estimate_1 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            V_n_1, V_n_max_index_1 = inference.get_V_from_Q(
                Q_estimate_1, n_s, n_a)
            sim = np.random.binomial(1, beta, 1)[0]
            if sim:
                Q_estimate = Q_estimate_1
            else:
                # Resample until the arg-max indices differ from sample 1.
                # NOTE(review): this comparison assumes get_V_from_Q returns
                # a plain list; a NumPy array here would raise on `!=` --
                # confirm against inference.get_V_from_Q.
                while True:
                    Q_estimate_2 = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                    V_n_2, V_n_max_index_2 = inference.get_V_from_Q(
                        Q_estimate_2, n_s, n_a)
                    if V_n_max_index_2 != V_n_max_index_1:
                        break
                Q_estimate = Q_estimate_2
        V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0,
                                                   num_iter, r, p, gamma)
        future_V[i] = np.dot(rou, V_here)
        FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
        CS_num += FS_bool
    # Builtin float: the np.float alias no longer exists in current NumPy.
    PCS = float(CS_num) / num_rep
    CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    print("PCS is {}, with CI length {}".format(PCS, CI_len))
    print(
        "future value func is {} with  CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
Example #10
0
def main():
    """Run the UCRL experiment for r[0] in {1, 2, 3} and print statistics.

    For each reward setting the flattened transition vector ``p`` (indexed
    as ``s * n_s * n_a + a * n_s + s_next``) is filled in and the reference
    ``Q_real`` / ``V_real`` are computed.  Each of the ``--rep`` repetitions:

    1. collects first-stage data (30% of ``--numdata``) until every entry of
       ``f_n`` is non-zero;
    2. builds a ``UCRL`` object from the pre-collected statistics and runs
       its update / extended value iteration / data collection cycle until
       ``UCRL_cl.t`` reaches ``num_data``;
    3. scores the resulting Q with ``optimize_pfs.FS_bool`` and
       ``optimize_pfs.policy_val_iteration``;
    4. computes intervals with ``inference.get_CI`` on all collected data
       and records whether the reference Q / V / R fall inside them.

    Per reward setting it prints the averaged score ("PCS"), the mean value
    under the uniform weighting ``rou``, the coverage rates and the mean
    interval lengths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward ')

    args = parser.parse_args()
    # NOTE(review): delta is not referenced below; the UCRL constructor gets
    # the literal 0.05 instead -- presumably the same quantity, confirm.
    num_iter, gamma, n_s, n_a, delta,  num_rep = 200, 0.95, 5, 2, 0.05, args.rep
    right_prop = args.rightprop
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    p = np.zeros(n_s * n_a * n_s)
    r = np.zeros(n_s * n_a)
    for r0_val in range(1, 4):
        r[0] = float(r0_val)
        r[-1] = 10.
        r_std  = args.rstd
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
        # First state: action 0 stays put, action 1 stays or advances.
        p[0 * n_s * n_a + 0 * n_s + 0] = 1.
        p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
        p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
        # Interior states: action 0 steps back, action 1 spreads over 3 states.
        for i in range(1, (n_s - 1)):
            p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
            p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
            p[i * n_a * n_s + 1 * n_s + i] = 0.6
            p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
        # Last state: action 0 steps back, action 1 steps back or stays.
        p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
        p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))
        s_0 = 2
        rou = np.ones(n_s) / n_s

        Q_approximation = None
        initial_s_dist = "even"
        if initial_s_dist == "even":
            R_real = np.mean(V_real)
            initial_w = np.ones(n_s) / n_s
        # Per-setting accumulators: coverage counts and interval lengths.
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## UCRL
        CS_num = 0.
        num_data = args.numdata
        # Sample counts must stay integral, hence floor division.
        num_1 = num_data * 3 // 10
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            # Redraw the first-stage sample until every entry of f_n is non-zero.
            while True:
                data1 = collect_data_swimmer.collect_data(p, r, num_1, s_0, n_s, n_a, right_prop=right_prop,  std = r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                if f_n.all():
                    break
            pre_collected_stats = get_pre_collected_stats(data1, n_s, n_a)
            UCRL_cl = UCRL(n_s, n_a, 0.05, num_1, s_0, num_data, pre_collected_stats)
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p,r, r_std = r_std)
            UCRL_cl.update_point_estimate_and_CIbound()
            Q_estimate =  Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0, UCRL_cl.rew, num_iter , gamma, n_s, n_a)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter, r, p, gamma)
            future_V[i] = np.dot(rou, V_here)
            # Intervals are computed on first-stage plus UCRL-collected data.
            datahere = data1 + UCRL_cl.datas
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(Q_approximation, S_0, num_data, s_0,
                                                                           num_iter, gamma,
                                                                           Q_0, n_s, n_a, r, p, initial_w, right_prop,
                                                                           data=datahere)
            # Covered when the reference lies within estimate +/- length,
            # widened by a small numerical tolerance.
            cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                        Q_real >= (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                        V_real >= (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                        R_real >= (R_n - CI_len_R - numerical_tol))
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)
        # Builtin float: the np.float alias no longer exists in current NumPy.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print("future value func is {} with  CI length {}, real value is {}, diff is {}".format(fv, 1.96 * fv_std / np.sqrt(
            num_rep), rv, diff))

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
def main():
    """Optimise a second-stage sampling distribution and report CI coverage.

    Steps performed:

    1. Build the flattened transition vector ``p`` (indexed as
       ``s * n_s * n_a + a * n_s + s_next``) and reward vector ``r`` for
       ``n_s = 5`` states and ``n_a = 2`` actions, and compute ``Q_real``.
    2. Run ``stage_1_estimation`` on 30% of ``num_data`` and obtain the
       covariance components from ``inference.get_Sigma_n_comp``.
    3. Minimise ``F(x) = sum(c / x)`` with SLSQP, subject to the equality
       constraints from ``construct_contrain_matrix`` and a lower bound of
       0.001 per coordinate, where ``c`` is non-zero only at each state's
       estimated arg-max action.
    4. Print the optimum next to the value of ``F`` at the stationary
       distribution of an epsilon = 0.3 benchmark transition matrix.
    5. Print coverage statistics from ``inference.cal_coverage`` for the
       optimised distribution and for epsilon in {0.2, 0.3} runs based on
       the first-stage ``Q_n``.

    Exits early (via ``exit()``) when some entry of the first-stage ``f_n``
    is zero.
    """

    def _report_coverage(stats):
        """Print coverage rates and interval lengths from a cal_coverage result."""
        (cov_rate_Q, cov_rate_V, cov_rate_R, CI_len_Q_mean, CI_len_V_mean,
         CI_len_R_mean, CI_len_Q_ci, CI_len_V_ci, CI_len_R_ci) = stats
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean,
                                                     CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean,
                                                     CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean,
                                                     CI_len_R_ci))

    num_rep = 10
    initial_s_dist = "even"
    Q_approximation = None
    right_prop = 0.8
    s_0 = 2
    # collect data configuration; sample counts must stay integral, hence
    # floor division for the 30% / 70% split.
    num_data = 10000
    num_data_1 = num_data * 3 // 10
    num_data_2 = num_data * 7 // 10
    print(
        "num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}"
        .format(num_data_1, num_data_2, right_prop))
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    r = np.zeros(n_s * n_a)
    r[0] = 2.
    r[-1] = 10.
    print(r)
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    # First state: action 0 stays put, action 1 stays or advances.
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    # Interior states: action 0 steps back, action 1 spreads over 3 states.
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    # Last state: action 0 steps back, action 1 steps back or stays.
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, _ = inference.get_V_from_Q(Q_real, n_s, n_a)
    print("Q real is {}".format(Q_real))
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
    initial_w = np.ones(n_s) / n_s
    p_n, r_n, f_n, var_r_n, Q_n, V_n, V_n_max_index, R_n = stage_1_estimation(
        p, r, num_data_1, s_0, n_s, n_a, Q_0, right_prop, num_iter, gamma,
        initial_w)
    I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
        p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index)

    # Per-state coefficient: squared weight times the covariance diagonal.
    quad_con_vec = np.power(np.dot(initial_w, I_TM_V), 2) * np.diag(cov_V_V_D)
    if not np.all(f_n):
        print(f_n)
        print(quad_con_vec)
        print("need more data for first stage")
        exit()
    # Scatter the per-state coefficients onto each state's arg-max action;
    # every other (state, action) slot keeps a zero coefficient.
    quad_con_vec_all = np.zeros(n_s * n_a)
    for i in range(n_s):
        quad_con_vec_all[i * n_a + V_n_max_index[i]] = quad_con_vec[i]

    quad_con_vec_all = matrix(quad_con_vec_all)
    array_quad_con_vec = np.array(quad_con_vec_all).transpose()[0]

    def F(x):
        """Objective: sum of the coefficients divided element-wise by x."""
        return np.sum(np.multiply(array_quad_con_vec, np.divide(1, x)))

    A, b, G, h = construct_contrain_matrix(p_n, n_s, n_a)
    AA = np.array(A)
    bb = np.asarray(b)

    # One equality constraint per row of A; the row and right-hand side are
    # passed through 'args' so each constraint sees its own values.
    constraints = []
    for i in range(AA.shape[0]):
        constraints.append({
            'type': 'eq',
            'fun': lambda x, a, b: np.dot(a, x) - b,
            'args': (AA[i], bb[i])
        })
    constraints = tuple(constraints)
    # Lower bound keeps every coordinate away from zero, where F blows up.
    bnds = tuple((0.001, None) for _ in range(n_s * n_a))
    initial = np.ones(n_s * n_a) / (n_s * n_a)
    res = minimize(F,
                   initial,
                   method='SLSQP',
                   bounds=bnds,
                   constraints=constraints)
    x_opt = res.x
    print(x_opt)

    opt_val = F(x_opt)
    print(opt_val)
    # Benchmark: objective value at the stationary distribution of the
    # epsilon = 0.3 transition matrix.
    epsilon = 0.3
    tran_M = optimize_pfs.transition_mat_S_A_epsilon(p_n, epsilon,
                                                     V_n_max_index, n_s, n_a)
    bench_w = compare_var.solveStationary(tran_M)
    bench_w = np.array(bench_w).reshape(-1, )
    print(bench_w)
    bench_val = F(bench_w)
    print(bench_val)

    # Coverage when second-stage data follow the optimised distribution.
    _report_coverage(
        inference.cal_coverage(Q_approximation, num_data_2, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, right_prop,
                               initial_s_dist=initial_s_dist, num_rep=num_rep, pi_s_a=x_opt))

    # Coverage for the epsilon-based runs driven by the first-stage Q_n.
    epsilons = [0.2, 0.3]
    for epsilon in epsilons:
        print("epsilon is {}".format(epsilon))
        _report_coverage(
            inference.cal_coverage(Q_approximation, num_data_2, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p, right_prop,
                                   initial_s_dist=initial_s_dist, num_rep=num_rep, Q=Q_n, epsilon=epsilon))
def _chain_transitions(n_s, n_a):
    """Build the true transition probabilities of the chain MDP used by ``main``.

    The result is a flat vector indexed as ``s * n_a * n_s + a * n_s + s_next``.
    Action 0 moves one state to the left with probability 1 (state 0 stays
    where it is).  Action 1 moves left / stays / moves right with
    probabilities 0.1 / 0.6 / 0.3 in interior states; the two end states use
    the special cases written out below.
    """
    p = np.zeros((n_s, n_a, n_s))
    p[0, 0, 0] = 1.
    p[0, 1, 0] = 0.7
    p[0, 1, 1] = 0.3
    for s in range(1, n_s - 1):
        p[s, 0, s - 1] = 1
        p[s, 1, s - 1] = 0.1
        p[s, 1, s] = 0.6
        p[s, 1, s + 1] = 0.3
    p[n_s - 1, 0, n_s - 2] = 1
    p[n_s - 1, 1, n_s - 2] = 0.7
    p[n_s - 1, 1, n_s - 1] = 0.3
    # C-order flattening reproduces the s * n_a * n_s + a * n_s + s_next layout.
    return p.ravel()


class _CoverageTracker(object):
    """Accumulate CI coverage indicators and CI lengths over replications."""

    def __init__(self, n_s, n_a, tol):
        # Per-entry hit counters for Q (n_s * n_a entries) and V (n_s entries),
        # and a scalar hit counter for R.
        self.hits_Q = np.zeros(n_s * n_a)
        self.hits_V = np.zeros(n_s)
        self.hits_R = 0.
        self.lens_Q = []
        self.lens_V = []
        self.lens_R = []
        # Slack added on both sides of every interval before testing coverage.
        self.tol = tol

    def _covered(self, truth, estimate, ci_len):
        """Return whether ``truth`` lies in ``estimate +/- (ci_len + tol)``."""
        return np.logical_and(truth <= (estimate + ci_len + self.tol),
                              truth >= (estimate - ci_len - self.tol))

    def record(self, Q_real, V_real, R_real, ci):
        """Add one replication.

        ``ci`` is the 6-tuple returned by ``inference.get_CI``:
        ``(Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R)``.
        """
        Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = ci
        self.hits_Q += self._covered(Q_real, Q_n, CI_len_Q)
        self.hits_V += self._covered(V_real, V_n, CI_len_V)
        self.hits_R += self._covered(R_real, R_n, CI_len_R)
        self.lens_Q.append(CI_len_Q)
        self.lens_V.append(CI_len_V)
        self.lens_R.append(CI_len_R)

    def summarize(self, num_rep, verbose=True):
        """Compute coverage rates and CI-length statistics; optionally print.

        Returns the Q coverage rate averaged over all state-action entries.
        The "+/-" figures are 1.96 standard errors over ``num_rep``
        replications.
        """
        CI_len_Q_mean = np.mean(self.lens_Q)
        CI_len_V_mean = np.mean(self.lens_V)
        CI_len_R_mean = np.mean(self.lens_R)
        CI_len_Q_ci = 1.96 * np.std(self.lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(self.lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(self.lens_R) / np.sqrt(num_rep)

        cov_rate_Q = np.mean(np.divide(self.hits_Q, num_rep))
        cov_rate_V = np.mean(np.divide(self.hits_V, num_rep))
        cov_rate_R = np.divide(self.hits_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)

        if verbose:
            print("mean coverage for Q ")
            print(cov_rate_Q)
            print(cov_rate_CI_Q)
            print("mean coverage for V")
            print(cov_rate_V)
            print(cov_rate_CI_V)
            print("coverage for R")
            print(cov_rate_R)
            print(cov_rate_CI_R)
            print("CI len for Q CI {} with ci {}".format(
                CI_len_Q_mean, CI_len_Q_ci))
            print("CI len for V CI {} with ci {}".format(
                CI_len_V_mean, CI_len_V_ci))
            print("CI len for R CI {} with ci {}".format(
                CI_len_R_mean, CI_len_R_ci))
        return cov_rate_Q


def _qocba_allocation(p_n, var_r_n, V_n, V_n_max_index, initial_w, gamma,
                      n_s, n_a):
    """Solve for the state-action sampling weights of the next Q-OCBA stage.

    Minimises ``sum_k c_k / x_k`` with SLSQP, subject to the equality
    constraints returned by ``two_stage_inference.construct_contrain_matrix``
    and the bounds ``x_k >= 1e-6``.  ``c_k`` is non-zero only for the action
    ``V_n_max_index[s]`` of each state ``s`` (presumably the greedy action
    under the current estimate -- confirm against ``inference.get_V_from_Q``).

    Returns the optimiser's solution vector of length ``n_s * n_a``.
    """
    TM_V = inference.P_V(p_n, n_s, n_a, V_n_max_index)
    I_TM_V = np.linalg.inv(np.identity(n_s) - gamma * TM_V)

    # Reward variance of the selected action in every state.
    var_r_n_V = np.array(
        [var_r_n[i * n_a + V_n_max_index[i]] for i in range(n_s)])
    V_V = np.diag(var_r_n_V)

    # Transition-related term, evaluated on the selected action's row of p_n.
    ds_V = []
    for i in range(n_s):
        start = i * n_a * n_s + V_n_max_index[i] * n_s
        ds_V.append(
            inference.cal_cov_p_quad_V(p_n[start:start + n_s], V_n, n_s))
    D_V = np.diag(ds_V)
    cov_V_V_D = V_V + D_V

    quad_con_vec = np.power(np.dot(initial_w, I_TM_V),
                            2) * np.diag(cov_V_V_D)
    quad_con_vec_all = np.zeros(n_s * n_a)
    for i in range(n_s):
        quad_con_vec_all[i * n_a + V_n_max_index[i]] = quad_con_vec[i]
    # The round trip through `matrix` is kept exactly as in the original code;
    # which `matrix` type is imported is not visible from this block.
    array_quad_con_vec = np.array(matrix(quad_con_vec_all)).transpose()[0]

    def objective(x):
        return np.sum(np.multiply(array_quad_con_vec, np.divide(1, x)))

    A, b, _, _ = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
    AA = np.array(A)
    bb = np.asarray(b)
    constraints = tuple({
        'type': 'eq',
        'fun': lambda x, row, rhs: np.dot(row, x) - rhs,
        'args': (AA[i], bb[i])
    } for i in range(AA.shape[0]))
    bnds = tuple((1e-6, None) for _ in range(n_s * n_a))
    initial = np.ones(n_s * n_a) / (n_s * n_a)
    res = minimize(objective,
                   initial,
                   method='SLSQP',
                   bounds=bnds,
                   constraints=constraints)
    return res.x


def main():
    """Compare CI coverage of five exploration strategies on a 5-state chain.

    Parses the command line, builds the true MDP, and for every data budget
    runs ``--rep`` replications of epsilon-greedy, Q-OCBA, random exploration,
    UCRL and PSRL data collection.  Each replication's data is passed to
    ``inference.get_CI``; the fraction of replications whose intervals contain
    the true Q / V / R values is printed, and the Q coverage of every strategy
    is plotted against the data budget.

    ``--episode`` is accepted on the command line but not read by this
    function.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='value of r0')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--episode',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode')
    parser.add_argument('--epi_step_num',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode steps')
    parser.add_argument('--first_stage_data',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of first stage data')
    parser.add_argument('--r_prior',
                        nargs="?",
                        type=float,
                        default=0.0,
                        help='prior value of reward function')
    parser.add_argument('--iflog',
                        nargs="?",
                        type=int,
                        default=0,
                        help='whether take logrithm of x-axis')

    args = parser.parse_args()

    num_rep = args.rep
    right_prop = args.rightprop
    print("right prop is {}".format(right_prop))
    s_0 = 2
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = _chain_transitions(n_s, n_a)
    Q_0 = np.zeros(n_s * n_a)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

    # Ground truth against which every confidence interval is checked.
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, _ = inference.get_V_from_Q(Q_real, n_s, n_a)
    R_real = np.mean(V_real)

    episode_steps = args.epi_step_num
    numdata_1 = args.first_stage_data
    print("first stage data num is {}".format(numdata_1))
    print("epsisode timestep is {}".format(episode_steps))
    logif = bool(args.iflog)
    print("we print x axis in log is {}".format(logif))
    if logif:
        num_datas = [10, 100, 1000, 5000, 10000]
    elif r_sd == 10.0:
        num_datas = list(range(10, 8000, 1000))
    else:
        num_datas = list(range(5, 10010, 1000))

    QOCBAs_Q_cov = []
    REs_Q_cov = []
    eps_Q_cov = []
    UCRL_Q_cov = []
    PSRL_Q_cov = []
    Bayes_resample = False
    print_if = True
    epsilon = 0.2
    S_0 = None
    initial_w = np.ones(n_s) / n_s
    numerical_tol = 1e-6
    Q_approximation = None

    print("epsilon is {}".format(epsilon))

    def collect(num_samples, start_state, **policy):
        """Sample from the true MDP; ``policy`` selects the behaviour policy."""
        return collect_data_swimmer.collect_data(p,
                                                 r,
                                                 num_samples,
                                                 start_state,
                                                 n_s,
                                                 n_a,
                                                 std=r_sd,
                                                 **policy)

    def warm_start():
        """First-stage sample shared by all sequential strategies."""
        return collect(numdata_1, s_0, right_prop=right_prop)

    def solve_q(p_hat, r_hat):
        return Iterative_Cal_Q.cal_Q_val(p_hat, Q_0, r_hat, num_iter, gamma,
                                         n_s, n_a)

    def run_epsilon_greedy(num_data, stage_datas):
        """One replication of staged epsilon-greedy data collection."""
        para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
        all_data = warm_start()
        para_cl.update(all_data, resample=False)
        p_n, r_n, _ = para_cl.get_para(resample=False)
        Q_here = solve_q(p_n, r_n)
        for stage_len in stage_datas:
            stage_data = collect(stage_len,
                                 s_0,
                                 right_prop=right_prop,
                                 Q=Q_here,
                                 epsilon=epsilon,
                                 print_pro_right=False)
            para_cl.update(stage_data, resample=Bayes_resample)
            p_n, r_n, _ = para_cl.get_para(resample=False)
            Q_here = solve_q(p_n, r_n)
            all_data += stage_data
        return all_data

    def run_qocba(num_data, stage_datas):
        """One replication of staged Q-OCBA data collection."""
        para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
        all_data = warm_start()
        para_cl.update(all_data, resample=Bayes_resample)

        def refit():
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            V_n, V_n_max_index = inference.get_V_from_Q(
                solve_q(p_n, r_n), n_s, n_a)
            return p_n, r_std**2, V_n, V_n_max_index

        p_n, var_r_n, V_n, V_n_max_index = refit()
        for stage_len in stage_datas:
            x_opt = _qocba_allocation(p_n, var_r_n, V_n, V_n_max_index,
                                      initial_w, gamma, n_s, n_a)
            # Each stage continues from the state tracked by para_cl.
            data = collect(stage_len, para_cl.s, pi_s_a=x_opt)
            all_data += data
            para_cl.update(data, resample=Bayes_resample)
            p_n, var_r_n, V_n, V_n_max_index = refit()
        return all_data

    def run_random(num_data, stage_datas):
        """One replication of pure random exploration with the full budget."""
        return collect(num_data + numdata_1, s_0, right_prop=right_prop)

    def run_ucrl(num_data, stage_datas):
        """One replication of UCRL data collection after a warm start."""
        all_data = warm_start()
        # The statistics must come from this replication's warm-start sample;
        # the original code passed a stale `data` list left over from a
        # previous strategy.
        pre_collected_stats = UCRL_2.get_pre_collected_stats(
            all_data, n_s, n_a)
        UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, numdata_1, s_0, num_data,
                              pre_collected_stats)
        while UCRL_cl.t < num_data:
            UCRL_cl.update_point_estimate_and_CIbound()
            UCRL_cl.Extended_Value_Iter()
            UCRL_cl.collect_data_and_update(p, r, r_std=r_sd)
        UCRL_cl.update_point_estimate_and_CIbound()
        return all_data + UCRL_cl.datas

    def run_psrl(num_data, stage_datas):
        """One replication of staged PSRL data collection."""
        para_cl = PSRLcls(n_s, n_a, s_0)
        data = warm_start()
        para_cl.update(data, r_sigma=r_sd)
        Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
        for stage_len in stage_datas:
            dat = collect(stage_len, para_cl.s_0, Q=Q_estimate, epsilon=0)
            data += dat
            para_cl.update(dat, r_sigma=r_sd)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
        return data

    # (printed label, per-replication collector, list receiving Q coverage);
    # the order fixes both the console output and the random-number stream.
    strategies = [
        ("epsilon greedy", run_epsilon_greedy, eps_Q_cov),
        ("Q-OCBA", run_qocba, QOCBAs_Q_cov),
        ("random exploration", run_random, REs_Q_cov),
        ("UCRL", run_ucrl, UCRL_Q_cov),
        ("PSRL", run_psrl, PSRL_Q_cov),
    ]

    for num_data in num_datas:
        print("numdata is {}".format(num_data))
        # Floor division: list repetition needs an int, and `/` returns a
        # float on Python 3.
        stage_datas = [episode_steps] * (num_data // episode_steps)

        for label, run_once, coverage_curve in strategies:
            print(label)
            tracker = _CoverageTracker(n_s, n_a, numerical_tol)
            for _ in range(num_rep):
                run_data = run_once(num_data, stage_datas)
                ci = inference.get_CI(Q_approximation,
                                      S_0,
                                      num_data,
                                      s_0,
                                      num_iter,
                                      gamma,
                                      Q_0,
                                      n_s,
                                      n_a,
                                      r,
                                      p,
                                      initial_w,
                                      right_prop,
                                      data=run_data)
                tracker.record(Q_real, V_real, R_real, ci)
            coverage_curve.append(
                tracker.summarize(num_rep, verbose=print_if))

    print("epsilon greedy")
    print(eps_Q_cov)

    print("QOCBA")
    print(QOCBAs_Q_cov)

    print("REs")
    print(REs_Q_cov)

    print("UCRL ")
    print(UCRL_Q_cov)

    print("PSRL ")
    print(PSRL_Q_cov)

    if logif:
        num_datas = np.log(np.array(num_datas) + 1)
    plt.plot(num_datas,
             eps_Q_cov,
             'g<--',
             markersize=6,
             label="epsilon-greedy")
    plt.plot(num_datas, UCRL_Q_cov, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_Q_cov, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, QOCBAs_Q_cov, 'ro--', markersize=6, label="Q-OCBA")
    plt.plot(num_datas,
             REs_Q_cov,
             'b>--',
             markersize=6,
             label="RE({})".format(right_prop))
    plt.xlabel("total number of data")
    plt.ylabel("CI Coverage")
    plt.axhline(y=0.95)
    plt.legend(loc='lower right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ CI coverage'.format(r_sd, r[0]))
    plt.show()
Example #13
0
def _parse_bool_flag(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` applies ``bool()`` to the raw string, so
    every non-empty token -- including ``"False"`` and ``"0"`` -- turns into
    ``True``.  This converter accepts the usual spellings instead and falls
    back to integer truthiness for other numeric tokens.

    Args:
        value: Token taken from the command line (or an existing bool).

    Returns:
        The boolean denoted by ``value``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is neither a recognised
            word nor an integer.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y"):
        return True
    if token in ("false", "f", "no", "n", ""):
        return False
    try:
        return bool(int(token))
    except ValueError:
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value))
def _summarize_runs(num_correct, future_values, real_value, num_rep):
    """Aggregate the per-repetition results of one exploration method.

    Args:
        num_correct: Sum of the ``FS_bool`` results over all repetitions.
        future_values: One future value per repetition.
        real_value: Value obtained from the true model, used as the baseline.
        num_rep: Number of repetitions.

    Returns:
        Tuple ``(pcs, pcs_ci, fv, fv_ci, diff)``: the fraction
        ``num_correct / num_rep``, its 1.96-standard-error normal
        approximation length, the mean of ``future_values``, 1.96 standard
        errors of that mean, and ``real_value - fv``.
    """
    # builtin float: the ``np.float`` alias no longer exists in current NumPy.
    pcs = float(num_correct) / num_rep
    pcs_ci = 1.96 * np.sqrt(pcs * (1 - pcs) / num_rep)
    fv = np.mean(future_values)
    fv_ci = 1.96 * np.std(future_values) / np.sqrt(num_rep)
    return pcs, pcs_ci, fv, fv_ci, real_value - fv
def _plot_method_curves(num_datas, curves, ylabel, legend_loc, title):
    """Draw one comparison figure and show it.

    Args:
        num_datas: X values shared by every curve.
        curves: Iterable of ``(y_values, format_string, label)`` triples,
            drawn in the given order.
        ylabel: Label of the y axis.
        legend_loc: Matplotlib legend location string.
        title: Figure title.
    """
    for values, fmt, label in curves:
        plt.plot(num_datas, values, fmt, markersize=6, label=label)
    plt.xlabel("total number of data")
    plt.ylabel(ylabel)
    plt.legend(loc=legend_loc, shadow=True, fontsize='x-small')
    plt.title(title)
    plt.show()
def _solve_qocba_allocation(p_n, f_n, var_r_n, Q_n, V_n, greedy_actions,
                            gamma, n_s, n_a, opt_ori, opt_lb):
    """Solve the Q-OCBA allocation problem of one repetition with SLSQP.

    For every state ``s`` and every action ``a`` different from
    ``greedy_actions[s]`` two constants are formed:

    * ``quad_consts[s, a]``: squared difference between ``Q_n`` at ``(s, a)``
      and at the greedy action of ``s``;
    * ``denom_consts[s, a]``: elementwise square of the matching contrast of
      ``I_TM`` multiplied by the diagonal of ``cov_V_D`` (both returned by
      ``inference.get_Sigma_n_comp``).

    The decision vector is ``x = (z, w)`` with ``z >= 0`` and each of the
    ``n_s * n_a`` entries of ``w`` bounded to ``[opt_lb, 1]``.  Pairs whose
    largest denominator is at most ``1e-5`` contribute no inequality.  The
    equality constraints ``A w = b`` come from
    ``two_stage_inference.construct_contrain_matrix``.

    Args:
        p_n, f_n, var_r_n: First-stage empirical statistics.
        Q_n: Flat first-stage Q estimate, indexed ``s * n_a + a``.
        V_n: First-stage value estimate.
        greedy_actions: Greedy action per state under ``Q_n``.
        gamma: Discount factor.
        n_s: Number of states.
        n_a: Number of actions.
        opt_ori: If true, maximise ``z`` subject to
            ``quad / sum(denom / w) >= z``; otherwise minimise ``z`` subject
            to ``z >= sum(denom / w) / quad``.
        opt_lb: Lower bound applied to every entry of ``w``.

    Returns:
        The ``w`` part of the SLSQP solution (``res.x[1:]``).
    """
    I_TM, _, cov_V_D, _, _, _ = inference.get_Sigma_n_comp(
        p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, greedy_actions)
    cov_diag = np.diag(cov_V_D)

    def rate_constraint(x, up_c, denom_c):
        return up_c / (np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) - x[0]

    def inverse_rate_constraint(x, up_c, denom_c):
        return -(np.sum(np.multiply(denom_c, np.reciprocal(x[1:])))) / up_c + x[0]

    def linear_constraint(x, a, b):
        return np.dot(a, x[1:]) - b

    if opt_ori:
        def objective(x):
            return -x[0]
        gap_constraint = rate_constraint
    else:
        def objective(x):
            return x[0]
        gap_constraint = inverse_rate_constraint

    quad_consts = np.zeros((n_s, n_a))
    denom_consts = np.zeros((n_s, n_a, n_s * n_a))
    constraints = []
    for s in range(n_s):
        best = greedy_actions[s]
        for a in range(n_a):
            if a == best:
                continue
            # +1 on the competing action, -1 on the greedy one.
            contrast = np.zeros(n_s * n_a)
            contrast[s * n_a + a] = 1
            contrast[s * n_a + best] = -1
            denom_consts[s, a] = np.power(np.dot(contrast, I_TM), 2) * cov_diag
            quad_consts[s, a] = (Q_n[s * n_a + a] - Q_n[s * n_a + best]) ** 2
            if np.max(denom_consts[s, a]) > 1e-5:
                constraints.append({'type': 'ineq', 'fun': gap_constraint,
                                    'args': (quad_consts[s, a], denom_consts[s, a])})

    A, b, _, _ = two_stage_inference.construct_contrain_matrix(p_n, n_s, n_a)
    AA = np.array(A)
    for row in range(AA.shape[0]):
        constraints.append({'type': 'eq', 'fun': linear_constraint,
                            'args': (AA[row], b[row])})

    bnds = tuple([(0., None)] + [(opt_lb, 1)] * (n_s * n_a))
    initial = np.ones(n_s * n_a + 1) / (n_s * n_a)
    initial[0] = 0.1
    # NOTE(review): res.success is not inspected; a failed solve still
    # yields whatever iterate SLSQP stopped at.
    res = minimize(objective, initial, method='SLSQP', bounds=bnds,
                   constraints=tuple(constraints))
    return res.x[1:]
def main():
    """Compare five data-collection strategies on a 5-state, 2-action MDP.

    The script parses its options, builds the true transition vector ``p``
    and reward vector ``r``, and for every total budget in ``num_datas``:

    1. Draws ``num_rep`` first-stage data sets (3/10 of the budget) with the
       ``--rightprop`` policy, redrawing until every entry of the empirical
       frequencies ``f_n`` is non-zero, and solves the Q-OCBA allocation
       problem for each of them.
    2. Spends the remaining 7/10 of the budget with epsilon-greedy, Q-OCBA,
       the unchanged first-stage policy ("follow original"), UCRL and PSRL.
    3. Prints, for each method, the fraction of repetitions for which
       ``FS_bool`` holds (reported as "PCS") and the gap between the true
       value and the value of the learned greedy policy ("future regret").

    Finally the per-budget PCS and regret lists are printed and plotted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep', nargs="?", type=int, default=100, help='number of repetitions')
    parser.add_argument('--r0', nargs="?", type=float, default=1.0, help='value of r0')
    parser.add_argument('--optLb', nargs="?", type=float, default=1e-2,
                        help='lower bound on each Q-OCBA allocation weight')
    parser.add_argument('--rightprop', nargs="?", type=float, default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd', nargs="?", type=float, default=1.0,
                        help='standard deviation of reward')
    # A plain ``type=bool`` would turn "--opt_ori False" into True.
    parser.add_argument('--opt_ori', nargs="?", type=_parse_bool_flag, default=False,
                        help='Q-OCBA optimization method')
    parser.add_argument('--episode', nargs="?", type=int, default=100, help='number of episode')
    args = parser.parse_args()

    opt_ori = bool(args.opt_ori)
    print("Q-OCBA optimization method using original formulation is {}".format(opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    optLb = args.optLb
    s_0 = 2
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95

    # True model.  ``p`` is flat with index s * n_s * n_a + a * n_s + s_next;
    # ``P`` is a writable 3-D view onto the same memory.
    p = np.zeros(n_s * n_a * n_s)
    P = p.reshape(n_s, n_a, n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_std = args.rstd
    print("reward standard deviation is {}".format(r_std))
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    P[0, 0, 0] = 1.
    P[0, 1, 0] = 0.7
    P[0, 1, 1] = 0.3
    for s in range(1, n_s - 1):
        P[s, 0, s - 1] = 1
        P[s, 1, s - 1] = 0.1
        P[s, 1, s] = 0.6
        P[s, 1, s + 1] = 0.3
    P[n_s - 1, 0, n_s - 2] = 1
    P[n_s - 1, 1, n_s - 2] = 0.7
    P[n_s - 1, 1, n_s - 1] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    rv = np.dot(rou, V_real)

    num_datas = [10000]
    QOCBAs_PCS, QOCBAs_fr = [], []
    REs_PCS, REs_fr = [], []
    eps_PCSs, eps_frs = [], []
    UCRL_PCSs, UCRL_frs = [], []
    PSRL_PCSs, PSRL_frs = [], []

    def fit_q(samples):
        # Q estimate obtained from the empirical model of ``samples``.
        p_hat, r_hat, _, _ = cal_impirical_r_p.cal_impirical_stats(samples, n_s, n_a)
        return Iterative_Cal_Q.cal_Q_val(p_hat, Q_0, r_hat, num_iter, gamma, n_s, n_a)

    def score(q_est):
        # Value of the greedy policy of ``q_est`` under the true (p, r),
        # averaged with ``rou``, plus the FS_bool check against the truth.
        V_here = policy_val_iteration(q_est, n_s, n_a, V_0, num_iter, r, p, gamma)
        return np.dot(rou, V_here), FS_bool(q_est, V_max_index, n_s, n_a)

    for num_data in num_datas:
        # Floor division keeps the sample counts integral on Python 3.
        num_data_1 = num_data * 3 // 10
        num_data_2 = num_data * 7 // 10
        print("num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}".format(num_data_1,
                                                                                                        num_data_2,
                                                                                                        right_prop))
        # ---- first stage: shared warm-start data and Q-OCBA allocations ----
        Q_ns = []
        x_opts = []
        data1s = []
        for _ in range(num_rep):
            while True:
                data1 = collect_data_swimmer.collect_data(p, r, num_data_1, s_0, n_s, n_a, right_prop=right_prop,
                                                          std=r_std)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(data1, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma, n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                # Accept the draw only when no entry of f_n is zero.
                if f_n.all():
                    break
            data1s.append(data1)
            Q_ns.append(Q_n)
            x_opts.append(_solve_qocba_allocation(p_n, f_n, var_r_n, Q_n, V_n, V_n_max_index,
                                                  gamma, n_s, n_a, opt_ori, optLb))

        # ---- epsilon-greedy on the first-stage Q estimate ----
        for epsilon in [0.2]:
            print("epsilon is {}".format(epsilon))
            num_correct = 0
            future_V = np.zeros(num_rep)
            for rep in range(num_rep):
                data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop,
                                                         Q=Q_ns[rep], epsilon=epsilon, print_pro_right=False,
                                                         std=r_std)
                future_V[rep], correct = score(fit_q(data + data1s[rep]))
                num_correct += correct
            pcs, pcs_ci, fv, fv_ci, diff = _summarize_runs(num_correct, future_V, rv, num_rep)
            eps_PCSs.append(pcs)
            eps_frs.append(diff)
            print("epsilon--greedy with epsilon {}:".format(epsilon))
            print("PCS is {}, with CI length {}".format(pcs, pcs_ci))
            print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, fv_ci, rv,
                                                                                                   diff))

        # ---- Q-OCBA: sample according to the optimized allocation ----
        num_correct = 0
        future_V = np.zeros(num_rep)
        for rep in range(num_rep):
            data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop,
                                                     pi_s_a=x_opts[rep], std=r_std)
            future_V[rep], correct = score(fit_q(data + data1s[rep]))
            num_correct += correct
        pcs, pcs_ci, fv, fv_ci, diff = _summarize_runs(num_correct, future_V, rv, num_rep)
        QOCBAs_PCS.append(pcs)
        QOCBAs_fr.append(diff)
        print("Q-OCBA:")
        print("PCS is {}, with CI length {}".format(pcs, pcs_ci))
        print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, fv_ci, rv, diff))

        # ---- follow original: keep using the first-stage policy ----
        num_correct = 0
        future_V = np.zeros(num_rep)
        for rep in range(num_rep):
            data = collect_data_swimmer.collect_data(p, r, num_data_2, s_0, n_s, n_a, right_prop=right_prop,
                                                     std=r_std)
            future_V[rep], correct = score(fit_q(data + data1s[rep]))
            num_correct += correct
        pcs, pcs_ci, fv, fv_ci, diff = _summarize_runs(num_correct, future_V, rv, num_rep)
        REs_PCS.append(pcs)
        REs_fr.append(diff)
        print("follow original")
        print("PCS is {}, with CI length {}".format(pcs, pcs_ci))
        print("future value func is {} with CI length {}, real value is {}, diff is {}".format(fv, fv_ci, rv, diff))

        # ---- UCRL, warm-started with the first-stage statistics ----
        num_correct = 0.
        future_V = np.zeros(num_rep)
        for rep in range(num_rep):
            pre_collected_stats = UCRL_2.get_pre_collected_stats(data1s[rep], n_s, n_a)
            UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, num_data_1, s_0, num_data_2, pre_collected_stats)
            while UCRL_cl.t < num_data_1 + num_data_2:
                UCRL_cl.update_point_estimate_and_CIbound()
                UCRL_cl.Extended_Value_Iter()
                UCRL_cl.collect_data_and_update(p, r, r_std=r_std)
            UCRL_cl.update_point_estimate_and_CIbound()
            Q_estimate = Iterative_Cal_Q.cal_Q_val(UCRL_cl.transition, Q_0, UCRL_cl.rew, num_iter, gamma, n_s, n_a)
            future_V[rep], correct = score(Q_estimate)
            num_correct += correct
        pcs, pcs_ci, fv, fv_ci, diff = _summarize_runs(num_correct, future_V, rv, num_rep)
        UCRL_PCSs.append(pcs)
        UCRL_frs.append(diff)
        print("UCRL")
        print("PCS is {}, with CI length {}".format(pcs, pcs_ci))
        print("future value func is {} with  CI length {}, real value is {}, diff is {}".format(fv, fv_ci, rv, diff))

        # ---- PSRL: resample the model after every episode ----
        episodes = args.episode
        print("# of epsisodes is {}".format(episodes))

        num_correct = 0.
        future_V = np.zeros(num_rep)
        for rep in range(num_rep):
            all_data = data1s[rep]
            para_cl = PSRLcls(n_s, n_a, s_0)
            para_cl.update(data1s[rep], r_sigma=r_std)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            for nd in [num_data_2 // episodes] * episodes:
                dat = collect_data_swimmer.collect_data(p, r, nd, para_cl.s_0, n_s, n_a, Q=Q_estimate,
                                                        epsilon=0, std=r_std)
                # Rebind rather than ``+=`` so the stored first-stage data
                # in ``data1s`` is never extended in place.
                all_data = all_data + dat
                para_cl.update(dat, r_sigma=r_std)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            # The final policy is scored from the empirical model of all data,
            # not from the last posterior sample.
            future_V[rep], correct = score(fit_q(all_data))
            num_correct += correct
        pcs, pcs_ci, fv, fv_ci, diff = _summarize_runs(num_correct, future_V, rv, num_rep)
        PSRL_PCSs.append(pcs)
        PSRL_frs.append(diff)
        print("PSRL")
        print("PCS is {}, with CI length {}".format(pcs, pcs_ci))
        print("future value func is {} with  CI length {}, real value is {}, diff is {}".format(fv, fv_ci, rv, diff))

    print("epsilon greedy")
    print(eps_PCSs)
    print(eps_frs)
    print("QOCBA")
    print(QOCBAs_PCS)
    print(QOCBAs_fr)
    print("REs")
    print(REs_PCS)
    print(REs_fr)
    print("UCRL ")
    print(UCRL_PCSs)
    print(UCRL_frs)
    print("PSRL ")
    print(PSRL_PCSs)
    print(PSRL_frs)

    # Label the random-exploration curve with the probability actually used.
    re_label = "RE({})".format(right_prop)
    _plot_method_curves(
        num_datas,
        [(eps_PCSs, 'g<--', "epsilon-greedy"),
         (UCRL_PCSs, 'm+--', "UCRL"),
         (PSRL_PCSs, 'cx--', "PSRL"),
         (QOCBAs_PCS, 'ro--', "Q-OCBA"),
         (REs_PCS, 'b>--', re_label)],
        ylabel="PCS",
        legend_loc='lower right',
        title=r'$\sigma_R= {}, r_L = {}$ PCS'.format(r_std, r[0]))
    _plot_method_curves(
        num_datas,
        [(eps_frs, 'g<--', "epsilon-greedy"),
         (UCRL_frs, 'm+--', "UCRL"),
         (PSRL_frs, 'cx--', "PSRL"),
         (QOCBAs_fr, 'ro--', "Q-OCBA"),
         (REs_fr, 'b>--', re_label)],
        ylabel="future regret",
        legend_loc='upper right',
        title=r'$\sigma_R= {}, r_L = {}$ future regret'.format(r_std, r[0]))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='value of r0')
    parser.add_argument('--optLb',
                        nargs="?",
                        type=float,
                        default=1e-2,
                        help='value of r0')
    # parser.add_argument('--numdata', nargs="?", type=int, default=1000, help='number of data')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')
    parser.add_argument('--opt_ori',
                        nargs="?",
                        type=int,
                        default=0,
                        help='Q-OCBA optimization method')
    parser.add_argument('--episode',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode')
    parser.add_argument('--epi_step_num',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode steps')
    parser.add_argument('--first_stage_data',
                        nargs="?",
                        type=int,
                        default=3,
                        help='number of first stage data')
    parser.add_argument('--r_prior',
                        nargs="?",
                        type=float,
                        default=0.0,
                        help='prior value of reward function')
    parser.add_argument('--opt_one_step',
                        nargs="?",
                        type=int,
                        default=0,
                        help='Q-OCBA optimization running only one step')
    parser.add_argument('--iflog',
                        nargs="?",
                        type=int,
                        default=0,
                        help='whether take logrithm of x-axis')

    args = parser.parse_args()

    opt_ori = True if args.opt_ori else False
    print("Q-OCBA optimization method using original formulation is {}".format(
        opt_ori))
    num_rep = args.rep
    right_prop = args.rightprop
    print("right prop is {}".format(right_prop))

    optLb = args.optLb
    s_0 = 2
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    V_0 = np.zeros(n_s)
    rou = np.ones(n_s) / n_s
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    r_sd = args.rstd
    r_prior_mean = args.r_prior
    print("reward standard deviation is {}".format(r_sd))
    # r[0] = 10.
    # r[-1] = 0.1
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    # print("Q real is {}".format(Q_real))
    #num_datas = list(range(500, 10500, 500))
    episode_steps = args.epi_step_num

    #numdata_1 = episode_steps
    numdata_1 = args.first_stage_data
    print("first stage data num is {}".format(numdata_1))
    print("epsisode timestep is {}".format(episode_steps))
    logif = True if args.iflog else False
    print("we print x axis in log is {}".format(logif))
    if not logif:
        if r_sd == 10.0:
            num_datas = list(range(0, 8000, 1000))
        else:
            num_datas = list(range(0, 4000, 500))
            #num_datas = list(range(0, 2000, 500))
    else:
        num_datas = [0, 100, 1000, 5000, 10000]
    #num_datas = list(range(1000, 5000, 2000))
    #num_datas = [2000]
    QOCBAs_PCS = []
    REs_PCS = []
    QOCBAs_fr = []
    REs_fr = []
    eps_PCSs = []
    eps_frs = []
    UCRL_PCSs = []
    UCRL_frs = []
    PSRL_PCSs = []
    PSRL_frs = []
    REs_PCS_08 = []
    REs_fr_08 = []

    Bayes_resample = False
    print_if = True
    epsilon = 0.2

    print("epsilon is {}".format(epsilon))

    for num_data in num_datas:
        print("numdata is {}".format(num_data))
        stage_datas = [episode_steps] * (num_data / episode_steps)
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     std=r_sd)
            para_cl.update(data, resample=False)
            p_n, r_n, r_std = para_cl.get_para(resample=False)
            Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                               n_s, n_a)
            for num_dat in stage_datas:
                stage_data = collect_data_swimmer.collect_data(
                    p,
                    r,
                    num_dat,
                    s_0,
                    n_s,
                    n_a,
                    right_prop=right_prop,
                    Q=Q_here,
                    epsilon=epsilon,
                    print_pro_right=False,
                    std=r_sd)
                para_cl.update(stage_data, resample=Bayes_resample)
                p_n, r_n, r_std = para_cl.get_para(resample=False)
                Q_here = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            # print(Q_here)
            V_here = policy_val_iteration(Q_here, n_s, n_a, V_0, num_iter, r,
                                          p, gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_here, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_

        PCS_naive = np.float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        eps_PCSs.append(PCS_naive)
        eps_frs.append(diff)
        if print_if:
            print("epsilon--greedy with epsilon {}:".format(epsilon))
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
        # exit()

        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for iii in range(num_rep):
            para_cl = seq_cls(n_s, n_a, s_0, r_mean_prior=r_prior_mean)
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     std=r_sd)
            #data = collect_data_swimmer.collect_data(p, r, numdata_1, s_0, n_s, n_a, right_prop=0.3, std=r_sd)
            para_cl.update(data, resample=Bayes_resample)
            p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
            var_r_n = r_std**2
            # print(p_n)
            # print(r_n)
            # print(r_std)

            # test
            # p_n = p
            # r_n = r

            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                            n_s, n_a)
            V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            for jj, stage_data in enumerate(stage_datas):
                TM = inference.embedd_MC(p_n, n_s, n_a, V_n_max_index)
                I = np.identity(n_s * n_a)
                I_TM = np.linalg.inv(I - gamma * TM)
                V = np.diag(var_r_n)
                ds = []
                ds_V = []
                for i in range(n_s):
                    for j in range(n_a):
                        p_sa = p_n[(i * n_a * n_s + j * n_s):(i * n_a * n_s +
                                                              (j + 1) * n_s)]
                        dij = inference.cal_cov_p_quad_V(p_sa, V_n, n_s)
                        ds.append(dij)
                        if j == V_n_max_index[i]:
                            ds_V.append(dij)
                D = np.diag(ds)
                cov_V_D = V + D
                quad_consts = np.zeros((n_s, n_a))
                denom_consts = np.zeros((n_s, n_a, n_s * n_a))

                for i in range(n_s):
                    for j in range(n_a):
                        if j != V_n_max_index[i]:
                            minus_op = np.zeros(n_s * n_a)
                            minus_op[i * n_a + j] = 1
                            minus_op[i * n_a + V_n_max_index[i]] = -1
                            denom_consts[i][j] = np.power(
                                np.dot(minus_op, I_TM), 2) * np.diag(cov_V_D)
                            quad_consts[i][j] = (
                                Q_n[i * n_a + j] -
                                Q_n[i * n_a + V_n_max_index[i]])**2

                A, b, G, h = two_stage_inference.construct_contrain_matrix(
                    p_n, n_s, n_a)
                AA = np.array(A)
                # bb = np.asarray(b)

                if opt_ori:

                    def fun(x):
                        """Objective handed to ``minimize``: the negated leading entry.

                        Returning ``-x[0]`` means that minimizing this function
                        drives ``x[0]`` upward; the remaining entries ``x[1:]``
                        only enter through the constraints built below.
                        """
                        return -x[0]
                else:

                    def fun(x):
                        """Objective handed to ``minimize``: the leading entry ``x[0]``.

                        The remaining entries ``x[1:]`` only enter through the
                        constraints built below.
                        """
                        return x[0]

                constraints = []
                if opt_ori:
                    for i in range(n_s):
                        for j in range(n_a):
                            if j != V_n_max_index[i]:
                                # print(denom_consts[i][j])
                                if np.max(denom_consts[i][j]) > 1e-5:
                                    constraints.append({
                                        'type':
                                        'ineq',
                                        'fun':
                                        lambda x, up_c, denom_c: up_c /
                                        (np.sum(
                                            np.multiply(
                                                denom_c, np.reciprocal(x[1:])))
                                         ) - x[0],
                                        'args':
                                        (quad_consts[i][j], denom_consts[i][j])
                                    })
                else:
                    for i in range(n_s):
                        for j in range(n_a):
                            if j != V_n_max_index[i]:
                                # print(denom_consts[i][j])
                                if np.max(quad_consts[i][j]) > 1e-5:
                                    constraints.append({
                                        'type':
                                        'ineq',
                                        'fun':
                                        lambda x, up_c, denom_c: -(np.sum(
                                            np.multiply(
                                                denom_c, np.reciprocal(x[1:]))
                                        )) / up_c + x[0],
                                        'args':
                                        (quad_consts[i][j], denom_consts[i][j])
                                    })

                for i in range(AA.shape[0]):
                    constraints.append({
                        'type':
                        'eq',
                        'fun':
                        lambda x, a, b: np.dot(a, x[1:]) - b,
                        'args': (AA[i], b[i])
                    })
                constraints = tuple(constraints)
                bnds = []
                bnds.append((0., None))
                for i in range(n_s * n_a):
                    bnds.append((optLb, 1))
                    # bnds.append((optLbs[jj], 1))

                bnds = tuple(bnds)
                initial = np.ones(n_s * n_a + 1) / (n_s * n_a)

                initial[0] = 0.1
                # print(initial)
                # print("number of equality constraints is {}".format(len(A)))
                if args.opt_one_step:
                    #print("haha")
                    res = minimize(fun,
                                   initial,
                                   method='SLSQP',
                                   bounds=bnds,
                                   constraints=constraints,
                                   options={
                                       'disp': False,
                                       'maxiter': 1
                                   })
                else:
                    #print("huha")
                    res = minimize(fun,
                                   initial,
                                   method='SLSQP',
                                   bounds=bnds,
                                   constraints=constraints)
                x_opt = res.x[1:]

                # exit()

                # print("***", para_cl.s)
                #print(x_opt)

                data = collect_data_swimmer.collect_data(p,
                                                         r,
                                                         stage_data,
                                                         para_cl.s,
                                                         n_s,
                                                         n_a,
                                                         pi_s_a=x_opt,
                                                         std=r_sd)
                para_cl.update(data, resample=Bayes_resample)
                _, _, freq, _ = cal_impirical_r_p.cal_impirical_stats(
                    data, n_s, n_a)
                # print("x_opt", x_opt)
                # print("freq", freq)
                # dist = np.linalg.norm(freq - x_opt)
                # dist = sklearn.metrics.mutual_info_score(freq, x_opt)
                # print(dist)

                p_n, r_n, r_std = para_cl.get_para(resample=Bayes_resample)
                var_r_n = r_std**2
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                                n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p,
                                          gamma)
            future_V[iii] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
        PCS_naive = np.float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        # print(CS_num_naive)
        QOCBAs_PCS.append(PCS_naive)
        QOCBAs_fr.append(diff)
        if print_if:
            print("Q-OCBA:")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # follow original
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data + numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     std=r_sd)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                            n_s, n_a)
            # print(Q_n)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p,
                                          gamma)
            # print(V_here, V_real)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
            # if not FS_bool_:
            # print(i)
            # print(f_n)
            # print(Q_n)
        PCS_naive = np.float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        REs_PCS.append(PCS_naive)
        REs_fr.append(diff)
        if print_if:
            print("follow original")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        # follow original
        CS_num_naive = 0
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     num_data + numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=0.8,
                                                     std=r_sd)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data, n_s, n_a)
            Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                            n_s, n_a)
            # print(Q_n)
            V_here = policy_val_iteration(Q_n, n_s, n_a, V_0, num_iter, r, p,
                                          gamma)
            # print(V_here, V_real)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_n, V_max_index, n_s, n_a)
            CS_num_naive += FS_bool_
            # if not FS_bool_:
            # print(i)
            # print(f_n)
            # print(Q_n)
        PCS_naive = np.float(CS_num_naive) / num_rep
        CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        REs_PCS_08.append(PCS_naive)
        REs_fr_08.append(diff)
        if print_if:
            print("RE(0.8)")
            print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
            print(
                "future value func is {} with CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        #UCRL
        #delta = 0.05
        CS_num = 0.
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     std=r_sd)
            pre_collected_stats = UCRL_2.get_pre_collected_stats(
                data, n_s, n_a)
            UCRL_cl = UCRL_2.UCRL(n_s, n_a, 0.05, numdata_1, s_0, num_data,
                                  pre_collected_stats)
            while UCRL_cl.t < num_data:
                UCRL_cl.update_point_estimate_and_CIbound()
                # print("step1 finished")
                UCRL_cl.Extended_Value_Iter()
                # print("step2 finished")
                UCRL_cl.collect_data_and_update(p, r, r_std=r_sd)
                # print("step3 finished")
                # print(UCRL_cl.t)
            UCRL_cl.update_point_estimate_and_CIbound()
            datahere = data + UCRL_cl.datas
            #print(UCRL_cl.t, num_data)
            #print(len(datahere))
            #exit()
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                datahere, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            # print(Q_estimate)
            FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool_
            V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter,
                                          r, p, gamma)
            # print(V_here, V_real)
            future_V[i] = np.dot(rou, V_here)
        PCS = np.float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        # print(CS_num_naive)
        UCRL_PCSs.append(PCS)
        UCRL_frs.append(diff)
        if print_if:
            print("UCRL")
            print("PCS is {}, with CI length {}".format(PCS, CI_len))
            print(
                "future value func is {} with  CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        CS_num = 0.
        future_V = np.zeros(num_rep)
        for i in range(num_rep):
            para_cl = PSRLcls(n_s, n_a, s_0)
            data = collect_data_swimmer.collect_data(p,
                                                     r,
                                                     numdata_1,
                                                     s_0,
                                                     n_s,
                                                     n_a,
                                                     right_prop=right_prop,
                                                     std=r_sd)
            para_cl.update(data, r_sigma=r_sd)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
            # print(Q_estimate)
            for nd in stage_datas:
                dat = collect_data_swimmer.collect_data(p,
                                                        r,
                                                        nd,
                                                        para_cl.s_0,
                                                        n_s,
                                                        n_a,
                                                        Q=Q_estimate,
                                                        epsilon=0,
                                                        std=r_sd)
                data += dat
                para_cl.update(dat, r_sigma=r_sd)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)
                # print(para_cl.pprior)
                # print(para_cl.r_mean)
            # exit()
            # print(Q_estimate)
            # print(para_cl.pprior)
            # print(para_cl.r_mean)
            # transition = np.array([1.] * n_s * (n_s * n_a))
            # for i in range(n_s):
            #    for j in range(n_a):
            #        transition[
            #        (i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] = para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)] \
            #                                                                      / np.sum(para_cl.pprior[(i * n_s * n_a + j * n_s): (i * n_s * n_a + (j + 1) * n_s)])
            # r_n = para_cl.r_mean
            # print(r_n)
            # print(transition)
            # Q_estimate = Iterative_Cal_Q.cal_Q_val(transition, Q_0, r_n, num_iter , gamma, n_s, n_a)
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            V_here = policy_val_iteration(Q_estimate, n_s, n_a, V_0, num_iter,
                                          r, p, gamma)
            # print(V_here, V_real)
            future_V[i] = np.dot(rou, V_here)
            FS_bool_ = FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool_
        PCS = np.float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        PSRL_PCSs.append(PCS)
        PSRL_frs.append(diff)
        # print(CS_num_naive)
        if print_if:
            print("PSRL")
            print("PCS is {}, with CI length {}".format(PCS, CI_len))
            print(
                "future value func is {} with  CI length {}, real value is {}, diff is {}"
                .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))
    print("epsilon greedy")
    print(eps_PCSs)
    print(eps_frs)
    print("QOCBA")
    print(QOCBAs_PCS)
    print(QOCBAs_fr)
    print("REs")
    print(REs_PCS)
    print(REs_fr)
    print("RE(0.8)s")
    print(REs_PCS_08)
    print(REs_fr_08)
    print("UCRL ")
    print(UCRL_PCSs)
    print(UCRL_frs)
    print("PSRL ")
    print(PSRL_PCSs)
    print(PSRL_frs)
    if logif:
        num_datas = np.log(np.array(num_datas) + 1)
    plt.plot(num_datas, eps_PCSs, 'k<--', markersize=6, label="epsilon-greedy")
    plt.plot(num_datas, UCRL_PCSs, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_PCSs, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, QOCBAs_PCS, 'ro--', markersize=6, label="Q-OCBA")
    # plt.fill_between(xs, np.subtract(y1, CI_1), np.add(y1, CI_1), color='r', alpha=0.4)
    plt.plot(num_datas,
             REs_PCS,
             'b>--',
             markersize=6,
             label="RE({})".format(right_prop))
    plt.plot(num_datas, REs_PCS_08, 'g*--', markersize=6, label="RE(0.8)")

    # plt.fill_between(xs, np.subtract(y2, CI_2), np.add(y2, CI_2), color='b', alpha=0.4)
    # plt.axhline(y=0.95)
    plt.xlabel("total number of data")
    # plt.ylabel("CR overall coverage")
    plt.ylabel("PCS")
    plt.legend(loc='lower right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ PCS'.format(r_sd, r[0]))
    plt.show()

    plt.plot(num_datas, eps_frs, 'k<--', markersize=6, label="epsilon-greedy")
    plt.plot(num_datas, UCRL_frs, 'm+--', markersize=6, label="UCRL")
    plt.plot(num_datas, PSRL_frs, 'cx--', markersize=6, label="PSRL")
    plt.plot(num_datas, REs_fr_08, 'g*--', markersize=6, label="RE(0.8)")

    plt.plot(num_datas, QOCBAs_fr, 'ro--', markersize=6, label="Q-OCBA")
    # plt.fill_between(xs, np.subtract(y1, CI_1), np.add(y1, CI_1), color='r', alpha=0.4)
    plt.plot(num_datas,
             REs_fr,
             'b>--',
             markersize=6,
             label="RE({})".format(right_prop))
    # plt.fill_between(xs, np.subtract(y2, CI_2), np.add(y2, CI_2), color='b', alpha=0.4)
    # plt.axhline(y=0.95)
    plt.xlabel("total number of data")
    # plt.ylabel("CR overall coverage")
    plt.ylabel("future regret")
    plt.legend(loc='upper right', shadow=True, fontsize='x-small')
    plt.title(r'$\sigma_R= {}, r_L = {}$ future regret'.format(r_sd, r[0]))

    plt.show()
Example #15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--r0',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='value of r0')
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    args = parser.parse_args()

    num_rep = args.rep
    initial_s_dist = "even"
    Q_approximation = None
    # Q_approximation = "linear_interpolation"
    right_prop = args.rightprop  # 0.8
    s_0 = 2
    # collect data configuration
    num_data = args.numdata
    num_data_1 = num_data * 3 / 10
    num_data_2 = num_data * 7 / 10
    print(
        "num_data in stage 1 is {}, num_data in stage 2 is {}, rightprop in stage 1 is {}"
        .format(num_data_1, num_data_2, right_prop))
    n_s = 5
    print("n_s is {}".format(n_s))
    n_a = 2
    # value-iteration configuration
    num_iter = 200
    gamma = 0.95
    # real p and r
    p = np.zeros(n_s * n_a * n_s)
    Q_0 = np.zeros(n_s * n_a)
    r = np.zeros(n_s * n_a)
    r[0] = args.r0
    r[-1] = 10.
    # r[0] = 10.
    # r[-1] = 0.1
    print(r)
    print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))
    p[0 * n_s * n_a + 0 * n_s + 0] = 1.
    p[0 * n_s * n_a + 1 * n_s + 0] = 0.7
    p[0 * n_s * n_a + 1 * n_s + 1] = 0.3
    for i in range(1, (n_s - 1)):
        p[i * n_a * n_s + 0 * n_s + (i - 1)] = 1
        p[i * n_a * n_s + 1 * n_s + (i - 1)] = 0.1
        p[i * n_a * n_s + 1 * n_s + i] = 0.6
        p[i * n_a * n_s + 1 * n_s + (i + 1)] = 0.3
    p[(n_s - 1) * n_s * n_a + 0 * n_s + (n_s - 2)] = 1
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 2)] = 0.7
    p[(n_s - 1) * n_s * n_a + 1 * n_s + (n_s - 1)] = 0.3

    # one replication of coverage test
    # Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    # V_real = get_V_from_Q(Q_real, n_s, n_a)
    # Q_n, CI_len, V_n = get_CI(collec_data_bool, num_data, s_0, num_iter, gamma, Q_0, n_s, n_a, r, p)
    # print(Q_real)
    # print(V_real)
    # print(Q_n)
    # print(V_n)
    # print(CI_len)
    Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s, n_a)
    V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
    print("Q real is {}".format(Q_real))
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
    initial_w = np.ones(n_s) / n_s
    opts = []
    datas = []
    Q_ns = []
    opts_ori = []
    for i in range(num_rep):
        while True:

            while True:
                data = collect_data_swimmer.collect_data(p,
                                                         r,
                                                         num_data_1,
                                                         s_0,
                                                         n_s,
                                                         n_a,
                                                         right_prop=right_prop)
                p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                    data, n_s, n_a)
                Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter, gamma,
                                                n_s, n_a)
                V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                # print("first stage visiting frequency is {}".format(f_n))
                if f_n.all() != 0:
                    break
            datas.append(data)
            Q_ns.append(Q_n)
            I_TM, W_inverse, cov_V_D, I_TM_V, W_inverse_V, cov_V_V_D = inference.get_Sigma_n_comp(
                p_n, f_n, var_r_n, V_n, gamma, n_s, n_a, V_n_max_index)

            # print( np.diag(cov_V_V_D))
            # exit()
            quad_con_vec = np.power(np.dot(initial_w, I_TM_V),
                                    2) * np.diag(cov_V_V_D)
            # print(quad_con_vec)
            # if  not np.all(f_n):
            #    print(f_n)
            #    print(quad_con_vec)
            #    print("need more data for first stage")
            #    exit()
            quad_con_vec_all = np.zeros(n_s * n_a)
            for i in range(n_s):
                quad_con_vec_all[i * n_a + V_n_max_index[i]] = quad_con_vec[i]
            # print(quad_con_vec)

            # print(I_TM_V)
            # print(cov_V_V_D)
            # print(initial_w)
            # Create a new model
            init_v_opt = 1. / (n_a * n_s)
            quad_con_vec_all = matrix(quad_con_vec_all)
            array_quad_con_vec = np.array(quad_con_vec_all).transpose()[0]

            # print(array_quad_con_vec)
            # exit()
            def F(x):
                """Return the coefficient-weighted sum of reciprocals of ``x``.

                Every entry of ``x`` is inverted, scaled by the matching entry
                of ``array_quad_con_vec`` (read from the enclosing scope), and
                the scaled terms are added up into a single scalar.
                """
                weighted_inverse = np.multiply(array_quad_con_vec,
                                               np.divide(1, x))
                return np.sum(weighted_inverse)

            A, b, G, h = construct_contrain_matrix(p_n, n_s, n_a)
            AA = np.array(A)
            bb = np.asarray(b)

            constraints = []

            for i in range(AA.shape[0]):
                constraints.append({
                    'type': 'eq',
                    'fun': lambda x, a, b: np.dot(a, x) - b,
                    'args': (AA[i], bb[i])
                })
            constraints = tuple(constraints)
            bnds = []
            for i in range(n_s * n_a):
                bnds.append((1e-6, None))
                # bnds.append((0.001, None))

            bnds = tuple(bnds)
            initial = np.ones(n_s * n_a) / (n_s * n_a)
            # print(initial)
            res = minimize(F,
                           initial,
                           method='SLSQP',
                           bounds=bnds,
                           constraints=constraints)
            x_opt = res.x
            # print(x_opt)

            opt_val = F(x_opt)

            # ori-Q-OCBA

            def fun(x):
                """Objective handed to ``minimize``: the leading entry ``x[0]``.

                The remaining entries ``x[1:]`` only enter through the
                constraints assembled after this definition.
                """
                return x[0]

            # print("quardratic coeff of opt is {}".format(quad_consts))
            # print("denom consts coef of opt is {}".format(denom_consts))

            quad_consts = np.zeros((n_s, n_a))
            denom_consts = np.zeros((n_s, n_a, n_s * n_a))
            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        minus_op = np.zeros(n_s * n_a)
                        minus_op[i * n_a + j] = 1
                        minus_op[i * n_a + V_n_max_index[i]] = -1
                        c1 = np.power(np.dot(minus_op, I_TM), 2)
                        denom_consts[i][j] = c1 * np.diag(cov_V_D)
                        # print(I_TM, c1)
                        # exit()
                        quad_consts[i][j] = (
                            Q_n[i * n_a + j] -
                            Q_n[i * n_a + V_n_max_index[i]])**2

            constraints = []
            for i in range(n_s):
                for j in range(n_a):
                    if j != V_n_max_index[i]:
                        # print(denom_consts[i][j])
                        if np.max(denom_consts[i][j]) > 1e-5:
                            constraints.append({
                                'type':
                                'ineq',
                                'fun':
                                lambda x, up_c, denom_c: -(np.sum(
                                    np.multiply(denom_c, np.reciprocal(x[1:])))
                                                           ) / up_c + x[0],
                                'args': (quad_consts[i][j], denom_consts[i][j])
                            })

            for i in range(AA.shape[0]):
                constraints.append({
                    'type': 'eq',
                    'fun': lambda x, a, b: np.dot(a, x[1:]) - b,
                    'args': (AA[i], b[i])
                })
            constraints = tuple(constraints)
            bnds = []
            bnds.append((0., None))
            for i in range(n_s * n_a):
                bnds.append((1e-6, 1))
            bnds = tuple(bnds)
            initial = np.ones(n_s * n_a + 1) / (n_s * n_a)

            initial[0] = 0.1
            # print(initial)
            res = minimize(fun,
                           initial,
                           method='SLSQP',
                           bounds=bnds,
                           constraints=constraints)
            x_opt_ori = res.x[1:]
            opts_ori.append(x_opt_ori)
            bench_val = F(x_opt_ori)
            if bench_val > opt_val:
                break
            else:
                print(opt_val)
                print(bench_val)
                print("#####")

        # print(opt_val)
        # print(bench_val)
        # print("#####")

        epsilon = 0.3
        tran_M = optimize_pfs.transition_mat_S_A_epsilon(
            p_n, epsilon, V_n_max_index, n_s, n_a)
        bench_w = compare_var.solveStationary(tran_M)
        bench_w = np.array(bench_w).reshape(-1, )
        # print(bench_w)
        bench_val = F(bench_w)
        # print(bench_val)
        opts.append(x_opt)
        # exit()
    Q_approximation = None
    initial_s_dist = "even"
    if initial_s_dist == "even":
        R_real = np.mean(V_real)
        initial_w = np.ones(n_s) / n_s
        rou = initial_w
    cov_bools_Q = np.zeros(n_s * n_a)
    cov_bools_V = np.zeros(n_s)
    cov_bools_R = 0.
    # print("Q real is {}".format(Q_real))
    # print("V real is {}".format(V_real))
    # print("R real is {}".format(R_real))
    CI_lens_Q = []
    CI_lens_V = []
    CI_lens_R = []
    numerical_tol = 1e-6
    S_0 = None

    for i in range(num_rep):
        x_opt = opts[i]
        second_data = collect_data_swimmer.collect_data(p,
                                                        r,
                                                        num_data_2,
                                                        s_0,
                                                        n_s,
                                                        n_a,
                                                        right_prop=right_prop,
                                                        pi_s_a=x_opt)
        data = second_data + datas[i]
        # data = second_data
        Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
            Q_approximation,
            S_0,
            num_data,
            s_0,
            num_iter,
            gamma,
            Q_0,
            n_s,
            n_a,
            r,
            p,
            initial_w,
            right_prop,
            data=data)
        # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len))
        cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                    Q_real >= (Q_n - CI_len_Q - numerical_tol))
        cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                    V_real >= (V_n - CI_len_V - numerical_tol))
        cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                    R_real >= (R_n - CI_len_R - numerical_tol))
        # print(cov_bool_Q)
        # exit()
        # print(cov_bool)
        cov_bools_Q += cov_bool_Q
        cov_bools_V += cov_bool_V
        cov_bools_R += cov_bool_R
        CI_lens_Q.append(CI_len_Q)
        CI_lens_V.append(CI_len_V)
        CI_lens_R.append(CI_len_R)

    CI_len_Q_mean = np.mean(CI_lens_Q)
    CI_len_V_mean = np.mean(CI_lens_V)
    CI_len_R_mean = np.mean(CI_lens_R)
    CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
    CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
    CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

    cov_rate_Q = np.divide(cov_bools_Q, num_rep)
    cov_rate_V = np.divide(cov_bools_V, num_rep)
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
    cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
    cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
    print("coverage for Q")
    print(cov_rate_Q)
    print(cov_rate_CI_Q)
    print("mean coverage for Q ")
    print(np.mean(cov_rate_Q))
    print(np.mean(cov_rate_CI_Q))
    print("coverage for V")
    print(cov_rate_V)
    print(cov_rate_CI_V)
    print("mean coverage for V")
    print(np.mean(cov_rate_V))
    print(np.mean(cov_rate_CI_V))
    print("coverage for R")
    print(cov_rate_R)
    print(cov_rate_CI_R)
    print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
    print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
    print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))

    # Q-OCBA ori

    CS_num_naive = 0
    cov_bools_Q = np.zeros(n_s * n_a)
    cov_bools_V = np.zeros(n_s)
    cov_bools_R = 0.
    # print("Q real is {}".format(Q_real))
    # print("V real is {}".format(V_real))
    # print("R real is {}".format(R_real))
    CI_lens_Q = []
    CI_lens_V = []
    CI_lens_R = []
    future_V = np.zeros(num_rep)
    for i in range(num_rep):
        x_opt = opts_ori[i]
        # print(x_opt)
        second_data = collect_data_swimmer.collect_data(p,
                                                        r,
                                                        num_data_2,
                                                        s_0,
                                                        n_s,
                                                        n_a,
                                                        right_prop=right_prop,
                                                        pi_s_a=x_opt)

        data = second_data + datas[i]
        p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
            data, n_s, n_a)
        Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
            Q_approximation,
            S_0,
            num_data,
            s_0,
            num_iter,
            gamma,
            Q_0,
            n_s,
            n_a,
            r,
            p,
            initial_w,
            right_prop,
            data=data)
        # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len))
        cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                    Q_real >= (Q_n - CI_len_Q - numerical_tol))
        cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                    V_real >= (V_n - CI_len_V - numerical_tol))
        cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                    R_real >= (R_n - CI_len_R - numerical_tol))
        # print(cov_bool_Q)
        # exit()
        # print(cov_bool)
        cov_bools_Q += cov_bool_Q
        cov_bools_V += cov_bool_V
        cov_bools_R += cov_bool_R
        CI_lens_Q.append(CI_len_Q)
        CI_lens_V.append(CI_len_V)
        CI_lens_R.append(CI_len_R)
        # if not FS_bool_:
        # print(i)
        # print(f_n)
        # print(Q_n)
    PCS_naive = np.float(CS_num_naive) / num_rep
    CI_len = 1.96 * np.sqrt(PCS_naive * (1 - PCS_naive) / num_rep)
    fv = np.mean(future_V)
    fv_std = np.std(future_V)
    rv = np.dot(rou, V_real)
    diff = rv - fv
    # print(CS_num_naive)
    print("Q-OCBA:")
    print("PCS is {}, with CI length {}".format(PCS_naive, CI_len))
    print(
        "future value func is {} with CI length {}, real value is {}, diff is {}"
        .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

    CI_len_Q_mean = np.mean(CI_lens_Q)
    CI_len_V_mean = np.mean(CI_lens_V)
    CI_len_R_mean = np.mean(CI_lens_R)
    CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
    CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
    CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

    cov_rate_Q = np.divide(cov_bools_Q, num_rep)
    cov_rate_V = np.divide(cov_bools_V, num_rep)
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
    cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
    cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
    print("coverage for Q")
    print(cov_rate_Q)
    print(cov_rate_CI_Q)
    print("mean coverage for Q ")
    print(np.mean(cov_rate_Q))
    print(np.mean(cov_rate_CI_Q))
    print("coverage for V")
    print(cov_rate_V)
    print(cov_rate_CI_V)
    print("mean coverage for V")
    print(np.mean(cov_rate_V))
    print(np.mean(cov_rate_CI_V))
    print("coverage for R")
    print(cov_rate_R)
    print(cov_rate_CI_R)
    print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
    print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
    print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))

    #exit()

    epsilons = [0.2]
    for epsilon in epsilons:
        print("epsilon is {}".format(epsilon))
        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        # print("Q real is {}".format(Q_real))
        # print("V real is {}".format(V_real))
        # print("R real is {}".format(R_real))
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        for i in range(num_rep):
            Q_n = Q_ns[i]
            second_data = collect_data_swimmer.collect_data(
                p,
                r,
                num_data_2,
                s_0,
                n_s,
                n_a,
                right_prop=right_prop,
                Q=Q_n,
                epsilon=epsilon,
                print_pro_right=False)
            data = second_data + datas[i]
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation,
                S_0,
                num_data,
                s_0,
                num_iter,
                gamma,
                Q_0,
                n_s,
                n_a,
                r,
                p,
                initial_w,
                right_prop,
                data=data)
            # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len))
            cov_bool_Q = np.logical_and(
                Q_real <= (Q_n + CI_len_Q + numerical_tol), Q_real >=
                (Q_n - CI_len_Q - numerical_tol))
            cov_bool_V = np.logical_and(
                V_real <= (V_n + CI_len_V + numerical_tol), V_real >=
                (V_n - CI_len_V - numerical_tol))
            cov_bool_R = np.logical_and(
                R_real <= (R_n + CI_len_R + numerical_tol), R_real >=
                (R_n - CI_len_R - numerical_tol))
            # print(cov_bool_Q)
            # exit()
            # print(cov_bool)
            cov_bools_Q += cov_bool_Q
            cov_bools_V += cov_bool_V
            cov_bools_R += cov_bool_R
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        CI_len_Q_mean = np.mean(CI_lens_Q)
        CI_len_V_mean = np.mean(CI_lens_V)
        CI_len_R_mean = np.mean(CI_lens_R)
        CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
        CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
        CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

        cov_rate_Q = np.divide(cov_bools_Q, num_rep)
        cov_rate_V = np.divide(cov_bools_V, num_rep)
        cov_rate_R = np.divide(cov_bools_R, num_rep)
        cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
        cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
        cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
        print("coverage for Q")
        print(cov_rate_Q)
        print(cov_rate_CI_Q)
        print("mean coverage for Q ")
        print(np.mean(cov_rate_Q))
        print(np.mean(cov_rate_CI_Q))
        print("coverage for V")
        print(cov_rate_V)
        print(cov_rate_CI_V)
        print("mean coverage for V")
        print(np.mean(cov_rate_V))
        print(np.mean(cov_rate_CI_V))
        print("coverage for R")
        print(cov_rate_R)
        print(cov_rate_CI_R)
        print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean,
                                                     CI_len_Q_ci))
        print("CI len for V CI {} with ci {}".format(CI_len_V_mean,
                                                     CI_len_V_ci))
        print("CI len for R CI {} with ci {}".format(CI_len_R_mean,
                                                     CI_len_R_ci))

    print("RE(0.8)")
    cov_bools_Q = np.zeros(n_s * n_a)
    cov_bools_V = np.zeros(n_s)
    cov_bools_R = 0.
    # print("Q real is {}".format(Q_real))
    # print("V real is {}".format(V_real))
    # print("R real is {}".format(R_real))
    CI_lens_Q = []
    CI_lens_V = []
    CI_lens_R = []
    for i in range(num_rep):
        second_data = collect_data_swimmer.collect_data(p,
                                                        r,
                                                        num_data_2,
                                                        s_0,
                                                        n_s,
                                                        n_a,
                                                        right_prop=right_prop)
        data = second_data + datas[i]
        Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
            Q_approximation,
            S_0,
            num_data,
            s_0,
            num_iter,
            gamma,
            Q_0,
            n_s,
            n_a,
            r,
            p,
            initial_w,
            right_prop,
            data=data)
        # print("{} th replication : Q_n is {}, CI len is {}".format(i, Q_n, CI_len))
        cov_bool_Q = np.logical_and(Q_real <= (Q_n + CI_len_Q + numerical_tol),
                                    Q_real >= (Q_n - CI_len_Q - numerical_tol))
        cov_bool_V = np.logical_and(V_real <= (V_n + CI_len_V + numerical_tol),
                                    V_real >= (V_n - CI_len_V - numerical_tol))
        cov_bool_R = np.logical_and(R_real <= (R_n + CI_len_R + numerical_tol),
                                    R_real >= (R_n - CI_len_R - numerical_tol))
        # print(cov_bool_Q)
        # exit()
        # print(cov_bool)
        cov_bools_Q += cov_bool_Q
        cov_bools_V += cov_bool_V
        cov_bools_R += cov_bool_R
        CI_lens_Q.append(CI_len_Q)
        CI_lens_V.append(CI_len_V)
        CI_lens_R.append(CI_len_R)

    CI_len_Q_mean = np.mean(CI_lens_Q)
    CI_len_V_mean = np.mean(CI_lens_V)
    CI_len_R_mean = np.mean(CI_lens_R)
    CI_len_Q_ci = 1.96 * np.std(CI_lens_Q) / np.sqrt(num_rep)
    CI_len_V_ci = 1.96 * np.std(CI_lens_V) / np.sqrt(num_rep)
    CI_len_R_ci = 1.96 * np.std(CI_lens_R) / np.sqrt(num_rep)

    cov_rate_Q = np.divide(cov_bools_Q, num_rep)
    cov_rate_V = np.divide(cov_bools_V, num_rep)
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    cov_rate_CI_Q = 1.96 * np.sqrt(cov_rate_Q * (1 - cov_rate_Q) / num_rep)
    cov_rate_CI_V = 1.96 * np.sqrt(cov_rate_V * (1 - cov_rate_V) / num_rep)
    cov_rate_CI_R = 1.96 * np.sqrt(cov_rate_R * (1 - cov_rate_R) / num_rep)
    print("coverage for Q")
    print(cov_rate_Q)
    print(cov_rate_CI_Q)
    print("mean coverage for Q ")
    print(np.mean(cov_rate_Q))
    print(np.mean(cov_rate_CI_Q))
    print("coverage for V")
    print(cov_rate_V)
    print(cov_rate_CI_V)
    print("mean coverage for V")
    print(np.mean(cov_rate_V))
    print(np.mean(cov_rate_CI_V))
    print("coverage for R")
    print(cov_rate_R)
    print(cov_rate_CI_R)
    print("CI len for Q CI {} with ci {}".format(CI_len_Q_mean, CI_len_Q_ci))
    print("CI len for V CI {} with ci {}".format(CI_len_V_mean, CI_len_V_ci))
    print("CI len for R CI {} with ci {}".format(CI_len_R_mean, CI_len_R_ci))
def _psrl_chain_transitions(n_s, n_a):
    """Build the flat transition vector of the chain MDP used by ``main``.

    The entry at ``s * n_s * n_a + a * n_s + s_next`` is the probability of
    moving from state ``s`` to ``s_next`` under action ``a``.  Only actions
    0 and 1 are populated, so ``n_a`` must be at least 2:

    * action 0 moves one state down with probability 1 (state 0 stays put);
    * action 1 moves down / stays / moves up with probability 0.1 / 0.6 /
      0.3 in interior states, stays / moves up with 0.7 / 0.3 in state 0,
      and moves down / stays with 0.7 / 0.3 in the last state.
    """
    p = np.zeros(n_s * n_a * n_s)

    def at(s, a, s_next):
        return s * n_s * n_a + a * n_s + s_next

    last = n_s - 1
    p[at(0, 0, 0)] = 1.
    p[at(0, 1, 0)] = 0.7
    p[at(0, 1, 1)] = 0.3
    for s in range(1, last):
        p[at(s, 0, s - 1)] = 1
        p[at(s, 1, s - 1)] = 0.1
        p[at(s, 1, s)] = 0.6
        p[at(s, 1, s + 1)] = 0.3
    p[at(last, 0, last - 1)] = 1
    p[at(last, 1, last - 1)] = 0.7
    p[at(last, 1, last)] = 0.3
    return p


def _psrl_interval_covers(truth, estimate, ci_len, tol):
    """Return (element-wise) whether ``truth`` is inside the interval.

    The interval is ``estimate`` plus/minus ``ci_len``, widened on both
    sides by the numerical tolerance ``tol``.
    """
    return np.logical_and(truth <= (estimate + ci_len + tol),
                          truth >= (estimate - ci_len - tol))


def _psrl_print_coverage_summary(cov_bools_Q, cov_bools_V, cov_bools_R,
                                 CI_lens_Q, CI_lens_V, CI_lens_R, num_rep):
    """Print coverage rates and mean CI lengths accumulated over replications.

    ``cov_bools_*`` are the per-entry counts of replications whose interval
    covered the true value; ``CI_lens_*`` are the per-replication CI lengths.
    Each quantity is printed together with a 1.96-sigma normal-approximation
    half-width computed over the ``num_rep`` replications.
    """
    cov_rate_Q = np.divide(cov_bools_Q, num_rep)
    cov_rate_V = np.divide(cov_bools_V, num_rep)
    cov_rate_R = np.divide(cov_bools_R, num_rep)
    root_rep = np.sqrt(num_rep)

    def binomial_half_width(rate):
        return 1.96 * np.sqrt(rate * (1 - rate) / num_rep)

    # Header strings are kept verbatim (including the trailing space after
    # "mean coverage for Q") so downstream log parsing keeps working.
    vector_sections = (
        ("coverage for Q", "mean coverage for Q ", cov_rate_Q),
        ("coverage for V", "mean coverage for V", cov_rate_V),
    )
    for header, mean_header, rate in vector_sections:
        half_width = binomial_half_width(rate)
        print(header)
        print(rate)
        print(half_width)
        print(mean_header)
        print(np.mean(rate))
        print(np.mean(half_width))
    print("coverage for R")
    print(cov_rate_R)
    print(binomial_half_width(cov_rate_R))

    for label, lens in (("Q", CI_lens_Q), ("V", CI_lens_V), ("R", CI_lens_R)):
        print("CI len for {} CI {} with ci {}".format(
            label, np.mean(lens), 1.96 * np.std(lens) / root_rep))


def main():
    """Run the PSRL-style two-stage experiment on the chain MDP.

    Command-line options: ``--rep`` (replications), ``--episode`` (number of
    second-stage episodes), ``--numdata`` (total samples per replication),
    ``--rightprop`` (first-stage exploration probability passed to the data
    collector) and ``--rstd`` (reward standard deviation).

    For each ``r[0]`` in 1..3 the function computes the true Q/V values,
    then for every replication collects 30% of the data with the
    ``right_prop`` exploration policy, spends the rest in ``episodes``
    equal-sized rounds that follow a Q function sampled from
    ``parameter_prior``, and finally prints the probability of correct
    selection, the value of the resulting policy, and confidence-interval
    coverage / length statistics for Q, V and R.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--rep',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of repetitions')
    parser.add_argument('--episode',
                        nargs="?",
                        type=int,
                        default=100,
                        help='number of episode')
    parser.add_argument('--numdata',
                        nargs="?",
                        type=int,
                        default=1000,
                        help='number of data')
    parser.add_argument('--rightprop',
                        nargs="?",
                        type=float,
                        default=0.6,
                        help='warm start random exploration right probability')
    parser.add_argument('--rstd',
                        nargs="?",
                        type=float,
                        default=1.0,
                        help='standard deviation of reward')

    args = parser.parse_args()

    num_iter, gamma, n_s, n_a, num_rep = 200, 0.95, 5, 2, args.rep
    episodes = args.episode
    Total_data = args.numdata
    right_prop = args.rightprop
    r_std = args.rstd

    # Both values are used as divisors below; fail early with a usage
    # message instead of a ZeroDivisionError in the middle of the run.
    if num_rep < 1:
        parser.error("--rep must be a positive integer")
    if episodes < 1:
        parser.error("--episode must be a positive integer")

    r = np.zeros(n_s * n_a)
    r_vals = range(1, 4)
    r_right = 10.0

    for r0_val in r_vals:
        r[0] = float(r0_val)
        r[-1] = r_right
        print("reward standard deviation is {}".format(r_std))
        # Re-created for every reward setting, as the helpers they are
        # passed to are not visible here and may modify them in place.
        Q_0 = np.zeros(n_s * n_a)
        V_0 = np.zeros(n_s)
        rou = np.ones(n_s) / n_s
        print("r[0] and r[-1] are {}, {}".format(r[0], r[-1]))

        p = _psrl_chain_transitions(n_s, n_a)
        Q_real = Iterative_Cal_Q.cal_Q_val(p, Q_0, r, num_iter, gamma, n_s,
                                           n_a)
        V_real, V_max_index = inference.get_V_from_Q(Q_real, n_s, n_a)
        print("Q real is {}".format(Q_real))
        s_0 = 2

        Q_approximation = None
        # Even initial-state distribution: R is the plain average of V.
        R_real = np.mean(V_real)
        initial_w = np.ones(n_s) / n_s

        cov_bools_Q = np.zeros(n_s * n_a)
        cov_bools_V = np.zeros(n_s)
        cov_bools_R = 0.
        CI_lens_Q = []
        CI_lens_V = []
        CI_lens_R = []
        numerical_tol = 1e-6
        S_0 = None

        ## PSRL data parameter specification
        print("total num of data is {}".format(Total_data))
        # Sample counts must be integers: use floor division so the values
        # match Python 2 semantics and stay ints under Python 3.
        numdata_1 = Total_data * 3 // 10
        seq_if = False
        numdata_2 = Total_data - numdata_1

        print("# of epsisodes is {}".format(episodes))
        # Any remainder of numdata_2 / episodes is dropped, so slightly
        # fewer than Total_data samples may be collected in total.
        data_per_episode = numdata_2 // episodes
        num_datas = [data_per_episode] * episodes
        CS_num = 0.
        future_V = np.zeros(num_rep)

        for i in range(num_rep):
            para_cl = parameter_prior(n_s, n_a, s_0)
            all_data = []
            if not seq_if:
                # Redraw the first-stage data until the empirical
                # frequencies f_n are all non-zero.
                while True:
                    data1 = collect_data_swimmer.collect_data(
                        p,
                        r,
                        numdata_1,
                        s_0,
                        n_s,
                        n_a,
                        right_prop=right_prop,
                        std=r_std)
                    p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                        data1, n_s, n_a)
                    # NOTE(review): Q_n / V_n / V_n_max_index are not read
                    # before being overwritten below; kept only in case the
                    # callees have side effects -- confirm and remove.
                    Q_n = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                    gamma, n_s, n_a)
                    V_n, V_n_max_index = inference.get_V_from_Q(Q_n, n_s, n_a)
                    if f_n.all():
                        break
            else:
                data1 = collect_data_swimmer.collect_data(
                    p,
                    r,
                    data_per_episode,
                    s_0,
                    n_s,
                    n_a,
                    right_prop=right_prop,
                    std=r_std)
            all_data += data1
            para_cl.update(data1, r_sigma=r_std)
            Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # Second stage: act greedily (epsilon=0) on the sampled Q,
            # then update the prior and resample after every episode.
            for num_data in num_datas:
                data = collect_data_swimmer.collect_data(p,
                                                         r,
                                                         num_data,
                                                         para_cl.s_0,
                                                         n_s,
                                                         n_a,
                                                         Q=Q_estimate,
                                                         epsilon=0,
                                                         std=r_std)
                all_data += data
                para_cl.update(data, r_sigma=r_std)
                Q_estimate = para_cl.sampled_MDP_Q(Q_0, num_iter, gamma)

            # The final estimate is built from the empirical model of all
            # collected data, not from the last sampled Q.
            p_n, r_n, f_n, var_r_n = cal_impirical_r_p.cal_impirical_stats(
                all_data, n_s, n_a)
            Q_estimate = Iterative_Cal_Q.cal_Q_val(p_n, Q_0, r_n, num_iter,
                                                   gamma, n_s, n_a)
            V_here = optimize_pfs.policy_val_iteration(Q_estimate, n_s, n_a,
                                                       V_0, num_iter, r, p,
                                                       gamma)
            future_V[i] = np.dot(rou, V_here)
            FS_bool = optimize_pfs.FS_bool(Q_estimate, V_max_index, n_s, n_a)
            CS_num += FS_bool

            # 5.3
            Q_n, CI_len_Q, V_n, CI_len_V, R_n, CI_len_R = inference.get_CI(
                Q_approximation,
                S_0,
                Total_data,
                s_0,
                num_iter,
                gamma,
                Q_0,
                n_s,
                n_a,
                r,
                p,
                initial_w,
                right_prop,
                data=all_data)
            cov_bools_Q += _psrl_interval_covers(Q_real, Q_n, CI_len_Q,
                                                 numerical_tol)
            cov_bools_V += _psrl_interval_covers(V_real, V_n, CI_len_V,
                                                 numerical_tol)
            cov_bools_R += _psrl_interval_covers(R_real, R_n, CI_len_R,
                                                 numerical_tol)
            CI_lens_Q.append(CI_len_Q)
            CI_lens_V.append(CI_len_V)
            CI_lens_R.append(CI_len_R)

        # Builtin float: the np.float alias was removed in NumPy 1.24.
        PCS = float(CS_num) / num_rep
        CI_len = 1.96 * np.sqrt(PCS * (1 - PCS) / num_rep)
        fv = np.mean(future_V)
        fv_std = np.std(future_V)
        rv = np.dot(rou, V_real)
        diff = rv - fv
        print("PCS is {}, with CI length {}".format(PCS, CI_len))
        print(
            "future value func is {} with  CI length {}, real value is {}, diff is {}"
            .format(fv, 1.96 * fv_std / np.sqrt(num_rep), rv, diff))

        _psrl_print_coverage_summary(cov_bools_Q, cov_bools_V, cov_bools_R,
                                     CI_lens_Q, CI_lens_V, CI_lens_R, num_rep)