def main():
    """Train an LSPI agent with RBF features on stored LQR samples and test it.

    Parses the hyper-parameters from the command line, builds the LQR
    environment, an RBF basis and the matching policy, loads a pickled
    replay buffer, trains the agent on one batch drawn from it and finally
    runs ``test_agent4LQR`` on the trained agent.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="2000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # NOTE(review): --L is not read in this function; it only reaches the
    # agent through `params`. Its meaning should be confirmed there.
    parser.add_argument('--L', default=0.1, type=float)

    args = parser.parse_args()
    # `params` is the argparse namespace's own dict; it is extended below and
    # handed to the agent as its full configuration.
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    params['basis_func'] = RBF_LQR(
        [params['state_dim'], params['n_actions']], n_features, params['rbf_sigma'])
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    # the batch size equals the requested number of stored samples
    batch_size = params['sample_max_steps']

    agent = LSPIAgent(params)
    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))
    agent.train(sample)

    test_agent4LQR(agent, env, gamma, ifshow=False)
    # clean
    env.close()
# Example #2
# 0
def main():
    """Train a Bellman agent with a Laplace basis on stored LQR samples and test it.

    Parses the hyper-parameters, builds the LQR environment, loads a pickled
    replay buffer, sizes the Laplace basis from the magnitude of the sampled
    states and actions, trains the agent on the batch and runs
    ``test_agent4LQR`` on the result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=30, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="2000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt', default="wl1", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)  # for l1 and l2
    parser.add_argument('--rbf_sigma', default=0.01, type=float)  # for RBFs
    # NOTE(review): --L is not read in this function; it only reaches the
    # agent through `params`. Its meaning should be confirmed there.
    parser.add_argument('--L', default=0.1, type=float)

    args = parser.parse_args()
    # `params` is extended below and handed to the agent as its configuration.
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    # real feature = n_feature**(dim of state + dim of action)
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    batch_size = params['sample_max_steps']
    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))

    # Per-dimension largest |state| plus the largest |action| in the batch,
    # enlarged by 10%, is passed to the Laplace basis as its second argument.
    state_extent = np.max(np.abs(sample[0]), axis=0).flatten()
    action_extent = np.max(np.abs(sample[1]))
    L_vec = 1.1 * np.concatenate((state_extent, [action_extent]))
    params['basis_func'] = Laplace_LQR(n_features, L_vec)
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    agent = BellmanAgent(params, n_iter_max=15)
    agent.train(sample)

    test_agent4LQR(agent, env, gamma, ifshow=False)
    # clean
    env.close()
# Example #3
# 0
def main():
    """Fit RBF features to the true LQR Q-function by linear regression and plot the fit.

    Steps:
      1. Load stored LQR samples and evaluate the RBF basis on them.
      2. Compute the true Q-value of every sample under the linear policy
         with gain ``L`` and fit ``LinearRegression`` from features to Q.
      3. Plot true vs. estimated Q along the policy (action = -L * state)
         for states in [-10, 10].
      4. Plot true vs. estimated Q over actions in [-6, 6] for state -1.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): --env_name is parsed but the environment below is always LQREnv.
    parser.add_argument('--env_name',
                        default="LQR",
                        choices=[
                            "cliff-v0", "CartPole-v0", "inverted_pedulum",
                            "LQR", "chain"
                        ])  # gym env to train
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=50, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="2000",
                        choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--L', default=0.1,
                        type=float)  # gain of the evaluated policy: action = -L * state
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    args = parser.parse_args()
    params = vars(args)

    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    n_features = params['basis_function_dim']
    gamma = params['weight_discount']

    # NOTE(review): the RBF width is hard-coded to 0.001 here.
    basis_function = RBF_LQR([params['state_dim'], params['n_actions']],
                             n_features, 0.001)
    params['basis_func'] = basis_function

    # evaluate the policy with this specific gain L
    L = np.matrix(params['L'])

    # use all the samples in buffer
    batch_size = params['sample_max_steps']

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    samples = replay_buffer.sample(batch_size)

    # ---- samples -> (features, true Q value) regression data -------------
    states = samples[0]
    actions = samples[1]

    phi_list = np.array(basis_function.evaluate(states, actions))

    qTrue_list = []
    for i, (sample_state, sample_action) in enumerate(zip(states, actions)):
        print("i: {}".format(i))
        qTrue_list.append(
            env.true_Qvalue(L, gamma, sample_state, sample_action))
    qTrue_list = np.array(qTrue_list)

    reg = LinearRegression().fit(phi_list, qTrue_list)

    # ---- Q along the policy for a range of states ------------------------
    state_low = -10.0
    state_high = 10.0
    states = np.linspace(state_low, state_high, 100)
    q_estimate_his = reg.predict(
        basis_function.evaluate(states, -L.item() * states))

    actions = []
    q_true_his = []
    for value in states:
        state = np.matrix(value)
        action = -L * state
        actions.append(action.item())
        q_true_his.append(env.true_Qvalue(L, gamma, state, action))

    # Output directory for the estimates (the pickle dumps themselves are
    # currently disabled). makedirs also creates missing parents and does not
    # complain when the directory already exists.
    dirname = "data/Regression/states[" + str(state_low) + "," + str(
        state_high) + "]/"
    try:
        os.makedirs(dirname, exist_ok=True)
    except OSError as error:
        # best effort: nothing below depends on the directory
        print(error)

    # plot
    plt.figure(figsize=(10, 10))
    # Figure-level title; plt.title here would attach to an implicit axes
    # created before the first subplot.
    plt.suptitle('state from {} to {} and action(-L*state)'.format(
        state_low, state_high))
    plt.subplot(411)
    plt.plot(states)
    plt.title('state')

    plt.subplot(412)
    plt.plot(actions)
    plt.title('actions')

    plt.subplot(413)
    plt.plot(states, q_true_his)
    plt.title('true Q')

    plt.subplot(414)
    plt.plot(states, q_estimate_his)
    plt.title('estimate Q')
    plt.show()

    # ---- Q over a range of actions for one specific state ----------------
    state = np.matrix(-1.)
    actions = np.linspace(-6, 6, 100)

    # note .item() only for one element
    q_estimate_his = reg.predict(
        basis_function.evaluate(np.full(len(actions), state.item()), actions))
    q_true_his = [
        env.true_Qvalue(L, gamma, state, np.matrix(value))
        for value in actions
    ]

    dirname = "data/Estimation/state=" + str(state.item()) + "/"
    try:
        os.makedirs(dirname, exist_ok=True)
    except OSError as error:
        # best effort: nothing below depends on the directory
        print(error)

    print("q_estimate_his: {}".format(q_estimate_his))
    plt.figure(figsize=(8, 6))
    plt.subplot(211)
    plt.plot(actions, q_estimate_his)
    plt.title('q estimate')

    plt.subplot(212)
    plt.plot(actions, q_true_his)
    plt.title('q true')
    plt.show()

    env.close()
    replay_buffer.reset()
# Example #4
# 0
def main():
    """Train LSPI with RBF features for a fixed gain L and compare estimated vs. true Q.

    After training on stored LQR samples, the estimated Q-function of the
    agent's policy and the environment's true Q-function are evaluated over
    actions in [-6, 6] for one state and plotted, with the arg-max of each
    curve marked in red. The distance between the true and the estimated
    weights is printed as well.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="5000",
                        choices=["2000", "5000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--reg_opt',
                        default="l2",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # Gain handed to the policy and to env.true_Qvalue below
    # (presumably action = -L * state; confirm against the policy class).
    parser.add_argument('--L', default=0.1, type=float)

    args = parser.parse_args()
    # `params` is extended below and handed to the agent as its configuration.
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])

    # estimate for this specific L
    L = np.matrix(params['L'])
    params['policy'] = RBFPolicy4LQR(params['basis_func'], L)

    # set the parameters for agent
    batch_size = params['sample_max_steps']

    agent = LSPIAgent(params)
    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    samples = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(samples[0])))
    agent.train(samples)

    # for specific state, over a range of actions
    for si in range(-10, 10, 5):
        # NOTE(review): the loop value is overwritten, so every one of the
        # four iterations evaluates state -1.0 and shows the same plot.
        # Looks like a debugging leftover; kept as-is because the intended
        # behaviour (sweep states vs. single state) is not clear from here.
        si = -1.0

        state = np.matrix(si)
        actions = np.linspace(-6, 6, 100)

        q_estimate_his = agent.policy.q_state_action_func(
            np.full(len(actions), state), actions)
        q_true_his = [
            env.true_Qvalue(L, gamma, state, np.matrix(value))
            for value in actions
        ]

        true_weights_scala = env.true_weights_scala(L, gamma)
        print("true_weights_scala: {}".format(true_weights_scala))
        estimate_weights = agent.policy.weights
        print("estimate_weights: {}".format(estimate_weights))
        true_estimate_error = np.linalg.norm(true_weights_scala -
                                             estimate_weights)
        print("true_estimate_error: {}".format(true_estimate_error))

        # indices of the maximising action on each curve
        qe_index = np.argmax(q_estimate_his)
        qt_index = np.argmax(q_true_his)

        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        ax = plt.gca()
        plt.plot(actions, q_estimate_his)
        plt.scatter(actions[qe_index], q_estimate_his[qe_index], c='r')
        plt.xlabel('actions')
        plt.ylabel('q value')
        plt.title('estimate q value')
        # move the x label to the right end of the axis
        ax.xaxis.set_label_coords(1.02, -0.035)
        plt.subplot(212)
        plt.plot(actions, q_true_his)
        plt.scatter(actions[qt_index], q_true_his[qt_index], c='r')

        plt.title('true q value')
        plt.show()

    env.close()
    replay_buffer.reset()
def main():
    """Train LSPI with RBF features on a 2-D LQR problem and compare policies.

    After training on stored samples, the agent's best action is evaluated
    on 500 states spaced evenly between -10 and 10 in every state dimension
    and compared with the action of the optimal linear policy
    ``-trueL * state``. The estimated actions are pickled and the
    comparison plot is saved as a PNG under ``data/LQR2D/<date>/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=100, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps',
                        default="10000",
                        choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt',
                        default="l2",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # NOTE(review): --L is not read in this function; it only reaches the
    # agent through `params`. Its meaning should be confirmed there.
    parser.add_argument('--L', default=0.1, type=float)

    args = parser.parse_args()
    # `params` is extended below and handed to the agent as its configuration.
    params = vars(args)

    # env: explicit matrices for the 2-D problem, passed to LQREnv by keyword
    A = np.matrix([[0.9, 0.], [0.08, 0.9]])
    B = np.matrix([[0.1], [0.6]])
    Z1 = np.matrix([[1, 0], [0, 0]])
    Z2 = 0.1
    noise_cov = np.matrix([[1, 0], [0, 1]])
    env = LQREnv(A=A, B=B, Z1=Z1, Z2=Z2, noise_cov=noise_cov)
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR2D_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])

    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    # set the parameters for agent
    batch_size = params['sample_max_steps']

    agent = LSPIAgent(params)
    sample_filename = LQR2D_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))
    agent.train(sample)

    # 500 evaluation states; row i holds the same value in all env.m dimensions
    states = np.linspace([-10] * env.m, [10] * env.m, 500)
    trueL = env.optimal_policy_L(gamma)

    # action of the optimal linear policy for every evaluation state
    actions_true = [
        (-trueL * np.matrix(row.reshape(env.m, 1))).item() for row in states
    ]
    actions_estimate = agent.policy.get_best_action(states)

    # One clock reading for both stamps, so the date folder and the time
    # suffix cannot disagree around midnight.
    stamp = time.localtime(time.time())
    now = time.strftime("%Y-%m-%d", stamp)
    now2 = time.strftime("%H_%M_%S", stamp)
    path = "data/LQR2D/" + now
    os.makedirs(path, exist_ok=True)

    # common stem of the data file and the figure file
    stem = path + "/data-" + str(params['reg_opt']) + "-" + str(
        params['reg_param']) + "-BF" + str(n_features) + "-" + params[
            'basis_func'].name() + "-" + now2

    # save the estimated actions
    with open(stem + ".pkl", 'wb') as f:
        pickle.dump(actions_estimate, f)

    # plot
    plt.plot(states, actions_estimate, label='estimate')
    plt.plot(states, actions_true, label='true')
    plt.legend(loc='upper right')
    plt.savefig(stem + ".png", dpi=300)
    # clean
    env.close()
def main():
    """Iterate the policy gain L with exact LQR features and plot its convergence.

    An LSPI agent is trained once on stored samples. Then, for every
    episode, the agent is rolled out for a fixed number of steps, a new gain
    is derived from the environment's true weights for the current gain
    (``w4 / (2 * w3)``), stored on the policy, and the agent is retrained on
    the same batch. The history of gains is plotted against the optimal
    gain reported by the environment.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): --env_name is parsed but the environment below is always LQREnv.
    parser.add_argument('--env_name',
                        default="LQR",
                        choices=[
                            "cliff-v0", "CartPole-v0", "inverted_pedulum",
                            "LQR", "chain"
                        ])  # gym env to train
    parser.add_argument('--episode_num', default=10, type=int)
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=10, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="5000",
                        choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--update_freq', default=10000000, type=int)
    parser.add_argument('--L', default=0.1,
                        type=float)  # initial gain of the policy
    parser.add_argument('--reg_opt',
                        default="none",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.01, type=float)

    args = parser.parse_args()
    # `params` is extended below and handed to the agent as its configuration.
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['basis_func'] = ExactBasis4LQR()
    # The CLI value is a string (restricted by `choices`); it is used as the
    # int key into LQR_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])
    gamma = params['weight_discount']
    # Note: the policy is initialised with a specific L, so its actions
    # depend on that value. Remember to update L!
    L = np.matrix(params['L'])
    params['policy'] = ExactPolicy4LQR(params['basis_func'], L)

    # set the parameters for agent
    batch_size = params['batch_size']
    n_episode = params['episode_num']

    agent = LSPIAgent(params)

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    # training to get weights -> best L
    sample = replay_buffer.sample(batch_size)
    agent.train(sample)

    # LQR never signals `done`, so every episode is cut after a fixed number
    # of steps.
    steps_per_episode = 21

    # log
    reward_his = []
    estimateL_his = []
    for _ in range(n_episode):
        state = env.reset()
        accu_reward = 0
        for _ in range(steps_per_episode):
            action = agent.get_action(state)
            state, reward, _done, _info = env.step(action[0])
            accu_reward += reward
        reward_his.append(accu_reward)
        time.sleep(0.1)

        # use true Q/weights in this L to check whether it converges to the
        # optimal one
        true_weights = env.true_weights_scala(agent.policy.L, gamma)
        w3 = true_weights[2].item()
        w4 = true_weights[3].item()
        estimateL = np.matrix(w4 / (2 * w3))
        estimateL_his.append(estimateL.item())
        agent.policy.L = estimateL
        print("estimateL: {}".format(estimateL))
        agent.train(sample)

    trueL = env.optimal_policy_L(gamma).item()
    print("trueL: {}".format(trueL))
    # format the history into the message instead of printing a literal "{}"
    print("estimateL_his: {}".format(estimateL_his))
    env.close()
    replay_buffer.reset()

    # plot
    plt.plot(np.arange(n_episode), estimateL_his, label='estimate L')
    plt.plot(np.arange(n_episode), [trueL] * n_episode, label='optimal L')
    plt.ylabel('L')
    plt.xlabel('iteration')
    plt.legend(loc='upper right')
    plt.show()
def main():
    """Train a Bellman agent with a Laplace basis on a 2-D LQR problem and test it.

    Parses the hyper-parameters, builds the 2-D LQR environment from explicit
    matrices, loads a pickled replay buffer, sizes the Laplace basis from the
    magnitude of the sampled states and actions, trains the agent on the
    batch and runs ``test_agent4LQR2D`` on the result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=20, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="5000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--reg_opt', default="wl1", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # NOTE(review): --L is not read in this function; it only reaches the
    # agent through `params`. Its meaning should be confirmed there.
    parser.add_argument('--L', default=0.1, type=float)

    args = parser.parse_args()
    # `params` is extended below and handed to the agent as its configuration.
    params = vars(args)

    # env: explicit matrices for the 2-D problem, passed to LQREnv by keyword
    # present state[1]
    A = np.matrix([[0.9, 0.], [0.1, 0.9]])
    B = np.matrix([[1], [0.]])
    Z1 = np.matrix([[0, 0], [0, 1]])
    Z2 = 0.1
    noise_cov = np.matrix([[0.01, 0], [0, 0.01]])

    env = LQREnv(A=A, B=B, Z1=Z1, Z2=Z2, noise_cov=noise_cov)
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    # The CLI value is a string (restricted by `choices`); it is used as an
    # int for the batch size and as the key into LQR2D_samples_filename.
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']

    batch_size = params['sample_max_steps']
    sample_filename = LQR2D_samples_filename[params['sample_max_steps']]
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at sample files you generated yourself.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))

    # Per-dimension largest |state| plus the largest |action| in the batch,
    # doubled, is passed to the Laplace basis as its second argument.
    state_extent = np.max(np.abs(sample[0]), axis=0).flatten()
    action_extent = np.max(np.abs(sample[1]))
    L_vec = 2 * np.concatenate((state_extent, [action_extent]))
    params['basis_func'] = Laplace_LQR(n_features, L_vec)

    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    agent = BellmanAgent(params)
    agent.train(sample)

    test_agent4LQR2D(agent, env, gamma, ifshow=False)

    env.close()