# Example no. 1
def main():
    """Estimate Q-values for the LQR environment with an RBF basis.

    Trains an LSPI agent on pre-collected samples for a fixed policy gain L,
    then, for several states, plots the estimated Q-value over a grid of
    actions next to the analytically true Q-value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="5000",
                        choices=["2000", "5000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--reg_opt',
                        default="l2",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    parser.add_argument('--L', default=0.1,
                        type=float)  # policy gain whose Q-function is estimated

    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])

    # estimate Q for this specific policy gain L
    L = np.matrix(params['L'])
    params['policy'] = RBFPolicy4LQR(params['basis_func'], L)

    # set the parameters for agent
    batch_size = params['sample_max_steps']

    agent = LSPIAgent(params)
    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # FIX: the pickle file handle was previously opened and never closed;
    # a context manager guarantees it is released.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    samples = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(samples[0])))
    error_list, new_weights = agent.train(samples)

    # Compare estimated vs. true Q over a grid of actions, for several states.
    # FIX: the loop variable was unconditionally overwritten with -1.0
    # (leftover debug line), which made every iteration identical; the
    # override has been removed so each state in the range is actually used.
    for si in range(-10, 10, 5):
        state = np.matrix(float(si))
        actions = np.linspace(-6, 6, 100)

        # Estimated Q over the whole action grid in one vectorized call.
        q_estimate_his = agent.policy.q_state_action_func(
            np.full(len(actions), state), actions)
        # Analytic Q value for each action under policy gain L.
        q_true_his = [
            env.true_Qvalue(L, gamma, state, np.matrix(a)) for a in actions
        ]

        true_weights_scala = env.true_weights_scala(L, gamma)
        print("true_weights_scala: {}".format(true_weights_scala))
        estimate_weights = agent.policy.weights
        print("estimate_weights: {}".format(estimate_weights))
        true_estimate_error = np.linalg.norm(true_weights_scala -
                                             estimate_weights)
        print("true_estimate_error: {}".format(true_estimate_error))

        # Index of the maximizing action under each Q function.
        qe_index = np.argmax(q_estimate_his)
        qt_index = np.argmax(q_true_his)

        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        ax = plt.gca()
        plt.plot(actions, q_estimate_his)
        plt.scatter(actions[qe_index], q_estimate_his[qe_index], c='r')
        plt.xlabel('actions')
        plt.ylabel('q value')
        plt.title('estimate q value')
        ax.xaxis.set_label_coords(1.02, -0.035)
        plt.subplot(212)
        plt.plot(actions, q_true_his)
        plt.scatter(actions[qt_index], q_true_his[qt_index], c='r')
        plt.title('true q value')
        plt.show()

    env.close()
    replay_buffer.reset()
def main():
    """Run LSPI policy iteration on the LQR environment with the exact basis.

    Trains on pre-collected samples, then repeatedly rolls out the current
    policy, re-derives the policy gain L from the quadratic Q weights, and
    finally plots the estimated L against the analytically optimal one.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name',
                        default="LQR",
                        choices=[
                            "cliff-v0", "CartPole-v0", "inverted_pedulum",
                            "LQR", "chain"
                        ])  # gym env to train
    parser.add_argument('--episode_num', default=10, type=int)
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=10, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="5000",
                        choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--update_freq', default=10000000, type=int)
    parser.add_argument('--L', default=0.1,
                        type=float)  # initial policy gain
    parser.add_argument('--reg_opt',
                        default="none",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.01, type=float)

    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['basis_func'] = ExactBasis4LQR()
    params['sample_max_steps'] = int(params['sample_max_steps'])
    gamma = params['weight_discount']
    # Note: the policy is initialized with a specific L; actions depend on
    # this initial L until it is replaced inside the loop below.
    L = np.matrix(params['L'])
    params['policy'] = ExactPolicy4LQR(params['basis_func'], L)

    # set the parameters for agent
    batch_size = params['batch_size']
    n_episode = params['episode_num']

    agent = LSPIAgent(params)

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # FIX: the sample file was previously opened and never closed;
    # a context manager guarantees release of the handle.
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    # training to get weights -> best L
    sample = replay_buffer.sample(batch_size)
    error_list, new_weights = agent.train(sample)

    # log
    reward_his = []
    estimateL_his = []
    for i_episode in range(n_episode):
        state = env.reset()
        i_episode_steps = 0
        accu_reward = 0
        # LQR never signals done, so each episode is capped at a fixed
        # number of steps.
        while True:
            i_episode_steps += 1
            action = agent.get_action(state)
            state_, reward, done, info = env.step(action[0])
            accu_reward += reward
            state = state_
            if i_episode_steps > 20:
                reward_his.append(accu_reward)
                time.sleep(0.1)
                break
        # Derive the greedy policy gain from the true quadratic Q weights
        # for the current policy: argmax_a Q(s, a) yields L = w4 / (2 * w3).
        true_weights = env.true_weights_scala(agent.policy.L, gamma)
        w3 = true_weights[2].item()
        w4 = true_weights[3].item()
        estimateL = np.matrix(w4 / (2 * w3))
        estimateL_his.append(estimateL.item())
        agent.policy.L = estimateL
        print("estimateL: {}".format(estimateL))
        agent.train(sample)

    trueL = env.optimal_policy_L(gamma).item()
    print("trueL: {}".format(trueL))
    # FIX: the history was previously passed as a second positional argument
    # to print() instead of being substituted into the "{}" placeholder.
    print("estimateL_his: {}".format(estimateL_his))
    env.close()
    replay_buffer.reset()

    # plot estimated vs. optimal L across iterations
    plt.plot(np.arange(n_episode), estimateL_his, label='estimate L')
    plt.plot(np.arange(n_episode), [trueL] * n_episode, label='optimal L')
    plt.ylabel('L')
    plt.xlabel('iteration')
    plt.legend(loc='upper right')
    plt.show()