コード例 #1
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name',
                        default="LQR",
                        choices=[
                            "cliff-v0", "CartPole-v0", "inverted_pedulum",
                            "LQR", "chain"
                        ])  # gym env to train
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=50, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="2000",
                        choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    args = parser.parse_args()
    params = vars(args)

    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    # basis_function = ExactBasis4LQR()

    basis_function = RBF_LQR([params['state_dim'], params['n_actions']],
                             n_features, 0.001)
    params['basis_func'] = basis_function

    # esitimate specific L
    L = np.matrix(params['L'])

    # set the parameters for agent
    # use all the samples in buffer
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename[-22]
    f = open(sample_filename, 'rb')
    replay_buffer = pickle.load(f)
    samples = replay_buffer.sample(batch_size)

    # samples to true Q value
    phi_list = []
    qTrue_list = []

    states = samples[0]
    actions = samples[1]
    rewards = samples[2]
    next_states = samples[3]
    dones = samples[4]

    phi_list = basis_function.evaluate(states, actions)

    for i in range(len(states)):
        print("i: {}".format(i))
        s = states[i]
        # print("state: {}".format(state))
        # print("action: {}".format(action))
        qTrue = env.true_Qvalue(L, gamma, states[i], actions[i])
        # print("qTrue: {}".format(qTrue))
        qTrue_list.append(qTrue)

    phi_list = np.array(phi_list)
    # print("phi_list shape: {}".format(phi_list.shape))
    # print("phi_list: {}".format(phi_list[:10]))
    qTrue_list = np.array(qTrue_list)
    # print("qTrue_list: {}".format(qTrue_list[:10]))
    # print("qTrue_list shape: {}".format(qTrue_list.shape))
    reg = LinearRegression().fit(phi_list, qTrue_list)
    # print("reg.get_params(): {}".format(reg.get_params()))

    # for state range
    state_low = -10.0
    state_high = 10.0
    states = np.linspace(state_low, state_high, 100)
    actions = []
    # true_weights_his = []
    true_estimate_error_history = []
    q_true_his = []
    q_estimate_his = reg.predict(
        basis_function.evaluate(states, -L.item() * states))

    for i in range(len(states)):
        state = np.matrix(states[i])
        action = -L * state
        actions.append(action.item())
        q_true = env.true_Qvalue(L, gamma, state, action)
        # q_state = env.true_Qvalue_state(L, gamma, state)
        q_true_his.append(q_true)

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))

    # save estimate data to file
    dirname = "data/Regression/states[" + str(state_low) + "," + str(
        state_high) + "]/"
    try:
        os.mkdir(dirname)
    except OSError as error:
        print(error)
    # # q_true
    # filename = dirname + "q_true.pickle"
    # f = open(filename, 'wb')
    # pickle.dump(q_true_his, f)
    # f.close()

    # estimate
    # if params['basis_func'].name()[:3]=='RBF':
    # 	filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
    # else:
    # 	filename = dirname + params['basis_func'].name()+".pickle"
    # f1 = open(filename, 'wb')
    # pickle.dump(q_estimate_his, f1)
    # f1.close()

    # plot
    plt.figure(figsize=(10, 10))
    plt.title('state from -2 to 2 and action(-L*state)')
    plt.subplot(411)
    plt.plot(states)
    plt.title('state')

    plt.subplot(412)
    plt.plot(actions)
    plt.title('actions')
    # plt.show()

    plt.subplot(413)
    plt.plot(states, q_true_his)
    plt.title('true Q')
    # plt.show()

    plt.subplot(414)
    plt.plot(states, q_estimate_his)
    plt.title('estimate Q')
    # plt.savefig(now+"q_true&estimate-state(-2,2)")
    plt.show()

    # for specific state
    # range of action
    state = np.matrix(-1.)
    actions = np.linspace(-6, 6, 100)
    q_true_his = []

    q_estimate_his = reg.predict(
        basis_function.evaluate(np.full(len(actions), state), actions))
    for i in range(len(actions)):
        action = np.matrix(actions[i])
        # print("q_estimate: {}".format(q_estimate))
        q_true = env.true_Qvalue(L, gamma, state, action)
        # print("q_true: {}".format(q_true))
        q_true_his.append(q_true)

    # true_weights_scala = env.true_weights_scala(L, gamma)
    # print("true_weights_scala: {}".format(true_weights_scala))
    # estimate_weights = agent.policy.weights
    # print("estimate_weights: {}".format(estimate_weights))
    # true_estimate_error = np.linalg.norm(true_weights_scala-estimate_weights)
    # print("true_estimate_error: {}".format(true_estimate_error))

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))

    # save data to file
    # note .item() only for one element
    dirname = "data/Estimation/state=" + str(state.item()) + "/"
    try:
        os.mkdir(dirname)
    except OSError as error:
        print(error)

    # # save q_true
    # filename = dirname + "q_true.pickle"
    # f = open(filename, 'wb')
    # pickle.dump(q_true_his, f)
    # f.close()
    # save q_estimate

    # if params['basis_func'].name()[:3] == 'RBF':
    # 	filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
    # else:
    # 	filename = dirname + params['basis_func'].name()+".pickle"
    # f1 = open(filename, 'wb')
    # pickle.dump(q_estimate_his, f1)
    # f1.close()

    print("q_estimate_his: {}".format(q_estimate_his))
    plt.figure(figsize=(8, 6))
    plt.subplot(211)
    plt.plot(actions, q_estimate_his)
    plt.title('q estimate')

    plt.subplot(212)
    plt.plot(actions, q_true_his)
    plt.title('q true')
    # plt.savefig("images/rbf-lqr/"+str(n_features)+"-"+now+"q_true&estimate-action(-1,1)")
    plt.show()

    env.close()
    replay_buffer.reset()
コード例 #2
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99,
                        type=float)  # note: 1.0 only for finite
    parser.add_argument('--exploration', default=0.1,
                        type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps',
                        default="5000",
                        choices=["2000", "5000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--reg_opt',
                        default="l2",
                        choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1,
                        type=float)  # 0.0 means no random action

    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])

    # esitimate specific L
    L = np.matrix(params['L'])

    # params['policy'] = ExactPolicy4LQR(params['basis_func'], L)
    params['policy'] = RBFPolicy4LQR(params['basis_func'], L)
    # set the parameters for agent
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']

    agent = LSPIAgent(params)
    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    f = open(sample_filename, 'rb')
    replay_buffer = pickle.load(f)

    samples = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(samples[0])))
    error_list, new_weights = agent.train(samples)

    # for specific state
    # range of action
    for si in range(-10, 10, 5):
        si = -1.0
        true_estimate_error_history = []
        q_true_his = []
        q_estimate_his = []

        state = np.matrix(si)
        actions = np.linspace(-6, 6, 100)

        q_estimate_his = agent.policy.q_state_action_func(
            np.full(len(actions), state), actions)
        for i in range(len(actions)):
            action = np.matrix(actions[i])
            # q_estimate = agent.policy.q_state_action_func(state, action)[0]
            # q_estimate_his.append(q_estimate)
            # print("q_estimate: {}".format(q_estimate))
            q_true = env.true_Qvalue(L, gamma, state, action)
            # print("q_true: {}".format(q_true))
            q_true_his.append(q_true)

        true_weights_scala = env.true_weights_scala(L, gamma)
        print("true_weights_scala: {}".format(true_weights_scala))
        estimate_weights = agent.policy.weights
        print("estimate_weights: {}".format(estimate_weights))
        true_estimate_error = np.linalg.norm(true_weights_scala -
                                             estimate_weights)
        print("true_estimate_error: {}".format(true_estimate_error))

        # now = time.strftime("%Y-%m-%d-%H_%M_%S",time.localtime(time.time()))

        # save data to file
        # note .item() only for one element
        # dirname = "data/Estimation/state=" + str(state.item())+"/"
        # try:
        # 	os.mkdir(dirname)
        # except OSError as error:
        # 	print(error)

        # # save q_true
        # filename = dirname + "q_true.pickle"
        # f = open(filename, 'wb')
        # pickle.dump(q_true_his, f)
        # f.close()
        # save q_estimate

        # if params['basis_func'].name()[:3] == 'RBF':
        # 	filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
        # else:
        # 	filename = dirname + params['basis_func'].name()+".pickle"
        # f1 = open(filename, 'wb')
        # pickle.dump(q_estimate_his, f1)
        # f1.close()

        qe_index = np.argmax(q_estimate_his)
        qt_index = np.argmax(q_true_his)

        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        ax = plt.gca()
        plt.plot(actions, q_estimate_his)
        plt.scatter(actions[qe_index], q_estimate_his[qe_index], c='r')
        plt.xlabel('actions')
        plt.ylabel('q value')
        plt.title('estimate q value')
        ax.xaxis.set_label_coords(1.02, -0.035)
        plt.subplot(212)
        plt.plot(actions, q_true_his)
        plt.scatter(actions[qt_index], q_true_his[qt_index], c='r')

        plt.title('true q value')
        # plt.savefig("images/rbf-lqr/"+str(n_features)+"-"+now+"q_true&estimate-action(-1,1)")
        plt.show()

    env.close()
    replay_buffer.reset()