Example #1
def sample_one_reward(theta, env, num_actions):
    this_trajectory_reward = []
    this_trajectory_grads = []

    #first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, num_actions)

    for time_index in range(0, 200):
        #compute the action distribution for the current observation
        action_distribution = utils.compute_action_distribution(
            theta, current_feature)
        #sample an action from that distribution
        action = np.random.choice(num_actions, 1, p=action_distribution[0])[0]
        #apply the action to the environment
        observation, reward, done, info = env.step(action)

        this_trajectory_reward.append(reward)
        log_softmax_grad = utils.compute_log_softmax_grad(
            theta, current_feature, action)
        this_trajectory_grads.append(log_softmax_grad)

        current_feature = utils.extract_features(observation, num_actions)

        if done:
            break

    return this_trajectory_reward, this_trajectory_grads
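
The per-step rewards and log-policy gradients returned above are typically combined into a single policy-gradient estimate. The helper below is a minimal sketch of one REINFORCE-style combination; it is not part of the original utils module, and the helper name and the discount factor gamma are assumptions.

import numpy as np

def reinforce_gradient_estimate(rewards, grads, gamma=0.99):
    """Combine per-step rewards and grad-log-pi terms into one gradient
    estimate (illustrative sketch, not part of the original code)."""
    horizon = len(rewards)
    #discounted reward-to-go for each time step
    returns = np.zeros(horizon)
    running = 0.0
    for t in reversed(range(horizon)):
        running = rewards[t] + gamma * running
        returns[t] = running
    #weight each grad log pi(a_t | s_t) by the return that follows it
    return sum(g * r for g, r in zip(grads, returns))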
Example #2
def sample_one_trajectory(theta, env):
    this_trajectory_reward = []
    this_trajectory_grads = []

    #first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)

    for time_index in range(0, 200):

        #compute an action given current observation
        action = utils.compute_action_distribution(theta,
                                                   current_feature,
                                                   mode='train')

        #apply the action to the environment
        observation, reward, done, info = env.step(action[0])

        #record reward and grad
        this_trajectory_reward.append(reward)
        computed_grad_log_state_action = utils.compute_log_grad(
            theta, current_feature, action)
        this_trajectory_grads.append(computed_grad_log_state_action)

        current_feature = utils.extract_features(observation, C.output_dim)

        if done:
            break

    return this_trajectory_reward, this_trajectory_grads
Example #3
def sample_one_trajectory(theta, env, cov_matrix):
    this_trajectory_rewards = []
    this_trajectory_states = []
    this_trajectory_actions = []
    this_trajectory_log_prob_action_states = []

    #first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)

    for time_index in range(0, 200):

        #compute an action given current observation
        action, log_prob = utils.compute_action_distribution(theta,
                                                             current_feature,
                                                             cov_matrix,
                                                             mode='train')

        #apply the action to the environment
        observation, reward, done, info = env.step(action)

        #record reward and grad
        this_trajectory_rewards.append(reward)
        this_trajectory_states.append(current_feature)
        this_trajectory_actions.append(action[0])
        this_trajectory_log_prob_action_states.append(log_prob)

        #next time step
        current_feature = utils.extract_features(observation, C.output_dim)

        if done:
            break

    sample_dict = {}
    sample_dict['rewards'] = this_trajectory_rewards
    sample_dict['states'] = this_trajectory_states
    sample_dict['actions'] = this_trajectory_actions
    sample_dict['log_prob'] = this_trajectory_log_prob_action_states

    total_reward = np.sum(this_trajectory_rewards)

    return sample_dict, total_reward
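
Before a policy update, the sample_dict trajectories returned above are usually stacked into flat arrays together with their discounted returns. The function below is only a sketch of that aggregation step; the helper name, the NumPy stacking, and the discount factor gamma are assumptions rather than part of the original code.

import numpy as np

def batch_from_samples(sample_dicts, gamma=0.99):
    """Stack several sample_dict trajectories into flat arrays (sketch)."""
    states, actions, log_probs, returns = [], [], [], []
    for d in sample_dicts:
        #discounted reward-to-go within this trajectory
        running = 0.0
        traj_returns = []
        for r in reversed(d['rewards']):
            running = r + gamma * running
            traj_returns.append(running)
        traj_returns.reverse()
        states.extend(d['states'])
        actions.extend(d['actions'])
        log_probs.extend(d['log_prob'])
        returns.extend(traj_returns)
    return (np.asarray(states), np.asarray(actions),
            np.asarray(log_probs), np.asarray(returns))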
Example #4
def sample_one_trajectory(q, theta, env):

    this_trajectory_reward = []
    this_trajectory_grads = []

    #first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)

    while True:

        #compute an action given current observation
        action = utils.compute_action_distribution(theta,
                                                   current_feature,
                                                   mode='train')

        #apply the action to the environment
        observation, reward, done, info = env.step(action[0])

        #record reward and grad
        computed_grad_log_state_action = utils.compute_log_grad(
            theta, current_feature, action)
        this_trajectory_grads.append(computed_grad_log_state_action)

        if done:
            break

        current_feature = utils.extract_features(observation, C.output_dim)

    #use the last reward received for every recorded time step
    for _ in range(0, len(this_trajectory_grads)):
        this_trajectory_reward.append(reward)

    q.put([this_trajectory_reward, this_trajectory_grads])
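
Because this variant writes its result into a queue instead of returning it, it is meant to run as a worker process. The snippet below is a minimal sketch of the calling side for the sample_one_trajectory defined above, assuming Python's multiprocessing module and that the environment objects can be passed to child processes; none of this scaffolding appears in the original code.

import multiprocessing as mp

def sample_trajectories_parallel(theta, envs):
    """Run one sample_one_trajectory worker per environment (sketch)."""
    q = mp.Queue()
    workers = [
        mp.Process(target=sample_one_trajectory, args=(q, theta, env))
        for env in envs
    ]
    for w in workers:
        w.start()
    #drain the queue before joining to avoid blocking on full pipes
    results = [q.get() for _ in range(len(workers))]
    for w in workers:
        w.join()
    return results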
Example #5

if __name__ == '__main__':
    np.random.seed(1234)
    theta, episode_rewards = train(N=100, T=20, delta=1e-2)

    env = gym.make('CartPole-v0')
    env.seed(12345)
    observation = env.reset()
    num_actions = 2
    current_feature = utils.extract_features(observation, num_actions)
    for t in range(200):
        env.render()

        #compute an action given current observation
        action_distribution = utils.compute_action_distribution(
            theta, current_feature)
        #sample an action from the two-action Bernoulli distribution
        action = np.random.binomial(1, action_distribution[0][1], 1)[0]
        #apply the action to the environment
        observation, reward, done, info = env.step(action)
        #compute the next feature vector
        current_feature = utils.extract_features(observation, num_actions)

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break

    plt.plot(episode_rewards)
    plt.title("avg rewards per timestep")
    plt.xlabel("timestep")
    plt.ylabel("avg rewards")
    plt.show()
Example #6
import pickle

import numpy as np
import utils


with open('test_info.pkl', 'rb') as f:
    tests_info = pickle.load(f)
test_cases = sorted(tests_info.keys())


""" ------------- testing action distribution computation ----------------"""
print('-'*10 + ' testing compute_action_distribution ' + '-'*10)
for i in test_cases:
    theta = tests_info[i]['theta']
    phis = tests_info[i]['phis']
    soln_action_dist = tests_info[i]['action_dst']
    action_dist = utils.compute_action_distribution(theta, phis)

    err = np.linalg.norm(soln_action_dist - action_dist)
    print('test {} for compute_action_distribution - error = {}'.format(i, err))

""" ------------- testing compute_log_softmax_grad ----------------"""
print('-' * 10 + ' testing compute_log_softmax_grad ' + '-' * 10)
for i in test_cases:
    theta = tests_info[i]['theta']
    phis = tests_info[i]['phis']
    action = tests_info[i]['action']
    soln_grad = tests_info[i]['grad']
    grad = utils.compute_log_softmax_grad(theta, phis, action)
    err = np.linalg.norm(soln_grad - grad)
    print('test {} for compute_log_softmax_grad - error = {}'.format(i, err))
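
The tests above exercise utils.compute_action_distribution and utils.compute_log_softmax_grad, which, judging by their names, compute a softmax action distribution over per-action features and its log-gradient. The reference sketch below shows one standard way such functions are written; the shapes assumed here (theta of shape (d, 1), phis of shape (d, num_actions)) are guesses and may not match the original utils module.

import numpy as np

def softmax_action_distribution(theta, phis):
    """Softmax policy over per-action features (illustrative sketch)."""
    logits = theta.T @ phis                 #shape (1, num_actions)
    logits = logits - np.max(logits)        #stabilize the exponentials
    exp_logits = np.exp(logits)
    return exp_logits / np.sum(exp_logits)  #shape (1, num_actions)

def softmax_log_grad(theta, phis, action):
    """Gradient of log pi(action | s) under the softmax policy above."""
    probs = softmax_action_distribution(theta, phis)  #(1, num_actions)
    expected_phi = phis @ probs.T                     #(d, 1)
    return phis[:, [action]] - expected_phi           #(d, 1)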
Example #7
    param N: number of trajectories to sample in each training iteration
    param T: number of iterations to train the model
    param delta: trust region size (we are using TRPO)
    param env: the environment for the policy to learn
    '''

    #test the training result
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)
    for t in range(200):

        env.render()

        #compute an action given current observation
        action = utils.compute_action_distribution(theta,
                                                   current_feature,
                                                   mode='test')

        #apply the action to the environment
        observation, reward, done, info = env.step(action)

        #compute the next feature vector
        current_feature = utils.extract_features(observation, C.output_dim)

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break

    #plot the training history
    plt.plot(episode_rewards)
    plt.title("avg rewards per timestep")