Example #1
 def test_unflatten(self):
     env = TheanoEnv(
         normalize(gym.make('Blackjack-v0'),
                   normalize_reward=True,
                   normalize_obs=True,
                   flatten_obs=False))
     for i in range(10):
         env.reset()
         for e in range(5):
             action = env.action_space.sample()
             next_obs, reward, done, info = env.step(action)
             # flatten() returns a 1-D array, so its shape is the one-element
             # tuple (flat_dim, ), not the integer flat_dim itself.
             assert (env.observation_space.flatten(next_obs).shape ==
                     (env.observation_space.flat_dim, ))  # yapf: disable
             if done:
                 break
     env.close()
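
The snippet above is a single test method extracted from its module, so the imports and the enclosing class are missing. A minimal scaffold it could run in is sketched below; the import paths and the TestNormalizedGym class name are assumptions based on the garage Theano API and may differ between releases.

    # Minimal scaffold for the test method above (and the similar one in
    # Example #2). Import paths and the class name are assumptions; adjust
    # them to the installed garage/rllab release.
    import unittest

    import gym

    from garage.envs import normalize          # assumed import path
    from garage.theano.envs import TheanoEnv   # assumed import path


    class TestNormalizedGym(unittest.TestCase):
        # paste test_unflatten / test_flatten here as methods
        pass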
Example #2
 def test_flatten(self):
     env = TheanoEnv(
         normalize(gym.make('Pendulum-v0'),
                   normalize_reward=True,
                   normalize_obs=True,
                   flatten_obs=True))
     for i in range(10):
         env.reset()
         for e in range(5):
             env.render()
             action = env.action_space.sample()
             next_obs, reward, done, info = env.step(action)
             assert next_obs.shape == env.observation_space.low.shape
             if done:
                 break
     env.close()
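
Side by side, the two tests show what the flatten_obs flag changes: with flatten_obs=True the wrapper returns a flat numpy array whose shape matches observation_space.low.shape, while with flatten_obs=False the raw (here: tuple) observation comes back and must be flattened explicitly. A short sketch of that difference, reusing the environment ids and wrappers from the tests and assuming the imports from the scaffold above:

    # flatten_obs=True: observations arrive as flat numpy arrays.
    env_flat = TheanoEnv(normalize(gym.make('Pendulum-v0'), flatten_obs=True))
    obs = env_flat.reset()
    assert obs.shape == env_flat.observation_space.low.shape
    env_flat.close()

    # flatten_obs=False: the raw observation is returned; flatten it by hand
    # when a flat vector is needed (e.g. to feed a policy network).
    env_raw = TheanoEnv(normalize(gym.make('Blackjack-v0'), flatten_obs=False))
    obs = env_raw.reset()
    flat_obs = env_raw.observation_space.flatten(obs)
    assert flat_obs.shape == (env_raw.observation_space.flat_dim, )
    env_raw.close()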
Example #3
        observations = []
        actions = []
        rewards = []

        observation = env.reset()

        for _ in range(T):
            # policy.get_action() returns a pair of values. The second
            # element is a dictionary whose values contain sufficient
            # statistics for the action distribution. It should at least
            # contain the entries that would be returned by calling
            # policy.dist_info(), which is the non-symbolic analog of
            # policy.dist_info_sym(). Storing these statistics is useful,
            # e.g., when forming importance sampling ratios. In our case
            # it is not needed.
            action, _ = policy.get_action(observation)
            # Recall that the last entry of the tuple stores diagnostic
            # information about the environment. In our case it is not needed.
            next_observation, reward, terminal, _ = env.step(action)
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            observation = next_observation
            if terminal:
                # Finish rollout if terminal state reached
                break

        # We need to compute the empirical return for each time step along the
        # trajectory
        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )
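
The closing comment notes that the empirical return still has to be computed for every time step of the trajectory. One straightforward way to do that, assuming a scalar discount factor (named discount here, e.g. 0.99) is defined in the surrounding script and numpy is imported as np:

    # Discounted empirical return, computed backwards along the trajectory:
    # G_t = r_t + discount * G_{t+1}, with the return after the last step
    # taken as 0. `discount` is an assumed name for the discount factor.
    returns = np.zeros(len(path["rewards"]))
    running_return = 0.0
    for t in reversed(range(len(path["rewards"]))):
        running_return = path["rewards"][t] + discount * running_return
        returns[t] = running_return
    path["returns"] = returns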