def _generate_replay_buffer(self, rb_cls):
    stack_count = 4
    shape = (15, 15, stack_count)
    single_shape = (15, 15, 1)
    observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = policy_step.PolicyStep(array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
    self._trajectory_spec = trajectory.from_transition(
        time_step_spec, action_spec, time_step_spec)

    self._capacity = 32
    self._replay_buffer = rb_cls(
        data_spec=self._trajectory_spec, capacity=self._capacity)

    # Generate N frames: the value of pixels is the frame index.
    # The observations will be generated by stacking K frames out of those N,
    # generating some redundancies between the observations.
    single_frames = []
    frame_count = 100
    for k in range(frame_count):
      single_frames.append(np.full(single_shape, k, dtype=np.int32))

    # Add stack of frames to the replay buffer.
    time_steps = []
    for k in range(len(single_frames) - stack_count + 1):
      observation = np.concatenate(single_frames[k:k + stack_count], axis=-1)
      time_steps.append(ts.transition(observation, reward=0.0))

    self._transition_count = len(time_steps) - 1
    dummy_action = policy_step.PolicyStep(np.int32(0))
    for k in range(self._transition_count):
      self._replay_buffer.add_batch(nest_utils.batch_nested_array(
          trajectory.from_transition(
              time_steps[k], dummy_action, time_steps[k + 1])))
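The frame-stacking redundancy described in the comment above can be checked with a few lines of NumPy; the snippet below is an illustration only, not part of the original test.

```python
import numpy as np

# Adjacent stacked observations share stack_count - 1 frames: observation k
# holds frames [k, k + stack_count), observation k + 1 holds [k + 1, k + 1 + stack_count).
stack_count = 4
frames = [np.full((15, 15, 1), k, dtype=np.int32) for k in range(6)]
obs_0 = np.concatenate(frames[0:stack_count], axis=-1)      # frames 0..3
obs_1 = np.concatenate(frames[1:1 + stack_count], axis=-1)  # frames 1..4
assert np.array_equal(obs_0[..., 1:], obs_1[..., :-1])      # overlap of 3 frames
```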
Example #2
    def _action(self, time_step, policy_state):
        if not self._built:
            self._build_from_time_step(time_step)

        batch_size = None
        if time_step.step_type.shape:
            batch_size = time_step.step_type.shape[0]
        if self._batch_size != batch_size:
            raise ValueError(
                'The batch size of time_step is different from the batch size '
                'provided previously. Expected {}, but saw {}.'.format(
                    self._batch_size, batch_size))

        if not self._batched:
            # policy_state is returned in batched form by the policy and is
            # simply passed back, so we do not need to batch it here. Only
            # update time_step.
            time_step = nest_utils.batch_nested_array(time_step)

        tf.nest.assert_same_structure(self._time_step, time_step)
        feed_dict = {self._time_step: time_step}
        if policy_state is not None:
            # Flatten policy_state to handle specs that are not hashable due to lists.
            for state_ph, state in zip(tf.nest.flatten(self._policy_state),
                                       tf.nest.flatten(policy_state)):
                feed_dict[state_ph] = state

        action_step = self.session.run(self._action_step, feed_dict)
        action, state, info = action_step

        if not self._batched:
            action, info = nest_utils.unbatch_nested_array([action, info])

        return policy_step.PolicyStep(action, state, info)
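As a side note, the batch/unbatch helpers used above are assumed to simply add and remove a leading batch dimension of size 1; a minimal sketch of that round trip:

```python
import numpy as np
from tf_agents.utils import nest_utils

unbatched = {'obs': np.arange(6, dtype=np.int32).reshape(2, 3)}
batched = nest_utils.batch_nested_array(unbatched)    # 'obs' becomes shape (1, 2, 3)
restored = nest_utils.unbatch_nested_array(batched)   # back to shape (2, 3)
assert np.array_equal(restored['obs'], unbatched['obs'])
```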
Example #3
    def _action(self, time_step, policy_state, seed):
        del seed

        def _mode(dist, spec):
            action = dist.mode()
            return tf.reshape(action, [
                -1,
            ] + spec.shape.as_list())

        # TODO(oars): Remove batched data checks when tf_env is batched.
        time_step_batched = nest_utils.is_batched_nested_tensors(
            time_step, self._time_step_spec)
        if not time_step_batched:
            time_step = nest_utils.batch_nested_tensors(
                time_step, self._time_step_spec)

        distribution_step = self._wrapped_policy.distribution(
            time_step, policy_state)
        actions = nest.map_structure(_mode, distribution_step.action,
                                     self._action_spec)

        if not time_step_batched:
            actions = nest_utils.unbatch_nested_tensors(
                actions, self._action_spec)
        return policy_step.PolicyStep(actions, distribution_step.state,
                                      distribution_step.info)
Example #4
    def _distribution(self, time_step, policy_state):
        # In DQN, we always either take a uniformly random action, or the action
        # with the highest Q-value. However, to support more complicated policies,
        # we expose all Q-values as a categorical distribution with Q-values as
        # logits, and apply the GreedyPolicy wrapper in dqn_agent.py to select the
        # action with the highest Q-value.
        q_values, policy_state = self._q_network(time_step.observation,
                                                 time_step.step_type,
                                                 policy_state)
        q_values.shape.assert_has_rank(2)

        # TODO(b/122314058): Validate and enforce that sampling distributions
        # created with the q_network logits generate the right action shapes. This
        # is currently patching the problem.

        # If the action spec says each action should be shaped (1,), add another
        # dimension so the final shape is (B, 1, A), where A is the number of
        # actions. This will make Categorical emit events shaped (B, 1) rather than
        # (B,). Using axis -2 to allow for (B, T, 1, A) shaped q_values.
        if self._action_shape.ndims == 1:
            q_values = tf.expand_dims(q_values, -2)

        # TODO(kbanoop): Handle distributions over nests.
        distribution = tfp.distributions.Categorical(logits=q_values,
                                                     dtype=self._action_dtype)
        distribution = tf.nest.pack_sequence_as(self._action_spec,
                                                [distribution])
        return policy_step.PolicyStep(distribution, policy_state)
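The shape comment above (expanding to `(B, 1, A)` so that `Categorical` emits events shaped `(B, 1)`) can be illustrated in isolation; this is a standalone sketch, not part of the policy:

```python
import tensorflow as tf
import tensorflow_probability as tfp

q_values = tf.zeros([2, 3])              # (B, A) with B=2, A=3
q_values = tf.expand_dims(q_values, -2)  # (B, 1, A)
dist = tfp.distributions.Categorical(logits=q_values, dtype=tf.int32)
print(dist.sample().shape)               # (2, 1) rather than (2,)
```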
Example #5
 def _setup_specs(self):
     self._policy_step_spec = policy_step.PolicyStep(
         action=self._action_spec,
         state=self._policy_state_spec,
         info=self._info_spec)
     self._trajectory_spec = trajectory.from_transition(
         self._time_step_spec, self._policy_step_spec, self._time_step_spec)
Example #6
    def _action(self, time_step, policy_state, seed):
        seed_stream = tfd.SeedStream(seed=seed, salt='ou_noise')

        def _create_ou_process(action_spec):
            return common.OUProcess(
                lambda: tf.zeros(action_spec.shape, dtype=action_spec.dtype),
                self._ou_damping,
                self._ou_stddev,
                seed=seed_stream())

        if self._ou_process is None:
            self._ou_process = nest.map_structure(_create_ou_process,
                                                  self._action_spec)

        action_step = self._wrapped_policy.action(time_step, policy_state,
                                                  seed_stream())

        def _add_ou_noise(action, ou_process, action_spec):
            noisy_action = action + ou_process()
            if self._clip:
                return common.clip_to_spec(noisy_action, action_spec)
            return noisy_action

        actions = nest.map_structure(_add_ou_noise, action_step.action,
                                     self._ou_process, self._action_spec)
        return policy_step.PolicyStep(actions, action_step.state,
                                      action_step.info)
Example #7
    def _distribution(self, time_step, policy_state):
        q_values, policy_state = self._q_network(
            time_step.observation,
            time_step.step_type,
            policy_state,
        )
        q_values.shape.assert_has_rank(2)

        if self._action_shape.ndims == 1:
            q_values = tf.expand_dims(q_values, -2)

        observation = time_step.observation.numpy()[0]
        amount_now = observation[-3]  # can sell
        amount_available = observation[-2]  # can buy
        q_values_np = q_values.numpy()[0]
        lower_bound = int(500 - amount_now)
        upper_bound = int(amount_available + 1)
        q_values_np[:lower_bound] = -np.inf
        q_values_np[upper_bound:] = -np.inf

        new_q_values = ops.convert_to_tensor(
            np.array([q_values_np], dtype=np.float32))

        distribution = tfp.distributions.Categorical(logits=new_q_values,
                                                     dtype=self._action_dtype)
        distribution = tf.nest.pack_sequence_as(self._action_spec,
                                                [distribution])
        return policy_step.PolicyStep(distribution, policy_state)
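The masking trick above relies on `-inf` logits receiving zero probability; here is a small standalone check of that assumption:

```python
import numpy as np
import tensorflow as tf

logits = np.array([1.0, -np.inf, 2.0, -np.inf], dtype=np.float32)
probs = tf.nn.softmax(logits).numpy()
print(probs)                            # masked entries have probability 0.0
assert probs[1] == 0.0 and probs[3] == 0.0
```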
Example #8
def to_transition(trajectory, next_trajectory=None):
    """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  ```

  Args:
    trajectory: An instance of `Trajectory`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced.
  """
    if next_trajectory is None:
        next_trajectory = nest.map_structure(lambda x: x[:, 1:], trajectory)
        trajectory = nest.map_structure(lambda x: x[:, :-1], trajectory)
    policy_steps = policy_step.PolicyStep(trajectory.action, (),
                                          trajectory.policy_info)
    # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        trajectory.step_type,
        reward=nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
        discount=tf.zeros_like(trajectory.discount),  # unknown
        observation=trajectory.observation)
    next_time_steps = ts.TimeStep(trajectory.next_step_type, trajectory.reward,
                                  trajectory.discount,
                                  next_trajectory.observation)
    return [time_steps, policy_steps, next_time_steps]
Example #9
    def _fill_replay_buffer(self):
        # Generate N frames: the value of pixels is the frame index.
        # The observations will be generated by stacking K frames out of those N,
        # generating some redundancies between the observations.
        single_frames = []
        frame_count = 100
        for k in range(frame_count):
            single_frames.append(np.full(self._single_shape, k,
                                         dtype=np.int32))

        # Add stack of frames to the replay buffer.
        time_steps = []
        for k in range(len(single_frames) - self._stack_count + 1):
            observation = np.concatenate(single_frames[k:k +
                                                       self._stack_count],
                                         axis=-1)
            time_steps.append(ts.transition(observation, reward=0.0))

        self._transition_count = len(time_steps) - 1
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._transition_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
Example #10
 def testCreateWithDefaultInfo(self):
     action = 1
     state = 2
     info = ()
     step = policy_step.PolicyStep(action, state)
     self.assertEqual(step.action, action)
     self.assertEqual(step.state, state)
     self.assertEqual(step.info, info)
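For context, `PolicyStep` behaves like an ordinary namedtuple, so unspecified fields fall back to their defaults (an empty tuple) and `_replace` produces modified copies; a brief sketch (the info payload below is hypothetical):

```python
from tf_agents.trajectories import policy_step

step = policy_step.PolicyStep(action=1)
print(step.state, step.info)                      # () ()
updated = step._replace(info={'log_probability': 0.0})  # hypothetical info payload
print(updated.info)                               # {'log_probability': 0.0}
```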
Example #11
 def _distribution(self, time_step, policy_state):
     q_values = self._q_network(time_step).q_values
     # TODO(kbanoop): Handle distributions over nests.
     distribution_ = tfp.distributions.Categorical(logits=q_values,
                                                   dtype=self._action_dtype)
     distribution_ = nest.pack_sequence_as(self._action_spec,
                                           [distribution_])
     return policy_step.PolicyStep(distribution_, policy_state)
Example #12
 def testCreate(self):
     action = 1
     state = 2
     info = 3
     step = policy_step.PolicyStep(action=action, state=state, info=info)
     self.assertEqual(step.action, action)
     self.assertEqual(step.state, state)
     self.assertEqual(step.info, info)
Example #13
  def _distribution(self, time_step, policy_state):
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action = common.replicate(self._action_value, outer_shape)

    def dist_fn(action):
      """Return a categorical distribution with all density on fixed action."""
      return tfp.distributions.Deterministic(loc=action)
    return policy_step.PolicyStep(nest.map_structure(dist_fn, action),
                                  policy_state)
Example #14
    def _distribution(self, time_step, policy_state):
        def dist_fn(dist):
            greedy_action = dist.mode()
            return tfp.distributions.Deterministic(loc=greedy_action)

        distribution_step = self._wrapped_policy.distribution(
            time_step, policy_state)
        return policy_step.PolicyStep(
            tf.nest.map_structure(dist_fn, distribution_step.action),
            distribution_step.state, distribution_step.info)
Example #15
  def _action(self, time_step, policy_state, seed):
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
    # TODO(b/78181147): Investigate why this control dependency is required.
    if time_step is not None:
      with tf.control_dependencies(nest.flatten(time_step)):
        action_ = nest.map_structure(tf.identity, action_)
    return policy_step.PolicyStep(action_, policy_state)
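`tensor_spec.sample_spec_nest` as used above is assumed to draw a uniform random sample for each bounded spec with the requested `outer_dims`; a minimal sketch under that assumption:

```python
import tensorflow as tf
from tf_agents.specs import tensor_spec

action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
sample = tensor_spec.sample_spec_nest(action_spec, seed=1, outer_dims=(4,))
print(sample.shape)   # (4,)
print(sample.dtype)   # <dtype: 'int32'>
```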
Example #16
def make_replay_buffer(tf_env):
    """Default replay buffer factory."""

    time_step_spec = tf_env.time_step_spec()
    action_step_spec = policy_step.PolicyStep(
        tf_env.action_spec(), (), tensor_spec.TensorSpec((), tf.int32))
    trajectory_spec = trajectory.from_transition(time_step_spec,
                                                 action_step_spec,
                                                 time_step_spec)
    return tf_uniform_replay_buffer.TFUniformReplayBuffer(trajectory_spec,
                                                          batch_size=1)
Example #17
 def _action(self, time_step, policy_state, seed):
     q_values = self._q_network(time_step).q_values
     q_values.shape.assert_has_rank(2)
     # TODO(kbanoop): Add a test for temperature
     logits = q_values / self._temperature
     actions = tf.multinomial(logits, num_samples=1, seed=seed)
     actions = tf.reshape(actions, [
         -1,
     ] + self._action_shape.as_list())
     actions = tf.cast(actions, self._action_dtype, name='action')
     actions = nest.pack_sequence_as(self._action_spec, [actions])
     return policy_step.PolicyStep(actions, policy_state)
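The `temperature` divisor above controls how peaked the sampling distribution is; this toy snippet (not from the original code) shows higher temperatures flattening the softmax over Q-values:

```python
import tensorflow as tf

q_values = tf.constant([[1.0, 2.0, 3.0]])
for temperature in (0.1, 1.0, 10.0):
    probs = tf.nn.softmax(q_values / temperature)
    print(temperature, probs.numpy().round(3))
# Low temperature concentrates mass on the max-Q action; high temperature
# approaches a uniform distribution.
```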
Example #18
 def _make_ppo_trajectory_spec(self, action_distribution_params_spec):
     # Make policy_step_spec with action_spec, empty tuple for policy_state, and
     # (act_log_prob_spec, value_pred_spec, action_distribution_params_spec) for
     # info.
     policy_step_spec = policy_step.PolicyStep(
         action=self.action_spec(),
         state=self._policy.policy_state_spec(),
         info=action_distribution_params_spec)
     trajectory_spec = trajectory.from_transition(self.time_step_spec(),
                                                  policy_step_spec,
                                                  self.time_step_spec())
     return trajectory_spec
Example #19
    def _setup_mocks(self):
        self.trainer = train_eval_atari.TrainEval(self.get_temp_dir(),
                                                  'Pong-v0',
                                                  terminal_on_life_loss=True)

        self.trainer._env = mock.MagicMock()
        self.trainer._env.envs[0].game_over = False
        self.trainer._replay_buffer = mock.MagicMock()
        self.trainer._collect_policy = mock.MagicMock()
        action_step = policy_step.PolicyStep(action=1)
        self.trainer._collect_policy.action.return_value = action_step
        self.observer = mock.MagicMock()
        self.metric_observers = [self.observer]
Example #20
    def _action(self, time_step, policy_state):
        outer_dims = self._outer_dims
        if outer_dims is None:
            if self.time_step_spec.observation:
                outer_dims = nest_utils.get_outer_array_shape(
                    time_step.observation, self.time_step_spec.observation)
            else:
                outer_dims = ()

        random_action = array_spec.sample_spec_nest(self._action_spec,
                                                    self._rng,
                                                    outer_dims=outer_dims)
        return policy_step.PolicyStep(random_action, policy_state)
Example #21
    def _action(self, time_step, policy_state, seed):
        del seed

        # Reset the policy for batch indices that have restarted episode.
        policy_state = tf.where(time_step.is_first(),
                                self._initial_policy_state, policy_state)

        # Take actions 1 and 2 alternating.
        action = tf.floormod(policy_state, 2) + 1
        new_policy_state = policy_state + tf.constant(
            1, shape=self._batch_shape, dtype=tf.int32)
        policy_info = action * 2
        return policy_step.PolicyStep(action, new_policy_state, policy_info)
Example #22
def to_transition(trajectory, next_trajectory=None):
    """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:,:-1]
  time_steps.observation = trajectory.observation[:,:-1]
  next_time_steps.observation = trajectory.observation[:,1:]
  next_time_steps.step_type = trajectory.next_step_type[:,:-1]
  next_time_steps.reward = trajectory.reward[:,:-1]
  next_time_steps.discount = trajectory.discount[:,:-1]

  ```
  Notice that the reward and discount for `time_steps` are undefined, and are
  therefore filled with zeros.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]` when next_trajectory is None.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced (please do not use them).
  """
    _validate_rank(trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is not None:
        _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is None:
        next_trajectory = tf.nest.map_structure(lambda x: x[:, 1:], trajectory)
        trajectory = tf.nest.map_structure(lambda x: x[:, :-1], trajectory)
    policy_steps = policy_step.PolicyStep(action=trajectory.action,
                                          state=(),
                                          info=trajectory.policy_info)
    # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        trajectory.step_type,
        reward=tf.nest.map_structure(tf.zeros_like,
                                     trajectory.reward),  # unknown
        discount=tf.zeros_like(trajectory.discount),  # unknown
        observation=trajectory.observation)
    next_time_steps = ts.TimeStep(step_type=trajectory.next_step_type,
                                  reward=trajectory.reward,
                                  discount=trajectory.discount,
                                  observation=next_trajectory.observation)
    return [time_steps, policy_steps, next_time_steps]
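A minimal usage sketch of `to_transition` under the `[B, T, ...]` assumption from the docstring; the `Trajectory` field values below are dummies chosen for illustration, and the module-level `_validate_rank` helper referenced above is assumed to be available:

```python
import tensorflow as tf
from tf_agents.trajectories import trajectory

B, T = 2, 5
traj = trajectory.Trajectory(
    step_type=tf.zeros([B, T], tf.int32),
    observation=tf.zeros([B, T, 3]),
    action=tf.zeros([B, T], tf.int32),
    policy_info=(),
    next_step_type=tf.ones([B, T], tf.int32),
    reward=tf.zeros([B, T]),
    discount=tf.ones([B, T]))

time_steps, policy_steps, next_time_steps = to_transition(traj)
print(time_steps.observation.shape)       # (2, 4, 3): trajectory.observation[:, :-1]
print(next_time_steps.observation.shape)  # (2, 4, 3): trajectory.observation[:, 1:]
```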
Example #23
 def _action(self, time_step, policy_state=()):
     self._count += 1
     # _random_function()'s range should be [0, 1), so if epsilon is 1,
     # we should always use random policy, and if epsilon is 0, it
     # should always use greedy_policy since the if condition won't be
     # met.
     if self._random_function() < self._get_epsilon():
         # Avoid mixing policy_state from greedy_policy and random_policy,
         # always return policy_state from greedy_policy.
         action_step = self._random_policy.action(time_step)
         return policy_step.PolicyStep(action_step.action, policy_state)
     else:
         return self._greedy_policy.action(time_step,
                                           policy_state=policy_state)
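The comment above about the epsilon boundary cases can be sanity-checked with plain Python; with `epsilon = 0.25`, roughly a quarter of the draws land in the random branch (a toy simulation, unrelated to TF-Agents internals):

```python
import random

epsilon = 0.25
random.seed(0)
trials = 10_000
random_branch = sum(random.random() < epsilon for _ in range(trials))
print(random_branch / trials)  # approximately 0.25
```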
Example #24
    def _action(self, time_step, policy_state):
        # Reset the policy when starting a new episode.
        is_time_step_first = time_step.is_first()
        if np.isscalar(is_time_step_first):
            if is_time_step_first:
                policy_state = self._initial_policy_state
        else:
            policy_state[is_time_step_first] = self._initial_policy_state[
                is_time_step_first]

        # Take actions 1 and 2 alternating.
        action = (policy_state % 2) + 1
        policy_info = action * 2
        return policy_step.PolicyStep(action, policy_state + 1, policy_info)
Example #25
    def testAction(self):
        py_observation_spec = array_spec.BoundedArraySpec((3, ), np.int32, 1,
                                                          1)
        py_time_step_spec = ts.time_step_spec(py_observation_spec)
        py_action_spec = array_spec.BoundedArraySpec((7, ), np.int32, 1, 1)
        py_policy_state_spec = array_spec.BoundedArraySpec((5, ), np.int32, 0,
                                                           1)
        py_policy_info_spec = array_spec.BoundedArraySpec((3, ), np.int32, 0,
                                                          1)

        mock_py_policy = mock.create_autospec(py_policy.Base)
        mock_py_policy.time_step_spec = py_time_step_spec
        mock_py_policy.action_spec = py_action_spec
        mock_py_policy.policy_state_spec = py_policy_state_spec
        mock_py_policy.info_spec = py_policy_info_spec

        expected_py_policy_state = np.ones(py_policy_state_spec.shape,
                                           py_policy_state_spec.dtype)
        expected_py_time_step = tf.nest.map_structure(
            lambda arr_spec: np.ones(arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        expected_py_action = np.ones(py_action_spec.shape,
                                     py_action_spec.dtype)
        expected_new_py_policy_state = np.zeros(py_policy_state_spec.shape,
                                                py_policy_state_spec.dtype)
        expected_py_info = np.zeros(py_policy_info_spec.shape,
                                    py_policy_info_spec.dtype)

        mock_py_policy.action.return_value = policy_step.PolicyStep(
            expected_py_action, expected_new_py_policy_state, expected_py_info)

        tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
        time_step = tf.nest.map_structure(
            lambda arr_spec: tf.ones(arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        action_step = tf_mock_py_policy.action(
            time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
        py_action_step = self.evaluate(action_step)

        self.assertEqual(1, mock_py_policy.action.call_count)
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['time_step'],
            expected_py_time_step)
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['policy_state'],
            expected_py_policy_state)
        np.testing.assert_equal(py_action_step.action, expected_py_action)
        np.testing.assert_equal(py_action_step.state,
                                expected_new_py_policy_state)
        np.testing.assert_equal(py_action_step.info, expected_py_info)
Example #26
    def test_get_distribution_class_spec(self):
        ones = tf.ones(shape=[2], dtype=tf.float32)
        obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
        time_step_spec = ts.time_step_spec(obs_spec)
        mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
        mock_policy.distribution.return_value = policy_step.PolicyStep(
            (tfp.distributions.Categorical(logits=ones),
             tfp.distributions.Normal(ones, ones)), None)

        class_spec = ppo_utils.get_distribution_class_spec(
            mock_policy, time_step_spec)
        self.assertAllEqual(
            (tfp.distributions.Categorical, tfp.distributions.Normal),
            class_spec)
Example #27
    def _action(self, time_step, policy_state, seed):
        distribution_step = self.distribution(time_step, policy_state)

        def _sample(dist, action_spec):
            action = dist.sample(seed=seed)
            if self._clip:
                return common.clip_to_spec(action, action_spec)
            return action

        actions = nest.map_structure(_sample, distribution_step.action,
                                     self._action_spec)

        return policy_step.PolicyStep(actions, distribution_step.state,
                                      distribution_step.info)
Example #28
    def _distribution(self, time_step, policy_state):
        # Actor network outputs nested structure of distributions or actions.
        actions_or_distributions, policy_state = self._apply_actor_network(
            time_step, policy_state)

        def _to_distribution(action_or_distribution):
            if isinstance(action_or_distribution, tf.Tensor):
                # This is an action tensor, so wrap it in a deterministic distribution.
                return tfp.distributions.Deterministic(
                    loc=action_or_distribution)
            return action_or_distribution

        distributions = tf.nest.map_structure(_to_distribution,
                                              actions_or_distributions)
        return policy_step.PolicyStep(distributions, policy_state)
Example #29
    def _action(self, time_step, policy_state, seed):
        seed_stream = tfd.SeedStream(seed=seed, salt='ppo_policy')

        def _sample(dist, action_spec):
            action = dist.sample(seed=seed_stream())
            if self._clip:
                return common_utils.clip_to_spec(action, action_spec)
            return action

        distribution_step = self.distribution(time_step, policy_state)
        actions = nest.map_structure(_sample, distribution_step.action,
                                     self._action_spec)

        return policy_step.PolicyStep(actions, distribution_step.state,
                                      distribution_step.info)
Example #30
    def test_get_distribution_params_spec(self):
        ones = tf.ones(shape=[1, 2], dtype=tf.float32)
        obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
        time_step_spec = ts.time_step_spec(obs_spec)
        mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
        mock_policy._distribution.return_value = policy_step.PolicyStep(
            (tfp.distributions.Categorical(logits=ones),
             tfp.distributions.Normal(ones, ones)))

        params_spec = ppo_utils.get_distribution_params_spec(
            mock_policy, time_step_spec)
        self.assertAllEqual(
            [set(['logits']), set(['loc', 'scale'])],
            [set(d.keys()) for d in params_spec])
        self.assertAllEqual([[[2]], [[2], [2]]],
                            [[d[k].shape for k in d] for d in params_spec])