Example 1
def to_transition(trajectory, next_trajectory=None):
    """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  ```

  Args:
    trajectory: An instance of `Trajectory`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced.
  """
    if next_trajectory is None:
        next_trajectory = nest.map_structure(lambda x: x[:, 1:], trajectory)
        trajectory = nest.map_structure(lambda x: x[:, :-1], trajectory)
    policy_steps = policy_step.PolicyStep(trajectory.action, (),
                                          trajectory.policy_info)
    # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        trajectory.step_type,
        reward=nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
        discount=tf.zeros_like(trajectory.discount),  # unknown
        observation=trajectory.observation)
    next_time_steps = ts.TimeStep(trajectory.next_step_type, trajectory.reward,
                                  trajectory.discount,
                                  next_trajectory.observation)
    return [time_steps, policy_steps, next_time_steps]
Example 2
 def _get_mock_env_episode(self):
   mock_env = mock.MagicMock()
   mock_env.step.side_effect = [
       ts.TimeStep(ts.StepType.FIRST, 2, 1, [0]),
       ts.TimeStep(ts.StepType.MID, 3, 1, [1]),
       ts.TimeStep(ts.StepType.MID, 5, 1, [2]),
       ts.TimeStep(ts.StepType.LAST, 7, 1, [3]),
   ]
   return mock_env
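
A sketch of a hypothetical test that drives the mocked episode above to completion; the method name and the dummy action value are illustrative assumptions, not part of the original test:

def test_episode_ends_on_last(self):
  env = self._get_mock_env_episode()
  time_step = env.step(0)        # FIRST, reward 2 (the mock ignores the action)
  while not time_step.is_last():
    time_step = env.step(0)      # MID (reward 3), MID (reward 5), then LAST (reward 7)
  # env.step has now consumed all four queued side effects.
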
Example 3
def to_transition(trajectory, next_trajectory=None):
    """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:, :-1]
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  next_time_steps.step_type = trajectory.next_step_type[:, :-1]
  next_time_steps.reward = trajectory.reward[:, :-1]
  next_time_steps.discount = trajectory.discount[:, :-1]

  ```
  Notice that reward and discount for time_steps are undefined; they are
  therefore filled with zeros.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]` when `next_trajectory` is None.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced (please do not use them).
  """
    _validate_rank(trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is not None:
        _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is None:
        next_trajectory = tf.nest.map_structure(lambda x: x[:, 1:], trajectory)
        trajectory = tf.nest.map_structure(lambda x: x[:, :-1], trajectory)
    policy_steps = policy_step.PolicyStep(action=trajectory.action,
                                          state=(),
                                          info=trajectory.policy_info)
    # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        trajectory.step_type,
        reward=tf.nest.map_structure(tf.zeros_like,
                                     trajectory.reward),  # unknown
        discount=tf.zeros_like(trajectory.discount),  # unknown
        observation=trajectory.observation)
    next_time_steps = ts.TimeStep(step_type=trajectory.next_step_type,
                                  reward=trajectory.reward,
                                  discount=trajectory.discount,
                                  observation=next_trajectory.observation)
    return [time_steps, policy_steps, next_time_steps]
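
A minimal usage sketch for `to_transition` with a `[B, T, ...]`-shaped trajectory; the import paths (assuming the function lives in `tf_agents.trajectories.trajectory`) and the toy shapes are assumptions, not part of the original source:

import tensorflow as tf
from tf_agents.trajectories import trajectory

# Toy batch of B=2 trajectories, each with T=3 steps: FIRST, MID, LAST.
step_type = tf.constant([[0, 1, 2]] * 2, dtype=tf.int32)
traj = trajectory.Trajectory(
    step_type=step_type,
    observation=tf.ones([2, 3, 4], dtype=tf.float32),
    action=tf.zeros([2, 3, 1], dtype=tf.float32),
    policy_info=(),
    next_step_type=step_type,
    reward=tf.zeros([2, 3], dtype=tf.float32),
    discount=tf.ones([2, 3], dtype=tf.float32))

time_steps, policy_steps, next_time_steps = trajectory.to_transition(traj)
# time_steps.observation and next_time_steps.observation both have shape
# [2, 2, 4]: the [:, :-1] and [:, 1:] slices of traj.observation.
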
Example 4
 def setUp(self):
     super(PolicySaverTest, self).setUp()
     self._time_step_spec = ts.TimeStep(
         step_type=tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 name='st',
                                                 minimum=0,
                                                 maximum=2),
         reward=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                              shape=(),
                                              name='reward',
                                              minimum=0.0,
                                              maximum=5.0),
         discount=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                                shape=(),
                                                name='discount',
                                                minimum=0.0,
                                                maximum=1.0),
         observation=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                                   shape=(4, ),
                                                   name='obs',
                                                   minimum=-10.0,
                                                   maximum=10.0))
     self._action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                       shape=(),
                                                       minimum=0,
                                                       maximum=10,
                                                       name='act_0')
     self._global_seed = 12345
     tf.compat.v1.set_random_seed(self._global_seed)
Example 5
    def current_time_step(self):
        def first():
            return (tf.constant(FIRST, dtype=tf.int32),
                    tf.constant(0.0, dtype=tf.float32),
                    tf.constant(1.0, dtype=tf.float32))

        def mid():
            return (tf.constant(MID, dtype=tf.int32),
                    tf.constant(0.0, dtype=tf.float32),
                    tf.constant(1.0, dtype=tf.float32))

        def last():
            return (tf.constant(LAST, dtype=tf.int32),
                    tf.constant(1.0, dtype=tf.float32),
                    tf.constant(0.0, dtype=tf.float32))

        state_value = tf.mod(self._state.value(), 3)
        step_type, reward, discount = tf.case(
            {
                tf.equal(state_value, FIRST): first,
                tf.equal(state_value, MID): mid,
                tf.equal(state_value, LAST): last
            },
            exclusive=True,
            strict=True)
        return ts.TimeStep(step_type, reward, discount, state_value)
Example 6
    def _train(self, experience, weights=None):
        # TODO(b/126593927): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')

        experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                           experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example 7
        def loop_body(time, time_step, policy_state, output_action_tas,
                      output_policy_info_tas):
            """Runs a step in environment.

      While loop will call multiple times.

      Args:
        time: Step time.
        time_step: Previous step's `TimeStep`.
        policy_state: Policy state tensor or nested structure of tensors.
        output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
        output_policy_info_tas: Updated nest of `tf.TensorArray`, the new
          policy info.

      Returns:
        loop_vars for next iteration of tf.while_loop.
      """
            policy_state, next_output_action_tas, next_output_policy_info_tas = (
                process_step(time, time_step, policy_state, output_action_tas,
                             output_policy_info_tas))

            ta_read = lambda ta: ta.read(time)
            ta_read_prev = lambda ta: ta.read(time - 1)
            time_step = ts.TimeStep(
                step_type=ta_read(trajectory_tas.step_type),
                observation=tf.nest.map_structure(ta_read,
                                                  trajectory_tas.observation),
                reward=tf.nest.map_structure(ta_read_prev,
                                             trajectory_tas.reward),
                discount=ta_read_prev(trajectory_tas.discount))

            return (time + 1, time_step, policy_state, next_output_action_tas,
                    next_output_policy_info_tas)
Example 8
    def _set_names_and_shapes(self, step_type, reward, discount,
                              *flat_observations):
        """Returns a `TimeStep` namedtuple."""
        step_type = tf.identity(step_type, name='step_type')
        reward = tf.identity(reward, name='reward')
        discount = tf.identity(discount, name='discount')
        batch_shape = () if not self.batched else (self.batch_size, )
        batch_shape = tf.TensorShape(batch_shape)
        if not tf.executing_eagerly():
            # Shapes are not required in eager mode.
            reward.set_shape(batch_shape)
            step_type.set_shape(batch_shape)
            discount.set_shape(batch_shape)
        # Give each tensor a meaningful name and set the static shape.
        named_observations = []
        for obs, spec in zip(flat_observations,
                             tf.nest.flatten(self.observation_spec())):
            named_observation = tf.identity(obs, name=spec.name)
            if not tf.executing_eagerly():
                named_observation.set_shape(batch_shape.concatenate(
                    spec.shape))
            named_observations.append(named_observation)

        observations = tf.nest.pack_sequence_as(self.observation_spec(),
                                                named_observations)

        return ts.TimeStep(step_type, reward, discount, observations)
Example 9
    def testTrain(self, num_epochs, use_td_lambda_return):
        if tf.executing_eagerly():
            self.skipTest('b/123777119')  # Secondary bug: ('b/123770140')

        with tf.compat.v2.summary.record_if(False):
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.compat.v1.train.AdamOptimizer(),
                actor_net=DummyActorNet(self._action_spec, ),
                value_net=DummyValueNet(outer_rank=2),
                normalize_observations=False,
                num_epochs=num_epochs,
                use_gae=use_td_lambda_return,
                use_td_lambda_return=use_td_lambda_return)
            observations = tf.constant([
                [[1, 2], [3, 4], [5, 6]],
                [[1, 2], [3, 4], [5, 6]],
            ],
                                       dtype=tf.float32)

            time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * 2,
                                                           dtype=tf.int32),
                                     reward=tf.constant([[1] * 3] * 2,
                                                        dtype=tf.float32),
                                     discount=tf.constant([[1] * 3] * 2,
                                                          dtype=tf.float32),
                                     observation=observations)
            actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                                  dtype=tf.float32)

            action_distribution_parameters = {
                'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
                'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
            }

            policy_info = action_distribution_parameters

            experience = trajectory.Trajectory(
                time_steps.step_type, observations, actions, policy_info,
                time_steps.step_type, time_steps.reward, time_steps.discount)

            # Mock the build_train_op to return an op for incrementing this counter.
            counter = tf.compat.v1.train.get_or_create_global_step()
            zero = tf.constant(0, dtype=tf.float32)
            agent.build_train_op = (
                lambda *_, **__: tf_agent.LossInfo(  # pylint: disable=g-long-lambda
                    counter.assign_add(1), ppo_agent.PPOLossInfo(*[zero] * 5)))

            train_op = agent.train(experience)

            self.evaluate(tf.compat.v1.global_variables_initializer())

            # Assert that counter starts out at zero.
            self.assertEqual(0, self.evaluate(counter))

            self.evaluate(train_op)

            # Assert that train_op ran increment_counter num_epochs times.
            self.assertEqual(num_epochs, self.evaluate(counter))
Example 10
    def _pack_and_filter_timestep_observation(self, timestep):
        """Pack and filter observations into a single dimension.

    Args:
      timestep: A `TimeStep` namedtuple containing:
        - step_type: A `StepType` value.
        - reward: Reward at this timestep.
        - discount: A discount in the range [0, 1].
        - observation: A NumPy array, or a nested dict, list or tuple of arrays
          corresponding to `observation_spec()`.

    Returns:
      A new `TimeStep` namedtuple whose observations have been filtered and
        packed into a single dimension.
    """
        # We can't set attributes on the TimeStep tuple, so we make a copy of
        # the observations.
        observations = timestep.observation
        if self._observations_whitelist is not None:
            observations = self._filter_observations(observations)

        return ts.TimeStep(
            timestep.step_type, timestep.reward, timestep.discount,
            self._flatten_nested_observations(observations,
                                              is_batched=self._env.batched))
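
Since `TimeStep` is a `collections.namedtuple`, the copy described in the comment above can also be written with the standard `_replace` idiom; a sketch under that assumption:

# Equivalent to rebuilding the TimeStep field by field:
new_timestep = timestep._replace(
    observation=self._flatten_nested_observations(
        observations, is_batched=self._env.batched))
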
Example 11
 def _time_step_batch(self):
   return ts.TimeStep(
       tf.constant(
           ts.StepType.FIRST, dtype=tf.int32, shape=[2], name='step_type'),
       tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       tf.constant([[1, 2], [3, 4]], dtype=tf.float32, name='observation'))
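
The same batched first step can also be produced with the `restart` factory from `tf_agents.trajectories.time_step`, which fills in `step_type=FIRST`, `reward=0.0`, and `discount=1.0`; a sketch (the explicit `name=` arguments from the original are omitted):

def _time_step_batch(self):
  return ts.restart(
      tf.constant([[1, 2], [3, 4]], dtype=tf.float32), batch_size=2)
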
Example 12
    def testTrain(self, num_epochs, use_td_lambda_return):
        with tf.compat.v2.summary.record_if(False):
            # Mock the build_train_op to return an op for incrementing this counter.
            counter = common.create_variable('test_train_counter')
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.compat.v1.train.AdamOptimizer(),
                actor_net=DummyActorNet(
                    self._obs_spec,
                    self._action_spec,
                ),
                value_net=DummyValueNet(self._obs_spec),
                normalize_observations=False,
                num_epochs=num_epochs,
                use_gae=use_td_lambda_return,
                use_td_lambda_return=use_td_lambda_return,
                train_step_counter=counter)
            observations = tf.constant([
                [[1, 2], [3, 4], [5, 6]],
                [[1, 2], [3, 4], [5, 6]],
            ],
                                       dtype=tf.float32)

            time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * 2,
                                                           dtype=tf.int32),
                                     reward=tf.constant([[1] * 3] * 2,
                                                        dtype=tf.float32),
                                     discount=tf.constant([[1] * 3] * 2,
                                                          dtype=tf.float32),
                                     observation=observations)
            actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                                  dtype=tf.float32)

            action_distribution_parameters = {
                'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
                'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
            }

            policy_info = action_distribution_parameters

            experience = trajectory.Trajectory(
                time_steps.step_type, observations, actions, policy_info,
                time_steps.step_type, time_steps.reward, time_steps.discount)

            # Force variable creation.
            agent.policy.variables()

            if not tf.executing_eagerly():
                loss = agent.train(experience)
            else:
                loss = lambda: agent.train(experience)

            # Assert that counter starts out at zero.
            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertEqual(0, self.evaluate(counter))
            self.evaluate(loss)
            # Assert that train_op ran increment_counter num_epochs times.
            self.assertEqual(num_epochs, self.evaluate(counter))
Example 13
 def _apply_actor_network(self, time_step, policy_state):
   if self._observation_normalizer:
     observation = self._observation_normalizer.normalize(
         time_step.observation)
     time_step = ts.TimeStep(time_step.step_type, time_step.reward,
                             time_step.discount, observation)
   return self._actor_network(
       time_step.observation, time_step.step_type, network_state=policy_state)
Example 14
    def _train(self, experience, weights=None, train_step_counter=None):
        # TODO(sfishman): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')
        experience = nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.contrib.summary.histogram('rewards', experience.reward)
            tf.contrib.summary.histogram('discounts', experience.discount)
            tf.contrib.summary.histogram('returns', returns)

        # TODO(kbanoop): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.contrib.summary.histogram('normalized_returns', returns)

        # TODO(kbanoop): remove after changing network interface to accept
        # observations and step_types, instead of time_steps.
        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)
        # TODO(kbanoop): Filter boundary steps.

        loss_info = self._loss(time_step,
                               experience.action,
                               tf.stop_gradient(returns),
                               weights=weights)

        clip_gradients = (tf.contrib.training.clip_gradient_norms_fn(
            self._gradient_clipping) if self._gradient_clipping else None)

        # TODO(sguada): create_train_step should not return a Future.
        loss_info = eager_utils.create_train_step(
            loss_info,
            self._optimizer,
            total_loss_fn=lambda loss_info: loss_info.loss,
            global_step=train_step_counter,
            transform_grads_fn=clip_gradients,
            summarize_gradients=self._summarize_grads_and_vars,
            variables_to_train=lambda: self._actor_network.trainable_weights,
        )

        if isinstance(loss_info, eager_utils.Future):
            loss_info = loss_info()

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                for var in self._actor_network.trainable_weights:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
Example 15
    def _step(self, action):
        total_reward = 0

        for _ in range(self._times):
            time_step = self._env.step(action)
            total_reward += time_step.reward
            if time_step.is_last():
                break

        return ts.TimeStep(time_step.step_type, total_reward,
                           time_step.discount, time_step.observation)
Example 16
 def testLastNumpy(self):
     observation = -1
     reward = 2.0
     discount = 1.0
     time_step = ts.TimeStep(np.asarray(ts.StepType.LAST),
                             np.asarray(reward), np.asarray(discount),
                             np.asarray(observation))
     self.assertTrue(time_step.is_last())
     self.assertEqual(ts.StepType.LAST, time_step.step_type)
     self.assertEqual(-1, time_step.observation)
     self.assertEqual(2.0, time_step.reward)
     self.assertEqual(1.0, time_step.discount)
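
For comparison, `time_step.py` also exposes factory helpers that build these namedtuples directly; a brief sketch of what they return (note that `termination` fixes the discount to 0.0, unlike the hand-built LAST step above, which uses 1.0):

first = ts.restart(np.asarray(-1))                 # StepType.FIRST, reward 0.0, discount 1.0
mid = ts.transition(np.asarray(-1), reward=2.0)    # StepType.MID, discount defaults to 1.0
last = ts.termination(np.asarray(-1), reward=2.0)  # StepType.LAST, discount 0.0
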
Example 17
  def _train(self, experience, weights=None):
    returns = value_ops.discounted_return(
        experience.reward, experience.discount, time_major=False)

    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='rewards', data=experience.reward, step=self.train_step_counter)
      tf.compat.v2.summary.histogram(
          name='discounts',
          data=experience.discount,
          step=self.train_step_counter)
      tf.compat.v2.summary.histogram(
          name='returns', data=returns, step=self.train_step_counter)

    # TODO(b/126592060): replace with tensor normalizer.
    if self._normalize_returns:
      returns = _standard_normalize(returns, axes=(0, 1))
      if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='normalized_returns',
            data=returns,
            step=self.train_step_counter)

    time_step = ts.TimeStep(experience.step_type,
                            tf.zeros_like(experience.reward),
                            tf.zeros_like(experience.discount),
                            experience.observation)

    variables_to_train = self._actor_network.variables
    with tf.GradientTape() as tape:
      loss_info = self._loss(time_step,
                             experience.action,
                             tf.stop_gradient(returns),
                             weights=weights)
      tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    grads = tape.gradient(loss_info.loss, variables_to_train)

    grads_and_vars = zip(grads, variables_to_train)
    if self._gradient_clipping:
      grads_and_vars = eager_utils.clip_gradient_norms(
          grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(
          grads_and_vars, self.train_step_counter)
      eager_utils.add_gradients_summaries(
          grads_and_vars, self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    return tf.nest.map_structure(tf.identity, loss_info)
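
To make the return computation above concrete, here is a small worked sketch of `discounted_return` with `time_major=False`; the import path (`tf_agents.utils.value_ops`) and the toy numbers are assumptions. The helper accumulates `returns[t] = reward[t] + discount[t] * returns[t + 1]` backwards from the end of the episode:

import tensorflow as tf
from tf_agents.utils import value_ops

rewards = tf.constant([[1.0, 1.0, 1.0]])     # shape [batch=1, time=3]
discounts = tf.constant([[0.9, 0.9, 0.9]])
returns = value_ops.discounted_return(rewards, discounts, time_major=False)
# returns is approximately [[2.71, 1.9, 1.0]]: 1 + 0.9 * (1 + 0.9 * 1) = 2.71.
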
Example 18
  def testTrain(self, num_epochs):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._action_spec,),
        value_net=DummyValueNet(outer_rank=2),
        normalize_observations=False,
        num_epochs=num_epochs,
    )
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ],
                               dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)
    action_distribution_parameters = {
        'loc': tf.constant([[0.0, 0.0], [0.0, 0.0]], dtype=tf.float32),
        'scale': tf.constant([[1.0, 1.0], [1.0, 1.0]], dtype=tf.float32),
    }
    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(
        time_steps.step_type, observations, actions, policy_info,
        time_steps.step_type, time_steps.reward, time_steps.discount)

    # Mock the build_train_op to return an op for incrementing this counter.
    counter = tf.train.get_or_create_global_step()
    zero = tf.constant(0, dtype=tf.float32)
    agent.build_train_op = (
        lambda *_, **__: (counter.assign_add(1), [zero] * 5))

    train_op = agent.train(experience)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())

      # Assert that counter starts out at zero.
      counter_ = sess.run(counter)
      self.assertEqual(0, counter_)

      sess.run(train_op)

      # Assert that train_op ran increment_counter num_epochs times.
      counter_ = sess.run(counter)
      self.assertEqual(num_epochs, counter_)
Example 19
def convert_time_step(time_step):
    """Convert to agents time_step type as the __hash__ method is different."""
    reward = time_step.reward
    if reward is None:
        reward = 0.0
    discount = time_step.discount
    if discount is None:
        discount = 1.0
    return ts.TimeStep(
        ts.StepType(time_step.step_type),
        _as_float32_array(reward),
        _as_float32_array(discount),
        time_step.observation,
    )
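
A hypothetical usage sketch for `convert_time_step`: the first step of many environments carries `reward=None` and `discount=None`, which the converter above replaces with 0.0 and 1.0. The `EnvTimeStep` stand-in below is an illustrative assumption, not the real source type:

import collections
import numpy as np

EnvTimeStep = collections.namedtuple(
    'EnvTimeStep', ['step_type', 'reward', 'discount', 'observation'])

first = EnvTimeStep(step_type=0,  # 0 corresponds to StepType.FIRST
                    reward=None,
                    discount=None,
                    observation=np.zeros(4, dtype=np.float32))
converted = convert_time_step(first)
# converted.reward == 0.0 and converted.discount == 1.0, both as float32.
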
Example 20
  def test(self):
    first = ts.StepType.FIRST
    mid = ts.StepType.MID
    last = ts.StepType.LAST
    step_types = [first, mid, mid, last, mid, mid, mid, last]
    discounts = [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]
    time_steps = ts.TimeStep(
        step_type=step_types, discount=discounts, reward=discounts,
        observation=discounts)
    episode_mask = common.get_episode_mask(time_steps)

    expected_mask = [1, 1, 1, 0, 1, 1, 1, 0]
    self.evaluate(tf.global_variables_initializer())
    self.assertAllEqual(expected_mask, self.evaluate(episode_mask))
Example 21
    def testTrainWithRnn(self):
        with tf.compat.v2.summary.record_if(False):
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                self._obs_spec,
                self._action_spec,
                input_fc_layer_params=None,
                output_fc_layer_params=None,
                conv_layer_params=None,
                lstm_size=(40, ))

            counter = common.create_variable('test_train_counter')
            agent = reinforce_agent.ReinforceAgent(
                self._time_step_spec,
                self._action_spec,
                actor_network=actor_net,
                optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
                train_step_counter=counter)

            batch_size = 5
            observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                       dtype=tf.float32)
            time_steps = ts.TimeStep(
                step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
                reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
                discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
                observation=observations)
            actions = tf.constant([[[0], [1], [1]]] * batch_size,
                                  dtype=tf.float32)

            experience = trajectory.Trajectory(time_steps.step_type,
                                               observations, actions, (),
                                               time_steps.step_type,
                                               time_steps.reward,
                                               time_steps.discount)

            # Force variable creation.
            agent.policy.variables()

            if tf.executing_eagerly():
                loss = lambda: agent.train(experience)
            else:
                loss = agent.train(experience)

            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertEqual(self.evaluate(counter), 0)
            self.evaluate(loss)
            self.assertEqual(self.evaluate(counter), 1)
Example 22
    def testMakeTimestepMaskWithPartialEpisode(self):
        first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

        next_step_types = tf.constant(
            [[mid, mid, last, first, mid, mid, last, first, mid, mid],
             [mid, mid, last, first, mid, mid, mid, mid, mid, last]])
        zeros = tf.zeros_like(next_step_types)
        next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

        # Mask should be 0.0 for transition timesteps (3, 7) and for all timesteps
        #   belonging to the final, incomplete episode.
        expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                         [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
        timestep_mask = ppo_utils.make_timestep_mask(next_time_step)

        timestep_mask_ = self.evaluate(timestep_mask)
        self.assertAllClose(expected_mask, timestep_mask_)
Example 23
    def run(self, trajectory, policy_state=None):
        """Apply the policy to trajectory steps and store actions/info.

    If `self.time_major == True`, the tensors in `trajectory` are assumed to
    have shape `[time, batch, ...]`.  Otherwise they are assumed to
    have shape `[batch, time, ...]`.

    Args:
      trajectory: The `Trajectory` to run against.
        If the replay class was created with `time_major=True`, then
        the tensors in trajectory must be shaped `[time, batch, ...]`.
        Otherwise they must be shaped `[batch, time, ...]`.
      policy_state: (optional) A nest Tensor with initial step policy state.

    Returns:
      output_actions: A nest of the actions that the policy took.
        If the replay class was created with `time_major=True`, then
        the tensors here will be shaped `[time, batch, ...]`.  Otherwise
        they'll be shaped `[batch, time, ...]`.
      output_policy_info: A nest of the policy info that the policy emitted.
        If the replay class was created with `time_major=True`, then
        the tensors here will be shaped `[time, batch, ...]`.  Otherwise
        they'll be shaped `[batch, time, ...]`.
      policy_state: A nest Tensor with final step policy state.

    Raises:
      TypeError: If `policy_state` structure doesn't match
        `self.policy.policy_state_spec`, or `trajectory` structure doesn't
        match `self.policy.trajectory_spec`.
      ValueError: If `policy_state` doesn't match
        `self.policy.policy_state_spec`, or `trajectory` structure doesn't
        match `self.policy.trajectory_spec`.
      ValueError: If `trajectory` lacks two outer dims.
    """
        trajectory_spec = self._policy.trajectory_spec()
        outer_dims = nest_utils.get_outer_shape(trajectory, trajectory_spec)

        if tf.compat.dimension_value(outer_dims.shape[0]) != 2:
            raise ValueError(
                "Expected two outer dimensions, but saw '{}' dimensions.\n"
                "Trajectory:\n{}.\nTrajectory spec from policy:\n{}.".format(
                    tf.compat.dimension_value(outer_dims.shape[0]), trajectory,
                    trajectory_spec))
        if self._time_major:
            sequence_length = outer_dims[0]
            batch_size = outer_dims[1]
            static_batch_size = tf.compat.dimension_value(
                trajectory.discount.shape[1])
        else:
            batch_size = outer_dims[0]
            sequence_length = outer_dims[1]
            static_batch_size = tf.compat.dimension_value(
                trajectory.discount.shape[0])

        if policy_state is None:
            policy_state = self._policy.get_initial_state(batch_size)
        else:
            tf.nest.assert_same_structure(policy_state,
                                          self._policy.policy_state_spec())

        if not self._time_major:
            # Make trajectory time-major.
            trajectory = tf.nest.map_structure(
                common_utils.transpose_batch_time, trajectory)

        trajectory_tas = tf.nest.map_structure(
            lambda t: tf.TensorArray(t.dtype, size=sequence_length).unstack(t),
            trajectory)

        def create_output_ta(spec):
            return tf.TensorArray(spec.dtype,
                                  size=sequence_length,
                                  element_shape=(tf.TensorShape([
                                      static_batch_size
                                  ]).concatenate(spec.shape)))

        output_action_tas = tf.nest.map_structure(create_output_ta,
                                                  trajectory_spec.action)
        output_policy_info_tas = tf.nest.map_structure(
            create_output_ta, trajectory_spec.policy_info)

        read0 = lambda ta: ta.read(0)
        zeros_like0 = lambda t: tf.zeros_like(t[0])
        ones_like0 = lambda t: tf.ones_like(t[0])
        time_step = ts.TimeStep(
            step_type=read0(trajectory_tas.step_type),
            reward=tf.nest.map_structure(zeros_like0, trajectory.reward),
            discount=ones_like0(trajectory.discount),
            observation=tf.nest.map_structure(read0,
                                              trajectory_tas.observation))

        def process_step(time, time_step, policy_state, output_action_tas,
                         output_policy_info_tas):
            """Take an action on the given step, and update output TensorArrays.

      Args:
        time: Step time.  Describes which row to read from the trajectory
          TensorArrays and which location to write into in the output
          TensorArrays.
        time_step: Previous step's `TimeStep`.
        policy_state: Policy state tensor or nested structure of tensors.
        output_action_tas: Nest of `tf.TensorArray` containing new actions.
        output_policy_info_tas: Nest of `tf.TensorArray` containing new
          policy info.

      Returns:
        policy_state: The next policy state.
        next_output_action_tas: Updated `output_action_tas`.
        next_output_policy_info_tas: Updated `output_policy_info_tas`.
      """
            action_step = self._policy.action(time_step, policy_state)
            policy_state = action_step.state
            write_ta = lambda ta, t: ta.write(time - 1, t)
            next_output_action_tas = tf.nest.map_structure(
                write_ta, output_action_tas, action_step.action)
            next_output_policy_info_tas = tf.nest.map_structure(
                write_ta, output_policy_info_tas, action_step.info)

            return (action_step.state, next_output_action_tas,
                    next_output_policy_info_tas)

        def loop_body(time, time_step, policy_state, output_action_tas,
                      output_policy_info_tas):
            """Runs a step in environment.

      While loop will call multiple times.

      Args:
        time: Step time.
        time_step: Previous step's `TimeStep`.
        policy_state: Policy state tensor or nested structure of tensors.
        output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
        output_policy_info_tas: Updated nest of `tf.TensorArray`, the new
          policy info.

      Returns:
        loop_vars for next iteration of tf.while_loop.
      """
            policy_state, next_output_action_tas, next_output_policy_info_tas = (
                process_step(time, time_step, policy_state, output_action_tas,
                             output_policy_info_tas))

            ta_read = lambda ta: ta.read(time)
            ta_read_prev = lambda ta: ta.read(time - 1)
            time_step = ts.TimeStep(
                step_type=ta_read(trajectory_tas.step_type),
                observation=tf.nest.map_structure(ta_read,
                                                  trajectory_tas.observation),
                reward=tf.nest.map_structure(ta_read_prev,
                                             trajectory_tas.reward),
                discount=ta_read_prev(trajectory_tas.discount))

            return (time + 1, time_step, policy_state, next_output_action_tas,
                    next_output_policy_info_tas)

        time = tf.constant(1)
        time, time_step, policy_state, output_action_tas, output_policy_info_tas = (
            tf.while_loop(cond=lambda time, *_: time < sequence_length,
                          body=loop_body,
                          loop_vars=[
                              time, time_step, policy_state, output_action_tas,
                              output_policy_info_tas
                          ],
                          back_prop=False,
                          name="trajectory_replay_loop"))

        # Run the last time step
        last_policy_state, output_action_tas, output_policy_info_tas = (
            process_step(time, time_step, policy_state, output_action_tas,
                         output_policy_info_tas))

        def stack_ta(ta):
            t = ta.stack()
            if not self._time_major:
                t = common_utils.transpose_batch_time(t)
            return t

        stacked_output_actions = tf.nest.map_structure(stack_ta,
                                                       output_action_tas)
        stacked_output_policy_info = tf.nest.map_structure(
            stack_ta, output_policy_info_tas)

        return (stacked_output_actions, stacked_output_policy_info,
                last_policy_state)
Example 24
 def _time_step(self):
     return ts.TimeStep(step_type=tf.constant([1], dtype=tf.int32),
                        reward=tf.constant([1], dtype=tf.float32),
                        discount=tf.constant([1], dtype=tf.float32),
                        observation=tf.constant([[1, 2]], dtype=tf.float32))
Example 25
    def testAgentDoesNotFailWhenNestedObservationActionAndDebugSummaries(self):
        summary_writer = tf.compat.v2.summary.create_file_writer(
            FLAGS.test_tmpdir, flush_millis=10000)
        summary_writer.set_as_default()

        nested_obs_spec = (self._obs_spec, self._obs_spec, {
            'a': self._obs_spec,
            'b': self._obs_spec,
        })
        nested_time_spec = ts.time_step_spec(nested_obs_spec)

        nested_act_spec = (self._action_spec, {
            'c': self._action_spec,
            'd': self._action_spec
        })

        class NestedActorNet(network.DistributionNetwork):
            def __init__(self, dummy_model):
                output_spec = (dummy_model.output_spec, {
                    'c': dummy_model.output_spec,
                    'd': dummy_model.output_spec,
                })
                super(NestedActorNet,
                      self).__init__(dummy_model.input_tensor_spec, (),
                                     output_spec=output_spec,
                                     name='NestedActorNet')
                self.dummy_model = dummy_model

            def call(self, *args, **kwargs):
                dummy_ans, _ = self.dummy_model(*args, **kwargs)
                return (dummy_ans, {'c': dummy_ans, 'd': dummy_ans}), ()

        dummy_model = DummyActorNet(nested_obs_spec, self._action_spec)
        agent = ppo_agent.PPOAgent(nested_time_spec,
                                   nested_act_spec,
                                   tf.compat.v1.train.AdamOptimizer(),
                                   actor_net=NestedActorNet(dummy_model),
                                   value_net=DummyValueNet(nested_obs_spec),
                                   debug_summaries=True)

        observations = tf.constant([
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
                                   dtype=tf.float32)

        observations = (observations, observations, {
            'a': observations,
            'b': observations,
        })

        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * 2,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * 2,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * 2,
                                                      dtype=tf.float32),
                                 observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        actions = (actions, {
            'c': actions,
            'd': actions,
        })

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        action_distribution_parameters = (action_distribution_parameters, {
            'c': action_distribution_parameters,
            'd': action_distribution_parameters,
        })

        policy_info = action_distribution_parameters

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        agent.train(experience)