Example 1
def updated_sample(sample: Any, reward_shift: float,
                   action_clipping: Optional[Tuple[float, float]],
                   use_trajectories: bool):
    """Create a sample with reward_shift and action_clipping."""
    def _clip_actions(actions):
        return tf.clip_by_value(actions,
                                clip_value_min=action_clipping[0],
                                clip_value_max=action_clipping[1])

    if use_trajectories:
        # Update trajectory.
        shifted_reward = sample.reward + reward_shift
        if action_clipping:
            return sample._replace(action=tf.nest.map_structure(
                _clip_actions, sample.action),
                                   reward=shifted_reward)
        else:
            return sample._replace(reward=shifted_reward)
    else:
        # Update transition.
        next_time_step = sample.next_time_step
        next_time_step = ts.TimeStep(step_type=next_time_step.step_type,
                                     reward=next_time_step.reward +
                                     reward_shift,
                                     discount=next_time_step.discount,
                                     observation=next_time_step.observation)
        action_step = sample.action_step
        if action_clipping:
            action_step = action_step._replace(action=tf.nest.map_structure(
                _clip_actions, action_step.action))
        return trajectory.Transition(time_step=sample.time_step,
                                     action_step=action_step,
                                     next_time_step=next_time_step)
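A minimal usage sketch for the helper above, assuming TF-Agents is installed and that `updated_sample` (plus the `tf`, `ts` and `trajectory` imports it relies on) is in scope; the dummy transition and the shift/clipping values are made up for illustration.

import tensorflow as tf
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory

# A dummy single transition (arbitrary values).
step = ts.TimeStep(step_type=tf.constant(1, dtype=tf.int32),
                   reward=tf.constant(0.5),
                   discount=tf.constant(1.0),
                   observation=tf.constant([0.1, 0.2]))
sample = trajectory.Transition(
    time_step=step,
    action_step=policy_step.PolicyStep(action=tf.constant([2.5]),
                                       state=(), info=()),
    next_time_step=step)

# Shift rewards by -0.5 and clip actions into [-1, 1].
updated = updated_sample(sample, reward_shift=-0.5,
                         action_clipping=(-1.0, 1.0),
                         use_trajectories=False)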
Example 2
    def test_collect_data_spec_transition(self):
        episode_dict = {
            'states':
            np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
                     dtype=np.float32),
            'actions':
            np.array([[1.], [2.], [3.], [4.]], dtype=np.float32),
            'rewards':
            np.array([[0.], [1.], [0.], [1.]], dtype=np.float32),
            'discounts':
            np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32),
            'episode_start_index':
            np.array([0, 2], dtype=np.int32)
        }

        time_step_spec = time_step.TimeStep(
            step_type=ArraySpec(shape=[], dtype=np.int32),
            reward=ArraySpec(shape=[1], dtype=np.float32),
            discount=ArraySpec(shape=[], dtype=np.float32),
            observation=ArraySpec(shape=[2], dtype=np.float32))
        action_spec = policy_step.PolicyStep(action=ArraySpec(
            shape=[1], dtype=np.float32),
                                             state=(),
                                             info=())
        expected_spec = trajectory.Transition(time_step=time_step_spec,
                                              action_step=action_spec,
                                              next_time_step=time_step_spec)
        actual_spec = create_collect_data_spec(episode_dict,
                                               use_trajectories=False)
        self.assertEqual(actual_spec, expected_spec)
Example 3
    def testAgentTransitionTrain(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec,
            self._action_spec,
            fc_layer_params=(10, ),
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        agent = sac_agent.SacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

        time_step_spec = self._time_step_spec._replace(
            reward=tensor_spec.BoundedTensorSpec(
                [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))

        transition_spec = trajectory.Transition(
            time_step=time_step_spec,
            action_step=policy_step.PolicyStep(action=self._action_spec,
                                               state=(),
                                               info=()),
            next_time_step=time_step_spec)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            transition_spec, outer_dims=(3, ))
        agent.train(sample_trajectory_experience)
Example 4
def _as_tfa_transition(value: typing.Tuple[typing.Any, typing.Any, typing.Any]):
  """Makes sure the transition and its values are TFA types."""
  time_step, action_step, next_time_step = value
  time_step = ts.TimeStep(*time_step)
  action_step = policy_step.PolicyStep(*action_step)
  next_time_step = ts.TimeStep(*next_time_step)
  return trajectory.Transition(time_step, action_step, next_time_step)
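A brief, hedged sketch of feeding this helper plain tuples, assuming the module-level imports it relies on (`ts`, `policy_step`, `trajectory`) are present; the numeric values are illustrative only.

import numpy as np
from tf_agents.trajectories import trajectory

raw_value = (
    (np.int32(1), np.float32(0.0), np.float32(1.0),          # time_step
     np.array([0.1, 0.2], dtype=np.float32)),
    (np.array([0.3], dtype=np.float32),),                     # action_step
    (np.int32(1), np.float32(1.0), np.float32(1.0),           # next_time_step
     np.array([0.5, 0.6], dtype=np.float32)))
converted = _as_tfa_transition(raw_value)
assert isinstance(converted, trajectory.Transition)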
Example 5
  def __call__(self, value: typing.Any) -> trajectory.Transition:
    """Convert `value` to an N-step Transition; validate data & prune.

    - If `value` is already a `Transition`, only validation is performed.
    - If `value` is a `Trajectory` with tensors containing a time dimension
      having `T != n + 1`, a `ValueError` is raised.

    Args:
      value: A `Trajectory` or `Transition` object to convert.

    Returns:
      A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
      the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
      the tensors will have shape `[B, T - 1, ...]`.

    Raises:
      TypeError: If `value` is not one of `Trajectory` or `Transition`.
      ValueError: If `value` has structure that doesn't match the converter's
        spec.
      TypeError: If `value` has a structure that doesn't match the converter's
        spec.
      ValueError: If `n != None` and `value` is a `Trajectory`
        with a time dimension having value other than `T=n + 1`.
    """
    if _is_transition_like(value):
      value = _as_tfa_transition(value)
    elif _is_trajectory_like(value):
      required_sequence_length = 1 if self._squeeze_time_dim else None
      _validate_trajectory(
          value,
          self._data_context.trajectory_spec,
          sequence_length=required_sequence_length)
      if self._squeeze_time_dim:
        value = tf.nest.map_structure(lambda e: tf.squeeze(e, axis=1), value)
      policy_steps = policy_step.PolicyStep(
          action=value.action, state=(), info=value.policy_info)
      # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
      time_steps = ts.TimeStep(
          value.step_type,
          reward=tf.nest.map_structure(tf.zeros_like, value.reward),  # unknown
          discount=tf.zeros_like(value.discount),  # unknown
          observation=value.observation)
      next_time_steps = ts.TimeStep(
          step_type=value.next_step_type,
          reward=value.reward,
          discount=value.discount,
          observation=tf.nest.map_structure(tf.zeros_like,
                                            value.observation))  # unknown
      value = trajectory.Transition(time_steps, policy_steps, next_time_steps)
    else:
      raise TypeError('Input type not supported: {}'.format(value))

    num_outer_dims = 1 if self._squeeze_time_dim else 2
    _validate_transition(
        value, self._data_context.transition_spec, num_outer_dims)

    value = nest_utils.prune_extra_keys(
        self._data_context.transition_spec, value)
    return value
Example 6
  def __init__(
      self,
      time_step_spec: ts.TimeStep,
      action_spec: types.NestedTensorSpec,
      info_spec: types.NestedTensorSpec
  ):
    """Creates a DataContext.

    Note: The context does not store a state spec, or other information about
    a Policy's internal state.  Policy state is not typically stored in a
    replay buffer or on disk, except when the policy explicitly chooses to
    store it by adding the state as a field inside its `info` output.  In
    those cases, the internal policy state spec is represented as part of the
    `info_spec`.

    Args:
      time_step_spec: A nest of `tf.TypeSpec` representing the time_steps.
      action_spec: A nest of `tf.TypeSpec` representing the actions.
      info_spec: A nest of `tf.TypeSpec` representing the policy's info.
        (Typically this is the info emitted by the collect policy).

    Raises:
      TypeError: If any of the specs are not nests containing tf.TypeSpec
        objects.
    """
    def _each_isinstance(spec, spec_types):
      """Checks if each element of `spec` is instance of `spec_types`."""
      return all([isinstance(s, spec_types) for s in tf.nest.flatten(spec)])

    for (spec, label) in ((time_step_spec, 'time_step_spec'),
                          (action_spec, 'action_spec'),
                          (info_spec, 'info_spec')):
      if not _each_isinstance(spec, tf.TypeSpec):
        raise TypeError(
            '{} has to contain TypeSpec (TensorSpec, '
            'SparseTensorSpec, etc) objects, but received: {}'
            .format(label, spec))

    self._time_step_spec = time_step_spec
    self._action_spec = action_spec
    self._info_spec = info_spec
    self._trajectory_spec = trajectory.Trajectory(
        step_type=time_step_spec.step_type,
        observation=time_step_spec.observation,
        action=action_spec,
        policy_info=info_spec,
        next_step_type=time_step_spec.step_type,
        reward=time_step_spec.reward,
        discount=time_step_spec.discount)
    self._transition_spec = trajectory.Transition(
        time_step=time_step_spec,
        action_step=policy_step.PolicyStep(action=action_spec,
                                           state=(),
                                           info=info_spec),
        next_time_step=time_step_spec)
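A short usage sketch, on the assumption that this constructor belongs to TF-Agents' `DataContext` class (tf_agents.trajectories.data_converter); the specs below are illustrative.

import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([2], tf.float32, name='observation')
action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32,
                                            minimum=-1.0, maximum=1.0,
                                            name='action')
context = DataContext(time_step_spec=ts.time_step_spec(observation_spec),
                      action_spec=action_spec,
                      info_spec=())
# Attributes assembled in __init__ above (the library presumably also
# exposes them via properties).
transition_spec = context._transition_spec
trajectory_spec = context._trajectory_spec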
Example 7
def create_transition(state: types.Array, action: types.Array,
                      next_state: types.Array, discount: types.Array,
                      reward: types.Array, step_type: types.Array,
                      next_step_type: types.Array) -> trajectory.Transition:
    """Creates a Transition from current and next state information."""
    tfagents_time_step = ts.TimeStep(
        step_type=step_type,
        reward=np.zeros_like(reward),  # unknown
        discount=np.zeros_like(discount),  # unknown
        observation=state)
    action_step = policy_step.PolicyStep(action=action, state=(), info=())
    tfagents_next_time_step = ts.TimeStep(step_type=next_step_type,
                                          reward=reward,
                                          discount=discount,
                                          observation=next_state)
    return trajectory.Transition(time_step=tfagents_time_step,
                                 action_step=action_step,
                                 next_time_step=tfagents_next_time_step)
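A minimal call sketch with arbitrary values, assuming the module imports the function relies on (`np`, `ts`, `policy_step`, `trajectory`, `types`) are in scope.

import numpy as np

single_transition = create_transition(
    state=np.array([0.1, 0.2], dtype=np.float32),
    action=np.array([0.3], dtype=np.float32),
    next_state=np.array([0.4, 0.5], dtype=np.float32),
    discount=np.array(1.0, dtype=np.float32),
    reward=np.array(0.7, dtype=np.float32),
    step_type=np.array(1, dtype=np.int32),
    next_step_type=np.array(1, dtype=np.int32))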
Example 8
  def testTrainWithRnnTransitions(self):
    actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        self._obs_spec,
        self._action_spec,
        input_fc_layer_params=None,
        output_fc_layer_params=None,
        conv_layer_params=None,
        lstm_size=(40,))

    counter = common.create_variable('test_train_counter')
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=actor_net,
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        train_step_counter=counter
    )

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1, 1, 1]] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=observations)
    actions = policy_step.PolicyStep(
        tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32))
    next_time_steps = ts.TimeStep(
        step_type=tf.constant([[1, 1, 2]] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=observations)

    experience = trajectory.Transition(time_steps, actions, next_time_steps)

    agent.initialize()
    agent.train(experience)
Example 9
def create_collect_data_spec(
    dataset_dict: EpisodeDictType,
    use_trajectories: bool = True
) -> Union[trajectory.Transition, trajectory.Trajectory]:
    """Create a spec that describes the data collected by agent.collect_policy."""
    reward = dataset_dict['rewards'][0]
    discount = dataset_dict['discounts'][0]
    observation = dataset_dict['states'][0]
    action = dataset_dict['actions'][0]
    step_type = np.asarray(0, dtype=np.int32)

    if use_trajectories:
        return trajectory.Trajectory(
            step_type=ArraySpec(shape=step_type.shape, dtype=step_type.dtype),
            observation=ArraySpec(shape=observation.shape,
                                  dtype=observation.dtype),
            action=ArraySpec(shape=action.shape, dtype=action.dtype),
            policy_info=(),
            next_step_type=ArraySpec(shape=step_type.shape,
                                     dtype=step_type.dtype),
            reward=ArraySpec(shape=reward.shape, dtype=reward.dtype),
            discount=ArraySpec(shape=discount.shape, dtype=discount.dtype))
    else:
        time_step_spec = time_step.TimeStep(
            step_type=ArraySpec(shape=step_type.shape, dtype=step_type.dtype),
            reward=ArraySpec(shape=reward.shape, dtype=reward.dtype),
            discount=ArraySpec(shape=discount.shape, dtype=discount.dtype),
            observation=ArraySpec(shape=observation.shape,
                                  dtype=observation.dtype))
        action_spec = policy_step.PolicyStep(action=ArraySpec(
            shape=action.shape, dtype=action.dtype),
                                             state=(),
                                             info=())
        return trajectory.Transition(time_step=time_step_spec,
                                     action_step=action_spec,
                                     next_time_step=time_step_spec)
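For reference, the spec builder above can be exercised with an episode dictionary shaped like the one in Example 2, assuming its module imports are in scope; the array contents here are illustrative placeholders.

import numpy as np

episode_dict = {
    'states': np.zeros((4, 2), dtype=np.float32),
    'actions': np.zeros((4, 1), dtype=np.float32),
    'rewards': np.zeros((4, 1), dtype=np.float32),
    'discounts': np.ones((4,), dtype=np.float32),
    'episode_start_index': np.array([0, 2], dtype=np.int32),
}
trajectory_spec = create_collect_data_spec(episode_dict, use_trajectories=True)
transition_spec = create_collect_data_spec(episode_dict, use_trajectories=False)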
Example 10
    def testAgentTransitionTrain(self):
        agent = td3_agent.Td3Agent(
            self._time_step_spec,
            self._action_spec,
            critic_network=self._critic_net,
            actor_network=self._bounded_actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        )

        time_step_spec = self._time_step_spec._replace(
            reward=tensor_spec.BoundedTensorSpec(
                [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))

        transition_spec = trajectory.Transition(
            time_step=time_step_spec,
            action_step=policy_step.PolicyStep(action=self._action_spec,
                                               state=(),
                                               info=()),
            next_time_step=time_step_spec)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            transition_spec, outer_dims=(3, ))
        agent.train(sample_trajectory_experience)
Example 11
    def push(self, *args):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = trajectory.Transition(*args)
        self.position = (self.position + 1) % self.capacity
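The `push` method above references `self.buffer`, `self.capacity` and `self.position`, which the excerpt does not show. Below is a minimal sketch of the kind of ring-buffer replay memory it presumably belongs to; the class name and the extra methods are assumptions, not part of the original.

import random

from tf_agents.trajectories import trajectory


class ReplayMemory:
    """Fixed-capacity ring buffer of trajectory.Transition tuples (sketch)."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, *args):
        # Store a Transition, overwriting the oldest entry once full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = trajectory.Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size: int):
        # Uniform random sample of stored transitions.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)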