Example #1
def _get_initial_and_final_steps(batch_size, context_dim):
    observation = np.array(range(batch_size * context_dim)).reshape(
        [batch_size, context_dim])
    reward = np.random.uniform(0.0, 1.0, [batch_size])
    initial_step = time_step.TimeStep(
        tf.constant(time_step.StepType.FIRST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'),
        tf.constant(observation,
                    dtype=tf.float32,
                    shape=[batch_size, context_dim],
                    name='observation'))
    final_step = time_step.TimeStep(
        tf.constant(time_step.StepType.LAST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(reward,
                    dtype=tf.float32,
                    shape=[batch_size],
                    name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'),
        tf.constant(observation + 100.0,
                    dtype=tf.float32,
                    shape=[batch_size, context_dim],
                    name='observation'))
    return initial_step, final_step
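A quick sanity check of the helper above (illustrative only, not part of the original test file; assumes TF eager execution):

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import time_step

initial_step, final_step = _get_initial_and_final_steps(batch_size=2, context_dim=3)
print(initial_step.observation.shape)  # (2, 3)
print(final_step.step_type.numpy())    # [2 2], i.e. StepType.LAST for each batch entry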
Example #2
def _get_initial_and_final_steps_nested_rewards(observations, rewards):
  batch_size = tf.nest.flatten(observations)[0].shape[0]
  if isinstance(observations, np.ndarray):
    observations = tf.constant(
        observations, dtype=tf.float32, name='observation')
  zero_rewards = {
      'reward': tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      'constraint': tf.constant(0.0, dtype=tf.float32, shape=[batch_size])
  }
  initial_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST,
          dtype=tf.int32,
          shape=[batch_size],
          name='step_type'),
      zero_rewards,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      observations)
  rewards_nest = tf.nest.map_structure(
      lambda t: tf.convert_to_tensor(t, dtype=tf.float32), rewards)
  final_step = ts.TimeStep(
      tf.constant(
          ts.StepType.LAST,
          dtype=tf.int32,
          shape=[batch_size],
          name='step_type'),
      rewards_nest,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      observations)
  return initial_step, final_step
Example #3
def _get_initial_and_final_steps_action_mask_nested_rewards(
    observations, rewards):
  batch_size = tf.nest.flatten(observations)[0].shape[0]
  zero_rewards = {
      'reward': tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      'constraint': tf.constant(0.0, dtype=tf.float32, shape=[batch_size])
  }
  initial_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST,
          dtype=tf.int32,
          shape=[batch_size],
          name='step_type'),
      zero_rewards,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      (observations[0], observations[1]))
  rewards_nest = tf.nest.map_structure(
      lambda t: tf.convert_to_tensor(t, dtype=tf.float32), rewards)
  final_step = ts.TimeStep(
      tf.constant(
          ts.StepType.LAST,
          dtype=tf.int32,
          shape=[batch_size],
          name='step_type'),
      rewards_nest,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                  name='discount'), (tf.nest.map_structure(
                      lambda x: x + 100., observations[0]), observations[1]))
  return initial_step, final_step
Example #4
def _get_initial_and_final_steps_with_action_mask(batch_size,
                                                  context_dim,
                                                  num_actions=None):
    observation = np.array(range(batch_size * context_dim)).reshape(
        [batch_size, context_dim])
    observation = tf.constant(observation, dtype=tf.float32)
    mask = 1 - tf.eye(batch_size, num_columns=num_actions, dtype=tf.int32)
    reward = np.random.uniform(0.0, 1.0, [batch_size])
    initial_step = time_step.TimeStep(
        tf.constant(time_step.StepType.FIRST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'), (observation, mask))
    final_step = time_step.TimeStep(
        tf.constant(time_step.StepType.LAST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(reward,
                    dtype=tf.float32,
                    shape=[batch_size],
                    name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'), (observation + 100.0, mask))
    return initial_step, final_step
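For reference, the action mask built above is just the complement of an identity matrix; a small stand-alone illustration (hypothetical sizes, not from the original test):

import tensorflow as tf

mask = 1 - tf.eye(3, num_columns=4, dtype=tf.int32)  # batch_size=3, num_actions=4
print(mask.numpy())
# [[0 1 1 1]
#  [1 0 1 1]
#  [1 1 0 1]]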
Example #5
def to_transition(trajectory: Trajectory,
                  next_trajectory: Optional[Trajectory] = None) -> Transition:
    """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:,:-1]
  time_steps.observation = trajectory.observation[:,:-1]
  next_time_steps.observation = trajectory.observation[:,1:]
  next_time_steps.step_type = trajectory.next_step_type[:,:-1]
  next_time_steps.reward = trajectory.reward[:,:-1]
  next_time_steps.discount = trajectory.discount[:,:-1]

  ```
  Note that the reward and discount for `time_steps` are undefined, so they are
  filled with zeros.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]` when next_trajectory is `None`.  `discount` is assumed
      to be a scalar float; hence the shape of `trajectory.discount` must
      be `[B, T]`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced (please do not use them).

  Raises:
    ValueError: if `discount` rank is not within the range [1, 2].
  """
    _validate_rank(trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is not None:
        _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

    if next_trajectory is None:
        next_trajectory = tf.nest.map_structure(
            lambda t: composite.slice_from(t, axis=1, start=1), trajectory)
        trajectory = tf.nest.map_structure(
            lambda t: composite.slice_to(t, axis=1, end=-1), trajectory)
    policy_steps = policy_step.PolicyStep(action=trajectory.action,
                                          state=(),
                                          info=trajectory.policy_info)
    # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        trajectory.step_type,
        reward=tf.nest.map_structure(tf.zeros_like,
                                     trajectory.reward),  # unknown
        discount=tf.zeros_like(trajectory.discount),  # unknown
        observation=trajectory.observation)
    next_time_steps = ts.TimeStep(step_type=trajectory.next_step_type,
                                  reward=trajectory.reward,
                                  discount=trajectory.discount,
                                  observation=next_trajectory.observation)
    return Transition(time_steps, policy_steps, next_time_steps)
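To make the slicing convention from the docstring concrete, here is a stand-alone sketch with a toy [B, T] tensor (illustrative only, not part of the library):

import tensorflow as tf

observation = tf.constant([[10., 11., 12., 13.]])  # [B=1, T=4]
# time_steps keeps the first T-1 entries; next_time_steps keeps the last T-1.
print(observation[:, :-1].numpy())  # [[10. 11. 12.]]
print(observation[:, 1:].numpy())   # [[11. 12. 13.]]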
Example #6
 def testTrainPerArmAgentVariableActions(self):
   num_actions = 5
   obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
       2, 3, num_actions, add_num_actions_feature=True)
   time_step_spec = time_step.time_step_spec(obs_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   encoding_dim = 10
   encoder = (
       global_and_arm_feature_network.create_feed_forward_common_tower_network(
           obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
   agent = neural_linucb_agent.NeuralLinUCBAgent(
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       encoding_network=encoder,
       encoding_network_num_train_steps=10,
       encoding_dim=encoding_dim,
       accepts_per_arm_features=True,
       optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
   observations = {
       bandit_spec_utils.GLOBAL_FEATURE_KEY:
           tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
       bandit_spec_utils.PER_ARM_FEATURE_KEY:
           tf.cast(
               tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
       bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
           tf.constant([3, 4], dtype=tf.int32)
   }
   actions = np.array([0, 3], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.FIRST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   final_step = time_step.TimeStep(
       tf.constant(
           time_step.StepType.LAST,
           dtype=tf.int32,
           shape=[2],
           name='step_type'),
       tf.constant(rewards, dtype=tf.float32, name='reward'),
       tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
       observations)
   action_step = policy_step.PolicyStep(
       action=tf.convert_to_tensor(actions),
       info=policy_utilities.PerArmPolicyInfo(
           chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                        dtype=np.float32)))
   experience = _get_experience(initial_step, action_step, final_step)
   loss_info, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   loss_value = self.evaluate(loss_info)
   self.assertGreater(loss_value, 0.0)
Example #7
  def __call__(self, value: typing.Any) -> trajectory.Transition:
    """Convert `value` to an N-step Transition; validate data & prune.

    - If `value` is already a `Transition`, only validation is performed.
    - If `value` is a `Trajectory` with tensors containing a time dimension
      having `T != n + 1`, a `ValueError` is raised.

    Args:
      value: A `Trajectory` or `Transition` object to convert.

    Returns:
      A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
      the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
      the tensors will have shape `[B, T - 1, ...]`.

    Raises:
      TypeError: If `value` is not one of `Trajectory` or `Transition`.
      ValueError: If `value` has structure that doesn't match the converter's
        spec.
      TypeError: If `value` has a structure that doesn't match the converter's
        spec.
      ValueError: If `n != None` and `value` is a `Trajectory`
        with a time dimension having value other than `T=n + 1`.
    """
    if _is_transition_like(value):
      value = _as_tfa_transition(value)
    elif _is_trajectory_like(value):
      required_sequence_length = 1 if self._squeeze_time_dim else None
      _validate_trajectory(
          value,
          self._data_context.trajectory_spec,
          sequence_length=required_sequence_length)
      if self._squeeze_time_dim:
        value = tf.nest.map_structure(lambda e: tf.squeeze(e, axis=1), value)
      policy_steps = policy_step.PolicyStep(
          action=value.action, state=(), info=value.policy_info)
      # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
      time_steps = ts.TimeStep(
          value.step_type,
          reward=tf.nest.map_structure(tf.zeros_like, value.reward),  # unknown
          discount=tf.zeros_like(value.discount),  # unknown
          observation=value.observation)
      next_time_steps = ts.TimeStep(
          step_type=value.next_step_type,
          reward=value.reward,
          discount=value.discount,
          observation=tf.zeros_like(value.discount))
      value = trajectory.Transition(time_steps, policy_steps, next_time_steps)
    else:
      raise TypeError('Input type not supported: {}'.format(value))

    num_outer_dims = 1 if self._squeeze_time_dim else 2
    _validate_transition(
        value, self._data_context.transition_spec, num_outer_dims)

    value = nest_utils.prune_extra_keys(
        self._data_context.transition_spec, value)
    return value
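The `squeeze_time_dim` branch above only removes a singleton time axis; a minimal stand-alone illustration with hypothetical shapes:

import tensorflow as tf

batched = tf.zeros([4, 1, 3])           # [B, T=1, ...]
squeezed = tf.squeeze(batched, axis=1)  # [B, ...]
print(squeezed.shape)                   # (4, 3)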
Example #8
 def _get_mock_env_episode(self):
     mock_env = mock.MagicMock()
     mock_env.step.side_effect = [
         ts.TimeStep(ts.StepType.FIRST, 2, 1, [0]),
         ts.TimeStep(ts.StepType.MID, 3, 1, [1]),
         ts.TimeStep(ts.StepType.MID, 5, 1, [2]),
         ts.TimeStep(ts.StepType.LAST, 7, 1, [3]),
     ]
     return mock_env
Example #9
 def _get_mock_env_step(self):
   mock_env = mock.MagicMock()
   mock_env.observation_spec.side_effect = [
       array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
       array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
       array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
   ]
   mock_env.reset.side_effect = [ts.TimeStep(ts.StepType.MID, 5, 1, [3, 5, 2])]
   mock_env.step.side_effect = [ts.TimeStep(ts.StepType.MID, 5, 1, [1, 2, 3])]
   return mock_env
Example #10
def _get_initial_and_final_steps_with_per_arm_features(batch_size,
                                                       global_context_dim,
                                                       num_actions,
                                                       arm_context_dim):
    global_observation = np.array(range(batch_size *
                                        global_context_dim)).reshape(
                                            [batch_size, global_context_dim])
    arm_observation = np.array(
        range(batch_size * num_actions * arm_context_dim)).reshape(
            [batch_size, num_actions, arm_context_dim])
    reward = np.random.uniform(0.0, 1.0, [batch_size])
    initial_step = time_step.TimeStep(
        tf.constant(time_step.StepType.FIRST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'),
        {
            'global':
            tf.constant(global_observation,
                        dtype=tf.float32,
                        shape=[batch_size, global_context_dim],
                        name='global_observation'),
            'per_arm':
            tf.constant(arm_observation,
                        dtype=tf.float32,
                        shape=[batch_size, num_actions, arm_context_dim],
                        name='arm_observation')
        })
    final_step = time_step.TimeStep(
        tf.constant(time_step.StepType.LAST,
                    dtype=tf.int32,
                    shape=[batch_size],
                    name='step_type'),
        tf.constant(reward,
                    dtype=tf.float32,
                    shape=[batch_size],
                    name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[batch_size],
                    name='discount'),
        {
            'global':
            tf.constant(global_observation + 100.0,
                        dtype=tf.float32,
                        shape=[batch_size, global_context_dim],
                        name='global_observation'),
            'per_arm':
            tf.constant(arm_observation + 100.0,
                        dtype=tf.float32,
                        shape=[batch_size, num_actions, arm_context_dim],
                        name='arm_observation')
        })
    return initial_step, final_step
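A quick shape check of the per-arm observation dict built above (illustrative only, not part of the original test file; assumes the same imports as the helper):

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import time_step

initial_step, _ = _get_initial_and_final_steps_with_per_arm_features(
    batch_size=2, global_context_dim=3, num_actions=4, arm_context_dim=5)
print(initial_step.observation['global'].shape)   # (2, 3)
print(initial_step.observation['per_arm'].shape)  # (2, 4, 5)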
Example #11
 def _get_mock_env_episode(self):
   mock_env = mock.MagicMock()
   mock_env.step.side_effect = [
       # In practice, the first reward would be 0, but test with a reward of 1.
       ts.TimeStep(ts.StepType.FIRST, 1, 1, [0]),
       ts.TimeStep(ts.StepType.MID, 2, 1, [1]),
       ts.TimeStep(ts.StepType.MID, 3, 1, [2]),
       ts.TimeStep(ts.StepType.MID, 5, 1, [3]),
       ts.TimeStep(ts.StepType.LAST, 7, 1, [4]),
   ]
   return mock_env
Example #12
 def testMixturePolicyDynamicBatchSize(self):
     context_dim = 35
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = ts.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                 dtype=tf.int32,
                                                 minimum=0,
                                                 maximum=9,
                                                 name='action')
     sub_policies = [
         ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
     ]
     weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
     dist = tfd.Categorical(probs=weights)
     policy = mixture_policy.MixturePolicy(dist, sub_policies)
     batch_size = tf.random.uniform(shape=(),
                                    minval=10,
                                    maxval=15,
                                    dtype=tf.int32)
     time_step = ts.TimeStep(
         tf.fill(tf.expand_dims(batch_size, axis=0),
                 ts.StepType.FIRST,
                 name='step_type'),
         tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
         tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
         tf.reshape(tf.range(tf.cast(batch_size * context_dim,
                                     dtype=tf.float32),
                             dtype=tf.float32),
                    shape=[-1, context_dim],
                    name='observation'))
     action_step = policy.action(time_step)
     actions, bsize = self.evaluate([action_step.action, batch_size])
     self.assertAllEqual(actions.shape, [bsize])
     self.assertAllInSet(actions, [2, 5, 8])
     saver = policy_saver.PolicySaver(policy)
     location = os.path.join(self.get_temp_dir(), 'saved_policy')
     saver.save(location)
     loaded_policy = tf.compat.v2.saved_model.load(location)
     new_batch_size = 3
     new_time_step = ts.TimeStep(
         tf.fill(tf.expand_dims(new_batch_size, axis=0),
                 ts.StepType.FIRST,
                 name='step_type'),
         tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
         tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
         tf.reshape(tf.range(tf.cast(new_batch_size * context_dim,
                                     dtype=tf.float32),
                             dtype=tf.float32),
                    shape=[-1, context_dim],
                    name='observation'))
     new_action = self.evaluate(loaded_policy.action(new_time_step).action)
     self.assertAllEqual(new_action.shape, [new_batch_size])
     self.assertAllInSet(new_action, [2, 5, 8])
Example #13
 def testMixturePolicyNegativeProb(self):
     context_dim = 11
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = ts.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                 dtype=tf.int32,
                                                 minimum=0,
                                                 maximum=9,
                                                 name='action')
     sub_policies = [
         ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
     ]
     weights = [0, 0, 0.2, 0, 0, -0.3, 0, 0, 0.5, 0]
     policy = mixture_policy.MixturePolicy(weights, sub_policies)
     batch_size = 15
     time_step = ts.TimeStep(
         tf.constant(ts.StepType.FIRST,
                     dtype=tf.int32,
                     shape=[batch_size],
                     name='step_type'),
         tf.constant(0.0,
                     dtype=tf.float32,
                     shape=[batch_size],
                     name='reward'),
         tf.constant(1.0,
                     dtype=tf.float32,
                     shape=[batch_size],
                     name='discount'),
         tf.constant(list(range(batch_size * context_dim)),
                     dtype=tf.float32,
                     shape=[batch_size, context_dim],
                     name='observation'))
     with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                  'Negative probability'):
         policy.action(time_step)
Example #14
    def _set_names_and_shapes(self, step_type, reward, discount,
                              *flat_observations):
        """Returns a `TimeStep` namedtuple."""
        step_type = tf.identity(step_type, name='step_type')
        reward = tf.identity(reward, name='reward')
        discount = tf.identity(discount, name='discount')
        batch_shape = () if not self.batched else (self.batch_size, )
        batch_shape = tf.TensorShape(batch_shape)
        if not tf.executing_eagerly():
            # Shapes are not required in eager mode.
            reward.set_shape(batch_shape)
            step_type.set_shape(batch_shape)
            discount.set_shape(batch_shape)
        # Give each tensor a meaningful name and set the static shape.
        named_observations = []
        for obs, spec in zip(flat_observations,
                             tf.nest.flatten(self.observation_spec())):
            named_observation = tf.identity(obs, name=spec.name)
            if not tf.executing_eagerly():
                named_observation.set_shape(batch_shape.concatenate(
                    spec.shape))
            named_observations.append(named_observation)

        observations = tf.nest.pack_sequence_as(self.observation_spec(),
                                                named_observations)

        return ts.TimeStep(step_type, reward, discount, observations)
Example #15
    def _reset(self):
        """Starts a new sequence and returns the first `TimeStep`."""
        time_step = self._env.reset()
        observations = time_step.observation

        # initial frame stacking
        for _ in range(self.stack_size):
            self._frames.append(observations['pixels'])
        observations['pixels'] = np.concatenate(self._frames, axis=2)

        # initial action stacking
        if self.actions_in_obs:
            for _ in range(self.stack_size - 1):
                self._actions.append(
                    np.zeros(self._env.action_spec().shape, dtype=np.float32))
            observations['actions'] = np.stack(self._actions)

        # initial reward stacking
        if self.rewards_in_obs:
            for _ in range(self.stack_size):
                self._rewards.append(np.array(0.0, dtype=np.float32))
            observations['rewards'] = np.stack(self._rewards)

        return ts.TimeStep(time_step.step_type, time_step.reward,
                           time_step.discount, observations)
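Frame stacking as done in `_reset` above is a plain channel-axis concatenation; a stand-alone sketch with hypothetical pixel shapes:

import numpy as np

stack_size = 4
frames = [np.zeros((84, 84, 3), dtype=np.uint8) for _ in range(stack_size)]
stacked = np.concatenate(frames, axis=2)
print(stacked.shape)  # (84, 84, 12)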
Example #16
def updated_sample(sample: Any, reward_shift: float,
                   action_clipping: Optional[Tuple[float, float]],
                   use_trajectories: bool):
    """Create a sample with reward_shift and action_clipping."""
    def _clip_actions(actions):
        return tf.clip_by_value(actions,
                                clip_value_min=action_clipping[0],
                                clip_value_max=action_clipping[1])

    if use_trajectories:
        # Update trajectory.
        shifted_reward = sample.reward + reward_shift
        if action_clipping:
            return sample._replace(action=tf.nest.map_structure(
                _clip_actions, sample.action),
                                   reward=shifted_reward)
        else:
            return sample._replace(reward=shifted_reward)
    else:
        # Update transition.
        next_time_step = sample.next_time_step
        next_time_step = ts.TimeStep(step_type=next_time_step.step_type,
                                     reward=next_time_step.reward +
                                     reward_shift,
                                     discount=next_time_step.discount,
                                     observation=next_time_step.observation)
        action_step = sample.action_step
        if action_clipping:
            action_step = action_step._replace(action=tf.nest.map_structure(
                _clip_actions, action_step.action))
        return trajectory.Transition(time_step=sample.time_step,
                                     action_step=action_step,
                                     next_time_step=next_time_step)
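The action clipping used above is plain `tf.clip_by_value`; for example, with illustrative values:

import tensorflow as tf

actions = tf.constant([-2.0, 0.3, 5.0])
print(tf.clip_by_value(actions, clip_value_min=-1.0, clip_value_max=1.0).numpy())
# [-1.   0.3  1. ]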
Example #17
  def testWithAdvantageFn(self, with_value_network):
    advantage_fn = mock.Mock(
        side_effect=lambda returns, _: returns)

    value_network = (DummyValueNet(self._obs_spec) if with_value_network
                     else None)
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=False),
        value_network=value_network,
        advantage_fn=advantage_fn,
        optimizer=None,
    )

    step_type = tf.constant(
        [[ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
          ts.StepType.LAST]])
    reward = tf.constant([[0, 0, 0, 0]], dtype=tf.float32)
    discount = tf.constant([[1, 1, 1, 1]], dtype=tf.float32)
    observations = tf.constant(
        [[[1, 2], [1, 2], [1, 2], [1, 2]]], dtype=tf.float32)
    time_steps = ts.TimeStep(step_type, reward, discount, observations)

    actions = tf.constant([[[0], [1], [2], [3]]], dtype=tf.float32)

    agent.total_loss(time_steps, actions, time_steps.reward, None)

    advantage_fn.assert_called_once()
Example #18
  def testPolicyGradientLossMultipleEpisodes(self):
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=True),
        optimizer=None,
    )

    step_type = tf.constant(
        [ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
         ts.StepType.LAST])
    reward = tf.constant([0, 0, 0, 0], dtype=tf.float32)
    discount = tf.constant([1, 1, 1, 1], dtype=tf.float32)
    observations = tf.constant(
        [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
    time_steps = ts.TimeStep(step_type, reward, discount, observations)

    actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)
    actions_distribution = agent.collect_policy.distribution(
        time_steps).action
    returns = tf.constant([1.9, 1.9, 1.0, 1.0], dtype=tf.float32)

    expected_loss = 5.140229225158691
    loss = agent.policy_gradient_loss(
        actions_distribution, actions, time_steps.is_last(), returns, 2)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose(loss_, expected_loss)
Example #19
    def testObservationShapeMismatch(self, batch_size, exploration_strategy):
        policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                                  self._b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  exploration_strategy)

        current_time_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST,
                        dtype=tf.int32,
                        shape=[batch_size],
                        name='step_type'),
            tf.constant(0.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='reward'),
            tf.constant(1.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='discount'),
            tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                        dtype=tf.float32,
                        shape=[batch_size, self._obs_dim + 1],
                        name='observation'))
        with self.assertRaisesRegexp(
                ValueError, r'Observation shape is expected to be \[None, 2\].'
                r' Got \[%d, 3\].' % batch_size):
            policy.action(current_time_step)
Example #20
 def _per_arm_time_step_batch(self, batch_size):
     return ts.TimeStep(
         tf.constant(ts.StepType.FIRST,
                     dtype=tf.int32,
                     shape=[batch_size],
                     name='step_type'),
         tf.constant(0.0,
                     dtype=tf.float32,
                     shape=[batch_size],
                     name='reward'),
         tf.constant(1.0,
                     dtype=tf.float32,
                     shape=[batch_size],
                     name='discount'),
         {
             bandit_spec_utils.GLOBAL_FEATURE_KEY:
             tf.constant(np.array(range(batch_size * self._obs_dim)),
                         dtype=tf.float32,
                         shape=[batch_size, self._obs_dim],
                         name='observation'),
             bandit_spec_utils.PER_ARM_FEATURE_KEY:
             tf.constant(np.array(range(
                 batch_size * self._num_actions * 4)),
                         dtype=tf.float32,
                         shape=[batch_size, self._num_actions, 4],
                         name='observation'),
             bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
             tf.ones([batch_size], dtype=tf.int32) * 2
         })
Example #21
    def _create_experience(_):
        observations = tf.constant(
            [[[1, 2], [3, 4], [5, 6]],
             [[1, 2], [3, 4], [5, 6]]],
            dtype=tf.float32)
        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(
            step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
            reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                  dtype=tf.float32)

        policy_info = {
            'dist_params': action_distribution_parameters,
        }
        policy_info['value_prediction'] = value_preds
        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)
        return agent._preprocess(experience)  # pylint: disable=protected-access
Example #22
    def _step(self, action):
        """Steps the environment."""
        if self.current_time_step().is_last():
            return self.reset()

        total_reward = 0

        for _ in range(self._action_repeat):
            time_step = self._env.step(action)

            if self._frames is not None and self._stack_within_repeat:
                self._frames.append(time_step.observation['pixels'])

            total_reward += time_step.reward
            if time_step.is_first() or time_step.is_last():
                break

        # Only add the last frame of the action repeat if we don't stack within.
        if self._frames is not None and not self._stack_within_repeat:
            self._frames.append(time_step.observation['pixels'])

        total_reward = np.asarray(total_reward,
                                  dtype=np.asarray(time_step.reward).dtype)

        # Stack frames.
        if self._frames is not None:
            time_step.observation['pixels'] = np.concatenate(self._frames,
                                                             axis=2)

        return ts.TimeStep(time_step.step_type, total_reward,
                           time_step.discount, time_step.observation)
Example #23
  def testMakeTimestepMaskWithPartialEpisode(self, allow_partial):
    first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

    next_step_types = tf.constant([[mid, mid, last, first,
                                    mid, mid, last, first,
                                    mid, mid],
                                   [mid, mid, last, first,
                                    mid, mid, mid, mid,
                                    mid, last]])
    zeros = tf.zeros_like(next_step_types)
    next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

    if not allow_partial:
      # Mask should be 0.0 for transition timesteps (3, 7) and for all timesteps
      #   belonging to the final, incomplete episode.
      expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                       [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
    else:
      # Zeros only between episodes. Incomplete episodes are valid and not
      # zeroed out.
      expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
    timestep_mask = ppo_utils.make_timestep_mask(
        next_time_step, allow_partial_episodes=allow_partial)

    timestep_mask_ = self.evaluate(timestep_mask)
    self.assertAllClose(expected_mask, timestep_mask_)
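For reference, the zeros in the allow_partial case above sit exactly where the next step type is FIRST, i.e. at episode boundaries. A stand-alone illustration of that boundary rule (not the library implementation):

import tensorflow as tf

FIRST, MID, LAST = 0, 1, 2
next_step_types = tf.constant([[MID, MID, LAST, FIRST, MID]])
boundary_mask = tf.cast(tf.not_equal(next_step_types, FIRST), tf.float32)
print(boundary_mask.numpy())  # [[1. 1. 1. 0. 1.]]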
Example #24
    def test_collect_data_spec_transition(self):
        episode_dict = {
            'states':
            np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
                     dtype=np.float32),
            'actions':
            np.array([[1.], [2.], [3.], [4.]], dtype=np.float32),
            'rewards':
            np.array([[0.], [1.], [0.], [1.]], dtype=np.float32),
            'discounts':
            np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32),
            'episode_start_index':
            np.array([0, 2], dtype=np.int32)
        }

        time_step_spec = time_step.TimeStep(
            step_type=ArraySpec(shape=[], dtype=np.int32),
            reward=ArraySpec(shape=[1], dtype=np.float32),
            discount=ArraySpec(shape=[], dtype=np.float32),
            observation=ArraySpec(shape=[2], dtype=np.float32))
        action_spec = policy_step.PolicyStep(
            action=ArraySpec(shape=[1], dtype=np.float32), state=(), info=())
        expected_spec = trajectory.Transition(time_step=time_step_spec,
                                              action_step=action_spec,
                                              next_time_step=time_step_spec)
        actual_spec = create_collect_data_spec(episode_dict,
                                               use_trajectories=False)
        self.assertEqual(actual_spec, expected_spec)
Example #25
  def get_single_agent_specs(self, time_step_spec, action_spec):
    """Get single agent version of environment specs to feed to baby agents."""

    def make_single_agent_spec(spec):
      if len(spec.shape) == 1:
        shape = 1
      else:
        shape = spec.shape[1:]
      return tensor_spec.BoundedTensorSpec(
          shape=shape,
          name=spec.name,
          minimum=spec.minimum,
          maximum=spec.maximum,
          dtype=spec.dtype)

    single_obs_spec = tf.nest.map_structure(make_single_agent_spec,
                                            time_step_spec.observation)
    single_reward_spec = tensor_spec.TensorSpec(
        shape=(), dtype=time_step_spec.reward.dtype, name='reward')
    single_time_step_spec = ts.TimeStep(time_step_spec.step_type,
                                        single_reward_spec,
                                        time_step_spec.discount,
                                        single_obs_spec)
    single_action_spec = action_spec[0]
    return single_obs_spec, single_time_step_spec, single_action_spec
Example #26
    def testObservationShapeMismatch(self, batch_size,
                                     actions_from_reward_layer):
        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            DummyNet(),
            self._encoding_dim,
            get_reward_layer(),
            actions_from_reward_layer=actions_from_reward_layer,
            cov_matrix=self._a,
            data_vector=self._b,
            num_samples=self._num_samples_per_arm,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

        current_time_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST,
                        dtype=tf.int32,
                        shape=[batch_size],
                        name='step_type'),
            tf.constant(0.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='reward'),
            tf.constant(1.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='discount'),
            tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                        dtype=tf.float32,
                        shape=[batch_size, self._obs_dim + 1],
                        name='observation'))
        with self.assertRaisesRegexp(
                ValueError, r'Observation shape is expected to be \[None, 2\].'
                r' Got \[%d, 3\].' % batch_size):
            policy.action(current_time_step)
Example #27
 def setUp(self):
     super(PolicySaverTest, self).setUp()
     self._time_step_spec = ts.TimeStep(
         step_type=tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 name='st',
                                                 minimum=0,
                                                 maximum=2),
         reward=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                              shape=(),
                                              name='reward',
                                              minimum=0.0,
                                              maximum=5.0),
         discount=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                                shape=(),
                                                name='discount',
                                                minimum=0.0,
                                                maximum=1.0),
         observation=tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                                   shape=(4, ),
                                                   name='obs',
                                                   minimum=-10.0,
                                                   maximum=10.0))
     self._action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                       shape=(),
                                                       minimum=0,
                                                       maximum=10,
                                                       name='act_0')
     self._global_seed = 12345
     tf.compat.v1.set_random_seed(self._global_seed)
Example #28
 def _convert_string_vector_to_action_input(self, example):
     return (ts.TimeStep(
         step_type=tf.cast(tf.strings.to_number(example[:, 0], tf.float32),
                           tf.int32),
         reward=tf.strings.to_number(example[:, 1], tf.float32),
         discount=tf.strings.to_number(example[:, 2], tf.float32),
         observation=tf.strings.to_number(example[:, 3:7], tf.float32)), ())
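The conversion above relies on `tf.strings.to_number`; a small stand-alone example with a hypothetical string batch:

import tensorflow as tf

example = tf.constant([['0', '1.5', '1.0', '0.1', '0.2', '0.3', '0.4']])
print(tf.strings.to_number(example[:, 1], tf.float32).numpy())  # [1.5]
print(tf.strings.to_number(example[:, 3:7], tf.float32).shape)  # (1, 4)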
Example #29
    def _pack_and_filter_timestep_observation(self, timestep):
        """Pack and filter observations into a single dimension.

    Args:
      timestep: A `TimeStep` namedtuple containing:
        - step_type: A `StepType` value.
        - reward: Reward at this timestep.
        - discount: A discount in the range [0, 1].
        - observation: A NumPy array, or a nested dict, list or tuple of arrays
          corresponding to `observation_spec()`.

    Returns:
      A new `TimeStep` namedtuple whose observations have been filtered and
        packed into a single dimension.
    """
        # We can't set attributes on the TimeStep namedtuple, so we make a copy
        # of the observations.
        observations = timestep.observation
        if self._observations_allowlist is not None:
            observations = self._filter_observations(observations)

        return ts.TimeStep(
            timestep.step_type, timestep.reward, timestep.discount,
            self._flatten_nested_observations(observations,
                                              is_batched=self._env.batched))
Example #30
        def loop_body(time, time_step, policy_state, output_action_tas,
                      output_policy_info_tas):
            """Runs a step in environment.

      While loop will call multiple times.

      Args:
        time: Step time.
        time_step: Previous step's `TimeStep`.
        policy_state: Policy state tensor or nested structure of tensors.
        output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
        output_policy_info_tas: Updated nest of `tf.TensorArray`, the new
          policy info.

      Returns:
        loop_vars for next iteration of tf.while_loop.
      """
            policy_state, next_output_action_tas, next_output_policy_info_tas = (
                process_step(time, time_step, policy_state, output_action_tas,
                             output_policy_info_tas))

            ta_read = lambda ta: ta.read(time)
            ta_read_prev = lambda ta: ta.read(time - 1)
            time_step = ts.TimeStep(
                step_type=ta_read(trajectory_tas.step_type),
                observation=tf.nest.map_structure(ta_read,
                                                  trajectory_tas.observation),
                reward=tf.nest.map_structure(ta_read_prev,
                                             trajectory_tas.reward),
                discount=ta_read_prev(trajectory_tas.discount))

            return (time + 1, time_step, policy_state, next_output_action_tas,
                    next_output_policy_info_tas)
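Reading the reward and discount one step behind the observation, as `loop_body` does above, is just an offset `TensorArray.read`; a stand-alone sketch:

import tensorflow as tf

ta = tf.TensorArray(tf.float32, size=3).unstack(tf.constant([10., 11., 12.]))
time = tf.constant(2)
print(ta.read(time).numpy())      # 12.0
print(ta.read(time - 1).numpy())  # 11.0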