Example 1
    def test_weighted_generator(self):
        data0 = types.Transition(np.array([[1], [2], [3]]), (), _REWARD, (),
                                 ())
        it0 = iter([data0])

        data1 = types.Transition(np.array([[4], [5], [6]]), (), _REWARD, (),
                                 ())
        data2 = types.Transition(np.array([[7], [8], [9]]), (), _REWARD, (),
                                 ())
        it1 = iter([
            reverb.ReplaySample(info=reverb.SampleInfo(
                *[() for _ in reverb.SampleInfo.tf_dtypes()]),
                                data=data1),
            reverb.ReplaySample(info=reverb.SampleInfo(
                *[() for _ in reverb.SampleInfo.tf_dtypes()]),
                                data=data2)
        ])

        weighted_it = builder._generate_samples_with_demonstrations(
            it0, it1, policy_to_expert_data_ratio=2, batch_size=3)

        np.testing.assert_array_equal(
            next(weighted_it).data.observation, np.array([[1], [4], [5]]))
        np.testing.assert_array_equal(
            next(weighted_it).data.observation, np.array([[7], [8], [2]]))
        self.assertRaises(StopIteration, lambda: next(weighted_it))
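The `builder._generate_samples_with_demonstrations` helper is private to the builder under test; below is a minimal sketch of the interleaving idea, assuming a simple item-level ratio rather than the batch-level mixing the real helper performs (the `interleave_by_ratio` name is hypothetical, not part of Acme):

import itertools

def interleave_by_ratio(demo_iterator, replay_iterator,
                        policy_to_expert_data_ratio):
    # Yield one demonstration item, then `policy_to_expert_data_ratio`
    # replay items, until the demonstration iterator is exhausted.
    for demonstration in demo_iterator:
        yield demonstration
        yield from itertools.islice(replay_iterator,
                                    policy_to_expert_data_ratio)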
Example 2
def transition_dataset(environment: dm_env.Environment) -> tf.data.Dataset:
    """Fake dataset of Reverb N-step transition samples.

  Args:
    environment: Used to create a fake transition by looking at the
      observation, action, discount and reward specs.

  Returns:
    tf.data.Dataset that produces the same fake N-step transition ReverSample
    object indefinitely.
  """

    observation = environment.observation_spec().generate_value()
    action = environment.action_spec().generate_value()
    reward = environment.reward_spec().generate_value()
    discount = environment.discount_spec().generate_value()
    data = (observation, action, reward, discount, observation)

    key = np.array(0, np.uint64)
    probability = np.array(1.0, np.float64)
    table_size = np.array(1, np.int64)
    priority = np.array(1.0, np.float64)
    info = reverb.SampleInfo(key=key,
                             probability=probability,
                             table_size=table_size,
                             priority=priority)
    sample = reverb.ReplaySample(info=info, data=data)

    return tf.data.Dataset.from_tensors(sample).repeat()
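A hedged usage sketch; `fakes.DiscreteEnvironment` from `acme.testing` is used here only as an assumed stand-in for a real environment:

from acme.testing import fakes

environment = fakes.DiscreteEnvironment()
dataset = transition_dataset(environment)
# Every element is the same fake N-step ReplaySample, repeated forever.
sample = next(iter(dataset))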
Example 3
def _n_step_transition_from_episode(observations: acme_types.NestedTensor,
                                    actions: tf.Tensor,
                                    rewards: tf.Tensor,
                                    discounts: tf.Tensor,
                                    n_step: int,
                                    discount: float):
  """Produce Reverb-like N-step transition from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    n_step: number of steps to squash into a single transition.
    discount: discount to use for TD updates.

  Returns:
    (o_t, a_t, r_t, d_t, o_tp1) tuple.
  """

  max_index = tf.shape(rewards)[0] - 1
  first = tf.random.uniform(shape=(), minval=0, maxval=max_index - 1,
                            dtype=tf.int32)
  last = tf.minimum(first + n_step, max_index)

  o_t = tree.map_structure(operator.itemgetter(first), observations)
  a_t = tree.map_structure(operator.itemgetter(first), actions)
  o_tp1 = tree.map_structure(operator.itemgetter(last), observations)

  # 0, 1, ..., n-1.
  discount_range = tf.cast(tf.range(last - first), tf.float32)
  # 1, g, ..., g^{n-1}.
  additional_discounts = tf.pow(discount, discount_range)
  # 1, d_t, d_t * d_{t+1}, ..., d_t * ... * d_{t+n-2}.
  discounts = tf.concat([[1.], tf.math.cumprod(discounts[first:last-1])], 0)
  # 1, g * d_t, ..., g^{n-1} * d_t * ... * d_{t+n-2}.
  discounts *= additional_discounts
  # r_t + g * d_t * r_{t+1} + ... + g^{n-1} * d_t * ... * d_{t+n-2} * r_{t+n-1}
  # We have to shift rewards by one so last=max_index corresponds to transitions
  # that include the last reward.
  r_t = tf.reduce_sum(rewards[first+1:last+1] * discounts)

  # g^{n-1} * d_{t} * ... * d_{t+n-2}.
  d_t = discounts[-1]

  key = tf.constant(0, tf.uint64)
  probability = tf.constant(1.0, tf.float64)
  table_size = tf.constant(1, tf.int64)
  priority = tf.constant(1.0, tf.float64)
  info = reverb.SampleInfo(
      key=key,
      probability=probability,
      table_size=table_size,
      priority=priority)
  return reverb.ReplaySample(
      info=info, data=acme_types.Transition(o_t, a_t, r_t, d_t, o_tp1))
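A minimal sketch of calling the helper on a toy four-step episode; the tensor shapes and hyperparameters below are illustrative assumptions, not values from the original tests:

observations = tf.zeros([4, 2])
actions = tf.zeros([4, 1])
rewards = tf.ones([4])
discounts = tf.ones([4])

sample = _n_step_transition_from_episode(
    observations, actions, rewards, discounts, n_step=2, discount=0.99)
# sample.data is an acme_types.Transition(o_t, a_t, r_t, d_t, o_tp1).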
Example 4
def _sequence_from_episode(observations: acme_types.NestedTensor,
                           actions: tf.Tensor,
                           rewards: tf.Tensor,
                           discounts: tf.Tensor,
                           extra_spec: acme_types.NestedSpec,
                           period: int,
                           sequence_length: int):
  """Produce Reverb-like sequence from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  This function generates fake (all-zero) extras.

  See docs for reverb.SequenceAdder() for more details.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    extra_spec: A possibly nested structure of specs for extras. This function
      will generate fake (all-zero) extras.
    period: The period with which we add sequences.
    sequence_length: The fixed length of sequences we wish to add.

  Returns:
    (o_t, a_t, r_t, d_t, e_t) Tuple.
  """

  length = tf.shape(rewards)[0]
  first = tf.random.uniform(shape=(), minval=0, maxval=length, dtype=tf.int32)
  first = first // period * period  # Get a multiple of `period`.
  to = tf.minimum(first + sequence_length, length)

  def _slice_and_pad(x):
    pad_length = sequence_length + first - to
    padding_shape = tf.concat([[pad_length], tf.shape(x)[1:]], axis=0)
    result = tf.concat([x[first:to], tf.zeros(padding_shape, x.dtype)], axis=0)
    result.set_shape([sequence_length] + x.shape.as_list()[1:])
    return result

  o_t = tree.map_structure(_slice_and_pad, observations)
  a_t = tree.map_structure(_slice_and_pad, actions)
  r_t = _slice_and_pad(rewards)
  d_t = _slice_and_pad(discounts)

  def _sequence_zeros(spec):
    return tf.zeros([sequence_length] + spec.shape, spec.dtype)

  e_t = tree.map_structure(_sequence_zeros, extra_spec)

  key = tf.zeros([sequence_length], tf.uint64)
  probability = tf.ones([sequence_length], tf.float64)
  table_size = tf.ones([sequence_length], tf.int64)
  info = reverb.SampleInfo(
      key=key, probability=probability, table_size=table_size)
  return reverb.ReplaySample(info=info, data=(o_t, a_t, r_t, d_t, e_t))
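Again a hedged toy call; the episode length, feature shapes, `period` and `sequence_length` are assumptions, chosen so that some starting points require zero-padding:

observations = tf.zeros([10, 3])
actions = tf.zeros([10, 1])
rewards = tf.ones([10])
discounts = tf.ones([10])
extra_spec = tf.TensorSpec(shape=[2], dtype=tf.float32)

sample = _sequence_from_episode(
    observations, actions, rewards, discounts, extra_spec,
    period=4, sequence_length=5)
# Each entry of sample.data has leading dimension sequence_length == 5.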
Example 5
class ReverbUtilsTest(absltest.TestCase):

  def test_make_replay_table_preserves_table_info(self):
    limiter = reverb.rate_limiters.SampleToInsertRatio(
        samples_per_insert=1, min_size_to_sample=2, error_buffer=(0, 10))
    table = reverb.Table(
        name='test',
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        max_size=10,
        rate_limiter=limiter)
    new_table = reverb_utils.make_replay_table_from_info(table.info)
    new_info = new_table.info

    # table_worker_time is not set by the above utility, since it is
    # monitoring information specific to a given table. We therefore copy it
    # over so that the assertion below checks that everything else matches.

    new_info.table_worker_time.sleeping_ms = (
        table.info.table_worker_time.sleeping_ms)

    self.assertEqual(new_info, table.info)

  _EMPTY_INFO = reverb.SampleInfo(*[() for _ in reverb.SampleInfo.tf_dtypes()])
  _DUMMY_OBS = np.array([[[0], [1], [2]]])
  _DUMMY_ACTION = np.array([[[3], [4], [5]]])
  _DUMMY_REWARD = np.array([[6, 7, 8]])
  _DUMMY_DISCOUNT = np.array([[.99, .99, .99]])
  _DUMMY_NEXT_OBS = np.array([[[1], [2], [0]]])
  _DUMMY_RETURN = np.array([[20.77, 14.92, 8.]])

  def _create_dummy_steps(self):
    return reverb_adders.Step(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        start_of_episode=True,
        extras={'return': self._DUMMY_RETURN})

  def _create_dummy_transitions(self):
    return types.Transition(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        next_observation=self._DUMMY_NEXT_OBS,
        extras={'return': self._DUMMY_RETURN})

  def test_replay_sample_to_sars_transition_is_sequence(self):
    fake_sample = reverb.ReplaySample(
        info=self._EMPTY_INFO, data=self._create_dummy_steps())
    fake_transition = self._create_dummy_transitions()
    transition_from_sample = reverb_utils.replay_sample_to_sars_transition(
        fake_sample, is_sequence=True)
    tree.map_structure(np.testing.assert_array_equal, transition_from_sample,
                       fake_transition)
Example 6
def _build_sequence_example(sequences):
    """Convert raw sequences into a Reverb sequence sample."""
    o = sequences['observation']
    a = sequences['action']
    r = sequences['reward']
    p = sequences['discount']

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=(o, a, r, p))
Example 7
def _build_sequence_example(sequences):
    """Convert raw sequences into a Reverb sequence sample."""
    data = adders.Step(observation=sequences['observation'],
                       action=sequences['action'],
                       reward=sequences['reward'],
                       discount=sequences['discount'],
                       start_of_episode=(),
                       extras=())

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=data)
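A hedged sketch of the expected input; the raw-sequence dict layout is an assumption inferred from the field names above:

sequences = {
    'observation': tf.zeros([10, 3]),
    'action': tf.zeros([10, 1]),
    'reward': tf.zeros([10]),
    'discount': tf.ones([10]),
}
sample = _build_sequence_example(sequences)
# sample.data is an adders.Step for the second variant and a plain
# (o, a, r, p) tuple for the first.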
Example 8
def _build_sarsa_example(sequences):
    """Convert raw sequences into a Reverb n-step SARSA sample."""

    o_tm1 = tree.map_structure(lambda t: t[0], sequences['observation'])
    o_t = tree.map_structure(lambda t: t[1], sequences['observation'])
    a_tm1 = tree.map_structure(lambda t: t[0], sequences['action'])
    a_t = tree.map_structure(lambda t: t[1], sequences['action'])
    r_t = tree.map_structure(lambda t: t[0], sequences['reward'])
    p_t = tree.map_structure(lambda t: t[0], sequences['discount'])

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info,
                               data=(o_tm1, a_tm1, r_t, p_t, o_t, a_t))
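A hedged sketch: sequences of length two supply the consecutive timesteps indexed by t[0] and t[1]; the dict layout is again an assumption:

sequences = {
    'observation': tf.zeros([2, 3]),
    'action': tf.zeros([2, 1]),
    'reward': tf.zeros([2]),
    'discount': tf.ones([2]),
}
sample = _build_sarsa_example(sequences)
# sample.data == (o_tm1, a_tm1, r_t, p_t, o_t, a_t).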
Example 9
class ReverbUtilsTest(absltest.TestCase):

  def test_make_replay_table_preserves_table_info(self):
    limiter = reverb.rate_limiters.SampleToInsertRatio(
        samples_per_insert=1, min_size_to_sample=2, error_buffer=(0, 10))
    table = reverb.Table(
        name='test',
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        max_size=10,
        rate_limiter=limiter)
    table_from_info = reverb_utils.make_replay_table_from_info(table.info)
    self.assertEqual(table_from_info.info, table.info)

  _EMPTY_INFO = reverb.SampleInfo((), (), (), ())
  _DUMMY_OBS = np.array([[[0], [1], [2]]])
  _DUMMY_ACTION = np.array([[[3], [4], [5]]])
  _DUMMY_REWARD = np.array([[6, 7, 8]])
  _DUMMY_DISCOUNT = np.array([[.99, .99, .99]])
  _DUMMY_NEXT_OBS = np.array([[[1], [2], [0]]])

  def _create_dummy_steps(self):
    return reverb_adders.Step(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        start_of_episode=True,
        extras=())

  def _create_dummy_transitions(self):
    return types.Transition(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        next_observation=self._DUMMY_NEXT_OBS)

  def test_replay_sample_to_sars_transition_is_sequence(self):
    fake_sample = reverb.ReplaySample(
        info=self._EMPTY_INFO, data=self._create_dummy_steps())
    fake_transition = self._create_dummy_transitions()
    transition_from_sample = reverb_utils.replay_sample_to_sars_transition(
        fake_sample, is_sequence=True)
    tree.map_structure(np.testing.assert_array_equal, transition_from_sample,
                       fake_transition)
Example 10
    def test_after_train_step_fn_with_fresh_data_only(self,
                                                      create_strategy_fn):
        strategy = create_strategy_fn()
        with strategy.scope():
            # Prepare the test context.
            train_step = train_utils.create_train_step()
            train_step.assign(225)
            train_steps_per_policy_update = 100

            # Create the after train function to test, and the test input.
            after_train_step_fn = (
                train_utils.create_staleness_metrics_after_train_step_fn(
                    train_step,
                    train_steps_per_policy_update=train_steps_per_policy_update
                ))
            observation_train_steps = np.array([[200], [200], [200]],
                                               dtype=np.int64)

            # Define the expectations (expected scalar summary calls).
            expected_scalar_summary_calls = [
                mock.call(name='staleness/max_train_step_delta_in_batch',
                          data=0,
                          step=225),
                mock.call(name='staleness/max_policy_update_delta_in_batch',
                          data=0,
                          step=225),
                mock.call(name='staleness/num_stale_obserations_in_batch',
                          data=0,
                          step=225)
            ]

            # Call the after train function and check the expectations.
            with mock.patch.object(tf.summary, 'scalar',
                                   autospec=True) as mock_scalar_summary:
                # Call `after_train_step_fn` on the test input. The
                # observation train steps are assumed to be stored in the
                # `priority` field of the Reverb sample info.
                info = reverb.SampleInfo(
                    *[None for _ in reverb.SampleInfo.tf_dtypes()])
                info = info._replace(priority=observation_train_steps)
                strategy.run(after_train_step_fn, args=((None, info), None))

                # Check if the expected calls happened on the scalar summary.
                mock_scalar_summary.assert_has_calls(
                    expected_scalar_summary_calls, any_order=False)
Example 11
def _make_reverb_sample(o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras):
    """Create Reverb sample with offline data.

    Args:
      o_t: Observation at time t.
      a_t: Action at time t.
      r_t: Reward at time t.
      d_t: Discount at time t.
      o_tp1: Observation at time t+1.
      a_tp1: Action at time t+1.
      extras: Dictionary with extra features.

    Returns:
      Replay sample with fake info: key=0, probability=1, table_size=0,
      priority=1.
    """
    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    data = (o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras)
    return reverb.ReplaySample(info=info, data=data)
Example 12
def add_info_fn(data):
    info = reverb.SampleInfo(key=0,
                             probability=0.0,
                             table_size=0,
                             priority=0.0)
    return reverb.ReplaySample(info=info, data=data)
Example 13
def _reverb_sample(*data_tuple):
    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=data_tuple)
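A hedged usage sketch; when a function taking *args is mapped over a tf.data.Dataset of tuples, each tuple component arrives as a separate positional argument:

dataset = tf.data.Dataset.from_tensors((tf.zeros([3]), tf.zeros([1])))
dataset = dataset.map(_reverb_sample)
# Each element is now a ReplaySample carrying constant fake info.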