def test_weighted_generator(self):
  data0 = types.Transition(np.array([[1], [2], [3]]), (), _REWARD, (), ())
  it0 = iter([data0])

  data1 = types.Transition(np.array([[4], [5], [6]]), (), _REWARD, (), ())
  data2 = types.Transition(np.array([[7], [8], [9]]), (), _REWARD, (), ())
  it1 = iter([
      reverb.ReplaySample(
          info=reverb.SampleInfo(
              *[() for _ in reverb.SampleInfo.tf_dtypes()]),
          data=data1),
      reverb.ReplaySample(
          info=reverb.SampleInfo(
              *[() for _ in reverb.SampleInfo.tf_dtypes()]),
          data=data2)
  ])

  weighted_it = builder._generate_samples_with_demonstrations(
      it0, it1, policy_to_expert_data_ratio=2, batch_size=3)
  np.testing.assert_array_equal(
      next(weighted_it).data.observation, np.array([[1], [4], [5]]))
  np.testing.assert_array_equal(
      next(weighted_it).data.observation, np.array([[7], [8], [2]]))
  self.assertRaises(StopIteration, lambda: next(weighted_it))
def transition_dataset(environment: dm_env.Environment) -> tf.data.Dataset:
  """Fake dataset of Reverb N-step transition samples.

  Args:
    environment: Used to create a fake transition by looking at the
      observation, action, discount and reward specs.

  Returns:
    tf.data.Dataset that produces the same fake N-step transition ReplaySample
    object indefinitely.
  """
  observation = environment.observation_spec().generate_value()
  action = environment.action_spec().generate_value()
  reward = environment.reward_spec().generate_value()
  discount = environment.discount_spec().generate_value()
  data = (observation, action, reward, discount, observation)

  key = np.array(0, np.uint64)
  probability = np.array(1.0, np.float64)
  table_size = np.array(1, np.int64)
  priority = np.array(1.0, np.float64)
  info = reverb.SampleInfo(
      key=key,
      probability=probability,
      table_size=table_size,
      priority=priority)
  sample = reverb.ReplaySample(info=info, data=data)

  return tf.data.Dataset.from_tensors(sample).repeat()
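# Below is a minimal usage sketch for `transition_dataset`. The tiny
# `_FakeSpecEnv` class is a hypothetical stand-in defined only so the example
# is self-contained; any `dm_env.Environment` with well-defined specs would
# work the same way.
from dm_env import specs


class _FakeSpecEnv(dm_env.Environment):
  """Bare-bones environment exposing only specs (illustrative assumption)."""

  def observation_spec(self):
    return specs.Array(shape=(2,), dtype=np.float32, name='observation')

  def action_spec(self):
    return specs.DiscreteArray(num_values=3, name='action')

  def reset(self):
    raise NotImplementedError  # Not needed here: only the specs are used.

  def step(self, action):
    raise NotImplementedError  # Not needed here: only the specs are used.


fake_dataset = transition_dataset(_FakeSpecEnv())
fake_sample = next(fake_dataset.as_numpy_iterator())  # Same sample, repeated.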
def _n_step_transition_from_episode(observations: acme_types.NestedTensor,
                                    actions: tf.Tensor,
                                    rewards: tf.Tensor,
                                    discounts: tf.Tensor,
                                    n_step: int,
                                    discount: float):
  """Produce Reverb-like N-step transition from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    n_step: number of steps to squash into a single transition.
    discount: discount to use for TD updates.

  Returns:
    (o_t, a_t, r_t, d_t, o_tp1) tuple.
  """
  max_index = tf.shape(rewards)[0] - 1
  first = tf.random.uniform(
      shape=(), minval=0, maxval=max_index - 1, dtype=tf.int32)
  last = tf.minimum(first + n_step, max_index)

  o_t = tree.map_structure(operator.itemgetter(first), observations)
  a_t = tree.map_structure(operator.itemgetter(first), actions)
  o_tp1 = tree.map_structure(operator.itemgetter(last), observations)

  # 0, 1, ..., n-1.
  discount_range = tf.cast(tf.range(last - first), tf.float32)
  # 1, g, ..., g^{n-1}.
  additional_discounts = tf.pow(discount, discount_range)
  # 1, d_t, d_t * d_{t+1}, ..., d_t * ... * d_{t+n-2}.
  discounts = tf.concat([[1.], tf.math.cumprod(discounts[first:last - 1])], 0)
  # 1, g * d_t, ..., g^{n-1} * d_t * ... * d_{t+n-2}.
  discounts *= additional_discounts
  # r_t + g * d_t * r_{t+1} + ... + g^{n-1} * d_t * ... * d_{t+n-2} * r_{t+n-1}
  # We have to shift rewards by one so that last=max_index corresponds to
  # transitions that include the last reward.
  r_t = tf.reduce_sum(rewards[first + 1:last + 1] * discounts)

  # g^{n-1} * d_{t} * ... * d_{t+n-2}.
  d_t = discounts[-1]

  key = tf.constant(0, tf.uint64)
  probability = tf.constant(1.0, tf.float64)
  table_size = tf.constant(1, tf.int64)
  priority = tf.constant(1.0, tf.float64)
  info = reverb.SampleInfo(
      key=key,
      probability=probability,
      table_size=table_size,
      priority=priority)
  return reverb.ReplaySample(
      info=info, data=acme_types.Transition(o_t, a_t, r_t, d_t, o_tp1))
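# A quick sketch of how `_n_step_transition_from_episode` might be exercised,
# assuming TF2 eager execution and the imports used above (tf, tree, operator,
# acme_types, reverb). The episode tensors are made up for illustration.
_episode_length = 5
_observations = tf.reshape(
    tf.range(_episode_length, dtype=tf.float32), [_episode_length, 1])
_actions = tf.zeros([_episode_length], tf.int32)
_rewards = tf.ones([_episode_length], tf.float32)
_discounts = tf.ones([_episode_length], tf.float32)

_n_step_sample = _n_step_transition_from_episode(
    _observations, _actions, _rewards, _discounts, n_step=2, discount=0.99)
# `_n_step_sample.data` is an `acme_types.Transition`; with all-one rewards and
# discounts, squashing two steps gives a transition reward of 1 + 0.99.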
def _sequence_from_episode(observations: acme_types.NestedTensor,
                           actions: tf.Tensor,
                           rewards: tf.Tensor,
                           discounts: tf.Tensor,
                           extra_spec: acme_types.NestedSpec,
                           period: int,
                           sequence_length: int):
  """Produce Reverb-like sequence from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  This function generates fake (all-zero) extras.

  See docs for reverb.SequenceAdder() for more details.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    extra_spec: A possibly nested structure of specs for extras. This function
      will generate fake (all-zero) extras.
    period: The period with which we add sequences.
    sequence_length: The fixed length of sequences we wish to add.

  Returns:
    (o_t, a_t, r_t, d_t, e_t) Tuple.
  """
  length = tf.shape(rewards)[0]
  first = tf.random.uniform(shape=(), minval=0, maxval=length, dtype=tf.int32)
  first = first // period * period  # Get a multiple of `period`.
  to = tf.minimum(first + sequence_length, length)

  def _slice_and_pad(x):
    pad_length = sequence_length + first - to
    padding_shape = tf.concat([[pad_length], tf.shape(x)[1:]], axis=0)
    result = tf.concat([x[first:to], tf.zeros(padding_shape, x.dtype)], axis=0)
    result.set_shape([sequence_length] + x.shape.as_list()[1:])
    return result

  o_t = tree.map_structure(_slice_and_pad, observations)
  a_t = tree.map_structure(_slice_and_pad, actions)
  r_t = _slice_and_pad(rewards)
  d_t = _slice_and_pad(discounts)

  def _sequence_zeros(spec):
    return tf.zeros([sequence_length] + spec.shape, spec.dtype)

  e_t = tree.map_structure(_sequence_zeros, extra_spec)

  key = tf.zeros([sequence_length], tf.uint64)
  probability = tf.ones([sequence_length], tf.float64)
  table_size = tf.ones([sequence_length], tf.int64)
  info = reverb.SampleInfo(
      key=key, probability=probability, table_size=table_size)
  return reverb.ReplaySample(info=info, data=(o_t, a_t, r_t, d_t, e_t))
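# A minimal sketch for `_sequence_from_episode` under the same assumptions
# (TF2 eager execution, the imports used above, and a `reverb.SampleInfo` that
# accepts the three fields passed in the function). Passing an empty
# `extra_spec` keeps the fake extras trivial; real callers would supply a
# nested spec structure.
_seq_sample = _sequence_from_episode(
    observations=tf.zeros([6, 2], tf.float32),
    actions=tf.zeros([6], tf.int32),
    rewards=tf.ones([6], tf.float32),
    discounts=tf.ones([6], tf.float32),
    extra_spec=(),
    period=2,
    sequence_length=4)
# Every field of `_seq_sample.data` has leading dimension `sequence_length`,
# zero-padded whenever the sampled window runs past the end of the episode.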
class ReverbUtilsTest(absltest.TestCase):

  def test_make_replay_table_preserves_table_info(self):
    limiter = reverb.rate_limiters.SampleToInsertRatio(
        samples_per_insert=1, min_size_to_sample=2, error_buffer=(0, 10))
    table = reverb.Table(
        name='test',
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        max_size=10,
        rate_limiter=limiter)
    new_table = reverb_utils.make_replay_table_from_info(table.info)
    new_info = new_table.info

    # table_worker_time is not set by the above utility since this is meant to
    # be monitoring information about any given table. So instead we copy this
    # so that the assertion below checks that everything else matches.
    new_info.table_worker_time.sleeping_ms = (
        table.info.table_worker_time.sleeping_ms)

    self.assertEqual(new_info, table.info)

  _EMPTY_INFO = reverb.SampleInfo(*[() for _ in reverb.SampleInfo.tf_dtypes()])
  _DUMMY_OBS = np.array([[[0], [1], [2]]])
  _DUMMY_ACTION = np.array([[[3], [4], [5]]])
  _DUMMY_REWARD = np.array([[6, 7, 8]])
  _DUMMY_DISCOUNT = np.array([[.99, .99, .99]])
  _DUMMY_NEXT_OBS = np.array([[[1], [2], [0]]])
  _DUMMY_RETURN = np.array([[20.77, 14.92, 8.]])

  def _create_dummy_steps(self):
    return reverb_adders.Step(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        start_of_episode=True,
        extras={'return': self._DUMMY_RETURN})

  def _create_dummy_transitions(self):
    return types.Transition(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        next_observation=self._DUMMY_NEXT_OBS,
        extras={'return': self._DUMMY_RETURN})

  def test_replay_sample_to_sars_transition_is_sequence(self):
    fake_sample = reverb.ReplaySample(
        info=self._EMPTY_INFO, data=self._create_dummy_steps())
    fake_transition = self._create_dummy_transitions()
    transition_from_sample = reverb_utils.replay_sample_to_sars_transition(
        fake_sample, is_sequence=True)
    tree.map_structure(np.testing.assert_array_equal, transition_from_sample,
                       fake_transition)
def _build_sequence_example(sequences):
  """Convert raw sequences into a Reverb sequence sample."""
  o = sequences['observation']
  a = sequences['action']
  r = sequences['reward']
  p = sequences['discount']

  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  return reverb.ReplaySample(info=info, data=(o, a, r, p))
def _build_sequence_example(sequences):
  """Convert raw sequences into a Reverb sequence sample."""
  data = adders.Step(
      observation=sequences['observation'],
      action=sequences['action'],
      reward=sequences['reward'],
      discount=sequences['discount'],
      start_of_episode=(),
      extras=())

  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  return reverb.ReplaySample(info=info, data=data)
def _build_sarsa_example(sequences):
  """Convert raw sequences into a Reverb n-step SARSA sample."""
  o_tm1 = tree.map_structure(lambda t: t[0], sequences['observation'])
  o_t = tree.map_structure(lambda t: t[1], sequences['observation'])
  a_tm1 = tree.map_structure(lambda t: t[0], sequences['action'])
  a_t = tree.map_structure(lambda t: t[1], sequences['action'])
  r_t = tree.map_structure(lambda t: t[0], sequences['reward'])
  p_t = tree.map_structure(lambda t: t[0], sequences['discount'])

  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  return reverb.ReplaySample(
      info=info, data=(o_tm1, a_tm1, r_t, p_t, o_t, a_t))
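# Sketch of feeding `_build_sarsa_example` hypothetical raw sequences through a
# `tf.data` pipeline. The field names match what the function reads; the shapes
# and values are placeholders (at least two steps per element are required).
_raw_sequences = {
    'observation': tf.zeros([2, 3], tf.float32),
    'action': tf.zeros([2], tf.int32),
    'reward': tf.zeros([2], tf.float32),
    'discount': tf.ones([2], tf.float32),
}
_sarsa_dataset = (
    tf.data.Dataset.from_tensors(_raw_sequences).map(_build_sarsa_example))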
class ReverbUtilsTest(absltest.TestCase):

  def test_make_replay_table_preserves_table_info(self):
    limiter = reverb.rate_limiters.SampleToInsertRatio(
        samples_per_insert=1, min_size_to_sample=2, error_buffer=(0, 10))
    table = reverb.Table(
        name='test',
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        max_size=10,
        rate_limiter=limiter)
    table_from_info = reverb_utils.make_replay_table_from_info(table.info)
    self.assertEqual(table_from_info.info, table.info)

  _EMPTY_INFO = reverb.SampleInfo((), (), (), ())
  _DUMMY_OBS = np.array([[[0], [1], [2]]])
  _DUMMY_ACTION = np.array([[[3], [4], [5]]])
  _DUMMY_REWARD = np.array([[6, 7, 8]])
  _DUMMY_DISCOUNT = np.array([[.99, .99, .99]])
  _DUMMY_NEXT_OBS = np.array([[[1], [2], [0]]])

  def _create_dummy_steps(self):
    return reverb_adders.Step(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        start_of_episode=True,
        extras=())

  def _create_dummy_transitions(self):
    return types.Transition(
        observation=self._DUMMY_OBS,
        action=self._DUMMY_ACTION,
        reward=self._DUMMY_REWARD,
        discount=self._DUMMY_DISCOUNT,
        next_observation=self._DUMMY_NEXT_OBS)

  def test_replay_sample_to_sars_transition_is_sequence(self):
    fake_sample = reverb.ReplaySample(
        info=self._EMPTY_INFO, data=self._create_dummy_steps())
    fake_transition = self._create_dummy_transitions()
    transition_from_sample = reverb_utils.replay_sample_to_sars_transition(
        fake_sample, is_sequence=True)
    tree.map_structure(np.testing.assert_array_equal, transition_from_sample,
                       fake_transition)
def test_after_train_step_fn_with_fresh_data_only(self, create_strategy_fn):
  strategy = create_strategy_fn()
  with strategy.scope():
    # Prepare the test context.
    train_step = train_utils.create_train_step()
    train_step.assign(225)
    train_steps_per_policy_update = 100

    # Create the after-train function to test, and the test input.
    after_train_step_fn = (
        train_utils.create_staleness_metrics_after_train_step_fn(
            train_step,
            train_steps_per_policy_update=train_steps_per_policy_update))
    observation_train_steps = np.array([[200], [200], [200]], dtype=np.int64)

    # Define the expectations (expected scalar summary calls).
    expected_scalar_summary_calls = [
        mock.call(
            name='staleness/max_train_step_delta_in_batch', data=0, step=225),
        mock.call(
            name='staleness/max_policy_update_delta_in_batch',
            data=0,
            step=225),
        mock.call(
            name='staleness/num_stale_obserations_in_batch', data=0, step=225)
    ]

    # Call the after-train function and check the expectations.
    with mock.patch.object(
        tf.summary, 'scalar', autospec=True) as mock_scalar_summary:
      # Call the `after_train_function` on the test input. The observation
      # train steps are assumed to be stored in the `priority` field of the
      # Reverb sample info.
      info = reverb.SampleInfo(
          *[None for _ in reverb.SampleInfo.tf_dtypes()])
      info = info._replace(priority=observation_train_steps)
      strategy.run(after_train_step_fn, args=((None, info), None))

      # Check if the expected calls happened on the scalar summary.
      mock_scalar_summary.assert_has_calls(
          expected_scalar_summary_calls, any_order=False)
def _make_reverb_sample(o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras):
  """Create Reverb sample with offline data.

  Args:
    o_t: Observation at time t.
    a_t: Action at time t.
    r_t: Reward at time t.
    d_t: Discount at time t.
    o_tp1: Observation at time t+1.
    a_tp1: Action at time t+1.
    extras: Dictionary with extra features.

  Returns:
    Replay sample with fake info: key=0, probability=1, table_size=0.
  """
  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  data = (o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras)
  return reverb.ReplaySample(info=info, data=data)
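# Sketch of building an offline dataset from `_make_reverb_sample`. All tensor
# values and the extras key below are placeholders chosen only for
# illustration.
_offline_sample = _make_reverb_sample(
    o_t=tf.zeros([3], tf.float32),
    a_t=tf.zeros([], tf.int32),
    r_t=tf.constant(1.0, tf.float32),
    d_t=tf.constant(0.99, tf.float32),
    o_tp1=tf.zeros([3], tf.float32),
    a_tp1=tf.zeros([], tf.int32),
    extras={'logits': tf.zeros([4], tf.float32)})
_offline_dataset = tf.data.Dataset.from_tensors(_offline_sample).repeat()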
def add_info_fn(data):
  """Wraps `data` in a ReplaySample with placeholder SampleInfo fields."""
  info = reverb.SampleInfo(
      key=0, probability=0.0, table_size=0, priority=0.0)
  return reverb.ReplaySample(info=info, data=data)
def _reverb_sample(*data_tuple):
  """Returns a Reverb ReplaySample wrapping `data_tuple` with dummy info."""
  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  return reverb.ReplaySample(info=info, data=data_tuple)
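# Sketch of using `_reverb_sample` with `tf.data`: mapping over a dataset of
# tuples unpacks each element into the function's positional arguments. The
# transition contents below are placeholders.
_transitions = (
    tf.zeros([3], tf.float32),  # o_t
    tf.zeros([], tf.int32),     # a_t
    tf.constant(1.0),           # r_t
    tf.constant(0.99),          # d_t
    tf.zeros([3], tf.float32),  # o_tp1
)
_samples = tf.data.Dataset.from_tensors(_transitions).map(_reverb_sample)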