def test_make_dataset_with_sequence_length_and_batch_size(self):
  sequence_length = 6
  batch_size = 4
  environment = fakes.ContinuousEnvironment()
  environment_spec = specs.make_environment_spec(environment)
  dataset = reverb_dataset.make_dataset(
      client=self.tf_client,
      environment_spec=environment_spec,
      batch_size=batch_size,
      sequence_length=sequence_length)

  def make_tensor_spec(spec):
    return tf.TensorSpec(
        shape=(batch_size, sequence_length) + spec.shape, dtype=spec.dtype)

  expected_spec = tree.map_structure(make_tensor_spec, environment_spec)

  expected_spec = adders.Step(
      observation=expected_spec.observations,
      action=expected_spec.actions,
      reward=expected_spec.rewards,
      discount=expected_spec.discounts,
      start_of_episode=specs.Array(
          shape=(batch_size, sequence_length), dtype=bool),
      extras=())

  self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
def test_make_dataset_nested_specs(self):
  environment_spec = specs.EnvironmentSpec(
      observations={
          'obs_1': specs.Array((3, 64, 64), 'uint8'),
          'obs_2': specs.Array((10,), 'int32')
      },
      actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
      rewards=specs.Array((), 'float32'),
      discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))
  dataset = reverb_dataset.make_dataset(
      client=self.tf_client, environment_spec=environment_spec)

  expected_spec = adders.Step(
      observation=environment_spec.observations,
      action=environment_spec.actions,
      reward=environment_spec.rewards,
      discount=environment_spec.discounts,
      start_of_episode=specs.Array(shape=(), dtype=bool),
      extras=())

  self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
def _create_dummy_steps(self):
  return reverb_adders.Step(
      observation=self._DUMMY_OBS,
      action=self._DUMMY_ACTION,
      reward=self._DUMMY_REWARD,
      discount=self._DUMMY_DISCOUNT,
      start_of_episode=True,
      extras={'return': self._DUMMY_RETURN})
def _read_results(self):
  while (len(self._evaluation_state.received_results) !=
         self._config.num_directions * 2):
    data = next(self._iterator).data
    data = acme_reverb.Step(*data)

    # Validate the sample against the current training iteration and policy.
    params_key = data.extras['params_key']
    training_step, perturbation_id, is_opposite = params_key

    # If the incoming data does not correspond to the current iteration,
    # we simply ignore it.
    if not np.all(
        training_step[:-1] == self._training_state.training_iteration):
      continue
    # The whole episode should be run with the same policy, so let's check
    # for that.
    assert np.all(perturbation_id[:-1] == perturbation_id[0])
    assert np.all(is_opposite[:-1] == is_opposite[0])
    perturbation_id = perturbation_id[0].item()
    is_opposite = is_opposite[0].item()

    total_reward = np.sum(data.reward - self._config.reward_shift)
    k = PerturbationKey(self._training_state.training_iteration,
                        perturbation_id, is_opposite)
    # Ignore duplicate results for a perturbation we have already scored.
    if k in self._evaluation_state.received_results:
      continue
    self._evaluation_state.received_results[k] = EvaluationResult(
        total_reward, data.observation)
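# A minimal sketch of the two containers `_read_results` assumes. The field
# names below are inferred from the call sites above; the real definitions in
# the ARS learner module may differ.
from typing import Any, NamedTuple


class PerturbationKey(NamedTuple):
  training_iteration: int
  perturbation_id: int
  is_opposite: bool


class EvaluationResult(NamedTuple):
  total_reward: float
  observation: Any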
def test_make_dataset_simple(self):
  environment = fakes.ContinuousEnvironment()
  environment_spec = specs.make_environment_spec(environment)
  dataset = reverb_dataset.make_dataset(
      client=self.tf_client, environment_spec=environment_spec)

  expected_spec = adders.Step(
      observation=environment_spec.observations,
      action=environment_spec.actions,
      reward=environment_spec.rewards,
      discount=environment_spec.discounts,
      start_of_episode=specs.Array(shape=(), dtype=bool),
      extras=())

  self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
def _build_sequence_example(sequences):
  """Convert raw sequences into a Reverb sequence sample."""
  data = adders.Step(
      observation=sequences['observation'],
      action=sequences['action'],
      reward=sequences['reward'],
      discount=sequences['discount'],
      start_of_episode=(),
      extras=())

  info = reverb.SampleInfo(
      key=tf.constant(0, tf.uint64),
      probability=tf.constant(1.0, tf.float64),
      table_size=tf.constant(0, tf.int64),
      priority=tf.constant(1.0, tf.float64))
  return reverb.ReplaySample(info=info, data=data)
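# A minimal usage sketch for `_build_sequence_example`, assuming a
# `tf.data.Dataset` of dicts keyed by 'observation', 'action', 'reward' and
# 'discount'. The tensor shapes below are illustrative only.
def _example_sequence_dataset() -> tf.data.Dataset:
  sequences = {
      'observation': tf.zeros([10, 4], tf.float32),
      'action': tf.zeros([10, 2], tf.float32),
      'reward': tf.zeros([10], tf.float32),
      'discount': tf.ones([10], tf.float32),
  }
  dataset = tf.data.Dataset.from_tensors(sequences)
  # Each element becomes a `reverb.ReplaySample` with dummy `SampleInfo`.
  return dataset.map(_build_sequence_example)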
def _spec_to_shapes_and_dtypes(transition_adder: bool,
                               environment_spec: specs.EnvironmentSpec,
                               extra_spec: Optional[types.NestedSpec],
                               sequence_length: Optional[int],
                               convert_zero_size_to_none: bool,
                               using_deprecated_adder: bool):
  """Creates the shapes and dtypes needed to describe the Reverb dataset.

  This takes an `environment_spec`, `extra_spec`, and additional information
  and returns a tuple (shapes, dtypes) that describes the data contained in
  Reverb.

  Args:
    transition_adder: A boolean, describing if a `TransitionAdder` was used to
      add data.
    environment_spec: A `specs.EnvironmentSpec`, describing the shapes and
      dtypes of the data produced by the environment (and the action).
    extra_spec: A nested structure of objects with a `.shape` and `.dtype`
      property. This describes any additional data the Actor adds into Reverb.
    sequence_length: An optional integer for how long the added sequences are,
      only used with `SequenceAdder`.
    convert_zero_size_to_none: If True, then all shape dimensions that are 0
      are converted to None. A None dimension is only set at runtime.
    using_deprecated_adder: True if the adder used to generate the data is
      from acme/adders/reverb/deprecated.

  Returns:
    A tuple (shapes, dtypes) that describes the data that has been added into
    Reverb.
  """
  # The *transition* adder is special in that it also adds an arrival state.
  if transition_adder:
    # Use the environment spec but convert it to a plain tuple.
    adder_spec = tuple(environment_spec) + (environment_spec.observations,)
    # Any 'extra' data that is passed to the adder is put on the end.
    if extra_spec:
      adder_spec += (extra_spec,)
  elif using_deprecated_adder and deprecated_base is not None:
    adder_spec = deprecated_base.Step(
        observation=environment_spec.observations,
        action=environment_spec.actions,
        reward=environment_spec.rewards,
        discount=environment_spec.discounts,
        extras=() if not extra_spec else extra_spec)
  else:
    adder_spec = adders.Step(
        observation=environment_spec.observations,
        action=environment_spec.actions,
        reward=environment_spec.rewards,
        discount=environment_spec.discounts,
        start_of_episode=specs.Array(shape=(), dtype=bool),
        extras=() if not extra_spec else extra_spec)

  # Extract the shapes and dtypes from these specs.
  get_dtype = lambda x: tf.as_dtype(x.dtype)
  get_shape = lambda x: tf.TensorShape(x.shape)
  if sequence_length:
    get_shape = lambda x: tf.TensorShape([sequence_length, *x.shape])

  if convert_zero_size_to_none:
    # TODO(b/143692455): Consider making this default behaviour.
    get_shape = lambda x: tf.TensorShape([s if s else None for s in x.shape])

  shapes = tree.map_structure(get_shape, adder_spec)
  dtypes = tree.map_structure(get_dtype, adder_spec)
  return shapes, dtypes
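# A minimal usage sketch for `_spec_to_shapes_and_dtypes`. The environment
# spec below is illustrative; in practice `make_dataset` calls this helper
# internally rather than users calling it directly.
def _example_transition_shapes_and_dtypes():
  example_spec = specs.EnvironmentSpec(
      observations=specs.Array((4,), 'float32'),
      actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
      rewards=specs.Array((), 'float32'),
      discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))
  # With `transition_adder=True` this yields 5-tuples describing
  # (observation, action, reward, discount, next_observation).
  return _spec_to_shapes_and_dtypes(
      transition_adder=True,
      environment_spec=example_spec,
      extra_spec=None,
      sequence_length=None,
      convert_zero_size_to_none=False,
      using_deprecated_adder=False)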
def _sequence_from_episode(observations: acme_types.NestedTensor,
                           actions: tf.Tensor,
                           rewards: tf.Tensor,
                           discounts: tf.Tensor,
                           extra_spec: acme_types.NestedSpec,
                           period: int,
                           sequence_length: int):
  """Produce a Reverb-like sequence from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  This function generates fake (all-zero) extras.

  See docs for reverb.SequenceAdder() for more details.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    extra_spec: A possibly nested structure of specs for extras. This function
      will generate fake (all-zero) extras.
    period: The period with which we add sequences.
    sequence_length: The fixed length of sequences we wish to add.

  Returns:
    A `reverb.ReplaySample` with fake `SampleInfo` and a `Step` holding the
    (o_t, a_t, r_t, d_t, start_of_episode, e_t) sequence data.
  """
  length = tf.shape(rewards)[0]
  first = tf.random.uniform(shape=(), minval=0, maxval=length, dtype=tf.int32)
  first = first // period * period  # Get a multiple of `period`.
  to = tf.minimum(first + sequence_length, length)

  def _slice_and_pad(x):
    pad_length = sequence_length + first - to
    padding_shape = tf.concat([[pad_length], tf.shape(x)[1:]], axis=0)
    result = tf.concat([x[first:to], tf.zeros(padding_shape, x.dtype)], axis=0)
    result.set_shape([sequence_length] + x.shape.as_list()[1:])
    return result

  o_t = tree.map_structure(_slice_and_pad, observations)
  a_t = tree.map_structure(_slice_and_pad, actions)
  r_t = _slice_and_pad(rewards)
  d_t = _slice_and_pad(discounts)
  start_of_episode = tf.equal(first, 0)
  start_of_episode = tf.expand_dims(start_of_episode, axis=0)
  start_of_episode = tf.tile(start_of_episode, [sequence_length])

  def _sequence_zeros(spec):
    # Unpack the spec's shape so this works whether it is a tuple or a list.
    return tf.zeros([sequence_length, *spec.shape], spec.dtype)

  e_t = tree.map_structure(_sequence_zeros, extra_spec)

  key = tf.zeros([sequence_length], tf.uint64)
  probability = tf.ones([sequence_length], tf.float64)
  table_size = tf.ones([sequence_length], tf.int64)
  priority = tf.ones([sequence_length], tf.float64)
  info = reverb.SampleInfo(
      key=key,
      probability=probability,
      table_size=table_size,
      priority=priority)
  return reverb.ReplaySample(
      info=info,
      data=adders.Step(
          observation=o_t,
          action=a_t,
          reward=r_t,
          discount=d_t,
          start_of_episode=start_of_episode,
          extras=e_t))
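# A minimal sketch exercising `_sequence_from_episode` on an all-zero episode.
# The episode length, tensor shapes, and the `tf.TensorSpec`-based extras spec
# are illustrative; any object with `.shape` and `.dtype` works as a spec.
def _example_sequence_from_episode() -> reverb.ReplaySample:
  length = 10
  observations = tf.zeros([length, 4], tf.float32)
  actions = tf.zeros([length, 2], tf.float32)
  rewards = tf.zeros([length], tf.float32)
  discounts = tf.ones([length], tf.float32)
  extra_spec = {'logits': tf.TensorSpec([5], tf.float32)}
  return _sequence_from_episode(
      observations, actions, rewards, discounts,
      extra_spec=extra_spec,
      period=3,
      sequence_length=5)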
def test_shapes(self):
  batch_size = 2
  sequence_len = 3
  num_actions = 5
  hidden_size = 7

  # Define a trivial recurrent actor-critic network.
  @hk.without_apply_rng
  @hk.transform
  def unroll_fn(observations, state):
    lstm = hk.LSTM(hidden_size)
    embedding, state = hk.dynamic_unroll(lstm, observations, state)
    logits = hk.Linear(num_actions)(embedding)
    values = jnp.squeeze(hk.Linear(1)(embedding), axis=-1)
    return (logits, values), state

  @hk.without_apply_rng
  @hk.transform
  def initial_state_fn():
    return hk.LSTM(hidden_size).initial_state(None)

  # Initial recurrent network state.
  initial_state = initial_state_fn.apply(None)

  # Make some fake data.
  observations = np.ones(shape=(sequence_len, 50))
  actions = np.random.randint(num_actions, size=sequence_len)
  rewards = np.random.rand(sequence_len)
  discounts = np.ones(shape=(sequence_len,))

  # `tree_map` is assumed to be a helper that lifts a function over nested
  # structures (e.g. a curried `tree.map_structure`).
  batch_tile = tree_map(
      lambda x: np.tile(x, [batch_size, *([1] * x.ndim)]))
  seq_tile = tree_map(
      lambda x: np.tile(x, [sequence_len, *([1] * x.ndim)]))

  extras = {
      'logits': np.random.rand(sequence_len, num_actions),
      'core_state': seq_tile(initial_state),
  }

  # Package up the data into a ReplaySample.
  data = adders.Step(
      observations,
      actions,
      rewards,
      discounts,
      extras=extras,
      start_of_episode=())
  data = batch_tile(data)
  sample = reverb.ReplaySample(info=None, data=data)

  # Initialise parameters.
  rng = hk.PRNGSequence(1)
  params = unroll_fn.init(next(rng), observations, initial_state)

  # Make loss function.
  loss_fn = impala.impala_loss(unroll_fn, discount=0.99)

  # Return value should be scalar.
  loss = loss_fn(params, sample)
  loss = jax.device_get(loss)
  self.assertEqual(loss.shape, ())