def test_make_dataset_nested_specs(self):
  environment_spec = specs.EnvironmentSpec(
      observations={
          'obs_1': specs.Array((3, 64, 64), 'uint8'),
          'obs_2': specs.Array((10,), 'int32')
      },
      actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
      rewards=specs.Array((), 'float32'),
      discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

  dataset = reverb_dataset.make_dataset(
      client=self.tf_client, environment_spec=environment_spec)

  self.assertTrue(
      _check_specs(tuple(environment_spec), dataset.element_spec.data))
def test_make_dataset_nested_specs(self):
  environment_spec = specs.EnvironmentSpec(
      observations={
          'obs_1': specs.Array((3, 64, 64), 'uint8'),
          'obs_2': specs.Array((10,), 'int32')
      },
      actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
      rewards=specs.Array((), 'float32'),
      discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

  dataset = reverb_dataset.make_dataset(
      client=self.tf_client, environment_spec=environment_spec)
  expected_spec = adders.Step(
      observation=environment_spec.observations,
      action=environment_spec.actions,
      reward=environment_spec.rewards,
      discount=environment_spec.discounts,
      start_of_episode=specs.Array(shape=(), dtype=bool),
      extras=())

  self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
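# `_check_specs` is used by the two tests above but not defined in these
# snippets. A minimal sketch of such a helper, assuming it only needs to
# compare shapes and dtypes leaf by leaf (not the original implementation):
import tree  # dm-tree


def _check_specs(array_specs, tensor_specs) -> bool:
  """Returns True if every tf.TensorSpec matches the corresponding array spec."""
  flat_arrays = tree.flatten(array_specs)
  flat_tensors = tree.flatten(tensor_specs)
  if len(flat_arrays) != len(flat_tensors):
    return False
  return all(tensor.shape == array.shape and tensor.dtype == array.dtype
             for array, tensor in zip(flat_arrays, flat_tensors))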
def __init__(self,
             *,
             num_actions: int = 1,
             num_observations: int = 1,
             action_dtype=np.int32,
             obs_dtype=np.int32,
             obs_shape: Sequence[int] = (),
             discount_spec: Optional[types.NestedSpec] = None,
             reward_spec: Optional[types.NestedSpec] = None,
             **kwargs):
  """Initialize the environment."""
  if reward_spec is None:
    reward_spec = specs.Array((), np.float32)

  if discount_spec is None:
    discount_spec = specs.BoundedArray((), np.float32, 0.0, 1.0)

  actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
  observations = specs.BoundedArray(
      shape=obs_shape,
      dtype=obs_dtype,
      minimum=obs_dtype(0),
      maximum=obs_dtype(num_observations - 1))

  super().__init__(
      spec=specs.EnvironmentSpec(
          observations=observations,
          actions=actions,
          rewards=reward_spec,
          discounts=discount_spec),
      **kwargs)
def _make_fake_env() -> dm_env.Environment:
  env_spec = specs.EnvironmentSpec(
      observations=specs.Array(shape=(10, 5), dtype=np.float32),
      actions=specs.DiscreteArray(num_values=3),
      rewards=specs.Array(shape=(), dtype=np.float32),
      discounts=specs.BoundedArray(
          shape=(), dtype=np.float32, minimum=0., maximum=1.),
  )
  return fakes.Environment(env_spec, episode_length=10)
def __init__(self, environment_spec: specs.EnvironmentSpec,
             action_spec: specs.BoundedArray, z_dim: int) -> None:
  self._z_dim = z_dim
  z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)

  # Modify the environment_spec to also include the latent variable
  # observation (z).
  self._obs_space = environment_spec.observations
  assert len(self._obs_space.shape) == 1, (
      'Only vector observations are supported for now. Observation shape '
      f'passed: {self._obs_space.shape}')
  updated_observations = specs.BoundedArray(
      (self._obs_space.shape[0] + z_dim,),
      dtype=environment_spec.observations.dtype,
      name=environment_spec.observations.name,
      minimum=np.append(environment_spec.observations.minimum, [0] * z_dim),
      maximum=np.append(environment_spec.observations.maximum, [0] * z_dim),
  )
  environment_spec = specs.EnvironmentSpec(
      observations=updated_observations,
      actions=environment_spec.actions,
      rewards=environment_spec.rewards,
      discounts=environment_spec.discounts,
  )

  self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
  self._agent = dmpo.DistributionalMPO(
      environment_spec=environment_spec,
      policy_network=self._agent_networks['policy'],
      critic_network=self._agent_networks['critic'],
      observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
      extra_modules_to_save={
          'discriminator': self._agent_networks['discriminator'],
      },
      return_action_entropy=True,
  )

  self._z_distribution = tfd.Categorical([1] * z_dim)
  self._current_z = self._z_distribution.sample()

  # Create discriminator optimizer.
  self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
  self._discriminator_logger = loggers.make_default_logger('discriminator')

  # Create variables for the discriminator.
  tf2_utils.create_variables(self._agent_networks['discriminator'],
                             [self._obs_space])
def test_make_dataset_with_variable_length_instances(self):
  """Dataset with variable length instances should have shapes with None."""
  environment_spec = specs.EnvironmentSpec(
      observations=specs.Array((0, 64, 64), 'uint8'),
      actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
      rewards=specs.Array((), 'float32'),
      discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

  dataset = reverb_dataset.make_dataset(
      server_address=self.server_address,
      environment_spec=environment_spec,
      convert_zero_size_to_none=True)

  self.assertSequenceEqual(dataset.element_spec.data[0].shape.as_list(),
                           [None, 64, 64])
def test_step(self):
  simple_spec = specs.Array(shape=(), dtype=float)
  spec = specs.EnvironmentSpec(simple_spec, simple_spec, simple_spec,
                               simple_spec)
  discriminator = _make_discriminator(spec)
  ail_network = ail_networks.AILNetworks(
      discriminator, imitation_reward_fn=lambda x: x, direct_rl_networks=None)

  loss = losses.gail_loss()
  optimizer = optax.adam(.01)
  step = jax.jit(
      functools.partial(
          ail_learning.ail_update_step,
          optimizer=optimizer,
          ail_network=ail_network,
          loss_fn=loss))

  zero_transition = types.Transition(
      np.array([0.]), np.array([0.]), 0., 0., np.array([0.]))
  zero_transition = utils.add_batch_dim(zero_transition)

  one_transition = types.Transition(
      np.array([1.]), np.array([0.]), 0., 0., np.array([0.]))
  one_transition = utils.add_batch_dim(one_transition)

  key = jax.random.PRNGKey(0)
  discriminator_params, discriminator_state = discriminator.init(key)

  state = ail_learning.DiscriminatorTrainingState(
      optimizer_state=optimizer.init(discriminator_params),
      discriminator_params=discriminator_params,
      discriminator_state=discriminator_state,
      policy_params=None,
      key=key,
      steps=0,
  )

  expected_loss = [1.062, 1.057, 1.052]
  for i in range(3):
    state, loss = step(state, (one_transition, zero_transition))
    self.assertAlmostEqual(loss['total_loss'], expected_loss[i], places=3)
def get_specs(step):
  """Infer spec from an example step."""
  env_spec = tree.map_structure(
      _numeric_to_spec,
      specs.EnvironmentSpec(
          observations=step[1].observation,
          actions=step[0],
          rewards=step[1].reward,
          discounts=step[1].discount))

  has_extras = len(step) == 3
  if has_extras:
    extras_spec = tree.map_structure(_numeric_to_spec, step[2])
  else:
    extras_spec = ()

  return env_spec, extras_spec
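# `_numeric_to_spec` is used above and in the adder test harnesses below, but
# is not defined in these snippets. A minimal sketch of such a helper, assuming
# it only needs to map scalars and NumPy arrays to `specs.Array` instances:
import numpy as np
from acme import specs


def _numeric_to_spec(x):
  """Returns an Array spec matching the shape and dtype of an example value."""
  if isinstance(x, np.ndarray):
    return specs.Array(shape=x.shape, dtype=x.dtype)
  elif isinstance(x, (float, int)):
    return specs.Array(shape=(), dtype=type(x))
  else:
    raise ValueError(f'Unsupported numeric: {type(x)}')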
def setUp(self):
  super().setUp()
  self.state_dims = 8
  self.action_dims = 4
  self.params = {
      'world': jnp.ones((3,)),
      'policy': jnp.ones((3,)),
      'value': jnp.ones((3,))
  }
  self.env_spec = specs.EnvironmentSpec(
      observations=specs.Array(shape=(self.state_dims,), dtype=float),
      actions=specs.Array(shape=(self.action_dims,), dtype=float),
      rewards=specs.Array(shape=(1,), dtype=float, name='reward'),
      discounts=specs.BoundedArray(
          shape=(), dtype=float, minimum=0., maximum=1., name='discount'))
def __init__(self,
             *,
             action_dim: int = 1,
             observation_dim: int = 1,
             bounded: bool = False,
             dtype=np.float32,
             reward_dtype=np.float32,
             **kwargs):
  """Initialize the environment.

  Args:
    action_dim: number of action dimensions.
    observation_dim: number of observation dimensions.
    bounded: whether or not the actions are bounded in [-1, 1].
    dtype: dtype of the action and observation spaces.
    reward_dtype: dtype of the reward and discounts.
    **kwargs: additional kwargs passed to the Environment base class.
  """
  action_shape = () if action_dim == 0 else (action_dim,)
  observation_shape = () if observation_dim == 0 else (observation_dim,)

  observations = specs.Array(observation_shape, dtype)
  rewards = specs.Array((), reward_dtype)
  discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

  if bounded:
    actions = specs.BoundedArray(action_shape, dtype, -1.0, 1.0)
  else:
    actions = specs.Array(action_shape, dtype)

  super().__init__(
      spec=specs.EnvironmentSpec(
          observations=observations,
          actions=actions,
          rewards=rewards,
          discounts=discounts),
      **kwargs)
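# A short usage sketch for the constructor above, assuming it belongs to a fake
# continuous-control environment class exposed as `fakes.ContinuousEnvironment`
# in Acme's test helpers (the class name and import path are assumptions; only
# the constructor body is shown above):
import numpy as np
from acme import specs
from acme.testing import fakes

env = fakes.ContinuousEnvironment(action_dim=2, observation_dim=4, bounded=True)
env_spec = specs.make_environment_spec(env)
assert env_spec.actions.shape == (2,)
assert env_spec.observations.shape == (4,)
timestep = env.reset()
env_spec.observations.validate(timestep.observation)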
def __init__(self,
             *,
             num_actions: int = 1,
             num_observations: int = 1,
             action_dtype=np.int32,
             obs_dtype=np.int32,
             reward_dtype=np.float32,
             obs_shape: Sequence[int] = (),
             **kwargs):
  """Initialize the environment."""
  actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
  observations = specs.BoundedArray(
      shape=obs_shape,
      dtype=obs_dtype,
      minimum=obs_dtype(0),
      maximum=obs_dtype(num_observations - 1))
  rewards = specs.Array((), reward_dtype)
  discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

  super().__init__(
      spec=specs.EnvironmentSpec(
          observations=observations,
          actions=actions,
          rewards=rewards,
          discounts=discounts),
      **kwargs)
def run_test_adder(self,
                   adder: base.ReverbAdder,
                   first: dm_env.TimeStep,
                   steps: Sequence[Step],
                   expected_items: Sequence[Any],
                   pack_expected_items: bool = False,
                   repeat_episode_times: int = 1,
                   break_end_of_episode: bool = True):
  """Runs a unit test case for the adder.

  Args:
    adder: The instance of `base.ReverbAdder` that is being tested.
    first: The first `dm_env.TimeStep` that is used to call
      `base.ReverbAdder.add_first()`.
    steps: A sequence of (action, timestep) tuples that are passed to
      `base.ReverbAdder.add()`.
    expected_items: The sequence of items that are expected to be created by
      calling the adder's `add_first()` method on `first` and `add()` on all
      of the elements in `steps`.
    pack_expected_items: If true the expected items are given unpacked and
      need to be packed in a list before comparison.
    repeat_episode_times: How many times to run an episode.
    break_end_of_episode: If False, an end of an episode does not break the
      sequence.
  """
  if not steps:
    raise ValueError('At least one step must be given.')

  has_extras = len(steps[0]) == 3
  env_spec = tree.map_structure(
      _numeric_to_spec,
      specs.EnvironmentSpec(
          observations=steps[0][1].observation,
          actions=steps[0][0],
          rewards=steps[0][1].reward,
          discounts=steps[0][1].discount))
  if has_extras:
    extras_spec = tree.map_structure(_numeric_to_spec, steps[0][2])
  else:
    extras_spec = ()
  signature = adder.signature(env_spec, extras_spec=extras_spec)

  for episode_id in range(repeat_episode_times):
    # Add all the data up to the final step.
    adder.add_first(first)
    for step in steps[:-1]:
      action, ts = step[0], step[1]
      if has_extras:
        extras = step[2]
      else:
        extras = ()
      adder.add(action, next_timestep=ts, extras=extras)

    # Only check for the first episode.
    if episode_id == 0:
      if len(steps) == 1:
        # adder.add() has not been called yet, so no writers have been
        # created.
        self.assertEmpty(self.client.writers)
      else:
        # Make sure the writer has been created but not closed.
        self.assertLen(self.client.writers, 1)
        self.assertFalse(self.client.writers[0].closed)

    # Add the final step.
    adder.add(*steps[-1])

  # Ending the episode should close the writer. No new writer should yet have
  # been created as it is constructed lazily.
  self.assertLen(self.client.writers, 1)
  if break_end_of_episode:
    self.assertTrue(self.client.writers[0].closed)

  # Make sure our expected and observed data match.
  observed_items = [p[1] for p in self.client.writers[0].priorities]
  self.assertEqual(len(expected_items), len(observed_items))
  for expected_item, observed_item in zip(expected_items, observed_items):
    if pack_expected_items:
      expected_item = [expected_item]
    # Set check_types=False because the signature check below verifies types.
    tree.map_structure(
        np.testing.assert_array_almost_equal,
        expected_item,
        observed_item,
        check_types=False)

  def _check_signature(spec: tf.TensorSpec, value):
    # Convert int/float to numpy arrays of dtype np.int64 and np.float64.
    value = np.asarray(value)
    self.assertTrue(spec.is_compatible_with(tf.convert_to_tensor(value)))

  for step in self.client.writers[0].timesteps:
    tree.map_structure(_check_signature, signature, step)

  if break_end_of_episode:
    # Add the start of a second trajectory.
    adder.add_first(first)
    adder.add(*steps[0])

    # Make sure this creates a new writer.
    self.assertLen(self.client.writers, 2)

    # The writer is closed if the recently added `dm_env.TimeStep`'s
    # step_type is `dm_env.StepType.LAST`.
    if steps[0][1].last():
      self.assertTrue(self.client.writers[1].closed)
    else:
      self.assertFalse(self.client.writers[1].closed)
def run_test_adder(self, adder: base.ReverbAdder, first: dm_env.TimeStep,
                   steps: Sequence[Tuple[Any, dm_env.TimeStep]],
                   expected_items: Sequence[Any]):
  """Runs a unit test case for the adder.

  Args:
    adder: The instance of `base.ReverbAdder` that is being tested.
    first: The first `dm_env.TimeStep` that is used to call
      `base.ReverbAdder.add_first()`.
    steps: A sequence of (action, timestep) tuples that are passed to
      `base.ReverbAdder.add()`.
    expected_items: The sequence of items that are expected to be created by
      calling the adder's `add_first()` method on `first` and `add()` on all
      of the elements in `steps`.
  """
  if not steps:
    raise ValueError('At least one step must be given.')

  env_spec = tree.map_structure(
      _numeric_to_spec,
      specs.EnvironmentSpec(
          observations=steps[0][1].observation,
          actions=steps[0][0],
          rewards=steps[0][1].reward,
          discounts=steps[0][1].discount))
  signature = adder.signature(env_spec)

  # Add all the data up to the final step.
  adder.add_first(first)
  for action, ts in steps[:-1]:
    adder.add(action, next_timestep=ts)

  if len(steps) == 1:
    # adder.add() has not been called yet, so no writers have been created.
    self.assertEmpty(self.client.writers)
  else:
    # Make sure the writer has been created but not closed.
    self.assertLen(self.client.writers, 1)
    self.assertFalse(self.client.writers[0].closed)

  # Add the final step.
  adder.add(*steps[-1])

  # Ending the episode should close the writer. No new writer should yet have
  # been created as it is constructed lazily.
  self.assertLen(self.client.writers, 1)
  self.assertTrue(self.client.writers[0].closed)

  # Make sure our expected and observed data match.
  observed_items = [p[1] for p in self.client.writers[0].priorities]
  for expected_item, observed_item in zip(expected_items, observed_items):
    # Set check_types=False because the signature check below verifies types.
    tree.map_structure(
        np.testing.assert_array_almost_equal,
        expected_item,
        observed_item,
        check_types=False)

  def _check_signature(spec: tf.TensorSpec, value):
    # Convert int/float to numpy arrays of dtype np.int64 and np.float64.
    value = np.asarray(value)
    self.assertTrue(spec.is_compatible_with(tf.convert_to_tensor(value)))

  for step in self.client.writers[0].timesteps:
    tree.map_structure(_check_signature, signature, step)

  # Add the start of a second trajectory.
  adder.add_first(first)
  adder.add(*steps[0])

  # Make sure this creates a new writer.
  self.assertLen(self.client.writers, 2)

  # The writer is closed if the recently added `dm_env.TimeStep`'s step_type
  # is `dm_env.StepType.LAST`.
  if steps[0][1].last():
    self.assertTrue(self.client.writers[1].closed)
  else:
    self.assertFalse(self.client.writers[1].closed)
def init(self, params):
  if not _TF_USE_GPU:
    tf.config.set_visible_devices([], 'GPU')
    tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
    tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

  if params.seed:
    agent_seed = params.seed + sum([ord(c) for c in params.name])
    random.seed(agent_seed)
    np.random.seed(agent_seed)
    tf.random.set_seed(agent_seed)

  # Internalize params.
  self._params = params
  self._name = params.name

  # Whether learning stopped.
  self._stop = False

  # Define specs. Everything needs to be single precision by default.
  observation_spec = specs.Array(
      shape=(params.states.rank,), dtype=np.float32, name='obs')
  action_spec = specs.BoundedArray(
      shape=(params.num_phases,),
      dtype=np.float32,
      minimum=0.,
      maximum=1.,
      name='action')
  reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
  discount_spec = specs.BoundedArray(
      shape=(), dtype=np.float32, minimum=0., maximum=1., name='discount')

  env_spec = specs.EnvironmentSpec(
      observations=observation_spec,
      actions=action_spec,
      rewards=reward_spec,
      discounts=discount_spec)

  # Logger.
  dir_path = f'{params.exp_path}/logs/{self._name}'
  self._logger = make_default_logger(directory=dir_path, label=self._name)
  agent_logger = make_default_logger(
      directory=dir_path, label=f'{self._name}-learning')

  networks = _make_networks(
      actions_dim=params.num_phases,
      state_dim=params.states.rank,
      policy_layers=params.policy_layers,
      critic_layers=params.critic_layers)

  self.agent = acme_agent.DDPG(
      environment_spec=env_spec,
      policy_network=networks['policy'],
      critic_network=networks['critic'],
      observation_network=networks['observation'],
      discount=params.discount_factor,
      batch_size=params.batch_size,
      prefetch_size=params.prefetch_size,
      target_update_period=params.target_update_period,
      min_replay_size=params.min_replay_size,
      max_replay_size=params.max_replay_size,
      samples_per_insert=params.samples_per_insert,
      n_step=params.n_step,
      sigma_init=params.sigma_init,
      sigma_final=params.sigma_final,
      sigma_schedule_timesteps=params.sigma_schedule_timesteps,
      clipping=params.clipping,
      logger=agent_logger,
      checkpoint=False,
  )

  # Observations counter.
  self._obs_counter = 0
def init(self, params):
  if not _TF_USE_GPU:
    tf.config.set_visible_devices([], 'GPU')
    tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
    tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

  if params.seed:
    agent_seed = params.seed + sum([ord(c) for c in params.name])
    random.seed(agent_seed)
    np.random.seed(agent_seed)
    tf.random.set_seed(agent_seed)

  # Internalize params.
  self._params = params
  self._name = params.name

  # Whether learning stopped.
  self._stop = False

  # Define specs. Everything needs to be single precision by default.
  observation_spec = specs.Array(
      shape=(params.states.rank,), dtype=np.float32, name='obs')
  action_spec = specs.DiscreteArray(
      dtype=np.int32, num_values=params.actions.depth, name='action')
  reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
  discount_spec = specs.BoundedArray(
      shape=(), dtype=np.float32, minimum=0., maximum=1., name='discount')

  env_spec = specs.EnvironmentSpec(
      observations=observation_spec,
      actions=action_spec,
      rewards=reward_spec,
      discounts=discount_spec)

  # Logger.
  dir_path = f'{params.exp_path}/logs/{self._name}'
  self._logger = make_default_logger(directory=dir_path, label=self._name)
  agent_logger = make_default_logger(
      directory=dir_path, label=f'{self._name}-learning')

  network = Network(
      num_actions=env_spec.actions.num_values,
      rnn_hidden_size=params.rnn_hidden_size,
      head_layers=params.head_layers)

  self.agent = acme_agent.R2D2(
      environment_spec=env_spec,
      network=network,
      batch_size=params.batch_size,
      samples_per_insert=params.samples_per_insert,
      burn_in_length=params.burn_in_length,
      trace_length=params.trace_length,
      replay_period=params.replay_period,
      min_replay_size=params.min_replay_size,
      max_replay_size=params.max_replay_size,
      discount=params.discount_factor,
      prefetch_size=params.prefetch_size,
      target_update_period=params.target_update_period,
      importance_sampling_exponent=params.importance_sampling_exponent,
      priority_exponent=params.priority_exponent,
      epsilon_init=params.epsilon_init,
      epsilon_final=params.epsilon_final,
      epsilon_schedule_timesteps=params.epsilon_schedule_timesteps,
      learning_rate=params.learning_rate,
      store_lstm_state=params.store_lstm_state,
      max_priority_weight=params.max_priority_weight,
      logger=agent_logger,
      checkpoint=False,
  )

  # Observations counter.
  self._obs_counter = 0
def define_residual_spec(rl_features,
                         env,
                         base_agent,
                         action_norm,
                         action_norm_scale=1.0,
                         include_base_action=True,
                         include_base_feats=True,
                         base_network=None):
  # TODO(minttu): pass in GymWrapper(env) without any other wrapper classes.
  """Defines environment observation and action spaces as seen by the RL agent.

  Args:
    rl_features: A list of state features visible to the agent. If set, they
      replace any visual features.
    env: The environment which defines the action space, rewards and
      discounts.
    base_agent: base agent to use in residual training.
    action_norm: bc_agent.ActionSpace object defining action normalization.
    action_norm_scale: Scalar by which to scale residual action normalization.
    include_base_action: If True, add base agent action to spec.
    include_base_feats: If True, add features given by base agent to spec.
    base_network: Network type used by the base agent, if applicable.

  Returns:
    residual_spec: An acme.specs.EnvironmentSpec instance defining the
      residual spec.
  """
  feats_spec = collections.OrderedDict()
  visible_state_dim = 0
  # This check allows train_bc to use this function to set residual spec
  # without using env wrappers.
  if isinstance(env, gym.Env):
    for k, v in env.observation_space.spaces.items():
      if k in rl_features:
        visible_state_dim += v.shape[0] if v.shape else 1
  else:
    if FLAGS.domain == 'mime':
      obs_space = mime_env_utils.make_dict_space(env.scene, *rl_features).spaces
    else:
      obs_space = env.observation_spec()
    for k, v in obs_space.items():
      if k in rl_features:
        visible_state_dim += v.shape[0] if v.shape else 1

  if include_base_feats:
    base_feat_size = {
        'resnet18_narrow32': 256,
        'hand_vil': 200,
    }[base_network]
    feats_spec['feats'] = specs.Array([base_feat_size], np.float32, 'feats')
  if visible_state_dim > 0:
    feats_spec['visible_state'] = (
        specs.Array([visible_state_dim], np.float32, 'visible_state'))
  if include_base_action:
    feats_spec['base_action'] = specs.Array(
        [base_agent.action_target_dim], np.float32, 'base_action')

  if FLAGS.rl_observation_network is not None:
    # TODO(minttu): Get image size from env observation spec.
    if FLAGS.input_type == 'depth':
      feats_spec['depth'] = specs.Array(
          [FLAGS.image_size, FLAGS.image_size, 3], np.uint8, 'depth')
    elif FLAGS.input_type == 'rgb':
      image_size = FLAGS.image_size
      rgb_shape = ([3, image_size, image_size, 3] if FLAGS.late_fusion else
                   [image_size, image_size, 9])
      feats_spec['rgb'] = specs.Array(rgb_shape, np.uint8, 'rgb')

  if isinstance(env, gym.Env):
    env_action_spec = env.action_space
    env_action_spec.minimum = env_action_spec.low
    env_action_spec.maximum = env_action_spec.high
    env_action_spec.name = 'action'
    # Concatenating fields here since it is non-trivial to use dictionary
    # observations with DemoReader's generator.
    concat_shape = np.sum([a.shape for a in feats_spec.values()])
    feats_spec = collections.OrderedDict()
    feats_spec['residual_obs'] = specs.Array(
        (concat_shape,), np.float32, 'residual_obs')
  else:
    env_action_spec = env.action_spec()

  env_min = env_action_spec.minimum
  env_max = env_action_spec.maximum
  # Allow (at the extreme) to fully reverse a base action (from one action
  # space limit to the opposite limit).
  min_residual = env_min - env_max if include_base_action else env_min
  max_residual = env_max - env_min if include_base_action else env_max
  print('min residual', min_residual, 'max residual', max_residual)

  residual_action_space = bc_agent.ActionSpace(
      action_norm, env=env, scale=action_norm_scale)
  if action_norm in ['centered', 'zeromean_unitvar']:
    # Reuse stats; normalization scheme may still be different.
    residual_action_space.mean = base_agent.action_space.mean
    residual_action_space.std = base_agent.action_space.std
  norm_min = residual_action_space.normalize_flat(min_residual)
  norm_max = residual_action_space.normalize_flat(max_residual)
  norm_action_spec = specs.BoundedArray(
      shape=env_action_spec.shape,
      dtype=env_action_spec.dtype,
      minimum=norm_min,
      maximum=norm_max,
      name=env_action_spec.name)
  print(env_action_spec)
  print(norm_action_spec)

  if isinstance(env, gym.Env):
    reward_spec = specs.BoundedArray(
        shape=(),
        dtype=float,
        minimum=env.reward_range[0],
        maximum=env.reward_range[1],
        name='reward')
  else:
    reward_spec = env.reward_spec()
  if isinstance(env, gym.Env):
    discount_spec = specs.BoundedArray(
        shape=(), dtype=float, minimum=0., maximum=1., name='discount')
  else:
    discount_spec = env.discount_spec()

  # residual_spec = specs.make_environment_spec(env)
  # Use same normalization for base agent and residual agent.
  residual_spec = specs.EnvironmentSpec(
      observations=feats_spec,
      actions=norm_action_spec,
      rewards=reward_spec,
      discounts=discount_spec)
  print('Residual spec', residual_spec)
  return residual_spec
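# The EnvironmentSpec instances built throughout these snippets are made of
# plain dm_env specs, so candidate values can be checked against them with
# `validate()`. A minimal self-contained sketch of that pattern (the concrete
# shapes and bounds below are illustrative, not taken from the code above):
import numpy as np
from acme import specs

spec = specs.EnvironmentSpec(
    observations=specs.Array((4,), np.float32),
    actions=specs.BoundedArray((2,), np.float32, minimum=-1., maximum=1.),
    rewards=specs.Array((), np.float32),
    discounts=specs.BoundedArray((), np.float32, minimum=0., maximum=1.))

spec.observations.validate(np.zeros(4, dtype=np.float32))
spec.actions.validate(np.array([0.5, -0.5], dtype=np.float32))
spec.discounts.validate(np.float32(1.0))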
def run_test_adder(self,
                   adder: base.ReverbAdder,
                   first: dm_env.TimeStep,
                   steps: Sequence[Step],
                   expected_items: Sequence[Any],
                   pack_expected_items: bool = False,
                   stack_sequence_fields: bool = True,
                   repeat_episode_times: int = 1,
                   break_end_of_episode: bool = True):
  """Runs a unit test case for the adder.

  Args:
    adder: The instance of `base.ReverbAdder` that is being tested.
    first: The first `dm_env.TimeStep` that is used to call
      `base.ReverbAdder.add_first()`.
    steps: A sequence of (action, timestep) tuples that are passed to
      `base.ReverbAdder.add()`.
    expected_items: The sequence of items that are expected to be created by
      calling the adder's `add_first()` method on `first` and `add()` on all
      of the elements in `steps`.
    pack_expected_items: Deprecated and not used. If true the expected items
      are given unpacked and need to be packed in a list before comparison.
    stack_sequence_fields: Whether to stack the sequence fields of the
      expected items before comparing to the observed items. Usually False
      for transition adders and True for both episode and sequence adders.
    repeat_episode_times: How many times to run an episode.
    break_end_of_episode: If False, an end of an episode does not break the
      sequence.
  """
  del pack_expected_items
  if not steps:
    raise ValueError('At least one step must be given.')

  has_extras = len(steps[0]) == 3
  env_spec = tree.map_structure(
      _numeric_to_spec,
      specs.EnvironmentSpec(
          observations=steps[0][1].observation,
          actions=steps[0][0],
          rewards=steps[0][1].reward,
          discounts=steps[0][1].discount))
  if has_extras:
    extras_spec = tree.map_structure(_numeric_to_spec, steps[0][2])
  else:
    extras_spec = ()
  signature = adder.signature(env_spec, extras_spec=extras_spec)

  for episode_id in range(repeat_episode_times):
    # Add all the data up to the final step.
    adder.add_first(first)
    for step in steps[:-1]:
      action, ts = step[0], step[1]
      if has_extras:
        extras = step[2]
      else:
        extras = ()
      adder.add(action, next_timestep=ts, extras=extras)

    # Add the final step.
    adder.add(*steps[-1])

  # Ending the episode should close the writer. No new writer should yet have
  # been created as it is constructed lazily.
  if break_end_of_episode:
    self.assertEqual(self.client.writer.num_episodes, repeat_episode_times)

  # Make sure our expected and observed data match.
  observed_items = [p[2] for p in self.client.writer.priorities]

  # Check matching number of items.
  self.assertEqual(len(expected_items), len(observed_items))

  # Check items are matching according to numpy's almost_equal.
  for expected_item, observed_item in zip(expected_items, observed_items):
    if stack_sequence_fields:
      expected_item = tree_utils.stack_sequence_fields(expected_item)
    # Set check_types=False because we check them below.
    tree.map_structure(
        np.testing.assert_array_almost_equal,
        expected_item,
        tuple(observed_item),
        check_types=False)

  # Make sure the signature matches what is being written by Reverb.
  def _check_signature(spec: tf.TensorSpec, value: np.ndarray):
    self.assertTrue(spec.is_compatible_with(tf.convert_to_tensor(value)))

  # Check the last transition's signature.
  tree.map_structure(_check_signature, signature, observed_items[-1])
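# `tree_utils.stack_sequence_fields`, used by the harness above, stacks a list
# of identically structured nests into a single nest whose leaves gain a
# leading time axis, which is why sequence-adder expectations are stacked
# before comparison. A small illustration of that behavior (assuming
# `tree_utils` refers to the same module the harness above imports; the exact
# import path is not shown in these snippets):
import numpy as np

steps = [{'obs': np.array([1.0]), 'reward': 0.0},
         {'obs': np.array([2.0]), 'reward': 1.0}]
stacked = tree_utils.stack_sequence_fields(steps)
assert stacked['obs'].shape == (2, 1)
assert stacked['reward'].shape == (2,)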