Example #1
    def test_make_dataset_nested_specs(self):
        environment_spec = specs.EnvironmentSpec(observations={
            'obs_1':
            specs.Array((3, 64, 64), 'uint8'),
            'obs_2':
            specs.Array((10, ), 'int32')
        },
                                                 actions=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=-1.,
                                                     maximum=1.),
                                                 rewards=specs.Array(
                                                     (), 'float32'),
                                                 discounts=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=0.,
                                                     maximum=1.))

        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        self.assertTrue(
            _check_specs(tuple(environment_spec), dataset.element_spec.data))
Example #2
    def test_make_dataset_nested_specs(self):
        environment_spec = specs.EnvironmentSpec(observations={
            'obs_1':
            specs.Array((3, 64, 64), 'uint8'),
            'obs_2':
            specs.Array((10, ), 'int32')
        },
                                                 actions=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=-1.,
                                                     maximum=1.),
                                                 rewards=specs.Array(
                                                     (), 'float32'),
                                                 discounts=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=0.,
                                                     maximum=1.))

        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        expected_spec = adders.Step(observation=environment_spec.observations,
                                    action=environment_spec.actions,
                                    reward=environment_spec.rewards,
                                    discount=environment_spec.discounts,
                                    start_of_episode=specs.Array(shape=(),
                                                                 dtype=bool),
                                    extras=())

        self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
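
The `_check_specs` helper used in the two tests above is not shown in these examples. Below is a minimal sketch of what such a helper could look like, assuming it flattens both structures and compares each acme spec against the corresponding `tf.TensorSpec` (the body is an assumption for illustration, not the project's actual implementation):

import tree
import tensorflow as tf


def _check_specs(expected_specs, tensor_specs) -> bool:
    """Returns True if every TensorSpec matches the corresponding acme spec."""
    expected = tree.flatten(expected_specs)
    observed = tree.flatten(tensor_specs)
    if len(expected) != len(observed):
        return False
    for spec, tensor_spec in zip(expected, observed):
        # Compare leaf shapes and dtypes one by one.
        if tuple(tensor_spec.shape.as_list()) != tuple(spec.shape):
            return False
        if tensor_spec.dtype != tf.as_dtype(spec.dtype):
            return False
    return True
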
Example #3
    def __init__(self,
                 *,
                 num_actions: int = 1,
                 num_observations: int = 1,
                 action_dtype=np.int32,
                 obs_dtype=np.int32,
                 obs_shape: Sequence[int] = (),
                 discount_spec: Optional[types.NestedSpec] = None,
                 reward_spec: Optional[types.NestedSpec] = None,
                 **kwargs):
        """Initialize the environment."""
        if reward_spec is None:
            reward_spec = specs.Array((), np.float32)

        if discount_spec is None:
            discount_spec = specs.BoundedArray((), np.float32, 0.0, 1.0)

        actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
        observations = specs.BoundedArray(shape=obs_shape,
                                          dtype=obs_dtype,
                                          minimum=obs_dtype(0),
                                          maximum=obs_dtype(num_observations -
                                                            1))

        super().__init__(spec=specs.EnvironmentSpec(observations=observations,
                                                    actions=actions,
                                                    rewards=reward_spec,
                                                    discounts=discount_spec),
                         **kwargs)
Example #4
def _make_fake_env() -> dm_env.Environment:
  env_spec = specs.EnvironmentSpec(
      observations=specs.Array(shape=(10, 5), dtype=np.float32),
      actions=specs.DiscreteArray(num_values=3),
      rewards=specs.Array(shape=(), dtype=np.float32),
      discounts=specs.BoundedArray(
          shape=(), dtype=np.float32, minimum=0., maximum=1.),
  )
  return fakes.Environment(env_spec, episode_length=10)
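
A minimal usage sketch for the fake environment above (assuming only the standard `dm_env.Environment` interface indicated by the return annotation; the helper name below is hypothetical):

import numpy as np


def run_one_episode(env) -> float:
    """Steps the environment with a fixed valid action and sums the rewards."""
    episode_return = 0.0
    timestep = env.reset()
    while not timestep.last():
        # Any integer in [0, 3) is valid for specs.DiscreteArray(num_values=3).
        timestep = env.step(np.int32(0))
        episode_return += timestep.reward
    return episode_return


episode_return = run_one_episode(_make_fake_env())
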
Example #5
    def __init__(self, environment_spec: specs.EnvironmentSpec,
                 action_spec: specs.BoundedArray, z_dim: int) -> None:
        self._z_dim = z_dim
        z_spec = specs.BoundedArray((z_dim, ),
                                    np.float64,
                                    minimum=0,
                                    maximum=1)
        # Modify the environment_spec to also include the latent variable
        # observation  (z)
        self._obs_space = environment_spec.observations
        assert (
            len(self._obs_space.shape) == 1
        ), f"Only vector observations are supported for now. Observations shape passed: {obs_shape}"
        updated_observations = specs.BoundedArray(
            (self._obs_space.shape[0] + z_dim, ),
            dtype=environment_spec.observations.dtype,
            name=environment_spec.observations.name,
            minimum=np.append(environment_spec.observations.minimum,
                              [0] * z_dim),
            # The latent components z are bounded in [0, 1] (see z_spec above),
            # so extend the maximum with ones rather than zeros.
            maximum=np.append(environment_spec.observations.maximum,
                              [1] * z_dim),
        )
        environment_spec = specs.EnvironmentSpec(
            observations=updated_observations,
            actions=environment_spec.actions,
            rewards=environment_spec.rewards,
            discounts=environment_spec.discounts,
        )
        self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
        self._agent = dmpo.DistributionalMPO(
            environment_spec=environment_spec,
            policy_network=self._agent_networks['policy'],
            critic_network=self._agent_networks['critic'],
            observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
            extra_modules_to_save={
                'discriminator': self._agent_networks['discriminator'],
            },
            return_action_entropy=True,
        )

        self._z_distribution = tfd.Categorical([1] * z_dim)
        self._current_z = self._z_distribution.sample()

        # Create discriminator optimizer.
        self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
        self._discriminator_logger = loggers.make_default_logger(
            'discriminator')

        # Create variables for the discriminator.
        tf2_utils.create_variables(self._agent_networks['discriminator'],
                                   [self._obs_space])
Example #6
  def test_make_dataset_with_variable_length_instances(self):
    """Dataset with variable length instances should have shapes with None."""
    environment_spec = specs.EnvironmentSpec(
        observations=specs.Array((0, 64, 64), 'uint8'),
        actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
        rewards=specs.Array((), 'float32'),
        discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

    dataset = reverb_dataset.make_dataset(
        server_address=self.server_address,
        environment_spec=environment_spec,
        convert_zero_size_to_none=True)

    self.assertSequenceEqual(dataset.element_spec.data[0].shape.as_list(),
                             [None, 64, 64])
Example #7
    def test_step(self):
        simple_spec = specs.Array(shape=(), dtype=float)

        spec = specs.EnvironmentSpec(simple_spec, simple_spec, simple_spec,
                                     simple_spec)

        discriminator = _make_discriminator(spec)
        ail_network = ail_networks.AILNetworks(discriminator,
                                               imitation_reward_fn=lambda x: x,
                                               direct_rl_networks=None)

        loss = losses.gail_loss()

        optimizer = optax.adam(.01)

        step = jax.jit(
            functools.partial(ail_learning.ail_update_step,
                              optimizer=optimizer,
                              ail_network=ail_network,
                              loss_fn=loss))

        zero_transition = types.Transition(np.array([0.]), np.array([0.]), 0.,
                                           0., np.array([0.]))
        zero_transition = utils.add_batch_dim(zero_transition)

        one_transition = types.Transition(np.array([1.]), np.array([0.]), 0.,
                                          0., np.array([0.]))
        one_transition = utils.add_batch_dim(one_transition)

        key = jax.random.PRNGKey(0)
        discriminator_params, discriminator_state = discriminator.init(key)

        state = ail_learning.DiscriminatorTrainingState(
            optimizer_state=optimizer.init(discriminator_params),
            discriminator_params=discriminator_params,
            discriminator_state=discriminator_state,
            policy_params=None,
            key=key,
            steps=0,
        )

        expected_loss = [1.062, 1.057, 1.052]

        for i in range(3):
            state, loss = step(state, (one_transition, zero_transition))
            self.assertAlmostEqual(loss['total_loss'],
                                   expected_loss[i],
                                   places=3)
Example #8
def get_specs(step):
    """Infer spec from an example step."""
    env_spec = tree.map_structure(
        _numeric_to_spec,
        specs.EnvironmentSpec(observations=step[1].observation,
                              actions=step[0],
                              rewards=step[1].reward,
                              discounts=step[1].discount))

    has_extras = len(step) == 3
    if has_extras:
        extras_spec = tree.map_structure(_numeric_to_spec, step[2])
    else:
        extras_spec = ()

    return env_spec, extras_spec
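
The `_numeric_to_spec` helper used here (and in the adder tests further below) is not shown. A hedged sketch of one plausible implementation, mapping example values to `specs.Array` instances with matching shape and dtype (an assumption, not the project's actual code):

import numpy as np
from acme import specs


def _numeric_to_spec(x):
    """Builds a specs.Array describing a scalar or numpy array example value."""
    if isinstance(x, np.ndarray):
        return specs.Array(shape=x.shape, dtype=x.dtype)
    if np.isscalar(x):
        return specs.Array(shape=(), dtype=type(x))
    raise ValueError(f'Unsupported example value of type {type(x)}')
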
Example #9
    def setUp(self):
        super().setUp()
        self.state_dims = 8
        self.action_dims = 4
        self.params = {
            'world': jnp.ones((3, )),
            'policy': jnp.ones((3, )),
            'value': jnp.ones((3, ))
        }
        self.env_spec = specs.EnvironmentSpec(
            observations=specs.Array(shape=(self.state_dims, ), dtype=float),
            actions=specs.Array(shape=(self.action_dims, ), dtype=float),
            rewards=specs.Array(shape=(1, ), dtype=float, name='reward'),
            discounts=specs.BoundedArray(
                shape=(), dtype=float, minimum=0., maximum=1., name='discount'))
Example #10
  def __init__(self,
               *,
               action_dim: int = 1,
               observation_dim: int = 1,
               bounded: bool = False,
               dtype=np.float32,
               reward_dtype=np.float32,
               **kwargs):
    """Initialize the environment.

    Args:
      action_dim: number of action dimensions.
      observation_dim: number of observation dimensions.
      bounded: whether or not the actions are bounded in [-1, 1].
      dtype: dtype of the action and observation spaces.
      reward_dtype: dtype of the reward and discounts.
      **kwargs: additional kwargs passed to the Environment base class.
    """

    action_shape = () if action_dim == 0 else (action_dim,)
    observation_shape = () if observation_dim == 0 else (observation_dim,)

    observations = specs.Array(observation_shape, dtype)
    rewards = specs.Array((), reward_dtype)
    discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

    if bounded:
      actions = specs.BoundedArray(action_shape, dtype, -1.0, 1.0)
    else:
      actions = specs.Array(action_shape, dtype)

    super().__init__(
        spec=specs.EnvironmentSpec(
            observations=observations,
            actions=actions,
            rewards=rewards,
            discounts=discounts),
        **kwargs)
Example #11
File: fakes.py Project: wilixx/acme
    def __init__(self,
                 *,
                 num_actions: int = 1,
                 num_observations: int = 1,
                 action_dtype=np.int32,
                 obs_dtype=np.int32,
                 reward_dtype=np.float32,
                 obs_shape: Sequence[int] = (),
                 **kwargs):
        """Initialize the environment."""
        actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
        observations = specs.BoundedArray(shape=obs_shape,
                                          dtype=obs_dtype,
                                          minimum=obs_dtype(0),
                                          maximum=obs_dtype(num_observations -
                                                            1))
        rewards = specs.Array((), reward_dtype)
        discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

        super().__init__(spec=specs.EnvironmentSpec(observations=observations,
                                                    actions=actions,
                                                    rewards=rewards,
                                                    discounts=discounts),
                         **kwargs)
Example #12
  def run_test_adder(self,
                     adder: base.ReverbAdder,
                     first: dm_env.TimeStep,
                     steps: Sequence[Step],
                     expected_items: Sequence[Any],
                     pack_expected_items: bool = False,
                     repeat_episode_times: int = 1,
                     break_end_of_episode: bool = True):
    """Runs a unit test case for the adder.

    Args:
      adder: The instance of `base.ReverbAdder` that is being tested.
      first: The first `dm_env.TimeStep` that is used to call
        `base.ReverbAdder.add_first()`.
      steps: A sequence of (action, timestep) tuples that are passed to
        `base.ReverbAdder.add()`.
      expected_items: The sequence of items that are expected to be created
        by calling the adder's `add_first()` method on `first` and `add()` on
        all of the elements in `steps`.
      pack_expected_items: If true the expected items are given unpacked and
        need to be packed in a list before comparison.
      repeat_episode_times: How many times to run an episode.
      break_end_of_episode: If False, an end of an episode does not break the
        sequence.
    """
    if not steps:
      raise ValueError('At least one step must be given.')

    has_extras = len(steps[0]) == 3
    env_spec = tree.map_structure(
        _numeric_to_spec,
        specs.EnvironmentSpec(
            observations=steps[0][1].observation,
            actions=steps[0][0],
            rewards=steps[0][1].reward,
            discounts=steps[0][1].discount))
    if has_extras:
      extras_spec = tree.map_structure(_numeric_to_spec, steps[0][2])
    else:
      extras_spec = ()
    signature = adder.signature(env_spec, extras_spec=extras_spec)

    for episode_id in range(repeat_episode_times):
      # Add all the data up to the final step.
      adder.add_first(first)
      for step in steps[:-1]:
        action, ts = step[0], step[1]

        if has_extras:
          extras = step[2]
        else:
          extras = ()

        adder.add(action, next_timestep=ts, extras=extras)

      # Only check for the first episode.
      if episode_id == 0:
        if len(steps) == 1:
          # adder.add() has not been called yet, so no writers have been
          # created.
          self.assertEmpty(self.client.writers)
        else:
          # Make sure the writer has been created but not closed.
          self.assertLen(self.client.writers, 1)
          self.assertFalse(self.client.writers[0].closed)

      # Add the final step.
      adder.add(*steps[-1])

    # Ending the episode should close the writer. No new writer should yet have
    # been created as it is constructed lazily.
    self.assertLen(self.client.writers, 1)
    if break_end_of_episode:
      self.assertTrue(self.client.writers[0].closed)

    # Make sure our expected and observed data match.
    observed_items = [p[1] for p in self.client.writers[0].priorities]
    self.assertEqual(len(expected_items), len(observed_items))
    for expected_item, observed_item in zip(expected_items, observed_items):
      if pack_expected_items:
        expected_item = [expected_item]
      # Set check_types=False because the expected and observed structures may
      # use different container types (e.g. list vs tuple).
      tree.map_structure(
          np.testing.assert_array_almost_equal,
          expected_item,
          observed_item,
          check_types=False)

    def _check_signature(spec: tf.TensorSpec, value):
      # Convert int/float to numpy arrays of dtype np.int64 and np.float64.
      value = np.asarray(value)
      self.assertTrue(spec.is_compatible_with(tf.convert_to_tensor(value)))

    for step in self.client.writers[0].timesteps:
      tree.map_structure(_check_signature, signature, step)

    if break_end_of_episode:
      # Add the start of a second trajectory.
      adder.add_first(first)
      adder.add(*steps[0])

      # Make sure this creates a new writer.
      self.assertLen(self.client.writers, 2)
      # The writer is closed if the recently added `dm_env.TimeStep`'s
      # step_type is `dm_env.StepType.LAST`.
      if steps[0][1].last():
        self.assertTrue(self.client.writers[1].closed)
      else:
        self.assertFalse(self.client.writers[1].closed)
Example #13
    def run_test_adder(self, adder: base.ReverbAdder, first: dm_env.TimeStep,
                       steps: Sequence[Tuple[Any, dm_env.TimeStep]],
                       expected_items: Sequence[Any]):
        """Runs a unit test case for the adder.

    Args:
      adder: The instance of `base.ReverbAdder` that is being tested.
      first: The first `dm_env.TimeStep` that is used to call
        `base.ReverbAdder.add_first()`.
      steps: A sequence of (action, timestep) tuples that are passed to
        `base.ReverbAdder.add()`.
      expected_items: The sequence of items that are expected to be created
        by calling the adder's `add_first()` method on `first` and `add()` on
        all of the elements in `steps`.
    """
        if not steps:
            raise ValueError('At least one step must be given.')

        env_spec = tree.map_structure(
            _numeric_to_spec,
            specs.EnvironmentSpec(observations=steps[0][1].observation,
                                  actions=steps[0][0],
                                  rewards=steps[0][1].reward,
                                  discounts=steps[0][1].discount))
        signature = adder.signature(env_spec)

        # Add all the data up to the final step.
        adder.add_first(first)
        for action, ts in steps[:-1]:
            adder.add(action, next_timestep=ts)

        if len(steps) == 1:
            # adder.add() has not been called yet, so no writers have been created.
            self.assertEmpty(self.client.writers)
        else:
            # Make sure the writer has been created but not closed.
            self.assertLen(self.client.writers, 1)
            self.assertFalse(self.client.writers[0].closed)

        # Add the final step.
        adder.add(*steps[-1])

        # Ending the episode should close the writer. No new writer should yet have
        # been created as it is constructed lazily.
        self.assertLen(self.client.writers, 1)
        self.assertTrue(self.client.writers[0].closed)

        # Make sure our expected and observed data match.
        observed_items = [p[1] for p in self.client.writers[0].priorities]
        for expected_item, observed_item in zip(expected_items,
                                                observed_items):
            # Set check_types=False because the expected and observed structures
            # may use different container types (e.g. list vs tuple).
            tree.map_structure(np.testing.assert_array_almost_equal,
                               expected_item,
                               observed_item,
                               check_types=False)

        def _check_signature(spec: tf.TensorSpec, value):
            # Convert int/float to numpy arrays of dtype np.int64 and np.float64.
            value = np.asarray(value)
            self.assertTrue(
                spec.is_compatible_with(tf.convert_to_tensor(value)))

        for step in self.client.writers[0].timesteps:
            tree.map_structure(_check_signature, signature, step)

        # Add the start of a second trajectory.
        adder.add_first(first)
        adder.add(*steps[0])

        # Make sure this creates a new writer.
        self.assertLen(self.client.writers, 2)
        # The writer is closed if the recently added `dm_env.TimeStep`'s step_type
        # is `dm_env.StepType.LAST`.
        if steps[0][1].last():
            self.assertTrue(self.client.writers[1].closed)
        else:
            self.assertFalse(self.client.writers[1].closed)
Example #14
    def init(self, params):

        if not _TF_USE_GPU:
            tf.config.set_visible_devices([], 'GPU')
        tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
        tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

        if params.seed:
            agent_seed = params.seed + sum([ord(c) for c in params.name])
            random.seed(agent_seed)
            np.random.seed(agent_seed)
            tf.random.set_seed(agent_seed)

        # Internalize params.
        self._params = params

        self._name = params.name

        # Whether learning stopped.
        self._stop = False

        # Define specs. Everything needs to be single precision by default.
        observation_spec = specs.Array(shape=(params.states.rank, ),
                                       dtype=np.float32,
                                       name='obs')
        action_spec = specs.BoundedArray(shape=(params.num_phases, ),
                                         dtype=np.float32,
                                         minimum=0.,
                                         maximum=1.,
                                         name='action')
        reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=np.float32,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')

        env_spec = specs.EnvironmentSpec(observations=observation_spec,
                                         actions=action_spec,
                                         rewards=reward_spec,
                                         discounts=discount_spec)

        # Logger.
        dir_path = f'{params.exp_path}/logs/{self._name}'
        self._logger = make_default_logger(directory=dir_path,
                                           label=self._name)
        agent_logger = make_default_logger(directory=dir_path,
                                           label=f'{self._name}-learning')

        networks = _make_networks(actions_dim=params.num_phases,
                                  state_dim=params.states.rank,
                                  policy_layers=params.policy_layers,
                                  critic_layers=params.critic_layers)

        self.agent = acme_agent.DDPG(
            environment_spec=env_spec,
            policy_network=networks['policy'],
            critic_network=networks['critic'],
            observation_network=networks['observation'],
            discount=params.discount_factor,
            batch_size=params.batch_size,
            prefetch_size=params.prefetch_size,
            target_update_period=params.target_update_period,
            min_replay_size=params.min_replay_size,
            max_replay_size=params.max_replay_size,
            samples_per_insert=params.samples_per_insert,
            n_step=params.n_step,
            sigma_init=params.sigma_init,
            sigma_final=params.sigma_final,
            sigma_schedule_timesteps=params.sigma_schedule_timesteps,
            clipping=params.clipping,
            logger=agent_logger,
            checkpoint=False,
        )

        # Observations counter.
        self._obs_counter = 0
Example #15
    def init(self, params):

        if not _TF_USE_GPU:
            tf.config.set_visible_devices([], 'GPU')
        tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
        tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

        if params.seed:
            agent_seed = params.seed + sum([ord(c) for c in params.name])
            random.seed(agent_seed)
            np.random.seed(agent_seed)
            tf.random.set_seed(agent_seed)

        # Internalize params.
        self._params = params

        self._name = params.name

        # Whether learning stopped.
        self._stop = False

        # Define specs. Everything needs to be single precision by default.
        observation_spec = specs.Array(shape=(params.states.rank, ),
                                       dtype=np.float32,
                                       name='obs')
        action_spec = specs.DiscreteArray(dtype=np.int32,
                                          num_values=params.actions.depth,
                                          name="action")
        reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=np.float32,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')

        env_spec = specs.EnvironmentSpec(observations=observation_spec,
                                         actions=action_spec,
                                         rewards=reward_spec,
                                         discounts=discount_spec)

        # Logger.
        dir_path = f'{params.exp_path}/logs/{self._name}'
        self._logger = make_default_logger(directory=dir_path,
                                           label=self._name)
        agent_logger = make_default_logger(directory=dir_path,
                                           label=f'{self._name}-learning')

        network = Network(num_actions=env_spec.actions.num_values,
                          rnn_hidden_size=params.rnn_hidden_size,
                          head_layers=params.head_layers)

        self.agent = acme_agent.R2D2(
            environment_spec=env_spec,
            network=network,
            batch_size=params.batch_size,
            samples_per_insert=params.samples_per_insert,
            burn_in_length=params.burn_in_length,
            trace_length=params.trace_length,
            replay_period=params.replay_period,
            min_replay_size=params.min_replay_size,
            max_replay_size=params.max_replay_size,
            discount=params.discount_factor,
            prefetch_size=params.prefetch_size,
            target_update_period=params.target_update_period,
            importance_sampling_exponent=params.importance_sampling_exponent,
            priority_exponent=params.priority_exponent,
            epsilon_init=params.epsilon_init,
            epsilon_final=params.epsilon_final,
            epsilon_schedule_timesteps=params.epsilon_schedule_timesteps,
            learning_rate=params.learning_rate,
            store_lstm_state=params.store_lstm_state,
            max_priority_weight=params.max_priority_weight,
            logger=agent_logger,
            checkpoint=False,
        )

        # Observations counter.
        self._obs_counter = 0
Example #16
def define_residual_spec(rl_features,
                         env,
                         base_agent,
                         action_norm,
                         action_norm_scale=1.0,
                         include_base_action=True,
                         include_base_feats=True,
                         base_network=None):
    # TODO(minttu): pass in GymWrapper(env) without any other wrapper classes.
    """Defines environment observation and action spaces as seen by the RL agent.

  Args:
    rl_features: A list of state features visible to the agent. If set, they
      replace any visual features.
    env: The environment which defines the action space, rewards and discounts.
    base_agent: base agent to use in residual training.
    action_norm: bc_agent.ActionSpace object defining action normalization.
    action_norm_scale: Scalar by which to scale residual action normalization.
    include_base_action: If True, add base agent action to spec.
    include_base_feats: If True, add features given by base agent to spec.
    base_network: Network type used by the base agent, if applicable.

  Returns:
    residual_spec: An acme.specs.EnvironmentSpec instance defining the residual
      spec.
  """
    feats_spec = collections.OrderedDict()
    visible_state_dim = 0
    # This check allows train_bc to use this function to set residual spec
    # without using env wrappers.
    if isinstance(env, gym.Env):
        for k, v in env.observation_space.spaces.items():
            if k in rl_features:
                visible_state_dim += v.shape[0] if v.shape else 1
    else:
        if FLAGS.domain == 'mime':
            obs_space = mime_env_utils.make_dict_space(env.scene,
                                                       *rl_features).spaces
        else:
            obs_space = env.observation_spec()
        for k, v in obs_space.items():
            if k in rl_features:
                visible_state_dim += v.shape[0] if v.shape else 1
    if include_base_feats:
        base_feat_size = {
            'resnet18_narrow32': 256,
            'hand_vil': 200,
        }[base_network]
        feats_spec['feats'] = specs.Array([base_feat_size], np.float32,
                                          'feats')
    if visible_state_dim > 0:
        feats_spec['visible_state'] = (specs.Array([visible_state_dim],
                                                   np.float32,
                                                   'visible_state'))
    if include_base_action:
        feats_spec['base_action'] = specs.Array([base_agent.action_target_dim],
                                                np.float32, 'base_action')
    if FLAGS.rl_observation_network is not None:
        # TODO(minttu): Get image size from env observation spec.
        if FLAGS.input_type == 'depth':
            feats_spec['depth'] = specs.Array(
                [FLAGS.image_size, FLAGS.image_size, 3], np.uint8, 'depth')
        elif FLAGS.input_type == 'rgb':
            image_size = FLAGS.image_size
            rgb_shape = ([3, image_size, image_size, 3]
                         if FLAGS.late_fusion else [image_size, image_size, 9])
            feats_spec['rgb'] = specs.Array(rgb_shape, np.uint8, 'rgb')
    if isinstance(env, gym.Env):
        env_action_spec = env.action_space
        env_action_spec.minimum = env_action_spec.low
        env_action_spec.maximum = env_action_spec.high
        env_action_spec.name = 'action'
        # Concatenating fields here since it is non-trivial to use dictionary
        # observations with DemoReader's generator.
        concat_shape = np.sum([a.shape for a in feats_spec.values()])
        feats_spec = collections.OrderedDict()
        feats_spec['residual_obs'] = specs.Array((concat_shape, ), np.float32,
                                                 'residual_obs')
    else:
        env_action_spec = env.action_spec()
    env_min = env_action_spec.minimum
    env_max = env_action_spec.maximum
    # Allow (at the extreme) to fully reverse a base action (from one action
    # space limit to the opposite limit).
    min_residual = env_min - env_max if include_base_action else env_min
    max_residual = env_max - env_min if include_base_action else env_max
    print('min residual', min_residual, 'max residual', max_residual)
    residual_action_space = bc_agent.ActionSpace(action_norm,
                                                 env=env,
                                                 scale=action_norm_scale)
    if action_norm in ['centered', 'zeromean_unitvar']:
        # Reuse stats; normalization scheme may still be different.
        residual_action_space.mean = base_agent.action_space.mean
        residual_action_space.std = base_agent.action_space.std
    norm_min = residual_action_space.normalize_flat(min_residual)
    norm_max = residual_action_space.normalize_flat(max_residual)
    norm_action_spec = specs.BoundedArray(shape=env_action_spec.shape,
                                          dtype=env_action_spec.dtype,
                                          minimum=norm_min,
                                          maximum=norm_max,
                                          name=env_action_spec.name)
    print(env_action_spec)
    print(norm_action_spec)

    if isinstance(env, gym.Env):
        reward_spec = specs.BoundedArray(shape=(),
                                         dtype=float,
                                         minimum=env.reward_range[0],
                                         maximum=env.reward_range[1],
                                         name='reward')
    else:
        reward_spec = env.reward_spec()
    if isinstance(env, gym.Env):
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=float,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')
    else:
        discount_spec = env.discount_spec()
    # residual_spec = specs.make_environment_spec(env)
    # Use same normalization for base agent and residual agent.
    residual_spec = specs.EnvironmentSpec(observations=feats_spec,
                                          actions=norm_action_spec,
                                          rewards=reward_spec,
                                          discounts=discount_spec)
    print('Residual spec', residual_spec)
    return residual_spec
Example #17
    def run_test_adder(self,
                       adder: base.ReverbAdder,
                       first: dm_env.TimeStep,
                       steps: Sequence[Step],
                       expected_items: Sequence[Any],
                       pack_expected_items: bool = False,
                       stack_sequence_fields: bool = True,
                       repeat_episode_times: int = 1,
                       break_end_of_episode: bool = True):
        """Runs a unit test case for the adder.

    Args:
      adder: The instance of `base.ReverbAdder` that is being tested.
      first: The first `dm_env.TimeStep` that is used to call
        `base.ReverbAdder.add_first()`.
      steps: A sequence of (action, timestep) tuples that are passed to
        `base.ReverbAdder.add()`.
      expected_items: The sequence of items that are expected to be created
        by calling the adder's `add_first()` method on `first` and `add()` on
        all of the elements in `steps`.
      pack_expected_items: Deprecated and not used. If true the expected items
        are given unpacked and need to be packed in a list before comparison.
      stack_sequence_fields: Whether to stack the sequence fields of the
        expected items before comparing to the observed items. Usually False
        for transition adders and True for both episode and sequence adders.
      repeat_episode_times: How many times to run an episode.
      break_end_of_episode: If False, an end of an episode does not break the
        sequence.
    """

        del pack_expected_items

        if not steps:
            raise ValueError('At least one step must be given.')

        has_extras = len(steps[0]) == 3
        env_spec = tree.map_structure(
            _numeric_to_spec,
            specs.EnvironmentSpec(observations=steps[0][1].observation,
                                  actions=steps[0][0],
                                  rewards=steps[0][1].reward,
                                  discounts=steps[0][1].discount))
        if has_extras:
            extras_spec = tree.map_structure(_numeric_to_spec, steps[0][2])
        else:
            extras_spec = ()
        signature = adder.signature(env_spec, extras_spec=extras_spec)

        for episode_id in range(repeat_episode_times):
            # Add all the data up to the final step.
            adder.add_first(first)
            for step in steps[:-1]:
                action, ts = step[0], step[1]

                if has_extras:
                    extras = step[2]
                else:
                    extras = ()

                adder.add(action, next_timestep=ts, extras=extras)

            # Add the final step.
            adder.add(*steps[-1])

        # Ending the episode should close the writer. No new writer should yet have
        # been created as it is constructed lazily.
        if break_end_of_episode:
            self.assertEqual(self.client.writer.num_episodes,
                             repeat_episode_times)

        # Make sure our expected and observed data match.
        observed_items = [p[2] for p in self.client.writer.priorities]

        # Check matching number of items.
        self.assertEqual(len(expected_items), len(observed_items))

        # Check items are matching according to numpy's almost_equal.
        for expected_item, observed_item in zip(expected_items,
                                                observed_items):
            if stack_sequence_fields:
                expected_item = tree_utils.stack_sequence_fields(expected_item)

            # Set check_types=False because we check them below.
            tree.map_structure(np.testing.assert_array_almost_equal,
                               expected_item,
                               tuple(observed_item),
                               check_types=False)

        # Make sure the signature matches what is being written by Reverb.
        def _check_signature(spec: tf.TensorSpec, value: np.ndarray):
            self.assertTrue(
                spec.is_compatible_with(tf.convert_to_tensor(value)))

        # Check the last transition's signature.
        tree.map_structure(_check_signature, signature, observed_items[-1])