Example #1
    def test_make_dataset_nested_specs(self):
        environment_spec = specs.EnvironmentSpec(observations={
            'obs_1':
            specs.Array((3, 64, 64), 'uint8'),
            'obs_2':
            specs.Array((10, ), 'int32')
        },
                                                 actions=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=-1.,
                                                     maximum=1.),
                                                 rewards=specs.Array(
                                                     (), 'float32'),
                                                 discounts=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=0.,
                                                     maximum=1.))

        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        self.assertTrue(
            _check_specs(tuple(environment_spec), dataset.element_spec.data))
Example #2
    def test_make_dataset_nested_specs(self):
        environment_spec = specs.EnvironmentSpec(observations={
            'obs_1':
            specs.Array((3, 64, 64), 'uint8'),
            'obs_2':
            specs.Array((10, ), 'int32')
        },
                                                 actions=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=-1.,
                                                     maximum=1.),
                                                 rewards=specs.Array(
                                                     (), 'float32'),
                                                 discounts=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=0.,
                                                     maximum=1.))

        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        expected_spec = adders.Step(observation=environment_spec.observations,
                                    action=environment_spec.actions,
                                    reward=environment_spec.rewards,
                                    discount=environment_spec.discounts,
                                    start_of_episode=specs.Array(shape=(),
                                                                 dtype=bool),
                                    extras=())

        self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
Example #3
    def __init__(self,
                 *,
                 num_actions: int = 1,
                 num_observations: int = 1,
                 action_dtype=np.int32,
                 obs_dtype=np.int32,
                 obs_shape: Sequence[int] = (),
                 discount_spec: Optional[types.NestedSpec] = None,
                 reward_spec: Optional[types.NestedSpec] = None,
                 **kwargs):
        """Initialize the environment."""
        if reward_spec is None:
            reward_spec = specs.Array((), np.float32)

        if discount_spec is None:
            discount_spec = specs.BoundedArray((), np.float32, 0.0, 1.0)

        actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
        observations = specs.BoundedArray(shape=obs_shape,
                                          dtype=obs_dtype,
                                          minimum=obs_dtype(0),
                                          maximum=obs_dtype(num_observations -
                                                            1))

        super().__init__(spec=specs.EnvironmentSpec(observations=observations,
                                                    actions=actions,
                                                    rewards=reward_spec,
                                                    discounts=discount_spec),
                         **kwargs)
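
A minimal usage sketch for the constructor above, assuming it belongs to acme's fakes.DiscreteEnvironment (the enclosing class name is not shown in the snippet):

from acme import specs
from acme.testing import fakes
import numpy as np

# Fake discrete environment with 3 actions and 5 possible observation values.
env = fakes.DiscreteEnvironment(num_actions=3,
                                num_observations=5,
                                obs_dtype=np.int32,
                                obs_shape=(2,))
env_spec = specs.make_environment_spec(env)
assert env_spec.actions.num_values == 3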
Example #4
def _make_fake_env() -> dm_env.Environment:
  env_spec = specs.EnvironmentSpec(
      observations=specs.Array(shape=(10, 5), dtype=np.float32),
      actions=specs.BoundedArray(
          shape=(1,), dtype=np.float32, minimum=-10., maximum=10.),
      rewards=specs.Array(shape=(), dtype=np.float32),
      discounts=specs.BoundedArray(
          shape=(), dtype=np.float32, minimum=0., maximum=1.),
  )
  return fakes.Environment(env_spec, episode_length=10)
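
A short sketch of driving this fake environment through the standard dm_env reset/step interface, which fakes.Environment implements:

env = _make_fake_env()
timestep = env.reset()
while not timestep.last():
  # Any action consistent with the (1,)-shaped float32 bounded action spec.
  action = np.zeros((1,), dtype=np.float32)
  timestep = env.step(action)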
Example #5
def _convert_to_spec(space: gym.Space,
                     name: Optional[str] = None) -> types.NestedSpec:
    """Converts an OpenAI Gym space to a dm_env spec or nested structure of specs.

  Box, MultiBinary and MultiDiscrete Gym spaces are converted to BoundedArray
  specs. Discrete OpenAI spaces are converted to DiscreteArray specs. Tuple and
  Dict spaces are recursively converted to tuples and dictionaries of specs.

  Args:
    space: The Gym space to convert.
    name: Optional name to apply to all return spec(s).

  Returns:
    A dm_env spec or nested structure of specs, corresponding to the input
    space.
  """
    if isinstance(space, spaces.Discrete):
        return specs.DiscreteArray(num_values=space.n,
                                   dtype=space.dtype,
                                   name=name)

    elif isinstance(space, spaces.Box):
        return specs.BoundedArray(shape=space.shape,
                                  dtype=space.dtype,
                                  minimum=space.low,
                                  maximum=space.high,
                                  name=name)

    elif isinstance(space, spaces.MultiBinary):
        return specs.BoundedArray(shape=space.shape,
                                  dtype=space.dtype,
                                  minimum=0.0,
                                  maximum=1.0,
                                  name=name)

    elif isinstance(space, spaces.MultiDiscrete):
        return specs.BoundedArray(shape=space.shape,
                                  dtype=space.dtype,
                                  minimum=np.zeros(space.shape),
                                  maximum=space.nvec - 1,
                                  name=name)

    elif isinstance(space, spaces.Tuple):
        return tuple(_convert_to_spec(s, name) for s in space.spaces)

    elif isinstance(space, spaces.Dict):
        return {
            key: _convert_to_spec(value, key)
            for key, value in space.spaces.items()
        }

    else:
        raise ValueError('Unexpected gym space: {}'.format(space))
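
A usage sketch for the converter above; the composite Gym space below is illustrative, not taken from the original source:

from gym import spaces
import numpy as np

space = spaces.Dict({
    'position': spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32),
    'mode': spaces.Discrete(4),
})
spec = _convert_to_spec(space)
# spec == {'position': BoundedArray(shape=(3,), dtype=float32, ...),
#          'mode': DiscreteArray(num_values=4, ...)}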
Example #6
    def __init__(self, environment_spec: specs.EnvironmentSpec,
                 action_spec: specs.BoundedArray, z_dim: int) -> None:
        self._z_dim = z_dim
        z_spec = specs.BoundedArray((z_dim, ),
                                    np.float64,
                                    minimum=0,
                                    maximum=1)
        # Modify the environment_spec to also include the latent variable
        # observation  (z)
        self._obs_space = environment_spec.observations
        assert (
            len(self._obs_space.shape) == 1
        ), f"Only vector observations are supported for now. Observations shape passed: {self._obs_space.shape}"
        updated_observations = specs.BoundedArray(
            (self._obs_space.shape[0] + z_dim, ),
            dtype=environment_spec.observations.dtype,
            name=environment_spec.observations.name,
            minimum=np.append(environment_spec.observations.minimum,
                              [0] * z_dim),
            maximum=np.append(environment_spec.observations.maximum,
                              [1] * z_dim),
        )
        environment_spec = specs.EnvironmentSpec(
            observations=updated_observations,
            actions=environment_spec.actions,
            rewards=environment_spec.rewards,
            discounts=environment_spec.discounts,
        )
        self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
        self._agent = dmpo.DistributionalMPO(
            environment_spec=environment_spec,
            policy_network=self._agent_networks['policy'],
            critic_network=self._agent_networks['critic'],
            observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
            extra_modules_to_save={
                'discriminator': self._agent_networks['discriminator'],
            },
            return_action_entropy=True,
        )

        self._z_distribution = tfd.Categorical([1] * z_dim)
        self._current_z = self._z_distribution.sample()

        # Create discriminator optimizer.
        self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
        self._discriminator_logger = loggers.make_default_logger(
            'discriminator')

        # Create variables for the discriminator.
        tf2_utils.create_variables(self._agent_networks['discriminator'],
                                   [self._obs_space])
Example #7
  def test_make_dataset_with_variable_length_instances(self):
    """Dataset with variable length instances should have shapes with None."""
    environment_spec = specs.EnvironmentSpec(
        observations=specs.Array((0, 64, 64), 'uint8'),
        actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
        rewards=specs.Array((), 'float32'),
        discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

    dataset = reverb_dataset.make_dataset(
        server_address=self.server_address,
        environment_spec=environment_spec,
        convert_zero_size_to_none=True)

    self.assertSequenceEqual(dataset.element_spec.data[0].shape.as_list(),
                             [None, 64, 64])
Example #8
    def observation_spec(self) -> specs.BoundedArray:
        """Returns the observation spec."""
        return specs.BoundedArray(shape=self._observation().shape,
                                  dtype=self._observation().dtype,
                                  name="board",
                                  minimum=0,
                                  maximum=MAX_APPLES_PER_USER)
Example #9
    def __init__(self,
                 *,
                 num_observations: Mapping[str, int],
                 num_actions: int = 1,
                 action_dtype=np.int32,
                 obs_dtype=np.int32,
                 obs_shape: Sequence[int] = (),
                 discount_spec: Optional[types.NestedSpec] = None,
                 reward_spec: Optional[types.NestedSpec] = None,
                 **kwargs):
        """Initialize the environment."""

        observations_spec = {}
        for key in num_observations:
            observations_spec[key] = specs.BoundedArray(
                shape=obs_shape,
                dtype=obs_dtype,
                minimum=obs_dtype(0),
                maximum=obs_dtype(num_observations[key] - 1))

        super().__init__(num_actions=num_actions,
                         action_dtype=action_dtype,
                         observation_spec=observations_spec,
                         discount_spec=discount_spec,
                         reward_spec=reward_spec,
                         **kwargs)
Example #10
    def __init__(self, DIAYN_agent: DIAYNAgent.DIAYNAgent,
                 environment_spec: specs.EnvironmentSpec,
                 action_spec: specs.BoundedArray, z_dim: int,
                 replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
                 replay_server_port: Optional[int] = None,
                 ) -> None:
        self._z_dim = z_dim
        z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)
        self._environment_spec = environment_spec
        # Modify the environment_spec to also include the latent variable
        # observation  (z)
        self._obs_space = environment_spec.observations
        assert (
            len(self._obs_space.shape) == 1
        ), f"Only vector observations are supported for now. Observations shape passed: {self._obs_space.shape}"
        self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
        self._agent = dmpo.DistributionalMPO(
            environment_spec=environment_spec,
            policy_network=self._agent_networks['policy'],
            critic_network=self._agent_networks['critic'],
            observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
            extra_modules_to_save={
                'hierarchical_controller': self._agent_networks['hierarchical_controller'],
            },
            checkpoint_name='hierarchical_dmpo',
            replay_table_name=replay_table_name,
            replay_server_port=replay_server_port,
            return_action_entropy=True,
        )

        self._DIAYN_agent = DIAYN_agent

        # Create variables for the discriminator.
        tf2_utils.create_variables(
            self._agent_networks['hierarchical_controller'],
            [self._obs_space])
Example #11
    def discount_spec(self) -> Dict[str, specs.BoundedArray]:
        discount_specs = {}
        for agent in self.agents:
            discount_specs[agent] = specs.BoundedArray((),
                                                       np.float32,
                                                       minimum=0,
                                                       maximum=1.0)
        return discount_specs
Example #12
    def extra_spec(self) -> Dict[str, specs.BoundedArray]:
        state = self._environment.get_state()
        # TODO (dries): What should the real bounds be of the state spec?
        return {
            "s_t":
            specs.BoundedArray(state.shape,
                               np.float32,
                               minimum=float("-inf"),
                               maximum=float("inf"))
        }
Example #13
  def observation_spec(self):
    # pov obs are uint8, but required type is float.
    obs_spec = self._observation_spec['pov']
    obs_spec = specs.BoundedArray(shape=obs_spec.shape,
                                  dtype=np.float32,
                                  minimum=obs_spec.minimum,
                                  maximum=obs_spec.maximum,
                                  name=obs_spec.name)

    return OVAR(observation=obs_spec,
                obs_vector=self._observation_spec['vector'],
                action=self.action_spec(),
                reward=self.reward_spec())
Example #14
  def __init__(self,
               *,
               action_dim: int = 1,
               observation_dim: int = 1,
               bounded: bool = False,
               dtype=np.float32,
               reward_dtype=np.float32,
               **kwargs):
    """Initialize the environment.

    Args:
      action_dim: number of action dimensions.
      observation_dim: number of observation dimensions.
      bounded: whether or not the actions are bounded in [-1, 1].
      dtype: dtype of the action and observation spaces.
      reward_dtype: dtype of the reward and discounts.
      **kwargs: additional kwargs passed to the Environment base class.
    """

    action_shape = () if action_dim == 0 else (action_dim,)
    observation_shape = () if observation_dim == 0 else (observation_dim,)

    observations = specs.Array(observation_shape, dtype)
    rewards = specs.Array((), reward_dtype)
    discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

    if bounded:
      actions = specs.BoundedArray(action_shape, dtype, -1.0, 1.0)
    else:
      actions = specs.Array(action_shape, dtype)

    super().__init__(
        spec=specs.EnvironmentSpec(
            observations=observations,
            actions=actions,
            rewards=rewards,
            discounts=discounts),
        **kwargs)
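
A minimal instantiation sketch, assuming this initializer belongs to acme's fakes.ContinuousEnvironment (the class name is not visible in the snippet):

from acme import specs
from acme.testing import fakes

env = fakes.ContinuousEnvironment(action_dim=2, observation_dim=4, bounded=True)
env_spec = specs.make_environment_spec(env)
# env_spec.actions is a BoundedArray of shape (2,) bounded in [-1, 1].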
Example #15
    def extra_spec(self) -> Dict[str, specs.BoundedArray]:
        extras = {}
        if self.return_state_info:
            shape = self.environment._get_state().shape

            ex_spec = specs.BoundedArray(
                shape=shape,
                dtype="float32",
                name="observation",
                minimum=[float("-inf")] * shape[0],
                maximum=[float("inf")] * shape[0],
            )
            extras.update({"s_t": ex_spec})
        return extras
Example #16
File: fakes.py Project: wilixx/acme
    def __init__(self,
                 *,
                 num_actions: int = 1,
                 num_observations: int = 1,
                 action_dtype=np.int32,
                 obs_dtype=np.int32,
                 reward_dtype=np.float32,
                 obs_shape: Sequence[int] = (),
                 **kwargs):
        """Initialize the environment."""
        actions = specs.DiscreteArray(num_actions, dtype=action_dtype)
        observations = specs.BoundedArray(shape=obs_shape,
                                          dtype=obs_dtype,
                                          minimum=obs_dtype(0),
                                          maximum=obs_dtype(num_observations -
                                                            1))
        rewards = specs.Array((), reward_dtype)
        discounts = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

        super().__init__(spec=specs.EnvironmentSpec(observations=observations,
                                                    actions=actions,
                                                    rewards=rewards,
                                                    discounts=discounts),
                         **kwargs)
Example #17
    def setUp(self):
        super().setUp()
        self.state_dims = 8
        self.action_dims = 4
        self.params = {
            'world': jnp.ones((3, )),
            'policy': jnp.ones((3, )),
            'value': jnp.ones((3, ))
        }
        self.env_spec = specs.EnvironmentSpec(
            observations=specs.Array(shape=(self.state_dims, ), dtype=float),
            actions=specs.Array(shape=(self.action_dims, ), dtype=float),
            rewards=specs.Array(shape=(1, ), dtype=float, name='reward'),
            discounts=specs.BoundedArray(shape=(),
                                         dtype=float,
                                         minimum=0.,
                                         maximum=1.,
                                         name='discount'))
Example #18
def define_residual_spec(rl_features,
                         env,
                         base_agent,
                         action_norm,
                         action_norm_scale=1.0,
                         include_base_action=True,
                         include_base_feats=True,
                         base_network=None):
    # TODO(minttu): pass in GymWrapper(env) without any other wrapper classes.
    """Defines environment observation and action spaces as seen by the RL agent.

  Args:
    rl_features: A list of state features visible to the agent. If set, they
      replace any visual features.
    env: The environment which defines the action space, rewards and discounts.
    base_agent: base agent to use in residual training.
    action_norm: bc_agent.ActionSpace object defining action normalization.
    action_norm_scale: Scalar by which to scale residual action normalization.
    include_base_action: If True, add base agent action to spec.
    include_base_feats: If True, add features given by base agent to spec.
    base_network: Network type used by the base agent, if applicable.

  Returns:
    residual_spec: An acme.specs.EnvironmentSpec instance defining the residual
      spec.
  """
    feats_spec = collections.OrderedDict()
    visible_state_dim = 0
    # This check allows train_bc to use this function to set residual spec
    # without using env wrappers.
    if isinstance(env, gym.Env):
        for k, v in env.observation_space.spaces.items():
            if k in rl_features:
                visible_state_dim += v.shape[0] if v.shape else 1
    else:
        if FLAGS.domain == 'mime':
            obs_space = mime_env_utils.make_dict_space(env.scene,
                                                       *rl_features).spaces
        else:
            obs_space = env.observation_spec()
        for k, v in obs_space.items():
            if k in rl_features:
                visible_state_dim += v.shape[0] if v.shape else 1
    if include_base_feats:
        base_feat_size = {
            'resnet18_narrow32': 256,
            'hand_vil': 200,
        }[base_network]
        feats_spec['feats'] = specs.Array([base_feat_size], np.float32,
                                          'feats')
    if visible_state_dim > 0:
        feats_spec['visible_state'] = (specs.Array([visible_state_dim],
                                                   np.float32,
                                                   'visible_state'))
    if include_base_action:
        feats_spec['base_action'] = specs.Array([base_agent.action_target_dim],
                                                np.float32, 'base_action')
    if FLAGS.rl_observation_network is not None:
        # TODO(minttu): Get image size from env observation spec.
        if FLAGS.input_type == 'depth':
            feats_spec['depth'] = specs.Array(
                [FLAGS.image_size, FLAGS.image_size, 3], np.uint8, 'depth')
        elif FLAGS.input_type == 'rgb':
            image_size = FLAGS.image_size
            rgb_shape = ([3, image_size, image_size, 3]
                         if FLAGS.late_fusion else [image_size, image_size, 9])
            feats_spec['rgb'] = specs.Array(rgb_shape, np.uint8, 'rgb')
    if isinstance(env, gym.Env):
        env_action_spec = env.action_space
        env_action_spec.minimum = env_action_spec.low
        env_action_spec.maximum = env_action_spec.high
        env_action_spec.name = 'action'
        # Concatenating fields here since it is non-trivial to use dictionary
        # observations with DemoReader's generator.
        concat_shape = np.sum([a.shape for a in feats_spec.values()])
        feats_spec = collections.OrderedDict()
        feats_spec['residual_obs'] = specs.Array((concat_shape, ), np.float32,
                                                 'residual_obs')
    else:
        env_action_spec = env.action_spec()
    env_min = env_action_spec.minimum
    env_max = env_action_spec.maximum
    # Allow (at the extreme) to fully reverse a base action (from one action
    # space limit to the opposite limit).
    min_residual = env_min - env_max if include_base_action else env_min
    max_residual = env_max - env_min if include_base_action else env_max
    print('min residual', min_residual, 'max residual', max_residual)
    residual_action_space = bc_agent.ActionSpace(action_norm,
                                                 env=env,
                                                 scale=action_norm_scale)
    if action_norm in ['centered', 'zeromean_unitvar']:
        # Reuse stats; normalization scheme may still be different.
        residual_action_space.mean = base_agent.action_space.mean
        residual_action_space.std = base_agent.action_space.std
    norm_min = residual_action_space.normalize_flat(min_residual)
    norm_max = residual_action_space.normalize_flat(max_residual)
    norm_action_spec = specs.BoundedArray(shape=env_action_spec.shape,
                                          dtype=env_action_spec.dtype,
                                          minimum=norm_min,
                                          maximum=norm_max,
                                          name=env_action_spec.name)
    print(env_action_spec)
    print(norm_action_spec)

    if isinstance(env, gym.Env):
        reward_spec = specs.BoundedArray(shape=(),
                                         dtype=float,
                                         minimum=env.reward_range[0],
                                         maximum=env.reward_range[1],
                                         name='reward')
    else:
        reward_spec = env.reward_spec()
    if isinstance(env, gym.Env):
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=float,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')
    else:
        discount_spec = env.discount_spec()
    # residual_spec = specs.make_environment_spec(env)
    # Use same normalization for base agent and residual agent.
    residual_spec = specs.EnvironmentSpec(observations=feats_spec,
                                          actions=norm_action_spec,
                                          rewards=reward_spec,
                                          discounts=discount_spec)
    print('Residual spec', residual_spec)
    return residual_spec
Example #19
def agent_info_spec() -> specs.BoundedArray:
    """Create the spec for the agent_info part of the observation"""
    return specs.BoundedArray((4, ), dtype=np.float32, minimum=0.0, maximum=10)
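
Because dm_env specs can validate concrete values, a quick sanity check of this spec might look like the following sketch:

spec = agent_info_spec()
spec.validate(np.zeros((4,), dtype=np.float32))  # within [0, 10]: passes
# spec.validate(np.full((4,), 11.0, dtype=np.float32))  # out of bounds: raises ValueError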
Example #20
    def init(self, params):

        if not _TF_USE_GPU:
            tf.config.set_visible_devices([], 'GPU')
        tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
        tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

        if params.seed:
            agent_seed = params.seed + sum([ord(c) for c in params.name])
            random.seed(agent_seed)
            np.random.seed(agent_seed)
            tf.random.set_seed(agent_seed)

        # Internalize params.
        self._params = params

        self._name = params.name

        # Whether learning stopped.
        self._stop = False

        # Define specs. Everything needs to be single precision by default.
        observation_spec = specs.Array(shape=(params.states.rank, ),
                                       dtype=np.float32,
                                       name='obs')
        action_spec = specs.BoundedArray(shape=(params.num_phases, ),
                                         dtype=np.float32,
                                         minimum=0.,
                                         maximum=1.,
                                         name='action')
        reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=np.float32,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')

        env_spec = specs.EnvironmentSpec(observations=observation_spec,
                                         actions=action_spec,
                                         rewards=reward_spec,
                                         discounts=discount_spec)

        # Logger.
        dir_path = f'{params.exp_path}/logs/{self._name}'
        self._logger = make_default_logger(directory=dir_path,
                                           label=self._name)
        agent_logger = make_default_logger(directory=dir_path,
                                           label=f'{self._name}-learning')

        networks = _make_networks(actions_dim=params.num_phases,
                                  state_dim=params.states.rank,
                                  policy_layers=params.policy_layers,
                                  critic_layers=params.critic_layers)

        self.agent = acme_agent.DDPG(
            environment_spec=env_spec,
            policy_network=networks['policy'],
            critic_network=networks['critic'],
            observation_network=networks['observation'],
            discount=params.discount_factor,
            batch_size=params.batch_size,
            prefetch_size=params.prefetch_size,
            target_update_period=params.target_update_period,
            min_replay_size=params.min_replay_size,
            max_replay_size=params.max_replay_size,
            samples_per_insert=params.samples_per_insert,
            n_step=params.n_step,
            sigma_init=params.sigma_init,
            sigma_final=params.sigma_final,
            sigma_schedule_timesteps=params.sigma_schedule_timesteps,
            clipping=params.clipping,
            logger=agent_logger,
            checkpoint=False,
        )

        # Observations counter.
        self._obs_counter = 0
Example #21
    def discount_spec(self) -> specs.BoundedArray:
        return specs.BoundedArray((), np.float32, minimum=0, maximum=1.0)
Example #22
    def reward_spec(self) -> specs.BoundedArray:
        return specs.BoundedArray((),
                                  np.float32,
                                  minimum=self._environment.game.min_utility(),
                                  maximum=self._environment.game.max_utility())
Example #23
    def discount_spec(self) -> Dict[str, specs.BoundedArray]:
        return {
            agent: specs.BoundedArray((), np.float32, minimum=0, maximum=1.0)
            for agent in self._possible_agents
        }
Example #24
from typing import Optional

from acme import environment_loop
from acme import specs
from acme import types
from acme.testing import fakes
import numpy as np

from absl.testing import absltest
from absl.testing import parameterized

EPISODE_LENGTH = 10

# Discount specs
F32_2_MIN_0_MAX_1 = specs.BoundedArray(
    dtype=np.float32, shape=(2,), minimum=0.0, maximum=1.0)
F32_2x1_MIN_0_MAX_1 = specs.BoundedArray(
    dtype=np.float32, shape=(2, 1), minimum=0.0, maximum=1.0)
TREE_MIN_0_MAX_1 = {'a': F32_2_MIN_0_MAX_1, 'b': F32_2x1_MIN_0_MAX_1}

# Reward specs
F32 = specs.Array(dtype=np.float32, shape=())
F32_1x3 = specs.Array(dtype=np.float32, shape=(1, 3))
TREE = {'a': F32, 'b': F32_1x3}

TEST_CASES = (
    ('scalar_discount_scalar_reward', None, None),
    ('vector_discount_scalar_reward', F32_2_MIN_0_MAX_1, F32),
    ('matrix_discount_matrix_reward', F32_2x1_MIN_0_MAX_1, F32_1x3),
    ('tree_discount_tree_reward', TREE_MIN_0_MAX_1, TREE),
    )
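
A sketch of how such named cases are typically consumed, assuming acme's fakes.DiscreteEnvironment, fakes.Actor and environment_loop.EnvironmentLoop behave as in the earlier examples (the test body below is illustrative):

class EnvironmentLoopTest(parameterized.TestCase):

  @parameterized.named_parameters(*TEST_CASES)
  def test_runs_with_custom_specs(self, discount_spec, reward_spec):
    kwargs = {}
    if discount_spec is not None:
      kwargs['discount_spec'] = discount_spec
    if reward_spec is not None:
      kwargs['reward_spec'] = reward_spec
    # Build a fake environment and a fake actor that follow its spec.
    env = fakes.DiscreteEnvironment(episode_length=EPISODE_LENGTH, **kwargs)
    actor = fakes.Actor(specs.make_environment_spec(env))
    loop = environment_loop.EnvironmentLoop(env, actor)
    loop.run(num_episodes=1)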
Example #25
    def __init__(self, num_players: int):
        self._reset_next_step = True

        self.scaling = 200.0

        # Choose action
        act_min = [0.0] * 7  # 6 + No action
        act_max = [1.0] * 7  # 6 + No action

        # Action continuous component
        # All directions are in x, y format
        act_min.extend([
            -100 / self.scaling,
            -1,
            -1,  # dash (power, direction)
            0,
            -1,
            -1,  # kick (power, direction)
            0,
            0,  # change_view (width, quality)
            -1,
            -1,
            0,  # tackle (direction, foul)
            -1,
            -1,  # turn (direction)
            -1,
            -1,
        ])  # turn_neck(direction)

        act_max.extend(
            [100 / self.scaling, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

        assert len(act_min) == len(act_max)
        action_spec = specs.BoundedArray(
            shape=(len(act_min), ),
            dtype="float32",
            name="action",
            minimum=act_min,
            maximum=act_max,
        )

        self.action_size = action_spec.shape[0]

        # obs_dict = {"time_left": 0, "side": 1, "sense_self": 2,
        #  "coords": (3, 5), "body_dir": (5, 7),
        #             "head_dir": (7, 9), "width": (9, 12),
        # "quality": 13, "stamina": 14, "effort": 15,
        #             "speed_amount": 16, "speed_direction": (17, 19),
        # "neck_direction": (19, 21),
        #             "see_ball": 21, "ball_dist": 22,
        # "ball_dir": (23, 25), "ball_dist_change": 25,
        #             "ball_dir_change": 26, "ball_speed": 27,
        # "last_action": (28, 28 + self.action_size),
        #             }

        # TODO: Check if all bounds are correct
        obs_min = [
            0.0,  # time_left
            0.0,  # side
            0.0,  # sense_self
            -100 / self.scaling,
            -50 / self.scaling,  # coords
            -1,
            -1,  # body_dir
            -1,
            -1,  # head_dir
            0,
            0,
            0,  # width
            0,  # quality
            0,  # stamina
            0,  # effort
            0,  # speed_amount
            -1,
            -1,  # speed_direction
            -1,
            -1,  # neck_direction
            0,  # see_ball
            0,  # ball_dist
            -1,
            -1,  # ball_dir
            -100 / self.scaling,  # ball_dist_change
            -180 / self.scaling,  # ball_dir_change
            0,  # ball_speed
        ]

        obs_max = [
            1.0,  # time_left
            1.0,  # side
            1.0,  # sense_self
            100 / self.scaling,
            50 / self.scaling,  # coords
            1,
            1,  # body_dir
            1,
            1,  # head_dir
            1,
            1,
            1,  # width
            1,  # quality
            1,  # stamina
            1,  # effort
            100 / self.scaling,  # speed_amount
            1,
            1,  # speed_direction
            1,
            1,  # neck_direction
            1,  # see_ball
            100 / self.scaling,  # ball_dist
            1,
            1,  # ball_dir
            100 / self.scaling,  # ball_dist_change
            180 / self.scaling,  # ball_dir_change
            100 / self.scaling,  # ball_speed
        ]

        # Last action
        obs_min.extend(action_spec.minimum)
        obs_max.extend(action_spec.maximum)

        # [see_player, is_on_team, player_distance,
        # player_direction] for num_agents-1
        self.num_agents = num_players

        # TODO: Add this in again.
        # for i in range(21):
        #     # [see_player, is_on_team, player_distance,
        # player_direction (x, y format)]
        #     obs_min.extend([0, 0, -200 / self.scaling, -1, -1])
        #     obs_max.extend([1, 1, +200 / self.scaling, 1, 1])

        assert len(obs_min) == len(obs_max)
        self.obs_size = len(obs_min)

        self.agents = ["player_" + str(r) for r in range(num_players)]

        self._observation_specs = {}
        self._action_specs = {}

        obs_spec = specs.BoundedArray(
            shape=(self.obs_size, ),
            dtype="float32",
            name="observation",
            minimum=obs_min,
            maximum=obs_max,
        )

        # Time_left, ball coords, ball delta_coords
        state_min = [0, -100 / self.scaling, -100 / self.scaling, -10, -10]
        state_max = [1, 100 / self.scaling, 100 / self.scaling, 10, 10]

        # First player is the critic player
        # Players sides,  coords, delta_coords, body_angle (x, y format),
        # head_angle (x, y format)
        for i in range(num_players):
            state_min.extend([
                0.0,
                -100 / self.scaling,
                -100 / self.scaling,
                -10,
                -10,
                -1,
                -1,
                -1,
                -1,
            ])
            state_max.extend([
                1.0, +100 / self.scaling, +100 / self.scaling, +10, +10, 1, 1,
                1, 1
            ])

        # Add all observations to state info
        for i in range(num_players):
            state_min.extend(obs_min)
            state_max.extend(obs_max)

        assert len(state_min) == len(state_max)
        self._state_spec = specs.BoundedArray(
            shape=(len(state_min), ),
            dtype="float32",
            name="state",
            minimum=state_min,
            maximum=state_max,
        )

        self._discount = dict(
            zip(self.agents, [np.float32(1.0)] * len(self.agents)))

        # TODO: Delete this
        # self.previous_act = {"player_0": None}

        for agent in self.agents:
            # TODO: Why is the action spec in two places?
            self._observation_specs[agent] = OLT(
                observation=obs_spec,
                legal_actions=action_spec,
                terminal=specs.Array((1, ), np.float32),
            )

            self._action_specs[agent] = action_spec
Example #26
    def init(self, params):

        if not _TF_USE_GPU:
            tf.config.set_visible_devices([], 'GPU')
        tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
        tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

        if params.seed:
            agent_seed = params.seed + sum([ord(c) for c in params.name])
            random.seed(agent_seed)
            np.random.seed(agent_seed)
            tf.random.set_seed(agent_seed)

        # Internalize params.
        self._params = params

        self._name = params.name

        # Whether learning stopped.
        self._stop = False

        # Define specs. Everything needs to be single precision by default.
        observation_spec = specs.Array(shape=(params.states.rank, ),
                                       dtype=np.float32,
                                       name='obs')
        action_spec = specs.DiscreteArray(dtype=np.int32,
                                          num_values=params.actions.depth,
                                          name="action")
        reward_spec = specs.Array(shape=(), dtype=np.float32, name='reward')
        discount_spec = specs.BoundedArray(shape=(),
                                           dtype=np.float32,
                                           minimum=0.,
                                           maximum=1.,
                                           name='discount')

        env_spec = specs.EnvironmentSpec(observations=observation_spec,
                                         actions=action_spec,
                                         rewards=reward_spec,
                                         discounts=discount_spec)

        # Logger.
        dir_path = f'{params.exp_path}/logs/{self._name}'
        self._logger = make_default_logger(directory=dir_path,
                                           label=self._name)
        agent_logger = make_default_logger(directory=dir_path,
                                           label=f'{self._name}-learning')

        network = Network(num_actions=env_spec.actions.num_values,
                          rnn_hidden_size=params.rnn_hidden_size,
                          head_layers=params.head_layers)

        self.agent = acme_agent.R2D2(
            environment_spec=env_spec,
            network=network,
            batch_size=params.batch_size,
            samples_per_insert=params.samples_per_insert,
            burn_in_length=params.burn_in_length,
            trace_length=params.trace_length,
            replay_period=params.replay_period,
            min_replay_size=params.min_replay_size,
            max_replay_size=params.max_replay_size,
            discount=params.discount_factor,
            prefetch_size=params.prefetch_size,
            target_update_period=params.target_update_period,
            importance_sampling_exponent=params.importance_sampling_exponent,
            priority_exponent=params.priority_exponent,
            epsilon_init=params.epsilon_init,
            epsilon_final=params.epsilon_final,
            epsilon_schedule_timesteps=params.epsilon_schedule_timesteps,
            learning_rate=params.learning_rate,
            store_lstm_state=params.store_lstm_state,
            max_priority_weight=params.max_priority_weight,
            logger=agent_logger,
            checkpoint=False,
        )

        # Observations counter.
        self._obs_counter = 0