Example #1
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50, ),
    critic_layer_sizes: Sequence[int] = (50, ),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
    """Creates networks used by the agent."""

    num_dimensions = np.prod(action_spec.shape, dtype=int)

    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            tanh_mean=True,
                                            init_scale=0.3,
                                            fixed_scale=True,
                                            use_tfd_independent=False)
    ])

    # The multiplexer concatenates the (maybe transformed) observations/actions.
    critic_network = networks.CriticMultiplexer(
        critic_network=networks.LayerNormMLP(critic_layer_sizes,
                                             activate_final=True),
        action_network=networks.ClipToSpec(action_spec))
    critic_network = snt.Sequential(
        [critic_network,
         networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': tf2_utils.batch_concat,
    }
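
The factory above returns uninitialized Sonnet modules together with a stateless observation transform. A minimal usage sketch (assuming an acme specs.EnvironmentSpec called environment_spec; the variable-creation pattern mirrors Example #10 later in this list):

agent_networks = make_networks(environment_spec.actions)

# Wrap the stateless observation transform so variables can be created for it.
observation_network = tf2_utils.to_sonnet_module(agent_networks['observation'])

# Create variables so the modules can be copied, checkpointed, or handed to a learner.
emb_spec = tf2_utils.create_variables(
    observation_network, [environment_spec.observations])
tf2_utils.create_variables(agent_networks['policy'], [emb_spec])
tf2_utils.create_variables(
    agent_networks['critic'], [emb_spec, environment_spec.actions])
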
Example #2
def make_dmpo_networks(
    action_spec,
    policy_layer_sizes = (300, 200),
    critic_layer_sizes = (400, 300),
    vmin = -150.,
    vmax = 150.,
    num_atoms = 51,
):
  """Creates networks used by the agent."""

  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(num_dimensions)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [critic_network,
       networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf_utils.batch_concat,
  }
Example #3
def make_mpo_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    policy_init_std=1e-9,
    obs_network=None):
  """Creates networks used by the agent."""

  num_dimensions = np.prod(action_spec.shape, dtype=int)
  critic_layer_sizes = list(critic_layer_sizes) + [1]

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          init_scale=policy_init_std,
          min_scale=1e-10)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  if obs_network is None:
    obs_network = tf_utils.batch_concat

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': obs_network,
  }
Example #4
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
) -> Dict[str, types.TensorTransformation]:
    """Creates networks used by the agent."""

    num_dimensions = np.prod(action_spec.shape, dtype=int)

    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            init_scale=0.7,
                                            use_tfd_independent=True)
    ])

    # The multiplexer concatenates the (maybe transformed) observations/actions.
    multiplexer = networks.CriticMultiplexer(
        action_network=networks.ClipToSpec(action_spec))
    critic_network = snt.Sequential([
        multiplexer,
        networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
        networks.NearZeroInitializedLinear(1),
    ])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': tf2_utils.batch_concat,
    }
Example #5
def make_default_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
) -> Mapping[str, types.TensorTransformation]:
  """Creates networks used by the agent."""

  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      tf2_utils.batch_concat,
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.3,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])

  return {
      "policy": policy_network,
      "critic": critic_network,
  }
Example #6
def make_networks(
    action_spec: specs.BoundedArray,
    num_critic_heads: int,
    policy_layer_sizes: Sequence[int] = (50, ),
    critic_layer_sizes: Sequence[int] = (50, ),
    num_layers_shared: int = 1,
    distributional_critic: bool = True,
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
    """Creates networks used by the agent."""

    num_dimensions = np.prod(action_spec.shape, dtype=int)

    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            tanh_mean=False,
                                            init_scale=0.69)
    ])

    if not distributional_critic:
        critic_layer_sizes = list(critic_layer_sizes) + [1]

    if not num_layers_shared:
        # No layers are shared
        critic_network_base = None
    else:
        critic_network_base = networks.LayerNormMLP(
            critic_layer_sizes[:num_layers_shared], activate_final=True)
    critic_network_heads = [
        snt.nets.MLP(critic_layer_sizes,
                     activation=tf.nn.elu,
                     activate_final=False) for _ in range(num_critic_heads)
    ]
    if distributional_critic:
        critic_network_heads = [
            snt.Sequential(
                [c, networks.DiscreteValuedHead(vmin, vmax, num_atoms)])
            for c in critic_network_heads
        ]
    # The multiplexer concatenates the (maybe transformed) observations/actions.
    critic_network = snt.Sequential([
        networks.CriticMultiplexer(
            critic_network=critic_network_base,
            action_network=networks.ClipToSpec(action_spec)),
        networks.Multihead(network_heads=critic_network_heads),
    ])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': tf2_utils.batch_concat,
    }
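
This factory requires the number of critic heads as an extra argument; Example #9 below shows it being passed through by the evaluator. A direct call would look like the following sketch (argument values are illustrative only):

nets = make_networks(
    action_spec,
    num_critic_heads=2,          # one distributional head per objective, illustrative
    distributional_critic=True)
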
Example #7
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50, 1024, 1024),
    critic_layer_sizes: Sequence[int] = (50, 1024, 1024),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, snt.Module]:
    """Creates networks used by the agent."""

    num_dimensions = np.prod(action_spec.shape, dtype=int)

    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes,
                              w_init=snt.initializers.Orthogonal(),
                              activation=tf.nn.relu,
                              activate_final=True),
        networks.MultivariateNormalDiagHead(
            num_dimensions,
            tanh_mean=False,
            init_scale=1.0,
            fixed_scale=False,
            use_tfd_independent=True,
            w_init=snt.initializers.Orthogonal())
    ])

    # The multiplexer concatenates the (maybe transformed) observations/actions.
    critic_network = networks.CriticMultiplexer(
        observation_network=snt.Sequential([
            snt.Linear(critic_layer_sizes[0],
                       w_init=snt.initializers.Orthogonal()),
            snt.LayerNorm(axis=slice(1, None),
                          create_scale=True,
                          create_offset=True), tf.nn.tanh
        ]),
        critic_network=snt.nets.MLP(critic_layer_sizes[1:],
                                    w_init=snt.initializers.Orthogonal(),
                                    activation=tf.nn.relu,
                                    activate_final=True),
        action_network=networks.ClipToSpec(action_spec))
    critic_network = snt.Sequential([
        critic_network,
        networks.DiscreteValuedHead(vmin,
                                    vmax,
                                    num_atoms,
                                    w_init=snt.initializers.Orthogonal())
    ])
    observation_network = networks.DrQTorso()

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': observation_network,
    }
Example #8
def make_network_with_prior(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (200, 100),
    critic_layer_sizes: Sequence[int] = (400, 300),
    prior_layer_sizes: Sequence[int] = (200, 100),
    policy_keys: Optional[Sequence[str]] = None,
    prior_keys: Optional[Sequence[str]] = None,
) -> Mapping[str, types.TensorTransformation]:
  """Creates networks used by the agent."""

  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)
  flatten_concat_policy = functools.partial(
      svg0_utils.batch_concat_selection, concat_keys=policy_keys)
  flatten_concat_prior = functools.partial(
      svg0_utils.batch_concat_selection, concat_keys=prior_keys)

  policy_network = snt.Sequential([
      flatten_concat_policy,
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.1,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      observation_network=flatten_concat_policy,
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])
  prior_network = snt.Sequential([
      flatten_concat_prior,
      networks.LayerNormMLP(prior_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.1,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])
  return {
      "policy": policy_network,
      "critic": critic_network,
      "prior": prior_network,
  }
Example #9
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and target networks to act with.
        environment = self._environment_factory(True)
        agent_networks = self._network_factory(action_spec,
                                               self._num_critic_heads)

        # Make sure observation network is defined.
        observation_network = agent_networks.get('observation', tf.identity)

        # Create a deterministic behavior policy.
        evaluator_modules = [
            observation_network,
            agent_networks['policy'],
            networks.StochasticMeanHead(),
        ]
        if isinstance(action_spec, specs.BoundedArray):
            evaluator_modules += [networks.ClipToSpec(action_spec)]
        evaluator_network = snt.Sequential(evaluator_modules)

        # Ensure network variables are created.
        tf2_utils.create_variables(evaluator_network, [observation_spec])
        policy_variables = {'policy': evaluator_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(variable_source,
                                                            policy_variables,
                                                            update_period=1000)

        # Make sure not to evaluate a random actor by assigning variables before
        # running the environment loop.
        variable_client.update_and_wait()

        # Create the agent.
        evaluator = actors.FeedForwardActor(policy_network=evaluator_network,
                                            variable_client=variable_client)

        # Create logger and counter.
        counter = counting.Counter(counter, 'evaluator')
        logger = loggers.make_default_logger('evaluator',
                                             time_delta=self._log_every,
                                             steps_key='evaluator_steps')

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, evaluator, counter, logger)
Example #10
def make_default_networks(
    environment_spec: specs.EnvironmentSpec,
    *,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    policy_init_scale: float = 0.7,
    critic_init_scale: float = 1e-3,
    critic_num_components: int = 5,
) -> Mapping[str, snt.Module]:
    """Creates networks used by the agent."""

    # Unpack the environment spec to get appropriate shapes, dtypes, etc.
    act_spec = environment_spec.actions
    obs_spec = environment_spec.observations
    num_dimensions = np.prod(act_spec.shape, dtype=int)

    # Create the observation network and make sure it's a Sonnet module.
    observation_network = tf2_utils.batch_concat
    observation_network = tf2_utils.to_sonnet_module(observation_network)

    # Create the policy network.
    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            init_scale=policy_init_scale,
                                            use_tfd_independent=True)
    ])

    # The multiplexer concatenates the (maybe transformed) observations/actions.
    critic_network = snt.Sequential([
        networks.CriticMultiplexer(
            action_network=networks.ClipToSpec(act_spec)),
        networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
        networks.GaussianMixtureHead(num_dimensions=1,
                                     num_components=critic_num_components,
                                     init_scale=critic_init_scale)
    ])

    # Create network variables.
    # Get embedding spec by creating observation network variables.
    emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
    tf2_utils.create_variables(policy_network, [emb_spec])
    tf2_utils.create_variables(critic_network, [emb_spec, act_spec])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': observation_network,
    }
Example #11
def load_policy_net(
    task_name: str,
    noise_level: float,
    dataset_path: str,
    environment_spec: specs.EnvironmentSpec,
    near_policy_dataset: bool = False,
    ):
    dataset_path = Path(dataset_path)
    if task_name.startswith("bsuite"):
        # BSuite tasks.
        bsuite_id = task_name[len("bsuite_"):] + "/0"
        path = bsuite_policy_path(
            bsuite_id, noise_level, near_policy_dataset, dataset_path)
        logging.info("Policy path: %s", path)
        policy_net = tf.saved_model.load(path)

        policy_noise_level = 0.1  # params["policy_noise_level"]
        observation_network = tf2_utils.to_sonnet_module(functools.partial(
            tf.reshape, shape=(-1,) + environment_spec.observations.shape))
        policy_net = snt.Sequential([
            observation_network,
            policy_net,
            # Add epsilon-greedy action noise to the target policy.
            lambda q: trfl.epsilon_greedy(q, epsilon=policy_noise_level).sample(),
        ])
    elif task_name.startswith("dm_control"):
        # DM Control tasks.
        if near_policy_dataset:
            raise ValueError(
                "Near-policy dataset is not available for dm_control tasks.")
        dm_control_task = task_name[len("dm_control_"):]
        path = dm_control_policy_path(
            dm_control_task, noise_level, dataset_path)
        logging.info("Policy path: %s", path)
        policy_net = tf.saved_model.load(path)

        policy_noise_level = 0.2  # params["policy_noise_level"]
        observation_network = tf2_utils.to_sonnet_module(tf2_utils.batch_concat)
        policy_net = snt.Sequential([
            observation_network,
            policy_net,
            # Add Gaussian action noise to the target policy and clip to the action spec.
            acme_utils.GaussianNoise(policy_noise_level),
            networks.ClipToSpec(environment_spec.actions),
        ])
    else:
        raise ValueError(f"task name {task_name} is unsupported.")
    return policy_net
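
A hypothetical call for illustration (the task name, noise level, and dataset path below are placeholders, not values taken from the example above):

policy_net = load_policy_net(
    task_name="dm_control_cartpole_swingup",  # assumed to follow the "dm_control_" prefix convention
    noise_level=0.2,
    dataset_path="/path/to/policy/datasets",
    environment_spec=environment_spec,
)
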
Example #12
def make_dmpo_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    vmin=-150.,
    vmax=150.,
    num_atoms=51,
    policy_init_std=1e-9,
    obs_network=None,
    binary_grip_action=False):
  """Creates networks used by the agent."""

  num_dimensions = np.prod(action_spec.shape, dtype=int)
  if policy_layer_sizes:
    policy_network = snt.Sequential([
        networks.LayerNormMLP([int(l) for l in policy_layer_sizes]),
        networks.MultivariateNormalDiagHead(
            num_dimensions,
            init_scale=policy_init_std,
            min_scale=1e-10)
    ])
  else:
    # Useful when initializing from a trained BC network.
    policy_network = snt.Sequential([
        ArmPolicyNormalDiagHead(
            binary_grip_action=binary_grip_action,
            num_dimensions=num_dimensions,
            init_scale=policy_init_std,
            min_scale=1e-10)
    ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [critic_network,
       networks.DiscreteValuedHead(vmin, vmax, num_atoms)])
  if obs_network is None:
    obs_network = tf_utils.batch_concat

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': obs_network,
  }
Example #13
def make_feed_forward_networks(
    action_spec: specs.BoundedArray,
    z_spec: specs.BoundedArray,
    policy_layer_sizes: Tuple[int, ...] = (256, 256),
    critic_layer_sizes: Tuple[int, ...] = (256, 256),
    discriminator_layer_sizes: Tuple[int, ...] = (256, 256),
    hierarchical_controller_layer_sizes: Tuple[int, ...] = (256, 256),
    vmin: float = -150.,  # Minimum value for the Critic distribution.
    vmax: float = 150.,  # Maximum value for the Critic distribution.
    num_atoms: int = 51,  # Number of atoms for the discrete value distribution.
) -> Dict[str, types.TensorTransformation]:
    num_dimensions = np.prod(action_spec.shape, dtype=int)
    z_dim = np.prod(z_spec.shape, dtype=int)

    observation_network = tf2_utils.batch_concat

    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes),
        networks.MultivariateNormalDiagHead(num_dimensions)
    ])

    critic_multiplexer = networks.CriticMultiplexer(
        critic_network=networks.LayerNormMLP(critic_layer_sizes),
        action_network=networks.ClipToSpec(action_spec))

    critic_network = snt.Sequential([
        critic_multiplexer,
        networks.DiscreteValuedHead(vmin, vmax, num_atoms),
    ])

    # The discriminator in DIAYN uses the same architecture as the critic.
    discriminator_network = networks.LayerNormMLP(discriminator_layer_sizes +
                                                  (z_dim, ))

    hierarchical_controller_network = networks.LayerNormMLP(
        hierarchical_controller_layer_sizes + (z_dim, ))

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': observation_network,
        'discriminator': discriminator_network,
        'hierarchical_controller': hierarchical_controller_network,
    }
Example #14
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, types.TensorTransformation]:
  """Creates networks used by the agent."""

  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  # Create the shared observation network; here simply a state-less operation.
  observation_network = tf2_utils.batch_concat

  # Create the policy network.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(num_dimensions)
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))

  # Create the critic network.
  critic_network = snt.Sequential([
      multiplexer,
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }
Example #15
def make_networks(
        action_spec: specs.BoundedArray,
        policy_layer_sizes: Sequence[int] = (50, 50),
        critic_layer_sizes: Sequence[int] = (50, 50),
):
    """Creates networks used by the agent."""

    num_dimensions = np.prod(action_spec.shape, dtype=int)

    observation_network = tf2_utils.batch_concat
    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            tanh_mean=True,
                                            init_scale=0.3,
                                            fixed_scale=True,
                                            use_tfd_independent=False)
    ])
    evaluator_network = snt.Sequential([
        observation_network,
        policy_network,
        networks.StochasticMeanHead(),
    ])
    # The multiplexer concatenates the (maybe transformed) observations/actions.
    multiplexer = networks.CriticMultiplexer(
        action_network=networks.ClipToSpec(action_spec))
    critic_network = snt.Sequential([
        multiplexer,
        networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
        networks.NearZeroInitializedLinear(1),
    ])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': observation_network,
        'evaluator': evaluator_network,
    }
Example #16
  def make_policy(
      self,
      environment_spec: specs.EnvironmentSpec,
      sigma: float = 0.0,
  ) -> snt.Module:
    """Create a single network which evaluates the policy."""
    # Stack the observation and policy networks.
    stack = [
        self.observation_network,
        self.policy_network,
    ]

    # If a stochastic/non-greedy policy is requested, add Gaussian noise on
    # top to enable a simple form of exploration.
    # TODO(mwhoffman): Refactor this to remove it from the class.
    if sigma > 0.0:
      stack += [
          network_utils.ClippedGaussian(sigma),
          network_utils.ClipToSpec(environment_spec.actions),
      ]

    # Return a network which sequentially evaluates everything in the stack.
    return snt.Sequential(stack)
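
A minimal usage sketch, assuming agent is an instance of the class that defines make_policy above: with the default sigma=0.0 the stack is the greedy policy used for evaluation, while a positive sigma appends clipped Gaussian noise for exploration.

eval_policy = agent.make_policy(environment_spec)                   # greedy, no noise
behavior_policy = agent.make_policy(environment_spec, sigma=0.3)    # with exploration noise
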
Example #17
    def __init__(self,
                 environment_spec: specs.EnvironmentSpec,
                 policy_network: snt.Module,
                 critic_network: snt.Module,
                 observation_network: types.TensorTransformation = tf.identity,
                 discount: float = 0.99,
                 batch_size: int = 256,
                 prefetch_size: int = 4,
                 target_update_period: int = 100,
                 min_replay_size: int = 1000,
                 max_replay_size: int = 1000000,
                 samples_per_insert: float = 32.0,
                 n_step: int = 5,
                 sigma: float = 0.3,
                 clipping: bool = True,
                 logger: loggers.Logger = None,
                 counter: counting.Counter = None,
                 checkpoint: bool = True,
                 replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
        """Initialize the agent.

    Args:
      environment_spec: description of the actions, observations, etc.
      policy_network: the online (optimized) policy.
      critic_network: the online critic.
      observation_network: optional network to transform the observations before
        they are fed into any network.
      discount: discount to use for TD updates.
      batch_size: batch size for updates.
      prefetch_size: size to prefetch from replay.
      target_update_period: number of learner steps to perform before updating
        the target networks.
      min_replay_size: minimum replay size before updating.
      max_replay_size: maximum replay size.
      samples_per_insert: number of samples to take from replay for every insert
        that is made.
      n_step: number of steps to squash into a single transition.
      sigma: standard deviation of zero-mean, Gaussian exploration noise.
      clipping: whether to clip gradients by global norm.
      logger: logger object to be used by learner.
      counter: counter object used to keep track of steps.
      checkpoint: boolean indicating whether to checkpoint the learner.
      replay_table_name: string indicating what name to give the replay table.
    """
        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=replay_table_name,
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(
            priority_fns={replay_table_name: lambda x: 1.},
            client=reverb.Client(address),
            n_step=n_step,
            discount=discount)

        # The dataset provides an interface to sample from replay.
        dataset = datasets.make_reverb_dataset(
            table=replay_table_name,
            client=reverb.TFClient(address),
            environment_spec=environment_spec,
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            transition_adder=True)

        # Get observation and action specs.
        act_spec = environment_spec.actions
        obs_spec = environment_spec.observations
        emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])  # pytype: disable=wrong-arg-types

        # Make sure observation network is a Sonnet Module.
        observation_network = tf2_utils.to_sonnet_module(observation_network)

        # Create target networks.
        target_policy_network = copy.deepcopy(policy_network)
        target_critic_network = copy.deepcopy(critic_network)
        target_observation_network = copy.deepcopy(observation_network)

        # Create the behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            policy_network,
            networks.ClippedGaussian(sigma),
            networks.ClipToSpec(act_spec),
        ])

        # Create variables.
        tf2_utils.create_variables(policy_network, [emb_spec])
        tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_policy_network, [emb_spec])
        tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_observation_network, [obs_spec])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(behavior_network, adder=adder)

        # Create optimizers.
        policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
        critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

        # The learner updates the parameters (and initializes them).
        learner = learning.DDPGLearner(
            policy_network=policy_network,
            critic_network=critic_network,
            observation_network=observation_network,
            target_policy_network=target_policy_network,
            target_critic_network=target_critic_network,
            target_observation_network=target_observation_network,
            policy_optimizer=policy_optimizer,
            critic_optimizer=critic_optimizer,
            clipping=clipping,
            discount=discount,
            target_update_period=target_update_period,
            dataset=dataset,
            counter=counter,
            logger=logger,
            checkpoint=checkpoint,
        )

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=max(batch_size, min_replay_size),
                         observations_per_step=float(batch_size) /
                         samples_per_insert)
Example #18
def make_default_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence],
                                       Sequence] = (256, 256, 256),
    critic_networks_layer_sizes: Union[Dict[str, Sequence],
                                       Sequence] = (512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
    archecture_type: ArchitectureType = ArchitectureType.feedforward,
) -> Mapping[str, types.TensorTransformation]:
    """Default networks for maddpg.

    Args:
        environment_spec (mava_specs.MAEnvironmentSpec): description of the action and
            observation spaces etc. for each agent in the system.
        policy_networks_layer_sizes (Union[Dict[str, Sequence], Sequence], optional):
            size of policy networks. Defaults to (256, 256, 256).
        critic_networks_layer_sizes (Union[Dict[str, Sequence], Sequence], optional):
            size of critic networks. Defaults to (512, 512, 256).
        shared_weights (bool, optional): whether agents should share weights or not.
            Defaults to True.
        sigma (float, optional): standard deviation of the Gaussian noise added
            for simple exploration. Defaults to 0.3.
        archecture_type (ArchitectureType, optional): architecture used for
            agent networks. Can be feedforward or recurrent. Defaults to
            ArchitectureType.feedforward.

    Returns:
        Mapping[str, types.TensorTransformation]: returned agent networks.
    """

    # Set the policy network constructor and layer sizes.
    if archecture_type == ArchitectureType.feedforward:
        policy_network_func = snt.Sequential
    elif archecture_type == ArchitectureType.recurrent:
        policy_networks_layer_sizes = (128, 128)
        policy_network_func = snt.DeepRNN

    specs = environment_spec.get_agent_specs()

    # Create agent_type specs
    if shared_weights:
        type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
        specs = type_specs

    if isinstance(policy_networks_layer_sizes, Sequence):
        policy_networks_layer_sizes = {
            key: policy_networks_layer_sizes
            for key in specs.keys()
        }
    if isinstance(critic_networks_layer_sizes, Sequence):
        critic_networks_layer_sizes = {
            key: critic_networks_layer_sizes
            for key in specs.keys()
        }

    observation_networks = {}
    policy_networks = {}
    critic_networks = {}
    for key in specs.keys():
        # TODO (dries): Make specs[key].actions
        #  return a list of specs for hybrid action space
        # Get total number of action dimensions from action spec.
        agent_act_spec = specs[key].actions
        if type(specs[key].actions) == DiscreteArray:
            num_actions = agent_act_spec.num_values
            minimum = [-1.0] * num_actions
            maximum = [1.0] * num_actions
            agent_act_spec = BoundedArray(
                shape=(num_actions, ),
                minimum=minimum,
                maximum=maximum,
                dtype="float32",
                name="actions",
            )

        # Get total number of action dimensions from action spec.
        num_dimensions = np.prod(agent_act_spec.shape, dtype=int)

        # An optional network to process observations
        observation_network = tf2_utils.to_sonnet_module(tf.identity)
        # Create the policy network.
        if archecture_type == ArchitectureType.feedforward:
            policy_network = [
                networks.LayerNormMLP(policy_networks_layer_sizes[key],
                                      activate_final=True),
            ]
        elif archecture_type == ArchitectureType.recurrent:
            policy_network = [
                networks.LayerNormMLP(policy_networks_layer_sizes[key][:-1],
                                      activate_final=True),
                snt.LSTM(policy_networks_layer_sizes[key][-1]),
            ]

        policy_network += [
            networks.NearZeroInitializedLinear(num_dimensions),
            networks.TanhToSpec(agent_act_spec),
        ]

        # Add Gaussian noise for simple exploration.
        if sigma and sigma > 0.0:
            policy_network += [
                networks.ClippedGaussian(sigma),
                networks.ClipToSpec(agent_act_spec),
            ]

        policy_network = policy_network_func(policy_network)

        # Create the critic network.
        critic_network = snt.Sequential([
            # The multiplexer concatenates the observations/actions.
            networks.CriticMultiplexer(),
            networks.LayerNormMLP(list(critic_networks_layer_sizes[key]) + [1],
                                  activate_final=False),
        ])
        observation_networks[key] = observation_network
        policy_networks[key] = policy_network
        critic_networks[key] = critic_network

    return {
        "policies": policy_networks,
        "critics": critic_networks,
        "observations": observation_networks,
    }
Example #19
  def __init__(self, visualRadius, action_size, action_spec, exploration_sigma):
    super(ActorNetwork, self).__init__(name="commons-actor")
    self.policy_network = PolicyNetwork(
        visualRadius, action_size, action_spec)
    # The behavior network adds clipped Gaussian exploration noise to the
    # policy's actions and clips them to the action spec.
    self.behavior_network = snt.Sequential([
        self.policy_network,
        networks.ClippedGaussian(exploration_sigma),
        networks.ClipToSpec(action_spec),
    ])
Example #20
def make_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        256,
        256,
        256,
    ),
    critic_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
) -> Mapping[str, types.TensorTransformation]:
    """Creates networks used by the agents."""
    specs = environment_spec.get_agent_specs()

    # Create agent_type specs
    if shared_weights:
        type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
        specs = type_specs

    if isinstance(policy_networks_layer_sizes, Sequence):
        policy_networks_layer_sizes = {
            key: policy_networks_layer_sizes for key in specs.keys()
        }
    if isinstance(critic_networks_layer_sizes, Sequence):
        critic_networks_layer_sizes = {
            key: critic_networks_layer_sizes for key in specs.keys()
        }

    observation_networks = {}
    policy_networks = {}
    critic_networks = {}
    for key in specs.keys():

        # Get total number of action dimensions from action spec.
        num_dimensions = np.prod(specs[key].actions.shape, dtype=int)

        # Create the shared observation network; here simply a state-less operation.
        observation_network = tf2_utils.to_sonnet_module(tf.identity)

        # Create the policy network.
        policy_network = snt.Sequential(
            [
                networks.LayerNormMLP(
                    policy_networks_layer_sizes[key], activate_final=True
                ),
                networks.NearZeroInitializedLinear(num_dimensions),
                networks.TanhToSpec(specs[key].actions),
                networks.ClippedGaussian(sigma),
                networks.ClipToSpec(specs[key].actions),
            ]
        )

        # Create the critic network.
        critic_network = snt.Sequential(
            [
                # The multiplexer concatenates the observations/actions.
                networks.CriticMultiplexer(),
                networks.LayerNormMLP(
                    critic_networks_layer_sizes[key], activate_final=False
                ),
                snt.Linear(1),
            ]
        )
        observation_networks[key] = observation_network
        policy_networks[key] = policy_network
        critic_networks[key] = critic_network

    return {
        "policies": policy_networks,
        "critics": critic_networks,
        "observations": observation_networks,
    }
Example #21
# Create the target networks
target_policy_network = copy.deepcopy(policy_network)
target_critic_network = copy.deepcopy(critic_network)
target_observation_network = copy.deepcopy(observation_network)

# Get observation and action specs.
act_spec = environment_spec.actions
obs_spec = environment_spec.observations
emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

# Create the behavior policy.
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(0.3),  # sigma = 0.3
    networks.ClipToSpec(act_spec),
])

# We must create the variables in the networks before passing them to learner.
# Create variables.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

actor = actors.FeedForwardActor(behavior_network, adder=adder)

learner = d4pg.D4PGLearner(policy_network=policy_network,
                           critic_network=critic_network,
                           observation_network=observation_network,