Example #1
def default_agent(obs_spec: dm_env.specs.Array,
                  action_spec: dm_env.specs.DiscreteArray):
    """Initialize a DQN agent with default parameters."""
    del obs_spec  # Unused.
    hidden_units = [50, 50]
    online_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(hidden_units + [action_spec.num_values]),
    ])
    target_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(hidden_units + [action_spec.num_values]),
    ])
    return DQN(action_spec=action_spec,
               online_network=online_network,
               target_network=target_network,
               batch_size=32,
               discount=0.99,
               replay_capacity=10000,
               min_replay_size=100,
               sgd_period=1,
               target_update_period=4,
               optimizer=snt.optimizers.Adam(learning_rate=1e-3),
               epsilon=0.05,
               seed=42)
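A minimal usage sketch, assuming a dm_env-compatible bsuite task and the bsuite baselines experiment helper (the task id and episode count are arbitrary illustrations):

# Hedged usage sketch: train the default DQN agent on a bsuite task.
import bsuite
from bsuite.baselines import experiment

env = bsuite.load_from_id('catch/0')
agent = default_agent(env.observation_spec(), env.action_spec())
experiment.run(agent=agent, environment=env, num_episodes=100)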
Example #2
 def __init__(self, hidden_sizes: Sequence[int], num_actions: int):
     super().__init__(name='policy_value_net')
     self._num_actions = num_actions
     self._torso = snt.Sequential([
         snt.Flatten(),
         snt.nets.MLP(hidden_sizes, activate_final=True, name='net'),
     ])
     self._core = snt.DeepRNN([
         snt.Flatten(),
         snt.LSTM(hidden_sizes[-1], name='rnn'),
     ])
     self._policy_head = snt.Linear(num_actions, name='policy')
     self._value_head = snt.Linear(1, name='value')
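A hedged sketch of the companion `initial_state` method such a recurrent policy-value module usually also defines (assumed; not part of the snippet above):

 def initial_state(self, batch_size: int):
     # Assumption: delegate to the DeepRNN core to build the starting recurrent state.
     return self._core.initial_state(batch_size)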
Example #3
    def test_mcts(self):
        # Create a fake environment to test with.
        num_actions = 5
        environment = fakes.DiscreteEnvironment(num_actions=num_actions,
                                                num_observations=10,
                                                obs_dtype=np.float32,
                                                episode_length=10)
        spec = specs.make_environment_spec(environment)

        network = snt.Sequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50]),
            networks.PolicyValueHead(spec.actions.num_values),
        ])
        model = simulator.Simulator(environment)
        optimizer = snt.optimizers.Adam(1e-3)

        # Construct the agent.
        agent = mcts.MCTS(environment_spec=spec,
                          network=network,
                          model=model,
                          optimizer=optimizer,
                          n_step=1,
                          discount=1.,
                          replay_capacity=100,
                          num_simulations=10,
                          batch_size=10)

        # Try running the environment loop. We have no assertions here because all
        # we care about is that the agent runs without raising any errors.
        loop = acme.EnvironmentLoop(environment, agent)
        loop.run(num_episodes=2)
Example #4
 def _make_network(spec) -> snt.Module:
     network = snt.Sequential([
         snt.Flatten(),
         snt.nets.MLP([50, 50, spec.actions.num_values]),
     ])
     tf2_utils.create_variables(network, [spec.observations])
     return network
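Hypothetical usage (the environment and spec names are assumptions):

# Hypothetical: build the Q-network from an Acme environment spec.
spec = specs.make_environment_spec(environment)
q_network = _make_network(spec)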
Example #5
def main(_):
    wb_run = wandb.init(project="offline-rl",
                        group=FLAGS.logs_tag,
                        id=FLAGS.wandb_id or str(int(time.time())),
                        config=FLAGS.flag_values_dict(),
                        reinit=FLAGS.acme_id is None) if FLAGS.wandb else None

    # Create an environment and grab the spec.
    environment, environment_spec = _build_environment(
        FLAGS.environment_name, max_steps=FLAGS.ep_max_len)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([128, 64, 32, environment_spec.actions.num_values])
    ])

    disp, disp_loop = _build_custom_loggers(wb_run)

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec,
                    network=network,
                    batch_size=FLAGS.batch_size,
                    n_step=FLAGS.n_step_returns,
                    epsilon=FLAGS.epsilon,
                    logger=disp)

    # Run the environment loop.
    loop = EnvironmentLoop(environment, agent, logger=disp_loop)
    loop.run(num_episodes=FLAGS.n_episodes)  # pytype: disable=attribute-error
    # Save a final checkpoint and record its location in Weights & Biases
    # (relies on the agent's private checkpointer attributes).
    agent._checkpointer.save(force=True)
    wandb.save(agent._checkpointer._checkpoint_dir)
    wandb.run.summary.update(
        {"checkpoint_dir": agent._checkpointer._checkpoint_dir})
Example #6
 def __init__(self, action_spec: specs.DiscreteArray):
     super().__init__(name='r2d2_test_network')
     self._net = snt.DeepRNN([
         snt.Flatten(),
         snt.LSTM(20),
         snt.nets.MLP([50, 50, action_spec.num_values])
     ])
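A hedged sketch of the forwarding methods such a test wrapper typically adds (assumed; not shown in the snippet above):

 def __call__(self, inputs, state):
     # Assumption: a single recurrent step, delegated to the wrapped DeepRNN.
     return self._net(inputs, state)

 def initial_state(self, batch_size: int):
     # Assumption: expose the wrapped core's initial recurrent state.
     return self._net.initial_state(batch_size)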
Example #7
def _make_network(action_spec: specs.DiscreteArray) -> snt.RNNCore:
    return snt.DeepRNN([
        snt.Flatten(),
        snt.LSTM(20),
        snt.nets.MLP([50, 50]),
        networks.PolicyValueHead(action_spec.num_values),
    ])
Example #8
    def __init__(self,
                 data_format: str = 'NHWC',
                 activation: Callable[[tf.Tensor], tf.Tensor] = tf.nn.relu,
                 output_dtype: tf.DType = tf.float32,
                 name: str = 'resnet_torso'):
        super().__init__(name=name)

        self._output_dtype = output_dtype

        # Create a Conv2D factory since we'll be making quite a few.
        gain = 2**0.5 if activation == tf.nn.relu else 1.

        def build_conv_layer(name: str,
                             output_channels: int = 32,
                             kernel_shape: Sequence[int] = (3, 3),
                             stride: int = 1):
            return snt.Conv2D(output_channels=output_channels,
                              kernel_shape=kernel_shape,
                              stride=stride,
                              padding='SAME',
                              data_format=data_format,
                              w_init=snt.initializers.Orthogonal(gain=gain,
                                                                 seed=None),
                              b_init=snt.initializers.Zeros(),
                              name=name)

        self._network = snt.Sequential([
            build_conv_layer('conv_0', stride=2), activation,
            build_conv_layer('conv_1', stride=1), activation,
            build_conv_layer('conv_2', stride=1), activation,
            build_conv_layer('conv_3', stride=1), activation,
            snt.Flatten()
        ])
Example #9
    def __init__(self, n_latent=4, kernel_size=4, name=None):
        super(VariationalAutoEncoder, self).__init__(name=name)

        self.n_latent = n_latent
        self.encoder = snt.Sequential([
            snt.Conv2D(4, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 250, 250, 4]
            snt.Conv2D(16, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 63, 63, 16]
            snt.Conv2D(32, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 16, 16, 32]
            snt.Conv2D(64, kernel_size, stride=2, padding='SAME'),
            tf.nn.relu,  # [b, 8, 8, 64]
            snt.Flatten()
        ])

        self.mn = snt.nets.MLP([n_latent], activation=tf.nn.relu)
        self.std = snt.nets.MLP([n_latent], activation=tf.nn.relu)

        self.decoder = snt.Sequential([
            snt.nets.MLP([8 * 8 * 64], activation=tf.nn.leaky_relu),
            snt.Reshape([8, 8, 64]),
            snt.Conv2DTranspose(64, kernel_size, stride=2, padding='SAME'),
            tf.nn.relu,  # [b, 16, 16, 64]
            snt.Conv2DTranspose(32, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 64, 64, 32]
            snt.Conv2DTranspose(16, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 256, 256, 16]
            snt.Conv2DTranspose(4, kernel_size, stride=4, padding='SAME'),
            tf.nn.relu,  # [b, 1024, 1024, 4]
            snt.Conv2D(1, kernel_size, padding='SAME')
        ])  # [b, 1024, 1024, 1]
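A hedged sketch of how the encoder, latent heads, and decoder are typically composed via the reparameterization trick (the `__call__` name and the sampling details are assumptions, not taken from the snippet above):

    def __call__(self, x):
        # Assumed forward pass: encode, sample a latent code, decode.
        hidden = self.encoder(x)
        mean = self.mn(hidden)
        log_std = self.std(hidden)
        eps = tf.random.normal(tf.shape(mean))
        z = mean + tf.exp(log_std) * eps  # reparameterization trick
        return self.decoder(z), mean, log_std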
Example #10
 def __init__(self, hidden_sizes: Sequence[int], num_actions: int):
     super().__init__(name='policy_value_net')
     self._torso = snt.Sequential([
         snt.Flatten(),
         snt.nets.MLP(hidden_sizes, activate_final=True),
     ])
     self._policy_head = snt.Linear(num_actions)
     self._value_head = snt.Linear(1)
Example #11
 def __call__(self, inputs: tf.Tensor, state: snt.LSTMState):
   flat_inputs = snt.Flatten()(inputs)
   embedding = self._torso(flat_inputs)
   lstm_output, next_state = self._core(embedding, state)
   embedding += tf.nn.relu(lstm_output)  # Note: skip connection.
   logits = self._policy_head(embedding)
   value = self._value_head(embedding)
   return (logits, value), next_state
Example #12
 def __init__(self, hidden_sizes: Sequence[int],
              action_spec: specs.DiscreteArray):
     super().__init__(name='policy_value_net')
     self._torso = snt.Sequential([
         snt.Flatten(),
         snt.nets.MLP(hidden_sizes, activate_final=True),
     ])
     self._policy_head = snt.Linear(action_spec.num_values)
     self._value_head = snt.Linear(1)
     self._action_dtype = action_spec.dtype
Example #13
def make_ensemble(num_actions: int,
                  num_ensemble: int = 20,
                  num_hidden_layers: int = 2,
                  num_units: int = 50,
                  prior_scale: float = 3.) -> Sequence[snt.Module]:
    """Convenience function to make an ensemble from flags."""
    output_sizes = [num_units] * num_hidden_layers + [num_actions]
    ensemble = []
    for _ in range(num_ensemble):
        network = snt.Sequential([
            snt.Flatten(),
            snt.nets.MLP(output_sizes),
        ])
        prior_network = snt.Sequential([
            snt.Flatten(),
            snt.nets.MLP(output_sizes),
        ])
        ensemble.append(NetworkWithPrior(network, prior_network, prior_scale))
    return ensemble
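`NetworkWithPrior` is used above but not defined in this snippet; a minimal sketch of the usual randomized-prior pattern it refers to (a trainable network plus a frozen, scaled prior network) might look like this:

class NetworkWithPrior(snt.Module):
    """Sketch (assumption): trainable network plus a fixed, scaled prior network."""

    def __init__(self, network: snt.Module, prior_network: snt.Module,
                 prior_scale: float = 1.):
        super().__init__(name='network_with_prior')
        self._network = network
        self._prior_network = prior_network
        self._prior_scale = prior_scale

    def __call__(self, inputs: tf.Tensor) -> tf.Tensor:
        q_values = self._network(inputs)
        # No gradients flow into the prior network; it stays fixed.
        prior_q_values = tf.stop_gradient(self._prior_network(inputs))
        return q_values + self._prior_scale * prior_q_values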
Example #14
def run(bsuite_id: str) -> str:
    """Runs a DQN agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    # Making the networks.
    hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
    online_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
    ])
    target_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
    ])

    agent = dqn.DQNTF2(
        action_spec=env.action_spec(),
        online_network=online_network,
        target_network=target_network,
        batch_size=FLAGS.batch_size,
        discount=FLAGS.discount,
        replay_capacity=FLAGS.replay_capacity,
        min_replay_size=FLAGS.min_replay_size,
        sgd_period=FLAGS.sgd_period,
        target_update_period=FLAGS.target_update_period,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        epsilon=FLAGS.epsilon,
        seed=FLAGS.seed,
    )

    experiment.run(
        agent=agent,
        environment=env,
        num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
        verbose=FLAGS.verbose)

    return bsuite_id
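A hedged sketch of the main() that typically maps run() over the full bsuite sweep (assumes `from bsuite import sweep`; flag parsing omitted):

def main(_):
    # Assumption: run the agent on every bsuite_id in the standard sweep.
    for bsuite_id in sweep.SWEEP:
        run(bsuite_id)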
Example #15
    def encode(self, image):
        """Encode the image observation."""

        convnet_output = self._convnet(image)

        # Store unflattened convnet output shape for use in decoder.
        self._convnet_output_shape = convnet_output.shape[1:]

        # Flatten convnet outputs and pass through final layer to get image code.
        return self._post_convnet_layer(snt.Flatten()(convnet_output))
Example #16
 def __init__(self):
     super().__init__(name='atari_torso')
     self._network = snt.Sequential([
         snt.Conv2D(32, [8, 8], [4, 4]),
         tf.nn.relu,
         snt.Conv2D(64, [4, 4], [2, 2]),
         tf.nn.relu,
         snt.Conv2D(64, [3, 3], [1, 1]),
         tf.nn.relu,
         snt.Flatten(),
     ])
Example #17
def make_dqn(num_actions: int):
    return snt.Sequential([
        snt.Conv2D(32, [3, 3], [2, 2]),
        tf.nn.relu,
        snt.Conv2D(32, [3, 3], [2, 2]),
        tf.nn.relu,
        snt.Conv2D(32, [3, 3], [2, 2]),
        tf.nn.relu,
        snt.Conv2D(32, [3, 3], [2, 2]),
        tf.nn.relu,
        snt.Flatten(),
        duelling.DuellingMLP(num_actions, hidden_sizes=[512]),
    ])
Example #18
def _preprocess_inputs(inputs: tf.Tensor, output_dtype: tf.DType) -> tf.Tensor:
  """Returns the `Tensor` corresponding to the preprocessed inputs."""
  rank = inputs.shape.rank
  if rank < 4:
    raise ValueError(
        'Input Tensor must have at least 4 dimensions (for '
        'batch size, height, width, and channels), but it only has '
        '{}'.format(rank))

  flattened_inputs = snt.Flatten(preserve_dims=3)(inputs)
  processed_inputs = tf.image.convert_image_dtype(
      flattened_inputs, dtype=output_dtype)
  return processed_inputs
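Hypothetical usage (shapes and dtype are illustrative only):

# Hypothetical: a [batch, height, width, channels] uint8 batch becomes float32
# in [0, 1]; for rank-4 input, Flatten(preserve_dims=3) leaves the shape unchanged.
frames = tf.zeros([8, 84, 84, 4], dtype=tf.uint8)
processed = _preprocess_inputs(frames, tf.float32)  # float32, shape [8, 84, 84, 4]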
Example #19
    def test_recurrent(self):
        environment = _make_fake_env()
        env_spec = specs.make_environment_spec(environment)

        network = snt.DeepRNN([
            snt.Flatten(),
            snt.Linear(env_spec.actions.num_values),
            lambda x: tf.argmax(x, axis=-1, output_type=env_spec.actions.dtype),
        ])

        actor = actors_tf2.RecurrentActor(network)
        loop = environment_loop.EnvironmentLoop(environment, actor)
        loop.run(20)
Example #20
    def test_feedforward(self):
        environment = _make_fake_env()
        env_spec = specs.make_environment_spec(environment)

        network = snt.Sequential([
            snt.Flatten(),
            snt.Linear(env_spec.actions.num_values),
            lambda x: tf.argmax(x, axis=-1, output_type=env_spec.actions.dtype),
        ])

        actor = actors_tf2.FeedForwardActor(network)
        loop = environment_loop.EnvironmentLoop(environment, actor)
        loop.run(20)
Example #21
    def __init__(self,
                 num_actions: int,
                 rnn_hidden_size: int = 10,
                 head_layers: Sequence[int] = (5,)):
        super().__init__(name='r2d2_network')

        self._net = snt.DeepRNN([
            snt.Flatten(),
            # LSTM core.
            snt.LSTM(rnn_hidden_size),
            # Dueling MLP head.
            networks.DuellingMLP(num_actions=num_actions,
                                 hidden_sizes=head_layers)
        ])
Example #22
  def __call__(self, state: tf.Tensor,
               action: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:

    embedded_state = snt.Flatten()(state)
    embedded_action = tf.one_hot(action, depth=self._num_actions)

    embedding = tf.concat([embedded_state, embedded_action], axis=-1)

    # Predict the next state, reward, and termination.
    next_state = self._state_network(embedding)
    reward = self._reward_network(embedding)
    discount_logits = self._discount_network(embedding)

    return next_state, reward, discount_logits
Example #23
def batch_concat(inputs: types.NestedTensor) -> tf.Tensor:
    """Concatenate a collection of Tensors while preserving the batch dimension.

  This takes a potentially nested collection of tensors, flattens everything
  but the batch (first) dimension, and concatenates along the resulting data
  (second) dimension.

  Args:
    inputs: a tensor or nested collection of tensors.

  Returns:
    A concatenated tensor which maintains the batch dimension but concatenates
    all other data along the flattened second dimension.
  """
    flat_leaves = tree.map_structure(snt.Flatten(), inputs)
    return tf.concat(tree.flatten(flat_leaves), axis=-1)
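Hypothetical usage with a nested observation (keys and shapes are illustrative):

# Hypothetical: each leaf is flattened to [batch, -1], then concatenated.
observation = {
    'camera': tf.zeros([16, 8, 8, 1]),
    'position': tf.zeros([16, 3]),
}
features = batch_concat(observation)  # shape [16, 8 * 8 * 1 + 3] = [16, 67]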
Example #24
 def __init__(self, action_spec):
     super().__init__(name='r2d2_test_network')
     self._net = snt.DeepRNN([
         snt.Conv2D(32, [8, 8], [4, 4]),
         tf.nn.relu,
         snt.Conv2D(64, [4, 4], [2, 2]),
         tf.nn.relu,
         snt.Conv2D(64, [3, 3], [1, 1]),
         tf.nn.relu,
         snt.Flatten(),
         snt.LSTM(20),
         tf.nn.relu,
         #snt.LSTM(160),
         #snt.nets.MLP([50, 50,512]),
         #tf.nn.relu,
         snt.nets.MLP([50, 50, action_spec])
     ])
Example #25
def main(_):
    # Create an environment and grab the spec.
    environment = bsuite.load_from_id('catch/0')
    environment = wrappers.SinglePrecisionWrapper(environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values])
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
Example #26
    def _build(self, inputs, prev_state):
        """Connects the DNC core into the graph.

    Args:
      inputs: Tensor input.
      prev_state: A `DNCState` tuple containing the fields `access_output`,
          `access_state` and `controller_state`. `access_output` is a 3-D Tensor
          of shape `[batch_size, num_reads, word_size]` containing read words.
          `access_state` is a tuple of the access module's state, and
          `controller_state` is a tuple of the controller module's state.

    Returns:
      A tuple `(output, next_state)` where `output` is a tensor and `next_state`
      is a `DNCState` tuple containing the fields `access_output`,
      `access_state`, and `controller_state`.
    """

        prev_access_output = prev_state.access_output
        prev_access_state = prev_state.access_state
        prev_controller_state = prev_state.controller_state

        batch_flatten = snt.Flatten()
        controller_input = tf.concat(
            [batch_flatten(inputs),
             batch_flatten(prev_access_output)], 1)

        controller_output, controller_state = self._controller(
            controller_input, prev_controller_state)

        controller_output = self._clip_if_enabled(controller_output)
        controller_state = tf.nest.map_structure(self._clip_if_enabled,
                                                 controller_state)

        access_output, access_state = self._access(controller_output,
                                                   prev_access_state)

        output = tf.concat([controller_output,
                            batch_flatten(access_output)], 1)
        output = snt.Linear(output_size=self._output_size.as_list()[0],
                            name='output_linear')(output)
        output = self._clip_if_enabled(output)

        return output, DNCState(access_output=access_output,
                                access_state=access_state,
                                controller_state=controller_state)
Example #27
    def __init__(self,
                 action_spec: specs.DiscreteArray,
                 name: Optional[Text] = None):
        super().__init__(name=name)

        # Spatial
        self.conv1 = snt.Conv2D(16, 1, 1, data_format="NHWC", name="conv_1")
        self.conv2 = snt.Conv2D(32, 3, 1, data_format="NHWC", name="conv_2")
        self.conv3 = snt.Conv2D(64, 3, 1, data_format="NHWC", name="conv_3")
        self.conv4 = snt.Conv2D(32, 3, 1, data_format="NHWC", name="conv_4")
        self.flatten = snt.Flatten()

        self.fc1 = snt.Linear(256, name="fc_1")

        # Flat
        self.flat = snt.nets.MLP([64, 64], name="mlp_1")
        self.rnn = snt.DeepRNN([
            snt.nets.MLP([50, 50], activate_final=True, name="mlp_2"),
            snt.GRU(512, name="gru"),
            networks.PolicyValueHead(action_spec.num_values)
        ])
Example #28
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values])
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
Example #29
    def __init__(
            self,
            num_channels: Sequence[int] = (16, 32, 32),  # default to IMPALA resnet.
            num_blocks: Sequence[int] = (2, 2, 2),  # default to IMPALA resnet.
            num_output_hidden: Sequence[int] = (256,),  # default to IMPALA resnet.
            conv_shape: Union[int, Sequence[int]] = 3,
            conv_stride: Union[int, Sequence[int]] = 1,
            pool_size: Union[int, Sequence[int]] = 3,
            pool_stride: Union[int, Sequence[int]] = 2,
            data_format: str = 'NHWC',
            activation: Callable[[tf.Tensor], tf.Tensor] = tf.nn.relu,
            output_dtype: tf.DType = tf.float32,
            name: str = 'resnet_torso'):
        super().__init__(name=name)

        self._output_dtype = output_dtype
        self._num_layers = len(num_blocks)

        # Create sequence of residual blocks.
        blocks = []
        for i in range(self._num_layers):
            blocks.append(
                ResidualBlockGroup(num_blocks[i],
                                   num_channels[i],
                                   conv_shape,
                                   conv_stride,
                                   pool_size,
                                   pool_stride,
                                   data_format=data_format,
                                   activation=activation))

        # Create output layer.
        out_layer = snt.nets.MLP(num_output_hidden, activation=activation)

        # Compose blocks and final layer.
        self._resnet = snt.Sequential(
            blocks + [activation, snt.Flatten(), out_layer])
Example #30
def main(_):
    # Create an environment and grab the spec.
    env_configs = {'players': FLAGS.num_players} if FLAGS.num_players else {}
    raw_environment = rl_environment.Environment(FLAGS.game, **env_configs)

    environment = open_spiel_wrapper.OpenSpielWrapper(raw_environment)
    environment = wrappers.SinglePrecisionWrapper(
        environment)  # type: open_spiel_wrapper.OpenSpielWrapper
    environment_spec = acme.make_environment_spec(environment)

    # Build the networks.
    networks = []
    policy_networks = []
    for _ in range(environment.num_players):
        network = legal_actions.MaskedSequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50, environment_spec.actions.num_values])
        ])
        policy_network = snt.Sequential([
            network,
            legal_actions.EpsilonGreedy(epsilon=0.1, threshold=-1e8)
        ])
        networks.append(network)
        policy_networks.append(policy_network)

    # Construct the agents.
    agents = []

    for network, policy_network in zip(networks, policy_networks):
        agents.append(
            dqn.DQN(environment_spec=environment_spec,
                    network=network,
                    policy_network=policy_network))

    # Run the environment loop.
    loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(
        environment, agents)
    loop.run(num_episodes=100000)