def default_agent(obs_spec: dm_env.specs.Array,
                  action_spec: dm_env.specs.DiscreteArray):
  """Initialize a DQN agent with default parameters."""
  del obs_spec  # Unused.
  hidden_units = [50, 50]
  online_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [action_spec.num_values]),
  ])
  target_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [action_spec.num_values]),
  ])
  return DQN(
      action_spec=action_spec,
      online_network=online_network,
      target_network=target_network,
      batch_size=32,
      discount=0.99,
      replay_capacity=10000,
      min_replay_size=100,
      sgd_period=1,
      target_update_period=4,
      optimizer=snt.optimizers.Adam(learning_rate=1e-3),
      epsilon=0.05,
      seed=42)
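# A minimal usage sketch, assuming bsuite is installed and the DQN class
# referenced above is in scope: `default_agent` only needs the environment's
# observation and action specs.
import bsuite

env = bsuite.load_from_id('catch/0')
agent = default_agent(env.observation_spec(), env.action_spec())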
def __init__(self, hidden_sizes: Sequence[int], num_actions: int):
  super().__init__(name='policy_value_net')
  self._num_actions = num_actions
  self._torso = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_sizes, activate_final=True, name='net'),
  ])
  self._core = snt.DeepRNN([
      snt.Flatten(),
      snt.LSTM(hidden_sizes[-1], name='rnn'),
  ])
  self._policy_head = snt.Linear(num_actions, name='policy')
  self._value_head = snt.Linear(1, name='value')
def test_mcts(self):
  # Create a fake environment to test with.
  num_actions = 5
  environment = fakes.DiscreteEnvironment(
      num_actions=num_actions,
      num_observations=10,
      obs_dtype=np.float32,
      episode_length=10)
  spec = specs.make_environment_spec(environment)

  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([50, 50]),
      networks.PolicyValueHead(spec.actions.num_values),
  ])
  model = simulator.Simulator(environment)
  optimizer = snt.optimizers.Adam(1e-3)

  # Construct the agent.
  agent = mcts.MCTS(
      environment_spec=spec,
      network=network,
      model=model,
      optimizer=optimizer,
      n_step=1,
      discount=1.,
      replay_capacity=100,
      num_simulations=10,
      batch_size=10)

  # Try running the environment loop. We have no assertions here because all
  # we care about is that the agent runs without raising any errors.
  loop = acme.EnvironmentLoop(environment, agent)
  loop.run(num_episodes=2)
def _make_network(spec) -> snt.Module:
  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([50, 50, spec.actions.num_values]),
  ])
  tf2_utils.create_variables(network, [spec.observations])
  return network
def main(_):
  wb_run = wandb.init(
      project="offline-rl",
      group=FLAGS.logs_tag,
      id=FLAGS.wandb_id or str(int(time.time())),
      config=FLAGS.flag_values_dict(),
      reinit=FLAGS.acme_id is None) if FLAGS.wandb else None

  # Create an environment and grab the spec.
  environment, environment_spec = _build_environment(
      FLAGS.environment_name, max_steps=FLAGS.ep_max_len)

  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([128, 64, 32, environment_spec.actions.num_values])
  ])

  disp, disp_loop = _build_custom_loggers(wb_run)

  # Construct the agent.
  agent = dqn.DQN(
      environment_spec=environment_spec,
      network=network,
      batch_size=FLAGS.batch_size,
      n_step=FLAGS.n_step_returns,
      epsilon=FLAGS.epsilon,
      logger=disp)

  # Run the environment loop.
  loop = EnvironmentLoop(environment, agent, logger=disp_loop)
  loop.run(num_episodes=FLAGS.n_episodes)  # pytype: disable=attribute-error

  agent._checkpointer.save(force=True)
  wandb.save(agent._checkpointer._checkpoint_dir)
  wandb.run.summary.update(
      {"checkpoint_dir": agent._checkpointer._checkpoint_dir})
def __init__(self, action_spec: specs.DiscreteArray):
  super().__init__(name='r2d2_test_network')
  self._net = snt.DeepRNN([
      snt.Flatten(),
      snt.LSTM(20),
      snt.nets.MLP([50, 50, action_spec.num_values])
  ])
def _make_network(action_spec: specs.DiscreteArray) -> snt.RNNCore:
  return snt.DeepRNN([
      snt.Flatten(),
      snt.LSTM(20),
      snt.nets.MLP([50, 50]),
      networks.PolicyValueHead(action_spec.num_values),
  ])
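# Hedged usage sketch (the spec, shapes, and batch size below are
# assumptions): unroll the recurrent policy-value network over a short fake
# observation sequence with snt.static_unroll.
action_spec = specs.DiscreteArray(num_values=5)
core = _make_network(action_spec)
observations = tf.ones([3, 2, 10])  # [time, batch, features].
initial_state = core.initial_state(batch_size=2)
(logits, values), final_state = snt.static_unroll(
    core, observations, initial_state)  # logits: [3, 2, 5], values: [3, 2, 1].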
def __init__(self,
             data_format: str = 'NHWC',
             activation: Callable[[tf.Tensor], tf.Tensor] = tf.nn.relu,
             output_dtype: tf.DType = tf.float32,
             name: str = 'resnet_torso'):
  super().__init__(name=name)
  self._output_dtype = output_dtype

  # Create a Conv2D factory since we'll be making quite a few.
  gain = 2**0.5 if activation == tf.nn.relu else 1.

  def build_conv_layer(name: str,
                       output_channels: int = 32,
                       kernel_shape: Sequence[int] = (3, 3),
                       stride: int = 1):
    return snt.Conv2D(
        output_channels=output_channels,
        kernel_shape=kernel_shape,
        stride=stride,
        padding='SAME',
        data_format=data_format,
        w_init=snt.initializers.Orthogonal(gain=gain, seed=None),
        b_init=snt.initializers.Zeros(),
        name=name)

  self._network = snt.Sequential([
      build_conv_layer('conv_0', stride=2), activation,
      build_conv_layer('conv_1', stride=1), activation,
      build_conv_layer('conv_2', stride=1), activation,
      build_conv_layer('conv_3', stride=1), activation,
      snt.Flatten()
  ])
def __init__(self, n_latent=4, kernel_size=4, name=None):
  super(VariationalAutoEncoder, self).__init__(name=name)
  self.n_latent = n_latent
  self.encoder = snt.Sequential([
      snt.Conv2D(4, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 250, 250, 4]
      snt.Conv2D(16, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 63, 63, 16]
      snt.Conv2D(32, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 16, 16, 32]
      snt.Conv2D(64, kernel_size, stride=2, padding='SAME'),
      tf.nn.relu,  # [b, 8, 8, 64]
      snt.Flatten()
  ])
  self.mn = snt.nets.MLP([n_latent], activation=tf.nn.relu)
  self.std = snt.nets.MLP([n_latent], activation=tf.nn.relu)
  self.decoder = snt.Sequential([
      snt.nets.MLP([8 * 8 * 64], activation=tf.nn.leaky_relu),
      snt.Reshape([8, 8, 64]),
      snt.Conv2DTranspose(64, kernel_size, stride=2, padding='SAME'),
      tf.nn.relu,  # [b, 16, 16, 64]
      snt.Conv2DTranspose(32, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 64, 64, 32]
      snt.Conv2DTranspose(16, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 256, 256, 16]
      snt.Conv2DTranspose(4, kernel_size, stride=4, padding='SAME'),
      tf.nn.relu,  # [b, 1024, 1024, 4]
      snt.Conv2D(1, kernel_size, padding='SAME')  # [b, 1024, 1024, 1]
  ])
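# A hedged sketch (not from the source) of wiring these modules together with
# the usual reparameterization trick; the 1000x1000x1 input resolution is an
# assumption read off the encoder's shape comments.
vae = VariationalAutoEncoder(n_latent=4)
images = tf.ones([2, 1000, 1000, 1])
features = vae.encoder(images)  # [2, 8 * 8 * 64].
mean, std = vae.mn(features), vae.std(features)
latent = mean + std * tf.random.normal(tf.shape(mean))  # Sample z ~ N(mean, std).
reconstruction = vae.decoder(latent)  # [2, 1024, 1024, 1].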
def __init__(self, hidden_sizes: Sequence[int], num_actions: int):
  super().__init__(name='policy_value_net')
  self._torso = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_sizes, activate_final=True),
  ])
  self._policy_head = snt.Linear(num_actions)
  self._value_head = snt.Linear(1)
def __call__(self, inputs: tf.Tensor, state: snt.LSTMState):
  flat_inputs = snt.Flatten()(inputs)
  embedding = self._torso(flat_inputs)
  lstm_output, next_state = self._core(embedding, state)
  embedding += tf.nn.relu(lstm_output)  # Note: skip connection.
  logits = self._policy_head(embedding)
  value = self._value_head(embedding)
  return (logits, value), next_state
def __init__(self, hidden_sizes: Sequence[int],
             action_spec: specs.DiscreteArray):
  super().__init__(name='policy_value_net')
  self._torso = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_sizes, activate_final=True),
  ])
  self._policy_head = snt.Linear(action_spec.num_values)
  self._value_head = snt.Linear(1)
  self._action_dtype = action_spec.dtype
def make_ensemble(num_actions: int,
                  num_ensemble: int = 20,
                  num_hidden_layers: int = 2,
                  num_units: int = 50,
                  prior_scale: float = 3.) -> Sequence[snt.Module]:
  """Convenience function to make an ensemble from flags."""
  output_sizes = [num_units] * num_hidden_layers + [num_actions]
  ensemble = []
  for _ in range(num_ensemble):
    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(output_sizes),
    ])
    prior_network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP(output_sizes),
    ])
    ensemble.append(NetworkWithPrior(network, prior_network, prior_scale))
  return ensemble
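# Sketch of querying the ensemble (the action count, batch size, and
# observation shape are assumptions): each member maps observations to
# per-action values, with the fixed prior network mixed in by
# NetworkWithPrior.
ensemble = make_ensemble(num_actions=4, num_ensemble=3)
observations = tf.ones([16, 8, 8])
q_values = [member(observations) for member in ensemble]  # Three [16, 4] tensors.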
def run(bsuite_id: str) -> str:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""
  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  # Making the networks.
  hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
  online_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])
  target_network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])

  agent = dqn.DQNTF2(
      action_spec=env.action_spec(),
      online_network=online_network,
      target_network=target_network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
      epsilon=FLAGS.epsilon,
      seed=FLAGS.seed,
  )

  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
      verbose=FLAGS.verbose)

  return bsuite_id
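# Hedged sketch: `run` is typically mapped over a sweep of bsuite ids, for
# example the catch sweep (assumes the standard bsuite.sweep module).
from bsuite import sweep

for bsuite_id in sweep.CATCH:
  run(bsuite_id)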
def encode(self, image):
  """Encode the image observation."""
  convnet_output = self._convnet(image)
  # Store unflattened convnet output shape for use in decoder.
  self._convnet_output_shape = convnet_output.shape[1:]
  # Flatten convnet outputs and pass through final layer to get image code.
  return self._post_convnet_layer(snt.Flatten()(convnet_output))
def __init__(self):
  super().__init__(name='atari_torso')
  self._network = snt.Sequential([
      snt.Conv2D(32, [8, 8], [4, 4]),
      tf.nn.relu,
      snt.Conv2D(64, [4, 4], [2, 2]),
      tf.nn.relu,
      snt.Conv2D(64, [3, 3], [1, 1]),
      tf.nn.relu,
      snt.Flatten(),
  ])
def make_dqn(num_actions: int):
  return snt.Sequential([
      snt.Conv2D(32, [3, 3], [2, 2]),
      tf.nn.relu,
      snt.Conv2D(32, [3, 3], [2, 2]),
      tf.nn.relu,
      snt.Conv2D(32, [3, 3], [2, 2]),
      tf.nn.relu,
      snt.Conv2D(32, [3, 3], [2, 2]),
      tf.nn.relu,
      snt.Flatten(),
      duelling.DuellingMLP(num_actions, hidden_sizes=[512]),
  ])
def _preprocess_inputs(inputs: tf.Tensor, output_dtype: tf.DType) -> tf.Tensor:
  """Returns the `Tensor` corresponding to the preprocessed inputs."""
  rank = inputs.shape.rank
  if rank < 4:
    raise ValueError('Input Tensor must have at least 4 dimensions (for '
                     'batch size, height, width, and channels), but it only '
                     'has {}'.format(rank))

  flattened_inputs = snt.Flatten(preserve_dims=3)(inputs)
  processed_inputs = tf.image.convert_image_dtype(
      flattened_inputs, dtype=output_dtype)
  return processed_inputs
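# Quick illustrative check (hypothetical Atari-style shapes): uint8 pixels in
# [0, 255] come out as float32 in [0, 1], with the shape unchanged for a
# rank-4 input.
frames = tf.zeros([2, 84, 84, 4], dtype=tf.uint8)
processed = _preprocess_inputs(frames, tf.float32)
assert processed.dtype == tf.float32
assert processed.shape == (2, 84, 84, 4)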
def test_recurrent(self):
  environment = _make_fake_env()
  env_spec = specs.make_environment_spec(environment)
  network = snt.DeepRNN([
      snt.Flatten(),
      snt.Linear(env_spec.actions.num_values),
      lambda x: tf.argmax(x, axis=-1, output_type=env_spec.actions.dtype),
  ])
  actor = actors_tf2.RecurrentActor(network)
  loop = environment_loop.EnvironmentLoop(environment, actor)
  loop.run(20)
def test_feedforward(self):
  environment = _make_fake_env()
  env_spec = specs.make_environment_spec(environment)
  network = snt.Sequential([
      snt.Flatten(),
      snt.Linear(env_spec.actions.num_values),
      lambda x: tf.argmax(x, axis=-1, output_type=env_spec.actions.dtype),
  ])
  actor = actors_tf2.FeedForwardActor(network)
  loop = environment_loop.EnvironmentLoop(environment, actor)
  loop.run(20)
def __init__(self,
             num_actions: int,
             rnn_hidden_size: int = 10,
             head_layers: Sequence[int] = (5,)):
  super().__init__(name='r2d2_network')
  self._net = snt.DeepRNN([
      snt.Flatten(),
      # LSTM core.
      snt.LSTM(rnn_hidden_size),
      # Dueling MLP head.
      networks.DuellingMLP(num_actions=num_actions, hidden_sizes=head_layers)
  ])
def __call__(self, state: tf.Tensor,
             action: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  embedded_state = snt.Flatten()(state)
  embedded_action = tf.one_hot(action, depth=self._num_actions)
  embedding = tf.concat([embedded_state, embedded_action], axis=-1)

  # Predict the next state, reward, and termination.
  next_state = self._state_network(embedding)
  reward = self._reward_network(embedding)
  discount_logits = self._discount_network(embedding)

  return next_state, reward, discount_logits
def batch_concat(inputs: types.NestedTensor) -> tf.Tensor:
  """Concatenate a collection of Tensors while preserving the batch dimension.

  This takes a potentially nested collection of tensors, flattens everything
  but the batch (first) dimension, and concatenates along the resulting data
  (second) dimension.

  Args:
    inputs: a tensor or nested collection of tensors.

  Returns:
    A concatenated tensor which maintains the batch dimension but concatenates
    all other data along the flattened second dimension.
  """
  flat_leaves = tree.map_structure(snt.Flatten(), inputs)
  return tf.concat(tree.flatten(flat_leaves), axis=-1)
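# Usage sketch (the nested observation below is hypothetical): per-leaf
# tensors are flattened to [batch, -1] and joined along the feature axis.
observations = {
    'pixels': tf.ones([8, 4, 4, 3]),  # Flattened to [8, 48].
    'state': tf.ones([8, 5]),  # Already flat: [8, 5].
}
features = batch_concat(observations)  # -> [8, 53].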
def __init__(self, action_spec):
  super().__init__(name='r2d2_test_network')
  self._net = snt.DeepRNN([
      snt.Conv2D(32, [8, 8], [4, 4]),
      tf.nn.relu,
      snt.Conv2D(64, [4, 4], [2, 2]),
      tf.nn.relu,
      snt.Conv2D(64, [3, 3], [1, 1]),
      tf.nn.relu,
      snt.Flatten(),
      snt.LSTM(20),
      tf.nn.relu,
      snt.nets.MLP([50, 50, action_spec])
  ])
def main(_):
  # Create an environment and grab the spec.
  environment = bsuite.load_from_id('catch/0')
  environment = wrappers.SinglePrecisionWrapper(environment)
  environment_spec = specs.make_environment_spec(environment)

  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([50, 50, environment_spec.actions.num_values])
  ])

  # Construct the agent.
  agent = dqn.DQN(environment_spec=environment_spec, network=network)

  # Run the environment loop.
  loop = acme.EnvironmentLoop(environment, agent)
  loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
def _build(self, inputs, prev_state):
  """Connects the DNC core into the graph.

  Args:
    inputs: Tensor input.
    prev_state: A `DNCState` tuple containing the fields `access_output`,
        `access_state` and `controller_state`. `access_output` is a 3-D Tensor
        of shape `[batch_size, num_reads, word_size]` containing read words.
        `access_state` is a tuple of the access module's state, and
        `controller_state` is a tuple of controller module's state.

  Returns:
    A tuple `(output, next_state)` where `output` is a tensor and `next_state`
    is a `DNCState` tuple containing the fields `access_output`,
    `access_state`, and `controller_state`.
  """
  prev_access_output = prev_state.access_output
  prev_access_state = prev_state.access_state
  prev_controller_state = prev_state.controller_state

  batch_flatten = snt.Flatten()
  controller_input = tf.concat(
      [batch_flatten(inputs), batch_flatten(prev_access_output)], 1)

  controller_output, controller_state = self._controller(
      controller_input, prev_controller_state)

  controller_output = self._clip_if_enabled(controller_output)
  controller_state = tf.nest.map_structure(self._clip_if_enabled,
                                           controller_state)

  access_output, access_state = self._access(controller_output,
                                             prev_access_state)

  output = tf.concat([controller_output, batch_flatten(access_output)], 1)
  output = snt.Linear(
      output_size=self._output_size.as_list()[0],
      name='output_linear')(output)
  output = self._clip_if_enabled(output)

  return output, DNCState(
      access_output=access_output,
      access_state=access_state,
      controller_state=controller_state)
def __init__(self,
             action_spec: specs.DiscreteArray,
             name: Optional[Text] = None):
  super().__init__(name=name)
  # Spatial
  self.conv1 = snt.Conv2D(16, 1, 1, data_format="NHWC", name="conv_1")
  self.conv2 = snt.Conv2D(32, 3, 1, data_format="NHWC", name="conv_2")
  self.conv3 = snt.Conv2D(64, 3, 1, data_format="NHWC", name="conv_3")
  self.conv4 = snt.Conv2D(32, 3, 1, data_format="NHWC", name="conv_4")
  self.flatten = snt.Flatten()
  self.fc1 = snt.Linear(256, name="fc_1")
  # Flat
  self.flat = snt.nets.MLP([64, 64], name="mlp_1")
  self.rnn = snt.DeepRNN([
      snt.nets.MLP([50, 50], activate_final=True, name="mlp_2"),
      snt.GRU(512, name="gru"),
      networks.PolicyValueHead(action_spec.num_values)
  ])
def main(_):
  # Create an environment and grab the spec.
  raw_environment = bsuite.load_and_record_to_csv(
      bsuite_id=FLAGS.bsuite_id,
      results_dir=FLAGS.results_dir,
      overwrite=FLAGS.overwrite,
  )
  environment = wrappers.SinglePrecisionWrapper(raw_environment)
  environment_spec = specs.make_environment_spec(environment)

  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([50, 50, environment_spec.actions.num_values])
  ])

  # Construct the agent.
  agent = dqn.DQN(environment_spec=environment_spec, network=network)

  # Run the environment loop.
  loop = acme.EnvironmentLoop(environment, agent)
  loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
def __init__(self,
             num_channels: Sequence[int] = (16, 32, 32),  # default to IMPALA resnet.
             num_blocks: Sequence[int] = (2, 2, 2),  # default to IMPALA resnet.
             num_output_hidden: Sequence[int] = (256,),  # default to IMPALA resnet.
             conv_shape: Union[int, Sequence[int]] = 3,
             conv_stride: Union[int, Sequence[int]] = 1,
             pool_size: Union[int, Sequence[int]] = 3,
             pool_stride: Union[int, Sequence[int]] = 2,
             data_format: str = 'NHWC',
             activation: Callable[[tf.Tensor], tf.Tensor] = tf.nn.relu,
             output_dtype: tf.DType = tf.float32,
             name: str = 'resnet_torso'):
  super().__init__(name=name)
  self._output_dtype = output_dtype
  self._num_layers = len(num_blocks)

  # Create sequence of residual blocks.
  blocks = []
  for i in range(self._num_layers):
    blocks.append(
        ResidualBlockGroup(
            num_blocks[i],
            num_channels[i],
            conv_shape,
            conv_stride,
            pool_size,
            pool_stride,
            data_format=data_format,
            activation=activation))

  # Create output layer.
  out_layer = snt.nets.MLP(num_output_hidden, activation=activation)

  # Compose blocks and final layer.
  self._resnet = snt.Sequential(
      blocks + [activation, snt.Flatten(), out_layer])
def main(_):
  # Create an environment and grab the spec.
  env_configs = {'players': FLAGS.num_players} if FLAGS.num_players else {}
  raw_environment = rl_environment.Environment(FLAGS.game, **env_configs)
  environment = open_spiel_wrapper.OpenSpielWrapper(raw_environment)
  environment = wrappers.SinglePrecisionWrapper(
      environment)  # type: open_spiel_wrapper.OpenSpielWrapper
  environment_spec = acme.make_environment_spec(environment)

  # Build the networks.
  networks = []
  policy_networks = []
  for _ in range(environment.num_players):
    network = legal_actions.MaskedSequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values])
    ])
    policy_network = snt.Sequential([
        network,
        legal_actions.EpsilonGreedy(epsilon=0.1, threshold=-1e8)
    ])
    networks.append(network)
    policy_networks.append(policy_network)

  # Construct the agents.
  agents = []
  for network, policy_network in zip(networks, policy_networks):
    agents.append(
        dqn.DQN(
            environment_spec=environment_spec,
            network=network,
            policy_network=policy_network))

  # Run the environment loop.
  loop = open_spiel_environment_loop.OpenSpielEnvironmentLoop(
      environment, agents)
  loop.run(num_episodes=100000)