def make_networks(
    action_spec: types.NestedSpec,
    policy_layer_sizes: Sequence[int] = (10, 10),
    critic_layer_sizes: Sequence[int] = (10, 10),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, snt.Module]:
  """Creates networks used by the agent."""
  # One policy output per action dimension, squashed into [-1, 1] by tanh.
  action_size = np.prod(action_spec.shape, dtype=int)
  policy_torso = networks.LayerNormMLP(
      list(policy_layer_sizes) + [action_size])
  policy_network = snt.Sequential([policy_torso, tf.tanh])

  # Critic: concatenate observation/action, run an MLP trunk, then emit a
  # categorical value distribution over `num_atoms` atoms in [vmin, vmax].
  critic_trunk = networks.LayerNormMLP(critic_layer_sizes, activate_final=True)
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(critic_network=critic_trunk),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {'policy': policy_network, 'critic': critic_network}
def make_d4pg_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    vmin=-150.,
    vmax=150.,
    num_atoms=201):
  """Creates networks used by the d4pg agent."""
  action_size = int(np.prod(action_spec.shape, dtype=int))

  # Policy: layer-norm MLP whose final layer matches the action size,
  # followed by a tanh squashed into the action spec's bounds.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(list(policy_layer_sizes) + [action_size]),
      networks.TanhToSpec(action_spec),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(
              critic_layer_sizes, activate_final=True)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50,),
    critic_layer_sizes: Sequence[int] = (50,),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  # Gaussian policy with a tanh-squashed mean and a fixed scale of 0.3.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          action_size,
          tanh_mean=True,
          init_scale=0.3,
          fixed_scale=True,
          use_tfd_independent=False),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions;
  # actions are clipped into the spec before being fed to the critic.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(
              critic_layer_sizes, activate_final=True),
          action_network=networks.ClipToSpec(action_spec)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
def make_networks(
    action_spec: specs.Array,
    policy_layer_sizes: Sequence[int] = (300, 200),
    critic_layer_sizes: Sequence[int] = (400, 300),
) -> Dict[str, snt.Module]:
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  # Gaussian policy head on top of a layer-norm MLP torso.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(action_size),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  # The critic emits a categorical value distribution over 10 atoms in [0, 1].
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(list(critic_layer_sizes))),
      networks.DiscreteValuedHead(0., 1., 10),
  ])

  return {'policy': policy_network, 'critic': critic_network}
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, Union[snt.Module, Callable[[tf.Tensor], tf.Tensor]]]:
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  # Deterministic policy: the near-zero-initialized output layer keeps early
  # actions small; tanh rescales them into the action spec's bounds.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(action_size),
      networks.TanhToSpec(action_spec),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(),
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
def make_dmpo_networks(
    action_spec,
    policy_layer_sizes=(300, 200),
    critic_layer_sizes=(400, 300),
    vmin=-150.,
    vmax=150.,
    num_atoms=51,
):
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  # Stochastic policy: diagonal-Gaussian head on a layer-norm MLP torso.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(action_size),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions;
  # actions are clipped into the spec before evaluation.
  multiplexer = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [multiplexer, networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf_utils.batch_concat,
  }
def make_networks(
    action_spec: specs.BoundedArray,
    num_critic_heads: int,
    policy_layer_sizes: Sequence[int] = (50,),
    critic_layer_sizes: Sequence[int] = (50,),
    num_layers_shared: int = 1,
    distributional_critic: bool = True,
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          action_size, tanh_mean=False, init_scale=0.69),
  ])

  # A non-distributional critic needs an explicit scalar output layer.
  if not distributional_critic:
    critic_layer_sizes = list(critic_layer_sizes) + [1]

  # Optionally share the first `num_layers_shared` critic layers across heads.
  if num_layers_shared:
    critic_network_base = networks.LayerNormMLP(
        critic_layer_sizes[:num_layers_shared], activate_final=True)
  else:
    critic_network_base = None  # No layers are shared.

  # NOTE(review): each head uses the full `critic_layer_sizes`, so the sizes of
  # the shared prefix are repeated inside every head — confirm this is intended
  # rather than `critic_layer_sizes[num_layers_shared:]`.
  critic_network_heads = [
      snt.nets.MLP(
          critic_layer_sizes, activation=tf.nn.elu, activate_final=False)
      for _ in range(num_critic_heads)
  ]
  if distributional_critic:
    critic_network_heads = [
        snt.Sequential([h, networks.DiscreteValuedHead(vmin, vmax, num_atoms)])
        for h in critic_network_heads
    ]

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=critic_network_base,
          action_network=networks.ClipToSpec(action_spec)),
      networks.Multihead(network_heads=critic_network_heads),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50, 1024, 1024),
    critic_layer_sizes: Sequence[int] = (50, 1024, 1024),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, snt.Module]:
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)
  # All weight matrices use orthogonal initialization (fresh instance per use).
  orthogonal = snt.initializers.Orthogonal

  # Policy: relu MLP torso feeding an unbounded diagonal-Gaussian head.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(
          policy_layer_sizes,
          w_init=orthogonal(),
          activation=tf.nn.relu,
          activate_final=True),
      networks.MultivariateNormalDiagHead(
          action_size,
          tanh_mean=False,
          init_scale=1.0,
          fixed_scale=False,
          use_tfd_independent=True,
          w_init=orthogonal()),
  ])

  # Critic observation branch: linear projection, layer-norm over all
  # non-batch axes, then tanh.
  critic_obs_net = snt.Sequential([
      snt.Linear(critic_layer_sizes[0], w_init=orthogonal()),
      snt.LayerNorm(axis=slice(1, None), create_scale=True, create_offset=True),
      tf.nn.tanh,
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          observation_network=critic_obs_net,
          critic_network=snt.nets.MLP(
              critic_layer_sizes[1:],
              w_init=orthogonal(),
              activation=tf.nn.relu,
              activate_final=True),
          action_network=networks.ClipToSpec(action_spec)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms, w_init=orthogonal()),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': networks.DrQTorso(),
  }
def make_d4pg_agent(env_spec: specs.EnvironmentSpec, logger: Logger,
                    checkpoint_path: str, hyperparams: Dict):
  """Builds a D4PG agent (plus an evaluation actor) from hyperparameters.

  Args:
    env_spec: Full environment spec (observations/actions/rewards).
    logger: Logger forwarded to the agent.
    checkpoint_path: Directory the agent uses for checkpointing.
    hyperparams: Overrides merged on top of DEFAULT_PARAMS. Must supply (or
      default) 'policy_layers', 'critic_layers', 'atoms', 'policy_lr' and
      'critic_lr'; remaining entries are forwarded to the D4PG constructor.

  Returns:
    A D4PG agent, with an extra `eval_actor` attribute holding a
    FeedForwardActor for evaluation.
  """
  params = DEFAULT_PARAMS.copy()
  params.update(hyperparams)
  action_size = np.prod(env_spec.actions.shape, dtype=int).item()

  # Policy: MLP torso, then a near-zero-initialized output layer (so initial
  # actions are small) squashed into the action spec.
  # Fix: the torso no longer appends `action_size` and now activates its last
  # layer — previously the network ended in two stacked linear layers of the
  # same width, making `NearZeroInitializedLinear` redundant.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(
          layer_sizes=params.pop('policy_layers'), activate_final=True),
      networks.NearZeroInitializedLinear(output_size=action_size),
      networks.TanhToSpec(env_spec.actions),
  ])

  # Critic: distributional head over `atoms` atoms in [-100, 100].
  # Fix: removed the trailing 1-unit layer, which collapsed the features to a
  # single scalar before the distributional head could produce its atom logits.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(
              layer_sizes=params.pop('critic_layers'), activate_final=True)),
      networks.DiscreteValuedHead(
          vmin=-100.0, vmax=100.0, num_atoms=params.pop('atoms')),
  ])

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(tf.identity)

  # Evaluation actor runs observation network + policy end to end.
  actor = FeedForwardActor(
      policy_network=snt.Sequential([observation_network, policy_network]))

  # Create optimizers.
  policy_optimizer = Adam(params.pop('policy_lr'))
  critic_optimizer = Adam(params.pop('critic_lr'))

  # The learner updates the parameters (and initializes them).
  agent = D4PG(
      environment_spec=env_spec,
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      logger=logger,
      checkpoint_path=checkpoint_path,
      **params)
  agent.eval_actor = actor
  return agent
def make_value_func_dm_control(
    distributional: bool = True,
    layer_sizes: str = '512,512,256',
    vmin: float = 0.,
    vmax: float = 100.,
    num_atoms: int = 21,
) -> snt.Module:
  """Builds a critic: a layer-norm MLP over concatenated observation/action.

  The head is either a categorical value distribution over `num_atoms` atoms
  in [vmin, vmax] (when `distributional`) or a single linear scalar output.
  """
  sizes = [int(s) for s in layer_sizes.split(',')]
  head = (networks.DiscreteValuedHead(vmin, vmax, num_atoms)
          if distributional else snt.Linear(1))
  return snt.Sequential([
      networks.CriticMultiplexer(),
      networks.LayerNormMLP(sizes, activate_final=True),
      head,
  ])
def make_dmpo_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    vmin=-150.,
    vmax=150.,
    num_atoms=51,
    policy_init_std=1e-9,
    obs_network=None,
    binary_grip_action=False):
  """Creates networks used by the agent."""
  action_size = np.prod(action_spec.shape, dtype=int)

  if policy_layer_sizes:
    policy_network = snt.Sequential([
        networks.LayerNormMLP([int(size) for size in policy_layer_sizes]),
        networks.MultivariateNormalDiagHead(
            action_size, init_scale=policy_init_std, min_scale=1e-10),
    ])
  else:
    # Useful when initializing from a trained BC network.
    policy_network = snt.Sequential([
        ArmPolicyNormalDiagHead(
            binary_grip_action=binary_grip_action,
            num_dimensions=action_size,
            init_scale=policy_init_std,
            min_scale=1e-10)
    ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(critic_layer_sizes),
          action_network=networks.ClipToSpec(action_spec)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  # Default to flattening/concatenating observations when no network is given.
  if obs_network is None:
    obs_network = tf_utils.batch_concat

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': obs_network,
  }
def make_feed_forward_networks(
    action_spec: specs.BoundedArray,
    z_spec: specs.BoundedArray,
    policy_layer_sizes: Tuple[int, ...] = (256, 256),
    critic_layer_sizes: Tuple[int, ...] = (256, 256),
    discriminator_layer_sizes: Tuple[int, ...] = (256, 256),
    hierarchical_controller_layer_sizes: Tuple[int, ...] = (256, 256),
    vmin: float = -150.,  # Minimum value for the Critic distribution.
    vmax: float = 150.,  # Maximum value for the Critic distribution.
    num_atoms: int = 51,  # Number of atoms for the discrete value distribution.
) -> Dict[str, types.TensorTransformation]:
  """Builds feed-forward policy/critic/discriminator/controller networks."""
  action_size = np.prod(action_spec.shape, dtype=int)
  z_dim = np.prod(z_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(action_size),
  ])

  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(critic_layer_sizes),
          action_network=networks.ClipToSpec(action_spec)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  # The discriminator in DIAYN uses the same architecture as the critic; both
  # it and the hierarchical controller predict over the z (skill) space.
  discriminator_network = networks.LayerNormMLP(
      discriminator_layer_sizes + (z_dim,))
  hierarchical_controller_network = networks.LayerNormMLP(
      hierarchical_controller_layer_sizes + (z_dim,))

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
      'discriminator': discriminator_network,
      'hierarchical_controller': hierarchical_controller_network,
  }
def make_value_func_bsuite(
    environment_spec: EnvironmentSpec,
    distributional: bool = True,
    layer_sizes: str = '50,50',
    vmin: float = 0.,
    vmax: float = 100.,
    num_atoms: int = 21,
) -> snt.Module:
  """Builds a critic for bsuite's discrete-action environments."""
  sizes = [int(s) for s in layer_sizes.split(',')]
  # Discrete actions are one-hot encoded before being concatenated with the
  # observations inside the multiplexer.
  one_hot_action = functools.partial(
      tf.one_hot, depth=environment_spec.actions.num_values)
  head = (networks.DiscreteValuedHead(vmin, vmax, num_atoms)
          if distributional else snt.Linear(1))
  return snt.Sequential([
      networks.CriticMultiplexer(action_network=one_hot_action),
      snt.nets.MLP(sizes, activate_final=True),
      head,
  ])
def make_networks(action_spec: specs.BoundedArray):
  """Builds small policy/critic networks for testing."""
  action_size = np.prod(action_spec.shape, dtype=int)

  # Deterministic tanh policy with a near-zero-initialized output layer.
  policy_network = snt.Sequential([
      networks.LayerNormMLP([50], activate_final=True),
      networks.NearZeroInitializedLinear(action_size),
      networks.TanhToSpec(action_spec),
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP([50], activate_final=True)),
      networks.DiscreteValuedHead(-1., 1., 10),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, types.TensorTransformation]:
  """Creates networks used by the agent."""
  # Total number of action dimensions, derived from the action spec.
  action_size = np.prod(action_spec.shape, dtype=int)

  # Policy: layer-norm MLP torso with a diagonal-Gaussian head.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(action_size),
  ])

  # Critic: the multiplexer concatenates the observations/actions (clipping
  # actions into the spec); the head emits a categorical value distribution
  # over `num_atoms` atoms in [vmin, vmax].
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=networks.LayerNormMLP(critic_layer_sizes),
          action_network=networks.ClipToSpec(action_spec)),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  # The shared observation network is simply a state-less concat operation.
  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }
# Create the shared observation network; here simply a state-less operation. observation_network = tf2_utils.batch_concat # Create the deterministic policy network. policy_network = snt.Sequential([ networks.LayerNormMLP((256, 256, 256), activate_final=True), networks.NearZeroInitializedLinear(num_dimensions), networks.TanhToSpec(environment_spec.actions), ]) # Create the distributional critic network. critic_network = snt.Sequential([ # The multiplexer concatenates the observations/actions. networks.CriticMultiplexer(), networks.LayerNormMLP((512, 512, 256), activate_final=True), networks.DiscreteValuedHead(vmin=-150., vmax=150., num_atoms=51), ]) # Create a logger for the agent and environment loop. agent_logger = loggers.TerminalLogger(label='agent', time_delta=10.) env_loop_logger = loggers.TerminalLogger(label='env_loop', time_delta=10.) # Create the D4PG agent. agent = d4pg.D4PG(environment_spec=environment_spec, policy_network=policy_network, critic_network=critic_network, observation_network=observation_network, sigma=1.0, logger=agent_logger, checkpoint=False)