def __init__(self,
             action_spec,
             actor_network: Network,
             critic_network: Network,
             critic_loss=None,
             target_entropy=None,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             gradient_clipping=None,
             train_step_counter=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """Create a SacAlgorithm.

    Args:
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network): The network will be called with
            call(observation, step_type).
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        critic_loss (None|OneStepTDLoss): an object for calculating the critic
            loss. If None, a default OneStepTDLoss will be used.
        initial_log_alpha (float): initial value for the variable log_alpha.
        target_entropy (float|None): The target average policy entropy, for
            updating alpha.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
        alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
        gradient_clipping (float): Norm length to clip gradients.
        train_step_counter (tf.Variable): An optional counter to increment
            every time a new training iteration is started. If None, it will
            use tf.summary.experimental.get_step(). If this is still None, a
            counter will be created.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_network1 = critic_network
    critic_network2 = critic_network.copy(name='CriticNetwork2')
    log_alpha = tfa_common.create_variable(
        name='log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)
    super().__init__(
        action_spec,
        train_state_spec=SacState(
            share=SacShareState(actor=actor_network.state_spec),
            actor=SacActorState(
                critic1=critic_network.state_spec,
                critic2=critic_network.state_spec),
            critic=SacCriticState(
                critic1=critic_network.state_spec,
                critic2=critic_network.state_spec,
                target_critic1=critic_network.state_spec,
                target_critic2=critic_network.state_spec)),
        action_distribution_spec=actor_network.output_spec,
        predict_state_spec=actor_network.state_spec,
        optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
        get_trainable_variables_func=[
            lambda: actor_network.trainable_variables,
            lambda: (critic_network1.trainable_variables +
                     critic_network2.trainable_variables),
            lambda: [log_alpha]
        ],
        gradient_clipping=gradient_clipping,
        train_step_counter=train_step_counter,
        debug_summaries=debug_summaries,
        name=name)
    self._log_alpha = log_alpha
    self._actor_network = actor_network
    self._critic_network1 = critic_network1
    self._critic_network2 = critic_network2
    self._target_critic_network1 = self._critic_network1.copy(
        name='TargetCriticNetwork1')
    self._target_critic_network2 = self._critic_network2.copy(
        name='TargetCriticNetwork2')
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss
    flat_action_spec = tf.nest.flatten(self._action_spec)
    self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
    if target_entropy is None:
        target_entropy = np.sum(
            list(
                map(dist_utils.calc_default_target_entropy,
                    flat_action_spec)))
    self._target_entropy = target_entropy
    self._dqda_clipping = dqda_clipping
    self._update_target = common.get_target_updater(
        models=[self._critic_network1, self._critic_network2],
        target_models=[
            self._target_critic_network1, self._target_critic_network2
        ],
        tau=target_update_tau,
        period=target_update_period)
    tfa_common.soft_variables_update(
        self._critic_network1.variables,
        self._target_critic_network1.variables,
        tau=1.0)
    tfa_common.soft_variables_update(
        self._critic_network2.variables,
        self._target_critic_network2.variables,
        tau=1.0)
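# Illustrative wiring only: a minimal sketch of how the constructor arguments
# above fit together, assuming the caller already has `action_spec`, an
# `actor_net` and a `critic_net` built elsewhere. The helper name and its
# parameters are hypothetical; only the SacAlgorithm keyword arguments come
# from the constructor itself.
def _example_build_sac(action_spec, actor_net, critic_net):
    """Hypothetical helper showing one way to construct SacAlgorithm."""
    return SacAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        # Separate optimizers for actor, critic and the entropy weight alpha.
        actor_optimizer=tf.optimizers.Adam(learning_rate=3e-4),
        critic_optimizer=tf.optimizers.Adam(learning_rate=3e-4),
        alpha_optimizer=tf.optimizers.Adam(learning_rate=3e-4),
        # target_entropy=None lets the algorithm derive a default target
        # entropy from the action spec (see the body above).
        target_entropy=None,
        target_update_tau=0.05)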
def __init__(self,
             action_spec,
             actor_network: Network,
             critic_network: Network,
             ou_stddev=0.2,
             ou_damping=0.15,
             critic_loss=None,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             gradient_clipping=None,
             train_step_counter=None,
             debug_summaries=False,
             name="DdpgAlgorithm"):
    """
    Args:
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network): The network will be called with
            call(observation, step_type).
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        ou_stddev (float): Standard deviation for the Ornstein-Uhlenbeck (OU)
            noise added in the default collect policy.
        ou_damping (float): Damping factor for the OU noise added in the
            default collect policy.
        critic_loss (None|OneStepTDLoss): an object for calculating the critic
            loss. If None, a default OneStepTDLoss will be used.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
        gradient_clipping (float): Norm length to clip gradients.
        train_step_counter (tf.Variable): An optional counter to increment
            every time a new training iteration is started. If None, it will
            use tf.summary.experimental.get_step(). If this is still None, a
            counter will be created.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    train_state_spec = DdpgState(
        actor=DdpgActorState(
            actor=actor_network.state_spec,
            critic=critic_network.state_spec),
        critic=DdpgCriticState(
            critic=critic_network.state_spec,
            target_actor=actor_network.state_spec,
            target_critic=critic_network.state_spec))
    super().__init__(
        action_spec,
        train_state_spec=train_state_spec,
        action_distribution_spec=action_spec,
        optimizer=[actor_optimizer, critic_optimizer],
        get_trainable_variables_func=[
            lambda: actor_network.trainable_variables,
            lambda: critic_network.trainable_variables
        ],
        gradient_clipping=gradient_clipping,
        train_step_counter=train_step_counter,
        debug_summaries=debug_summaries,
        name=name)
    self._actor_network = actor_network
    self._critic_network = critic_network
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._target_actor_network = actor_network.copy(
        name='target_actor_network')
    self._target_critic_network = critic_network.copy(
        name='target_critic_network')
    self._ou_stddev = ou_stddev
    self._ou_damping = ou_damping
    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss
    self._ou_process = self._create_ou_process(ou_stddev, ou_damping)
    self._update_target = common.get_target_updater(
        models=[self._actor_network, self._critic_network],
        target_models=[
            self._target_actor_network, self._target_critic_network
        ],
        tau=target_update_tau,
        period=target_update_period)
    self._dqda_clipping = dqda_clipping
    tfa_common.soft_variables_update(
        self._critic_network.variables,
        self._target_critic_network.variables,
        tau=1.0)
    tfa_common.soft_variables_update(
        self._actor_network.variables,
        self._target_actor_network.variables,
        tau=1.0)
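# A minimal construction sketch, mirroring the SAC example above. The helper
# and the `actor_net`/`critic_net` arguments are hypothetical; the keyword
# arguments are the ones documented in this constructor.
def _example_build_ddpg(action_spec, actor_net, critic_net):
    """Hypothetical helper showing one way to construct DdpgAlgorithm."""
    return DdpgAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        # OU exploration noise used by the default collect policy.
        ou_stddev=0.2,
        ou_damping=0.15,
        actor_optimizer=tf.optimizers.Adam(learning_rate=1e-4),
        critic_optimizer=tf.optimizers.Adam(learning_rate=1e-3),
        target_update_tau=0.05,
        target_update_period=1)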
def __init__(self,
             observation_spec,
             action_spec: BoundedTensorSpec,
             actor_network_ctor=ActorNetwork,
             critic_network_ctor=CriticNetwork,
             use_parallel_network=False,
             reward_weights=None,
             env=None,
             config: TrainerConfig = None,
             ou_stddev=0.2,
             ou_damping=0.15,
             critic_loss_ctor=None,
             num_critic_replicas=1,
             target_update_tau=0.05,
             target_update_period=1,
             rollout_random_action=0.,
             dqda_clipping=None,
             action_l2=0,
             actor_optimizer=None,
             critic_optimizer=None,
             debug_summaries=False,
             name="DdpgAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network_ctor (Callable): Function to construct the actor
            network. ``actor_network_ctor`` needs to accept
            ``input_tensor_spec`` and ``action_spec`` as its arguments and
            return an actor network. The constructed network will be called
            with ``forward(observation, state)``.
        critic_network_ctor (Callable): Function to construct the critic
            network. ``critic_network_ctor`` needs to accept
            ``input_tensor_spec`` which is a tuple of ``(observation_spec,
            action_spec)``. The constructed network will be called with
            ``forward((observation, action), state)``.
        use_parallel_network (bool): whether to use parallel network for
            calculating critics.
        reward_weights (list[float]): this is only used when the reward is
            multidimensional. In that case, the weighted sum of the q values
            is used for training the actor.
        num_critic_replicas (int): number of critics to be used. Default is 1.
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple simulations
            simultaneously. ``env`` only needs to be provided to the root
            algorithm.
        config (TrainerConfig): config for training. ``config`` only needs to
            be provided to the algorithm which performs ``train_iter()`` by
            itself.
        ou_stddev (float): Standard deviation for the Ornstein-Uhlenbeck (OU)
            noise added in the default collect policy.
        ou_damping (float): Damping factor for the OU noise added in the
            default collect policy.
        critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
            constructor. If ``None``, a default ``OneStepTDLoss`` will be
            used.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        rollout_random_action (float): the probability of taking a uniform
            random action during a ``rollout_step()``. 0 means always directly
            taking actions added with OU noise and 1 means always sampling
            uniformly random actions. A bigger value results in more
            exploration during rollout.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            ``[-dqda_clipping, dqda_clipping]``. Does not perform clipping if
            ``dqda_clipping == 0``.
        action_l2 (float): weight of squared action l2-norm on actor loss.
        actor_optimizer (torch.optim.optimizer): The optimizer for actor.
        critic_optimizer (torch.optim.optimizer): The optimizer for critic.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_network = critic_network_ctor(
        input_tensor_spec=(observation_spec, action_spec))
    actor_network = actor_network_ctor(
        input_tensor_spec=observation_spec, action_spec=action_spec)
    if use_parallel_network:
        critic_networks = critic_network.make_parallel(num_critic_replicas)
    else:
        critic_networks = alf.networks.NaiveParallelNetwork(
            critic_network, num_critic_replicas)
    self._action_l2 = action_l2
    train_state_spec = DdpgState(
        actor=DdpgActorState(
            actor=actor_network.state_spec,
            critics=critic_networks.state_spec),
        critics=DdpgCriticState(
            critics=critic_networks.state_spec,
            target_actor=actor_network.state_spec,
            target_critics=critic_networks.state_spec))
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=train_state_spec,
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)
    if actor_optimizer is not None:
        self.add_optimizer(actor_optimizer, [actor_network])
    if critic_optimizer is not None:
        self.add_optimizer(critic_optimizer, [critic_networks])
    self._actor_network = actor_network
    self._num_critic_replicas = num_critic_replicas
    self._critic_networks = critic_networks
    self._reward_weights = None
    if reward_weights:
        self._reward_weights = torch.tensor(
            reward_weights, dtype=torch.float32)
    self._target_actor_network = actor_network.copy(
        name='target_actor_networks')
    self._target_critic_networks = critic_networks.copy(
        name='target_critic_networks')
    self._rollout_random_action = float(rollout_random_action)
    if critic_loss_ctor is None:
        critic_loss_ctor = OneStepTDLoss
    critic_loss_ctor = functools.partial(
        critic_loss_ctor, debug_summaries=debug_summaries)
    self._critic_losses = [None] * num_critic_replicas
    for i in range(num_critic_replicas):
        self._critic_losses[i] = critic_loss_ctor(
            name=("critic_loss" + str(i)))
    self._ou_process = common.create_ou_process(action_spec, ou_stddev,
                                                ou_damping)
    self._update_target = common.get_target_updater(
        models=[self._actor_network, self._critic_networks],
        target_models=[
            self._target_actor_network, self._target_critic_networks
        ],
        tau=target_update_tau,
        period=target_update_period)
    self._dqda_clipping = dqda_clipping
def __init__(self,
             observation_spec,
             action_spec: BoundedTensorSpec,
             actor_network_cls=ActorDistributionNetwork,
             critic_network_cls=CriticNetwork,
             q_network_cls=QNetwork,
             reward_weights=None,
             use_entropy_reward=True,
             use_parallel_network=False,
             num_critic_replicas=2,
             env=None,
             config: TrainerConfig = None,
             critic_loss_ctor=None,
             target_entropy=None,
             prior_actor_ctor=None,
             target_kld_per_dim=3.,
             initial_log_alpha=0.0,
             max_log_alpha=None,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions; can
            be a mixture of discrete and continuous actions. The number of
            continuous actions can be arbitrary while only one discrete action
            is allowed currently. If it's a mixture, then it must be a
            tuple/list ``(discrete_action_spec, continuous_action_spec)``.
        actor_network_cls (Callable): is used to construct the actor network.
            The constructed actor network will be called to sample continuous
            actions. All of its output specs must be continuous. Note that we
            don't need a discrete actor network because a discrete action can
            simply be sampled from the Q values.
        critic_network_cls (Callable): is used to construct the critic network
            for estimating ``Q(s,a)`` given that the action is continuous.
        q_network_cls (Callable): is used to construct a QNetwork for
            estimating ``Q(s,a)`` given that the action is discrete. Its
            output spec must be consistent with the discrete action in
            ``action_spec``.
        reward_weights (None|list[float]): this is only used when the reward
            is multidimensional. In that case, the weighted sum of the q
            values is used for training the actor if reward_weights is not
            None. Otherwise, the sum of the q values is used.
        use_entropy_reward (bool): whether to include entropy as reward.
        use_parallel_network (bool): whether to use parallel network for
            calculating critics.
        num_critic_replicas (int): number of critics to be used. Default is 2.
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple simulations
            simultaneously. ``env`` only needs to be provided to the root
            algorithm.
        config (TrainerConfig): config for training. It only needs to be
            provided to the algorithm which performs ``train_iter()`` by
            itself.
        critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
            constructor. If ``None``, a default ``OneStepTDLoss`` will be
            used.
        initial_log_alpha (float): initial value for variable ``log_alpha``.
        max_log_alpha (float|None): if not None, ``log_alpha`` will be capped
            at this value.
        target_entropy (float|Callable|None): If a floating value, it's the
            target average policy entropy, for updating ``alpha``. If a
            callable function, then it will be called on the action spec to
            calculate a target entropy. If ``None``, a default entropy will be
            calculated. For the mixed action type, discrete action and
            continuous action will have separate alphas and target entropies,
            so this argument can be a 2-element list/tuple, where the first is
            for discrete action and the second for continuous action.
        prior_actor_ctor (Callable): If provided, it will be called using
            ``prior_actor_ctor(observation_spec, action_spec,
            debug_summaries=debug_summaries)`` to construct a prior actor. The
            output of the prior actor is the distribution of the next action.
            Two prior actors are implemented:
            ``alf.algorithms.prior_actor.SameActionPriorActor`` and
            ``alf.algorithms.prior_actor.UniformPriorActor``.
        target_kld_per_dim (float): ``alpha`` is dynamically adjusted so that
            the KLD is about ``target_kld_per_dim * dim``.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            ``[-dqda_clipping, dqda_clipping]``. Will not perform clipping if
            ``dqda_clipping == 0``.
        actor_optimizer (torch.optim.optimizer): The optimizer for actor.
        critic_optimizer (torch.optim.optimizer): The optimizer for critic.
        alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    self._num_critic_replicas = num_critic_replicas
    self._use_parallel_network = use_parallel_network
    critic_networks, actor_network, self._act_type, reward_dim = self._make_networks(
        observation_spec, action_spec, actor_network_cls, critic_network_cls,
        q_network_cls)
    self._use_entropy_reward = use_entropy_reward
    if reward_dim > 1:
        assert not use_entropy_reward, (
            "use_entropy_reward=True is not supported for multidimensional reward"
        )
        assert self._act_type == ActionType.Continuous, (
            "Only continuous action is supported for multidimensional reward")
    self._reward_weights = None
    if reward_weights:
        assert reward_dim > 1, (
            "reward_weights cannot be used for one dimensional reward")
        assert len(reward_weights) == reward_dim, (
            "Mismatch between len(reward_weights)=%s and reward_dim=%s" %
            (len(reward_weights), reward_dim))
        self._reward_weights = torch.tensor(
            reward_weights, dtype=torch.float32)

    def _init_log_alpha():
        return nn.Parameter(torch.tensor(float(initial_log_alpha)))

    if self._act_type == ActionType.Mixed:
        # separate alphas for discrete and continuous actions
        log_alpha = type(action_spec)((_init_log_alpha(), _init_log_alpha()))
    else:
        log_alpha = _init_log_alpha()
    action_state_spec = SacActionState(
        actor_network=(() if self._act_type == ActionType.Discrete else
                       actor_network.state_spec),
        critic=(() if self._act_type == ActionType.Continuous else
                critic_networks.state_spec))
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=SacState(
            action=action_state_spec,
            actor=(() if self._act_type != ActionType.Continuous else
                   critic_networks.state_spec),
            critic=SacCriticState(
                critics=critic_networks.state_spec,
                target_critics=critic_networks.state_spec)),
        predict_state_spec=SacState(action=action_state_spec),
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)
    if actor_optimizer is not None:
        self.add_optimizer(actor_optimizer, [actor_network])
    if critic_optimizer is not None:
        self.add_optimizer(critic_optimizer, [critic_networks])
    if alpha_optimizer is not None:
        self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))
    self._log_alpha = log_alpha
    if self._act_type == ActionType.Mixed:
        self._log_alpha_paralist = nn.ParameterList(nest.flatten(log_alpha))
    if max_log_alpha is not None:
        self._max_log_alpha = torch.tensor(float(max_log_alpha))
    else:
        self._max_log_alpha = None
    self._actor_network = actor_network
    self._critic_networks = critic_networks
    self._target_critic_networks = self._critic_networks.copy(
        name='target_critic_networks')
    if critic_loss_ctor is None:
        critic_loss_ctor = OneStepTDLoss
    critic_loss_ctor = functools.partial(
        critic_loss_ctor, debug_summaries=debug_summaries)
    # Have different names to separate their summary curves
    self._critic_losses = []
    for i in range(num_critic_replicas):
        self._critic_losses.append(
            critic_loss_ctor(name="critic_loss%d" % (i + 1)))
    self._prior_actor = None
    if prior_actor_ctor is not None:
        assert self._act_type == ActionType.Continuous, (
            "Only continuous action is supported when using prior_actor")
        self._prior_actor = prior_actor_ctor(
            observation_spec=observation_spec,
            action_spec=action_spec,
            debug_summaries=debug_summaries)
        total_action_dims = sum(
            [spec.numel for spec in alf.nest.flatten(action_spec)])
        self._target_entropy = -target_kld_per_dim * total_action_dims
    else:
        if self._act_type == ActionType.Mixed:
            if not isinstance(target_entropy, (tuple, list)):
                target_entropy = nest.map_structure(
                    lambda _: target_entropy, self._action_spec)
            # separate target entropies for discrete and continuous actions
            self._target_entropy = nest.map_structure(
                lambda spec, t: _set_target_entropy(self.name, t, [spec]),
                self._action_spec, target_entropy)
        else:
            self._target_entropy = _set_target_entropy(
                self.name, target_entropy, nest.flatten(self._action_spec))
    self._dqda_clipping = dqda_clipping
    self._update_target = common.get_target_updater(
        models=[self._critic_networks],
        target_models=[self._target_critic_networks],
        tau=target_update_tau,
        period=target_update_period)
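# A minimal construction sketch for this (PyTorch/ALF) SacAlgorithm, using the
# default network classes and the automatically derived target entropy.
# ``alf.optimizers.Adam`` is assumed to be an ALF optimizer constructible from
# hyperparameters alone; the helper name is hypothetical.
def _example_build_sac_torch(observation_spec, action_spec, env, config):
    """Hypothetical helper showing one way to construct SacAlgorithm."""
    return SacAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        env=env,
        config=config,
        actor_optimizer=alf.optimizers.Adam(lr=3e-4),
        critic_optimizer=alf.optimizers.Adam(lr=3e-4),
        alpha_optimizer=alf.optimizers.Adam(lr=3e-4),
        # target_entropy=None: a default target entropy is derived from
        # action_spec; pass a float or a callable to override it.
        target_entropy=None,
        target_update_tau=0.005)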
def __init__(self,
             observation_spec,
             action_spec,
             actor_network: DistributionNetwork,
             critic_network: Network,
             gamma=0.99,
             ou_stddev=0.2,
             ou_damping=0.15,
             actor_optimizer=None,
             critic_optimizer=None,
             target_update_tau=0.05,
             target_update_period=10,
             dqda_clipping=None,
             gradient_clipping=None,
             debug_summaries=False,
             name="SarsaAlgorithm"):
    """Create a SarsaAlgorithm.

    Args:
        observation_spec (nested TensorSpec): spec for observation.
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network|DistributionNetwork): The network will be
            called with call(observation, step_type). If it is a
            DistributionNetwork, an action will be sampled.
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        gamma (float): discount rate for reward.
        ou_stddev (float): Only used for DDPG. Standard deviation for the
            Ornstein-Uhlenbeck (OU) noise added in the default collect policy.
        ou_damping (float): Only used for DDPG. Damping factor for the OU
            noise added in the default collect policy.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
        gradient_clipping (float): Norm length to clip gradients.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    if isinstance(actor_network, DistributionNetwork):
        self._action_distribution_spec = actor_network.output_spec
    elif isinstance(actor_network, Network):
        self._action_distribution_spec = action_spec
    else:
        raise ValueError("Expect DistributionNetwork or Network for"
                         " `actor_network`, got %s" % type(actor_network))
    super().__init__(
        observation_spec,
        action_spec,
        predict_state_spec=SarsaState(
            prev_observation=observation_spec,
            prev_step_type=tf.TensorSpec((), tf.int32),
            actor=actor_network.state_spec),
        train_state_spec=SarsaState(
            prev_observation=observation_spec,
            prev_step_type=tf.TensorSpec((), tf.int32),
            actor=actor_network.state_spec,
            target_actor=actor_network.state_spec,
            critic=critic_network.state_spec,
            target_critic=critic_network.state_spec,
        ),
        optimizer=[actor_optimizer, critic_optimizer],
        trainable_module_sets=[[actor_network], [critic_network]],
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        name=name)
    self._actor_network = actor_network
    self._critic_network = critic_network
    self._target_actor_network = actor_network.copy(
        name='target_actor_network')
    self._target_critic_network = critic_network.copy(
        name='target_critic_network')
    self._update_target = common.get_target_updater(
        models=[self._actor_network, self._critic_network],
        target_models=[
            self._target_actor_network, self._target_critic_network
        ],
        tau=target_update_tau,
        period=target_update_period)
    self._dqda_clipping = dqda_clipping
    self._gamma = gamma
    self._ou_process = create_ou_process(action_spec, ou_stddev, ou_damping)
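# A minimal construction sketch, following the same pattern as the examples
# above. The helper and the `actor_net`/`critic_net` arguments are
# hypothetical; only the SarsaAlgorithm keyword arguments come from the
# constructor itself.
def _example_build_sarsa(observation_spec, action_spec, actor_net, critic_net):
    """Hypothetical helper showing one way to construct SarsaAlgorithm."""
    return SarsaAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        gamma=0.99,
        actor_optimizer=tf.optimizers.Adam(learning_rate=1e-4),
        critic_optimizer=tf.optimizers.Adam(learning_rate=1e-3),
        # Targets are softly updated every 10 training steps with tau=0.05.
        target_update_tau=0.05,
        target_update_period=10)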
def __init__(self,
             observation_spec,
             action_spec,
             actor_network_ctor,
             critic_network_ctor,
             use_parallel_network=False,
             num_critic_replicas=2,
             env=None,
             config=None,
             critic_loss_cls=OneStepTDLoss,
             target_entropy=None,
             use_entropy_reward=False,
             initial_alpha=1.0,
             ou_stddev=0.2,
             ou_damping=0.15,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             target_update_tau=0.05,
             target_update_period=10,
             use_smoothed_actor=False,
             dqda_clipping=0.,
             on_policy=False,
             debug_summaries=False,
             name="SarsaAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): spec for observation.
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network_ctor (Callable): Function to construct the actor
            network. ``actor_network_ctor`` needs to accept
            ``input_tensor_spec`` and ``action_spec`` as its arguments and
            return an actor network. The constructed network will be called
            with ``forward(observation, state)``.
        critic_network_ctor (Callable): Function to construct the critic
            network. ``critic_network_ctor`` needs to accept
            ``input_tensor_spec`` which is a tuple of ``(observation_spec,
            action_spec)``. The constructed network will be called with
            ``forward((observation, action), state)``.
        use_parallel_network (bool): whether to use parallel network for
            calculating critics. This can be useful when ``mini_batch_size *
            mini_batch_length`` (when ``temporally_independent_train_step`` is
            True) or ``mini_batch_size`` (when
            ``temporally_independent_train_step`` is False) is not very large.
            You have to test to see which way is faster for your particular
            situation.
        num_critic_replicas (int): number of critics to be used. Default is 2.
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple simulations
            simultaneously. Running multiple environments in parallel is
            crucial to on-policy algorithms as it increases the diversity of
            data and decreases temporal correlation. ``env`` only needs to be
            provided to the root ``Algorithm``.
        config (TrainerConfig): config for training. ``config`` only needs to
            be provided to the algorithm which performs ``train_iter()`` by
            itself.
        initial_alpha (float|None): If provided, will add ``-alpha*entropy``
            to the loss to encourage diverse actions.
        target_entropy (float|Callable|None): If a floating value, it's the
            target average policy entropy, for updating ``alpha``. If a
            callable function, then it will be called on the action spec to
            calculate a target entropy. If ``None``, a default entropy will be
            calculated.
        use_entropy_reward (bool): If ``True``, will use alpha*entropy as
            additional reward.
        ou_stddev (float): Only used for DDPG. Standard deviation for the
            Ornstein-Uhlenbeck (OU) noise added in the default collect policy.
        ou_damping (float): Only used for DDPG. Damping factor for the OU
            noise added in the default collect policy.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        use_smoothed_actor (bool): use a smoothed version of actor for predict
            and rollout. This option can be used if ``on_policy`` is
            ``False``.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient ``dqda`` element-wise between
            ``[-dqda_clipping, dqda_clipping]``. Does not perform clipping if
            ``dqda_clipping == 0``.
        actor_optimizer (torch.optim.Optimizer): The optimizer for actor.
        critic_optimizer (torch.optim.Optimizer): The optimizer for critic
            networks.
        alpha_optimizer (torch.optim.Optimizer): The optimizer for alpha. Only
            used if ``initial_alpha`` is not ``None``.
        on_policy (bool): whether it is used as an on-policy algorithm.
        debug_summaries (bool): ``True`` if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_network = critic_network_ctor(
        input_tensor_spec=(observation_spec, action_spec))
    actor_network = actor_network_ctor(
        input_tensor_spec=observation_spec, action_spec=action_spec)
    flat_action_spec = alf.nest.flatten(action_spec)
    is_continuous = min(
        map(lambda spec: spec.is_continuous, flat_action_spec))
    assert is_continuous, (
        "SarsaAlgorithm only supports continuous action."
        " action_spec: %s" % action_spec)
    if use_parallel_network:
        critic_networks = critic_network.make_parallel(num_critic_replicas)
    else:
        critic_networks = alf.networks.NaiveParallelNetwork(
            critic_network, num_critic_replicas)
    self._on_policy = on_policy
    if not actor_network.is_distribution_output:
        noise_process = alf.networks.OUProcess(
            state_spec=action_spec, damping=ou_damping, stddev=ou_stddev)
        noise_state = noise_process.state_spec
    else:
        noise_process = None
        noise_state = ()
    super().__init__(
        observation_spec,
        action_spec,
        env=env,
        config=config,
        predict_state_spec=SarsaState(
            noise=noise_state,
            prev_observation=observation_spec,
            prev_step_type=alf.TensorSpec((), torch.int32),
            actor=actor_network.state_spec),
        train_state_spec=SarsaState(
            noise=noise_state,
            prev_observation=observation_spec,
            prev_step_type=alf.TensorSpec((), torch.int32),
            actor=actor_network.state_spec,
            critics=critic_networks.state_spec,
            target_critics=critic_networks.state_spec,
        ),
        debug_summaries=debug_summaries,
        name=name)
    self._actor_network = actor_network
    self._num_critic_replicas = num_critic_replicas
    self._critic_networks = critic_networks
    self._target_critic_networks = critic_networks.copy(
        name='target_critic_networks')
    self.add_optimizer(actor_optimizer, [actor_network])
    self.add_optimizer(critic_optimizer, [critic_networks])
    self._log_alpha = None
    self._use_entropy_reward = False
    if initial_alpha is not None:
        if actor_network.is_distribution_output:
            self._target_entropy = _set_target_entropy(
                self.name, target_entropy, flat_action_spec)
            log_alpha = torch.tensor(
                np.log(initial_alpha), dtype=torch.float32)
            if alpha_optimizer is None:
                self._log_alpha = log_alpha
            else:
                self._log_alpha = nn.Parameter(log_alpha)
                self.add_optimizer(alpha_optimizer, [self._log_alpha])
            self._use_entropy_reward = use_entropy_reward
        else:
            logging.info(
                "initial_alpha and alpha_optimizer are ignored. "
                "The `actor_network` needs to output a Distribution in "
                "order to use entropy as regularization or reward.")
    models = copy.copy(critic_networks)
    target_models = copy.copy(self._target_critic_networks)
    self._rollout_actor_network = self._actor_network
    if use_smoothed_actor:
        assert not on_policy, ("use_smoothed_actor can only be used in "
                               "off-policy training")
        self._rollout_actor_network = actor_network.copy(
            name='rollout_actor_network')
        models.append(self._actor_network)
        target_models.append(self._rollout_actor_network)
    self._update_target = common.get_target_updater(
        models=models,
        target_models=target_models,
        tau=target_update_tau,
        period=target_update_period)
    self._dqda_clipping = dqda_clipping
    self._noise_process = noise_process
    self._critic_losses = []
    for i in range(num_critic_replicas):
        self._critic_losses.append(
            critic_loss_cls(debug_summaries=debug_summaries and i == 0))
    self._is_rnn = len(alf.nest.flatten(critic_network.state_spec)) > 0
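# A minimal construction sketch for this (PyTorch/ALF) SarsaAlgorithm. The
# network constructors are passed in by the caller (for example
# ``alf.networks.ActorDistributionNetwork`` and ``alf.networks.CriticNetwork``;
# those names are assumptions here), as is the ALF-style optimizer written as
# ``alf.optimizers.Adam``. The helper name is hypothetical.
def _example_build_sarsa_torch(observation_spec, action_spec, env, config,
                               actor_network_ctor, critic_network_ctor):
    """Hypothetical helper showing one way to construct SarsaAlgorithm."""
    return SarsaAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        actor_network_ctor=actor_network_ctor,
        critic_network_ctor=critic_network_ctor,
        env=env,
        config=config,
        actor_optimizer=alf.optimizers.Adam(lr=1e-4),
        critic_optimizer=alf.optimizers.Adam(lr=1e-3),
        alpha_optimizer=alf.optimizers.Adam(lr=1e-3),
        # With a distribution actor, -alpha*entropy is added to the loss.
        initial_alpha=1.0,
        use_entropy_reward=False,
        use_smoothed_actor=True)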
def __init__(self,
             observation_spec,
             action_spec: BoundedTensorSpec,
             critic_network: MdqCriticNetwork,
             env=None,
             config: TrainerConfig = None,
             critic_loss_ctor=None,
             target_entropy=dist_utils.calc_default_target_entropy_quantized,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             distill_noise=0.01,
             critic_optimizer=None,
             alpha_optimizer=None,
             debug_summaries=False,
             name="MdqAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions.
        critic_network (MdqCriticNetwork): an instance of MdqCriticNetwork.
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple simulations
            simultaneously. ``env`` only needs to be provided to the root
            algorithm.
        config (TrainerConfig): config for training. It only needs to be
            provided to the algorithm which performs ``train_iter()`` by
            itself.
        critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
            constructor. If ``None``, a default ``OneStepTDLoss`` will be
            used.
        initial_log_alpha (float): initial value for variable ``log_alpha``.
        target_entropy (float|Callable): If a floating value, it's the target
            average policy entropy, for updating ``alpha``. If a callable
            function, then it will be called on the action spec to calculate a
            target entropy. Note that in the MDQ algorithm, as the continuous
            action is represented by a discrete distribution for each action
            dimension, ``calc_default_target_entropy_quantized`` is used to
            compute the target entropy by default.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        distill_noise (float): the std of the random Gaussian noise added to
            the action used for distillation.
        critic_optimizer (torch.optim.optimizer): The optimizer for critic.
        alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_networks = critic_network
    target_critic_networks = critic_networks.copy(
        name='target_critic_networks')
    train_state_spec = MdqState(
        critic=MdqCriticState(
            critic=critic_networks.state_spec,
            target_critic=critic_networks.state_spec))
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=train_state_spec,
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)
    self._critic_networks = critic_networks
    self._target_critic_networks = target_critic_networks
    self.add_optimizer(critic_optimizer, [critic_networks])
    if critic_loss_ctor is None:
        critic_loss_ctor = OneStepTDLoss
    critic_loss_ctor = functools.partial(
        critic_loss_ctor, debug_summaries=debug_summaries)
    flat_action_spec = nest.flatten(self._action_spec)
    self._flat_action_spec = flat_action_spec
    self._action_dim = flat_action_spec[0].shape[0]
    self._log_pi_uniform_prior = self._critic_networks.get_uniform_prior_logpi()
    self._num_critic_replicas = self._critic_networks._num_critic_replicas
    self._critic_losses = []
    for i in range(self._num_critic_replicas):
        self._critic_losses.append(
            critic_loss_ctor(name="critic_loss%d" % (i + 1)))
    self._is_continuous = flat_action_spec[0].is_continuous
    self._target_entropy = _set_target_entropy(self.name, target_entropy,
                                               flat_action_spec)
    log_alpha = nn.Parameter(torch.Tensor([float(initial_log_alpha)]))
    self._log_alpha = log_alpha
    self._update_target = common.get_target_updater(
        models=[self._critic_networks],
        target_models=[self._target_critic_networks],
        tau=target_update_tau,
        period=target_update_period)
    if alpha_optimizer is not None:
        self.add_optimizer(alpha_optimizer, [log_alpha])
    self._distill_noise = distill_noise
def __init__(self,
             output_dim,
             noise_dim=32,
             input_tensor_spec=None,
             hidden_layers=(256, ),
             net: Network = None,
             net_moving_average_rate=None,
             entropy_regularization=0.,
             mi_weight=None,
             mi_estimator_cls=MIEstimator,
             par_vi="gfsf",
             optimizer=None,
             name="Generator"):
    r"""Create a Generator.

    Args:
        output_dim (int): dimension of output.
        noise_dim (int): dimension of noise.
        input_tensor_spec (nested TensorSpec): spec of inputs. If there are no
            inputs, this should be None.
        hidden_layers (tuple): size of hidden layers.
        net (Network): network for generating outputs from [noise, inputs] or
            noise (if inputs is None). If None, a default one with
            hidden_layers will be created.
        net_moving_average_rate (float): If provided, use a moving average
            version of net to do prediction. This has been shown to be
            effective for GAN training (arXiv:1907.02544, arXiv:1812.04948).
        entropy_regularization (float): weight of entropy regularization.
        mi_weight (float|None): weight of the mutual information loss. If
            None, mutual information maximization is not used.
        mi_estimator_cls (type): the class of mutual information estimator for
            maximizing the mutual information between [noise, inputs] and
            [outputs, inputs].
        par_vi (string): ParVI methods, options are
            [``svgd``, ``svgd2``, ``svgd3``, ``gfsf``]:

            * svgd: empirical expectation of SVGD is evaluated by a single
              resampled particle. The main benefit of this choice is that it
              supports the conditional case, while all other options do not.
            * svgd2: empirical expectation of SVGD is evaluated by splitting
              half of the sampled batch. It is a trade-off between
              computational efficiency and convergence speed.
            * svgd3: empirical expectation of SVGD is evaluated by resampled
              particles of the same batch size. It has better convergence but
              involves resampling, so it is computationally less efficient
              than svgd2.
            * gfsf: Wasserstein gradient flow with smoothed functions. It
              involves a kernel matrix inversion, so it is computationally the
              most expensive, but in some cases the convergence is faster than
              the svgd approaches.
        optimizer (torch.optim.Optimizer): (optional) optimizer for training.
        name (str): name of this generator.
    """
    super().__init__(train_state_spec=(), optimizer=optimizer, name=name)
    self._noise_dim = noise_dim
    self._entropy_regularization = entropy_regularization
    self._par_vi = par_vi
    if entropy_regularization == 0:
        self._grad_func = self._ml_grad
    else:
        if par_vi == 'gfsf':
            self._grad_func = self._gfsf_grad
        elif par_vi == 'svgd':
            self._grad_func = self._svgd_grad
        elif par_vi == 'svgd2':
            self._grad_func = self._svgd_grad2
        elif par_vi == 'svgd3':
            self._grad_func = self._svgd_grad3
        else:
            raise ValueError("Unsupported par_vi method: %s" % par_vi)
        self._kernel_width_averager = AdaptiveAverager(
            tensor_spec=TensorSpec(shape=()))
    noise_spec = TensorSpec(shape=(noise_dim, ))
    if net is None:
        net_input_spec = noise_spec
        if input_tensor_spec is not None:
            net_input_spec = [net_input_spec, input_tensor_spec]
        net = EncodingNetwork(
            input_tensor_spec=net_input_spec,
            fc_layer_params=hidden_layers,
            last_layer_size=output_dim,
            last_activation=math_ops.identity,
            name="Generator")
    self._mi_estimator = None
    self._input_tensor_spec = input_tensor_spec
    if mi_weight is not None:
        x_spec = noise_spec
        y_spec = TensorSpec((output_dim, ))
        if input_tensor_spec is not None:
            x_spec = [x_spec, input_tensor_spec]
        self._mi_estimator = mi_estimator_cls(x_spec, y_spec, sampler='shift')
        self._mi_weight = mi_weight
    self._net = net
    self._predict_net = None
    self._net_moving_average_rate = net_moving_average_rate
    if net_moving_average_rate:
        self._predict_net = net.copy(name="Generator_average")
        self._predict_net_updater = common.get_target_updater(
            self._net, self._predict_net, tau=net_moving_average_rate)
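# A minimal construction sketch showing typical choices for the ParVI
# generator: entropy regularization turned on and ``svgd3`` as the ParVI
# method. ``alf.optimizers.Adam`` is assumed to be an ALF optimizer
# constructible from hyperparameters alone; the helper name and the concrete
# dimensions are illustrative.
def _example_build_generator():
    """Hypothetical helper showing one way to construct a ParVI Generator."""
    return Generator(
        output_dim=64,               # dimension of each generated sample
        noise_dim=32,                # dimension of the input noise
        hidden_layers=(256, 256),    # a default EncodingNetwork is built from this
        entropy_regularization=1.0,  # > 0 enables the ParVI gradient estimators
        par_vi="svgd3",
        optimizer=alf.optimizers.Adam(lr=1e-4))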
def __init__(self,
             observation_spec,
             action_spec,
             model_ctor,
             mcts_algorithm_ctor,
             num_unroll_steps,
             td_steps,
             recurrent_gradient_scaling_factor=0.5,
             reward_normalizer=None,
             reward_clip_value=-1.,
             train_reward_function=True,
             train_game_over_function=True,
             reanalyze_ratio=0.,
             reanalyze_td_steps=5,
             reanalyze_batch_size=None,
             data_transformer_ctor=None,
             target_update_tau=1.,
             target_update_period=1000,
             debug_summaries=False,
             name="MuZero"):
    """
    Args:
        observation_spec (TensorSpec): representing the observations.
        action_spec (BoundedTensorSpec): representing the actions.
        model_ctor (Callable): will be called as
            ``model_ctor(observation_spec=?, action_spec=?,
            debug_summaries=?)`` to construct the model. The model should
            follow the interface ``alf.algorithms.mcts_models.MCTSModel``.
        mcts_algorithm_ctor (Callable): will be called as
            ``mcts_algorithm_ctor(observation_spec=?, action_spec=?,
            debug_summaries=?)`` to construct an ``MCTSAlgorithm`` instance.
        num_unroll_steps (int): steps for unrolling the model during training.
        td_steps (int): bootstrap so many steps into the future for
            calculating the discounted return. -1 means to bootstrap to the
            end of the game. This can only be used for environments whose
            rewards are zero except for the last step, as the current
            implementation only uses the reward at the last step to calculate
            the return.
        recurrent_gradient_scaling_factor (float): the gradient going through
            ``model.recurrent_inference`` is scaled by this factor. This is
            suggested in Appendix G of the MuZero paper.
        reward_normalizer (Normalizer|None): if provided, will be used to
            normalize reward.
        train_reward_function (bool): whether to train the reward function. If
            False, reward should only be given at the last step of an episode.
        train_game_over_function (bool): whether to train the game over
            function.
        reanalyze_ratio (float): float number in [0., 1.]. Reanalyze this
            portion of the data retrieved from the replay buffer. Reanalyzing
            means using the recent model to calculate the value and policy
            targets.
        reanalyze_td_steps (int): the n for the n-step return used for
            reanalyzing.
        reanalyze_batch_size (int|None): the memory usage may be too high when
            reanalyzing all the data for one training iteration. If so,
            provide a number so that the data will be reanalyzed in several
            batches.
        data_transformer_ctor (Callable|list[Callable]): should be same as
            ``TrainerConfig.data_transformer_ctor``.
        target_update_tau (float): Factor for soft update of the target
            networks used for reanalyzing.
        target_update_period (int): Period for soft update of the target
            networks used for reanalyzing.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    model = model_ctor(
        observation_spec, action_spec, debug_summaries=debug_summaries)
    mcts = mcts_algorithm_ctor(
        observation_spec=observation_spec,
        action_spec=action_spec,
        debug_summaries=debug_summaries)
    mcts.set_model(model)
    self._device = alf.get_default_device()
    super().__init__(
        observation_spec=observation_spec,
        action_spec=action_spec,
        train_state_spec=mcts.predict_state_spec,
        predict_state_spec=mcts.predict_state_spec,
        rollout_state_spec=mcts.predict_state_spec,
        debug_summaries=debug_summaries,
        name=name)
    self._mcts = mcts
    self._model = model
    self._num_unroll_steps = num_unroll_steps
    self._td_steps = td_steps
    self._discount = mcts.discount
    self._recurrent_gradient_scaling_factor = recurrent_gradient_scaling_factor
    self._reward_normalizer = reward_normalizer
    self._reward_clip_value = reward_clip_value
    self._train_reward_function = train_reward_function
    self._train_game_over_function = train_game_over_function
    self._reanalyze_ratio = reanalyze_ratio
    self._reanalyze_td_steps = reanalyze_td_steps
    self._reanalyze_batch_size = reanalyze_batch_size
    self._data_transformer = None
    self._data_transformer_ctor = data_transformer_ctor
    self._update_target = None
    if reanalyze_ratio > 0:
        self._target_model = model_ctor(
            observation_spec, action_spec, debug_summaries=debug_summaries)
        self._update_target = common.get_target_updater(
            models=[self._model],
            target_models=[self._target_model],
            tau=target_update_tau,
            period=target_update_period)
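# A minimal construction sketch. ``model_ctor`` and ``mcts_algorithm_ctor``
# are factories supplied by the caller and are expected to follow the call
# conventions documented in the constructor above; the helper name and the
# concrete hyperparameter values are illustrative only.
def _example_build_muzero(observation_spec, action_spec, model_ctor,
                          mcts_algorithm_ctor):
    """Hypothetical helper showing one way to construct MuZero."""
    return MuZero(
        observation_spec=observation_spec,
        action_spec=action_spec,
        model_ctor=model_ctor,
        mcts_algorithm_ctor=mcts_algorithm_ctor,
        # Unroll the model 5 steps and bootstrap the return 10 steps ahead.
        num_unroll_steps=5,
        td_steps=10,
        # Reanalyze half of the replayed data with a periodically updated
        # target model.
        reanalyze_ratio=0.5,
        reanalyze_td_steps=5,
        target_update_period=1000)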