class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            approximator_params (dict, None): parameters of the approximator to
                build;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            approximator_params (dict, None): parameters of the approximator to
                build;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 approximator_params, target_update_frequency,
                 replay_memory=None, initial_replay_size=500,
                 max_replay_size=5000, fit_params=None, n_approximators=1,
                 clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
                Q-function;
            batch_size (int): the number of samples in a batch;
            approximator_params (dict): parameters of the approximator to
                build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._fit(dataset)

        self._n_updates += 1
        if self._n_updates % self._target_update_frequency == 0:
            self._update_target()

    def _fit_standard(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

    def _fit_prioritized(self, dataset):
        self._replay_memory.add(
            dataset, np.ones(len(dataset)) * self._replay_memory.max_priority)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, idxs, is_weight = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next
            td_error = q - self.approximator.predict(state, action)

            self._replay_memory.update(td_error, idxs)

            self.approximator.fit(state, action, q, weights=is_weight,
                                  **self._fit_params)

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al., 2015.

    """
    def __init__(self, approximator, policy, mdp_info, params):
        alg_params = params['algorithm_params']
        self._batch_size = alg_params.get('batch_size')
        self._n_approximators = alg_params.get('n_approximators', 1)
        self._clip_reward = alg_params.get('clip_reward', True)
        self._train_frequency = alg_params.get('train_frequency')
        self._target_update_frequency = alg_params.get(
            'target_update_frequency')
        self._max_no_op_actions = alg_params.get('max_no_op_actions', 0)
        self._no_op_action_value = alg_params.get('no_op_action_value', 0)

        self._replay_memory = ReplayMemory(
            mdp_info, alg_params.get('initial_replay_size'),
            alg_params.get('max_replay_size'),
            alg_params.get('history_length', 1))
        self._buffer = Buffer(size=alg_params.get('history_length', 1))

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(params['approximator_params'])
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(params['approximator_params'])
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info, params)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q,
                                  **self.params['fit_params'])

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        self._no_op_actions = np.random.randint(self._buffer.size,
                                                self._max_no_op_actions + 1)
        self._episode_steps = 0
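

# Illustrative sketch (assumption): the structure of the `params` dictionary
# this dictionary-based constructor expects, reconstructed from the keys it
# reads. The concrete values are placeholders for illustration only, not
# recommended settings.
example_dqn_params = dict(
    algorithm_params=dict(
        batch_size=32,
        n_approximators=1,
        clip_reward=True,
        train_frequency=4,
        target_update_frequency=2500,
        max_no_op_actions=30,
        no_op_action_value=0,
        initial_replay_size=500,
        max_replay_size=5000,
        history_length=4
    ),
    approximator_params=dict(),  # forwarded verbatim to the Regressor builder
    fit_params=dict()            # forwarded verbatim to approximator.fit
)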
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al., 2016.

    """
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting algorithm
                of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)
            self._actor_approximator.fit(state, state,
                                         **self._actor_fit_params)

            self._update_target()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
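

# Illustrative sketch (not library code): the soft (Polyak) target update
# performed in `_update_target` above, written for flat weight vectors such as
# those returned by `get_weights`. The function name is hypothetical.
def _example_soft_update(online_weights, target_weights, tau):
    """Return tau * theta_online + (1 - tau) * theta_target."""
    return tau * online_weights + (1 - tau) * target_weights

# Usage sketch: new_target = _example_soft_update(critic_w, target_critic_w, 0.005)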
class DDPG(ReparametrizationAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al., 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params, batch_size,
                 initial_replay_size, max_replay_size, tau, critic_params,
                 actor_params, actor_optimizer, policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to build;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target()

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        self._target_actor_approximator.set_weights(
            self._actor_approximator.get_weights())
        self._target_critic_approximator.set_weights(
            self._critic_approximator.get_weights())

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class SAC(ReparametrizationAC):
    """
    Soft Actor-Critic algorithm.
    "Soft Actor-Critic Algorithms and Applications".
    Haarnoja T. et al., 2019.

    """
    def __init__(self, mdp_info, batch_size, initial_replay_size,
                 max_replay_size, warmup_transitions, tau, lr_alpha,
                 actor_mu_params, actor_sigma_params, actor_optimizer,
                 critic_params, target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions (int): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            target_entropy (float, None): target entropy for the policy; if
                None, a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(
                mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2
        if 'prediction' in critic_params.keys():
            assert critic_params['prediction'] == 'min'
        else:
            critic_params['prediction'] = 'min'

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        self._log_alpha = torch.tensor(0., requires_grad=True,
                                       dtype=torch.float32)
        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target()

        policy_parameters = chain(
            actor_mu_approximator.model.network.parameters(),
            actor_sigma_approximator.model.network.parameters())
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._replay_memory.size > self._warmup_transitions:
                action_new, log_prob = \
                    self.policy.compute_action_and_log_prob_t(state)
                loss = self._loss(state, action_new, log_prob)
                self._optimize_actor_parameters(loss)
                self._update_alpha(log_prob.detach())

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            self._update_target()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        for i in range(len(self._critic_approximator)):
            self._target_critic_approximator.model[i].set_weights(
                self._critic_approximator.model[i].get_weights())

    def _loss(self, state, action_new, log_prob):
        q_0 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=0)
        q_1 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=1)

        q = torch.min(q_0, q_1)

        return (self._alpha * log_prob - q).mean()

    def _update_alpha(self, log_prob):
        alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()

    def _update_target(self):
        """
        Update the target networks.

        """
        for i in range(len(self._target_critic_approximator)):
            critic_weights_i = self._tau * self._critic_approximator.model[i].get_weights()
            critic_weights_i += (1 - self._tau) * self._target_critic_approximator.model[i].get_weights()
            self._target_critic_approximator.model[i].set_weights(
                critic_weights_i)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a, log_prob_next = self.policy.compute_action_and_log_prob(next_state)

        q = self._target_critic_approximator.predict(
            next_state, a) - self._alpha_np * log_prob_next
        q *= 1 - absorbing

        return q

    @property
    def _alpha(self):
        return self._log_alpha.exp()

    @property
    def _alpha_np(self):
        return self._alpha.detach().cpu().numpy()