class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)
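
# A minimal usage sketch for TrueOnlineSARSALambda with tile-coding features,
# loosely following the mushroom MountainCar example. The import paths and
# constructor signatures below (Gym, Tiles.generate, Features, EpsGreedy,
# Parameter, Core) follow the old `mushroom` package layout and should be
# treated as assumptions; adapt them to your version of the library.
import numpy as np

from mushroom.core import Core
from mushroom.environments import Gym
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import EpsGreedy
from mushroom.utils.parameters import Parameter

mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# 10 tilings of 10x10 tiles over the 2-dimensional observation space.
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

agent = TrueOnlineSARSALambda(EpsGreedy(epsilon=Parameter(value=.1)),
                              mdp.info,
                              learning_rate=Parameter(.1 / n_tilings),
                              lambda_coeff=.9, features=features,
                              approximator_params=approximator_params)

core = Core(agent, mdp)
core.learn(n_episodes=40, n_steps_per_fit=1)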
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)
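
# Standalone numpy sketch (toy numbers, not library code) of the accumulating
# eligibility trace used in SARSALambdaContinuous._update:
#     e <- gamma * lambda * e + dQ/dtheta
# For a linear Q, dQ/dtheta is just the feature vector of the visited
# state-action pair, so recently visited features keep the largest credit
# while older ones decay geometrically at rate gamma * lambda.
import numpy as np

gamma, lambda_ = .99, .9
e = np.zeros(3)
for phi_sa in [np.array([1., 0., 0.]),
               np.array([0., 1., 0.]),
               np.array([1., 0., 0.])]:
    e = gamma * lambda_ * e + phi_sa
    print(e)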
class SAC_AVG(Agent):
    """
    Stochastic Actor critic in the average reward setting as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al.. 2012.

    """
    def __init__(self, policy, mdp_info, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            policy (ParametricPolicy): a differentiable stochastic policy;
            mdp_info: information about the MDP;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            alpha_r (Parameter): learning rate for the reward trace;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v
        self._alpha_r = alpha_r

        self._lambda = lambda_par

        super().__init__(policy, mdp_info, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)
        self._r_bar = 0

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            # Compute TD error
            delta = r - self._r_bar + v_next - self._V(s_psi)

            # Update traces
            self._r_bar += self._alpha_r() * delta
            self._e_v = self._lambda * self._e_v + s_psi
            self._e_theta = self._lambda * self._e_theta + \
                self.policy.diff_log(s_phi, a)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)
import numpy as np
from matplotlib import pyplot as plt

from mushroom.approximators import Regressor
from mushroom.approximators.parametric import LinearApproximator

x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

# Augment the input with a constant feature so the linear approximator can
# learn the intercept.
phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2,),
                      output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
print('Gradient: ' + str(regressor.diff(np.array([[5.]]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
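
# Optional sanity check (not part of the original snippet): the weights fitted
# by the linear approximator should match the ordinary least-squares solution
# computed directly from `phi` and `y` above, roughly [10, 2] up to noise.
w_lstsq, *_ = np.linalg.lstsq(phi, y, rcond=None)
print('lstsq weights: ' + str(w_lstsq.ravel()))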
class COPDAC_Q(Agent):
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if value_function_features is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_phi = self.phi(ss) if self.phi is not None else ss

            q_next = self._Q(ss, self._mu(ss_phi)) if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * grad_mu_s.T.dot(
                grad_mu_s.dot(omega))
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi) + self._A(self._nu(state, action))

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms". Silver D. et al.. 2014.

    """
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + \
            self._A(self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
class StochasticAC_AVG(Agent):
    """
    Stochastic Actor critic in the average reward setting as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al.. 2012.

    """
    def __init__(self, policy, mdp_info, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            policy (ParametricPolicy): a differentiable stochastic policy;
            mdp_info: information about the MDP;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            alpha_r (Parameter): learning rate for the reward trace;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v
        self._alpha_r = alpha_r

        self._lambda = lambda_par

        super().__init__(policy, mdp_info, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)
        self._r_bar = 0

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        super().episode_start()

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            # Compute TD error
            delta = r - self._r_bar + v_next - self._V(s_psi)

            # Update traces
            self._r_bar += self._alpha_r() * delta
            self._e_v = self._lambda * self._e_v + s_psi
            self._e_theta = self._lambda * self._e_theta + \
                self.policy.diff_log(s_phi, a)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)
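
# A minimal end-to-end wiring sketch for StochasticAC_AVG on Pendulum. The
# import paths and constructor signatures (Gym, Tiles.generate, Features,
# GaussianPolicy(mu, sigma), Parameter, Core) follow the old `mushroom`
# layout and are assumptions, not part of the class above; average-reward
# learning ignores gamma, so the MDP is built with gamma=1.
import numpy as np

from mushroom.approximators import Regressor
from mushroom.approximators.parametric import LinearApproximator
from mushroom.core import Core
from mushroom.environments import Gym
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import GaussianPolicy
from mushroom.utils.parameters import Parameter

mdp = Gym(name='Pendulum-v0', horizon=200, gamma=1.)

# Shared tile-coding features for both the policy mean and the critic.
tilings = Tiles.generate(10, [10, 10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
phi = Features(tilings=tilings)

mu = Regressor(LinearApproximator, input_shape=(phi.size,),
               output_shape=mdp.info.action_space.shape)
policy = GaussianPolicy(mu, np.eye(1) * 1.)

agent = StochasticAC_AVG(policy, mdp.info,
                         alpha_theta=Parameter(1e-3),
                         alpha_v=Parameter(1e-2),
                         alpha_r=Parameter(1e-2),
                         lambda_par=.9,
                         value_function_features=phi,
                         policy_features=phi)

core = Core(agent, mdp)
core.learn(n_episodes=100, n_steps_per_fit=1)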
class COPDAC_Q(Agent):
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            # np.asscalar is deprecated; .item() is the equivalent call.
            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + \
            self._A(self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms". Silver D. et al.. 2014.

    """
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        """
        Constructor.

        Args:
            policy (Policy): any exploration policy, possibly using the
                deterministic policy as mean regressor;
            mu (Regressor): regressor that describes the deterministic policy
                to be learned, i.e. the deterministic mapping between state
                and action;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_omega (Parameter): learning rate for the advantage function;
            alpha_v (Parameter): learning rate for the value function;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + \
            self._A(self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
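
# Toy check (illustration only, not library code) of why the TD target above
# can use V(ss) alone: with compatible features
#     nu(s, a) = (a - mu(s)) . dmu/dtheta,
# the advantage A(s, mu(s)) = nu(s, mu(s))^T omega is exactly zero at the
# policy action, so Q(ss, mu(ss)) reduces to V(ss).
import numpy as np

theta = np.array([.5, -1.])       # weights of a linear deterministic policy
mu = lambda s: s.dot(theta)       # mu(s) = theta^T s
grad_mu = lambda s: s             # dmu/dtheta for a linear policy

s = np.array([1., 2.])
a = mu(s)                         # on-policy action
nu = np.atleast_2d(a - mu(s)).dot(np.atleast_2d(grad_mu(s)))
print(nu)                         # [[0. 0.]] -> A(s, mu(s)) = 0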
class DDPG(ReparametrizationAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params, batch_size,
                 initial_replay_size, max_replay_size, tau, critic_params,
                 actor_params, actor_optimizer, policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else \
            critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target()

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        self._target_actor_approximator.set_weights(
            self._actor_approximator.get_weights())
        self._target_critic_approximator.set_weights(
            self._critic_approximator.get_weights())

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.get_weights()
        critic_weights += (1 - self._tau) * \
            self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.get_weights()
        actor_weights += (1 - self._tau) * \
            self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super().__init__(self.Q, policy, mdp_info, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the
                actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting algorithm
                of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._actor_fit_params = dict() if actor_fit_params is None else \
            actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else \
            critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)
            self._actor_approximator.fit(state, state,
                                         **self._actor_fit_params)

            self._n_updates += 1

            self._update_target()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * \
            self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * \
            self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * \
            self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * \
            self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
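
# Standalone numpy sketch (toy numbers) of the soft (Polyak) target update
# both DDPG variants implement in _update_target:
#     w_target <- tau * w + (1 - tau) * w_target
# With tau = 1e-3 the target weights track the online weights with a long
# lag: after 1000 updates toward a constant w they only cover a fraction
# 1 - (1 - tau)^1000, roughly 0.63.
import numpy as np

tau = 1e-3
w, w_target = np.ones(4), np.zeros(4)
for _ in range(1000):
    w_target = tau * w + (1 - tau) * w_target
print(w_target)  # ~ 0.63, still well short of w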