class DDPG(DeepAC): """ Deep Deterministic Policy Gradient algorithm. "Continuous Control with Deep Reinforcement Learning". Lillicrap T. P. et al.. 2016. """ def __init__(self, mdp_info, policy_class, policy_params, actor_params, actor_optimizer, critic_params, batch_size, initial_replay_size, max_replay_size, tau, policy_delay=1, critic_fit_params=None): """ Constructor. Args: policy_class (Policy): class of the policy; policy_params (dict): parameters of the policy to build; actor_params (dict): parameters of the actor approximator to build; actor_optimizer (dict): parameters to specify the actor optimizer algorithm; critic_params (dict): parameters of the critic approximator to build; batch_size (int): the number of samples in a batch; initial_replay_size (int): the number of samples to collect before starting the learning; max_replay_size (int): the maximum number of samples in the replay memory; tau (float): value of coefficient for soft updates; policy_delay (int, 1): the number of updates of the critic after which an actor update is implemented; critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator; """ self._critic_fit_params = dict( ) if critic_fit_params is None else critic_fit_params self._batch_size = batch_size self._tau = tau self._policy_delay = policy_delay self._fit_count = 0 self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size) target_critic_params = deepcopy(critic_params) self._critic_approximator = Regressor(TorchApproximator, **critic_params) self._target_critic_approximator = Regressor(TorchApproximator, **target_critic_params) target_actor_params = deepcopy(actor_params) self._actor_approximator = Regressor(TorchApproximator, **actor_params) self._target_actor_approximator = Regressor(TorchApproximator, **target_actor_params) self._init_target(self._critic_approximator, self._target_critic_approximator) self._init_target(self._actor_approximator, self._target_actor_approximator) policy = policy_class(self._actor_approximator, **policy_params) policy_parameters = self._actor_approximator.model.network.parameters() super().__init__(mdp_info, policy, actor_optimizer, policy_parameters) def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ =\ self._replay_memory.get(self._batch_size) q_next = self._next_q(next_state, absorbing) q = reward + self.mdp_info.gamma * q_next self._critic_approximator.fit(state, action, q, **self._critic_fit_params) if self._fit_count % self._policy_delay == 0: loss = self._loss(state) self._optimize_actor_parameters(loss) self._update_target(self._critic_approximator, self._target_critic_approximator) self._update_target(self._actor_approximator, self._target_actor_approximator) self._fit_count += 1 def _loss(self, state): action = self._actor_approximator(state, output_tensor=True) q = self._critic_approximator(state, action, output_tensor=True) return -q.mean() def _next_q(self, next_state, absorbing): """ Args: next_state (np.ndarray): the states where next action has to be evaluated; absorbing (np.ndarray): the absorbing flag for the states in ``next_state``. Returns: Action-values returned by the critic for ``next_state`` and the action returned by the actor. """ a = self._target_actor_approximator(next_state) q = self._target_critic_approximator.predict(next_state, a) q *= 1 - absorbing return q
class PPO(Agent): """ Proximal Policy Optimization algorithm. "Proximal Policy Optimization Algorithms". Schulman J. et al.. 2017. """ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, n_epochs_policy, batch_size, eps_ppo, lam, quiet=True, critic_fit_params=None): """ Constructor. Args: policy (TorchPolicy): torch policy to be learned by the algorithm actor_optimizer (dict): parameters to specify the actor optimizer algorithm; critic_params (dict): parameters of the critic approximator to build; n_epochs_policy (int): number of policy updates for every dataset; batch_size (int): size of minibatches for every optimization step eps_ppo (float): value for probability ratio clipping; lam float(float, 1.): lambda coefficient used by generalized advantage estimation; quiet (bool, True): if true, the algorithm will print debug information; critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator. """ self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params self._n_epochs_policy = n_epochs_policy self._batch_size = batch_size self._eps_ppo = eps_ppo self._optimizer = actor_optimizer['class'](policy.parameters(), **actor_optimizer['params']) self._lambda = lam self._V = Regressor(TorchApproximator, **critic_params) self._quiet = quiet self._iter = 1 self._add_save_attr( _critic_fit_params='pickle', _n_epochs_policy='primitive', _batch_size='primitive', _eps_ppo='primitive', _optimizer='torch', _lambda='primitive', _V='mushroom', _quiet='primitive', _iter='primitive' ) super().__init__(mdp_info, policy, None) def fit(self, dataset): if not self._quiet: tqdm.write('Iteration ' + str(self._iter)) x, u, r, xn, absorbing, last = parse_dataset(dataset) x = x.astype(np.float32) u = u.astype(np.float32) r = r.astype(np.float32) xn = xn.astype(np.float32) obs = to_float_tensor(x, self.policy.use_cuda) act = to_float_tensor(u, self.policy.use_cuda) v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda) np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8) adv = to_float_tensor(np_adv, self.policy.use_cuda) old_pol_dist = self.policy.distribution_t(obs) old_log_p = old_pol_dist.log_prob(act)[:, None].detach() self._V.fit(x, v_target, **self._critic_fit_params) self._update_policy(obs, act, adv, old_log_p) # Print fit information self._print_fit_info(dataset, x, v_target, old_pol_dist) self._iter += 1 def _update_policy(self, obs, act, adv, old_log_p): for epoch in range(self._n_epochs_policy): for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator( self._batch_size, obs, act, adv, old_log_p): self._optimizer.zero_grad() prob_ratio = torch.exp( self.policy.log_prob_t(obs_i, act_i) - old_log_p_i ) clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo, 1 + self._eps_ppo) loss = -torch.mean(torch.min(prob_ratio * adv_i, clipped_ratio * adv_i)) loss.backward() self._optimizer.step() def _print_fit_info(self, dataset, x, v_target, old_pol_dist): if not self._quiet: logging_verr = [] torch_v_targets = torch.tensor(v_target, dtype=torch.float) for idx in range(len(self._V)): v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float) v_err = F.mse_loss(v_pred, torch_v_targets) logging_verr.append(v_err.item()) logging_ent = self.policy.entropy(x) new_pol_dist = self.policy.distribution(x) logging_kl = torch.mean(torch.distributions.kl.kl_divergence( new_pol_dist, old_pol_dist)) avg_rwd = np.mean(compute_J(dataset)) tqdm.write("Iterations Results:\n\trewards 
{} vf_loss {}\n\tentropy {} kl {}".format( avg_rwd, logging_verr, logging_ent, logging_kl)) tqdm.write( '--------------------------------------------------------------------------------------------------') def _post_load(self): if self._optimizer is not None: update_optimizer_parameters(self._optimizer, list(self.policy.parameters()))
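# --- Usage sketch (not part of the original class) ---------------------------
# How a PPO agent of this form could be set up on a continuous-control task,
# assuming a mushroom_rl-style ecosystem (Gym wrapper, Core, GaussianTorchPolicy,
# TorchApproximator parameter dicts). The network class, environment name and
# hyperparameters are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import GaussianTorchPolicy


class Network(nn.Module):
    # Small MLP used both by the Gaussian policy and by the value function.
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._h2(torch.relu(self._h1(state.float())))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)

policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                             mdp.info.action_space.shape, n_features=32)

actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}
critic_params = dict(network=Network, optimizer=actor_optimizer,
                     loss=F.mse_loss, n_features=32,
                     input_shape=mdp.info.observation_space.shape,
                     output_shape=(1,))

agent = PPO(mdp.info, policy, actor_optimizer, critic_params,
            n_epochs_policy=4, batch_size=64, eps_ppo=.2, lam=.95,
            quiet=False)

core = Core(agent, mdp)
core.learn(n_episodes=50, n_episodes_per_fit=5)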
class SAC(DeepAC): """ Soft Actor-Critic algorithm. "Soft Actor-Critic Algorithms and Applications". Haarnoja T. et al.. 2019. """ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params, actor_optimizer, critic_params, batch_size, initial_replay_size, max_replay_size, warmup_transitions, tau, lr_alpha, target_entropy=None, critic_fit_params=None): """ Constructor. Args: actor_mu_params (dict): parameters of the actor mean approximator to build; actor_sigma_params (dict): parameters of the actor sigm approximator to build; actor_optimizer (dict): parameters to specify the actor optimizer algorithm; critic_params (dict): parameters of the critic approximator to build; batch_size (int): the number of samples in a batch; initial_replay_size (int): the number of samples to collect before starting the learning; max_replay_size (int): the maximum number of samples in the replay memory; warmup_transitions (int): number of samples to accumulate in the replay memory to start the policy fitting; tau (float): value of coefficient for soft updates; lr_alpha (float): Learning rate for the entropy coefficient; target_entropy (float, None): target entropy for the policy, if None a default value is computed ; critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator. """ self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params self._batch_size = batch_size self._warmup_transitions = warmup_transitions self._tau = tau if target_entropy is None: self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32) else: self._target_entropy = target_entropy self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size) if 'n_models' in critic_params.keys(): assert critic_params['n_models'] == 2 else: critic_params['n_models'] = 2 target_critic_params = deepcopy(critic_params) self._critic_approximator = Regressor(TorchApproximator, **critic_params) self._target_critic_approximator = Regressor(TorchApproximator, **target_critic_params) actor_mu_approximator = Regressor(TorchApproximator, **actor_mu_params) actor_sigma_approximator = Regressor(TorchApproximator, **actor_sigma_params) policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator, mdp_info.action_space.low, mdp_info.action_space.high) self._init_target(self._critic_approximator, self._target_critic_approximator) self._log_alpha = torch.tensor(0., dtype=torch.float32) if policy.use_cuda: self._log_alpha = self._log_alpha.cuda().requires_grad_() else: self._log_alpha.requires_grad_() self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha) policy_parameters = chain(actor_mu_approximator.model.network.parameters(), actor_sigma_approximator.model.network.parameters()) self._add_save_attr( _critic_fit_params='pickle', _batch_size='numpy', _warmup_transitions='numpy', _tau='numpy', _target_entropy='numpy', _replay_memory='pickle', _critic_approximator='pickle', _target_critic_approximator='pickle', _log_alpha='pickle', _alpha_optim='pickle' ) super().__init__(mdp_info, policy, actor_optimizer, policy_parameters) def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ = \ self._replay_memory.get(self._batch_size) if self._replay_memory.size > self._warmup_transitions: action_new, log_prob = self.policy.compute_action_and_log_prob_t(state) loss = self._loss(state, action_new, log_prob) self._optimize_actor_parameters(loss) self._update_alpha(log_prob.detach()) 
q_next = self._next_q(next_state, absorbing) q = reward + self.mdp_info.gamma * q_next self._critic_approximator.fit(state, action, q, **self._critic_fit_params) self._update_target(self._critic_approximator, self._target_critic_approximator) def _loss(self, state, action_new, log_prob): q_0 = self._critic_approximator(state, action_new, output_tensor=True, idx=0) q_1 = self._critic_approximator(state, action_new, output_tensor=True, idx=1) q = torch.min(q_0, q_1) return (self._alpha * log_prob - q).mean() def _update_alpha(self, log_prob): alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean() self._alpha_optim.zero_grad() alpha_loss.backward() self._alpha_optim.step() def _next_q(self, next_state, absorbing): """ Args: next_state (np.ndarray): the states where next action has to be evaluated; absorbing (np.ndarray): the absorbing flag for the states in ``next_state``. Returns: Action-values returned by the critic for ``next_state`` and the action returned by the actor. """ a, log_prob_next = self.policy.compute_action_and_log_prob(next_state) q = self._target_critic_approximator.predict( next_state, a, prediction='min') - self._alpha_np * log_prob_next q *= 1 - absorbing return q def _post_load(self): if self._optimizer is not None: self._parameters = list( chain(self.policy._mu_approximator.model.network.parameters(), self.policy._sigma_approximator.model.network.parameters() ) ) @property def _alpha(self): return self._log_alpha.exp() @property def _alpha_np(self): return self._alpha.detach().cpu().numpy()
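# --- Usage sketch (not part of the original class) ---------------------------
# Possible wiring of the SAC agent above, assuming a mushroom_rl-style setup.
# SAC needs separate mean and sigma actor networks plus a twin-Q critic that
# takes the concatenated state-action as input (the class forces n_models=2).
# All names and hyperparameters below are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.core import Core
from mushroom_rl.environments import Gym


class ActorNetwork(nn.Module):
    # Used twice: once for the mean head and once for the sigma head.
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, output_shape[0])

    def forward(self, state):
        return self._h2(torch.relu(self._h1(state.float())))


class CriticNetwork(nn.Module):
    # Q(s, a); the Regressor instantiates two copies of it (n_models=2).
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, output_shape[0])

    def forward(self, state, action):
        sa = torch.cat((state.float(), action.float()), dim=1)
        return torch.squeeze(self._h2(torch.relu(self._h1(sa))))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)
state_dim = mdp.info.observation_space.shape[0]
action_dim = mdp.info.action_space.shape[0]

actor_mu_params = dict(network=ActorNetwork, n_features=64,
                       input_shape=(state_dim,), output_shape=(action_dim,))
actor_sigma_params = dict(network=ActorNetwork, n_features=64,
                          input_shape=(state_dim,), output_shape=(action_dim,))
actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}
critic_params = dict(network=CriticNetwork, n_features=64,
                     optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
                     loss=F.mse_loss,
                     input_shape=(state_dim + action_dim,),
                     output_shape=(1,))

agent = SAC(mdp.info, actor_mu_params, actor_sigma_params, actor_optimizer,
            critic_params, batch_size=64, initial_replay_size=500,
            max_replay_size=50000, warmup_transitions=100, tau=5e-3,
            lr_alpha=3e-4)

core = Core(agent, mdp)
core.learn(n_steps=10000, n_steps_per_fit=1)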
class A2C(DeepAC): """ Advantage Actor Critic algorithm (A2C). Synchronous version of the A3C algorithm. "Asynchronous Methods for Deep Reinforcement Learning". Mnih V. et. al.. 2016. """ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, ent_coeff, max_grad_norm=None, critic_fit_params=None): """ Constructor. Args: policy (TorchPolicy): torch policy to be learned by the algorithm; actor_optimizer (dict): parameters to specify the actor optimizer algorithm; critic_params (dict): parameters of the critic approximator to build; ent_coeff ([float, Parameter], 0): coefficient for the entropy penalty; max_grad_norm (float, None): maximum norm for gradient clipping. If None, no clipping will be performed, unless specified otherwise in actor_optimizer; critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator. """ self._critic_fit_params = dict( ) if critic_fit_params is None else critic_fit_params self._entropy_coeff = to_parameter(ent_coeff) self._V = Regressor(TorchApproximator, **critic_params) if 'clipping' not in actor_optimizer and max_grad_norm is not None: actor_optimizer = deepcopy(actor_optimizer) clipping_params = dict(max_norm=max_grad_norm, norm_type=2) actor_optimizer['clipping'] = dict( method=torch.nn.utils.clip_grad_norm_, params=clipping_params) self._add_save_attr(_critic_fit_params='pickle', _entropy_coeff='mushroom', _V='mushroom') super().__init__(mdp_info, policy, actor_optimizer, policy.parameters()) def fit(self, dataset): state, action, reward, next_state, absorbing, _ = parse_dataset( dataset) v, adv = compute_advantage_montecarlo(self._V, state, next_state, reward, absorbing, self.mdp_info.gamma) self._V.fit(state, v, **self._critic_fit_params) loss = self._loss(state, action, adv) self._optimize_actor_parameters(loss) def _loss(self, state, action, adv): use_cuda = self.policy.use_cuda s = to_float_tensor(state, use_cuda) a = to_float_tensor(action, use_cuda) adv_t = to_float_tensor(adv, use_cuda) gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t) entropy_loss = -self.policy.entropy_t(s) return gradient_loss + self._entropy_coeff() * entropy_loss def _post_load(self): self._update_optimizer_parameters(self.policy.parameters())
import numpy as np
from matplotlib import pyplot as plt

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator

# Generate noisy samples from the line y = 2x + 10
x = np.arange(10).reshape(-1, 1)
intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

# Features: a constant column for the intercept plus the raw input
phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator, input_shape=(2,), output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
# The gradient of a linear model w.r.t. its weights is the feature vector
# itself, so diff is evaluated on a feature point [1, x] rather than on the
# raw input alone
print('Gradient: ' + str(regressor.diff(np.array([1., 5.]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
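# --- Optional sanity check (not part of the original example) ----------------
# The fitted weights should be close to the closed-form ordinary least-squares
# solution w = argmin ||phi w - y||^2, i.e. roughly [intercept, slope] = [10, 2].
# This reuses `phi` and `y` defined in the snippet above.
w_ls = np.linalg.lstsq(phi, y, rcond=None)[0].ravel()
print('Closed-form weights: ' + str(w_ls))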
class TRPO(Agent): """ Trust Region Policy optimization algorithm. "Trust Region Policy Optimization". Schulman J. et al.. 2015. """ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, lam=1., n_epochs_line_search=10, n_epochs_cg=10, cg_damping=1e-2, cg_residual_tol=1e-10, critic_fit_params=None): """ Constructor. Args: policy (TorchPolicy): torch policy to be learned by the algorithm critic_params (dict): parameters of the critic approximator to build; ent_coeff ([float, Parameter], 0): coefficient for the entropy penalty; max_kl ([float, Parameter], .001): maximum kl allowed for every policy update; lam float([float, Parameter], 1.): lambda coefficient used by generalized advantage estimation; n_epochs_line_search ([int, Parameter], 10): maximum number of iterations of the line search algorithm; n_epochs_cg ([int, Parameter], 10): maximum number of iterations of the conjugate gradient algorithm; cg_damping ([float, Parameter], 1e-2): damping factor for the conjugate gradient algorithm; cg_residual_tol ([float, Parameter], 1e-10): conjugate gradient residual tolerance; critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator. """ self._critic_fit_params = dict( n_epochs=5) if critic_fit_params is None else critic_fit_params self._n_epochs_line_search = to_parameter(n_epochs_line_search) self._n_epochs_cg = to_parameter(n_epochs_cg) self._cg_damping = to_parameter(cg_damping) self._cg_residual_tol = to_parameter(cg_residual_tol) self._max_kl = to_parameter(max_kl) self._ent_coeff = to_parameter(ent_coeff) self._lambda = to_parameter(lam) self._V = Regressor(TorchApproximator, **critic_params) self._iter = 1 self._old_policy = None self._add_save_attr(_critic_fit_params='pickle', _n_epochs_line_search='mushroom', _n_epochs_cg='mushroom', _cg_damping='mushroom', _cg_residual_tol='mushroom', _max_kl='mushroom', _ent_coeff='mushroom', _lambda='mushroom', _V='mushroom', _old_policy='mushroom', _iter='primitive') super().__init__(mdp_info, policy, None) def fit(self, dataset): state, action, reward, next_state, absorbing, last = parse_dataset( dataset) x = state.astype(np.float32) u = action.astype(np.float32) r = reward.astype(np.float32) xn = next_state.astype(np.float32) obs = to_float_tensor(x, self.policy.use_cuda) act = to_float_tensor(u, self.policy.use_cuda) v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda()) np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8) adv = to_float_tensor(np_adv, self.policy.use_cuda) # Policy update self._old_policy = deepcopy(self.policy) old_pol_dist = self._old_policy.distribution_t(obs) old_log_prob = self._old_policy.log_prob_t(obs, act).detach() zero_grad(self.policy.parameters()) loss = self._compute_loss(obs, act, adv, old_log_prob) prev_loss = loss.item() # Compute Gradient loss.backward() g = get_gradient(self.policy.parameters()) # Compute direction through conjugate gradient stepdir = self._conjugate_gradient(g, obs, old_pol_dist) # Line search self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss, stepdir) # VF update self._V.fit(x, v_target, **self._critic_fit_params) # Print fit information self._log_info(dataset, x, v_target, old_pol_dist) self._iter += 1 def _fisher_vector_product(self, p, obs, old_pol_dist): p_tensor = torch.from_numpy(p) if self.policy.use_cuda: p_tensor = p_tensor.cuda() return self._fisher_vector_product_t(p_tensor, obs, old_pol_dist) def _fisher_vector_product_t(self, p, obs, 
old_pol_dist): kl = self._compute_kl(obs, old_pol_dist) grads = torch.autograd.grad(kl, self.policy.parameters(), create_graph=True) flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) kl_v = torch.sum(flat_grad_kl * p) grads_v = torch.autograd.grad(kl_v, self.policy.parameters(), create_graph=False) flat_grad_grad_kl = torch.cat( [grad.contiguous().view(-1) for grad in grads_v]).data return flat_grad_grad_kl + p * self._cg_damping() def _conjugate_gradient(self, b, obs, old_pol_dist): p = b.detach().cpu().numpy() r = b.detach().cpu().numpy() x = np.zeros_like(p) r2 = r.dot(r) for i in range(self._n_epochs_cg()): z = self._fisher_vector_product( p, obs, old_pol_dist).detach().cpu().numpy() v = r2 / p.dot(z) x += v * p r -= v * z r2_new = r.dot(r) mu = r2_new / r2 p = r + mu * p r2 = r2_new if r2 < self._cg_residual_tol(): break return x def _line_search(self, obs, act, adv, old_log_prob, old_pol_dist, prev_loss, stepdir): # Compute optimal step size direction = self._fisher_vector_product( stepdir, obs, old_pol_dist).detach().cpu().numpy() shs = .5 * stepdir.dot(direction) lm = np.sqrt(shs / self._max_kl()) full_step = stepdir / lm stepsize = 1. # Save old policy parameters theta_old = self.policy.get_weights() # Perform Line search violation = True for _ in range(self._n_epochs_line_search()): theta_new = theta_old + full_step * stepsize self.policy.set_weights(theta_new) new_loss = self._compute_loss(obs, act, adv, old_log_prob) kl = self._compute_kl(obs, old_pol_dist) improve = new_loss - prev_loss if kl <= self._max_kl.get_value() * 1.5 and improve >= 0: violation = False break stepsize *= .5 if violation: self.policy.set_weights(theta_old) def _compute_kl(self, obs, old_pol_dist): new_pol_dist = self.policy.distribution_t(obs) return torch.mean( torch.distributions.kl.kl_divergence(old_pol_dist, new_pol_dist)) def _compute_loss(self, obs, act, adv, old_log_prob): ratio = torch.exp(self.policy.log_prob_t(obs, act) - old_log_prob) J = torch.mean(ratio * adv) return J + self._ent_coeff() * self.policy.entropy_t(obs) def _log_info(self, dataset, x, v_target, old_pol_dist): if self._logger: logging_verr = [] torch_v_targets = torch.tensor(v_target, dtype=torch.float) for idx in range(len(self._V)): v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float) v_err = F.mse_loss(v_pred, torch_v_targets) logging_verr.append(v_err.item()) logging_ent = self.policy.entropy(x) new_pol_dist = self.policy.distribution(x) logging_kl = torch.mean( torch.distributions.kl.kl_divergence(old_pol_dist, new_pol_dist)) avg_rwd = np.mean(compute_J(dataset)) msg = "Iteration {}:\n\t\t\t\trewards {} vf_loss {}\n\t\t\t\tentropy {} kl {}".format( self._iter, avg_rwd, logging_verr, logging_ent, logging_kl) self._logger.info(msg) self._logger.weak_line()
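# --- Usage sketch (not part of the original class) ---------------------------
# TRPO takes the policy and critic in the same form as the PPO sketch above but
# has no actor optimizer: the step is computed inside fit() via conjugate
# gradient and line search. mushroom_rl-style names, the network class and the
# hyperparameters are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import GaussianTorchPolicy


class Network(nn.Module):
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._h2(torch.relu(self._h1(state.float())))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)

policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                             mdp.info.action_space.shape, n_features=32)

critic_params = dict(network=Network,
                     optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
                     loss=F.mse_loss, n_features=32,
                     input_shape=mdp.info.observation_space.shape,
                     output_shape=(1,))

agent = TRPO(mdp.info, policy, critic_params, max_kl=1e-2, lam=.95,
             n_epochs_cg=10, cg_damping=1e-2)

core = Core(agent, mdp)
core.learn(n_episodes=50, n_episodes_per_fit=5)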
class DDPG(DeepAC):
    def __init__(self, mdp_info, policy_class, policy_params, actor_params,
                 actor_optimizer, critic_params, batch_size, replay_memory,
                 tau, optimization_steps, comm, policy_delay=1,
                 critic_fit_params=None):
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._optimization_steps = optimization_steps
        self._comm = comm
        self._policy_delay = policy_delay
        self._fit_count = 0

        if comm.Get_rank() == 0:
            self._replay_memory = replay_memory

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        self._add_save_attr(_critic_fit_params='pickle',
                            _batch_size='numpy',
                            _tau='numpy',
                            _policy_delay='numpy',
                            _fit_count='numpy',
                            _replay_memory='pickle',
                            _critic_approximator='pickle',
                            _target_critic_approximator='pickle',
                            _actor_approximator='pickle',
                            _target_actor_approximator='pickle')

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        if self._comm.Get_rank() == 0:
            for i in range(1, self._comm.Get_size()):
                dataset += self._comm.recv(source=i)
            self._replay_memory.add(dataset)
            self._comm.Barrier()
        else:
            self._comm.send(dataset, dest=0)
            self._comm.Barrier()

        for _ in range(self._optimization_steps):
            if self._comm.Get_rank() == 0:
                state, action, reward, next_state =\
                    self._replay_memory.get(
                        self._batch_size * self._comm.Get_size())
            else:
                state = None
                action = None
                reward = None
                next_state = None

            state, action, reward, next_state = self._comm.bcast(
                [state, action, reward, next_state], root=0)

            start = self._batch_size * self._comm.Get_rank()
            stop = start + self._batch_size

            state = state[start:stop]
            action = action[start:stop]
            reward = reward[start:stop]
            next_state = next_state[start:stop]

            q_next = self._next_q(next_state)
            q = reward + self.mdp_info.gamma * q_next
            q = np.clip(q, -1 / (1 - self.mdp_info.gamma), 0)

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._fit_count += 1

        self._update_target(self._critic_approximator,
                            self._target_critic_approximator)
        self._update_target(self._actor_approximator,
                            self._target_actor_approximator)

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True,
                                          scaled=False)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean() + (action**2).mean()

    def _next_q(self, next_state):
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)

        return q

    def _post_load(self):
        if self._optimizer is not None:
            self._parameters = list(
                self._actor_approximator.model.network.parameters())

    def draw_action(self, state):
        state = np.append(state['observation'], state['desired_goal'])

        if self._comm.Get_rank() == 0:
            mu = self._replay_memory._mu
            sigma2 = self._replay_memory._sigma2
        else:
            mu = None
            sigma2 = None
        mu, sigma2 = self._comm.bcast([mu, sigma2], root=0)

        if not np.any(sigma2 == 0):
            state = normalize_and_clip(state, mu, sigma2)

        return self.policy.draw_action(state)
class DDPG(Agent):
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 n_actions_per_head, history_length=1, n_input_per_mdp=None,
                 n_games=1, dtype=np.uint8):
        self._batch_size = batch_size
        self._n_games = n_games
        if n_input_per_mdp is None:
            self._n_input_per_mdp = [mdp_info.observation_space.shape
                                     for _ in range(self._n_games)]
        else:
            self._n_input_per_mdp = n_input_per_mdp
        self._n_actions_per_head = n_actions_per_head
        self._max_actions = max(n_actions_per_head)[0]
        self._history_length = history_length
        self._tau = tau

        self._replay_memory = [
            ReplayMemory(initial_replay_size, max_replay_size)
            for _ in range(self._n_games)
        ]

        self._n_updates = 0

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             n_fit_targets=2, **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    n_fit_targets=2,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)

        super().__init__(mdp_info, policy)

        n_samples = self._batch_size * self._n_games
        # np.int was removed in NumPy 1.24; the builtin int is equivalent here
        self._state_idxs = np.zeros(n_samples, dtype=int)
        self._state = np.zeros(
            ((n_samples, self._history_length) + self.mdp_info.observation_space.shape),
            dtype=dtype).squeeze()
        self._action = np.zeros((n_samples, self._max_actions))
        self._reward = np.zeros(n_samples)
        self._next_state_idxs = np.zeros(n_samples, dtype=int)
        self._next_state = np.zeros(
            ((n_samples, self._history_length) + self.mdp_info.observation_space.shape),
            dtype=dtype).squeeze()
        self._absorbing = np.zeros(n_samples)

    def fit(self, dataset):
        s = np.array([d[0][0] for d in dataset]).ravel()
        games = np.unique(s)
        for g in games:
            idxs = np.argwhere(s == g).ravel()
            d = list()
            for idx in idxs:
                d.append(dataset[idx])

            self._replay_memory[g].add(d)

        fit_condition = np.all([rm.initialized for rm in self._replay_memory])

        if fit_condition:
            for i in range(len(self._replay_memory)):
                game_state, game_action, game_reward, game_next_state,\
                    game_absorbing, _ = self._replay_memory[i].get(
                        self._batch_size)

                start = self._batch_size * i
                stop = start + self._batch_size

                self._state_idxs[start:stop] = np.ones(self._batch_size) * i
                self._state[start:stop, :self._n_input_per_mdp[i][0]] = game_state
                self._action[start:stop, :self._n_actions_per_head[i][0]] = game_action
                self._reward[start:stop] = game_reward
                self._next_state_idxs[start:stop] = np.ones(self._batch_size) * i
                self._next_state[start:stop, :self._n_input_per_mdp[i][0]] = game_next_state
                self._absorbing[start:stop] = game_absorbing

            q_next = self._next_q()
            q = self._reward + q_next

            self._critic_approximator.fit(self._state, self._action, q,
                                          idx=self._state_idxs)
            self._actor_approximator.fit(self._state, self._state,
                                         self._state_idxs,
                                         idx=self._state_idxs)

            self._n_updates += 1

            self._update_target()

    def get_shared_weights(self):
        cw = self._critic_approximator.model.network.get_shared_weights()
        aw = self._actor_approximator.model.network.get_shared_weights()

        return [cw, aw]

    def set_shared_weights(self, weights):
        self._critic_approximator.model.network.set_shared_weights(weights[0])
        self._actor_approximator.model.network.set_shared_weights(weights[1])

    def freeze_shared_weights(self):
        self._critic_approximator.model.network.freeze_shared_weights()
        self._actor_approximator.model.network.freeze_shared_weights()

    def unfreeze_shared_weights(self):
        self._critic_approximator.model.network.unfreeze_shared_weights()
        self._actor_approximator.model.network.unfreeze_shared_weights()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self):
        a = self._target_actor_approximator(self._next_state,
                                            idx=self._next_state_idxs)
        q = self._target_critic_approximator(self._next_state, a,
                                             idx=self._next_state_idxs).ravel()

        out_q = np.zeros(self._batch_size * self._n_games)
        for i in range(self._n_games):
            start = self._batch_size * i
            stop = start + self._batch_size

            out_q[start:stop] = q[start:stop] * self.mdp_info.gamma[i]
            if np.any(self._absorbing[start:stop]):
                out_q[start:stop] = out_q[start:stop] * (
                    1 - self._absorbing[start:stop])

        return out_q
class OptionSAC(DeepOptionAC):
    def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, warmup_transitions, tau,
                 lr_alpha, rarhmm: rARHMM, target_entropy=None,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_mu_params (dict): parameters of the actor mean approximator to build;
            actor_sigma_params (dict): parameters of the actor sigma approximator to build;
            actor_optimizer (dict): parameters to specify the actor optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before starting the learning;
            max_replay_size (int): the maximum number of samples in the replay memory;
            warmup_transitions (int): number of samples to accumulate in the replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            rarhmm (rARHMM): rARHMM model whose hidden states define the options, one actor per state;
            target_entropy (float, None): target entropy for the policy, if None a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator.

        """
        self.rarhmm = rarhmm

        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = OptionReplayMemory(initial_replay_size, max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        actor_mu_approximator = [Regressor(TorchApproximator, **actor_mu_params)
                                 for _ in range(rarhmm.nb_states)]
        actor_sigma_approximator = [Regressor(TorchApproximator, **actor_sigma_params)
                                    for _ in range(rarhmm.nb_states)]

        policy = [SACPolicy(actor_mu_approximator[o],
                            actor_sigma_approximator[o],
                            mdp_info.action_space.low,
                            mdp_info.action_space.high)
                  for o in range(rarhmm.nb_states)]

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)

        self._log_alpha = torch.tensor(0., dtype=torch.float32)

        if policy[0].use_cuda:
            self._log_alpha = self._log_alpha.cuda().requires_grad_()
        else:
            self._log_alpha.requires_grad_()

        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        policy_parameters = [chain(actor_mu_approximator[o].model.network.parameters(),
                                   actor_sigma_approximator[o].model.network.parameters())
                             for o in range(rarhmm.nb_states)]

        self._add_save_attr(
            _critic_fit_params='pickle',
            _batch_size='numpy',
            _warmup_transitions='numpy',
            _tau='numpy',
            _target_entropy='numpy',
            _replay_memory='pickle',
            _critic_approximator='pickle',
            _target_critic_approximator='pickle',
            _log_alpha='pickle',
            _alpha_optim='pickle'
        )

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters,
                         rarhmm.nb_states)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, option, option_weight = \
                self._replay_memory.get(self._batch_size)

            if self._replay_memory.size > self._warmup_transitions:
                action_new = torch.empty((0, self.mdp_info.action_space.shape[0]))
                log_prob = torch.empty((0))
                for o in range(self.rarhmm.nb_states):
                    selection = (o == option)
                    _action_new, _log_prob = self.policy[o].compute_action_and_log_prob_t(state[selection])
                    action_new = torch.cat([action_new, _action_new])  # TODO: check if stacking is correct
                    log_prob = torch.cat([log_prob, _log_prob])

                    loss = self._loss(state[selection], _action_new, _log_prob)
                    self._optimize_actor_parameters(loss, o)  # TODO: Look at global loss

                self._update_alpha(log_prob.detach())

            q_next = self._next_q(next_state, absorbing, option)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)

    def _loss(self, state, action_new, log_prob):
        q_0 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=0)
        q_1 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=1)

        q = torch.min(q_0, q_1)

        return (self._alpha * log_prob - q).mean()

    def _update_alpha(self, log_prob):
        alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()

    def _next_q(self, next_state, absorbing, option=None):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the action returned by the actor.

        """
        # TODO: submit options batch
        # a = np.empty((0, self.mdp_info.action_space.shape[0]))
        # log_prob_next = np.empty((0))
        # for o in range(self.n_options):
        #     selection = (o == option)
        #     _a, _log_prob_next = self.policy[o].compute_action_and_log_prob(next_state[selection])
        #     a = np.vstack([a, _a])
        #     log_prob_next = np.hstack([log_prob_next, _log_prob_next])

        a = 0
        log_prob_next = 0
        for o in range(self.n_options):
            _a, _log_prob_next = self.policy[o].compute_action_and_log_prob(next_state)
            a += _a
            log_prob_next += _log_prob_next
        a /= self.n_options
        log_prob_next /= self.n_options

        q = self._target_critic_approximator.predict(
            next_state, a, prediction='min') - self._alpha_np * log_prob_next
        q *= 1 - absorbing

        return q

    def _post_load(self):
        if self._optimizer is not None:
            self._parameters = list(
                chain(self.policy._mu_approximator.model.network.parameters(),
                      self.policy._sigma_approximator.model.network.parameters()
                      )
            )

    @property
    def _alpha(self):
        return self._log_alpha.exp()

    @property
    def _alpha_np(self):
        return self._alpha.detach().cpu().numpy()