def __init__(self, env, policy, value_function, inner_lr=_Default(1e-2), outer_lr=1e-3, max_kl_step=0.01, max_path_length=500, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy', meta_batch_size=40, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): policy_optimizer = OptimizerWrapper( (torch.optim.Adam, dict(lr=inner_lr)), policy) vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, value_function, policy_optimizer=policy_optimizer, vf_optimizer=vf_optimizer, max_path_length=max_path_length, num_train_per_epoch=1, discount=discount, gae_lambda=gae_lambda, center_adv=center_adv, positive_adv=positive_adv, policy_ent_coeff=policy_ent_coeff, use_softplus_entropy=use_softplus_entropy, stop_entropy_gradient=stop_entropy_gradient, entropy_method=entropy_method) meta_optimizer = (ConjugateGradientOptimizer, dict(max_constraint_value=max_kl_step)) super().__init__(inner_algo=inner_algo, env=env, policy=policy, meta_optimizer=meta_optimizer, meta_batch_size=meta_batch_size, inner_lr=inner_lr, outer_lr=outer_lr, num_grad_updates=num_grad_updates, meta_evaluator=meta_evaluator, evaluate_every_n_epochs=evaluate_every_n_epochs)
def __init__( self, env_spec, policy, value_function, policy_optimizer=None, vf_optimizer=None, max_path_length=500, num_train_per_epoch=1, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy', ): self.discount = discount self.policy = policy self.max_path_length = max_path_length self._value_function = value_function self._gae_lambda = gae_lambda self._center_adv = center_adv self._positive_adv = positive_adv self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient self._entropy_method = entropy_method self._n_samples = num_train_per_epoch self._env_spec = env_spec self._maximum_entropy = (entropy_method == 'max') self._entropy_regularzied = (entropy_method == 'regularized') self._check_entropy_configuration(entropy_method, center_adv, stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) self.sampler_cls = OnPolicyVectorizedSampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, value_function) self._old_policy = copy.deepcopy(self.policy)
def ppo_metarl_pytorch(ctxt, env_id, seed): """Create metarl PyTorch PPO model and training. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) runner = LocalRunner(ctxt) env = MetaRLEnv(normalize(gym.make(env_id))) policy = PyTorch_GMP(env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, minibatch_size=64) algo = PyTorch_PPO(env_spec=env.spec, policy=policy, value_function=value_function, policy_optimizer=policy_optimizer, vf_optimizer=vf_optimizer, max_path_length=hyper_parameters['max_path_length'], discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2) runner.setup(algo, env) runner.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def __init__(self, env_spec, policy, value_function, policy_optimizer=None, vf_optimizer=None, max_path_length=500, lr_clip_range=2e-1, num_train_per_epoch=1, discount=0.99, gae_lambda=0.97, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy'): if policy_optimizer is None: policy_optimizer = OptimizerWrapper( (torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) if vf_optimizer is None: vf_optimizer = OptimizerWrapper( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, minibatch_size=64) super().__init__(env_spec=env_spec, policy=policy, value_function=value_function, policy_optimizer=policy_optimizer, vf_optimizer=vf_optimizer, max_path_length=max_path_length, num_train_per_epoch=num_train_per_epoch, discount=discount, gae_lambda=gae_lambda, center_adv=center_adv, positive_adv=positive_adv, policy_ent_coeff=policy_ent_coeff, use_softplus_entropy=use_softplus_entropy, stop_entropy_gradient=stop_entropy_gradient, entropy_method=entropy_method) self._lr_clip_range = lr_clip_range
class VPG(RLAlgorithm): """Vanilla Policy Gradient (REINFORCE). VPG, also known as Reinforce, trains stochastic policy in an on-policy way. Args: env_spec (metarl.envs.EnvSpec): Environment specification. policy (metarl.torch.policies.Policy): Policy. value_function (metarl.torch.value_functions.ValueFunction): The value function. policy_optimizer (metarl.torch.optimizer.OptimizerWrapper): Optimizer for policy. vf_optimizer (metarl.torch.optimizer.OptimizerWrapper): Optimizer for value function. max_path_length (int): Maximum length of a single rollout. num_train_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. gae_lambda (float): Lambda used for generalized advantage estimation. center_adv (bool): Whether to rescale the advantages so that they have mean 0 and standard deviation 1. positive_adv (bool): Whether to shift the advantages so that they are always positive. When used in conjunction with center_adv the advantages will be standardized before shifting. policy_ent_coeff (float): The coefficient of the policy entropy. Setting it to zero would mean no entropy regularization. use_softplus_entropy (bool): Whether to estimate the softmax distribution of the entropy to prevent the entropy from being negative. stop_entropy_gradient (bool): Whether to stop the entropy gradient. entropy_method (str): A string from: 'max', 'regularized', 'no_entropy'. The type of entropy method to use. 'max' adds the dense entropy to the reward for each time step. 'regularized' adds the mean entropy to the surrogate objective. See https://arxiv.org/abs/1805.00909 for more details. """ def __init__( self, env_spec, policy, value_function, policy_optimizer=None, vf_optimizer=None, max_path_length=500, num_train_per_epoch=1, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, use_softplus_entropy=False, stop_entropy_gradient=False, entropy_method='no_entropy', ): self.discount = discount self.policy = policy self.max_path_length = max_path_length self._value_function = value_function self._gae_lambda = gae_lambda self._center_adv = center_adv self._positive_adv = positive_adv self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient self._entropy_method = entropy_method self._n_samples = num_train_per_epoch self._env_spec = env_spec self._maximum_entropy = (entropy_method == 'max') self._entropy_regularzied = (entropy_method == 'regularized') self._check_entropy_configuration(entropy_method, center_adv, stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) self.sampler_cls = OnPolicyVectorizedSampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, value_function) self._old_policy = copy.deepcopy(self.policy) @staticmethod def _check_entropy_configuration(entropy_method, center_adv, stop_entropy_gradient, policy_ent_coeff): if entropy_method not in ('max', 'regularized', 'no_entropy'): raise ValueError('Invalid entropy_method') if entropy_method == 'max': if center_adv: raise ValueError('center_adv should be False when ' 'entropy_method is max') if not stop_entropy_gradient: raise ValueError('stop_gradient should be True when ' 'entropy_method is max') if entropy_method == 'no_entropy': if policy_ent_coeff != 0.0: raise ValueError('policy_ent_coeff should be zero ' 'when there is no entropy method') def train_once(self, itr, paths): """Train the algorithm once. Args: itr (int): Iteration number. paths (list[dict]): A list of collected paths. Returns: numpy.float64: Calculated mean value of undiscounted returns. """ obs, actions, rewards, returns, valids, baselines = \ self.process_samples(paths) if self._maximum_entropy: policy_entropies = self._compute_policy_entropy(obs) rewards += self._policy_ent_coeff * policy_entropies obs_flat = torch.cat(filter_valids(obs, valids)) actions_flat = torch.cat(filter_valids(actions, valids)) rewards_flat = torch.cat(filter_valids(rewards, valids)) returns_flat = torch.cat(filter_valids(returns, valids)) advs_flat = self._compute_advantage(rewards, valids, baselines) with torch.no_grad(): policy_loss_before = self._compute_loss_with_adv( obs_flat, actions_flat, rewards_flat, advs_flat) vf_loss_before = self._value_function.compute_loss( obs_flat, returns_flat) kl_before = self._compute_kl_constraint(obs) self._train(obs_flat, actions_flat, rewards_flat, returns_flat, advs_flat) with torch.no_grad(): policy_loss_after = self._compute_loss_with_adv( obs_flat, actions_flat, rewards_flat, advs_flat) vf_loss_after = self._value_function.compute_loss( obs_flat, returns_flat) kl_after = self._compute_kl_constraint(obs) policy_entropy = self._compute_policy_entropy(obs) with tabular.prefix(self.policy.name): tabular.record('/LossBefore', policy_loss_before.item()) tabular.record('/LossAfter', policy_loss_after.item()) tabular.record('/dLoss', (policy_loss_before - policy_loss_after).item()) tabular.record('/KLBefore', kl_before.item()) tabular.record('/KL', kl_after.item()) tabular.record('/Entropy', policy_entropy.mean().item()) with tabular.prefix(self._value_function.name): tabular.record('/LossBefore', vf_loss_before.item()) tabular.record('/LossAfter', vf_loss_after.item()) tabular.record('/dLoss', vf_loss_before.item() - vf_loss_after.item()) self._old_policy.load_state_dict(self.policy.state_dict()) undiscounted_returns = log_performance( itr, TrajectoryBatch.from_trajectory_list(self._env_spec, paths), discount=self.discount) return np.mean(undiscounted_returns) def train(self, runner): """Obtain samplers and start actual training for each epoch. Args: runner (LocalRunner): LocalRunner is passed to give algorithm the access to runner.step_epochs(), which provides services such as snapshotting and sampler control. Returns: float: The average return in last epoch cycle. """ last_return = None for _ in runner.step_epochs(): for _ in range(self._n_samples): runner.step_path = runner.obtain_samples(runner.step_itr) last_return = self.train_once(runner.step_itr, runner.step_path) runner.step_itr += 1 return last_return def _train(self, obs, actions, rewards, returns, advs): r"""Train the policy and value function with minibatch. Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. advs (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. """ for dataset in self._policy_optimizer.get_minibatch( obs, actions, rewards, advs): self._train_policy(*dataset) for dataset in self._vf_optimizer.get_minibatch(obs, returns): self._train_value_function(*dataset) def _train_policy(self, obs, actions, rewards, advantages): r"""Train the policy. Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). """ self._policy_optimizer.zero_grad() loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) loss.backward() self._policy_optimizer.step() return loss def _train_value_function(self, obs, returns): r"""Train the value function. Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. Returns: torch.Tensor: Calculated mean scalar value of value function loss (float). """ self._vf_optimizer.zero_grad() loss = self._value_function.compute_loss(obs, returns) loss.backward() self._vf_optimizer.step() return loss def _compute_loss(self, obs, actions, rewards, valids, baselines): r"""Compute mean value of loss. Notes: P is the maximum path length (self.max_path_length) Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, P, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`. valids (list[int]): Numbers of valid steps in each paths baselines (torch.Tensor): Value function estimation at each step with shape :math:`(N, P)`. Returns: torch.Tensor: Calculated negative mean scalar value of objective (float). """ obs_flat = torch.cat(filter_valids(obs, valids)) actions_flat = torch.cat(filter_valids(actions, valids)) rewards_flat = torch.cat(filter_valids(rewards, valids)) advantages_flat = self._compute_advantage(rewards, valids, baselines) return self._compute_loss_with_adv(obs_flat, actions_flat, rewards_flat, advantages_flat) def _compute_loss_with_adv(self, obs, actions, rewards, advantages): r"""Compute mean value of loss. Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N \dot [T], )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. Returns: torch.Tensor: Calculated negative mean scalar value of objective. """ objectives = self._compute_objective(advantages, obs, actions, rewards) if self._entropy_regularzied: policy_entropies = self._compute_policy_entropy(obs) objectives += self._policy_ent_coeff * policy_entropies return -objectives.mean() def _compute_advantage(self, rewards, valids, baselines): r"""Compute mean value of loss. Notes: P is the maximum path length (self.max_path_length) Args: rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`. valids (list[int]): Numbers of valid steps in each paths baselines (torch.Tensor): Value function estimation at each step with shape :math:`(N, P)`. Returns: torch.Tensor: Calculated advantage values given rewards and baselines with shape :math:`(N \dot [T], )`. """ advantages = compute_advantages(self.discount, self._gae_lambda, self.max_path_length, baselines, rewards) advantage_flat = torch.cat(filter_valids(advantages, valids)) if self._center_adv: means = advantage_flat.mean() variance = advantage_flat.var() advantage_flat = (advantage_flat - means) / (variance + 1e-8) if self._positive_adv: advantage_flat -= advantage_flat.min() return advantage_flat def _compute_kl_constraint(self, obs): r"""Compute KL divergence. Compute the KL divergence between the old policy distribution and current policy distribution. Notes: P is the maximum path length (self.max_path_length) Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. Returns: torch.Tensor: Calculated mean scalar value of KL divergence (float). """ with torch.no_grad(): old_dist = self._old_policy(obs) new_dist = self.policy(obs) kl_constraint = torch.distributions.kl.kl_divergence( old_dist, new_dist) return kl_constraint.mean() def _compute_policy_entropy(self, obs): r"""Compute entropy value of probability distribution. Notes: P is the maximum path length (self.max_path_length) Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. Returns: torch.Tensor: Calculated entropy values given observation with shape :math:`(N, P)`. """ if self._stop_entropy_gradient: with torch.no_grad(): policy_entropy = self.policy.entropy(obs) else: policy_entropy = self.policy.entropy(obs) # This prevents entropy from becoming negative for small policy std if self._use_softplus_entropy: policy_entropy = F.softplus(policy_entropy) return policy_entropy def _compute_objective(self, advantages, obs, actions, rewards): r"""Compute objective value. Args: advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. obs (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N \dot [T], )`. Returns: torch.Tensor: Calculated objective values with shape :math:`(N \dot [T], )`. """ del rewards log_likelihoods = self.policy.log_likelihood(obs, actions) return log_likelihoods * advantages def process_samples(self, paths): r"""Process sample data based on the collected paths. Notes: P is the maximum path length (self.max_path_length) Args: paths (list[dict]): A list of collected paths Returns: torch.Tensor: The observations of the environment with shape :math:`(N, P, O*)`. torch.Tensor: The actions fed to the environment with shape :math:`(N, P, A*)`. torch.Tensor: The acquired rewards with shape :math:`(N, P)`. list[int]: Numbers of valid steps in each paths. torch.Tensor: Value function estimation at each step with shape :math:`(N, P)`. """ valids = torch.Tensor([len(path['actions']) for path in paths]).int() obs = torch.stack([ pad_to_last(path['observations'], total_length=self.max_path_length, axis=0) for path in paths ]) actions = torch.stack([ pad_to_last(path['actions'], total_length=self.max_path_length, axis=0) for path in paths ]) rewards = torch.stack([ pad_to_last(path['rewards'], total_length=self.max_path_length) for path in paths ]) returns = torch.stack([ pad_to_last(tu.discount_cumsum(path['rewards'], self.discount).copy(), total_length=self.max_path_length) for path in paths ]) with torch.no_grad(): baselines = self._value_function(obs) return obs, actions, rewards, returns, valids, baselines