class EpsGreedy(nn.Module):
    """ Epsilon-greedy action selection """

    def __init__(self, epsilon: typing.Union[Schedule, float], environment):
        super().__init__()

        if isinstance(epsilon, Schedule):
            self.epsilon_schedule = epsilon
        else:
            self.epsilon_schedule = ConstantSchedule(epsilon)

        self.action_space = environment.action_space

    def forward(self, actions, batch_info=None):
        if batch_info is None:
            # Just take the final schedule value if there is no batch info
            epsilon = self.epsilon_schedule.value(1.0)
        else:
            epsilon = self.epsilon_schedule.value(batch_info['progress'])

        random_samples = torch.randint_like(actions, self.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)

        # Actions with exploration noise applied
        noisy_actions = torch.where(selector > epsilon, actions, random_samples)

        return noisy_actions

    def reset_training_state(self, dones, batch_info):
        """ A hook for the model to react when an episode finishes during training """
        pass
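# A minimal usage sketch (not part of the original source). The SimpleNamespace "environment"
# below is an illustrative assumption; any object exposing a discrete `action_space.n` works.
# It also assumes EpsGreedy and its Schedule/ConstantSchedule dependencies are importable.
import torch
from types import SimpleNamespace

fake_env = SimpleNamespace(action_space=SimpleNamespace(n=4))  # hypothetical 4-action env
eps_greedy = EpsGreedy(epsilon=0.1, environment=fake_env)      # float gets wrapped in ConstantSchedule

greedy_actions = torch.tensor([0, 3, 1, 2])                    # e.g. argmax over Q-values
noisy_actions = eps_greedy(greedy_actions, batch_info={'progress': 0.5})
# Each action is kept with probability ~0.9 and replaced by a uniformly random action otherwise.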
class PpoPolicyGradient(OptimizerAlgoBase):
    """ Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 """

    def __init__(self, entropy_coefficient, value_coefficient, cliprange, max_grad_norm):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange

    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        observations = rollout['observations']
        returns = rollout['returns']
        advantages = rollout['advantages']
        rollout_values = rollout['values']
        rollout_actions = rollout['actions']
        rollout_logprobs = rollout['logprobs']

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Normalize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # PART 0 - model evaluation
        eval_action_pd_params, eval_value_outputs = model(observations)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(model.entropy(eval_action_pd_params))

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            eval_value_outputs - rollout_values, -current_cliprange, current_cliprange
        )
        value_loss_part1 = (eval_value_outputs - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        eval_logprobs = model.logprob(rollout_actions, eval_action_pd_params)
        ratio = torch.exp(eval_logprobs - rollout_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange
        )
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (
            policy_loss
            - self.entropy_coefficient * policy_entropy
            + self.value_coefficient * value_loss
        )
        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean((eval_logprobs - rollout_logprobs) ** 2)
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(dtype=torch.float)
            )

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }

    def metrics(self) -> list:
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("policy_loss"),
            AveragingNamedMetric("value_loss"),
            AveragingNamedMetric("policy_entropy"),
            AveragingNamedMetric("approx_kl_divergence"),
            AveragingNamedMetric("clip_fraction"),
            AveragingNamedMetric("grad_norm"),
            AveragingNamedMetric("advantage_norm"),
            AveragingNamedMetric("explained_variance")
        ]
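# For reference (not part of the original source): the clipped surrogate objective from the
# PPO paper that PART 3 above implements. With probability ratio
# r_t(theta) = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t) and advantage estimate A_t:
#
#   L_CLIP(theta) = E_t[ min( r_t(theta) * A_t,
#                             clip(r_t(theta), 1 - eps, 1 + eps) * A_t ) ]
#
# The code minimises the negative of this objective, which is why it takes torch.max of the
# two negated terms.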
class PpoPolicyGradient(OptimizerAlgoBase):
    """ Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 """

    def __init__(self, entropy_coefficient, value_coefficient, cliprange, max_grad_norm,
                 discount_factor: float, normalize_advantage: bool = True, gae_lambda: float = 1.0):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient
        self.normalize_advantage = normalize_advantage

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange

        self.gae_lambda = gae_lambda
        self.discount_factor = discount_factor

    def process_rollout(self, batch_info, rollout: Rollout):
        """ Process rollout for ALGO before any chunking/shuffling """
        assert isinstance(rollout, Trajectories), "PPO requires trajectory rollouts"

        advantages = discount_bootstrap_gae(
            rewards_buffer=rollout.transition_tensors['rewards'],
            dones_buffer=rollout.transition_tensors['dones'],
            values_buffer=rollout.transition_tensors['values'],
            final_values=rollout.rollout_tensors['final_values'],
            discount_factor=self.discount_factor,
            gae_lambda=self.gae_lambda,
            number_of_steps=rollout.num_steps
        )

        returns = advantages + rollout.transition_tensors['values']

        rollout.transition_tensors['advantages'] = advantages
        rollout.transition_tensors['returns'] = returns

        return rollout

    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        evaluator = model.evaluate(rollout)

        # PART 0.0 - rollout values
        advantages = evaluator.get('rollout:advantages')
        rollout_values = evaluator.get('rollout:values')
        rollout_action_logprobs = evaluator.get('rollout:action:logprobs')
        returns = evaluator.get('rollout:returns')

        # PART 0.1 - model evaluation
        entropy = evaluator.get('model:entropy')
        model_values = evaluator.get('model:values')
        model_action_logprobs = evaluator.get('model:action:logprobs')

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Normalize the advantages?
        if self.normalize_advantage:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(entropy)

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            model_values - rollout_values, -current_cliprange, current_cliprange
        )
        value_loss_part1 = (model_values - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        ratio = torch.exp(model_action_logprobs - rollout_action_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange
        )
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (
            policy_loss
            - self.entropy_coefficient * policy_entropy
            + self.value_coefficient * value_loss
        )
        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean(
                (model_action_logprobs - rollout_action_logprobs).pow(2)
            )
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(dtype=torch.float)
            )

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }

    def metrics(self) -> list:
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("policy_loss"),
            AveragingNamedMetric("value_loss"),
            AveragingNamedMetric("policy_entropy"),
            AveragingNamedMetric("approx_kl_divergence"),
            AveragingNamedMetric("clip_fraction"),
            AveragingNamedMetric("grad_norm"),
            AveragingNamedMetric("advantage_norm"),
            AveragingNamedMetric("explained_variance")
        ]
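# For reference (not part of the original source): discount_bootstrap_gae is expected to compute
# standard generalized advantage estimation (Schulman et al., 2015), bootstrapping the end of the
# rollout from final_values, i.e. for each step t:
#
#   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
#
# so `returns = advantages + values` recovers the lambda-return targets used by the value loss.
#
# A hedged construction sketch; the hyperparameter values below are commonly used PPO defaults,
# not values prescribed by this codebase.
algo = PpoPolicyGradient(
    entropy_coefficient=0.01,   # weight of the entropy bonus
    value_coefficient=0.5,      # weight of the clipped value loss
    cliprange=0.2,              # float gets wrapped in ConstantSchedule
    max_grad_norm=0.5,          # gradient norm clipping threshold
    discount_factor=0.99,       # gamma
    gae_lambda=0.95,            # lambda for generalized advantage estimation
    normalize_advantage=True,
)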