Example #1
import typing

import torch
import torch.nn as nn

# Schedule and ConstantSchedule are assumed to be provided by the surrounding library


class EpsGreedy(nn.Module):
    """ Epsilon-greedy action selection """
    def __init__(self, epsilon: typing.Union[Schedule, float], environment):
        super().__init__()

        if isinstance(epsilon, Schedule):
            self.epsilon_schedule = epsilon
        else:
            self.epsilon_schedule = ConstantSchedule(epsilon)

        self.action_space = environment.action_space

    def forward(self, actions, batch_info=None):
        if batch_info is None:
            # Just take final value if there is no batch info
            epsilon = self.epsilon_schedule.value(1.0)
        else:
            epsilon = self.epsilon_schedule.value(batch_info['progress'])

        random_samples = torch.randint_like(actions, self.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)

        # Actions with noise applied
        noisy_actions = torch.where(selector > epsilon, actions,
                                    random_samples)

        return noisy_actions

    def reset_training_state(self, dones, batch_info):
        """ A hook for a model to react when during training episode is finished """
        pass
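
The forward pass above boils down to: with probability epsilon, replace the greedy action with a uniformly random one. A self-contained sketch of just that selection step on plain tensors, with a hard-coded epsilon and action count that are assumptions for illustration:

import torch

# Assumed for illustration: a batch of greedy actions over a discrete
# action space with 6 actions, and a fixed exploration rate
greedy_actions = torch.tensor([2, 0, 5, 1, 3])
num_actions = 6
epsilon = 0.1

# One random candidate action per position in the batch
random_samples = torch.randint_like(greedy_actions, num_actions)

# Uniform [0, 1) draw per position decides whether to explore
selector = torch.rand_like(random_samples, dtype=torch.float32)

# Keep the greedy action where selector > epsilon, otherwise take the random one
noisy_actions = torch.where(selector > epsilon, greedy_actions, random_samples)
print(noisy_actions)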
Example #2
    def __init__(self, epsilon: typing.Union[Schedule, float], environment):
        super().__init__()

        if isinstance(epsilon, Schedule):
            self.epsilon_schedule = epsilon
        else:
            self.epsilon_schedule = ConstantSchedule(epsilon)

        self.action_space = environment.action_space
Example #3
    def __init__(self, entropy_coefficient, value_coefficient, cliprange,
                 max_grad_norm):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange
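
The constructor above accepts either a plain number or a Schedule object for cliprange. As a rough, assumed sketch of what such a schedule interface could look like (these stand-ins are illustrative only; the real Schedule and ConstantSchedule come from the surrounding library and may differ):

class Schedule:
    """ Maps training progress in [0, 1] to a hyperparameter value """
    def value(self, progress: float) -> float:
        raise NotImplementedError


class ConstantSchedule(Schedule):
    """ Returns the same value regardless of training progress """
    def __init__(self, constant: float):
        self.constant = constant

    def value(self, progress: float) -> float:
        return self.constant


class LinearSchedule(Schedule):
    """ Linearly interpolates from initial_value to final_value as progress goes 0 to 1 """
    def __init__(self, initial_value: float, final_value: float):
        self.initial_value = initial_value
        self.final_value = final_value

    def value(self, progress: float) -> float:
        return self.initial_value + (self.final_value - self.initial_value) * progress

With such an interface, cliprange=0.2 satisfies the numbers.Number check and is wrapped into a ConstantSchedule, while cliprange=LinearSchedule(0.2, 0.0) is used as-is and decays the clipping range over training.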
Example #4
    def __init__(self,
                 entropy_coefficient,
                 value_coefficient,
                 cliprange,
                 max_grad_norm,
                 discount_factor: float,
                 normalize_advantage: bool = True,
                 gae_lambda: float = 1.0):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient
        self.normalize_advantage = normalize_advantage

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange

        self.gae_lambda = gae_lambda
        self.discount_factor = discount_factor
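
Assuming this is the constructor of the PpoPolicyGradient class shown in Example #6 below, a possible instantiation might look as follows; the hyperparameter values are illustrative, not recommendations:

algo = PpoPolicyGradient(
    entropy_coefficient=0.01,   # weight of the entropy bonus
    value_coefficient=0.5,      # weight of the value-function loss
    cliprange=0.2,              # a plain number gets wrapped into a ConstantSchedule
    max_grad_norm=0.5,          # passed to OptimizerAlgoBase for gradient clipping
    discount_factor=0.99,
    normalize_advantage=True,
    gae_lambda=0.95
)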
Example #5
import numbers

import torch

# ConstantSchedule, OptimizerAlgoBase, AveragingNamedMetric and
# explained_variance are assumed to be provided by the surrounding library


class PpoPolicyGradient(OptimizerAlgoBase):
    """ Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 """
    def __init__(self, entropy_coefficient, value_coefficient, cliprange,
                 max_grad_norm):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange

    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        observations = rollout['observations']
        returns = rollout['returns']
        advantages = rollout['advantages']
        rollout_values = rollout['values']
        rollout_actions = rollout['actions']
        rollout_logprobs = rollout['logprobs']

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Normalize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-8)

        # PART 0 - model_evaluation
        eval_action_pd_params, eval_value_outputs = model(observations)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(model.entropy(eval_action_pd_params))

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            eval_value_outputs - rollout_values, -current_cliprange,
            current_cliprange)
        value_loss_part1 = (eval_value_outputs - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(
            torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        eval_logprobs = model.logprob(rollout_actions, eval_action_pd_params)
        ratio = torch.exp(eval_logprobs - rollout_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)

        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean(
                (eval_logprobs - rollout_logprobs)**2)
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(
                    dtype=torch.float))

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }

    def metrics(self) -> list:
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("policy_loss"),
            AveragingNamedMetric("value_loss"),
            AveragingNamedMetric("policy_entropy"),
            AveragingNamedMetric("approx_kl_divergence"),
            AveragingNamedMetric("clip_fraction"),
            AveragingNamedMetric("grad_norm"),
            AveragingNamedMetric("advantage_norm"),
            AveragingNamedMetric("explained_variance")
        ]
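
The heart of calculate_gradient above is the clipped surrogate objective from the PPO paper. A self-contained sketch of just the policy-loss part, on made-up log-probabilities and advantages:

import torch

# Made-up values for a batch of 4 sampled actions
rollout_logprobs = torch.tensor([-1.2, -0.7, -2.1, -0.3])
eval_logprobs = torch.tensor([-1.0, -0.9, -2.0, -0.2])
advantages = torch.tensor([0.5, -0.3, 1.2, -0.8])
cliprange = 0.2

# Probability ratio pi_new(a|s) / pi_old(a|s)
ratio = torch.exp(eval_logprobs - rollout_logprobs)

# Negated unclipped and clipped surrogate terms (negated because we minimize)
pg_loss_part1 = -advantages * ratio
pg_loss_part2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)

# Elementwise max of the negated terms equals the min of the un-negated objectives
policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))
print(policy_loss.item())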
Example #6
import numbers

import torch

# ConstantSchedule, OptimizerAlgoBase, AveragingNamedMetric, explained_variance,
# Rollout, Trajectories and discount_bootstrap_gae are assumed to be provided by
# the surrounding library


class PpoPolicyGradient(OptimizerAlgoBase):
    """ Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 """
    def __init__(self,
                 entropy_coefficient,
                 value_coefficient,
                 cliprange,
                 max_grad_norm,
                 discount_factor: float,
                 normalize_advantage: bool = True,
                 gae_lambda: float = 1.0):
        super().__init__(max_grad_norm)

        self.entropy_coefficient = entropy_coefficient
        self.value_coefficient = value_coefficient
        self.normalize_advantage = normalize_advantage

        if isinstance(cliprange, numbers.Number):
            self.cliprange = ConstantSchedule(cliprange)
        else:
            self.cliprange = cliprange

        self.gae_lambda = gae_lambda
        self.discount_factor = discount_factor

    def process_rollout(self, batch_info, rollout: Rollout):
        """ Process rollout for ALGO before any chunking/shuffling  """
        assert isinstance(rollout,
                          Trajectories), "PPO requires trajectory rollouts"

        advantages = discount_bootstrap_gae(
            rewards_buffer=rollout.transition_tensors['rewards'],
            dones_buffer=rollout.transition_tensors['dones'],
            values_buffer=rollout.transition_tensors['values'],
            final_values=rollout.rollout_tensors['final_values'],
            discount_factor=self.discount_factor,
            gae_lambda=self.gae_lambda,
            number_of_steps=rollout.num_steps)

        returns = advantages + rollout.transition_tensors['values']

        rollout.transition_tensors['advantages'] = advantages
        rollout.transition_tensors['returns'] = returns

        return rollout

    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        evaluator = model.evaluate(rollout)

        # Part 0.0 - Rollout values
        advantages = evaluator.get('rollout:advantages')
        rollout_values = evaluator.get('rollout:values')
        rollout_action_logprobs = evaluator.get('rollout:action:logprobs')
        returns = evaluator.get('rollout:returns')

        # PART 0.1 - Model evaluation
        entropy = evaluator.get('model:entropy')
        model_values = evaluator.get('model:values')
        model_action_logprobs = evaluator.get('model:action:logprobs')

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Optionally normalize the advantages
        if self.normalize_advantage:
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-8)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(entropy)

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            model_values - rollout_values, -current_cliprange,
            current_cliprange)
        value_loss_part1 = (model_values - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(
            torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        ratio = torch.exp(model_action_logprobs - rollout_action_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)

        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean(
                (model_action_logprobs - rollout_action_logprobs).pow(2))
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(
                    dtype=torch.float))

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }

    def metrics(self) -> list:
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("policy_loss"),
            AveragingNamedMetric("value_loss"),
            AveragingNamedMetric("policy_entropy"),
            AveragingNamedMetric("approx_kl_divergence"),
            AveragingNamedMetric("clip_fraction"),
            AveragingNamedMetric("grad_norm"),
            AveragingNamedMetric("advantage_norm"),
            AveragingNamedMetric("explained_variance")
        ]
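
process_rollout in the example above delegates advantage computation to the library's discount_bootstrap_gae helper. As a rough sketch of what generalized advantage estimation computes (this reimplementation is an assumption for illustration; the library's helper may differ in signature and in how it handles episode boundaries):

import torch


def gae_sketch(rewards, dones, values, final_values, discount_factor, gae_lambda):
    """ Backward recursion for generalized advantage estimation.

    rewards, dones and values have shape (num_steps, num_envs), with dones
    being a 0/1 float tensor; final_values has shape (num_envs,) and
    bootstraps the value of the state after the last step.
    """
    num_steps = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    running_gae = torch.zeros_like(final_values)

    for t in reversed(range(num_steps)):
        next_values = final_values if t == num_steps - 1 else values[t + 1]
        next_nonterminal = 1.0 - dones[t]

        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + discount_factor * next_values * next_nonterminal - values[t]

        # A_t = delta_t + gamma * lambda * A_{t+1}, reset at episode boundaries
        running_gae = delta + discount_factor * gae_lambda * next_nonterminal * running_gae
        advantages[t] = running_gae

    return advantages

Adding these advantages back onto the value estimates recovers the returns, which matches the line returns = advantages + rollout.transition_tensors['values'] above.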