Example #1
    def __init__(
        self,
        action_dist: ActionDistribution,
        actions: TensorType,
        advantages: TensorType,
        v_target: TensorType,
        vf: TensorType,
        valid_mask: TensorType,
        vf_loss_coeff: float = 0.5,
        entropy_coeff: float = 0.01,
        use_critic: bool = True,
    ):
        log_prob = action_dist.logp(actions)

        # The "policy gradients" loss
        self.pi_loss = -tf.reduce_sum(
            tf.boolean_mask(log_prob * advantages, valid_mask))

        delta = tf.boolean_mask(vf - v_target, valid_mask)

        # Compute a value function loss.
        if use_critic:
            self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
        # Ignore the value function.
        else:
            self.vf_loss = tf.constant(0.0)

        self.entropy = tf.reduce_sum(
            tf.boolean_mask(action_dist.entropy(), valid_mask))

        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)
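
The loss above combines a policy-gradient term, an optional value-function (critic) term, and an entropy bonus. Below is a minimal standalone sketch of the same combination on dummy tensors; all names, shapes, and values are illustrative assumptions, not part of the example.

import tensorflow as tf

# Dummy batch of 4 timesteps, all valid (illustrative values only).
log_prob = tf.constant([-0.2, -1.3, -0.7, -0.9])
advantages = tf.constant([1.0, -0.5, 0.3, 0.8])
vf = tf.constant([0.9, 0.1, 0.4, 0.6])
v_target = tf.constant([1.0, 0.0, 0.5, 0.7])
entropy = tf.constant([1.1, 1.0, 0.9, 1.2])
valid_mask = tf.constant([True, True, True, True])
vf_loss_coeff, entropy_coeff = 0.5, 0.01

# Policy-gradient term, critic (MSE) term, and entropy bonus, as above.
pi_loss = -tf.reduce_sum(tf.boolean_mask(log_prob * advantages, valid_mask))
delta = tf.boolean_mask(vf - v_target, valid_mask)
vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
entropy_bonus = tf.reduce_sum(tf.boolean_mask(entropy, valid_mask))
total_loss = pi_loss + vf_loss_coeff * vf_loss - entropy_coeff * entropy_bonus
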
Example #2
    def _get_torch_exploration_action(self, action_dist: ActionDistribution,
                                      timestep: Union[TensorType, int],
                                      explore: Union[TensorType, bool]):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = timestep if timestep is not None else \
            self.last_timestep + 1

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep < self.random_timesteps:
                action, logp = \
                    self.random_exploration.get_torch_exploration_action(
                        action_dist, explore=True)
            # Take a sample from our distribution.
            else:
                action = action_dist.sample()
                logp = action_dist.sampled_action_logp()

        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()
            logp = torch.zeros_like(action_dist.sampled_action_logp())

        return action, logp
Example #3
    def get_exploration_action(
        self,
        action_distribution: ActionDistribution,
        timestep: Union[int, TensorType],
        explore: bool = True,
    ):
        assert (self.framework == "torch"
                ), "ERROR: SlateSoftQ only supports torch so far!"

        cls = type(action_distribution)

        # Re-create the action distribution with the correct temperature
        # applied.
        action_distribution = cls(action_distribution.inputs,
                                  self.model,
                                  temperature=self.temperature)
        batch_size = action_distribution.inputs.size()[0]
        action_logp = torch.zeros(batch_size, dtype=torch.float)

        self.last_timestep = timestep

        # Explore.
        if explore:
            # Return stochastic sample over (q-value) logits.
            action = action_distribution.sample()
        # Return the deterministic "sample" (argmax) over (q-value) logits.
        else:
            action = action_distribution.deterministic_sample()

        return action, action_logp
Example #4
 def __init__(self, inputs, model, input_lens):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     self.cats = [
         Categorical(input_, model)
         for input_ in tf.split(inputs, input_lens, axis=1)
     ]
     self.sample_op = self._build_sample_op()
Example #5
 def __init__(self, inputs: List[TensorType], model: ModelV2,
              input_lens: Union[List[int], np.ndarray, Tuple[int, ...]]):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     self.cats = [
         Categorical(input_, model)
         for input_ in tf.split(inputs, input_lens, axis=1)
     ]
     self.sample_op = self._build_sample_op()
     self.sampled_action_logp_op = self.logp(self.sample_op)
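
Examples #4 and #5 split one flat logit tensor into per-sub-action blocks and build a Categorical per block. A minimal sketch of that split-and-sample pattern with plain TensorFlow follows; the logit shape and input_lens values are assumptions.

import tensorflow as tf

# Batch of 2; 5 logits = 2 logits for sub-action 0 + 3 logits for sub-action 1.
logits = tf.random.normal([2, 5])
input_lens = [2, 3]

# Split the flat logits the same way the constructors above do.
splits = tf.split(logits, input_lens, axis=1)
# Sample one action per sub-distribution.
samples = [tf.random.categorical(s, 1)[:, 0] for s in splits]
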
Example #6
 def __init__(self, inputs, model):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     input_lens = inputs.get_shape().as_list()[1]
     self.cats = [
         Categorical(input_, model)
         for input_ in tf.unstack(inputs, input_lens, axis=1)
     ]
     self.sample_op = self._build_sample_op()
     self.sampled_action_logp_op = self.logp(self.sample_op)
Example #7
 def __init__(self, inputs, model):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     input_lens = inputs.get_shape().as_list()[1]
     self.gausses = [
         SquashedGaussian(tf.squeeze(input_, axis=1), model)
         for input_ in tf.split(inputs, input_lens, axis=1)
     ]
     self.sample_op = self._build_sample_op()
     self.sampled_action_logp_op = self.logp(self.sample_op)
Example #8
 def __init__(self, inputs, model, action_space, child_distributions,
              input_lens):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     self.input_lens = input_lens
     split_inputs = tf.split(inputs, self.input_lens, axis=1)
     child_list = []
     for i, distribution in enumerate(child_distributions):
         child_list.append(distribution(split_inputs[i], model))
     self.child_distributions = child_list
Example #9
    def __init__(self, inputs, model, *, child_distributions, input_lens, action_space):
        ActionDistribution.__init__(self, inputs, model)

        self.action_space_struct = get_base_struct_from_space(action_space)

        self.input_lens = np.array(input_lens, dtype=np.int32)
        split_inputs = tf.split(inputs, self.input_lens, axis=1)
        self.flat_child_distributions = tree.map_structure(
            lambda dist, input_: dist(input_, model), child_distributions, split_inputs
        )
Example #10
    def _get_torch_exploration_action(self, action_dist: ActionDistribution,
                                      explore: bool,
                                      timestep: Union[int, TensorType]):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = timestep if timestep is not None else \
            self.last_timestep + 1

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep < self.random_timesteps:
                action, _ = \
                    self.random_exploration.get_torch_exploration_action(
                        action_dist, explore=True)
            # Apply base-scaled and time-annealed scaled OU-noise to
            # deterministic actions.
            else:
                det_actions = action_dist.deterministic_sample()
                scale = self.scale_schedule(self.last_timestep)
                gaussian_sample = scale * torch.normal(
                    mean=torch.zeros(self.ou_state.size()), std=1.0) \
                    .to(self.device)
                ou_new = self.ou_theta * -self.ou_state + \
                    self.ou_sigma * gaussian_sample
                self.ou_state += ou_new
                high_m_low = torch.from_numpy(
                    self.action_space.high - self.action_space.low). \
                    to(self.device)
                high_m_low = torch.where(
                    torch.isinf(high_m_low),
                    torch.ones_like(high_m_low).to(self.device), high_m_low)
                noise = scale * self.ou_base_scale * self.ou_state * high_m_low

                action = torch.min(
                    torch.max(
                        det_actions + noise,
                        torch.tensor(self.action_space.low,
                                     dtype=torch.float32,
                                     device=self.device)),
                    torch.tensor(self.action_space.high,
                                 dtype=torch.float32,
                                 device=self.device))

        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()

        # Logp=always zero.
        logp = torch.zeros((action.size()[0], ),
                           dtype=torch.float32,
                           device=self.device)

        return action, logp
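
A minimal sketch of the Ornstein-Uhlenbeck noise update used above, on dummy tensors; the coefficient values, action bounds, and state shape are assumptions.

import torch

ou_theta, ou_sigma, ou_base_scale, scale = 0.15, 0.2, 0.1, 1.0
ou_state = torch.zeros(3)
det_actions = torch.tensor([0.1, -0.4, 0.7])
low = torch.tensor([-1.0, -1.0, -1.0])
high = torch.tensor([1.0, 1.0, 1.0])

# One OU step: mean-reverting drift plus scaled Gaussian noise.
gaussian_sample = scale * torch.normal(mean=torch.zeros_like(ou_state), std=1.0)
ou_state += ou_theta * -ou_state + ou_sigma * gaussian_sample
noise = scale * ou_base_scale * ou_state * (high - low)
# Clip the noisy action back into the action-space bounds.
action = torch.min(torch.max(det_actions + noise, low), high)
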
Example #11
 def get_exploration_action(
     self,
     *,
     action_distribution: ActionDistribution,
     timestep: int,
     explore: bool = True,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
     if explore:
         if timestep < self._pure_exploration_steps:
             return super().get_exploration_action(
                 action_distribution=action_distribution,
                 timestep=timestep,
                 explore=explore,
             )
         return action_distribution.sample(), None
     return action_distribution.deterministic_sample(), None
Example #12
    def _get_torch_exploration_action(self,
                                      action_distribution: ActionDistribution,
                                      explore: bool,
                                      timestep: Union[int, TensorType]):
        """Torch method to produce an epsilon exploration action.

        Args:
            action_distribution (ActionDistribution): The instantiated
                ActionDistribution object to work with when creating
                exploration actions.

        Returns:
            torch.Tensor: The exploration-action.
        """
        q_values = action_distribution.inputs
        self.last_timestep = timestep
        exploit_action = action_distribution.deterministic_sample()
        batch_size = q_values.size()[0]
        action_logp = torch.zeros(batch_size, dtype=torch.float)

        # Explore.
        if explore:
            # Get the current epsilon.
            epsilon = self.epsilon_schedule(self.last_timestep)
            if isinstance(action_distribution, TorchMultiActionDistribution):
                exploit_action = tree.flatten(exploit_action)
                for i in range(batch_size):
                    if random.random() < epsilon:
                        # TODO: (bcahlit) Mask out actions
                        random_action = tree.flatten(
                            self.action_space.sample())
                        for j in range(len(exploit_action)):
                            exploit_action[j][i] = torch.tensor(
                                random_action[j])
                exploit_action = tree.unflatten_as(
                    action_distribution.action_space_struct, exploit_action)

                return exploit_action, action_logp

            else:
                # Mask out actions whose Q-values are -inf, so that we don't
                # even consider them for exploration.
                random_valid_action_logits = torch.where(
                    q_values <= FLOAT_MIN,
                    torch.ones_like(q_values) * 0.0, torch.ones_like(q_values))
                # A random action.
                random_actions = torch.squeeze(torch.multinomial(
                    random_valid_action_logits, 1),
                                               axis=1)

                # Pick either random or greedy.
                action = torch.where(
                    torch.empty(
                        (batch_size, )).uniform_().to(self.device) < epsilon,
                    random_actions, exploit_action)

                return action, action_logp
        # Return the deterministic "sample" (argmax) over the logits.
        else:
            return exploit_action, action_logp
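
A minimal sketch of the epsilon-greedy choice made in the non-multi-action branch above, on dummy Q-values; the shapes, values, and epsilon are assumptions.

import torch

q_values = torch.tensor([[1.0, 0.5, -0.2], [0.1, 2.0, 0.3]])
epsilon = 0.1

greedy_actions = torch.argmax(q_values, dim=1)
random_actions = torch.randint(0, q_values.shape[1], (q_values.shape[0],))
# Per batch element, pick a random action with probability epsilon.
pick_random = torch.rand(q_values.shape[0]) < epsilon
action = torch.where(pick_random, random_actions, greedy_actions)
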
Example #13
    def _get_tf_exploration_action_op(self, action_dist: ActionDistribution,
                                      explore: Union[bool, TensorType],
                                      timestep: Union[int, TensorType]):
        ts = timestep if timestep is not None else self.last_timestep
        scale = self.scale_schedule(ts)

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Apply base-scaled and time-annealed scaled OU-noise to
        # deterministic actions.
        gaussian_sample = tf.random.normal(shape=[self.action_space.low.size],
                                           stddev=self.stddev)
        ou_new = self.ou_theta * -self.ou_state + \
            self.ou_sigma * gaussian_sample
        if self.framework in ["tf2", "tfe"]:
            self.ou_state.assign_add(ou_new)
            ou_state_new = self.ou_state
        else:
            ou_state_new = tf1.assign_add(self.ou_state, ou_new)
        high_m_low = self.action_space.high - self.action_space.low
        high_m_low = tf.where(tf.math.is_inf(high_m_low),
                              tf.ones_like(high_m_low), high_m_low)
        noise = scale * self.ou_base_scale * ou_state_new * high_m_low
        stochastic_actions = tf.clip_by_value(
            deterministic_actions + noise,
            self.action_space.low * tf.ones_like(deterministic_actions),
            self.action_space.high * tf.ones_like(deterministic_actions))

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = \
            self.random_exploration.get_tf_exploration_action_op(
                action_dist, explore)
        exploration_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda: random_actions,
            false_fn=lambda: stochastic_actions)

        # Choose by `explore` (main exploration switch).
        action = tf.cond(pred=tf.constant(explore, dtype=tf.bool)
                         if isinstance(explore, bool) else explore,
                         true_fn=lambda: exploration_actions,
                         false_fn=lambda: deterministic_actions)
        # Logp=always zero.
        batch_size = tf.shape(deterministic_actions)[0]
        logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(timestep)
            return action, logp
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op, ou_state_new]):
                return action, logp
Example #14
 def extra_action_out_fn(
         policy: Policy, input_dict, state_batches, model,
         action_dist: ActionDistribution) -> Dict[str, TensorType]:
     action = action_dist.deterministic_sample()
     action_probs = torch.zeros_like(policy.q_values).long()
     action_probs[0][action[0]] = 1.0
     return {"q_values": policy.q_values, "action_probs": action_probs}
Example #15
 def kl(self, q: ActionDistribution) -> torch.Tensor:
     """ KL(self || q) estimated with monte carlo sampling
     """
     rsamples = self.__rsamples().unbind(0)
     log_ratios = torch.stack(
         [self.logp(rsample) - q.logp(rsample) for rsample in rsamples])
     assert not torch.isnan(log_ratios).any(), "output nan aborting"
     return log_ratios.mean(0)
Example #16
 def __init__(self,
              inputs: List[TensorType],
              model: ModelV2,
              input_lens: Union[List[int], np.ndarray, Tuple[int, ...]],
              action_space=None):
     # skip TFActionDistribution init
     ActionDistribution.__init__(self, inputs, model)
     self.cats = [
         Categorical(input_, model)
         for input_ in tf.split(inputs, input_lens, axis=1)
     ]
     self.action_space = action_space
     if self.action_space is None:
         self.action_space = gym.spaces.MultiDiscrete(
             [c.inputs.shape[1] for c in self.cats])
     self.sample_op = self._build_sample_op()
     self.sampled_action_logp_op = self.logp(self.sample_op)
Example #17
    def _get_torch_exploration_action(
        self,
        action_dist: ActionDistribution,
        explore: bool,
        timestep: Union[int, TensorType],
    ):
        # Set last timestep or (if not given) increase by one.
        self.last_timestep = (timestep if timestep is not None else
                              self.last_timestep + 1)

        # Apply exploration.
        if explore:
            # Random exploration phase.
            if self.last_timestep < self.random_timesteps:
                action, _ = self.random_exploration.get_torch_exploration_action(
                    action_dist, explore=True)
            # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
            else:
                det_actions = action_dist.deterministic_sample()
                scale = self.scale_schedule(self.last_timestep)
                gaussian_sample = scale * torch.normal(
                    mean=torch.zeros(det_actions.size()), std=self.stddev).to(
                        self.device)
                action = torch.min(
                    torch.max(
                        det_actions + gaussian_sample,
                        torch.tensor(
                            self.action_space.low,
                            dtype=torch.float32,
                            device=self.device,
                        ),
                    ),
                    torch.tensor(self.action_space.high,
                                 dtype=torch.float32,
                                 device=self.device),
                )
        # No exploration -> Return deterministic actions.
        else:
            action = action_dist.deterministic_sample()

        # Logp=always zero.
        logp = torch.zeros((action.size()[0], ),
                           dtype=torch.float32,
                           device=self.device)

        return action, logp
Example #18
 def kl(self, q: ActionDistribution) -> torch.Tensor:
     """ KL(self || q) estimated with monte carlo sampling
     """
     rsamples, logps = self.__rsamples_logps()
     logp_rsamples = zip(logps.unbind(0), rsamples.unbind(0))
     log_ratios = torch.stack(
         [logp - q.logp(rsample) for (logp, rsample) in logp_rsamples])
     assert not torch.isnan(log_ratios).any(), "output nan aborting"
     return log_ratios.mean(0)
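
Examples #15 and #18 estimate KL(self || q) by averaging log-probability ratios over reparameterized samples. The same estimator with stock torch.distributions objects is sketched below; the Normal distributions and sample count are assumptions.

import torch
from torch.distributions import Normal

p = Normal(torch.zeros(3), torch.ones(3))
q = Normal(torch.ones(3) * 0.5, torch.ones(3))

# Reparameterized samples from p; sum per-dimension logps to per-sample logps.
rsamples = p.rsample((32,))
log_ratios = (p.log_prob(rsamples) - q.log_prob(rsamples)).sum(-1)
kl_estimate = log_ratios.mean(0)
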
Example #19
    def _get_tf_exploration_action_op(
        self,
        action_distribution: ActionDistribution,
        explore: Union[bool, TensorType],
        timestep: Union[int, TensorType],
    ) -> "tf.Tensor":

        per_slate_q_values = action_distribution.inputs
        all_slates = self.model.slates

        exploit_indices = action_distribution.deterministic_sample()
        exploit_action = tf.gather(all_slates, exploit_indices)

        batch_size = tf.shape(per_slate_q_values)[0]
        action_logp = tf.zeros(batch_size, dtype=tf.float32)

        # Get the current epsilon.
        epsilon = self.epsilon_schedule(
            timestep if timestep is not None else self.last_timestep
        )
        # Mask out actions whose Q-values are -inf, so that we don't
        # even consider them for exploration.
        random_valid_action_logits = tf.where(
            tf.equal(per_slate_q_values, tf.float32.min),
            tf.ones_like(per_slate_q_values) * tf.float32.min,
            tf.ones_like(per_slate_q_values),
        )
        # A random action.
        random_indices = tf.squeeze(
            tf.random.categorical(random_valid_action_logits, 1), axis=1
        )
        random_actions = tf.gather(all_slates, random_indices)

        choose_random = (
            tf.random.uniform(
                tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32
            )
            < epsilon
        )

        # Pick either random or greedy.
        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool)
            if isinstance(explore, bool)
            else explore,
            true_fn=(lambda: tf.where(choose_random, random_actions, exploit_action)),
            false_fn=lambda: exploit_action,
        )

        if self.framework in ["tf2", "tfe"] and not self.policy_config["eager_tracing"]:
            self.last_timestep = timestep
            return action, action_logp
        else:
            assign_op = tf1.assign(self.last_timestep, tf.cast(timestep, tf.int64))
            with tf1.control_dependencies([assign_op]):
                return action, action_logp
Example #20
 def get_torch_exploration_action(self, action_dist: ActionDistribution,
                                  explore: bool):
     if explore:
         req = force_tuple(
             action_dist.required_model_output_shape(
                 self.action_space, self.model.model_config))
         # Add a batch dimension?
         if len(action_dist.inputs.shape) == len(req) + 1:
             batch_size = action_dist.inputs.shape[0]
             a = np.stack(
                 [self.action_space.sample() for _ in range(batch_size)])
         else:
             a = self.action_space.sample()
         # Convert action to torch tensor.
         action = torch.from_numpy(a).to(self.device)
     else:
         action = action_dist.deterministic_sample()
     logp = torch.zeros(
         (action.size()[0], ), dtype=torch.float32, device=self.device)
     return action, logp
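
A minimal sketch of the batched random-action sampling used above; the Box space and batch size are assumptions.

import gym
import numpy as np
import torch

action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
batch_size = 4

# One environment sample per batch row, stacked and converted to a tensor.
a = np.stack([action_space.sample() for _ in range(batch_size)])
action = torch.from_numpy(a)
logp = torch.zeros((action.size()[0],), dtype=torch.float32)
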
Example #21
    def __init__(self, policy: Policy, value_estimates: TensorType,
                 action_dist: ActionDistribution, train_batch: SampleBatch,
                 vf_loss_coeff: float, beta: float):

        # L = - A * log\pi_\theta(a|s)
        logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

        if beta != 0.0:
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            # Advantage Estimation.
            adv = cumulative_rewards - value_estimates
            adv_squared = tf.reduce_mean(tf.math.square(adv))
            # Value function's loss term (MSE).
            self.v_loss = 0.5 * adv_squared

            # Perform moving averaging of advantage^2.
            rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

            # Update averaged advantage norm.
            # Eager.
            if policy.config["framework"] in ["tf2", "tfe"]:
                update_term = adv_squared - policy._moving_average_sqd_adv_norm
                policy._moving_average_sqd_adv_norm.assign_add(rate *
                                                               update_term)

                # Exponentially weighted advantages.
                c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
                exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
            # Static graph.
            else:
                update_adv_norm = tf1.assign_add(
                    ref=policy._moving_average_sqd_adv_norm,
                    value=rate *
                    (adv_squared - policy._moving_average_sqd_adv_norm))

                # Exponentially weighted advantages.
                with tf1.control_dependencies([update_adv_norm]):
                    exp_advs = tf.math.exp(beta * tf.math.divide(
                        adv, 1e-8 +
                        tf.math.sqrt(policy._moving_average_sqd_adv_norm)))
            exp_advs = tf.stop_gradient(exp_advs)

            self.explained_variance = tf.reduce_mean(
                explained_variance(cumulative_rewards, value_estimates))

        else:
            # Value function's loss term (MSE).
            self.v_loss = tf.constant(0.0)
            exp_advs = 1.0

        self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

        self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
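
A minimal sketch of the exponentially weighted advantage term above, with a constant standing in for the tracked moving average of squared advantages; beta and all values are assumptions.

import tensorflow as tf

beta = 1.0
cumulative_rewards = tf.constant([1.0, 0.2, -0.3])
value_estimates = tf.constant([0.8, 0.1, 0.0])
logprobs = tf.constant([-0.3, -1.2, -0.7])

adv = cumulative_rewards - value_estimates
# Stand-in for policy._moving_average_sqd_adv_norm.
moving_avg_sqd_adv_norm = tf.reduce_mean(tf.math.square(adv))
exp_advs = tf.stop_gradient(
    tf.math.exp(beta * adv / (1e-8 + tf.math.sqrt(moving_avg_sqd_adv_norm))))
p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)
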
Example #22
    def _get_tf_exploration_action_op(
        self,
        action_distribution: ActionDistribution,
        explore: Union[bool, TensorType],
        timestep: Union[int, TensorType],
    ) -> "tf.Tensor":

        per_slate_q_values = action_distribution.inputs
        all_slates = action_distribution.all_slates

        exploit_action = action_distribution.deterministic_sample()

        batch_size, num_slates = (
            tf.shape(per_slate_q_values)[0],
            tf.shape(per_slate_q_values)[1],
        )
        action_logp = tf.zeros(batch_size, dtype=tf.float32)

        # Get the current epsilon.
        epsilon = self.epsilon_schedule(
            timestep if timestep is not None else self.last_timestep)
        # A random action.
        random_indices = tf.random.uniform(
            (batch_size, ),
            minval=0,
            maxval=num_slates,
            dtype=tf.dtypes.int32,
        )
        random_actions = tf.gather(all_slates, random_indices)

        choose_random = (tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) <
                         epsilon)

        # Pick either random or greedy.
        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool) if isinstance(
                explore, bool) else explore,
            true_fn=(lambda: tf.where(choose_random, random_actions,
                                      exploit_action)),
            false_fn=lambda: exploit_action,
        )

        if self.framework in ["tf2", "tfe"
                              ] and not self.policy_config["eager_tracing"]:
            self.last_timestep = timestep
            return action, action_logp
        else:
            assign_op = tf1.assign(self.last_timestep,
                                   tf.cast(timestep, tf.int64))
            with tf1.control_dependencies([assign_op]):
                return action, action_logp
Example #23
    def _get_tf_exploration_action_op(
        self,
        action_dist: ActionDistribution,
        explore: bool,
        timestep: Union[int, TensorType],
    ):
        ts = timestep if timestep is not None else self.last_timestep

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
        gaussian_sample = self.scale_schedule(ts) * tf.random.normal(
            tf.shape(deterministic_actions), stddev=self.stddev)

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = self.random_exploration.get_tf_exploration_action_op(
            action_dist, explore)
        stochastic_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda: random_actions,
            false_fn=lambda: tf.clip_by_value(
                deterministic_actions + gaussian_sample,
                self.action_space.low * tf.ones_like(deterministic_actions),
                self.action_space.high * tf.ones_like(deterministic_actions),
            ),
        )

        # Choose by `explore` (main exploration switch).
        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool) if isinstance(
                explore, bool) else explore,
            true_fn=lambda: stochastic_actions,
            false_fn=lambda: deterministic_actions,
        )
        # Logp=always zero.
        logp = zero_logps_from_actions(deterministic_actions)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(tf.cast(timestep, tf.int64))
            return action, logp
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op]):
                return action, logp
Example #24
    def _get_torch_exploration_action(
        self,
        action_distribution: ActionDistribution,
        explore: bool,
        timestep: Union[int, TensorType],
    ) -> "torch.Tensor":

        per_slate_q_values = action_distribution.inputs
        all_slates = self.model.slates

        exploit_indices = action_distribution.deterministic_sample()
        exploit_action = all_slates[exploit_indices]

        batch_size = per_slate_q_values.size()[0]
        action_logp = torch.zeros(batch_size, dtype=torch.float)

        self.last_timestep = timestep

        # Explore.
        if explore:
            # Get the current epsilon.
            epsilon = self.epsilon_schedule(self.last_timestep)
            # Mask out actions whose Q-values are -inf, so that we don't
            # even consider them for exploration.
            random_valid_action_logits = torch.where(
                per_slate_q_values <= FLOAT_MIN,
                torch.ones_like(per_slate_q_values) * 0.0,
                torch.ones_like(per_slate_q_values),
            )
            # A random action.
            random_indices = torch.squeeze(torch.multinomial(
                random_valid_action_logits, 1),
                                           axis=1)
            random_actions = all_slates[random_indices]

            # Pick either random or greedy.
            action = torch.where(
                torch.empty(
                    (batch_size, )).uniform_().to(self.device) < epsilon,
                random_actions,
                exploit_action,
            )

            return action, action_logp
        # Return the deterministic "sample" (argmax) over the logits.
        else:
            return exploit_action, action_logp
Example #25
    def _get_torch_exploration_action(
        self,
        action_distribution: ActionDistribution,
        explore: bool,
        timestep: Union[int, TensorType],
    ) -> "torch.Tensor":

        per_slate_q_values = action_distribution.inputs
        all_slates = self.model.slates

        exploit_indices = action_distribution.deterministic_sample()
        exploit_action = all_slates[exploit_indices]

        batch_size = per_slate_q_values.size()[0]
        action_logp = torch.zeros(batch_size, dtype=torch.float)

        self.last_timestep = timestep

        # Explore.
        if explore:
            # Get the current epsilon.
            epsilon = self.epsilon_schedule(self.last_timestep)
            # A random action.
            random_indices = torch.randint(0, per_slate_q_values.shape[1],
                                           (per_slate_q_values.shape[0], ))
            random_actions = all_slates[random_indices]

            # Pick either random or greedy.
            action = torch.where(
                torch.empty((batch_size, )).uniform_() < epsilon,
                random_actions,
                exploit_action,
            )
            return action, action_logp
        # Return the deterministic "sample" (argmax) over the logits.
        else:
            return exploit_action, action_logp
Example #26
    def __init__(
        self,
        policy: Policy,
        value_estimates: TensorType,
        action_dist: ActionDistribution,
        train_batch: SampleBatch,
        vf_loss_coeff: float,
        beta: float,
    ):
        # L = - A * log\pi_\theta(a|s)
        logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
        if beta != 0.0:
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            # Advantage Estimation.
            adv = cumulative_rewards - value_estimates
            adv_squared = tf.reduce_mean(tf.math.square(adv))
            # Value function's loss term (MSE).
            self.v_loss = 0.5 * adv_squared

            # Perform moving averaging of advantage^2.
            rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

            # Update averaged advantage norm.
            # Eager.
            if policy.config["framework"] in ["tf2", "tfe"]:
                update_term = adv_squared - policy._moving_average_sqd_adv_norm
                policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

                # Exponentially weighted advantages.
                c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
                exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
            # Static graph.
            else:
                update_adv_norm = tf1.assign_add(
                    ref=policy._moving_average_sqd_adv_norm,
                    value=rate * (adv_squared - policy._moving_average_sqd_adv_norm),
                )

                # Exponentially weighted advantages.
                with tf1.control_dependencies([update_adv_norm]):
                    exp_advs = tf.math.exp(
                        beta
                        * tf.math.divide(
                            adv,
                            1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm),
                        )
                    )
            exp_advs = tf.stop_gradient(exp_advs)

            self.explained_variance = tf.reduce_mean(
                explained_variance(cumulative_rewards, value_estimates)
            )

        else:
            # Value function's loss term (MSE).
            self.v_loss = tf.constant(0.0)
            exp_advs = 1.0

        # logprob loss alone tends to push action distributions to
        # have very low entropy, resulting in worse performance for
        # unfamiliar situations.
        # A scaled logstd loss term encourages stochasticity, thus
        # alleviating the problem to some extent.
        logstd_coeff = policy.config["bc_logstd_coeff"]
        if logstd_coeff > 0.0:
            logstds = tf.reduce_sum(action_dist.log_std, axis=1)
        else:
            logstds = 0.0

        self.p_loss = -1.0 * tf.reduce_mean(
            exp_advs * (logprobs + logstd_coeff * logstds)
        )

        self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss