Example #1
    def action(self, context):
        """Samples rewards from posterior, and chooses best action accordingly."""

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # History of (context, one-hot action) inputs with masked rewards.
        states, rewards, actions = bandit_utils.get_data_with_masked_rewards(
            self.data_h)
        state_action_pairs = tf.concat([states, actions], axis=-1)

        historical_x = tf.to_float(tf.expand_dims(state_action_pairs, axis=0))
        historical_y = tf.to_float(rewards.reshape(1, -1, 1))

        context = tf.to_float(context)
        # Tile the context once per arm and append the one-hot action encoding,
        # so every action is scored for the same context in a single pass.
        tiled_context = tf.concat(
            [tf.tile(tf.reshape(context, [1, -1]),
                     [self.hparams.num_actions, 1]),
             self._one_hot_vectors],
            axis=-1)
        target_x = tf.expand_dims(tiled_context, axis=0)
        target_y = None

        # Predictive distribution over rewards for each candidate action.
        prediction = self.snp(historical_x, historical_y, target_x, target_y)
        vals = tf.squeeze(prediction.distribution.mean())

        return tf.argmax(vals).numpy()
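
The core move in this example is how target_x is built: the current context is tiled once per arm and concatenated with a one-hot action encoding, so the model returns a predicted reward for every arm in one forward pass. Below is a minimal, self-contained sketch of just that step (TF 2.x eager mode; num_actions, context_dim, and the stand-in one_hot_vectors are illustrative values, not taken from the original repo):

import tensorflow as tf

num_actions = 5
context_dim = 3

context = tf.random.normal([context_dim])            # one d-dimensional context
one_hot_vectors = tf.eye(num_actions)                # stands in for self._one_hot_vectors

# Tile the context once per arm, then append that arm's one-hot encoding.
tiled_context = tf.concat(
    [tf.tile(tf.reshape(context, [1, -1]), [num_actions, 1]),
     one_hot_vectors],
    axis=-1)                                          # [num_actions, context_dim + num_actions]
target_x = tf.expand_dims(tiled_context, axis=0)      # [1, num_actions, context_dim + num_actions]
print(target_x.shape)                                 # (1, 5, 8)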
Example #2
    def action(self, context):
        """Samples rewards from posterior, and chooses best action accordingly.

    Args:
      context: A d-dimensional np.ndarray with the context.

    Returns:
      Greedy action based on Thompson sampling.
    """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        context = tf.to_float(context)
        if self._is_anp:
            # History inputs are the raw contexts; targets are the per-arm
            # rewards weighted by the action matrix. The single query context
            # is the target input.
            contexts, rewards, actions = self.data_h.get_data_with_weights()
            historical_x = tf.to_float(tf.expand_dims(contexts, axis=0))
            historical_y = tf.to_float(
                tf.expand_dims(rewards * actions, axis=0))
            target_x = tf.expand_dims(tf.reshape(context, [1, -1]), axis=0)
        else:
            # History inputs are (context, one-hot action) pairs; targets are
            # the masked scalar rewards.
            contexts, rewards, actions = utils.get_data_with_masked_rewards(
                self.data_h)
            context_action_pairs = tf.concat([contexts, actions], axis=-1)

            historical_x = tf.to_float(
                tf.expand_dims(context_action_pairs, axis=0))
            historical_y = tf.to_float(rewards.reshape(1, -1, 1))
            # Tile the query context once per arm and append the one-hot
            # action encoding so every arm is scored in a single pass.
            tiled_context = tf.concat(
                [tf.tile(tf.reshape(context, [1, -1]),
                         [self.hparams.num_actions, 1]),
                 self._one_hot_vectors],
                axis=-1)
            target_x = tf.expand_dims(tiled_context, axis=0)
        target_y = None

        # Predictive distribution over rewards for each candidate action.
        predictions = self.snp(historical_x, historical_y, target_x, target_y)
        vals = tf.squeeze(predictions.distribution.mean())

        return tf.argmax(vals).numpy()
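
The two branches above package the reward history differently. The sketch below contrasts them with toy data (TF 2.x; the [num_steps, num_actions] shapes are inferred from the expand_dims/reshape calls in the example rather than from the data_h and utils helpers, so treat them as an assumption):

import tensorflow as tf

num_steps = 4
num_actions = 3

# One-hot rows marking which arm was pulled at each step (toy values).
actions = tf.one_hot([0, 2, 1, 2], depth=num_actions)          # [num_steps, num_actions]
rewards = tf.constant([[1.0, 0.3, 0.0],
                       [0.2, 0.9, 0.4],
                       [0.0, 0.5, 0.8],
                       [0.7, 0.1, 0.6]])                        # per-arm rewards (toy values)

# _is_anp branch: one reward column per arm, zeroed for arms not pulled.
historical_y_anp = tf.expand_dims(rewards * actions, axis=0)    # [1, num_steps, num_actions]

# else branch: a single masked scalar reward per step.
masked_rewards = tf.reduce_sum(rewards * actions, axis=-1)      # [num_steps]
historical_y = tf.reshape(masked_rewards, [1, -1, 1])           # [1, num_steps, 1]

print(historical_y_anp.shape, historical_y.shape)               # (1, 4, 3) (1, 4, 1)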