Python PerArmPolicyInfo Exemples, tf_agents.policies.utils.PerArmPolicyInfo Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : neural_linucb_policy.py Projet : tensorflow/agents

    def __init__(self,
                 encoding_network: types.Network,
                 encoding_dim: int,
                 reward_layer: tf.keras.layers.Dense,
                 epsilon_greedy: float,
                 actions_from_reward_layer: types.Bool,
                 cov_matrix: Sequence[types.Float],
                 data_vector: Sequence[types.Float],
                 num_samples: Sequence[types.Int],
                 time_step_spec: types.TimeStep,
                 alpha: float = 1.0,
                 emit_policy_info: Sequence[Text] = (),
                 emit_log_probability: bool = False,
                 accepts_per_arm_features: bool = False,
                 distributed_use_reward_layer: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 name: Optional[Text] = None):
        """Initializes `NeuralLinUCBPolicy`.

        Args:
          encoding_network: network that encodes the observations.
          encoding_dim: (int) dimension of the encoded observations.
          reward_layer: final layer that predicts the expected reward per arm.
            In case the policy accepts per-arm features, the output of this
            layer has to be a scalar. This is because in the per-arm case, all
            encoded observations have to go through the same computation to get
            the reward estimates. The `num_actions` dimension of the encoded
            observation is treated as a batch dimension in the reward layer.
          epsilon_greedy: (float) representing the probability of choosing a
            random action instead of the greedy action.
          actions_from_reward_layer: (boolean variable) whether to get actions
            from the reward layer or from LinUCB.
          cov_matrix: list of the covariance matrices. There exists one
            covariance matrix per arm, unless the policy accepts per-arm
            features, in which case this list must have a single element.
          data_vector: list of the data vectors. A data vector is a weighted
            sum of the observations, where the weight is the corresponding
            reward. Each arm has its own data vector, unless the policy accepts
            per-arm features, in which case this list must have a single
            element.
          num_samples: list of number of samples per arm. If the policy accepts
            per-arm features, this is a single-element list counting the number
            of steps.
          time_step_spec: A `TimeStep` spec of the expected time_steps.
          alpha: (float) non-negative weight multiplying the confidence
            intervals.
          emit_policy_info: (tuple of strings) what side information we want to
            get as part of the policy info. Allowed values can be found in
            `policy_utilities.PolicyInfo`.
          emit_log_probability: (bool) whether to emit log probabilities.
          accepts_per_arm_features: (bool) Whether the policy accepts per-arm
            features.
          distributed_use_reward_layer: (bool) Whether to pick the actions
            using the network or use LinUCB. This applies only in distributed
            training setting and has a similar role to the
            `actions_from_reward_layer` mentioned above.
          observation_and_action_constraint_splitter: A function used for
            masking valid/invalid actions with each state of the environment.
            The function takes in a full observation and returns a tuple
            consisting of 1) the part of the observation intended as input to
            the bandit policy and 2) the mask. The mask should be a 0-1
            `Tensor` of shape `[batch_size, num_actions]`. This function should
            also work with a `TensorSpec` as input, and should output
            `TensorSpec` objects for the observation and mask.
          name: The name of this policy.

        Raises:
          ValueError: If the reward layer does not output a scalar in the
            per-arm case; if `cov_matrix`, `data_vector` or `num_samples` is
            not a list/tuple, is empty, or their lengths disagree; or if the
            leading dimension of the first covariance matrix / data vector
            does not equal `encoding_dim`.
        """
        policy_utilities.check_no_mask_with_arm_features(
            accepts_per_arm_features,
            observation_and_action_constraint_splitter)
        encoding_network.create_variables()
        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_dim = encoding_dim

        if accepts_per_arm_features and reward_layer.units != 1:
            raise ValueError(
                'The output dimension of the reward layer must be 1, got'
                ' {}'.format(reward_layer.units))

        if not isinstance(cov_matrix, (list, tuple)):
            raise ValueError(
                'cov_matrix must be a list of matrices (Tensors).')
        self._cov_matrix = cov_matrix

        if not isinstance(data_vector, (list, tuple)):
            raise ValueError(
                'data_vector must be a list of vectors (Tensors).')
        self._data_vector = data_vector

        if not isinstance(num_samples, (list, tuple)):
            raise ValueError(
                'num_samples must be a list of vectors (Tensors).')
        self._num_samples = num_samples

        self._alpha = alpha
        self._actions_from_reward_layer = actions_from_reward_layer
        self._epsilon_greedy = epsilon_greedy
        self._distributed_use_reward_layer = distributed_use_reward_layer

        if len(cov_matrix) != len(data_vector):
            raise ValueError(
                'The size of list cov_matrix must match the size of '
                'list data_vector. Got {} for cov_matrix and {} '
                'for data_vector'.format(len(cov_matrix), len(data_vector)))
        if len(num_samples) != len(cov_matrix):
            raise ValueError(
                'The size of num_samples must match the size of '
                'list cov_matrix. Got {} for num_samples and {} '
                'for cov_matrix'.format(len(num_samples), len(cov_matrix)))
        # All three lists have the same length at this point, so checking one
        # is enough. Element 0 is indexed below, so an empty list must be
        # rejected with a clear error rather than an IndexError.
        if not cov_matrix:
            raise ValueError(
                'cov_matrix, data_vector and num_samples must each contain '
                'at least one element.')
        self._dtype = data_vector[0].dtype

        self._accepts_per_arm_features = accepts_per_arm_features
        if observation_and_action_constraint_splitter is not None:
            context_spec, _ = observation_and_action_constraint_splitter(
                time_step_spec.observation)
        else:
            context_spec = time_step_spec.observation
        if accepts_per_arm_features:
            # The number of arms is read off the leading dimension of the
            # first per-arm feature spec; a single model is shared by all arms.
            self._num_actions = tf.nest.flatten(context_spec[
                bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0]
            self._num_models = 1
        else:
            # One model (covariance matrix / data vector) per arm.
            self._num_actions = len(cov_matrix)
            self._num_models = self._num_actions
        cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
        if self._encoding_dim != cov_matrix_dim:
            raise ValueError(
                'The dimension of matrix `cov_matrix` must match '
                'encoding dimension {}. '
                'Got {} for `cov_matrix`.'.format(self._encoding_dim,
                                                  cov_matrix_dim))
        data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
        if self._encoding_dim != data_vector_dim:
            raise ValueError(
                'The dimension of vector `data_vector` must match '
                'encoding dimension {}. '
                'Got {} for `data_vector`.'.format(self._encoding_dim,
                                                   data_vector_dim))
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=self._num_actions - 1,
            name='action')

        # Info fields that were not requested stay as empty tuples.
        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
                in emit_policy_info):
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._num_actions], dtype=tf.float32)
        predicted_rewards_optimistic = ()
        if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC
                in emit_policy_info):
            predicted_rewards_optimistic = tensor_spec.TensorSpec(
                [self._num_actions], dtype=tf.float32)
        if accepts_per_arm_features:
            chosen_arm_features_info_spec = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                predicted_rewards_optimistic=predicted_rewards_optimistic,
                chosen_arm_features=chosen_arm_features_info_spec)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                predicted_rewards_optimistic=predicted_rewards_optimistic)

        super(NeuralLinUCBPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             emit_log_probability=emit_log_probability,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             info_spec=info_spec,
                             name=name)

Exemple #2

0

Afficher le fichier

Fichier : boltzmann_reward_prediction_policy.py Projet : tensorflow/agents

    def __init__(self,
                 time_step_spec: types.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 reward_network: types.Network,
                 temperature: types.FloatOrReturningFloat = 1.0,
                 boltzmann_gumbel_exploration_constant: Optional[
                     types.Float] = None,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 accepts_per_arm_features: bool = False,
                 constraints: Tuple[constr.NeuralConstraint, ...] = (),
                 emit_policy_info: Tuple[Text, ...] = (),
                 num_samples_list: Sequence[tf.Variable] = (),
                 name: Optional[Text] = None):
        """Builds a BoltzmannRewardPredictionPolicy given a reward network.

        This policy takes a tf_agents.Network predicting rewards and chooses an
        action with weighted probabilities (i.e., using a softmax over the
        network estimates of value for each action).

        Args:
          time_step_spec: A `TimeStep` spec of the expected time_steps.
          action_spec: A nest of BoundedTensorSpec representing the actions.
          reward_network: An instance of a `tf_agents.network.Network`,
            callable via
            `network(observation, step_type) -> (output, final_state)`.
          temperature: float or callable that returns a float. The temperature
            used in the Boltzmann exploration.
          boltzmann_gumbel_exploration_constant: optional positive float. When
            provided, the policy implements Neural Bandit with Boltzmann-Gumbel
            exploration from the paper:
            N. Cesa-Bianchi et al., "Boltzmann Exploration Done Right",
            NIPS 2017.
          observation_and_action_constraint_splitter: A function used for
            masking valid/invalid actions with each state of the environment.
            The function takes in a full observation and returns a tuple
            consisting of 1) the part of the observation intended as input to
            the network and 2) the mask.  The mask should be a 0-1 `Tensor` of
            shape `[batch_size, num_actions]`. This function should also work
            with a `TensorSpec` as input, and should output `TensorSpec`
            objects for the observation and mask.
          accepts_per_arm_features: (bool) Whether the policy accepts per-arm
            features.
          constraints: iterable of constraints objects that are instances of
            `tf_agents.bandits.agents.NeuralConstraint`.
          emit_policy_info: (tuple of strings) what side information we want to
            get as part of the policy info. Allowed values can be found in
            `policy_utilities.PolicyInfo`.
          num_samples_list: list or tuple of tf.Variable's. Used only in
            Boltzmann-Gumbel exploration. Otherwise, empty.
          name: The name of this policy. All variables in this module will fall
            under that name. Defaults to the class name.

        Raises:
          NotImplementedError: If `action_spec` contains more than one
            `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid; or,
            when Boltzmann-Gumbel exploration is enabled, if the action spec
            has a non-zero minimum or per-arm features are requested.
          ValueError: When Boltzmann-Gumbel exploration is enabled, if the
            exploration constant is not positive or the length of
            `num_samples_list` differs from the number of actions.
        """
        policy_utilities.check_no_mask_with_arm_features(
            accepts_per_arm_features,
            observation_and_action_constraint_splitter)
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        self._temperature = temperature
        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        reward_network.create_variables()
        self._reward_network = reward_network
        self._constraints = constraints

        self._boltzmann_gumbel_exploration_constant = (
            boltzmann_gumbel_exploration_constant)
        self._num_samples_list = num_samples_list
        if self._boltzmann_gumbel_exploration_constant is not None:
            # Each error is raised with one formatted string. Passing several
            # comma-separated fragments would make the exception carry a tuple
            # of args and render as an unreadable tuple repr.
            if self._boltzmann_gumbel_exploration_constant <= 0.0:
                raise ValueError(
                    'The Boltzmann-Gumbel exploration constant is expected '
                    'to be positive. Found: {}'.format(
                        self._boltzmann_gumbel_exploration_constant))
            # The Boltzmann-Gumbel path picks actions with a plain argmax and
            # never shifts them by the offset, so any non-zero minimum
            # (negative included) would yield actions outside the spec.
            if self._action_offset != 0:
                raise NotImplementedError(
                    'Action offset is not supported when '
                    'Boltzmann-Gumbel exploration is enabled.')
            if accepts_per_arm_features:
                raise NotImplementedError(
                    'Boltzmann-Gumbel exploration is not supported '
                    'for arm features case.')
            if len(self._num_samples_list) != self._expected_num_actions:
                raise ValueError(
                    'Size of num_samples_list: {} does not match the '
                    'expected number of actions: {}'.format(
                        len(self._num_samples_list),
                        self._expected_num_actions))

        # Info fields that were not requested stay as empty tuples.
        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if (policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
                in emit_policy_info):
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        if accepts_per_arm_features:
            # The features for the chosen arm is saved to policy_info.
            chosen_arm_features_info = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type,
                chosen_arm_features=chosen_arm_features_info)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type)

        self._accepts_per_arm_features = accepts_per_arm_features

        super(BoltzmannRewardPredictionPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=reward_network.state_spec,
                             clip=False,
                             info_spec=info_spec,
                             emit_log_probability='log_probability'
                             in emit_policy_info,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)

Exemple #3

0

Afficher le fichier

  def testLinearAgentUpdatePerArmFeatures(self,
                                          batch_size,
                                          context_dim,
                                          exploration_policy,
                                          dtype,
                                          use_eigendecomp=False,
                                          set_example_weights=False):
    """Trains a per-arm linear agent once and checks its A and b estimates.

    An experience batch is built from random actions and fixed chosen-arm
    features, the agent is trained on it, and the resulting `cov_matrix` and
    `data_vector` (first entry of each) are compared against values computed
    directly from the concatenated global + chosen-arm observations.
    """
    arm_count = 5
    per_arm_dim = 3
    global_dim = context_dim

    # --- Build the experience the agent will be trained on. ---
    first_step, last_step = _get_initial_and_final_steps_with_per_arm_features(
        batch_size,
        global_dim,
        arm_count,
        per_arm_dim,
        num_actions_feature=True)
    chosen_actions = np.random.randint(
        arm_count, size=batch_size, dtype=np.int32)
    chosen_features = np.arange(
        batch_size * per_arm_dim, dtype=np.float32).reshape(
            [batch_size, per_arm_dim])
    step_info = policy_utilities.PerArmPolicyInfo(
        chosen_arm_features=chosen_features)
    taken_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(chosen_actions), info=step_info)
    experience = _get_experience(first_step, taken_step, last_step)

    # --- Create the agent and run a single training step. ---
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        context_dim, per_arm_dim, arm_count, add_num_actions_feature=True)
    agent = linear_agent.LinearBanditAgent(
        exploration_policy=exploration_policy,
        time_step_spec=time_step.time_step_spec(obs_spec),
        action_spec=tensor_spec.BoundedTensorSpec(
            dtype=tf.int32, shape=(), minimum=0, maximum=arm_count - 1),
        use_eigendecomp=use_eigendecomp,
        accepts_per_arm_features=True,
        dtype=dtype)
    self.evaluate(agent.initialize())
    if set_example_weights:
      weights = tf.linspace(start=1.5, stop=10.5, num=batch_size)
    else:
      weights = None
    self.evaluate(agent.train(experience, weights))
    actual_a = self.evaluate(agent.cov_matrix)
    actual_b = self.evaluate(agent.data_vector)

    # --- Recompute the expected statistics from the raw experience. ---
    global_part = experience.observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
    arm_part = experience.policy_info.chosen_arm_features
    joined = tf.concat([global_part, arm_part], axis=-1)
    flat_observation = tf.squeeze(joined, axis=1)
    flat_reward = tf.squeeze(experience.reward, axis=1)
    observation, rewards = _maybe_weight_observation_and_reward(
        flat_observation, flat_reward, weights)
    want_a = tf.matmul(observation, observation, transpose_a=True)
    want_b = bandit_utils.sum_reward_weighted_observations(
        rewards, observation)
    self.assertAllClose(want_a, actual_a[0])
    self.assertAllClose(want_b, actual_b[0])

Exemple #4

0

Afficher le fichier

Fichier : boltzmann_reward_prediction_policy.py Projet : tensorflow/agents

    def _distribution(self, time_step, policy_state):
        """Chooses actions from the reward network's predictions.

        The reward network is run on the (optionally split) observation and
        its per-action predictions are turned into actions in one of two ways:

        * Boltzmann-Gumbel (when an exploration constant is set): Gumbel noise
          scaled by `constant / sqrt(num_samples)` is added to the (masked)
          predictions and the argmax is taken. The log probability is
          reported as zeros.
        * Plain Boltzmann: predictions are divided by the temperature, masked,
          and an action is sampled from the resulting categorical
          distribution (shifted by the action offset when it is non-zero).

        Args:
          time_step: the time step whose `observation` and `step_type` are fed
            to the reward network.
          policy_state: state passed to, and returned by, the reward network.

        Returns:
          A `PolicyStep` holding a `Deterministic` distribution located at the
          chosen actions, the updated policy state and the policy info.

        Raises:
          ValueError: If the last dimension of the predicted rewards is known
            and differs from the expected number of actions.
        """
        observation = time_step.observation
        if self.observation_and_action_constraint_splitter is not None:
            observation, _ = self.observation_and_action_constraint_splitter(
                observation)

        predictions, policy_state = self._reward_network(
            observation, time_step.step_type, policy_state)

        if isinstance(self._reward_network,
                      heteroscedastic_q_network.HeteroscedasticQNetwork):
            predicted_reward_values = predictions.q_value_logits
        else:
            predicted_reward_values = predictions

        # Take the batch size from the reward tensor itself. For a
        # heteroscedastic network `predictions` is a structured result, not
        # the reward tensor, so its leading dimension is not the batch size.
        batch_size = tf.shape(predicted_reward_values)[0]

        # Rank validation only; the returned shapes are not needed.
        predicted_reward_values.shape.with_rank_at_least(2)
        predicted_reward_values.shape.with_rank_at_most(3)
        num_outputs = predicted_reward_values.shape[-1]
        if (num_outputs is not None
                and num_outputs != self._expected_num_actions):
            # Report the same (last) dimension that was just compared.
            raise ValueError(
                'The number of actions ({}) does not match the reward_network output'
                ' size ({}).'.format(self._expected_num_actions, num_outputs))

        mask = constr.construct_mask_from_multiple_sources(
            time_step.observation,
            self._observation_and_action_constraint_splitter,
            self._constraints, self._expected_num_actions)

        def apply_mask(logits):
            """Overwrites logits of invalid actions with `logits.dtype.min`."""
            if mask is None:
                return logits
            almost_neg_inf = tf.constant(logits.dtype.min,
                                         dtype=logits.dtype)
            return tf.compat.v2.where(tf.cast(mask, tf.bool), logits,
                                      almost_neg_inf)

        if self._boltzmann_gumbel_exploration_constant is not None:
            logits = apply_mask(predicted_reward_values)

            gumbel_dist = tfp.distributions.Gumbel(loc=0., scale=1.)
            gumbel_samples = gumbel_dist.sample(tf.shape(logits))
            num_samples_list_float = tf.stack(
                [
                    tf.cast(x.read_value(), tf.float32)
                    for x in self._num_samples_list
                ],
                axis=-1)
            # divide_no_nan yields 0 where the sample count is 0.
            exploration_weights = tf.math.divide_no_nan(
                self._boltzmann_gumbel_exploration_constant,
                tf.sqrt(num_samples_list_float))
            final_logits = logits + exploration_weights * gumbel_samples
            actions = tf.cast(tf.math.argmax(final_logits, axis=1),
                              self._action_spec.dtype)
            # Log probability is not available in closed form. We treat this
            # as a deterministic policy at the moment.
            log_probability = tf.zeros([batch_size], tf.float32)
        else:
            # Apply the temperature scaling, needed for Boltzmann exploration.
            logits = apply_mask(
                predicted_reward_values / self._get_temperature_value())

            if self._action_offset != 0:
                distribution = shifted_categorical.ShiftedCategorical(
                    logits=logits,
                    dtype=self._action_spec.dtype,
                    shift=self._action_offset)
            else:
                distribution = tfp.distributions.Categorical(
                    logits=logits, dtype=self._action_spec.dtype)

            actions = distribution.sample()
            log_probability = distribution.log_prob(actions)

        bandit_policy_values = tf.fill(
            [batch_size, 1], policy_utilities.BanditPolicyType.BOLTZMANN)

        # Only the requested info fields are populated; the rest stay `()`.
        emitted = self._emit_policy_info
        info_fields = policy_utilities.InfoFields
        log_probability_info = (
            log_probability if info_fields.LOG_PROBABILITY in emitted else ())
        predicted_rewards_mean_info = (
            predicted_reward_values
            if info_fields.PREDICTED_REWARDS_MEAN in emitted else ())
        bandit_policy_type_info = (
            bandit_policy_values
            if info_fields.BANDIT_POLICY_TYPE in emitted else ())

        if self._accepts_per_arm_features:
            # Saving the features for the chosen action to the policy_info.
            def gather_observation(obs):
                return tf.gather(params=obs, indices=actions, batch_dims=1)

            chosen_arm_features = tf.nest.map_structure(
                gather_observation,
                observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])
            policy_info = policy_utilities.PerArmPolicyInfo(
                log_probability=log_probability_info,
                predicted_rewards_mean=predicted_rewards_mean_info,
                bandit_policy_type=bandit_policy_type_info,
                chosen_arm_features=chosen_arm_features)
        else:
            policy_info = policy_utilities.PolicyInfo(
                log_probability=log_probability_info,
                predicted_rewards_mean=predicted_rewards_mean_info,
                bandit_policy_type=bandit_policy_type_info)

        return policy_step.PolicyStep(
            tfp.distributions.Deterministic(loc=actions), policy_state,
            policy_info)

Exemple #5

0

Afficher le fichier

  def __init__(self,
               time_step_spec: types.TimeStep,
               action_spec: types.NestedTensorSpec,
               reward_network: types.Network,
               observation_and_action_constraint_splitter: Optional[
                   types.Splitter] = None,
               accepts_per_arm_features: bool = False,
               constraints: Tuple[constr.NeuralConstraint, ...] = (),
               emit_policy_info: Tuple[Text, ...] = (),
               name: Optional[Text] = None):
    """Creates a greedy policy on top of a reward-predicting network.

    The policy feeds observations to `reward_network` and selects the action
    whose predicted reward is the largest.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions. It
        must flatten to exactly one bounded, discrete spec holding a single
        element.
      reward_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      observation_and_action_constraint_splitter: Optional function used for
        masking valid/invalid actions. It takes a full observation and returns
        a tuple of 1) the part of the observation meant for the network and
        2) a 0-1 mask `Tensor` of shape `[batch_size, num_actions]`. It should
        also accept a `TensorSpec` and then return `TensorSpec` objects for
        the observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      emit_policy_info: (tuple of strings) the side information to expose in
        the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
    policy_utilities.check_no_mask_with_arm_features(
        accepts_per_arm_features, observation_and_action_constraint_splitter)

    # Exactly one action spec is supported.
    flat_specs = tf.nest.flatten(action_spec)
    if len(flat_specs) > 1:
      raise NotImplementedError(
          'action_spec can only contain a single BoundedTensorSpec.')
    single_spec = flat_specs[0]

    spec_is_supported = (
        tensor_spec.is_bounded(single_spec) and
        tensor_spec.is_discrete(single_spec) and
        single_spec.shape.rank <= 1 and
        single_spec.shape.num_elements() == 1)
    if not spec_is_supported:
      raise NotImplementedError(
          'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
          'Found {}.'.format(single_spec))

    lowest_action = single_spec.minimum
    self._expected_num_actions = single_spec.maximum - lowest_action + 1
    self._action_offset = lowest_action
    reward_network.create_variables()
    self._reward_network = reward_network
    self._constraints = constraints
    self._emit_policy_info = emit_policy_info

    # Specs for the optional info fields; fields not requested stay `()`.
    fields = policy_utilities.InfoFields
    rewards_mean_spec = (
        tensor_spec.TensorSpec([self._expected_num_actions])
        if fields.PREDICTED_REWARDS_MEAN in emit_policy_info else ())
    policy_type_spec = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1])
        if fields.BANDIT_POLICY_TYPE in emit_policy_info else ())
    shared_info = dict(
        predicted_rewards_mean=rewards_mean_spec,
        bandit_policy_type=policy_type_spec)

    if not accepts_per_arm_features:
      info_spec = policy_utilities.PolicyInfo(**shared_info)
    else:
      # In the per-arm case the chosen arm's features are part of the info.
      arm_features_spec = (
          policy_utilities.create_chosen_arm_features_info_spec(
              time_step_spec.observation))
      info_spec = policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=arm_features_spec, **shared_info)

    self._accepts_per_arm_features = accepts_per_arm_features

    wants_log_probability = 'log_probability' in emit_policy_info
    super(GreedyRewardPredictionPolicy, self).__init__(
        time_step_spec,
        single_spec,
        policy_state_spec=reward_network.state_spec,
        clip=False,
        info_spec=info_spec,
        emit_log_probability=wants_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        name=name)

Exemple #6

0

Afficher le fichier

  def _distribution(self, time_step, policy_state):
    """Builds the greedy action distribution for a batch of time steps.

    Runs the reward network on the (optionally split) observation, selects the
    arm with the largest predicted reward while honoring any action mask, and
    packs the side information requested in `self._emit_policy_info` into the
    policy info.

    Args:
      time_step: Time step whose `observation` and `step_type` are fed to the
        reward network.
      policy_state: State that is passed through to the reward network.

    Returns:
      A `policy_step.PolicyStep` holding a `Deterministic` distribution located
      at the chosen actions, the policy state returned by the reward network,
      and the policy info.

    Raises:
      ValueError: If the statically known last dimension of the predicted
        rewards differs from the expected number of actions.
    """
    observation = time_step.observation
    if self.observation_and_action_constraint_splitter is not None:
      observation, _ = self.observation_and_action_constraint_splitter(
          observation)

    predictions, policy_state = self._reward_network(
        observation, time_step.step_type, policy_state)

    if isinstance(self._reward_network,
                  heteroscedastic_q_network.HeteroscedasticQNetwork):
      predicted_reward_values = predictions.q_value_logits
    else:
      predicted_reward_values = predictions

    # Read the batch size from the reward tensor itself. In the heteroscedastic
    # branch `predictions` is a structure exposing `q_value_logits` rather than
    # the reward tensor, so its leading dimension need not be the batch size.
    batch_size = tf.shape(predicted_reward_values)[0]

    # Static shape sanity checks: rank must be 2 or 3.
    predicted_reward_values.shape.with_rank_at_least(2)
    predicted_reward_values.shape.with_rank_at_most(3)
    num_predicted_actions = predicted_reward_values.shape[-1]
    if (num_predicted_actions is not None and
        num_predicted_actions != self._expected_num_actions):
      # Report the same (last) dimension that was compared above.
      raise ValueError(
          'The number of actions ({}) does not match the reward_network output'
          ' size ({}).'.format(self._expected_num_actions,
                               num_predicted_actions))

    # The mask is built from the unsplit observation plus any constraints.
    mask = constr.construct_mask_from_multiple_sources(
        time_step.observation, self._observation_and_action_constraint_splitter,
        self._constraints, self._expected_num_actions)

    # Argmax.
    if mask is not None:
      actions = policy_utilities.masked_argmax(
          predicted_reward_values, mask, output_type=self.action_spec.dtype)
    else:
      actions = tf.argmax(
          predicted_reward_values, axis=-1, output_type=self.action_spec.dtype)

    # Shift zero-based arm indices into the range of the action spec.
    actions += self._action_offset

    bandit_policy_values = tf.fill([batch_size, 1],
                                   policy_utilities.BanditPolicyType.GREEDY)

    # Optional info fields are emitted only when requested; `()` marks an
    # omitted field. They are shared by both info types below.
    emit = self._emit_policy_info
    info_fields = policy_utilities.InfoFields
    if info_fields.LOG_PROBABILITY in emit:
      log_probability = tf.zeros([batch_size], tf.float32)
    else:
      log_probability = ()
    predicted_rewards_mean = (
        predicted_reward_values
        if info_fields.PREDICTED_REWARDS_MEAN in emit else ())
    bandit_policy_type = (
        bandit_policy_values
        if info_fields.BANDIT_POLICY_TYPE in emit else ())

    if self._accepts_per_arm_features:
      # Saving the features for the chosen action to the policy_info.
      def gather_observation(obs):
        return tf.gather(params=obs, indices=actions, batch_dims=1)

      chosen_arm_features = tf.nest.map_structure(
          gather_observation,
          observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])
      policy_info = policy_utilities.PerArmPolicyInfo(
          log_probability=log_probability,
          predicted_rewards_mean=predicted_rewards_mean,
          bandit_policy_type=bandit_policy_type,
          chosen_arm_features=chosen_arm_features)
    else:
      policy_info = policy_utilities.PolicyInfo(
          log_probability=log_probability,
          predicted_rewards_mean=predicted_rewards_mean,
          bandit_policy_type=bandit_policy_type)

    return policy_step.PolicyStep(
        tfp.distributions.Deterministic(loc=actions), policy_state, policy_info)

Exemple #7

0

Afficher le fichier

Fichier : greedy_multi_objective_neural_policy.py Projet : morgandu/agents

    def __init__(
            self,
            time_step_spec: Optional[ts.TimeStep],
            action_spec: Optional[types.NestedBoundedTensorSpec],
            scalarizer: multi_objective_scalarizer.Scalarizer,
            objective_networks: Sequence[Network],
            observation_and_action_constraint_splitter: Optional[
                types.Splitter] = None,
            accepts_per_arm_features: bool = False,
            emit_policy_info: Tuple[Text, ...] = (),
            name: Optional[Text] = None):
        """Builds a GreedyMultiObjectiveNeuralPolicy based on multiple networks.

        This policy takes an iterable of `tf_agents.Network`, each responsible
        for predicting a specific objective, along with a `Scalarizer` object
        to generate an action by maximizing the scalarized objective, i.e., the
        output of the `Scalarizer` applied to the multiple predicted objectives
        by the networks.

        All argument validation happens before any network variables are
        created, so an invalid configuration fails without side effects on the
        passed-in networks.

        Args:
          time_step_spec: A `TimeStep` spec of the expected time_steps.
          action_spec: A nest of `BoundedTensorSpec` representing the actions.
          scalarizer: A
            `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
            object that implements scalarization of multiple objectives into a
            single scalar reward.
          objective_networks: A Sequence of `tf_agents.network.Network` objects
            to be used by the policy. Each network will be called with
            call(observation, step_type) and is expected to provide a
            prediction for a specific objective for all actions.
          observation_and_action_constraint_splitter: A function used for
            masking valid/invalid actions with each state of the environment.
            The function takes in a full observation and returns a tuple
            consisting of 1) the part of the observation intended as input to
            the network and 2) the mask.  The mask should be a 0-1 `Tensor` of
            shape `[batch_size, num_actions]`. This function should also work
            with a `TensorSpec` as input, and should output `TensorSpec`
            objects for the observation and mask.
          accepts_per_arm_features: (bool) Whether the policy accepts per-arm
            features.
          emit_policy_info: (tuple of strings) what side information we want to
            get as part of the policy info. Allowed values can be found in
            `policy_utilities.PolicyInfo`.
          name: The name of this policy. All variables in this module will fall
            under that name. Defaults to the class name.

        Raises:
          NotImplementedError: If `action_spec` contains more than one
            `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
          NotImplementedError: If `action_spec` is not a `BoundedTensorSpec` of
            type int32 and shape ().
          ValueError: If `objective_networks` has fewer than two networks.
          ValueError: If `accepts_per_arm_features` is true but
            `time_step_spec` is None.
        """
        policy_utilities.check_no_mask_with_arm_features(
            accepts_per_arm_features,
            observation_and_action_constraint_splitter)
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))

        # Validate the remaining arguments up front, before variables are
        # created on the networks below.
        num_objectives = len(objective_networks)
        if num_objectives < 2:
            raise ValueError(
                'Number of objectives should be at least two, but found to be {}'
                .format(num_objectives))
        if accepts_per_arm_features and time_step_spec is None:
            raise ValueError(
                'time_step_spec should not be None for per-arm-features policies, '
                'but found to be.')

        # Actions are indexed from zero internally; the offset maps them back
        # into the [minimum, maximum] range of the action spec.
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        self._objective_networks = objective_networks
        self._scalarizer = scalarizer
        self._num_objectives = num_objectives

        # The policy state is the list of per-network state specs.
        policy_state_spec = []
        for network in objective_networks:
            policy_state_spec.append(network.state_spec)
            network.create_variables()

        # Optional info fields default to `()`, meaning "not emitted".
        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._num_objectives, self._expected_num_actions])
        scalarized_predicted_rewards_mean = ()
        if (policy_utilities.InfoFields.
                MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN
                in emit_policy_info):
            scalarized_predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        if accepts_per_arm_features:
            # The features for the chosen arm is saved to policy_info.
            chosen_arm_features_info = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                multiobjective_scalarized_predicted_rewards_mean=
                scalarized_predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type,
                chosen_arm_features=chosen_arm_features_info)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                multiobjective_scalarized_predicted_rewards_mean=
                scalarized_predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type)

        self._accepts_per_arm_features = accepts_per_arm_features

        super(GreedyMultiObjectiveNeuralPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=policy_state_spec,
                             clip=False,
                             info_spec=info_spec,
                             emit_log_probability='log_probability'
                             in emit_policy_info,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)

Exemple #8

0

Afficher le fichier

Fichier : greedy_multi_objective_neural_policy.py Projet : morgandu/agents

    def _distribution(
            self, time_step: ts.TimeStep,
            policy_state: Sequence[types.TensorSpec]
    ) -> policy_step.PolicyStep:
        """Returns the greedy action distribution on the scalarized objectives.

        The objective predictions are scalarized with `self._scalarizer`, the
        arm with the largest scalarized value is chosen (subject to an action
        mask, when one is available), and the side information listed in
        `self._emit_policy_info` is attached as policy info.

        Args:
          time_step: Time step providing the observation and step type.
          policy_state: State forwarded to `self._predict`.

        Returns:
          A `policy_step.PolicyStep` with a `Deterministic` distribution at the
          selected actions, the policy state returned by `self._predict`, and
          the policy info.
        """
        network_input = time_step.observation
        if self.observation_and_action_constraint_splitter is not None:
            network_input, _ = self.observation_and_action_constraint_splitter(
                network_input)

        objective_values, policy_state = self._predict(
            network_input, time_step.step_type, policy_state)
        scalarized_reward = scalarize_objectives(objective_values,
                                                 self._scalarizer)

        # Preserve static batch size values when they are available.
        static_batch = tf.compat.dimension_value(scalarized_reward.shape[0])
        batch_size = static_batch or tf.shape(scalarized_reward)[0]

        # The mask is derived from the full, unsplit observation.
        mask = constraints.construct_mask_from_multiple_sources(
            time_step.observation,
            self._observation_and_action_constraint_splitter, (),
            self._expected_num_actions)

        # Argmax.
        action_dtype = self.action_spec.dtype
        if mask is None:
            actions = tf.argmax(scalarized_reward,
                                axis=-1,
                                output_type=action_dtype)
        else:
            actions = policy_utilities.masked_argmax(
                scalarized_reward, mask, output_type=action_dtype)

        actions += self._action_offset

        bandit_policy_values = tf.fill(
            [batch_size, 1], policy_utilities.BanditPolicyType.GREEDY)

        # Fields common to both info types; `()` stands for "not emitted".
        requested = self._emit_policy_info
        fields = policy_utilities.InfoFields
        if fields.LOG_PROBABILITY in requested:
            log_probability = tf.zeros([batch_size], tf.float32)
        else:
            log_probability = ()
        info_kwargs = dict(
            log_probability=log_probability,
            predicted_rewards_mean=(
                objective_values
                if fields.PREDICTED_REWARDS_MEAN in requested else ()),
            multiobjective_scalarized_predicted_rewards_mean=(
                scalarized_reward
                if fields.MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN
                in requested else ()),
            bandit_policy_type=(
                bandit_policy_values
                if fields.BANDIT_POLICY_TYPE in requested else ()))

        if not self._accepts_per_arm_features:
            policy_info = policy_utilities.PolicyInfo(**info_kwargs)
        else:
            # Saving the features for the chosen action to the policy_info.
            chosen_arm_features = tf.nest.map_structure(
                lambda obs: tf.gather(
                    params=obs, indices=actions, batch_dims=1),
                network_input[bandit_spec_utils.PER_ARM_FEATURE_KEY])
            policy_info = policy_utilities.PerArmPolicyInfo(
                chosen_arm_features=chosen_arm_features, **info_kwargs)

        return policy_step.PolicyStep(
            tfp.distributions.Deterministic(loc=actions), policy_state,
            policy_info)