def setUp(self):
    super(TemporalActionSmoothingTest, self).setUp()
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 0, 10)
    def testDistributedLinearAgentUpdate(self,
                                         batch_size,
                                         context_dim,
                                         exploration_policy,
                                         dtype,
                                         use_eigendecomp=False):
        """Same as above, but uses the distributed train function of the agent."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)

        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=dtype)
        self.evaluate(agent.initialize())
        train_fn = common.function_in_tf1()(agent._distributed_train_step)
        loss_info = train_fn(experience=experience)
        self.evaluate(loss_info)

        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        for observations_for_arm, rewards_for_arm in zip(
                observations_list, rewards_list):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim,
                                 context_dim]), tf.zeros([context_dim])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            theta_new = tf.squeeze(tf.linalg.solve(
                a_new + tf.eye(context_dim), tf.expand_dims(b_new, axis=-1)),
                                   axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
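
# A minimal illustration of the quantity that
# `bandit_utils.sum_reward_weighted_observations` is assumed to compute in the
# expected-value calculation above (b = sum_i r_i * x_i over one arm's samples).
# This is a sketch for clarity, not the library implementation.
import tensorflow as tf


def reward_weighted_observation_sum(rewards, observations):
    """rewards: [n], observations: [n, context_dim]. Returns a [context_dim] sum."""
    return tf.reduce_sum(rewards[:, tf.newaxis] * observations, axis=0)


example_rewards = tf.constant([1.0, 2.0])
example_observations = tf.constant([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0]])
assert reward_weighted_observation_sum(
    example_rewards, example_observations).shape.as_list() == [3]
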
# Example 3
 def testSerialization(self):
     desc = tensor_spec.BoundedTensorSpec([1, 5], tf.float32, -1, 1, "test")
     self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
# Example 4
 def setUp(self):
     super(QPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
    def testLinearAgentUpdateWithMaskedActions(self,
                                               batch_size,
                                               context_dim,
                                               exploration_policy,
                                               dtype,
                                               use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
            batch_size, context_dim, num_actions=num_actions)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32),
                            tensor_spec.TensorSpec([num_actions], tf.int32))
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)

        def observation_and_action_constraint_splitter(obs):
            return obs[0], obs[1]

        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(
                observation_and_action_constraint_splitter(
                    experience.observation)[0], [batch_size, -1]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        for observations_for_arm, rewards_for_arm in zip(
                observations_list, rewards_list):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim,
                                 context_dim]), tf.zeros([context_dim])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
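
# `_get_initial_and_final_steps_with_action_mask` is defined elsewhere in the
# test module. As a rough, hypothetical sketch, the masked observation it is
# assumed to produce is a (context, mask) tuple matching the tuple
# `observation_spec` above, where each 0/1 mask row marks the valid actions for
# that batch entry.
import numpy as np
import tensorflow as tf
from tf_agents.trajectories import time_step as ts

batch_size, context_dim, num_actions = 2, 4, 5
context = tf.constant(np.random.rand(batch_size, context_dim).astype(np.float32))
mask = tf.ones([batch_size, num_actions], dtype=tf.int32)  # here: all actions valid
initial_step = ts.restart((context, mask), batch_size=batch_size)
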
# Example 6
 def setUp(self):
     super(DqnAgentTest, self).setUp()
     self._observation_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._observation_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)
# Example 7
  def testNeuralLinUCBUpdateDistributed(self, batch_size=1, context_dim=10):
    """Same as above but with distributed LinUCB updates."""

    # Construct a `Trajectory` for the given action, observation, reward.
    num_actions = 5
    initial_step, final_step = _get_initial_and_final_steps(
        batch_size, context_dim)
    action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
    action_step = _get_action_step(action)
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
    encoder = DummyNet(observation_spec)
    encoding_dim = 10
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        encoding_network=encoder,
        encoding_network_num_train_steps=0,
        encoding_dim=encoding_dim,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2))

    self.evaluate(agent.initialize())
    self.evaluate(tf.compat.v1.global_variables_initializer())
    # Call the distributed LinUCB training instead of agent.train().
    train_fn = common.function_in_tf1()(
        agent.compute_loss_using_linucb_distributed)
    reward = tf.cast(experience.reward, agent._dtype)
    loss_info = train_fn(
        experience.observation, action, reward, weights=None)
    self.evaluate(loss_info)
    final_a = self.evaluate(agent.cov_matrix)
    final_b = self.evaluate(agent.data_vector)

    # Compute the expected updated estimates.
    observations_list = tf.dynamic_partition(
        data=tf.reshape(experience.observation, [batch_size, context_dim]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    rewards_list = tf.dynamic_partition(
        data=tf.reshape(experience.reward, [batch_size]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    expected_a_updated_list = []
    expected_b_updated_list = []
    for observations_for_arm, rewards_for_arm in zip(
        observations_list, rewards_list):

      encoded_observations_for_arm, _ = encoder(observations_for_arm)

      num_samples_for_arm_current = tf.cast(
          tf.shape(rewards_for_arm)[0], tf.float32)
      num_samples_for_arm_total = num_samples_for_arm_current

      # pylint: disable=cell-var-from-loop
      def true_fn():
        a_new = tf.matmul(
            encoded_observations_for_arm,
            encoded_observations_for_arm,
            transpose_a=True)
        b_new = bandit_utils.sum_reward_weighted_observations(
            rewards_for_arm, encoded_observations_for_arm)
        return a_new, b_new
      def false_fn():
        return (tf.zeros([encoding_dim, encoding_dim], dtype=tf.float32),
                tf.zeros([encoding_dim], dtype=tf.float32))

      a_new, b_new = tf.cond(
          tf.squeeze(num_samples_for_arm_total) > 0,
          true_fn,
          false_fn)

      expected_a_updated_list.append(self.evaluate(a_new))
      expected_b_updated_list.append(self.evaluate(b_new))

    # Check that the actual updated estimates match the expectations.
    self.assertAllClose(expected_a_updated_list, final_a)
    self.assertAllClose(expected_b_updated_list, final_b)
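
# `DummyNet` is defined elsewhere in the test file. As a hypothetical stand-in,
# any TF-Agents network that maps a [batch, context_dim] observation to a
# [batch, encoding_dim] encoding can serve as the `encoding_network` argument,
# e.g. a plain EncodingNetwork whose last layer width equals the encoding
# dimension:
import tensorflow as tf
from tf_agents.networks import encoding_network
from tf_agents.specs import tensor_spec

context_dim, encoding_dim = 10, 10
observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
encoder = encoding_network.EncodingNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=(encoding_dim,))
encoded, _ = encoder(tf.zeros([1, context_dim]))
assert encoded.shape.as_list() == [1, encoding_dim]
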
# Example 8
def create_bandit_policy_type_tensor_spec(shape):
    """Create tensor spec for bandit policy type."""
    return tensor_spec.BoundedTensorSpec(shape=shape,
                                         dtype=tf.int32,
                                         minimum=BanditPolicyType.UNKNOWN,
                                         maximum=BanditPolicyType.UNIFORM)
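
# A brief usage sketch, assuming this module's imports: the resulting bounded
# scalar spec is typically carried per batch entry in the policy info to record
# which kind of bandit policy (e.g. greedy or uniform) produced an action.
bandit_policy_spec = create_bandit_policy_type_tensor_spec(shape=())
sampled_policy_types = tensor_spec.sample_spec_nest(
    bandit_policy_spec, outer_dims=(4,))
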
# Example 9
    def __init__(self,
                 encoding_network,
                 encoding_dim,
                 reward_layer,
                 epsilon_greedy,
                 actions_from_reward_layer,
                 cov_matrix,
                 data_vector,
                 num_samples,
                 time_step_spec=None,
                 alpha=1.0,
                 emit_log_probability=False,
                 name=None):
        """Initializes `NeuralLinUCBPolicy`.

    Args:
      encoding_network: network that encodes the observations.
      encoding_dim: (int) dimension of the encoded observations.
      reward_layer: final layer that predicts the expected reward per arm.
      epsilon_greedy: (float) representing the probability of choosing a random
        action instead of the greedy action.
      actions_from_reward_layer: (bool) whether to get actions from the reward
        layer or from LinUCB.
      cov_matrix: list of the covariance matrices. There exists one covariance
        matrix per arm.
      data_vector: list of the data vectors. A data vector is a weighted sum
        of the observations, where the weight is the corresponding reward. Each
        arm has its own data vector.
      num_samples: list of number of samples per arm.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      alpha: (float) non-negative weight multiplying the confidence intervals.
      emit_log_probability: (bool) whether to emit log probabilities.
      name: The name of this policy.
    """
        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_dim = encoding_dim

        if not isinstance(cov_matrix, (list, tuple)):
            raise ValueError(
                'cov_matrix must be a list of matrices (Tensors).')
        self._cov_matrix = cov_matrix

        if not isinstance(data_vector, (list, tuple)):
            raise ValueError(
                'data_vector must be a list of vectors (Tensors).')
        self._data_vector = data_vector

        if not isinstance(num_samples, (list, tuple)):
            raise ValueError(
                'num_samples must be a list of vectors (Tensors).')
        self._num_samples = num_samples

        self._alpha = alpha
        self._actions_from_reward_layer = actions_from_reward_layer
        self._epsilon_greedy = epsilon_greedy
        self._dtype = self._data_vector[0].dtype

        if len(cov_matrix) != len(data_vector):
            raise ValueError(
                'The size of list cov_matrix must match the size of '
                'list data_vector. Got {} for cov_matrix and {} '
                'for data_vector'.format(len(self._cov_matrix),
                                         len((data_vector))))
        if len(num_samples) != len(cov_matrix):
            raise ValueError('The size of num_samples must match the size of '
                             'list cov_matrix. Got {} for num_samples and {} '
                             'for cov_matrix'.format(len(self._num_samples),
                                                     len((cov_matrix))))

        self._num_actions = len(cov_matrix)
        assert self._num_actions
        self._observation_dim = tf.compat.dimension_value(
            time_step_spec.observation.shape[0])
        cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
        if self._encoding_dim != cov_matrix_dim:
            raise ValueError('The dimension of matrix `cov_matrix` must match '
                             'encoding dimension {}. '
                             'Got {} for `cov_matrix`.'.format(
                                 self._encoding_dim, cov_matrix_dim))
        data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
        if self._encoding_dim != data_vector_dim:
            raise ValueError(
                'The dimension of vector `data_vector` must match '
                'encoding dimension {}. '
                'Got {} for `data_vector`.'.format(self._encoding_dim,
                                                   data_vector_dim))
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=self._num_actions - 1,
            name='action')
        super(NeuralLinUCBPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             emit_log_probability=emit_log_probability,
                             name=name)
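
# A minimal sketch of the LinUCB quantities that `cov_matrix`, `data_vector` and
# `alpha` feed into (an illustration of the math, not this policy's actual
# `_distribution` code): for arm k with covariance A_k and data vector b_k, the
# weight estimate is theta_k = inv(A_k + I) b_k, and the optimistic score of an
# encoded observation x is x^T theta_k + alpha * sqrt(x^T inv(A_k + I) x).
import tensorflow as tf


def linucb_score(encoding, cov_matrix, data_vector, alpha=1.0):
    """encoding: [d], cov_matrix: [d, d], data_vector: [d]. Returns a scalar."""
    d = tf.shape(cov_matrix)[0]
    a_inv = tf.linalg.inv(cov_matrix + tf.eye(d, dtype=cov_matrix.dtype))
    theta = tf.linalg.matvec(a_inv, data_vector)
    mean_reward = tf.tensordot(encoding, theta, axes=1)
    confidence = tf.sqrt(
        tf.tensordot(encoding, tf.linalg.matvec(a_inv, encoding), axes=1))
    return mean_reward + alpha * confidence
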
# Example 10
  def __init__(self,
               time_step_spec,
               action_spec,
               policy_state_spec=(),
               info_spec=(),
               clip=True,
               emit_log_probability=False,
               automatic_state_reset=True,
               observation_and_action_constraint_splitter=None,
               name=None):
    """Initialization of Base class.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
        provided by the user to the subclass.
      action_spec: A nest of BoundedTensorSpec representing the actions. Usually
        provided by the user to the subclass.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        Provided by the subclass, not directly by the user.
      info_spec: A nest of TensorSpec representing the policy info. Provided by
        the subclass, not directly by the user.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      emit_log_probability: Emit log-probabilities of actions, if supported. If
        True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
        Please consult utility methods provided in policy_step for setting and
        retrieving these. When working with custom policies, either provide a
        dictionary info_spec or a namedtuple with the field 'log_probability'.
      automatic_state_reset:  If `True`, then `get_initial_policy_state` is used
        to clear state in `action()` and `distribution()` for time steps
        where `time_step.is_first()`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment. The function takes in a full observation and returns a
        tuple consisting of 1) the part of the observation intended as input to
        the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
          sure the provided `q_network` is compatible with the network-specific
          half of the output of the
          `observation_and_action_constraint_splitter`. In particular,
          `observation_and_action_constraint_splitter` will be called on the
          observation before passing to the network. If
          `observation_and_action_constraint_splitter` is None, action
          constraints are not applied.
      name: A name for this module. Defaults to the class name.
    """
    super(Base, self).__init__(name=name)
    common.check_tf1_allowed()
    common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
    common.assert_members_are_not_overridden(base_cls=Base, instance=self)
    if not isinstance(time_step_spec, ts.TimeStep):
      raise ValueError(
          'The `time_step_spec` must be an instance of `TimeStep`, but is `{}`.'
          .format(type(time_step_spec)))

    self._time_step_spec = time_step_spec
    self._action_spec = action_spec
    self._policy_state_spec = policy_state_spec
    self._emit_log_probability = emit_log_probability
    if emit_log_probability:
      log_probability_spec = tensor_spec.BoundedTensorSpec(
          shape=(),
          dtype=tf.float32,
          maximum=0,
          minimum=-float('inf'),
          name='log_probability')
      log_probability_spec = tf.nest.map_structure(
          lambda _: log_probability_spec, action_spec)
      info_spec = policy_step.set_log_probability(info_spec,
                                                  log_probability_spec)

    self._info_spec = info_spec
    self._setup_specs()
    self._clip = clip
    self._action_fn = common.function_in_tf1()(self._action)
    self._automatic_state_reset = automatic_state_reset
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
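
# A hypothetical example of the `observation_and_action_constraint_splitter`
# argument described above, for a dict observation carrying both the network
# input and a 0/1 action mask (the keys follow the docstring's example and are
# illustrative only):
import tensorflow as tf
from tf_agents.specs import tensor_spec

num_actions = 4
masked_observation_spec = {
    'network_input': tensor_spec.TensorSpec([7], tf.float32),
    'constraint': tensor_spec.BoundedTensorSpec(
        [num_actions], tf.int32, minimum=0, maximum=1),
}


def observation_and_action_constraint_splitter(observation):
    # Works both on concrete observations and on the spec dict itself.
    return observation['network_input'], observation['constraint']
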
def create_feed_forward_common_tower_network(
    observation_spec: types.NestedTensorSpec,
    global_layers: Sequence[int],
    arm_layers: Sequence[int],
    common_layers: Sequence[int],
    output_dim: int = 1,
    global_preprocessing_combiner: Optional[Callable[...,
                                                     types.Tensor]] = None,
    arm_preprocessing_combiner: Optional[Callable[..., types.Tensor]] = None,
    activation_fn: Callable[[types.Tensor],
                            types.Tensor] = tf.keras.activations.relu
) -> types.Network:
    """Creates a common tower network with feedforward towers.

  The network produced by this function can be used either in
  `GreedyRewardPredictionPolicy`, or `NeuralLinUCBPolicy`.
  In the former case, the network must have `output_dim=1`; it will be an
  instance of `QNetwork` and is used in the policy as a reward prediction
  network.
  In the latter case, the network will be an encoding network with its output
  consumed by a reward layer or a LinUCB method. The specified `output_dim` will
  be the encoding dimension.

  Args:
    observation_spec: A nested tensor spec containing the specs for global as
      well as per-arm observations.
    global_layers: Iterable of ints. Specifies the layers of the global tower.
    arm_layers: Iterable of ints. Specifies the layers of the arm tower.
    common_layers: Iterable of ints. Specifies the layers of the common tower.
    output_dim: The output dimension of the network. If 1, the common tower will
      be a QNetwork. Otherwise, the common tower will be an encoding network
      with the specified output dimension.
    global_preprocessing_combiner: Preprocessing combiner for global features.
    arm_preprocessing_combiner: Preprocessing combiner for the arm features.
    activation_fn: A keras activation, specifying the activation function used
      in all layers. Defaults to relu.

  Returns:
    A network that takes observations adhering to `observation_spec` and
    outputs reward estimates for every action.
  """
    obs_spec_no_num_actions = _remove_num_actions_dim_from_spec(
        observation_spec)
    global_network = encoding_network.EncodingNetwork(
        input_tensor_spec=obs_spec_no_num_actions[
            bandit_spec_utils.GLOBAL_FEATURE_KEY],
        fc_layer_params=global_layers,
        activation_fn=activation_fn,
        preprocessing_combiner=global_preprocessing_combiner)

    arm_network = encoding_network.EncodingNetwork(
        input_tensor_spec=obs_spec_no_num_actions[
            bandit_spec_utils.PER_ARM_FEATURE_KEY],
        fc_layer_params=arm_layers,
        activation_fn=activation_fn,
        preprocessing_combiner=arm_preprocessing_combiner)

    # When `global_layers` or `arm_layers` are empty, the corresponding encoding
    # networks simply pass the inputs forward, so in such cases we get the output
    # dimensions from the respective observation specs.
    global_network_out_dim = (
        global_layers[-1] if global_layers
        else obs_spec_no_num_actions[
            bandit_spec_utils.GLOBAL_FEATURE_KEY].shape[-1])
    arm_network_out_dim = (
        arm_layers[-1] if arm_layers
        else obs_spec_no_num_actions[
            bandit_spec_utils.PER_ARM_FEATURE_KEY].shape[-1])
    common_input_spec = tensor_spec.TensorSpec(
        shape=(global_network_out_dim + arm_network_out_dim,), dtype=tf.float32)
    if output_dim == 1:
        common_network = q_network.QNetwork(
            input_tensor_spec=common_input_spec,
            action_spec=tensor_spec.BoundedTensorSpec(shape=(),
                                                      minimum=0,
                                                      maximum=0,
                                                      dtype=tf.int32),
            fc_layer_params=common_layers,
            activation_fn=activation_fn)
    else:
        common_network = encoding_network.EncodingNetwork(
            input_tensor_spec=common_input_spec,
            fc_layer_params=list(common_layers) + [output_dim],
            activation_fn=activation_fn)
    return GlobalAndArmCommonTowerNetwork(obs_spec_no_num_actions,
                                          global_network, arm_network,
                                          common_network)
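
# A brief usage sketch, assuming this module's imports plus
# `bandit_spec_utils.create_per_arm_observation_spec` (used elsewhere in this
# code base): build a per-arm observation spec and a common tower network on top
# of it. With `output_dim=1` the common tower is a `QNetwork` usable by
# `GreedyRewardPredictionPolicy`; with a larger `output_dim` it is an encoding
# network for `NeuralLinUCBPolicy`.
global_dim, arm_dim, num_actions = 4, 5, 6
per_arm_obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
    global_dim, arm_dim, num_actions)
reward_network = create_feed_forward_common_tower_network(
    observation_spec=per_arm_obs_spec,
    global_layers=(8, 8),
    arm_layers=(8, 8),
    common_layers=(8,),
    output_dim=1)
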
    def testBuildsRnn(self, lstm_size, rnn_construction_fn):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, ))

        action_spec = [
            tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
            tensor_spec.BoundedTensorSpec((3, ), tf.int32, 0, 3)
        ]

        net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            conv_layer_params=[(4, 2, 2)],
            input_fc_layer_params=(5, ),
            output_fc_layer_params=(5, ),
            lstm_size=lstm_size,
            rnn_construction_fn=rnn_construction_fn,
            rnn_construction_kwargs={'lstm_size': 3})

        action_distributions, network_state = net(
            time_step.observation, time_step.step_type,
            net.get_initial_state(batch_size=1))
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual([1, 2],
                         action_distributions[0].mode().shape.as_list())
        self.assertEqual([1, 3],
                         action_distributions[1].mode().shape.as_list())

        self.assertLen(net.variables, 14)
        # Conv Net Kernel
        self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
        # Conv Net bias
        self.assertEqual((4, ), net.variables[1].shape)
        # Fc Kernel
        self.assertEqual((64, 5), net.variables[2].shape)
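        # Where the 64 above comes from: the conv layer (4 filters, kernel 2,
        # stride 2) maps the 8x8x3 observation to a 4x4x4 feature map, i.e.
        # 4 * 4 * 4 = 64 flattened inputs to the 5-unit input FC layer.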
        # Fc Bias
        self.assertEqual((5, ), net.variables[3].shape)
        # RNN Cell Kernel
        self.assertEqual((5, 3), net.variables[4].shape)
        # RNN Cell Recurrent Kernel
        self.assertEqual((3, 3), net.variables[5].shape)
        # RNN Cell Bias
        self.assertEqual((3, ), net.variables[6].shape)
        # Fc Kernel
        self.assertEqual((3, 5), net.variables[7].shape)
        # Fc Bias
        self.assertEqual((5, ), net.variables[8].shape)
        # Normal Projection Kernel
        self.assertEqual((5, 2), net.variables[9].shape)
        # Normal Projection Bias
        self.assertEqual((2, ), net.variables[10].shape)
        # Normal Projection STD Bias layer
        self.assertEqual((2, ), net.variables[11].shape)
        # Categorical Projection Kernel
        self.assertEqual((5, 12), net.variables[12].shape)
        # Categorical Projection Bias
        self.assertEqual((12, ), net.variables[13].shape)

        # Assert RNN cell is created.
        self.assertEqual((3, ), network_state[0].shape)
    def __init__(self,
                 encoding_network,
                 encoding_dim,
                 reward_layer,
                 epsilon_greedy,
                 actions_from_reward_layer,
                 cov_matrix,
                 data_vector,
                 num_samples,
                 time_step_spec=None,
                 alpha=1.0,
                 emit_log_probability=False,
                 observation_and_action_constraint_splitter=None,
                 name=None):
        """Initializes `NeuralLinUCBPolicy`.

    Args:
      encoding_network: network that encodes the observations.
      encoding_dim: (int) dimension of the encoded observations.
      reward_layer: final layer that predicts the expected reward per arm.
      epsilon_greedy: (float) representing the probability of choosing a random
        action instead of the greedy action.
      actions_from_reward_layer: (bool) whether to get actions from the reward
        layer or from LinUCB.
      cov_matrix: list of the covariance matrices. There exists one covariance
        matrix per arm.
      data_vector: list of the data vectors. A data vector is a weighted sum
        of the observations, where the weight is the corresponding reward. Each
        arm has its own data vector.
      num_samples: list of number of samples per arm.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      alpha: (float) non-negative weight multiplying the confidence intervals.
      emit_log_probability: (bool) whether to emit log probabilities.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit policy and 2)
        the mask. The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      name: The name of this policy.
    """
        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_dim = encoding_dim

        if not isinstance(cov_matrix, (list, tuple)):
            raise ValueError(
                'cov_matrix must be a list of matrices (Tensors).')
        self._cov_matrix = cov_matrix

        if not isinstance(data_vector, (list, tuple)):
            raise ValueError(
                'data_vector must be a list of vectors (Tensors).')
        self._data_vector = data_vector

        if not isinstance(num_samples, (list, tuple)):
            raise ValueError(
                'num_samples must be a list of vectors (Tensors).')
        self._num_samples = num_samples

        self._alpha = alpha
        self._actions_from_reward_layer = actions_from_reward_layer
        self._epsilon_greedy = epsilon_greedy
        self._dtype = self._data_vector[0].dtype

        if len(cov_matrix) != len(data_vector):
            raise ValueError(
                'The size of list cov_matrix must match the size of '
                'list data_vector. Got {} for cov_matrix and {} '
                'for data_vector'.format(len(self._cov_matrix),
                                         len((data_vector))))
        if len(num_samples) != len(cov_matrix):
            raise ValueError('The size of num_samples must match the size of '
                             'list cov_matrix. Got {} for num_samples and {} '
                             'for cov_matrix'.format(len(self._num_samples),
                                                     len((cov_matrix))))

        self._num_actions = len(cov_matrix)
        assert self._num_actions
        if observation_and_action_constraint_splitter is not None:
            context_shape = observation_and_action_constraint_splitter(
                time_step_spec.observation)[0].shape.as_list()
        else:
            context_shape = time_step_spec.observation.shape.as_list()
        self._context_dim = (tf.compat.dimension_value(context_shape[0])
                             if context_shape else 1)
        cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
        if self._encoding_dim != cov_matrix_dim:
            raise ValueError('The dimension of matrix `cov_matrix` must match '
                             'encoding dimension {}. '
                             'Got {} for `cov_matrix`.'.format(
                                 self._encoding_dim, cov_matrix_dim))
        data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
        if self._encoding_dim != data_vector_dim:
            raise ValueError(
                'The dimension of vector `data_vector` must match '
                'encoding dimension {}. '
                'Got {} for `data_vector`.'.format(self._encoding_dim,
                                                   data_vector_dim))
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=self._num_actions - 1,
            name='action')
        super(NeuralLinUCBPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             emit_log_probability=emit_log_probability,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
    def __init__(self,
                 encoding_network: types.Network,
                 encoding_dim: int,
                 reward_layer: tf.keras.layers.Dense,
                 epsilon_greedy: float,
                 actions_from_reward_layer: types.Bool,
                 cov_matrix: Sequence[types.Float],
                 data_vector: Sequence[types.Float],
                 num_samples: Sequence[types.Int],
                 time_step_spec: types.TimeStep,
                 alpha: float = 1.0,
                 emit_policy_info: Sequence[Text] = (),
                 emit_log_probability: bool = False,
                 accepts_per_arm_features: bool = False,
                 distributed_use_reward_layer: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 name: Optional[Text] = None):
        """Initializes `NeuralLinUCBPolicy`.

    Args:
      encoding_network: network that encodes the observations.
      encoding_dim: (int) dimension of the encoded observations.
      reward_layer: final layer that predicts the expected reward per arm. In
        case the policy accepts per-arm features, the output of this layer has
        to be a scalar. This is because in the per-arm case, all encoded
        observations have to go through the same computation to get the reward
        estimates. The `num_actions` dimension of the encoded observation is
        treated as a batch dimension in the reward layer.
      epsilon_greedy: (float) representing the probability of choosing a random
        action instead of the greedy action.
      actions_from_reward_layer: (boolean variable) whether to get actions from
        the reward layer or from LinUCB.
      cov_matrix: list of the covariance matrices. There exists one covariance
        matrix per arm, unless the policy accepts per-arm features, in which
        case this list must have a single element.
      data_vector: list of the data vectors. A data vector is a weighted sum
        of the observations, where the weight is the corresponding reward. Each
        arm has its own data vector, unless the policy accepts per-arm features,
        in which case this list must have a single element.
      num_samples: list of number of samples per arm. If the policy accepts per-
        arm features, this is a single-element list counting the number of
        steps.
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      alpha: (float) non-negative weight multiplying the confidence intervals.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: (bool) whether to emit log probabilities.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      distributed_use_reward_layer: (bool) Whether to pick the actions using
        the network or use LinUCB. This applies only in distributed training
        setting and has a similar role to the `actions_from_reward_layer`
        mentioned above.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit policy and 2)
        the mask. The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      name: The name of this policy.
    """
        policy_utilities.check_no_mask_with_arm_features(
            accepts_per_arm_features,
            observation_and_action_constraint_splitter)
        encoding_network.create_variables()
        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_dim = encoding_dim

        if accepts_per_arm_features and reward_layer.units != 1:
            raise ValueError(
                'The output dimension of the reward layer must be 1, got'
                ' {}'.format(reward_layer.units))

        if not isinstance(cov_matrix, (list, tuple)):
            raise ValueError(
                'cov_matrix must be a list of matrices (Tensors).')
        self._cov_matrix = cov_matrix

        if not isinstance(data_vector, (list, tuple)):
            raise ValueError(
                'data_vector must be a list of vectors (Tensors).')
        self._data_vector = data_vector

        if not isinstance(num_samples, (list, tuple)):
            raise ValueError(
                'num_samples must be a list of vectors (Tensors).')
        self._num_samples = num_samples

        self._alpha = alpha
        self._actions_from_reward_layer = actions_from_reward_layer
        self._epsilon_greedy = epsilon_greedy
        self._dtype = self._data_vector[0].dtype
        self._distributed_use_reward_layer = distributed_use_reward_layer

        if len(cov_matrix) != len(data_vector):
            raise ValueError(
                'The size of list cov_matrix must match the size of '
                'list data_vector. Got {} for cov_matrix and {} '
                'for data_vector'.format(len(self._cov_matrix),
                                         len((data_vector))))
        if len(num_samples) != len(cov_matrix):
            raise ValueError('The size of num_samples must match the size of '
                             'list cov_matrix. Got {} for num_samples and {} '
                             'for cov_matrix'.format(len(self._num_samples),
                                                     len((cov_matrix))))

        self._accepts_per_arm_features = accepts_per_arm_features
        if observation_and_action_constraint_splitter is not None:
            context_spec, _ = observation_and_action_constraint_splitter(
                time_step_spec.observation)
        else:
            context_spec = time_step_spec.observation
        if accepts_per_arm_features:
            self._num_actions = tf.nest.flatten(context_spec[
                bandit_spec_utils.PER_ARM_FEATURE_KEY])[0].shape.as_list()[0]
            self._num_models = 1
        else:
            self._num_actions = len(cov_matrix)
            self._num_models = self._num_actions
        cov_matrix_dim = tf.compat.dimension_value(cov_matrix[0].shape[0])
        if self._encoding_dim != cov_matrix_dim:
            raise ValueError('The dimension of matrix `cov_matrix` must match '
                             'encoding dimension {}. '
                             'Got {} for `cov_matrix`.'.format(
                                 self._encoding_dim, cov_matrix_dim))
        data_vector_dim = tf.compat.dimension_value(data_vector[0].shape[0])
        if self._encoding_dim != data_vector_dim:
            raise ValueError(
                'The dimension of vector `data_vector` must match '
                'encoding dimension {}. '
                'Got {} for `data_vector`.'.format(self._encoding_dim,
                                                   data_vector_dim))
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(),
            dtype=tf.int32,
            minimum=0,
            maximum=self._num_actions - 1,
            name='action')

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._num_actions], dtype=tf.float32)
        predicted_rewards_optimistic = ()
        if (policy_utilities.InfoFields.PREDICTED_REWARDS_OPTIMISTIC
                in emit_policy_info):
            predicted_rewards_optimistic = tensor_spec.TensorSpec(
                [self._num_actions], dtype=tf.float32)
        if accepts_per_arm_features:
            chosen_arm_features_info_spec = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                predicted_rewards_optimistic=predicted_rewards_optimistic,
                chosen_arm_features=chosen_arm_features_info_spec)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                predicted_rewards_optimistic=predicted_rewards_optimistic)

        super(NeuralLinUCBPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             emit_log_probability=emit_log_probability,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             info_spec=info_spec,
                             name=name)
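
# A hedged sketch of how the LinUCB state handed to this policy is typically
# shaped (variable names are illustrative; shapes follow the docstring above).
# With per-arm features the lists hold a single shared model; otherwise there is
# one entry per arm.
import tensorflow as tf

encoding_dim = 16
num_models = 1  # per-arm features: one shared model
cov_matrix = [tf.Variable(tf.zeros([encoding_dim, encoding_dim]))
              for _ in range(num_models)]
data_vector = [tf.Variable(tf.zeros([encoding_dim]))
               for _ in range(num_models)]
num_samples = [tf.Variable(tf.zeros([]))
               for _ in range(num_models)]
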
 def setUp(self):
     super(BehavioralCloningAgentTest, self).setUp()
     self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = [tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)]
     self._observation_spec = self._time_step_spec.observation
 def __init__(self):
   info_spec = {"test": tensor_spec.BoundedTensorSpec([1], tf.int64, 0, 1)}
   super(TfDictInfoAndLogProbs, self).__init__(info_spec=info_spec)
# Example 17
    def testTrainWithSparseTensorAndDenseFeaturesLayer(self, agent_class):
        obs_spec = {
            'dense':
            tensor_spec.BoundedTensorSpec(dtype=tf.float32,
                                          shape=[3],
                                          minimum=-10.0,
                                          maximum=10.0),
            'sparse_terms':
            tf.SparseTensorSpec(dtype=tf.string, shape=[4]),
            'sparse_frequencies':
            tf.SparseTensorSpec(dtype=tf.float32, shape=[4]),
        }
        cat_column = (
            tf.compat.v2.feature_column.categorical_column_with_hash_bucket(
                'sparse_terms', hash_bucket_size=5))
        weighted_cat_column = (
            tf.compat.v2.feature_column.weighted_categorical_column(
                cat_column, weight_feature_key='sparse_frequencies'))
        feature_columns = [
            tf.compat.v2.feature_column.numeric_column('dense', [3]),
            tf.compat.v2.feature_column.embedding_column(
                weighted_cat_column, 3),
        ]
        dense_features_layer = tf.compat.v2.keras.layers.DenseFeatures(
            feature_columns)
        time_step_spec = ts.time_step_spec(obs_spec)
        q_net = q_network.QNetwork(time_step_spec.observation,
                                   self._action_spec,
                                   preprocessing_combiner=dense_features_layer)
        agent = agent_class(time_step_spec,
                            self._action_spec,
                            q_network=q_net,
                            optimizer=tf.compat.v1.train.AdamOptimizer())

        observations = tensor_spec.sample_spec_nest(obs_spec,
                                                    outer_dims=[5, 2])
        # sparse_terms and sparse_frequencies must be defined on matching indices.
        observations['sparse_terms'] = tf.SparseTensor(
            indices=observations['sparse_frequencies'].indices,
            values=tf.as_string(
                tf.math.round(observations['sparse_frequencies'].values)),
            dense_shape=observations['sparse_frequencies'].dense_shape)
        if not tf.executing_eagerly():
            # Mimic unknown inner dims on the SparseTensor
            def _unknown_inner_shape(t):
                if not isinstance(t, tf.SparseTensor):
                    return t
                return tf.SparseTensor(
                    indices=t.indices,
                    values=t.values,
                    dense_shape=tf.compat.v1.placeholder_with_default(
                        t.dense_shape, shape=t.dense_shape.shape))

            observations = tf.nest.map_structure(_unknown_inner_shape,
                                                 observations)
            self.assertIsNone(
                tf.get_static_value(observations['sparse_terms'].dense_shape))

        time_step = ts.restart(observations, batch_size=[5, 2])
        action_step = tensor_spec.sample_spec_nest(self._action_spec,
                                                   outer_dims=[5, 2])
        p_step = policy_step.PolicyStep(action=action_step, state=(), info=())
        traj = trajectory.from_transition(time_step, p_step, time_step)
        loss_info = agent.train(traj)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = self.evaluate(loss_info)
        self.assertGreater(loss_info.loss, 0)
 def __init__(self):
   observation_spec = tensor_spec.TensorSpec([2, 2], tf.float32)
   time_step_spec = ts.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
   super(TFPolicyMismatchedDtypes, self).__init__(time_step_spec, action_spec)
# Example 19
 def testBoundedTensorSpecSample(self, dtype):
     spec = tensor_spec.BoundedTensorSpec((2, 3), dtype, 2, 7)
     sample = tensor_spec.sample_spec_nest(spec)
     sample_ = self.evaluate(sample)
     self.assertTrue(np.all(sample_ >= 2))
     self.assertTrue(np.all(sample_ <= 7))
    def testPerArmRewardsSparseObs(self):
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        objective_networks = []
        for _ in range(3):
            objective_networks.append(
                global_and_arm_feature_network.
                create_feed_forward_common_tower_network(
                    observation_spec=obs_spec,
                    global_layers=(4, 3, 2),
                    arm_layers=(6, 5, 4),
                    common_layers=(7, 6, 5),
                    global_preprocessing_combiner=(
                        tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
                    arm_preprocessing_combiner=tf.compat.v2.keras.layers.
                    DenseFeatures([columns_a, columns_b])))
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            action_spec,
            self._scalarizer,
            objective_networks,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }

        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action, p_info, first_arm_name_feature = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])
# Example 21
def create_ppo_agent_and_dataset_fn(action_spec, time_step_spec, train_step,
                                    batch_size):
    """Builds and returns a dummy PPO Agent, dataset and dataset function."""
    del action_spec  # Unused.
    del time_step_spec  # Unused.
    del batch_size  # Unused.

    # Arbitrary specs are not supported; this helper uses fixed dummy specs.
    obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    ts_spec = ts.time_step_spec(obs_spec)
    act_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        obs_spec,
        act_spec,
        fc_layer_params=(100, ),
        activation_fn=tf.keras.activations.tanh)

    value_net = value_network.ValueNetwork(
        obs_spec,
        fc_layer_params=(100, ),
        activation_fn=tf.keras.activations.tanh)

    agent = ppo_clip_agent.PPOClipAgent(
        ts_spec,
        act_spec,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=False,
        use_td_lambda_return=False,
        num_epochs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        train_step_counter=train_step,
        compute_value_and_advantage_in_train=False)

    def _create_experience(_):
        observations = tf.constant([
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
                                   dtype=tf.float32)
        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(step_type=tf.constant(
            [[mid_time_step_val] * 3] * 2, dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * 2,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * 2,
                                                      dtype=tf.float32),
                                 observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                  dtype=tf.float32)

        policy_info = {
            'dist_params': action_distribution_parameters,
        }
        policy_info['value_prediction'] = value_preds
        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)
        return agent._preprocess(experience)  # pylint: disable=protected-access

    dataset = tf.data.Dataset.from_tensor_slices(
        [[i] for i in range(100)]).map(_create_experience)
    dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))
    dataset_fn = lambda: dataset

    return agent, dataset, dataset_fn, agent.training_data_spec
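
# A short, hypothetical usage sketch: the helper ignores the spec and batch-size
# arguments, so placeholders are fine, and the returned dataset yields
# (preprocessed experience, counter) pairs ready for a PPO training loop.
import tensorflow as tf

train_step = tf.Variable(0, dtype=tf.int64)
agent, dataset, dataset_fn, training_data_spec = create_ppo_agent_and_dataset_fn(
    action_spec=None, time_step_spec=None, train_step=train_step,
    batch_size=None)
experience, sample_count = next(iter(dataset_fn()))
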
    def testLinearAgentUpdatePerArmFeatures(self,
                                            batch_size,
                                            context_dim,
                                            exploration_policy,
                                            dtype,
                                            use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        global_context_dim = context_dim
        arm_context_dim = 3
        initial_step, final_step = (
            _get_initial_and_final_steps_with_per_arm_features(
                batch_size,
                global_context_dim,
                num_actions,
                arm_context_dim,
                num_actions_feature=True))
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = policy_step.PolicyStep(
            action=tf.convert_to_tensor(action),
            info=policy_utilities.PerArmPolicyInfo(
                chosen_arm_features=np.arange(
                    batch_size * arm_context_dim, dtype=np.float32).reshape(
                        [batch_size, arm_context_dim])))
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
            context_dim,
            arm_context_dim,
            num_actions,
            add_num_actions_feature=True)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            use_eigendecomp=use_eigendecomp,
            accepts_per_arm_features=True,
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        global_observation = experience.observation[
            bandit_spec_utils.GLOBAL_FEATURE_KEY]
        arm_observation = experience.policy_info.chosen_arm_features
        overall_observation = tf.squeeze(tf.concat(
            [global_observation, arm_observation], axis=-1),
                                         axis=1)
        rewards = tf.squeeze(experience.reward, axis=1)

        expected_a_new = tf.matmul(overall_observation,
                                   overall_observation,
                                   transpose_a=True)
        expected_b_new = bandit_utils.sum_reward_weighted_observations(
            rewards, overall_observation)
        self.assertAllClose(expected_a_new, final_a[0])
        self.assertAllClose(expected_b_new, final_b[0])
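
The expected statistics above reduce to a Gram matrix over the concatenated (global, chosen-arm) features plus a reward-weighted feature sum. A small NumPy sketch of that arithmetic, assuming `sum_reward_weighted_observations(r, X)` computes sum_i r_i * X_i (illustrative only, toy numbers):

import numpy as np

# Two transitions, global_context_dim=2 and arm_context_dim=3 -> 5 concatenated features.
X = np.array([[1., 2., 0., 1., 2.],
              [3., 4., 3., 2., 1.]], dtype=np.float32)
r = np.array([0.5, 3.0], dtype=np.float32)

expected_a = X.T @ X                       # analogue of `expected_a_new`
expected_b = (r[:, None] * X).sum(axis=0)  # analogue of `expected_b_new`
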
    def testLinearAgentUpdateWithBias(self,
                                      batch_size,
                                      context_dim,
                                      exploration_policy,
                                      dtype,
                                      use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        variable_collection = linear_agent.LinearBanditVariableCollection(
            context_dim + 1, num_actions, use_eigendecomp, dtype)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            variable_collection=variable_collection,
            use_eigendecomp=use_eigendecomp,
            add_bias=True,
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)
        final_theta = self.evaluate(agent.theta)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            observations_for_arm = tf.concat([
                observations_for_arm,
                tf.ones_like(observations_for_arm[:, 0:1])
            ],
                                             axis=1)
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim + 1,
                                 context_dim + 1]), tf.zeros([context_dim + 1])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            theta_new = tf.squeeze(tf.linalg.solve(
                a_new + tf.eye(context_dim + 1), tf.expand_dims(b_new,
                                                                axis=-1)),
                                   axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
        self.assertAllClose(self.evaluate(
            tf.stack(expected_theta_updated_list)),
                            final_theta,
                            atol=0.1,
                            rtol=0.05)
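
In the bias variant, every observation is augmented with a constant 1 feature before the per-arm statistics are formed, and the parameter estimate solves a ridge-regularized linear system. A hedged NumPy sketch of that computation on toy numbers (not the agent's internals):

import numpy as np

X = np.array([[1., 2.],
              [3., 4.]], dtype=np.float32)   # [num_samples, context_dim]
r = np.array([0.5, 3.0], dtype=np.float32)

# Append a bias column of ones, as the test does with `tf.ones_like`.
X_aug = np.hstack([X, np.ones((X.shape[0], 1), dtype=np.float32)])
a_new = X_aug.T @ X_aug
b_new = (r[:, None] * X_aug).sum(axis=0)
# Mirrors the `tf.linalg.solve(a_new + tf.eye(context_dim + 1), b_new)` step above.
theta_new = np.linalg.solve(a_new + np.eye(X_aug.shape[1], dtype=np.float32), b_new)
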
Example #24
 def testTrainPerArmAgent(self):
     num_actions = 5
     mask_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                               shape=(num_actions, ),
                                               minimum=0,
                                               maximum=1)
     obs_spec = (bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, num_actions), mask_spec)
     time_step_spec = time_step.time_step_spec(obs_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     encoding_dim = 10
     encoder = (global_and_arm_feature_network.
                create_feed_forward_common_tower_network(
                    obs_spec[0], (4, 3), (3, 4), (4, 2), encoding_dim))
     agent = neural_linucb_agent.NeuralLinUCBAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         encoding_network=encoder,
         encoding_network_num_train_steps=10,
         encoding_dim=encoding_dim,
         observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
         accepts_per_arm_features=True,
         optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
     observations = ({
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(30), shape=[2, 5, 3]),
                 dtype=tf.float32)
     }, tf.ones(shape=(2, num_actions), dtype=tf.int32))
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step = time_step.TimeStep(
         tf.constant(time_step.StepType.FIRST,
                     dtype=tf.int32,
                     shape=[2],
                     name='step_type'),
         tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
         tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
         observations)
     final_step = time_step.TimeStep(
         tf.constant(time_step.StepType.LAST,
                     dtype=tf.int32,
                     shape=[2],
                     name='step_type'),
         tf.constant(rewards, dtype=tf.float32, name='reward'),
         tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
         observations)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     loss_info, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     loss_value = self.evaluate(loss_info)
     self.assertGreater(loss_value, 0.0)
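
The `observation_and_action_constraint_splitter` passed above is simply a tuple splitter: it separates the per-arm observation dictionary from the action mask before they reach the policy. A trivial standalone illustration (the dictionary contents are made up):

splitter = lambda x: (x[0], x[1])
obs_and_mask = ({'global': [1.0, 2.0]}, [1, 1, 0, 1, 1])
observation, mask = splitter(obs_and_mask)  # observation dict, per-action mask
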
    def testLinearAgentUpdateWithForgetting(self,
                                            batch_size,
                                            context_dim,
                                            exploration_policy,
                                            dtype,
                                            use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""
        # We should rewrite this test as it currently does not depend on
        # the value of `gamma`. To properly test the forgetting factor, we need to
        # call `train` twice.
        gamma = 0.9

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            gamma=gamma,
            dtype=dtype,
            use_eigendecomp=use_eigendecomp)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)
        final_eig_vals = self.evaluate(agent.eig_vals)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_eigvals_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                eigmatrix_new = tf.constant([], dtype=dtype)
                eigvals_new = tf.constant([], dtype=dtype)
                if use_eigendecomp:
                    eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
                return a_new, b_new, eigvals_new, eigmatrix_new

            def false_fn():
                if use_eigendecomp:
                    return (tf.zeros([context_dim,
                                      context_dim]), tf.zeros([context_dim]),
                            tf.ones([context_dim]), tf.eye(context_dim))
                else:
                    return (tf.zeros([context_dim,
                                      context_dim]), tf.zeros([context_dim]),
                            tf.constant([], dtype=dtype),
                            tf.constant([], dtype=dtype))

            a_new, b_new, eig_vals_new, _ = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_eigvals_updated_list.append(self.evaluate(eig_vals_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
        self.assertAllClose(expected_eigvals_updated_list,
                            final_eig_vals,
                            atol=1e-4,
                            rtol=1e-4)
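
As the in-test comment notes, `gamma` only matters once `train` is called a second time; the discounted accumulation it would then exercise looks roughly like the NumPy sketch below (an assumed form of the forgetting-factor update, toy numbers only):

import numpy as np

gamma = 0.9
X1 = np.array([[1., 0.], [0., 1.]], dtype=np.float32)  # first batch of contexts for one arm
X2 = np.array([[2., 1.]], dtype=np.float32)            # second batch for the same arm
r1 = np.array([1.0, 2.0], dtype=np.float32)
r2 = np.array([0.5], dtype=np.float32)

a_1 = X1.T @ X1
b_1 = (r1[:, None] * X1).sum(axis=0)
# With a forgetting factor, the second update discounts the previously
# accumulated statistics before adding the new batch.
a_2 = gamma * a_1 + X2.T @ X2
b_2 = gamma * b_1 + (r2[:, None] * X2).sum(axis=0)
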
Example #26
 def setUp(self):
     super(SacAgentTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1,
                                                       1)
Example #27
 def testInvalidMaximum(self):
     with self.assertRaisesRegexp(ValueError, "not compatible"):
         tensor_spec.BoundedTensorSpec((3, 5), tf.uint8, 0, (1, 1, 1))
Example #28
    def testMixturePolicyDynamicBatchSize(self):
        context_dim = 35
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(shape=(),
                                                    dtype=tf.int32,
                                                    minimum=0,
                                                    maximum=9,
                                                    name='action')
        sub_policies = [
            ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
        ]
        weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
        dist = tfd.Categorical(probs=weights)

        policy = mixture_policy.MixturePolicy(dist, sub_policies)
        batch_size = tf.random.uniform(shape=(),
                                       minval=10,
                                       maxval=15,
                                       dtype=tf.int32)
        time_step = ts.TimeStep(
            tf.fill(tf.expand_dims(batch_size, axis=0),
                    ts.StepType.FIRST,
                    name='step_type'),
            tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
            tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
            tf.reshape(tf.range(tf.cast(batch_size * context_dim,
                                        dtype=tf.float32),
                                dtype=tf.float32),
                       shape=[-1, context_dim],
                       name='observation'))
        action_step = policy.action(time_step)
        actions, bsize = self.evaluate([action_step.action, batch_size])
        self.assertAllEqual(actions.shape, [bsize])
        self.assertAllInSet(actions, [2, 5, 8])

        train_step = tf.compat.v1.train.get_or_create_global_step()
        saver = policy_saver.PolicySaver(policy, train_step=train_step)
        location = os.path.join(self.get_temp_dir(), 'saved_policy')
        if not tf.executing_eagerly():
            with self.cached_session():
                self.evaluate(tf.compat.v1.global_variables_initializer())
                saver.save(location)
        else:
            saver.save(location)
        loaded_policy = tf.compat.v2.saved_model.load(location)
        new_batch_size = 3
        new_time_step = ts.TimeStep(
            tf.fill(tf.expand_dims(new_batch_size, axis=0),
                    ts.StepType.FIRST,
                    name='step_type'),
            tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
            tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
            tf.reshape(tf.range(tf.cast(new_batch_size * context_dim,
                                        dtype=tf.float32),
                                dtype=tf.float32),
                       shape=[-1, context_dim],
                       name='observation'))
        new_action = self.evaluate(loaded_policy.action(new_time_step).action)
        self.assertAllEqual(new_action.shape, [new_batch_size])
        self.assertAllInSet(new_action, [2, 5, 8])
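
The mixture weights above put all probability mass on sub-policies 2, 5 and 8, which is why both `assertAllInSet` checks only allow those constants. A standalone sketch of that sampling behaviour with tensorflow_probability:

import tensorflow_probability as tfp

tfd = tfp.distributions
dist = tfd.Categorical(probs=[0., 0., 0.2, 0., 0., 0.3, 0., 0., 0.5, 0.])
# Every sampled sub-policy index should be 2, 5 or 8.
samples = dist.sample(1000)
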
Example #29
 def testUint8IncludeMaxOfDtype(self):
     spec = tensor_spec.BoundedTensorSpec((2, 3), tf.uint8, 255, 255)
     sample = tensor_spec.sample_spec_nest(spec)
     sample_ = self.evaluate(sample)
     self.assertTrue(np.all(sample_ == 255))
Example #30
 def setUp(self):
     super(GreedyRewardPredictionPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)