Code example #1
    def testNumActionsFromTensorSpecWrongRank(self):
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(2, 3),
                                                    minimum=0,
                                                    maximum=15)

        with self.assertRaisesRegexp(ValueError,
                                     r'Action spec must be a scalar'):
            utils.get_num_actions_from_tensor_spec(action_spec)
Code example #2
    def __init__(self, time_step_spec, action_spec, learning_rate, name=None):
        """Initialize an instance of `Exp3Agent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      learning_rate: A float valued scalar. A higher value will force the agent
        to converge on a single action more quickly. A lower value will
        encourage more exploration. This value corresponds to the
        `inverse_temperature` argument passed to `CategoricalPolicy`.
      name: a name for this instance of `Exp3Agent`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._weights = tf.compat.v2.Variable(tf.zeros(self._num_actions),
                                              name='weights')
        self._learning_rate = tf.compat.v2.Variable(learning_rate,
                                                    name='learning_rate')
        policy = categorical_policy.CategoricalPolicy(
            weights=self._weights,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            inverse_temperature=self._learning_rate)
        # TODO(b/127462472): consider policy=GreedyPolicy(collect_policy).
        super(Exp3Agent, self).__init__(time_step_spec=time_step_spec,
                                        action_spec=policy.action_spec,
                                        policy=policy,
                                        collect_policy=policy,
                                        train_sequence_length=None)
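A hedged construction sketch for `Exp3Agent`, assuming the standard TF-Agents module paths (`tf_agents.bandits.agents.exp3_agent`, `tf_agents.specs.tensor_spec`, `tf_agents.trajectories.time_step`); the spec values are illustrative:

    import tensorflow as tf
    from tf_agents.bandits.agents import exp3_agent
    from tf_agents.specs import tensor_spec
    from tf_agents.trajectories import time_step as ts

    # Exp3 keeps one weight per action; the observation spec here is illustrative.
    observation_spec = tensor_spec.TensorSpec([1], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)
    # A scalar bounded int32 spec describing 16 actions (minimum=0, maximum=15).
    action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                shape=(),
                                                minimum=0,
                                                maximum=15)

    # A higher learning_rate converges on a single action faster; a lower one
    # explores more, per the docstring above.
    agent = exp3_agent.Exp3Agent(time_step_spec=time_step_spec,
                                 action_spec=action_spec,
                                 learning_rate=1.0)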
Code example #3
    def testNumActionsFromTensorSpecGoodSpec(self):
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=15)
        num_actions = utils.get_num_actions_from_tensor_spec(action_spec)
        self.assertEqual(num_actions, 16)
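Taken together, code examples #1 and #3 pin down the behavior exercised here: the utility rejects non-scalar specs and otherwise returns `maximum - minimum + 1`. A minimal sketch of that behavior (an illustration consistent with these tests, not the actual TF-Agents implementation):

    def get_num_actions_from_tensor_spec(action_spec):
        """Returns the number of actions described by a scalar bounded spec."""
        if action_spec.shape.rank != 0:
            # Matches the error asserted in code example #1.
            raise ValueError('Action spec must be a scalar, but saw shape: '
                             '{}'.format(action_spec.shape))
        # minimum=0, maximum=15 yields 16, as asserted in code example #3.
        return int(action_spec.maximum) - int(action_spec.minimum) + 1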
Code example #4
File: constraints.py Project: yangjue-han/agents
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 constraint_network,
                 error_loss_fn=tf.compat.v1.losses.mean_squared_error,
                 name='NeuralConstraint'):
        """Creates a trainable constraint using a neural network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      constraint_network: An instance of `tf_agents.network.Network` used to
        provide estimates of action feasibility. The input structure should be
        consistent with the `observation_spec`.
      error_loss_fn: A function for computing the loss used to train the
        constraint network. The default is `tf.losses.mean_squared_error`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.
    """
        super(NeuralConstraint, self).__init__(time_step_spec, action_spec,
                                               name)

        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)

        with self.name_scope:
            constraint_network.create_variables()
        self._constraint_network = constraint_network
        self._error_loss_fn = error_loss_fn
Code example #5
    def testLaplacian1D(self):
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=4)
        num_actions = utils.get_num_actions_from_tensor_spec(action_spec)
        laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
            action_spec)
        res = tf.matmul(laplacian_matrix,
                        tf.ones([num_actions, 1], dtype=tf.float32))
        # The vector of ones is in the null space of the Laplacian matrix.
        self.assertAllClose(0.0, self.evaluate(tf.norm(res)))

        # The row sum is zero.
        row_sum = tf.reduce_sum(laplacian_matrix, 1)
        self.assertAllClose(0.0, self.evaluate(tf.norm(row_sum)))

        # The column sum is zero.
        column_sum = tf.reduce_sum(laplacian_matrix, 0)
        self.assertAllClose(0.0, self.evaluate(tf.norm(column_sum)))

        # The diagonal elements are 2.0.
        self.assertAllClose(2.0, laplacian_matrix[1, 1])

        laplacian_matrix_expected = np.array([[1.0, -1.0, 0.0, 0.0, 0.0],
                                              [-1.0, 2.0, -1.0, 0.0, 0.0],
                                              [0.0, -1.0, 2.0, -1.0, 0.0],
                                              [0.0, 0.0, -1.0, 2.0, -1.0],
                                              [0.0, 0.0, 0.0, -1.0, 1.0]])
        self.assertAllClose(laplacian_matrix_expected,
                            self.evaluate(laplacian_matrix))
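The expected matrix in this test is the Laplacian `L = D - A` of a path graph over the five ordinal actions: degree 1 at the endpoints, degree 2 in the interior, and -1 between neighboring actions. A hedged NumPy sketch of that construction (an illustration of the structure the test checks, not the library implementation of `build_laplacian_over_ordinal_integer_actions`):

    import numpy as np

    def path_graph_laplacian(num_actions):
        """Laplacian of a path graph over ordinal integer actions."""
        adjacency = np.zeros((num_actions, num_actions), dtype=np.float32)
        for i in range(num_actions - 1):
            # Neighboring ordinal actions i and i + 1 are connected.
            adjacency[i, i + 1] = 1.0
            adjacency[i + 1, i] = 1.0
        degree = np.diag(adjacency.sum(axis=1))
        # Rows and columns of D - A sum to zero, as the test asserts.
        return degree - adjacency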
Code example #6
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 gamma=1.0,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearThompsonSamplingAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearThompsonSamplingAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')

        self._weight_covariances = []
        self._parameter_estimators = []
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')

        for k in range(self._num_actions):
            self._weight_covariances.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._parameter_estimators.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))

        policy = ts_policy.LinearThompsonSamplingPolicy(
            action_spec, self._weight_covariances, self._parameter_estimators)
        super(LinearThompsonSamplingAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=policy.action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=None)
Code example #7
    def __init__(self, original_environment, action_constraint_join_fn,
                 action_probability):
        """Initializes a `BernoulliActionMaskTFEnvironment`.

    Args:
      original_environment: Instance of `BanditTFEnvironment`. This environment
        will be wrapped.
      action_constraint_join_fn: A function that joins the observation from the
        original environment with the generated masks.
      action_probability: The probability that any action in the action space is
        allowed by the generated mask.
    """
        self._original_environment = original_environment
        assert isinstance(
            original_environment, bandit_tf_environment.BanditTFEnvironment
        ), 'The wrapped environment needs to be a `BanditTFEnvironment`.'
        self._action_constraint_join_fn = action_constraint_join_fn
        self._action_probability = action_probability
        self._batch_size = self._original_environment.batch_size
        action_spec = self._original_environment.action_spec()
        observation_spec_without_mask = (
            self._original_environment.time_step_spec().observation)
        self._num_actions = agent_utils.get_num_actions_from_tensor_spec(
            action_spec)

        mask_spec = tf.TensorSpec([self._num_actions], dtype=tf.int32)
        joined_observation_spec = self._action_constraint_join_fn(
            observation_spec_without_mask, mask_spec)
        time_step_spec = ts.time_step_spec(joined_observation_spec)

        self._current_mask = tf.compat.v2.Variable(
            tf.ones([self.batch_size, self._num_actions], dtype=tf.int32))

        super(BernoulliActionMaskTFEnvironment,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             batch_size=self._batch_size)
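The `action_constraint_join_fn` above is applied both to batched tensors at step time and to specs at construction time, so a join that simply nests its two inputs is sufficient. A hedged sketch (the tuple layout is an assumption; any nest the downstream agent expects would work):

    def observation_and_mask_join_fn(observation, mask):
        # Works on Tensors and on TensorSpecs alike, since it only builds a
        # nest out of its inputs.
        return (observation, mask)

The wrapper would then be built as, e.g., `BernoulliActionMaskTFEnvironment(original_env, observation_and_mask_join_fn, action_probability=0.9)` (the probability value is illustrative).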
Code example #8
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 alpha=1.0,
                 gamma=1.0,
                 use_eigendecomp=False,
                 tikhonov_weight=1.0,
                 emit_log_probability=False,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) tikhonov regularization term.
      emit_log_probability: Whether the LinearUCBPolicy emits log-probabilities
        or not. Since the policy is deterministic, the probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        self._eig_matrix_list = []
        self._eig_vals_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight

        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))
            if self._use_eigendecomp:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.eye(self._context_dim,
                                                 dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.ones([self._context_dim],
                                                  dtype=dtype),
                                          name='eig_vals' + str(k)))
            else:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_vals' + str(k)))

        policy = lin_ucb_policy.LinearUCBPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            emit_log_probability=emit_log_probability)
        super(LinearUCBAgent, self).__init__(time_step_spec=time_step_spec,
                                             action_spec=action_spec,
                                             policy=policy,
                                             collect_policy=policy,
                                             train_sequence_length=None)
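A hedged construction sketch for `LinearUCBAgent`, assuming the standard TF-Agents module paths (`tf_agents.bandits.agents.lin_ucb_agent`, `tf_agents.specs.tensor_spec`, `tf_agents.trajectories.time_step`); the context dimension and arm count are illustrative:

    import tensorflow as tf
    from tf_agents.bandits.agents import lin_ucb_agent
    from tf_agents.specs import tensor_spec
    from tf_agents.trajectories import time_step as ts

    # A 7-dimensional context and 5 arms.
    observation_spec = tensor_spec.TensorSpec([7], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                shape=(),
                                                minimum=0,
                                                maximum=4)

    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        alpha=1.0,   # exploration coefficient on the confidence intervals
        gamma=1.0,   # 1.0 means no forgetting
        dtype=tf.float32)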
Code example #9
    def __init__(self,
                 exploration_policy,
                 time_step_spec,
                 action_spec,
                 variable_collection=None,
                 alpha=1.0,
                 gamma=1.0,
                 use_eigendecomp=False,
                 tikhonov_weight=1.0,
                 add_bias=False,
                 emit_policy_info=(),
                 emit_log_probability=False,
                 observation_and_action_constraint_splitter=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 enable_summaries=True,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearBanditAgent`.

    Args:
      exploration_policy: An Enum of type `ExplorationPolicy`. The kind of
        policy we use for exploration. Currently supported policies are
        `LinUCBPolicy` and `LinearThompsonSamplingPolicy`.
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      variable_collection: Instance of `LinearBanditVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `LinearBanditVariableCollection` will be created.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
        algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the policy emits log-probabilities or not.
        Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearBanditAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
      TypeError if variable_collection is not an instance of
        `LinearBanditVariableCollection`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        if observation_and_action_constraint_splitter is not None:
            context_shape = observation_and_action_constraint_splitter(
                time_step_spec.observation)[0].shape.as_list()
        else:
            context_shape = time_step_spec.observation.shape.as_list()
        self._add_bias = add_bias
        self._context_dim = (tf.compat.dimension_value(context_shape[0])
                             if context_shape else 1)
        if self._add_bias:
            # The bias is added via a constant 1 feature.
            self._context_dim += 1
        self._alpha = alpha
        if variable_collection is None:
            variable_collection = LinearBanditVariableCollection(
                context_dim=self._context_dim,
                num_actions=self._num_actions,
                use_eigendecomp=use_eigendecomp,
                dtype=dtype)
        elif not isinstance(variable_collection,
                            LinearBanditVariableCollection):
            raise TypeError('Parameter `variable_collection` should be '
                            'of type `LinearBanditVariableCollection`.')
        self._variable_collection = variable_collection
        self._cov_matrix_list = variable_collection.cov_matrix_list
        self._data_vector_list = variable_collection.data_vector_list
        self._eig_matrix_list = variable_collection.eig_matrix_list
        self._eig_vals_list = variable_collection.eig_vals_list
        # We keep track of the number of samples per arm.
        self._num_samples_list = variable_collection.num_samples_list
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)

        if exploration_policy == ExplorationPolicy.linear_ucb_policy:
            exploration_strategy = lin_policy.ExplorationStrategy.optimistic
        elif exploration_policy == (
                ExplorationPolicy.linear_thompson_sampling_policy):
            exploration_strategy = lin_policy.ExplorationStrategy.sampling
        else:
            raise ValueError(
                'Linear bandit agent with policy %s not implemented' %
                exploration_policy)
        policy = lin_policy.LinearBanditPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            exploration_strategy=exploration_strategy,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            add_bias=add_bias,
            emit_policy_info=emit_policy_info,
            emit_log_probability=emit_log_probability,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter))
        super(LinearBanditAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy=policy,
                             collect_policy=policy,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_sequence_length=None)
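The `observation_and_action_constraint_splitter` documented above is the counterpart of a join like the one sketched for code example #7: it receives the full observation, separates the agent's input from the boolean mask, and must also accept specs. A hedged sketch for observations stored as a `(context, mask)` tuple (the tuple layout is an assumption):

    def observation_and_action_constraint_splitter(observation):
        # `observation` may be a nest of Tensors or of TensorSpecs; tuple
        # unpacking behaves identically in both cases.
        context, mask = observation
        return context, mask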
Code example #10
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 alpha=1.0,
                 gamma=1.0,
                 use_eigendecomp=False,
                 tikhonov_weight=1.0,
                 add_bias=False,
                 emit_policy_info=(),
                 emit_log_probability=False,
                 observation_and_action_constraint_splitter=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 enable_summaries=True,
                 dtype=tf.float32,
                 name=None):
        """Initialize an instance of `LinearUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the LinearUCBPolicy emits log-probabilities
        or not. Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearUCBAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        if observation_and_action_constraint_splitter is not None:
            context_shape = observation_and_action_constraint_splitter(
                time_step_spec.observation)[0].shape.as_list()
        else:
            context_shape = time_step_spec.observation.shape.as_list()
        self._add_bias = add_bias
        self._context_dim = (tf.compat.dimension_value(context_shape[0])
                             if context_shape else 1)
        if self._add_bias:
            # The bias is added via a constant 1 feature.
            self._context_dim += 1
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        self._eig_matrix_list = []
        self._eig_vals_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)

        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._context_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._context_dim, dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))
            if self._use_eigendecomp:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.eye(self._context_dim,
                                                 dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.ones([self._context_dim],
                                                  dtype=dtype),
                                          name='eig_vals' + str(k)))
            else:
                self._eig_matrix_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_matrix' + str(k)))
                self._eig_vals_list.append(
                    tf.compat.v2.Variable(tf.constant([], dtype=dtype),
                                          name='eig_vals' + str(k)))

        policy = lin_ucb_policy.LinearUCBPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            add_bias=add_bias,
            emit_policy_info=emit_policy_info,
            emit_log_probability=emit_log_probability,
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter)
        super(LinearUCBAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy=policy,
                             collect_policy=policy,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_sequence_length=None)
Code example #11
  def __init__(
      self,
      time_step_spec,
      action_spec,
      encoding_network,
      encoding_network_num_train_steps,
      encoding_dim,
      optimizer,
      variable_collection=None,
      alpha=1.0,
      gamma=1.0,
      epsilon_greedy=0.0,
      observation_and_action_constraint_splitter=None,
      # Params for training.
      error_loss_fn=tf.compat.v1.losses.mean_squared_error,
      gradient_clipping=None,
      # Params for debugging.
      debug_summaries=False,
      summarize_grads_and_vars=False,
      train_step_counter=None,
      emit_policy_info=(),
      emit_log_probability=False,
      dtype=tf.float64,
      name=None):
    """Initialize an instance of `NeuralLinUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      encoding_network: a Keras network that encodes the observations.
      encoding_network_num_train_steps: how many training steps to run for
        training the encoding network before switching to LinUCB. If negative,
        the encoding network is assumed to be already trained.
      encoding_dim: the dimension of encoded observations.
      optimizer: The optimizer to use for training.
      variable_collection: Instance of `NeuralLinUCBVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `LinearBanditVariables` will be created. Note that this
        collection excludes the variables owned by the encoding network.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      epsilon_greedy: A float representing the probability of choosing a random
        action instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the NeuralLinUCBPolicy emits
        log-probabilities or not. Since the policy is deterministic, the
        probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
      name: a name for this instance of `NeuralLinUCBAgent`.

    Raises:
      TypeError if variable_collection is not an instance of
        `NeuralLinUCBVariableCollection`.
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    if observation_and_action_constraint_splitter is not None:
      context_shape = observation_and_action_constraint_splitter(
          time_step_spec.observation)[0].shape.as_list()
    else:
      context_shape = time_step_spec.observation.shape.as_list()
    self._context_dim = (
        tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
    self._alpha = alpha
    if variable_collection is None:
      variable_collection = NeuralLinUCBVariableCollection(
          self._num_actions, encoding_dim, dtype)
    elif not isinstance(variable_collection, NeuralLinUCBVariableCollection):
      raise TypeError('Parameter `variable_collection` should be '
                      'of type `NeuralLinUCBVariableCollection`.')
    self._variable_collection = variable_collection
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')
    self._epsilon_greedy = epsilon_greedy

    reward_layer = tf.keras.layers.Dense(
        self._num_actions,
        kernel_initializer=tf.compat.v1.initializers.random_uniform(
            minval=-0.03, maxval=0.03),
        use_bias=False,
        activation=None,
        name='reward_layer')

    self._encoding_network = encoding_network
    self._reward_layer = reward_layer
    self._encoding_network_num_train_steps = encoding_network_num_train_steps
    self._encoding_dim = encoding_dim
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        encoding_network=self._encoding_network,
        encoding_dim=self._encoding_dim,
        reward_layer=self._reward_layer,
        epsilon_greedy=self._epsilon_greedy,
        actions_from_reward_layer=self.actions_from_reward_layer,
        cov_matrix=self.cov_matrix,
        data_vector=self.data_vector,
        num_samples=self.num_samples,
        time_step_spec=time_step_spec,
        alpha=alpha,
        emit_policy_info=emit_policy_info,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    super(NeuralLinUCBAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
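The `encoding_network` argument is a network that maps observations to an `encoding_dim`-dimensional feature vector consumed by the LinUCB head. A hedged sketch using `tf_agents.networks.encoding_network.EncodingNetwork` (the layer sizes and the choice of this particular network class are assumptions; any Keras network with a matching output size should fit the documented contract):

    import tensorflow as tf
    from tf_agents.networks import encoding_network
    from tf_agents.specs import tensor_spec

    observation_spec = tensor_spec.TensorSpec([7], tf.float32)
    encoding_dim = 16  # must match the `encoding_dim` passed to the agent
    # The final fully connected layer produces the encoded observation.
    enc_net = encoding_network.EncodingNetwork(
        input_tensor_spec=observation_spec,
        fc_layer_params=(32, encoding_dim))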
Code example #12
File: neural_linucb_agent.py Project: xxyy1/agents
    def __init__(
            self,
            time_step_spec,
            action_spec,
            encoding_network,
            encoding_network_num_train_steps,
            encoding_dim,
            optimizer,
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=0.0,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            emit_log_probability=False,
            dtype=tf.float64,
            name=None):
        """Initialize an instance of `NeuralLinUCBAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      encoding_network: a Keras network that encodes the observations.
      encoding_network_num_train_steps: how many training steps to run for
        training the encoding network before switching to LinUCB. If negative,
        the encoding network is assumed to be already trained.
      encoding_dim: the dimension of encoded observations.
      optimizer: The optimizer to use for training.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      epsilon_greedy: A float representing the probability of choosing a random
        action instead of the greedy action.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      emit_log_probability: Whether the NeuralLinUCBPolicy emits
        log-probabilities or not. Since the policy is deterministic, the
        probability is just 1.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float64`.
      name: a name for this instance of `NeuralLinUCBAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)
        self._context_dim = int(time_step_spec.observation.shape[0])
        self._alpha = alpha
        self._cov_matrix_list = []
        self._data_vector_list = []
        # We keep track of the number of samples per arm.
        self._num_samples_list = []
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._epsilon_greedy = epsilon_greedy

        reward_layer = tf.keras.layers.Dense(
            self._num_actions,
            kernel_initializer=tf.compat.v1.initializers.random_uniform(
                minval=-0.03, maxval=0.03),
            bias_initializer=tf.compat.v1.initializers.constant(-0.2),
            activation=None,
            name='reward_layer')

        self._encoding_network = encoding_network
        self._reward_layer = reward_layer
        self._encoding_network_num_train_steps = encoding_network_num_train_steps
        self._encoding_dim = encoding_dim
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping
        train_step_counter = tf.compat.v1.train.get_or_create_global_step()
        self._actions_from_reward_layer = tf.compat.v2.Variable(True,
                                                                dtype=tf.bool)

        for k in range(self._num_actions):
            self._cov_matrix_list.append(
                tf.compat.v2.Variable(tf.eye(self._encoding_dim, dtype=dtype),
                                      name='a_' + str(k)))
            self._data_vector_list.append(
                tf.compat.v2.Variable(tf.zeros(self._encoding_dim,
                                               dtype=dtype),
                                      name='b_' + str(k)))
            self._num_samples_list.append(
                tf.compat.v2.Variable(tf.zeros([], dtype=dtype),
                                      name='num_samples_' + str(k)))

        policy = neural_linucb_policy.NeuralLinUCBPolicy(
            encoding_network=self._encoding_network,
            encoding_dim=self._encoding_dim,
            reward_layer=self._reward_layer,
            epsilon_greedy=self._epsilon_greedy,
            actions_from_reward_layer=self._actions_from_reward_layer,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            alpha=alpha,
            emit_log_probability=emit_log_probability)

        super(NeuralLinUCBAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=policy.action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
Code example #13
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Greedy Reward Network Prediction Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or it is
        not a bounded scalar int32 spec with minimum 0.
    """
        tf.Module.__init__(self, name=name)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)

        self._reward_network = reward_network
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec, action_spec, reward_network=self._reward_network)

        super(GreedyRewardPredictionAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
Code example #14
  def __init__(
      self,
      time_step_spec: Optional[ts.TimeStep],
      action_spec: Optional[types.NestedBoundedTensorSpec],
      scalarizer: multi_objective_scalarizer.Scalarizer,
      objective_networks: Sequence[Network],
      optimizer: tf.keras.optimizers.Optimizer,
      observation_and_action_constraint_splitter: types.Splitter = None,
      accepts_per_arm_features: bool = False,
      # Params for training.
      error_loss_fn: Callable[
          ..., tf.Tensor] = tf.compat.v1.losses.mean_squared_error,
      gradient_clipping: Optional[float] = None,
      # Params for debugging.
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      enable_summaries: bool = True,
      emit_policy_info: Tuple[Text] = (),
      train_step_counter: Optional[tf.Variable] = None,
      name: Optional[Text] = None):
    """Creates a Greedy Multi-objective Neural Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      scalarizer: A
       `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
        object that implements scalarization of multiple objectives into a
        single scalar reward.
      objective_networks: A Sequence of `tf_agents.network.Network` objects to
        be used by the agent. Each network will be called with
        call(observation, step_type) and is expected to provide a prediction for
        a specific objective for all actions.
      optimizer: A 'tf.keras.optimizers.Optimizer' object, the optimizer to use
        for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask of shape `[batch_size, num_actions]`.
        This function should also work with a `TensorSpec` as input, and should
        output `TensorSpec` objects for the observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError:
        - If the action spec contains more than one action or it is not a
          bounded scalar int32 spec with minimum 0.
        - If `objective_networks` has fewer than two networks.
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._accepts_per_arm_features = accepts_per_arm_features

    self._num_objectives = len(objective_networks)
    if self._num_objectives < 2:
      raise ValueError(
          'Number of objectives should be at least two, but found to be {}'
          .format(self._num_objectives))
    self._objective_networks = objective_networks
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    self._heteroscedastic = [
        isinstance(network, heteroscedastic_q_network.HeteroscedasticQNetwork)
        for network in objective_networks
    ]

    policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        time_step_spec,
        action_spec,
        scalarizer,
        self._objective_networks,
        observation_and_action_constraint_splitter,
        accepts_per_arm_features=accepts_per_arm_features,
        emit_policy_info=emit_policy_info)
    training_data_spec = None
    if accepts_per_arm_features:
      training_data_spec = bandit_spec_utils.drop_arm_observation(
          policy.trajectory_spec, observation_and_action_constraint_splitter)

    super(GreedyMultiObjectiveNeuralAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy=policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        train_step_counter=train_step_counter)
Code example #15
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            observation_and_action_constraint_splitter=None,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            enable_summaries=True,
            emit_policy_info=(),
            train_step_counter=None,
            name=None):
        """Creates a Greedy Reward Network Prediction Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or it is
        not a bounded scalar int32 spec with minimum 0.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
            action_spec)

        reward_network.create_variables()
        self._reward_network = reward_network
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping
        self._heteroscedastic = isinstance(
            reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork)

        policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
            time_step_spec,
            action_spec,
            reward_network,
            observation_and_action_constraint_splitter,
            emit_policy_info=emit_policy_info)

        super(GreedyRewardPredictionAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_step_counter=train_step_counter)
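A hedged construction sketch for `GreedyRewardPredictionAgent`, using a Q-network as the reward network because it emits one estimate per action when built from a scalar bounded action spec (module paths, layer sizes, and the optimizer choice are assumptions):

    import tensorflow as tf
    from tf_agents.bandits.agents import greedy_reward_prediction_agent
    from tf_agents.networks import q_network
    from tf_agents.specs import tensor_spec
    from tf_agents.trajectories import time_step as ts

    observation_spec = tensor_spec.TensorSpec([7], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                shape=(),
                                                minimum=0,
                                                maximum=4)

    # One output per action serves as the per-arm reward prediction.
    reward_net = q_network.QNetwork(observation_spec,
                                    action_spec,
                                    fc_layer_params=(32,))

    agent = greedy_reward_prediction_agent.GreedyRewardPredictionAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=reward_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))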
Code example #16
  def __init__(
      self,
      time_step_spec,
      action_spec,
      reward_network,
      optimizer,
      observation_and_action_constraint_splitter=None,
      accepts_per_arm_features=False,
      # Params for training.
      error_loss_fn=tf.compat.v1.losses.mean_squared_error,
      gradient_clipping=None,
      # Params for debugging.
      debug_summaries=False,
      summarize_grads_and_vars=False,
      enable_summaries=True,
      emit_policy_info=(),
      train_step_counter=None,
      laplacian_matrix=None,
      laplacian_smoothing_weight=0.001,
      name=None):
    """Creates a Greedy Reward Network Prediction Agent.

     In some use cases, the actions are not independent and they are related to
     each other (e.g., when the actions are ordinal integers). Assuming that
     the relations between arms can be modeled by a graph, we may want to
     enforce that the estimated reward function is smooth over the graph. This
     implies that the estimated rewards `r_i` and `r_j` for two related actions
     `i` and `j`, should be close to each other. To quantify this smoothness
     criterion we use the Laplacian matrix `L` of the graph over the actions.
     When Laplacian smoothing is enabled, the loss is extended to:
     ```
       Loss_new := Loss + lambda r^T * L * r,
     ```
     where `r` is the estimated reward vector for all actions. The second
     term is the Laplacian smoothing regularization term and `lambda` is the
     weight that determines how strongly we enforce the regularization.
     For more details, please see:
     "Bandits on graphs and structures", Michal Valko
     https://hal.inria.fr/tel-01359757/document

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      laplacian_matrix: A float `Tensor` or a numpy array shaped
        `[num_actions, num_actions]`. This holds the Laplacian matrix used to
        regularize the smoothness of the estimated expected reward function.
        This only applies to problems where the actions have a graph structure.
        If `None`, the regularization is not applied.
      laplacian_smoothing_weight: A float that determines the weight of the
        regularization term. Note that this has no effect if `laplacian_matrix`
        above is `None`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action, or if it is
        not a bounded scalar int32 spec with minimum 0.
      InvalidArgumentError: If the provided Laplacian matrix is not `None` and
        is not a valid Laplacian (its rows and columns must each sum to zero).
    """
    tf.Module.__init__(self, name=name)
    common.tf_agents_gauge.get_cell('TFABandit').set(True)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._accepts_per_arm_features = accepts_per_arm_features

    reward_network.create_variables()
    self._reward_network = reward_network
    self._optimizer = optimizer
    self._error_loss_fn = error_loss_fn
    self._gradient_clipping = gradient_clipping
    self._heteroscedastic = isinstance(
        reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork)
    self._laplacian_matrix = None
    if laplacian_matrix is not None:
      self._laplacian_matrix = tf.convert_to_tensor(
          laplacian_matrix, dtype=tf.float32)
      # Check that the provided Laplacian is valid: its rows and columns must
      # each sum to zero.
      tf.debugging.assert_near(
          0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 1)))
      tf.debugging.assert_near(
          0.0, tf.norm(tf.reduce_sum(self._laplacian_matrix, 0)))
    self._laplacian_smoothing_weight = laplacian_smoothing_weight

    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network,
        observation_and_action_constraint_splitter,
        accepts_per_arm_features=accepts_per_arm_features,
        emit_policy_info=emit_policy_info)
    training_data_spec = None
    if accepts_per_arm_features:
      training_data_spec = bandit_spec_utils.drop_arm_observation(
          policy.trajectory_spec)

    super(GreedyRewardPredictionAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy=policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        train_step_counter=train_step_counter)
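The Laplacian smoothing described in this example's docstring is easy to reproduce in isolation. The sketch below is a stand-alone illustration rather than code from the agent: the path graph over four ordinal actions and the reward vector `r` are made-up inputs. It builds `L = D - A`, runs the same row/column-sum check the constructor applies, and evaluates the penalty `lambda * r^T L r`.

import numpy as np
import tensorflow as tf

num_actions = 4  # Illustrative.

# Path graph over ordinal actions: arm i is connected to arm i + 1.
adjacency = np.zeros((num_actions, num_actions), dtype=np.float32)
for i in range(num_actions - 1):
  adjacency[i, i + 1] = adjacency[i + 1, i] = 1.0
degree = np.diag(adjacency.sum(axis=1))
laplacian = tf.constant(degree - adjacency)

# The same validity check the constructor runs: rows and columns sum to zero.
tf.debugging.assert_near(0.0, tf.norm(tf.reduce_sum(laplacian, 1)))
tf.debugging.assert_near(0.0, tf.norm(tf.reduce_sum(laplacian, 0)))

# Smoothing penalty lambda * r^T L r for an estimated reward vector r.
r = tf.constant([0.1, 0.4, 0.5, 0.9])
laplacian_smoothing_weight = 0.001
penalty = laplacian_smoothing_weight * tf.tensordot(
    r, tf.linalg.matvec(laplacian, r), 1)

For this graph the penalty equals `lambda * sum_i (r_i - r_{i+1})^2`, so neighbouring arms are pulled toward similar reward estimates, which is exactly the smoothness criterion the docstring motivates for ordinal actions.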
Code example #17
  def __init__(self,
               time_step_spec,
               action_spec,
               gamma=1.0,
               observation_and_action_constraint_splitter=None,
               dtype=tf.float32,
               name=None):
    """Initialize an instance of `LinearThompsonSamplingAgent`.

    Args:
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to
        1.0, the algorithm does not forget.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearThompsonSamplingAgent`.

    Raises:
      ValueError: If `dtype` is not one of `tf.float32` or `tf.float64`.
    """
    tf.Module.__init__(self, name=name)
    self._num_actions = bandit_utils.get_num_actions_from_tensor_spec(
        action_spec)
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    if observation_and_action_constraint_splitter:
      context_shape = observation_and_action_constraint_splitter(
          time_step_spec.observation)[0].shape.as_list()
    else:
      context_shape = time_step_spec.observation.shape.as_list()
    self._context_dim = (
        tf.compat.dimension_value(context_shape[0]) if context_shape else 1)
    self._gamma = gamma
    if self._gamma < 0.0 or self._gamma > 1.0:
      raise ValueError('Forgetting factor `gamma` must be in [0.0, 1.0].')

    self._weight_covariances = []
    self._parameter_estimators = []
    self._dtype = dtype
    if dtype not in (tf.float32, tf.float64):
      raise ValueError(
          'Agent dtype should be either `tf.float32` or `tf.float64`.')

    for k in range(self._num_actions):
      self._weight_covariances.append(
          tf.compat.v2.Variable(
              tf.eye(self._context_dim, dtype=dtype), name='a_' + str(k)))
      self._parameter_estimators.append(
          tf.compat.v2.Variable(
              tf.zeros(self._context_dim, dtype=dtype), name='b_' + str(k)))

    policy = ts_policy.LinearThompsonSamplingPolicy(
        action_spec,
        time_step_spec,
        self._weight_covariances,
        self._parameter_estimators,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))
    super(LinearThompsonSamplingAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=policy.action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=None)
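Per arm, the agent above stores a precision-style matrix `a_k` (initialized to the identity) and a vector estimator `b_k` (initialized to zero); the Thompson sampling policy draws a weight vector from the Gaussian posterior these statistics induce and plays the arm whose sample scores the context highest. The sketch below shows that textbook sampling step for a single context vector. It is an illustration of the algorithm, not the library policy's implementation, which additionally handles batching, the forgetting factor `gamma`, and action masks.

import tensorflow as tf


def thompson_sample_action(context, weight_covariances, parameter_estimators):
  """Samples one arm for a single context vector (illustrative only)."""
  sampled_rewards = []
  for a_k, b_k in zip(weight_covariances, parameter_estimators):
    # With a_k = identity and b_k = 0 the posterior is a standard normal.
    covariance = tf.linalg.inv(a_k)
    mean = tf.linalg.matvec(covariance, b_k)  # Posterior mean of the weights.
    # Draw weights ~ N(mean, covariance) via a Cholesky factor.
    scale = tf.linalg.cholesky(covariance)
    noise = tf.random.normal(tf.shape(mean), dtype=mean.dtype)
    weights = mean + tf.linalg.matvec(scale, noise)
    sampled_rewards.append(tf.tensordot(weights, context, 1))
  return tf.argmax(tf.stack(sampled_rewards))

During training the agent refines `a_k` and `b_k` from the observed context/reward pairs of the chosen arms, so later samples concentrate around each arm's estimated weights.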