Code example #1
 def testIsDiscrete(self, dtype):
     spec = array_spec.ArraySpec((2, 3), dtype=dtype)
     self.assertIs(tensor_spec.is_discrete(spec),
                   issubclass(np.dtype(dtype).type, np.integer))
Code example #2
 def testIsDiscrete(self, dtype):
     spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
     self.assertIs(tensor_spec.is_discrete(spec), dtype.is_integer)
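Examples #1 and #2 suggest that `is_discrete` keys purely off the spec's dtype: integer dtypes are treated as discrete, floating-point dtypes are not. A minimal standalone sketch of that behavior (assuming TF-Agents is installed and imported as below):

    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    int_spec = tensor_spec.TensorSpec((2, 3), dtype=tf.int32)
    float_spec = tensor_spec.TensorSpec((2, 3), dtype=tf.float32)
    print(tensor_spec.is_discrete(int_spec))    # expected: True
    print(tensor_spec.is_discrete(float_spec))  # expected: False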
Code example #3
 def _get_clip(spec):
     dims = np.product(spec.shape.as_list())
     if tensor_spec.is_discrete(spec):
         dims *= spec.maximum - spec.minimum + 1
     return np.sqrt(action_dist_clip_per_dim * dims)
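The helper above converts a per-dimension clip budget into a total clip norm: it multiplies by the flattened size of the action spec and, for discrete specs, also by the number of possible values. A worked, self-contained sketch of the same computation, with an assumed `action_dist_clip_per_dim` value chosen purely for illustration:

    import numpy as np
    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    action_dist_clip_per_dim = 0.01  # assumed value, for illustration only
    spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
    dims = np.prod(spec.shape.as_list())             # scalar spec -> 1 element
    if tensor_spec.is_discrete(spec):
        dims *= spec.maximum - spec.minimum + 1      # 4 possible values
    print(np.sqrt(action_dist_clip_per_dim * dims))  # sqrt(0.01 * 4) = 0.2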
Code example #4
    def __init__(self,
                 time_step_spec=None,
                 action_spec=None,
                 reward_network=None,
                 observation_and_action_constraint_splitter=None,
                 emit_policy_info=(),
                 name=None):
        """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

    This policy takes a tf_agents.Network predicting rewards and generates the
    action corresponding to the largest predicted reward.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      reward_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        reward_network.create_variables()
        self._reward_network = reward_network

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        info_spec = policy_utilities.PolicyInfo(
            predicted_rewards_mean=predicted_rewards_mean,
            bandit_policy_type=bandit_policy_type)

        super(GreedyRewardPredictionPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=reward_network.state_spec,
                             clip=False,
                             info_spec=info_spec,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
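As a rough usage sketch, the constructor above only needs a time step spec, a scalar discrete action spec, and a reward network that outputs one value per action. The instantiation below is hypothetical: it assumes the usual TF-Agents layout in which this policy lives under `tf_agents.bandits.policies` and uses a `QNetwork` as a stand-in reward network:

    import tensorflow as tf
    from tf_agents.bandits.policies import greedy_reward_prediction_policy
    from tf_agents.networks import q_network
    from tf_agents.specs import tensor_spec
    from tf_agents.trajectories import time_step as ts

    obs_spec = tensor_spec.TensorSpec([4], tf.float32)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2)
    reward_net = q_network.QNetwork(obs_spec, action_spec)  # one output per action
    policy = greedy_reward_prediction_policy.GreedyRewardPredictionPolicy(
        time_step_spec=ts.time_step_spec(obs_spec),
        action_spec=action_spec,
        reward_network=reward_net)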
Code example #5
    def __init__(self,
                 time_step_spec: types.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 reward_network: types.Network,
                 temperature: types.FloatOrReturningFloat = 1.0,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 accepts_per_arm_features: bool = False,
                 constraints: Tuple[constr.NeuralConstraint, ...] = (),
                 emit_policy_info: Tuple[Text, ...] = (),
                 name: Optional[Text] = None):
        """Builds a BoltzmannRewardPredictionPolicy given a reward network.

    This policy takes a tf_agents.Network predicting rewards and chooses an
    action with weighted probabilities (i.e., using a softmax over the network
    estimates of value for each action).

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      reward_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      temperature: float or callable that returns a float. The temperature used
        in the Boltzmann exploration.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraints objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
        policy_utilities.check_no_mask_with_arm_features(
            accepts_per_arm_features,
            observation_and_action_constraint_splitter)
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        self._temperature = temperature
        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        reward_network.create_variables()
        self._reward_network = reward_network
        self._constraints = constraints

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        if accepts_per_arm_features:
            # The features for the chosen arm are saved to policy_info.
            chosen_arm_features_info = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type,
                chosen_arm_features=chosen_arm_features_info)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type)

        self._accepts_per_arm_features = accepts_per_arm_features

        super(BoltzmannRewardPredictionPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=reward_network.state_spec,
                             clip=False,
                             info_spec=info_spec,
                             emit_log_probability='log_probability'
                             in emit_policy_info,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
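The Boltzmann variant differs from the greedy policy mainly in how predicted rewards are turned into an action: a softmax over the reward estimates, sharpened or flattened by `temperature`. A tiny illustrative sketch of that sampling distribution (the reward values are made up):

    import tensorflow as tf

    predicted_rewards = tf.constant([1.0, 2.0, 3.0])
    for temperature in (1.0, 0.1):
        probs = tf.nn.softmax(predicted_rewards / temperature)
        print(temperature, probs.numpy())  # lower temperature -> closer to greedy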
Code example #6
def _is_categorical_spec(spec):
    return (tensor_spec.is_discrete(spec) and tensor_spec.is_bounded(spec)
            and spec.shape == [] and spec.minimum == 0)
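So a spec counts as categorical only if it is discrete, bounded, scalar, and zero-based. A quick hedged check, re-declaring the helper so the snippet runs on its own:

    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    def _is_categorical_spec(spec):  # same helper as in the example above
        return (tensor_spec.is_discrete(spec) and tensor_spec.is_bounded(spec)
                and spec.shape == [] and spec.minimum == 0)

    cat_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=9)
    float_spec = tensor_spec.BoundedTensorSpec((), tf.float32, minimum=0.0, maximum=1.0)
    print(_is_categorical_spec(cat_spec))    # expected: True
    print(_is_categorical_spec(float_spec))  # expected: False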
Code example #7
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 actor_network: network.Network,
                 policy_state_spec: types.NestedTensorSpec = (),
                 info_spec: types.NestedTensorSpec = (),
                 observation_normalizer: Optional[
                     tensor_normalizer.TensorNormalizer] = None,
                 clip: bool = True,
                 training: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 name: Optional[Text] = None):
        """Builds an Actor Policy given an actor network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      actor_network: An instance of a `tf_agents.networks.network.Network` to be
        used by the policy. The network will be called with `call(observation,
        step_type, policy_state)` and should return `(actions_or_distributions,
        new_state)`.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        If not set, defaults to actor_network.state_spec.
      info_spec: A nest of `TensorSpec` representing the policy info.
      observation_normalizer: An object to use for observation normalization.
      clip: Whether to clip actions to spec before returning them. Default True.
        Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      training: Whether the network should be called in training mode.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `actor_network` is compatible with the
        network-specific half of the output of the
        `observation_and_action_constraint_splitter`. In particular,
        `observation_and_action_constraint_splitter` will be called on the
        observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: if `actor_network` is not of type `network.Network`.
      NotImplementedError: if `observation_and_action_constraint_splitter` is
        not None but `action_spec` is not discrete.
    """
        if not isinstance(actor_network, network.Network):
            raise ValueError('actor_network must be a network.Network. Found '
                             '{}.'.format(type(actor_network)))
        actor_network.create_variables()
        self._actor_network = actor_network
        self._observation_normalizer = observation_normalizer
        self._training = training

        if observation_and_action_constraint_splitter is not None:
            if len(tf.nest.flatten(action_spec)) > 1 or (
                    not tensor_spec.is_discrete(action_spec)):
                raise NotImplementedError(
                    'Action constraints for ActorPolicy are currently only supported '
                    'for a single spec of discrete actions. Got action_spec {}'
                    .format(action_spec))

        if not policy_state_spec:
            policy_state_spec = actor_network.state_spec

        super(ActorPolicy,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy_state_spec=policy_state_spec,
                             info_spec=info_spec,
                             clip=clip,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
Code example #8
    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 fc_layer_params=(200, 100),
                 conv_layer_params=None,
                 activation_fn=tf.keras.activations.relu,
                 discrete_projection_net=_categorical_projection_net,
                 continuous_projection_net=_normal_projection_net,
                 name='ActorDistributionNetwork'):
        """Creates an instance of `ActorDistributionNetwork`.

    Args:
      input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
        input.
      output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing
        the output.
      fc_layer_params: Optional list of fully_connected parameters, where each
        item is the number of units in the layer.
      conv_layer_params: Optional list of convolution layers parameters, where
        each item is a length-three tuple indicating (filters, kernel_size,
        stride).
      activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
      discrete_projection_net: Callable that generates a discrete projection
        network to be called with some hidden state and the outer_rank of the
        state.
      continuous_projection_net: Callable that generates a continuous projection
        network to be called with some hidden state and the outer_rank of the
        state.
      name: A string representing name of the network.

    Raises:
      ValueError: If `input_tensor_spec` contains more than one observation.
    """

        if len(tf.nest.flatten(input_tensor_spec)) > 1:
            raise ValueError(
                'Only a single observation is supported by this network')

        mlp_layers = utils.mlp_layers(conv_layer_params,
                                      fc_layer_params,
                                      activation_fn=activation_fn,
                                      kernel_initializer=tf.compat.v1.keras.
                                      initializers.glorot_uniform(),
                                      name='input_mlp')

        projection_networks = []
        for single_output_spec in tf.nest.flatten(output_tensor_spec):
            if tensor_spec.is_discrete(single_output_spec):
                projection_networks.append(
                    discrete_projection_net(single_output_spec))
            else:
                projection_networks.append(
                    continuous_projection_net(single_output_spec))

        projection_distribution_specs = [
            proj_net.output_spec for proj_net in projection_networks
        ]
        output_spec = tf.nest.pack_sequence_as(output_tensor_spec,
                                               projection_distribution_specs)

        super(ActorDistributionNetwork,
              self).__init__(input_tensor_spec=input_tensor_spec,
                             state_spec=(),
                             output_spec=output_spec,
                             name=name)

        self._mlp_layers = mlp_layers
        self._projection_networks = projection_networks
        self._output_tensor_spec = output_tensor_spec
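The loop over `tf.nest.flatten(output_tensor_spec)` is what lets one actor network mix action types: each discrete sub-spec gets a categorical projection head and each continuous sub-spec a normal (Gaussian) one. A hedged construction sketch with one action of each kind (observation shape and layer sizes are arbitrary):

    import tensorflow as tf
    from tf_agents.networks import actor_distribution_network
    from tf_agents.specs import tensor_spec

    obs_spec = tensor_spec.TensorSpec([4], tf.float32)
    action_specs = {
        'discrete': tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3),
        'continuous': tensor_spec.BoundedTensorSpec((2,), tf.float32,
                                                    minimum=-1.0, maximum=1.0),
    }
    net = actor_distribution_network.ActorDistributionNetwork(
        obs_spec, action_specs, fc_layer_params=(64,))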
Code example #9
    def __init__(self,
                 action_spec,
                 feature_spec,
                 hidden_size=256,
                 reward_adapt_speed=8.0,
                 encoding_net: Network = None,
                 forward_net: Network = None,
                 inverse_net: Network = None,
                 name="ICMAlgorithm"):
        """Create an ICMAlgorithm.

        Args:
            hidden_size (int|tuple): size of hidden layer(s)
            reward_adapt_speed (float): how fast to adapt the reward normalizer.
                roughly speaking, the statistics for the normalization are
                calculated mostly based on the most recent T/speed samples,
                where T is the total number of samples.
            encoding_net (Network): network for encoding observation into a
                latent feature specified by feature_spec. Its input is the same
                as the input of this algorithm.
            forward_net (Network): network for predicting next feature based on
                previous feature and action. It should accept input with spec
                [feature_spec, encoded_action_spec] and output a tensor of shape
                feature_spec. For a discrete action, encoded_action is a one-hot
                representation of the action. For a continuous action, the encoded
                action is the same as the original action.
            inverse_net (Network): network for predicting previous action given
                the previous feature and current feature. It should accept input
                with spec [feature_spec, feature_spec] and output a tensor of
                shape (num_actions,).
        """
        super(ICMAlgorithm, self).__init__(train_state_spec=feature_spec,
                                           name=name)

        flat_action_spec = tf.nest.flatten(action_spec)
        assert len(
            flat_action_spec) == 1, "ICM doesn't support nested action_spec"

        flat_feature_spec = tf.nest.flatten(feature_spec)
        assert len(
            flat_feature_spec) == 1, "ICM doesn't support nested feature_spec"

        action_spec = flat_action_spec[0]

        if tensor_spec.is_discrete(action_spec):
            self._num_actions = action_spec.maximum - action_spec.minimum + 1
        else:
            self._num_actions = action_spec.shape[-1]

        self._action_spec = action_spec

        feature_dim = flat_feature_spec[0].shape[-1]

        self._encoding_net = encoding_net

        if isinstance(hidden_size, int):
            hidden_size = (hidden_size, )

        if forward_net is None:
            encoded_action_spec = tensor_spec.TensorSpec((self._num_actions, ),
                                                         dtype=tf.float32)
            forward_net = EncodingNetwork(
                name="forward_net",
                input_tensor_spec=[feature_spec, encoded_action_spec],
                fc_layer_params=hidden_size,
                last_layer_size=feature_dim)

        self._forward_net = forward_net

        if inverse_net is None:
            inverse_net = EncodingNetwork(
                name="inverse_net",
                input_tensor_spec=[feature_spec, feature_spec],
                fc_layer_params=hidden_size,
                last_layer_size=self._num_actions,
                last_kernel_initializer=tf.initializers.Zeros())

        self._inverse_net = inverse_net

        self._reward_normalizer = ScalarAdaptiveNormalizer(
            speed=reward_adapt_speed)
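The branch on `tensor_spec.is_discrete(action_spec)` fixes the width of the encoded action that the forward model consumes: the number of discrete values (for a one-hot encoding) or the last dimension of a continuous action vector. A small sketch of the two cases:

    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    discrete = tensor_spec.BoundedTensorSpec((), tf.int64, minimum=0, maximum=5)
    continuous = tensor_spec.BoundedTensorSpec((3,), tf.float32, minimum=-1.0, maximum=1.0)
    print(discrete.maximum - discrete.minimum + 1)  # 6 one-hot dimensions
    print(continuous.shape[-1])                     # 3 action dimensions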
Code example #10
File: tensor_spec_test.py Project: sparshag21/agents
 def testExclusive(self, dtype):
   if dtype == tf.string:
     self.skipTest("Not compatible with string type.")
   spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
   self.assertIs(
       tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec), True)
Code example #11
    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 fc_layer_params=(200, 100),
                 activation_fn=tf.nn.relu,
                 output_activation_fn=None,
                 kernel_initializer=None,
                 last_kernel_initializer=None,
                 discrete_projection_net=_categorical_projection_net,
                 continuous_projection_net=_normal_projection_net,
                 name='PolicyNetwork'):
        """Creates an instance of `ValueNetwork`.

    Args:
      input_tensor_spec: A possibly nested container of
        `tensor_spec.TensorSpec` representing the inputs.
      output_tensor_spec: A possibly nested container of
        `tensor_spec.TensorSpec` representing the outputs.
      fc_layer_params: Optional list of fully connected parameters after
        merging all inputs, where each item is the number of units
        in the layer.
      activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
      output_activation_fn: Activation function for the last layer. This can be
        used to restrict the range of the output. For example, one can pass
        tf.keras.activations.sigmoid here to restrict the output to be bounded
        between 0 and 1.
      kernel_initializer: kernel initializer for all layers except for the value
        regression layer. If None, a VarianceScaling initializer will be used.
      last_kernel_initializer: kernel initializer for the value regression
         layer. If None, a RandomUniform initializer will be used.
      discrete_projection_net: projection layer for discrete actions.
      continuous_projection_net: projection layer for continuous actions.
      name: A string representing name of the network.
    """
        def map_proj(spec):
            if tensor_spec.is_discrete(spec):
                return discrete_projection_net(spec)
            else:
                return continuous_projection_net(spec)

        projection_networks = tf.nest.map_structure(map_proj,
                                                    output_tensor_spec)
        output_spec = tf.nest.map_structure(
            lambda proj_net: proj_net.output_spec, projection_networks)
        if tensor_spec.is_discrete(output_tensor_spec):
            action_dim = np.unique(output_tensor_spec.maximum -
                                   output_tensor_spec.minimum + 1)
        else:
            action_dim = output_tensor_spec.shape.num_elements()
        super(PolicyNetwork,
              self).__init__(input_tensor_spec=input_tensor_spec,
                             state_spec=(),
                             output_spec=output_spec,
                             name=name)

        self._flat_specs = tf.nest.flatten(input_tensor_spec)

        if kernel_initializer is None:
            kernel_initializer = tf.compat.v1.keras.initializers.VarianceScaling(
                scale=1. / 3., mode='fan_in', distribution='uniform')
        if last_kernel_initializer is None:
            last_kernel_initializer = tf.keras.initializers.RandomUniform(
                minval=-0.003, maxval=0.003)

        self._fc_layers = utils.mlp_layers(
            None,
            fc_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=kernel_initializer,
            name='mlp')
        self._fc_layers.append(
            tf.keras.layers.Dense(action_dim,
                                  activation=output_activation_fn,
                                  kernel_initializer=last_kernel_initializer,
                                  name='value'))

        self._projection_networks = projection_networks
        self._output_tensor_spec = output_tensor_spec
Code example #12
File: mbrl_algorithm.py Project: emailweixu/alf
    def __init__(self,
                 observation_spec,
                 feature_spec,
                 action_spec,
                 dynamics_module: DynamicsLearningAlgorithm,
                 reward_module: RewardEstimationAlgorithm,
                 planner_module: PlanAlgorithm,
                 gradient_clipping=None,
                 debug_summaries=False,
                 name="MbrlAlgorithm"):
        """Create an MbrlAlgorithm.
        The MbrlAlgorithm takes as input the following set of modules for
        making decisions on actions based on the current observation:
        1) learnable/fixed dynamics module
        2) learnable/fixed reward module
        3) learnable/fixed planner module

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            dynamics_module (DDLAlgorithm): module for learning to predict
                the next feature based on the previous feature and action.
                It should accept input with spec [feature_spec,
                encoded_action_spec] and output a tensor of shape
                feature_spec. For a discrete action, encoded_action is a one-hot
                representation of the action. For a continuous action, the encoded
                action is the same as the original action.
            reward_module (REAlgorithm): module for calculating the reward,
                i.e.,  evaluating the reward for a (s, a) pair
            planner_module (PLANAlgorithm): module for generating planned action
                based on specified reward function and dynamics function
            gradient_clipping (float): Norm length to clip gradients.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.

        """
        train_state_spec = MbrlState(
            dynamics=dynamics_module.train_state_spec, reward=(), planner=())

        super().__init__(
            feature_spec,
            action_spec,
            train_state_spec=train_state_spec,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            name=name)

        flat_action_spec = tf.nest.flatten(action_spec)
        action_spec = flat_action_spec[0]

        assert not tensor_spec.is_discrete(action_spec), "only support \
                                                    continuous control"

        num_actions = action_spec.shape[-1]

        flat_feature_spec = tf.nest.flatten(feature_spec)
        assert len(flat_feature_spec) == 1, "Mbrl doesn't support nested \
                                             feature_spec"

        feature_dim = flat_feature_spec[0].shape[-1]

        self._action_spec = action_spec
        self._num_actions = num_actions

        self._dynamics_module = dynamics_module
        self._reward_module = reward_module
        self._planner_module = planner_module
        self._planner_module.set_reward_func(self._calc_step_reward)
        self._planner_module.set_dynamics_func(self._predict_next_step)
Code example #13
File: action_encoder.py Project: zhaoyinfu123/alf
 def check_supported_spec(spec):
     if tensor_spec.is_discrete(spec):
         assert len(spec.shape) == 0 or \
             (len(spec.shape) == 1 and spec.shape[0] == 1)
     else:
         assert len(spec.shape) == 1
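In other words, this encoder only accepts scalar (or length-1) discrete specs and rank-1 continuous specs. A quick hedged check, re-declaring the helper so the snippet runs standalone:

    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    def check_supported_spec(spec):  # same helper as in the example above
        if tensor_spec.is_discrete(spec):
            assert len(spec.shape) == 0 or \
                (len(spec.shape) == 1 and spec.shape[0] == 1)
        else:
            assert len(spec.shape) == 1

    check_supported_spec(tensor_spec.BoundedTensorSpec((), tf.int64, 0, 4))  # scalar discrete: OK
    check_supported_spec(tensor_spec.TensorSpec((3,), tf.float32))           # rank-1 continuous: OK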
Code example #14
 def testExclusive(self, dtype):
     spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
     self.assertIs(
         tensor_spec.is_discrete(spec) ^ tensor_spec.is_continuous(spec),
         True)
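Both `testExclusive` variants assert that a numeric spec is classified as exactly one of discrete or continuous. A minimal demonstration of that mutual exclusivity (assuming `tensor_spec.is_continuous` is available alongside `is_discrete`):

    import tensorflow as tf
    from tf_agents.specs import tensor_spec

    spec = tensor_spec.TensorSpec((2, 3), dtype=tf.float64)
    print(tensor_spec.is_discrete(spec))    # expected: False
    print(tensor_spec.is_continuous(spec))  # expected: True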
Code example #15
  def __init__(self,
               time_step_spec=None,
               action_spec=None,
               reward_network=None,
               observation_and_action_constraint_splitter=None,
               expose_predicted_rewards=False,
               name=None):
    """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

    This policy takes a tf_agents.Network predicting rewards and generates the
    action corresponding to the largest predicted reward.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      reward_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      expose_predicted_rewards: (bool) Whether to expose the predicted rewards
        in the policy info field under the name 'predicted_rewards'.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
    self._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter)
    flat_action_spec = tf.nest.flatten(action_spec)
    if len(flat_action_spec) > 1:
      raise NotImplementedError(
          'action_spec can only contain a single BoundedTensorSpec.')

    action_spec = flat_action_spec[0]
    if (not tensor_spec.is_bounded(action_spec) or
        not tensor_spec.is_discrete(action_spec) or
        action_spec.shape.rank > 1 or
        action_spec.shape.num_elements() != 1):
      raise NotImplementedError(
          'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
          'Found {}.'.format(action_spec))
    self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
    self._action_offset = action_spec.minimum
    self._reward_network = reward_network

    self._expose_predicted_rewards = expose_predicted_rewards
    if expose_predicted_rewards:
      info_spec = PolicyInfo(
          predicted_rewards=tensor_spec.TensorSpec(
              [self._expected_num_actions], dtype=tf.float32))
    else:
      info_spec = ()

    super(GreedyRewardPredictionPolicy, self).__init__(
        time_step_spec, action_spec,
        policy_state_spec=reward_network.state_spec,
        clip=False,
        info_spec=info_spec,
        name=name)
Code example #16
File: icm_algorithm.py Project: zhaoyinfu123/alf
 def _encode_action(self, action):
     if tensor_spec.is_discrete(self._action_spec):
         return tf.one_hot(indices=action, depth=self._num_actions)
     else:
         return action
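For discrete action specs, the encoder one-hot encodes the chosen action before it is fed to the forward model; continuous actions pass through unchanged. A tiny sketch of the discrete branch, where a depth of 4 stands in for a hypothetical 4-action spec:

    import tensorflow as tf

    action = tf.constant(2)
    print(tf.one_hot(indices=action, depth=4).numpy())  # [0. 0. 1. 0.]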
Code example #17
    def __init__(self,
                 time_step_spec: types.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 alpha: Sequence[tf.Variable],
                 beta: Sequence[tf.Variable],
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 emit_policy_info: Sequence[Text] = (),
                 name: Optional[Text] = None):
        """Builds a BernoulliThompsonSamplingPolicy.

    For a reference, see e.g., Chapter 3 in "A Tutorial on Thompson Sampling" by
    Russo et al. (https://web.stanford.edu/~bvr/pubs/TS_Tutorial.pdf).

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      alpha: list or tuple of tf.Variable's. It holds the `alpha` parameter of
        the beta distribution of each arm.
      beta: list or tuple of tf.Variable's. It holds the `beta` parameter of the
        beta distribution of each arm.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape
        `[batch_size, num_actions]`. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
    """
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of integer type and '
                'shape (). Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1

        if len(alpha) != self._expected_num_actions:
            raise ValueError(
                'The size of alpha parameters is expected to be equal '
                'to the number of actions, but found to be {}'.format(
                    len(alpha)))
        self._alpha = alpha
        if len(alpha) != len(beta):
            raise ValueError(
                'The size of alpha parameters is expected to be equal '
                'to the size of beta parameters')
        self._beta = beta

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        predicted_rewards_sampled = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_SAMPLED in (
                emit_policy_info):
            predicted_rewards_sampled = tensor_spec.TensorSpec(
                [self._expected_num_actions])
        info_spec = policy_utilities.PolicyInfo(
            predicted_rewards_mean=predicted_rewards_mean,
            predicted_rewards_sampled=predicted_rewards_sampled)

        super(BernoulliThompsonSamplingPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             info_spec=info_spec,
                             emit_log_probability='log_probability'
                             in emit_policy_info,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
Code example #18
    def policy_gradient_loss(self,
                             time_steps,
                             actions,
                             sample_action_log_probs,
                             advantages,
                             current_policy_distribution,
                             weights,
                             debug_summaries=False):
        """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
      time_steps: TimeSteps with observations for each timestep.
      actions: Tensor of actions for timesteps, aligned on index.
      sample_action_log_probs: Tensor of sample probability of each action.
      advantages: Tensor of advantage estimate for each timestep, aligned on
        index. Works better when advantage estimates are normalized.
      current_policy_distribution: The policy distribution, evaluated on all
        time_steps.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.  Includes a mask for invalid timesteps.
      debug_summaries: True if debug summaries should be created.

    Returns:
      policy_gradient_loss: A tensor that will contain policy gradient loss for
        the on-policy experience.
    """
        tf.nest.assert_same_structure(time_steps, self.time_step_spec)
        action_log_prob = common.log_probability(current_policy_distribution,
                                                 actions, self._action_spec)
        action_log_prob = tf.cast(action_log_prob, tf.float32)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)
        if self._check_numerics:
            action_log_prob = tf.debugging.check_numerics(
                action_log_prob, 'action_log_prob')

        # Prepare both clipped and unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - self._importance_ratio_clipping,
            1 + self._importance_ratio_clipping)

        if self._check_numerics:
            importance_ratio = tf.debugging.check_numerics(
                importance_ratio, 'importance_ratio')
            if self._importance_ratio_clipping > 0.0:
                importance_ratio_clipped = tf.debugging.check_numerics(
                    importance_ratio_clipped, 'importance_ratio_clipped')

        # Pessimistically choose the minimum objective value for clipped and
        #   unclipped importance ratios.
        per_timestep_objective = importance_ratio * advantages
        per_timestep_objective_clipped = importance_ratio_clipped * advantages
        per_timestep_objective_min = tf.minimum(
            per_timestep_objective, per_timestep_objective_clipped)

        if self._importance_ratio_clipping > 0.0:
            policy_gradient_loss = -per_timestep_objective_min
        else:
            policy_gradient_loss = -per_timestep_objective

        policy_gradient_loss = tf.reduce_mean(
            input_tensor=policy_gradient_loss * weights)

        if debug_summaries:
            if self._importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               self._importance_ratio_clipping), tf.float32))
                tf.compat.v2.summary.scalar(name='clip_fraction',
                                            data=clip_fraction,
                                            step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob',
                                           data=action_log_prob,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob_sample',
                                           data=sample_action_log_probs,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio',
                                           data=importance_ratio,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='importance_ratio_mean',
                data=tf.reduce_mean(input_tensor=importance_ratio),
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio_clipped',
                                           data=importance_ratio_clipped,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective',
                                           data=per_timestep_objective,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(
                name='per_timestep_objective_clipped',
                data=per_timestep_objective_clipped,
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective_min',
                                           data=per_timestep_objective_min,
                                           step=self.train_step_counter)
            entropy = common.entropy(current_policy_distribution,
                                     self.action_spec)
            tf.compat.v2.summary.histogram(name='policy_entropy',
                                           data=entropy,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='policy_entropy_mean',
                data=tf.reduce_mean(input_tensor=entropy),
                step=self.train_step_counter)
            for i, (single_action, single_distribution) in enumerate(
                    zip(tf.nest.flatten(self.action_spec),
                        tf.nest.flatten(current_policy_distribution))):
                # Categorical distribution (used for discrete actions) doesn't have a
                # mean.
                distribution_index = '_{}'.format(i) if i > 0 else ''
                if not tensor_spec.is_discrete(single_action):
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_mean' + distribution_index,
                        data=single_distribution.mean(),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_stddev' +
                        distribution_index,
                        data=single_distribution.stddev(),
                        step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='policy_gradient_loss',
                                           data=policy_gradient_loss,
                                           step=self.train_step_counter)

        if self._check_numerics:
            policy_gradient_loss = tf.debugging.check_numerics(
                policy_gradient_loss, 'policy_gradient_loss')

        return policy_gradient_loss
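The clipped-versus-unclipped minimum above is the PPO-style pessimistic surrogate. A tiny numeric sketch with made-up importance ratios and advantages and a clipping range of 0.2:

    import tensorflow as tf

    importance_ratio = tf.constant([0.5, 1.5])
    advantages = tf.constant([1.0, -1.0])
    clipped = tf.clip_by_value(importance_ratio, 1 - 0.2, 1 + 0.2)
    objective = tf.minimum(importance_ratio * advantages, clipped * advantages)
    loss = -tf.reduce_mean(objective)
    print(objective.numpy())  # [ 0.5 -1.5]
    print(loss.numpy())       # 0.5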
Code example #19
def evaluate_policy():
    env_load_fn = suite_mujoco.load
    categorical = True
    FLAGS = flags.FLAGS
    dim_z = FLAGS.dim_z
    mask_xy = FLAGS.mask_xy
    eval_env_name = FLAGS.env_name
    skill_epsilon = FLAGS.skill_epsilon
    epsilon = 0.75
    epsilon_greedy = False
    state_noise = False
    action_noise = False
    skill_randomization = True
    plot_actions = False

    def _env_load_fn(env_name):
        diayn_wrapper = (lambda x: diayn_gym_env_fixed.DiaynGymEnvFixed(
            x, dim_z, categorical))
        return env_load_fn(
            env_name,
            gym_env_wrappers=[diayn_wrapper],
        )

    root_dir = FLAGS.root_dir
    policy_fc_layers = (256, 256)
    env_steps = tf_metrics.EnvironmentSteps(prefix='Eval')
    _preprocessing_combiner = DictConcatenateLayer()
    global_step = tf.compat.v1.train.get_or_create_global_step()

    if eval_env_name == "Plane-v1":
        make_video = False
    else:
        make_video = True
    tf_env = tf_py_environment.TFPyEnvironment(_env_load_fn(eval_env_name))
    eval_py_env = _env_load_fn(eval_env_name)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    time_step_spec = tf_env.time_step_spec()
    observation_spec = time_step_spec.observation
    action_spec = tf_env.action_spec()
    augmented_time_step_spec = tf_env.time_step_spec()
    augmented_observation_spec = augmented_time_step_spec.observation
    z_spec = augmented_observation_spec["z"]

    if tensor_spec.is_discrete(z_spec):
        _preprocessing_combiner = OneHotConcatenateLayer(dim_z)
    else:
        _preprocessing_combiner = DictConcatenateLayer()

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        augmented_observation_spec,
        action_spec,
        fc_layer_params=policy_fc_layers,
        continuous_projection_net=normal_projection_net,
        preprocessing_combiner=_preprocessing_combiner,
        mask_xy=mask_xy,
        name='EvalNetwork')

    generator_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        root_dir, 'diayn_actor'),
                                                 actor_net=actor_net,
                                                 global_step=global_step)
    if generator_checkpointer.checkpoint_exists:
        generator_checkpointer.initialize_or_restore()
    else:
        generator_checkpointer.initialize_or_restore()
        print("No low-level actor checkpoint exists...training from scratch")
    # Re-purpose the restored actor network
    generator_net = actor_net

    eval_policy = actor_policy.ActorPolicy(
        time_step_spec=augmented_time_step_spec,
        action_spec=action_spec,
        actor_network=generator_net,
        training=False)

    print("Loaded evaluation policy")

    if make_video:
        print("Creating video")
        skill_path = root_dir + "/skills"
        action_path = root_dir + "/actions"
        if not path.exists(skill_path):
            os.mkdir(skill_path)
        if not path.exists(action_path):
            os.mkdir(action_path)

        color_wheel = ['b', 'r', 'g', 'c', 'm']
        for i in range(5):
            for runs in range(dim_z):
                xs_list = []
                ys_list = []
                actions_list = {new_list: [] for new_list in range(8)}
                video_filename = root_dir + '/skills/' + str(
                    i + 1) + eval_env_name[:-3] + '.mp4'
                skill_plot_filename = root_dir + '/skills/' + str(i+1) + eval_env_name[:-3] + 'eps' \
                     + str(skill_epsilon) + '.png'
                action_plot_filename = root_dir + '/actions/' + str(
                    i + 1) + eval_env_name[:-3] + '.png'
                path_len = 200
                num_eps = 1
                print_interval = 20
                action_interval = 10
                skill_sample_interval = 20
                print("skill {}".format(i))
                with imageio.get_writer(video_filename, fps=60) as video:
                    for _ in range(num_eps):
                        if categorical:
                            eval_py_env.set_z(i)
                        else:
                            skill = [0] * dim_z
                            skill[i] = 1
                            eval_py_env.set_z(skill)
                        _time_step = eval_py_env.reset()
                        print("{} {}".format(
                            _time_step.observation["observation"][:2][0],
                            _time_step.observation["observation"][:2][1]))
                        video.append_data(eval_py_env.render())
                        steps = 0
                        while steps < path_len:
                            if skill_randomization:
                                if steps % skill_sample_interval == 0:
                                    if np.random.random() < skill_epsilon:
                                        sampled_skill = np.random.choice(dim_z)
                                        eval_py_env.set_z(sampled_skill)
                                        print("randomly sampled skill: {}".
                                              format(sampled_skill))
                                    else:
                                        eval_py_env.set_z(i)
                                        print("stuck with skill {}".format(i))
                            if state_noise:
                                _time_step.observation[
                                    "observation"] = np.random.normal(
                                        _time_step.observation["observation"],
                                        scale=0.25)
                            if epsilon_greedy:
                                sample = np.random.random_sample()
                                if sample < epsilon:
                                    action = tensor_spec.sample_spec_nest(
                                        action_spec).numpy()
                                else:
                                    action = eval_policy.action(
                                        _time_step).action.numpy()
                            else:
                                action = eval_policy.action(
                                    _time_step).action.numpy()
                            if steps % action_interval == 0:
                                for index in range(action.shape[0]):
                                    actions_list[index].append(action[index])
                            if action_noise:
                                noisy_action = np.random.normal(action,
                                                                scale=1.0)
                                _time_step = eval_py_env.step(noisy_action)
                            else:
                                _time_step = eval_py_env.step(action)
                            if steps % print_interval == 0:
                                print("{} {}".format(
                                    _time_step.observation["observation"][:2]
                                    [0], _time_step.observation["observation"]
                                    [:2][1]))
                                xs_list.append(
                                    _time_step.observation["observation"][:2]
                                    [0])
                                ys_list.append(
                                    _time_step.observation["observation"][:2]
                                    [1])
                            video.append_data(eval_py_env.render())
                            steps += 1
                    embed_mp4(video_filename)
                    if plot_actions:
                        plt.ylim(-1, 1)
                        # Use a separate loop variable so the outer skill index
                        # `i` (used by color_wheel below) is not clobbered.
                        for action_idx in range(8):
                            plt.plot(range(int(200 / action_interval)),
                                     actions_list[action_idx])
                    plt.xlim(-25, 25)
                    plt.ylim(-25, 25)
                    plt.plot(xs_list, ys_list, color_wheel[i])
            plt.savefig(skill_plot_filename)
            if plot_actions: plt.savefig(action_plot_filename)
    else:
        print("Rendering plane skills")
        skill_path = root_dir + "/skills"
        if not path.exists(skill_path):
            os.mkdir(skill_path)

        for i in range(0, dim_z, 1):
            path_len = 10
            num_eps = 1
            xs_list = []
            ys_list = []
            skill_plot_filename = root_dir + '/skills/' + str(
                i + 1) + eval_env_name[:-3] + '.png'
            for _ in range(num_eps):
                if categorical:
                    eval_py_env.set_z(i)
                    print("skill {}".format(i + 1))
                else:
                    skill = [0] * dim_z
                    skill[i] = 1
                    eval_py_env.set_z(skill)
                _time_step = eval_py_env.reset()
                eval_py_env.render()
                steps = 0
                while steps < path_len:
                    xs_list.append(_time_step.observation["observation"][0])
                    ys_list.append(_time_step.observation["observation"][1])
                    action_step = eval_policy.action(_time_step)
                    _time_step = eval_py_env.step(action_step.action.numpy())
                    eval_py_env.render()
                    steps += 1
            plt.xlim(-100, 100)
            plt.ylim(-100, 100)
            plt.plot(xs_list, ys_list)
        plt.savefig(skill_plot_filename)
Code example #20
  def __init__(self,
               input_tensor_spec,
               output_tensor_spec,
               input_fc_layer_params=(200, 100),
               output_fc_layer_params=(200, 100),
               conv_layer_params=None,
               lstm_size=(40,),
               activation_fn=tf.keras.activations.relu,
               categorical_projection_net=_categorical_projection_net,
               normal_projection_net=_normal_projection_net,
               name='ActorDistributionRnnNetwork'):
    """Creates an instance of `ActorDistributionRnnNetwork`.

    Args:
      input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
        input.
      output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing
        the output.
      input_fc_layer_params: Optional list of fully_connected parameters, where
        each item is the number of units in the layer. This is applied before
        the LSTM cell.
      output_fc_layer_params: Optional list of fully_connected parameters, where
        each item is the number of units in the layer. This is applied after the
        LSTM cell.
      conv_layer_params: Optional list of convolution layers parameters, where
        each item is a length-three tuple indicating (filters, kernel_size,
        stride).
      lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
      activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
      categorical_projection_net: Callable that generates a categorical
        projection network to be called with some hidden state and the
        outer_rank of the state.
      normal_projection_net: Callable that generates a normal projection network
        to be called with some hidden state and the outer_rank of the state.
      name: A string representing name of the network.

    Raises:
      ValueError: If `input_tensor_spec` contains more than one observation.
    """
    if len(tf.nest.flatten(input_tensor_spec)) > 1:
      raise ValueError('Only a single observation is supported by this network')

    input_layers = utils.mlp_layers(
        conv_layer_params,
        input_fc_layer_params,
        activation_fn=activation_fn,
        kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
        name='input_mlp')

    # Create RNN cell
    if len(lstm_size) == 1:
      cell = tf.keras.layers.LSTMCell(lstm_size[0])
    else:
      cell = tf.keras.layers.StackedRNNCells(
          [tf.keras.layers.LSTMCell(size) for size in lstm_size])

    state_spec = tf.nest.map_structure(
        functools.partial(
            tensor_spec.TensorSpec, dtype=tf.float32,
            name='network_state_spec'), cell.state_size)

    output_layers = utils.mlp_layers(
        fc_layer_params=output_fc_layer_params, name='output')

    projection_networks = []
    for single_output_spec in tf.nest.flatten(output_tensor_spec):
      if tensor_spec.is_discrete(single_output_spec):
        projection_networks.append(
            categorical_projection_net(single_output_spec))
      else:
        projection_networks.append(normal_projection_net(single_output_spec))

    projection_distribution_specs = [
        proj_net.output_spec for proj_net in projection_networks
    ]
    output_spec = tf.nest.pack_sequence_as(output_tensor_spec,
                                           projection_distribution_specs)

    super(ActorDistributionRnnNetwork, self).__init__(
        input_tensor_spec=input_tensor_spec,
        state_spec=state_spec,
        output_spec=output_spec,
        name=name)

    self._conv_layer_params = conv_layer_params
    self._input_layers = input_layers
    self._dynamic_unroll = dynamic_unroll_layer.DynamicUnroll(cell)
    self._output_layers = output_layers
    self._projection_networks = projection_networks
    self._output_tensor_spec = output_tensor_spec
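A minimal usage sketch for the network above, assuming the public tf_agents.networks and tf_agents.specs modules; the toy observation and action specs are invented here purely for illustration and are not part of the original example.

import tensorflow as tf
from tf_agents.networks import actor_distribution_rnn_network
from tf_agents.specs import tensor_spec

# Toy specs (assumed): a 4-dim observation and a 2-dim continuous action in [-1, 1].
observation_spec = tensor_spec.TensorSpec([4], tf.float32, name='observation')
action_spec = tensor_spec.BoundedTensorSpec(
    [2], tf.float32, minimum=-1.0, maximum=1.0, name='action')

# Mirrors the constructor defaults shown in the example above.
actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
    observation_spec,
    action_spec,
    input_fc_layer_params=(200, 100),
    lstm_size=(40,),
    output_fc_layer_params=(200, 100))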
Code Example #21
    def __init__(
            self,
            time_step_spec: Optional[ts.TimeStep],
            action_spec: Optional[NestedBoundedTensorSpec],
            scalarizer: multi_objective_scalarizer.Scalarizer,
            objective_networks: Sequence[Network],
            observation_and_action_constraint_splitter: types.Splitter = None,
            accepts_per_arm_features: bool = False,
            emit_policy_info: Tuple[Text] = (),
            name: Optional[Text] = None):
        """Builds a GreedyMultiObjectiveNeuralPolicy based on multiple networks.

    This policy takes an iterable of `tf_agents.Network`, each responsible for
    predicting a specific objective, along with a `Scalarizer` object to
    generate an action by maximizing the scalarized objective, i.e., the output
    of the `Scalarizer` applied to the multiple predicted objectives by the
    networks.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      scalarizer: A
       `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
        object that implements scalarization of multiple objectives into a
        single scalar reward.
      objective_networks: A Sequence of `tf_agents.network.Network` objects to
        be used by the policy. Each network will be called with
        call(observation, step_type) and is expected to provide a prediction for
        a specific objective for all actions.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the network and 2) the
        mask.  The mask should be a 0-1 `Tensor` of shape `[batch_size,
        num_actions]`. This function should also work with a `TensorSpec` as
        input, and should output `TensorSpec` objects for the observation and
        mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
      NotImplementedError: If `action_spec` is not a `BoundedTensorSpec` of type
        int32 and shape ().
      ValueError: If `objective_networks` has fewer than two networks.
      ValueError: If `accepts_per_arm_features` is true but `time_step_spec` is
        None.
    """
        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'action_spec can only contain a single BoundedTensorSpec.')

        action_spec = flat_action_spec[0]
        if (not tensor_spec.is_bounded(action_spec)
                or not tensor_spec.is_discrete(action_spec)
                or action_spec.shape.rank > 1
                or action_spec.shape.num_elements() != 1):
            raise NotImplementedError(
                'action_spec must be a BoundedTensorSpec of type int32 and shape (). '
                'Found {}.'.format(action_spec))
        self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
        self._action_offset = action_spec.minimum
        policy_state_spec = []
        for network in objective_networks:
            policy_state_spec.append(network.state_spec)
            network.create_variables()
        self._objective_networks = objective_networks
        self._scalarizer = scalarizer
        self._num_objectives = len(self._objective_networks)
        if self._num_objectives < 2:
            raise ValueError(
                'Number of objectives should be at least two, but found to be {}'
                .format(self._num_objectives))

        self._emit_policy_info = emit_policy_info
        predicted_rewards_mean = ()
        if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
            predicted_rewards_mean = tensor_spec.TensorSpec(
                [self._num_objectives, self._expected_num_actions])
        bandit_policy_type = ()
        if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
            bandit_policy_type = (
                policy_utilities.create_bandit_policy_type_tensor_spec(
                    shape=[1]))
        if accepts_per_arm_features:
            if time_step_spec is None:
                raise ValueError(
                    'time_step_spec should not be None for per-arm-features policies, '
                    'but found to be.')
            # The features for the chosen arm is saved to policy_info.
            chosen_arm_features_info = (
                policy_utilities.create_chosen_arm_features_info_spec(
                    time_step_spec.observation,
                    observation_and_action_constraint_splitter))
            info_spec = policy_utilities.PerArmPolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type,
                chosen_arm_features=chosen_arm_features_info)
        else:
            info_spec = policy_utilities.PolicyInfo(
                predicted_rewards_mean=predicted_rewards_mean,
                bandit_policy_type=bandit_policy_type)

        self._accepts_per_arm_features = accepts_per_arm_features

        super(GreedyMultiObjectiveNeuralPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=policy_state_spec,
                             clip=False,
                             info_spec=info_spec,
                             emit_log_probability='log_probability'
                             in emit_policy_info,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
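A small illustrative sketch, not taken from the original source, of the single scalar int32 action spec this policy accepts and of the quantities the constructor derives from it:

import tensorflow as tf
from tf_agents.specs import tensor_spec

# A 5-arm bandit action spec: bounded, discrete, scalar shape.
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=4, name='action')

assert tensor_spec.is_bounded(action_spec)
assert tensor_spec.is_discrete(action_spec)
expected_num_actions = action_spec.maximum - action_spec.minimum + 1  # 5 arms
action_offset = action_spec.minimum  # actions are emitted relative to this offset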
Code Example #22
def map_proj(spec):
    if tensor_spec.is_discrete(spec):
        return discrete_projection_net(spec)
    else:
        return continuous_projection_net(spec)
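A hedged sketch of how a helper like map_proj is typically wired up. The factories discrete_projection_net and continuous_projection_net are assumed here to wrap the standard TF-Agents projection networks, and the example action spec is invented for illustration; map_proj itself is the function from the example above.

import tensorflow as tf
from tf_agents.networks import categorical_projection_network
from tf_agents.networks import normal_projection_network
from tf_agents.specs import tensor_spec

# Assumed factories for the two projection-network kinds referenced by map_proj.
def discrete_projection_net(spec):
    return categorical_projection_network.CategoricalProjectionNetwork(spec)

def continuous_projection_net(spec):
    return normal_projection_network.NormalProjectionNetwork(spec)

# Build one projection network per entry of a (possibly nested) action spec.
action_spec = {
    'steer': tensor_spec.BoundedTensorSpec((1,), tf.float32, minimum=-1.0, maximum=1.0),
    'gear': tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2),
}
projection_networks = tf.nest.map_structure(map_proj, action_spec)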
Code Example #23
def train_eval(
        root_dir,
        env_name=None,
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        inference_fc_layers=(200, 100),
        use_rnns=None,
        dim_z=4,
        categorical=True,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        entropy_regularization=None,
        kl_posteriors_penalty=None,
        mock_inference=None,
        mock_reward=None,
        l2_distance=None,
        rl_steps=None,
        inference_steps=None,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=1000,
        # Params for summaries and logging
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=10000,
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)

        def _env_load_fn(env_name):
            diayn_wrapper = (
                lambda x: diayn_gym_env.DiaynGymEnv(x, dim_z, categorical))
            return env_load_fn(
                env_name,
                gym_env_wrappers=[diayn_wrapper],
            )

        eval_tf_env = tf_py_environment.TFPyEnvironment(_env_load_fn(env_name))
        if num_parallel_environments == 1:
            py_env = _env_load_fn(env_name)
        else:
            py_env = parallel_py_environment.ParallelPyEnvironment(
                [lambda: _env_load_fn(env_name)] * num_parallel_environments)
        tf_env = tf_py_environment.TFPyEnvironment(py_env)

        augmented_time_step_spec = tf_env.time_step_spec()
        augmented_observation_spec = augmented_time_step_spec.observation
        observation_spec = augmented_observation_spec['observation']
        z_spec = augmented_observation_spec['z']
        reward_spec = augmented_time_step_spec.reward
        action_spec = tf_env.action_spec()
        time_step_spec = ts.time_step_spec(observation_spec)
        infer_from_com = False
        if env_name == "AntRandGoalEval-v1":
            infer_from_com = True
        if infer_from_com:
            input_inference_spec = tspec.BoundedTensorSpec(
                shape=[2],
                dtype=tf.float64,
                minimum=-1.79769313e+308,
                maximum=1.79769313e+308,
                name='body_com')
        else:
            input_inference_spec = observation_spec

        if tensor_spec.is_discrete(z_spec):
            _preprocessing_combiner = OneHotConcatenateLayer(dim_z)
        else:
            _preprocessing_combiner = DictConcatenateLayer()

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                augmented_observation_spec,
                action_spec,
                preprocessing_combiner=_preprocessing_combiner,
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                augmented_observation_spec,
                preprocessing_combiner=_preprocessing_combiner,
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                augmented_observation_spec,
                action_spec,
                preprocessing_combiner=_preprocessing_combiner,
                fc_layer_params=actor_fc_layers,
                name="actor_net")
            value_net = value_network.ValueNetwork(
                augmented_observation_spec,
                preprocessing_combiner=_preprocessing_combiner,
                fc_layer_params=value_fc_layers,
                name="critic_net")
        inference_net = actor_distribution_network.ActorDistributionNetwork(
            input_tensor_spec=input_inference_spec,
            output_tensor_spec=z_spec,
            fc_layer_params=inference_fc_layers,
            continuous_projection_net=normal_projection_net,
            name="inference_net")

        tf_agent = ppo_diayn_agent.PPODiaynAgent(
            augmented_time_step_spec,
            action_spec,
            z_spec,
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            inference_net=inference_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step,
            entropy_regularization=entropy_regularization,
            kl_posteriors_penalty=kl_posteriors_penalty,
            mock_inference=mock_inference,
            mock_reward=mock_reward,
            infer_from_com=infer_from_com,
            l2_distance=l2_distance,
            rl_steps=rl_steps,
            inference_steps=inference_steps)
        tf_agent.initialize()

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(
                batch_size=num_parallel_environments),
            tf_metrics.AverageEpisodeLengthMetric(
                batch_size=num_parallel_environments),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        actor_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            root_dir, 'diayn_actor'),
                                                 actor_net=actor_net,
                                                 global_step=global_step)
        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'diayn_policy'),
                                                  policy=eval_policy,
                                                  global_step=global_step)
        saved_model = policy_saver.PolicySaver(eval_policy,
                                               train_step=global_step)
        rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            root_dir, 'diayn_replay_buffer'),
                                              max_to_keep=1,
                                              replay_buffer=replay_buffer)
        inference_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(root_dir, 'diayn_inference'),
            inference_net=inference_net,
            global_step=global_step)

        actor_checkpointer.initialize_or_restore()
        train_checkpointer.initialize_or_restore()
        rb_checkpointer.initialize_or_restore()
        inference_checkpointer.initialize_or_restore()

        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        # option_length = 200
        # if env_name == "Plane-v1":
        #     option_length = 10
        # dataset = replay_buffer.as_dataset(
        #         num_parallel_calls=3, sample_batch_size=num_parallel_environments,
        #         num_steps=option_length)
        # iterator_dataset = iter(dataset)

        def train_step():
            trajectories = replay_buffer.gather_all()
            #   trajectories, _ = next(iterator_dataset)
            return tf_agent.train(experience=trajectories)

        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause for slowdown was identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)
            train_step = common.function(train_step)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            if global_step_val % eval_interval == 0:
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )

            start_time = time.time()
            collect_driver.run()
            collect_time += time.time() - start_time

            start_time = time.time()
            total_loss, _ = train_step()
            replay_buffer.clear()
            train_time += time.time() - start_time

            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = {}, train_time = {}'.format(
                    collect_time, train_time))
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                                data=steps_per_sec,
                                                step=global_step)

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)
                    inference_checkpointer.save(global_step=global_step_val)
                    actor_checkpointer.save(global_step=global_step_val)
                    rb_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)
                    saved_model_path = os.path.join(
                        saved_model_dir,
                        'policy_' + ('%d' % global_step_val).zfill(9))
                    saved_model.save(saved_model_path)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
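A hypothetical invocation of the train_eval function above; the root directory, environment name, and step budget are placeholders rather than values taken from the original source.

if __name__ == '__main__':
    # Placeholder arguments; adjust paths, the environment, and budgets to your setup.
    train_eval(
        root_dir='/tmp/ppo_diayn',
        env_name='HalfCheetah-v2',
        use_rnns=False,
        dim_z=4,
        categorical=True,
        num_environment_steps=1000000,
        eval_interval=1000)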