Exemple #1
0
 def time_step_spec(self):
     """Build the spec describing one ``ActionTimeStep``.

     Returns:
         ActionTimeStep: ``step_type`` and ``env_id`` are scalar
         ``tf.int32`` specs, ``reward`` and ``discount`` are scalar
         ``tf.float32`` specs, and ``observation`` / ``prev_action`` are
         this object's ``observation_spec`` / ``action_spec`` attributes.
     """

     def _scalar(dtype):
         # A fresh rank-0 spec per field so no two fields share an object.
         return tf.TensorSpec((), dtype)

     fields = dict(
         step_type=_scalar(tf.int32),
         reward=_scalar(tf.float32),
         discount=_scalar(tf.float32),
         observation=self.observation_spec,
         prev_action=self.action_spec,
         env_id=_scalar(tf.int32),
     )
     return ActionTimeStep(**fields)
Exemple #2
0
    def __init__(self,
                 num_of_skills,
                 feature_spec,
                 hidden_size=256,
                 reward_adapt_speed=8.0,
                 encoding_net: Network = None,
                 discriminator_net: Network = None,
                 name="DIAYNAlgorithm"):
        """Create a DIAYNAlgorithm.

        Args:
            num_of_skills (int): number of skills. It sets the size of the
                train state spec and the output size of the default
                discriminator.
            feature_spec (TensorSpec): spec of the feature fed to the
                discriminator. It must flatten to exactly one spec; nested
                specs are rejected.
            hidden_size (int|tuple): size of hidden layer(s). Only used when
                ``discriminator_net`` is None, in which case a default
                discriminator with these hidden layers is built.
            reward_adapt_speed (float): how fast to adapt the reward
                normalizer. Roughly speaking, the statistics for the
                normalization are calculated mostly from the most recent
                T/speed samples, where T is the total number of samples.
            encoding_net (Network): network for encoding observation into a
                latent feature specified by ``feature_spec``. Its input is
                the same as the input of this algorithm.
            discriminator_net (Network): network for predicting the skill
                labels based on the observation.
            name (str): name of this algorithm.
        """
        super().__init__(
            train_state_spec=tf.TensorSpec((num_of_skills, )), name=name)

        num_flat_specs = len(tf.nest.flatten(feature_spec))
        assert num_flat_specs == 1, (
            "DIAYNAlgorithm doesn't support nested feature_spec")

        self._num_skills = num_of_skills
        self._encoding_net = encoding_net

        if discriminator_net is None:
            # A bare int means a single hidden layer of that width.
            if isinstance(hidden_size, int):
                layer_sizes = (hidden_size, )
            else:
                layer_sizes = hidden_size
            discriminator_net = EncodingNetwork(
                name="discriminator_net",
                input_tensor_spec=feature_spec,
                fc_layer_params=layer_sizes,
                last_layer_size=self._num_skills,
                last_kernel_initializer=tf.initializers.Zeros())
        self._discriminator_net = discriminator_net

        self._reward_normalizer = ScalarAdaptiveNormalizer(
            speed=reward_adapt_speed)
Exemple #3
0
    def _prepare_specs(self, algorithm):
        """Derive and store the training-info spec for ``algorithm``.

        Runs one ``algorithm.train_step`` on the initial time step and
        initial state, turns the resulting ``info`` tensors into specs, and
        combines them with the environment's specs and the algorithm's
        action-distribution parameter specs. The result is stored in
        ``self._training_info_spec``.

        Args:
            algorithm: object providing ``action_distribution_spec`` and
                ``train_step(time_step, state)``.
        """

        def _to_params_spec(dist_spec):
            return dist_spec.input_params_spec

        def _strip_leading_dim(tensor):
            # Drops dimension 0 of each info tensor
            # (presumably the batch dimension -- TODO confirm).
            return tf.TensorSpec(tensor.shape[1:], tensor.dtype)

        env_step_spec = self._env.time_step_spec()
        dist_params_spec = tf.nest.map_structure(
            _to_params_spec, algorithm.action_distribution_spec)

        first_step = self.get_initial_time_step()
        step_output = algorithm.train_step(first_step, self._initial_state)
        info_spec = tf.nest.map_structure(_strip_leading_dim,
                                          step_output.info)

        self._training_info_spec = make_training_info(
            action_distribution=dist_params_spec,
            action=self._env.action_spec(),
            step_type=env_step_spec.step_type,
            reward=env_step_spec.reward,
            discount=env_step_spec.discount,
            info=info_spec)
Exemple #4
0
    def __init__(self,
                 observation_spec,
                 num_of_goals,
                 name="RandomCategoricalGoalGenerator"):
        """Create a RandomCategoricalGoalGenerator.

        The action spec is a float32 vector of length ``num_of_goals``
        bounded to [0, 1], and the train state holds a goal vector of the
        same length.

        Args:
            observation_spec (nested TensorSpec): representing the observations.
            num_of_goals (int): total number of goals the agent can sample from
            name (str): name of the algorithm
        """
        goal_shape = (num_of_goals, )
        goal_state_spec = GoalState(goal=tf.TensorSpec(goal_shape))
        goal_action_spec = tensor_spec.BoundedTensorSpec(
            shape=goal_shape, dtype=tf.float32, minimum=0., maximum=1.)

        super().__init__(
            observation_spec=observation_spec,
            action_spec=goal_action_spec,
            train_state_spec=goal_state_spec,
            name=name)

        self._num_of_goals = num_of_goals
        # One weight of 1 per goal.
        self._p_goal = tf.ones(self._num_of_goals)
Exemple #5
0
    def __init__(self,
                 batch_size,
                 observation_spec,
                 action_spec,
                 soi_spec,
                 soc_spec,
                 split_observation_fn: Callable,
                 network: Network = None,
                 mi_r_scale=5000.0,
                 hidden_size=128,
                 buffer_size=100,
                 n_objects=1,
                 name="MISCAlgorithm"):
        """Create an MISCAlgorithm.

        Args:
            batch_size (int): batch size; also the leading dimension of the
                trajectory spec stored in the data buffer
            observation_spec (tf.TensorSpec): observation size (must not be
                nested)
            action_spec (tf.TensorSpec): action size (must not be nested)
            soi_spec (tf.TensorSpec): state of interest size
            soc_spec (tf.TensorSpec): state of context size
            split_observation_fn (Callable): split observation function.
                The input is observation and action concatenated.
                The outputs are the context states and states of interest
            network (Network): network for estimating mutual information
                (MI). If None, a default network taking
                ``[soc_spec, soi_spec]`` as input is built.
            mi_r_scale (float): scale factor of MI estimation
            hidden_size (int): number of hidden units of the default network
            buffer_size (int): capacity of the data buffer storing the
                trajectories for training the Mutual Information Neural
                Estimator
            n_objects: number of objects for estimating the mutual
                information reward
            name (str): the algorithm name, "MISCAlgorithm"
        """
        super(MISCAlgorithm, self).__init__(
            train_state_spec=[observation_spec, action_spec], name=name)

        assert isinstance(observation_spec, tf.TensorSpec), (
            "does not support nested observation_spec")
        assert isinstance(action_spec, tf.TensorSpec), (
            "does not support nested action_spec")

        if network is None:
            default_inputs = [soc_spec, soi_spec]
            network = EncodingNetwork(
                input_tensor_spec=default_inputs,
                fc_layer_params=(hidden_size, ),
                activation_fn='relu',
                last_layer_size=1,
                last_activation_fn='tanh')
        self._network = network

        # A trajectory row is the first observation dimension plus the first
        # action dimension wide.
        obs_width = observation_spec.shape.as_list()[0]
        act_width = action_spec.shape.as_list()[0]
        self._traj_spec = tf.TensorSpec(
            shape=[batch_size, obs_width + act_width],
            dtype=observation_spec.dtype)

        self._buffer_size = buffer_size
        self._buffer = DataBuffer(self._traj_spec, capacity=self._buffer_size)

        self._mi_r_scale = mi_r_scale
        self._n_objects = n_objects
        self._split_observation_fn = split_observation_fn
        self._batch_size = batch_size
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): list of fc layers parameters for actor network
        value_fc_layers (list[int]): list of fc layers parameters for value network
        encoding_conv_layers (list[int]): list of convolution layers parameters for encoding network
        encoding_fc_layers (list[int]): list of fc layers parameters for encoding network
        use_rnns (bool): True if rnn should be used
        use_icm (bool): True if intrinsic curiosity module should be used
        learning_rate (float): learning rate
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries)
        debug_summaries (bool): True if debug summaries should be created.

    Returns:
        An instance of ``algorithm_class`` built from the created networks,
        optimizer and (optional) intrinsic curiosity module.

    Raises:
        ValueError: if ``use_icm`` is True and an encoding network is
            requested with ``encoding_conv_layers`` only. The ICM feature
            size is taken from the last entry of ``encoding_fc_layers``, so
            that sequence must not be empty in this case.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(observation_spec,
                                    input_fc_layer_params=value_fc_layers,
                                    output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(observation_spec,
                                             action_spec,
                                             fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(observation_spec,
                                 fc_layer_params=value_fc_layers)

    # The encoding network is only consumed by the ICM below.
    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=observation_spec,
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        # Compare against None explicitly: truthiness of a network object
        # is not a reliable "was it created" signal.
        if encoding_net is None:
            feature_spec = observation_spec
        else:
            if not encoding_fc_layers:
                # Previously this surfaced as an opaque IndexError from
                # ``encoding_fc_layers[-1]``.
                raise ValueError(
                    "use_icm=True with an encoding network requires a "
                    "non-empty encoding_fc_layers: the ICM feature size is "
                    "taken from its last entry")
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1], ),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(action_spec,
                           feature_spec,
                           encoding_net=encoding_net)

    algorithm = algorithm_class(action_spec=action_spec,
                                actor_network=actor_net,
                                value_network=value_net,
                                intrinsic_curiosity_module=icm,
                                loss_class=loss_class,
                                optimizer=optimizer,
                                debug_summaries=debug_summaries)

    return algorithm
Exemple #7
0
    def __init__(self,
                 target_net: Network,
                 predictor_net: Network,
                 encoder_net: Network = None,
                 reward_adapt_speed=None,
                 observation_adapt_speed=None,
                 observation_spec=None,
                 learning_rate=None,
                 clip_value=-1.0,
                 stacked_frames=True,
                 name="RNDAlgorithm"):
        """Create an RNDAlgorithm.

        Args:
            target_net (Network): the random fixed network that generates
                target state embeddings to be fitted
            predictor_net (Network): the trainable network that predicts
                target embeddings. If fully trained given enough data,
                predictor_net will become target_net eventually.
            encoder_net (Network): a shared network that encodes observation
                to embeddings before being input to `target_net` or
                `predictor_net`; its parameters are not trainable
            reward_adapt_speed (float): speed for adaptively normalizing
                intrinsic rewards; if None, no normalizer is used
            observation_adapt_speed (float): speed for adaptively normalizing
                observations; if None, no observation normalizer is created.
                Requires `observation_spec` when given.
            observation_spec (TensorSpec): the observation tensor spec; used
                for creating an adaptive observation normalizer
            learning_rate (float): the learning rate for prediction cost; if
                None, no optimizer is created here and a global learning
                rate will be used
            clip_value (float): if positive, the rewards will be clipped to
                [-clip_value, clip_value]; only stored when
                `reward_adapt_speed` is given
            stacked_frames (bool): a boolean flag indicating whether the input
                observation has stacked frames. If True, then we only keep the
                last frame for RND to make predictions on, as suggested by the
                original paper Burda et al. 2019. For Atari games, this flag is
                usually True (`frame_stacking==4`).
            name (str): name of this algorithm
        """
        optimizer = (tf.optimizers.Adam(learning_rate=learning_rate)
                     if learning_rate is not None else None)
        super(RNDAlgorithm, self).__init__(
            train_state_spec=(), optimizer=optimizer, name=name)

        self._encoder_net = encoder_net
        self._target_net = target_net  # fixed
        self._predictor_net = predictor_net  # trainable

        if reward_adapt_speed is None:
            self._reward_normalizer = None
        else:
            self._reward_normalizer = ScalarAdaptiveNormalizer(
                speed=reward_adapt_speed)
            self._reward_clip_value = clip_value

        self._stacked_frames = stacked_frames
        if stacked_frames and (observation_spec is not None):
            # Frames are assumed to be stacked along the last dimension, so
            # the spec for a single (last) frame has size 1 there.
            full_shape = observation_spec.shape
            single_frame_shape = full_shape[:-1] + (1, )
            observation_spec = tf.TensorSpec(
                shape=single_frame_shape, dtype=observation_spec.dtype)

        # The paper suggests to also normalize observations, because the
        # original observation subspace might be small and the target network
        # will yield random embeddings that are indistinguishable
        if observation_adapt_speed is None:
            self._observation_normalizer = None
        else:
            assert observation_spec is not None, (
                "Observation normalizer requires its input tensor spec!")
            self._observation_normalizer = AdaptiveNormalizer(
                tensor_spec=observation_spec, speed=observation_adapt_speed)