Example #1
    def _setup(self, beta):
        """Sets up the reward normalizer and a distribution strategy to run."""
        reward_normalizer = popart.PopArt(running_statistics.EMAMeanStd(beta))
        strategy = test_utils.create_distribution_strategy(
            use_tpu=self.primary_device == 'TPU')
        self.assertEqual(strategy.num_replicas_in_sync, 2)
        with strategy.scope():
            reward_normalizer.init()
        return reward_normalizer, strategy
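A call site for this helper is not shown in the example; the sketch below is one illustrative way to exercise it, assuming the surrounding class is a TF test case and using an arbitrary beta value.

    def test_setup_creates_normalizer(self):
        # Illustrative test; the beta value is arbitrary, not from the original file.
        reward_normalizer, strategy = self._setup(beta=1e-4)
        # init() already ran under strategy.scope() inside _setup, so only
        # basic invariants are checked here.
        self.assertIsNotNone(reward_normalizer)
        self.assertEqual(strategy.num_replicas_in_sync, 2)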
Example #2
    def setUp(self):
        super().setUp()
        reward_normalizer = popart.PopArt(running_statistics.AverageMeanStd())
        reward_normalizer.init()
        self._loss = generalized_onpolicy_loss.GeneralizedOnPolicyLoss(
            _DummyAgent(), reward_normalizer,
            parametric_distribution.normal_tanh_distribution(
                _NUM_ACTIONS).create_dist, advantages.GAE(lambda_=0.95),
            _DummyPolicyLoss(), 0.97, _DummyRegularizationLoss(), 0.2, 0.5)
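The loss object built in setUp is stored on the test case; the snippet below is an illustrative sanity check only, assuming the surrounding class is a TF test case (the real tests presumably run self._loss on unroll data, which this example does not show).

    def test_setup_builds_loss(self):
        # Illustrative check only: confirms setUp constructed the loss object.
        self.assertIsInstance(
            self._loss, generalized_onpolicy_loss.GeneralizedOnPolicyLoss)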
Example #3
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_entropy=0.0,
                 f_reg=1.0,
                 reward_bonus=5.0,
                 num_augmentations=1,
                 rep_learn_keywords='outer',
                 env_name='',
                 batch_size=256,
                 n_quantiles=5,
                 temp=0.1,
                 num_training_levels=200,
                 latent_dim=256,
                 n_levels_nce=5,
                 popart_norm_beta=0.1):
        """Creates networks.

    Args:
      observation_spec: Environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      alpha_lr: Temperature learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      target_entropy: Target entropy.
      f_reg: Critic regularization weight.
      reward_bonus: Bonus added to the rewards.
      num_augmentations: Number of DrQ augmentations (crops).
      rep_learn_keywords: Representation learning losses to add (see below).
      env_name: Environment name.
      batch_size: Batch size.
      n_quantiles: Number of GVF quantiles.
      temp: Temperature of the NCE softmax.
      num_training_levels: Number of training MDPs (200 for Procgen).
      latent_dim: Latent dimension of the auxiliary MLPs.
      n_levels_nce: Number of MDPs to apply the contrastive loss to.
      popart_norm_beta: PopArt normalization constant.

    For `rep_learn_keywords`, pick from:
      stop_grad_FQI: stop gradients through the TD/FQI critic updates.
      linear_Q: use a linear critic.

      successor_features: uses ||SF|| as the cumulant.
      gvf_termination: uses +1 if done else 0 as the cumulant.
      gvf_action_count: uses state-conditional action counts as the cumulant.

      nce: uses the multi-class dot-product InfoNCE objective.
      cce: uses the MoCo categorical cross-entropy objective.
      energy: uses SimCLR + pairwise GVF distance (not fully tested).

    If no cumulant is specified, the reward is used as the default cumulant.
    """
        del actor_lr, critic_lr, alpha_lr, target_entropy
        self.action_spec = action_spec
        self.num_augmentations = num_augmentations
        self.rep_learn_keywords = rep_learn_keywords.split('__')
        self.batch_size = batch_size
        self.env_name = env_name
        self.stop_grad_fqi = 'stop_grad_FQI' in self.rep_learn_keywords
        critic_kwargs = {'hidden_dims': (1024, 1024)}
        self.latent_dim = latent_dim
        self.n_levels_nce = n_levels_nce
        hidden_dims = hidden_dims_per_level = (self.latent_dim,
                                               self.latent_dim)
        self.num_training_levels = int(num_training_levels)
        self.n_quantiles = n_quantiles
        self.temp = temp

        # Make 2 sets of weights:
        # - Critic
        # - Critic (target)
        # Optionally, make a 3rd set for per-level critics

        if observation_spec.shape == (64, 64, 3):
            # IMPALA for Procgen
            def conv_stack():
                return make_impala_cnn_network(depths=[16, 32, 32],
                                               use_batch_norm=False,
                                               dropout_rate=0.)

            state_dim = 256
        else:
            # Reduced architecture for DMC
            def conv_stack():
                return ConvStack(observation_spec.shape)

            state_dim = 50

        conv_stack_critic = conv_stack()
        conv_target_stack_critic = conv_stack()

        if observation_spec.shape == (64, 64, 3):
            conv_stack_critic.output_size = state_dim
            conv_target_stack_critic.output_size = state_dim
        critic_kwargs['encoder'] = ImageEncoder(conv_stack_critic,
                                                feature_dim=state_dim,
                                                bprop_conv_stack=True)
        # Note: the target critic does not share any weights.
        critic_kwargs['encoder_target'] = ImageEncoder(
            conv_target_stack_critic,
            feature_dim=state_dim,
            bprop_conv_stack=True)

        conv_stack_critic_per_level = conv_stack()
        conv_target_stack_critic_per_level = conv_stack()
        if observation_spec.shape == (64, 64, 3):
            conv_stack_critic_per_level.output_size = state_dim
            conv_target_stack_critic_per_level.output_size = state_dim

        self.encoder_per_level = ImageEncoder(conv_stack_critic_per_level,
                                              feature_dim=state_dim,
                                              bprop_conv_stack=True)
        self.encoder_per_level_target = ImageEncoder(
            conv_target_stack_critic_per_level,
            feature_dim=state_dim,
            bprop_conv_stack=True)

        criticCL.soft_update(self.encoder_per_level,
                             self.encoder_per_level_target,
                             tau=1.0)

        if self.num_augmentations == 0:
            dummy_state = tf.constant(
                np.zeros([1] + list(observation_spec.shape)))
        else:  # account for the +4 padding everywhere, followed by 68x68 crops
            dummy_state = tf.constant(np.zeros(shape=[1, 68, 68, 3]))
        dummy_enc = critic_kwargs['encoder'](dummy_state)

        @tf.function
        def init_models():
            """This function initializes all auxiliary networks (state and action encoders) with dummy input (Procgen-specific, 68x68x3, 15 actions).
      """
            critic_kwargs['encoder'](dummy_state)
            critic_kwargs['encoder_target'](dummy_state)
            self.encoder_per_level(dummy_state)
            self.encoder_per_level_target(dummy_state)

        init_models()

        action_dim = action_spec.maximum.item() + 1

        self.action_dim = action_dim
        self.discount = discount
        self.tau = tau
        self.reg = f_reg
        self.reward_bonus = reward_bonus

        self.critic = criticCL.Critic(
            state_dim,
            action_dim,
            hidden_dims=hidden_dims,
            encoder=critic_kwargs['encoder'],
            discrete_actions=True,
            linear='linear_Q' in self.rep_learn_keywords)
        self.critic_target = criticCL.Critic(
            state_dim,
            action_dim,
            hidden_dims=hidden_dims,
            encoder=critic_kwargs['encoder_target'],
            discrete_actions=True,
            linear='linear_Q' in self.rep_learn_keywords)

        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
        self.task_critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=3e-4)
        self.br_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

        if 'cce' in self.rep_learn_keywords:
            self.classifier = tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(self.latent_dim, use_bias=True),
                    tf.keras.layers.ReLU(),
                    tf.keras.layers.Dense(self.n_quantiles, use_bias=True)
                ],
                name='classifier')
        elif 'nce' in self.rep_learn_keywords:
            self.embedding = tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(self.latent_dim, use_bias=True),
                    tf.keras.layers.ReLU(),
                    tf.keras.layers.Dense(self.latent_dim, use_bias=True)
                ],
                name='embedding')

        # This snippet initializes all auxiliary networks (state and action
        # encoders) with dummy input (Procgen-specific: 68x68x3, 15 actions).
        dummy_state = tf.zeros((1, 68, 68, 3), dtype=tf.float32)
        phi_s = self.critic.encoder(dummy_state)
        phi_a = tf.eye(action_dim, dtype=tf.float32)
        if 'linear_Q' in self.rep_learn_keywords:
            _ = self.critic.critic1.state_encoder(phi_s)
            _ = self.critic.critic2.state_encoder(phi_s)
            _ = self.critic.critic1.action_encoder(phi_a)
            _ = self.critic.critic2.action_encoder(phi_a)
            _ = self.critic_target.critic1.state_encoder(phi_s)
            _ = self.critic_target.critic2.state_encoder(phi_s)
            _ = self.critic_target.critic1.action_encoder(phi_a)
            _ = self.critic_target.critic2.action_encoder(phi_a)
        if 'cce' in self.rep_learn_keywords:
            self.classifier(phi_s)
        elif 'nce' in self.rep_learn_keywords:
            self.embedding(phi_s)

        self.target_critic_to_use = self.critic_target
        self.critic_to_use = self.critic

        criticCL.soft_update(self.critic, self.critic_target, tau=1.0)

        self.cce = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction=tf.keras.losses.Reduction.NONE, from_logits=True)

        self.bc = None

        if 'successor_features' in self.rep_learn_keywords:
            self.output_dim_level = self.latent_dim
        elif 'gvf_termination' in self.rep_learn_keywords:
            self.output_dim_level = 1
        elif 'gvf_action_count' in self.rep_learn_keywords:
            self.output_dim_level = action_dim
        else:
            self.output_dim_level = action_dim

        self.task_critic_one = criticCL.Critic(
            state_dim,
            self.output_dim_level * self.num_training_levels,
            hidden_dims=hidden_dims_per_level,
            encoder=None,  # critic_kwargs['encoder'],
            discrete_actions=True,
            cross_norm=False)
        self.task_critic_target_one = criticCL.Critic(
            state_dim,
            self.output_dim_level * self.num_training_levels,
            hidden_dims=hidden_dims_per_level,
            encoder=None,  # critic_kwargs['encoder'],
            discrete_actions=True,
            cross_norm=False)
        self.task_critic_one(dummy_enc,
                             actions=None,
                             training=False,
                             return_features=False,
                             stop_grad_features=False)
        self.task_critic_target_one(dummy_enc,
                                    actions=None,
                                    training=False,
                                    return_features=False,
                                    stop_grad_features=False)
        criticCL.soft_update(self.task_critic_one,
                             self.task_critic_target_one,
                             tau=1.0)

        # Normalization constant beta, set to the default value recommended
        # in the PopArt paper.
        self.reward_normalizer = popart.PopArt(
            running_statistics.EMAMeanStd(popart_norm_beta))
        self.reward_normalizer.init()

        if 'CLIP' in self.rep_learn_keywords or 'clip' in self.rep_learn_keywords:
            self.loss_temp = tf.Variable(tf.constant(0.0, dtype=tf.float32),
                                         name='loss_temp',
                                         trainable=True)

        self.model_dict = {
            'critic': self.critic,
            'critic_target': self.critic_target,
            'critic_optimizer': self.critic_optimizer,
            'br_optimizer': self.br_optimizer
        }

        self.model_dict['encoder_perLevel'] = self.encoder_per_level
        self.model_dict[
            'encoder_perLevel_target'] = self.encoder_per_level_target
        self.model_dict['task_critic'] = self.task_critic_one
        self.model_dict['task_critic_target'] = self.task_critic_target_one
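Because the constructor splits rep_learn_keywords on '__', the options listed in its docstring are combined by joining them into a single string. The snippet below is a small self-contained sketch of that convention; the particular combination is illustrative.

    # Illustrative only: mirrors the rep_learn_keywords.split('__') parsing above.
    rep_learn_keywords = 'nce__linear_Q__stop_grad_FQI'
    keywords = rep_learn_keywords.split('__')
    assert 'nce' in keywords            # adds the InfoNCE auxiliary objective
    assert 'linear_Q' in keywords       # uses a linear critic head
    assert 'stop_grad_FQI' in keywords  # stops gradients through TD/FQI updates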
Example #4
    def test_ppo_training_step(self, batch_mode, use_agent_state):
        action_space = gym.spaces.Box(low=-1,
                                      high=1,
                                      shape=[128],
                                      dtype=np.float32)
        distribution = (
            parametric_distribution.
            get_parametric_distribution_for_action_space(action_space))
        training_agent = continuous_control_agent.ContinuousControlAgent(
            distribution)
        virtual_bs = 32
        unroll_length = 5
        batches_per_step = 4
        done = tf.zeros([unroll_length, virtual_bs], dtype=tf.bool)
        prev_actions = tf.reshape(
            tf.stack([
                action_space.sample()
                for _ in range(unroll_length * virtual_bs)
            ]), [unroll_length, virtual_bs, -1])
        env_outputs = utils.EnvOutput(
            reward=tf.random.uniform([unroll_length, virtual_bs]),
            done=done,
            observation=tf.zeros([unroll_length, virtual_bs, 128],
                                 dtype=tf.float32),
            abandoned=tf.zeros_like(done),
            episode_step=tf.ones([unroll_length, virtual_bs], dtype=tf.int32))
        if use_agent_state:
            core_state = tf.zeros([virtual_bs, 64])
        else:
            core_state = training_agent.initial_state(virtual_bs)
        agent_outputs, _ = training_agent((prev_actions, env_outputs),
                                          core_state,
                                          unroll=True)
        args = Unroll(core_state, prev_actions, env_outputs, agent_outputs)

        class DummyStrategy:
            def __init__(self):
                self.num_replicas_in_sync = 1

        loss_fn = generalized_onpolicy_loss.GeneralizedOnPolicyLoss(
            training_agent,
            popart.PopArt(running_statistics.FixedMeanStd(), compensate=False),
            distribution,
            ga_advantages.GAE(lambda_=0.9),
            policy_losses.ppo(0.9),
            discount_factor=0.99,
            regularizer=policy_regularizers.KLPolicyRegularizer(entropy=0.5),
            baseline_cost=0.5,
            max_abs_reward=None,
            frame_skip=1,
            reward_scaling=10)
        loss_fn.init()
        loss, logs = ppo_training_step_utils.ppo_training_step(
            epochs_per_step=8,
            loss_fn=loss_fn,
            args=args,
            batch_mode=batch_mode,
            training_strategy=DummyStrategy(),
            virtual_batch_size=virtual_bs,
            unroll_length=unroll_length - 1,
            batches_per_step=batches_per_step,
            clip_norm=50.,
            optimizer=tf.keras.optimizers.Adam(1e-3),
            logger=utils.ProgressLogger())
        del loss
        del logs
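The returned loss and logs are simply discarded above; a test could assert basic sanity on them instead. The lines below are a minimal sketch that could replace the two del statements, assuming loss is a finite scalar tensor (the structure of logs is not shown in the original).

        # Sketch only: basic sanity checks instead of discarding the outputs.
        self.assertTrue(np.isfinite(loss.numpy()))
        self.assertIsNotNone(logs)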