def _setup(self, beta):
  """Sets up the reward normalizer and a distribution strategy to run."""
  reward_normalizer = popart.PopArt(running_statistics.EMAMeanStd(beta))
  strategy = test_utils.create_distribution_strategy(
      use_tpu=self.primary_device == 'TPU')
  self.assertEqual(strategy.num_replicas_in_sync, 2)
  with strategy.scope():
    reward_normalizer.init()
  return reward_normalizer, strategy
def setUp(self):
  super().setUp()
  reward_normalizer = popart.PopArt(running_statistics.AverageMeanStd())
  reward_normalizer.init()
  self._loss = generalized_onpolicy_loss.GeneralizedOnPolicyLoss(
      _DummyAgent(), reward_normalizer,
      parametric_distribution.normal_tanh_distribution(
          _NUM_ACTIONS).create_dist,
      advantages.GAE(lambda_=0.95), _DummyPolicyLoss(), 0.97,
      _DummyRegularizationLoss(), 0.2, 0.5)
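# The tests above exercise popart.PopArt with different running-statistics
# trackers (EMAMeanStd, AverageMeanStd). As a rough illustration only -- not
# the library implementation -- an EMA mean/std tracker can be sketched as
# below, where `beta` controls how quickly the estimates follow new reward
# batches. The function name and signature are hypothetical; it assumes
# `numpy` is imported as `np`, as elsewhere in these files.
def _ema_mean_std_sketch(rewards, ema_mean, ema_mean_sq, beta):
  """Returns updated EMA statistics and the rewards normalized with them."""
  batch_mean = float(np.mean(rewards))
  batch_mean_sq = float(np.mean(np.square(rewards)))
  ema_mean = (1.0 - beta) * ema_mean + beta * batch_mean
  ema_mean_sq = (1.0 - beta) * ema_mean_sq + beta * batch_mean_sq
  std = max(float(np.sqrt(ema_mean_sq - ema_mean**2)), 1e-6)
  normalized = (np.asarray(rewards) - ema_mean) / std
  return ema_mean, ema_mean_sq, normalized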
def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=3e-4,
             critic_lr=3e-4,
             alpha_lr=3e-4,
             discount=0.99,
             tau=0.005,
             target_entropy=0.0,
             f_reg=1.0,
             reward_bonus=5.0,
             num_augmentations=1,
             rep_learn_keywords='outer',
             env_name='',
             batch_size=256,
             n_quantiles=5,
             temp=0.1,
             num_training_levels=200,
             latent_dim=256,
             n_levels_nce=5,
             popart_norm_beta=0.1):
  """Creates networks.

  Args:
    observation_spec: Environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_lr: Temperature learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    target_entropy: Target entropy.
    f_reg: Critic regularization weight.
    reward_bonus: Bonus added to the rewards.
    num_augmentations: Number of DrQ augmentations (crops).
    rep_learn_keywords: Representation learning losses to add (see below).
    env_name: Env name.
    batch_size: Batch size.
    n_quantiles: Number of GVF quantiles.
    temp: Temperature of the NCE softmax.
    num_training_levels: Number of training MDPs (200 for Procgen).
    latent_dim: Latent dimension of the auxiliary MLPs.
    n_levels_nce: Number of MDPs to apply the contrastive loss to.
    popart_norm_beta: PopArt normalization constant.

  For `rep_learn_keywords`, pick from:
    stop_grad_FQI: stop gradients through the TD/FQI critic updates.
    linear_Q: use a linear critic.
    successor_features: uses ||SF|| as the cumulant.
    gvf_termination: uses +1 if done else 0 as the cumulant.
    gvf_action_count: uses state-conditional action counts as the cumulant.
    nce: uses the multi-class dot-product InfoNCE objective.
    cce: uses the MoCo categorical cross-entropy objective.
    energy: uses SimCLR + pairwise GVF distance (not fully tested).

  If no cumulant is specified, the reward is used as the default cumulant.
  """
  del actor_lr, critic_lr, alpha_lr, target_entropy
  self.action_spec = action_spec
  self.num_augmentations = num_augmentations
  self.rep_learn_keywords = rep_learn_keywords.split('__')
  self.batch_size = batch_size
  self.env_name = env_name
  self.stop_grad_fqi = 'stop_grad_FQI' in self.rep_learn_keywords
  critic_kwargs = {'hidden_dims': (1024, 1024)}
  self.latent_dim = latent_dim
  self.n_levels_nce = n_levels_nce
  hidden_dims = hidden_dims_per_level = (self.latent_dim, self.latent_dim)
  self.num_training_levels = int(num_training_levels)
  self.n_quantiles = n_quantiles
  self.temp = temp

  # Make two sets of weights:
  # - Critic
  # - Critic (target)
  # Optionally, make a third set of per-level critics.
  if observation_spec.shape == (64, 64, 3):
    # IMPALA CNN for Procgen.
    def conv_stack():
      return make_impala_cnn_network(
          depths=[16, 32, 32], use_batch_norm=False, dropout_rate=0.)

    state_dim = 256
  else:
    # Reduced architecture for DMC.
    def conv_stack():
      return ConvStack(observation_spec.shape)

    state_dim = 50

  conv_stack_critic = conv_stack()
  conv_target_stack_critic = conv_stack()

  if observation_spec.shape == (64, 64, 3):
    conv_stack_critic.output_size = state_dim
    conv_target_stack_critic.output_size = state_dim
  critic_kwargs['encoder'] = ImageEncoder(
      conv_stack_critic, feature_dim=state_dim, bprop_conv_stack=True)
  # Note: the target critic does not share any weights.
  critic_kwargs['encoder_target'] = ImageEncoder(
      conv_target_stack_critic, feature_dim=state_dim, bprop_conv_stack=True)

  conv_stack_critic_per_level = conv_stack()
  conv_target_stack_critic_per_level = conv_stack()

  if observation_spec.shape == (64, 64, 3):
    conv_stack_critic_per_level.output_size = state_dim
    conv_target_stack_critic_per_level.output_size = state_dim

  self.encoder_per_level = ImageEncoder(
      conv_stack_critic_per_level,
      feature_dim=state_dim,
      bprop_conv_stack=True)
  self.encoder_per_level_target = ImageEncoder(
      conv_target_stack_critic_per_level,
      feature_dim=state_dim,
      bprop_conv_stack=True)

  criticCL.soft_update(
      self.encoder_per_level, self.encoder_per_level_target, tau=1.0)

  if self.num_augmentations == 0:
    dummy_state = tf.constant(np.zeros([1] + list(observation_spec.shape)))
  else:
    # Account for padding of +4 everywhere and then cropping out 68x68.
    dummy_state = tf.constant(np.zeros(shape=[1, 68, 68, 3]))

  dummy_enc = critic_kwargs['encoder'](dummy_state)

  @tf.function
  def init_models():
    """Initializes all auxiliary networks (state and action encoders).

    Uses Procgen-specific dummy input (68x68x3 observations, 15 actions).
    """
    critic_kwargs['encoder'](dummy_state)
    critic_kwargs['encoder_target'](dummy_state)
    self.encoder_per_level(dummy_state)
    self.encoder_per_level_target(dummy_state)

  init_models()

  action_dim = action_spec.maximum.item() + 1

  self.action_dim = action_dim
  self.discount = discount
  self.tau = tau
  self.reg = f_reg
  self.reward_bonus = reward_bonus

  self.critic = criticCL.Critic(
      state_dim,
      action_dim,
      hidden_dims=hidden_dims,
      encoder=critic_kwargs['encoder'],
      discrete_actions=True,
      linear='linear_Q' in self.rep_learn_keywords)
  self.critic_target = criticCL.Critic(
      state_dim,
      action_dim,
      hidden_dims=hidden_dims,
      encoder=critic_kwargs['encoder_target'],
      discrete_actions=True,
      linear='linear_Q' in self.rep_learn_keywords)

  self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
  self.task_critic_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
  self.br_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

  if 'cce' in self.rep_learn_keywords:
    self.classifier = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(self.latent_dim, use_bias=True),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(self.n_quantiles, use_bias=True)
        ],
        name='classifier')
  elif 'nce' in self.rep_learn_keywords:
    self.embedding = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(self.latent_dim, use_bias=True),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(self.latent_dim, use_bias=True)
        ],
        name='embedding')

  # This snippet initializes all auxiliary networks (state and action
  # encoders) with dummy input (Procgen-specific: 68x68x3 observations,
  # 15 actions).
  dummy_state = tf.zeros((1, 68, 68, 3), dtype=tf.float32)
  phi_s = self.critic.encoder(dummy_state)
  phi_a = tf.eye(action_dim, dtype=tf.float32)

  if 'linear_Q' in self.rep_learn_keywords:
    _ = self.critic.critic1.state_encoder(phi_s)
    _ = self.critic.critic2.state_encoder(phi_s)
    _ = self.critic.critic1.action_encoder(phi_a)
    _ = self.critic.critic2.action_encoder(phi_a)
    _ = self.critic_target.critic1.state_encoder(phi_s)
    _ = self.critic_target.critic2.state_encoder(phi_s)
    _ = self.critic_target.critic1.action_encoder(phi_a)
    _ = self.critic_target.critic2.action_encoder(phi_a)
  if 'cce' in self.rep_learn_keywords:
    self.classifier(phi_s)
  elif 'nce' in self.rep_learn_keywords:
    self.embedding(phi_s)

  self.target_critic_to_use = self.critic_target
  self.critic_to_use = self.critic

  criticCL.soft_update(self.critic, self.critic_target, tau=1.0)

  self.cce = tf.keras.losses.SparseCategoricalCrossentropy(
      reduction=tf.keras.losses.Reduction.NONE, from_logits=True)

  self.bc = None

  if 'successor_features' in self.rep_learn_keywords:
    self.output_dim_level = self.latent_dim
  elif 'gvf_termination' in self.rep_learn_keywords:
    self.output_dim_level = 1
  elif 'gvf_action_count' in self.rep_learn_keywords:
    self.output_dim_level = action_dim
  else:
    self.output_dim_level = action_dim

  self.task_critic_one = criticCL.Critic(
      state_dim,
      self.output_dim_level * self.num_training_levels,
      hidden_dims=hidden_dims_per_level,
      encoder=None,  # critic_kwargs['encoder'],
      discrete_actions=True,
      cross_norm=False)
  self.task_critic_target_one = criticCL.Critic(
      state_dim,
      self.output_dim_level * self.num_training_levels,
      hidden_dims=hidden_dims_per_level,
      encoder=None,  # critic_kwargs['encoder'],
      discrete_actions=True,
      cross_norm=False)
  self.task_critic_one(
      dummy_enc,
      actions=None,
      training=False,
      return_features=False,
      stop_grad_features=False)
  self.task_critic_target_one(
      dummy_enc,
      actions=None,
      training=False,
      return_features=False,
      stop_grad_features=False)
  criticCL.soft_update(
      self.task_critic_one, self.task_critic_target_one, tau=1.0)

  # Normalization constant beta, set to the best default value as per the
  # PopArt paper.
  self.reward_normalizer = popart.PopArt(
      running_statistics.EMAMeanStd(popart_norm_beta))
  self.reward_normalizer.init()

  if 'CLIP' in self.rep_learn_keywords or 'clip' in self.rep_learn_keywords:
    self.loss_temp = tf.Variable(
        tf.constant(0.0, dtype=tf.float32), name='loss_temp', trainable=True)

  self.model_dict = {
      'critic': self.critic,
      'critic_target': self.critic_target,
      'critic_optimizer': self.critic_optimizer,
      'br_optimizer': self.br_optimizer
  }
  self.model_dict['encoder_perLevel'] = self.encoder_per_level
  self.model_dict['encoder_perLevel_target'] = self.encoder_per_level_target
  self.model_dict['task_critic'] = self.task_critic_one
  self.model_dict['task_critic_target'] = self.task_critic_target_one
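# The constructor above calls `criticCL.soft_update(online, target, tau=1.0)`
# to hard-copy weights into the freshly built target networks; during training
# a small tau (the `tau` constructor argument, default 0.005) yields a slowly
# moving Polyak average instead. A minimal sketch of that update rule, under
# the assumption that it operates on the models' `trainable_variables` (the
# function name is hypothetical, not the library API):
def _polyak_update_sketch(online_net, target_net, tau):
  """theta_target <- tau * theta_online + (1 - tau) * theta_target."""
  for w, w_t in zip(online_net.trainable_variables,
                    target_net.trainable_variables):
    w_t.assign(tau * w + (1.0 - tau) * w_t)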
def test_ppo_training_step(self, batch_mode, use_agent_state):
  action_space = gym.spaces.Box(low=-1, high=1, shape=[128], dtype=np.float32)
  distribution = (
      parametric_distribution.get_parametric_distribution_for_action_space(
          action_space))
  training_agent = continuous_control_agent.ContinuousControlAgent(
      distribution)
  virtual_bs = 32
  unroll_length = 5
  batches_per_step = 4
  done = tf.zeros([unroll_length, virtual_bs], dtype=tf.bool)
  prev_actions = tf.reshape(
      tf.stack(
          [action_space.sample() for _ in range(unroll_length * virtual_bs)]),
      [unroll_length, virtual_bs, -1])
  env_outputs = utils.EnvOutput(
      reward=tf.random.uniform([unroll_length, virtual_bs]),
      done=done,
      observation=tf.zeros([unroll_length, virtual_bs, 128],
                           dtype=tf.float32),
      abandoned=tf.zeros_like(done),
      episode_step=tf.ones([unroll_length, virtual_bs], dtype=tf.int32))
  if use_agent_state:
    core_state = tf.zeros([virtual_bs, 64])
  else:
    core_state = training_agent.initial_state(virtual_bs)
  agent_outputs, _ = training_agent((prev_actions, env_outputs),
                                    core_state,
                                    unroll=True)
  args = Unroll(core_state, prev_actions, env_outputs, agent_outputs)

  class DummyStrategy:

    def __init__(self):
      self.num_replicas_in_sync = 1

  loss_fn = generalized_onpolicy_loss.GeneralizedOnPolicyLoss(
      training_agent,
      popart.PopArt(running_statistics.FixedMeanStd(), compensate=False),
      distribution,
      ga_advantages.GAE(lambda_=0.9),
      policy_losses.ppo(0.9),
      discount_factor=0.99,
      regularizer=policy_regularizers.KLPolicyRegularizer(entropy=0.5),
      baseline_cost=0.5,
      max_abs_reward=None,
      frame_skip=1,
      reward_scaling=10)
  loss_fn.init()

  loss, logs = ppo_training_step_utils.ppo_training_step(
      epochs_per_step=8,
      loss_fn=loss_fn,
      args=args,
      batch_mode=batch_mode,
      training_strategy=DummyStrategy(),
      virtual_batch_size=virtual_bs,
      unroll_length=unroll_length - 1,
      batches_per_step=batches_per_step,
      clip_norm=50.,
      optimizer=tf.keras.optimizers.Adam(1e-3),
      logger=utils.ProgressLogger())
  del loss
  del logs
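# Both loss tests above plug a GAE advantage estimator into
# GeneralizedOnPolicyLoss (advantages.GAE(lambda_=0.95) and
# ga_advantages.GAE(lambda_=0.9)). As a reference for what that component
# computes, here is a minimal NumPy sketch of generalized advantage
# estimation -- not the library code; it assumes no bootstrapping past the
# final step and that `numpy` is imported as `np`, as elsewhere in this file:
def _gae_sketch(rewards, values, dones, gamma, lam):
  """Returns GAE advantages for a length-T trajectory; `values` has length T+1."""
  advantages = np.zeros(len(rewards), dtype=np.float64)
  gae = 0.0
  for t in reversed(range(len(rewards))):
    not_done = 1.0 - float(dones[t])
    delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
    gae = delta + gamma * lam * not_done * gae
    advantages[t] = gae
  return advantages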