def testTrainWithRnn(self):
    actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        self._obs_spec,
        self._action_spec,
        input_fc_layer_params=None,
        output_fc_layer_params=None,
        conv_layer_params=None,
        lstm_size=(40,),
    )

    critic_net = critic_rnn_network.CriticRnnNetwork(
        (self._obs_spec, self._action_spec),
        observation_fc_layer_params=(16,),
        action_fc_layer_params=(16,),
        joint_fc_layer_params=(16,),
        lstm_size=(16,),
        output_fc_layer_params=None,
    )

    counter = common.create_variable('test_train_counter')
    optimizer_fn = tf.compat.v1.train.AdamOptimizer

    agent = sac_agent.SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=critic_net,
        actor_network=actor_net,
        actor_optimizer=optimizer_fn(1e-3),
        critic_optimizer=optimizer_fn(1e-3),
        alpha_optimizer=optimizer_fn(1e-3),
        train_step_counter=counter,
    )

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
    actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=observations)

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, (), time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()

    if tf.executing_eagerly():
        loss = lambda: agent.train(experience)
    else:
        loss = agent.train(experience)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(self.evaluate(counter), 0)
    self.evaluate(loss)
    self.assertEqual(self.evaluate(counter), 1)
def testInitialValue(self):
    counter = common.create_variable('counter', 1)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(counter), 1)
def testMultipleCounters(self):
    counter1 = common.create_variable('counter', 1)
    counter2 = common.create_variable('counter', 2)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(counter1), 1)
    self.assertEqual(self.evaluate(counter2), 2)
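# For reference, the counter tests above exercise `common.create_variable` from
# `tf_agents.utils`. A minimal eager-mode sketch of the same usage pattern,
# assuming the TF-Agents defaults (initial_value=0, shape=(), dtype=tf.int64,
# trainable=False); the variable names here are illustrative only.
import tensorflow as tf
from tf_agents.utils import common

step_counter = common.create_variable('global_step')      # starts at 0, tf.int64
episode_counter = common.create_variable('episodes', 10)  # explicit initial value

step_counter.assign_add(1)
print(step_counter.numpy(), episode_counter.numpy())  # 1 10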
def create_acc(spec):
    return common.create_variable(
        initial_value=np.zeros((batch_size,) + spec.shape),
        shape=(batch_size,) + spec.shape,
        dtype=spec.dtype,
        name='Accumulator/' + spec.name)
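# A hypothetical call site for the `create_acc` helper above. The specs and the
# batch size below are illustrative, not from the original; `batch_size` is the
# value the helper closes over in its enclosing scope.
import tensorflow as tf

batch_size = 4  # illustrative value for the closed-over batch_size

reward_spec = tf.TensorSpec(shape=(), dtype=tf.float32, name='reward')
observation_spec = tf.TensorSpec(shape=(17,), dtype=tf.float32, name='observation')

# One zero-initialized accumulator per spec in the nest: shapes [4] and [4, 17],
# named 'Accumulator/reward' and 'Accumulator/observation'.
accumulators = tf.nest.map_structure(create_acc, (reward_spec, observation_spec))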
def train_eval( root_dir, random_seed=None, # Dataset params domain_name='cartpole', task_name='swingup', frame_shape=(84, 84, 3), image_aug_type='random_shifting', # None/'random_shifting' frame_stack=3, action_repeat=4, # Params for learning num_env_steps=1000000, learn_ceb=True, use_augmented_q=False, # Params for CEB e_ctor=encoders.FRNConv, e_head_ctor=encoders.MVNormalDiagParamHead, b_ctor=encoders.FRNConv, b_head_ctor=encoders.MVNormalDiagParamHead, conv_feature_dim=50, # deterministic feature used by actor/critic/ceb ceb_feature_dim=50, ceb_action_condition=True, ceb_backward_encode_rewards=True, initial_feature_step=0, feature_lr=3e-4, feature_lr_schedule=None, ceb_beta=0.01, ceb_beta_schedule=None, ceb_generative_ratio=0.0, ceb_generative_items=None, feature_grad_clip=None, enc_ema_tau=0.05, # if enc_ema_tau=None, ceb also learns backend encoder use_critic_grad=True, # Params for SAC actor_kernel_init='glorot_uniform', normal_proj_net=sac_agent.sac_normal_projection_net, critic_kernel_init='glorot_uniform', critic_last_kernel_init='glorot_uniform', actor_fc_layers=(256, 256), critic_obs_fc_layers=None, critic_action_fc_layers=None, critic_joint_fc_layers=(256, 256), # Params for collect collect_every=1, initial_collect_steps=1000, collect_steps_per_iteration=1, replay_buffer_capacity=100000, # Params for target update target_update_tau=0.005, target_update_period=1, # Params for train batch_size=256, actor_learning_rate=3e-4, actor_lr_schedule=None, critic_learning_rate=3e-4, critic_lr_schedule=None, alpha_learning_rate=3e-4, alpha_lr_schedule=None, td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error, gamma=0.99, reward_scale_factor=1.0, gradient_clipping=None, use_tf_functions=True, drivers_in_graph=True, # Params for eval num_eval_episodes=10, eval_env_interval=5000, # number of env steps greedy_eval_policy=True, train_next_frame_decoder=False, # Params for summaries and logging baseline_log_fn=None, checkpoint_env_interval=100000, # number of env steps log_env_interval=1000, # number of env steps summary_interval=1000, image_summary_interval=0, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """train and eval for PI-SAC.""" if random_seed is not None: tf.compat.v1.set_random_seed(random_seed) np.random.seed(random_seed) # Load baseline logs and write to tensorboard if baseline_log_fn is not None: baseline_log_fn(root_dir, domain_name, task_name, action_repeat) if root_dir is None: raise AttributeError('train_eval requires a root_dir.') # Set iterations and intervals to be computed relative to the number of # environment steps rather than the number of gradient steps. 
num_iterations = ( num_env_steps * collect_every // collect_steps_per_iteration + (initial_feature_step)) checkpoint_interval = (checkpoint_env_interval * collect_every // collect_steps_per_iteration) eval_interval = (eval_env_interval * collect_every // collect_steps_per_iteration) log_interval = (log_env_interval * collect_every // collect_steps_per_iteration) logging.info('num_env_steps = %d (env steps)', num_env_steps) logging.info('initial_feature_step = %d (gradient steps)', initial_feature_step) logging.info('num_iterations = %d (gradient steps)', num_iterations) logging.info('checkpoint interval (env steps) = %d', checkpoint_env_interval) logging.info('checkpoint interval (gradient steps) = %d', checkpoint_interval) logging.info('eval interval (env steps) = %d', eval_env_interval) logging.info('eval interval (gradient steps) = %d', eval_interval) logging.info('log interval (env steps) = %d', log_env_interval) logging.info('log interval (gradient steps) = %d', log_interval) root_dir = os.path.expanduser(root_dir) summary_writer = tf.compat.v2.summary.create_file_writer( root_dir, flush_millis=summaries_flush_secs * 1000) summary_writer.set_as_default() eval_histograms = [ pisac_metric_utils.ReturnHistogram(buffer_size=num_eval_episodes), ] eval_metrics = [ tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), pisac_metric_utils.ReturnStddevMetric(buffer_size=num_eval_episodes), tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes) ] # create training environment render_configs = { 'height': frame_shape[0], 'width': frame_shape[1], 'camera_id': dict(quadruped=2).get(domain_name, 0), } tf_env = tf_py_environment.TFPyEnvironment( env_load_fn(domain_name, task_name, render_configs, frame_stack, action_repeat)) eval_tf_env = tf_py_environment.TFPyEnvironment( env_load_fn(domain_name, task_name, render_configs, frame_stack, action_repeat)) # Define global step g_step = common.create_variable('g_step') # Spec ims_shape = frame_shape[:2] + (frame_shape[2] * frame_stack, ) ims_spec = tf.TensorSpec(shape=ims_shape, dtype=tf.uint8) conv_feature_spec = tf.TensorSpec(shape=(conv_feature_dim, ), dtype=tf.float32) action_spec = tf_env.action_spec() # Forward encoder e_enc = e_ctor(ims_spec, output_dim=conv_feature_dim, name='e') e_enc_t = e_ctor(ims_spec, output_dim=conv_feature_dim, name='e_t') e_enc.create_variables() e_enc_t.create_variables() common.soft_variables_update(e_enc.variables, e_enc_t.variables, tau=1.0, tau_non_trainable=1.0) # Forward encoder head if e_head_ctor is None: e_head = None else: stacked_action_spec = tensor_spec.BoundedTensorSpec( action_spec.shape[:-1] + (action_spec.shape[-1] * frame_stack), action_spec.dtype, action_spec.minimum.tolist() * frame_stack, action_spec.maximum.tolist() * frame_stack, action_spec.name) e_head_spec = [conv_feature_spec, stacked_action_spec ] if ceb_action_condition else conv_feature_spec e_head = e_head_ctor(e_head_spec, output_dim=ceb_feature_dim, name='e_head') e_head.create_variables() # Backward encoder b_enc = b_ctor(ims_spec, output_dim=conv_feature_dim, name='b') b_enc.create_variables() # Backward encoder head if b_head_ctor is None: b_head = None else: stacked_reward_spec = tf.TensorSpec(shape=(frame_stack, ), dtype=tf.float32) b_head_spec = [conv_feature_spec, stacked_reward_spec ] if ceb_backward_encode_rewards else conv_feature_spec b_head = b_head_ctor(b_head_spec, output_dim=ceb_feature_dim, name='b_head') b_head.create_variables() # future decoder for generative formulation future_deconv = None 
future_reward_mlp = None y_decoders = None if ceb_generative_ratio > 0.0: future_deconv = utils.SimpleDeconv(conv_feature_spec, output_tensor_spec=ims_spec) future_deconv.create_variables() future_reward_mlp = utils.MLP(conv_feature_spec, hidden_dims=(ceb_feature_dim, ceb_feature_dim // 2, frame_stack)) future_reward_mlp.create_variables() y_decoders = [future_deconv, future_reward_mlp] m_vars = e_enc.trainable_variables if enc_ema_tau is None: m_vars += b_enc.trainable_variables else: # do not train b_enc common.soft_variables_update(e_enc.variables, b_enc.variables, tau=1.0, tau_non_trainable=1.0) if e_head_ctor is not None: m_vars += e_head.trainable_variables if b_head_ctor is not None: m_vars += b_head.trainable_variables if ceb_generative_ratio > 0.0: m_vars += future_deconv.trainable_variables m_vars += future_reward_mlp.trainable_variables feature_lr_fn = schedule_utils.get_schedule_fn(base=feature_lr, sched=feature_lr_schedule, step=g_step) m_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=feature_lr_fn) # CEB beta schedule, e.q. 'berp@0:1.0:1000_10000:0.3:0' beta_fn = schedule_utils.get_schedule_fn(base=ceb_beta, sched=ceb_beta_schedule, step=g_step) def img_pred_summary_fn(obs, pred): utils.replay_summary('y0', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)( obs, None) utils.replay_summary('y0_pred', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)( pred, None) utils.replay_summary('y0_pred_diff', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)( ((obs - pred) / 2.0 + 0.5), None) ceb = ceb_task.CEB(beta_fn=beta_fn, generative_ratio=ceb_generative_ratio, generative_items=ceb_generative_items, step_counter=g_step, img_pred_summary_fn=img_pred_summary_fn) m_ceb = ceb_task.CEBTask( ceb, e_enc, b_enc, forward_head=e_head, backward_head=b_head, y_decoders=y_decoders, learn_backward_enc=(enc_ema_tau is None), action_condition=ceb_action_condition, backward_encode_rewards=ceb_backward_encode_rewards, optimizer=m_optimizer, grad_clip=feature_grad_clip, global_step=g_step) if train_next_frame_decoder: ns_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3) next_frame_deconv = utils.SimpleDeconv(conv_feature_spec, output_tensor_spec=ims_spec) next_frame_decoder = utils.PixelDecoder( next_frame_deconv, optimizer=ns_optimizer, step_counter=g_step, image_summary_interval=image_summary_interval, frame_stack=frame_stack) next_frame_deconv.create_variables() # Agent training actor_lr_fn = schedule_utils.get_schedule_fn(base=actor_learning_rate, sched=actor_lr_schedule, step=g_step) critic_lr_fn = schedule_utils.get_schedule_fn(base=critic_learning_rate, sched=critic_lr_schedule, step=g_step) alpha_lr_fn = schedule_utils.get_schedule_fn(base=alpha_learning_rate, sched=alpha_lr_schedule, step=g_step) actor_net = actor_distribution_network.ActorDistributionNetwork( conv_feature_spec, action_spec, kernel_initializer=actor_kernel_init, fc_layer_params=actor_fc_layers, activation_fn=tf.keras.activations.relu, continuous_projection_net=normal_proj_net) critic_net = critic_network.CriticNetwork( (conv_feature_spec, action_spec), observation_fc_layer_params=critic_obs_fc_layers, action_fc_layer_params=critic_action_fc_layers, joint_fc_layer_params=critic_joint_fc_layers, activation_fn=tf.nn.relu, kernel_initializer=critic_kernel_init, last_kernel_initializer=critic_last_kernel_init) tf_agent = sac_agent.SacAgent( 
ts.time_step_spec(observation_spec=conv_feature_spec), action_spec, actor_network=actor_net, critic_network=critic_net, actor_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=actor_lr_fn), critic_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=critic_lr_fn), alpha_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=alpha_lr_fn), target_update_tau=target_update_tau, target_update_period=target_update_period, td_errors_loss_fn=td_errors_loss_fn, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=g_step) tf_agent.initialize() env_steps = tf_metrics.EnvironmentSteps(prefix='Train') average_return = tf_metrics.AverageReturnMetric( prefix='Train', buffer_size=num_eval_episodes, batch_size=tf_env.batch_size) train_metrics = [ tf_metrics.NumberOfEpisodes(prefix='Train'), env_steps, average_return, tf_metrics.AverageEpisodeLengthMetric(prefix='Train', buffer_size=num_eval_episodes, batch_size=tf_env.batch_size), tf_metrics.AverageReturnMetric(name='LatestReturn', prefix='Train', buffer_size=1, batch_size=tf_env.batch_size) ] # Collect and eval policies initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), action_spec) eval_policy = tf_agent.policy if greedy_eval_policy: eval_policy = greedy_policy.GreedyPolicy(eval_policy) def obs_to_feature(observation): feature, _ = e_enc(observation['pixels'], training=False) return tf.stop_gradient(feature) eval_policy = FeaturePolicy(policy=eval_policy, time_step_spec=tf_env.time_step_spec(), obs_to_feature_fn=obs_to_feature) collect_policy = FeaturePolicy(policy=tf_agent.collect_policy, time_step_spec=tf_env.time_step_spec(), obs_to_feature_fn=obs_to_feature) # Make the replay buffer. 
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=collect_policy.trajectory_spec, batch_size=1, max_length=replay_buffer_capacity) replay_observer = [replay_buffer.add_batch] # Checkpoints train_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(root_dir, 'train'), agent=tf_agent, actor_net=actor_net, critic_net=critic_net, global_step=g_step, metrics=tfa_metric_utils.MetricsGroup(train_metrics, 'train_metrics')) train_checkpointer.initialize_or_restore() policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( root_dir, 'policy'), policy=eval_policy, global_step=g_step) policy_checkpointer.initialize_or_restore() rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( root_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=replay_buffer, global_step=g_step) rb_checkpointer.initialize_or_restore() if learn_ceb: d = dict() if future_deconv is not None: d.update(future_deconv=future_deconv) if future_reward_mlp is not None: d.update(future_reward_mlp=future_reward_mlp) model_ckpt = common.Checkpointer(ckpt_dir=os.path.join( root_dir, 'model'), forward_encoder=e_enc, forward_encoder_target=e_enc_t, forward_head=e_head, backward_encoder=b_enc, backward_head=b_head, global_step=g_step, **d) else: model_ckpt = common.Checkpointer(ckpt_dir=os.path.join( root_dir, 'model'), forward_encoder=e_enc, forward_encoder_target=e_enc_t, global_step=g_step) model_ckpt.initialize_or_restore() if train_next_frame_decoder: next_frame_decoder_ckpt = common.Checkpointer( ckpt_dir=os.path.join(root_dir, 'next_frame_decoder'), next_frame_decoder=next_frame_decoder, next_frame_deconv=next_frame_deconv, global_step=g_step) next_frame_decoder_ckpt.initialize_or_restore() if use_tf_functions and not drivers_in_graph: collect_policy.action = common.function(collect_policy.action) initial_collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, initial_collect_policy, observers=replay_observer + train_metrics, num_steps=initial_collect_steps) collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=replay_observer + train_metrics, num_steps=collect_steps_per_iteration) if use_tf_functions and drivers_in_graph: initial_collect_driver.run = common.function( initial_collect_driver.run) collect_driver.run = common.function(collect_driver.run) # Collect initial replay data. 
if env_steps.result() == 0 or replay_buffer.num_frames() == 0: qj(initial_collect_steps, 'Initializing replay buffer by collecting random experience', tic=1) initial_collect_driver.run() for train_metric in train_metrics: train_metric.tf_summaries(train_step=env_steps.result()) qj(s='Done initializing replay buffer', toc=1) time_step = None policy_state = collect_policy.get_initial_state(tf_env.batch_size) time_acc = 0 env_steps_before = env_steps.result().numpy() paddings = tf.constant([[4, 4], [4, 4], [0, 0]]) def random_shifting(traj, meta): x0 = traj.observation['pixels'][0] x1 = traj.observation['pixels'][1] y0 = traj.observation['pixels'][frame_stack] y1 = traj.observation['pixels'][frame_stack + 1] x0 = tf.pad(x0, paddings, 'SYMMETRIC') x1 = tf.pad(x1, paddings, 'SYMMETRIC') y0 = tf.pad(y0, paddings, 'SYMMETRIC') y1 = tf.pad(y1, paddings, 'SYMMETRIC') x0a = tf.image.random_crop(x0, ims_shape) x1a = tf.image.random_crop(x1, ims_shape) x0 = tf.image.random_crop(x0, ims_shape) x1 = tf.image.random_crop(x1, ims_shape) y0 = tf.image.random_crop(y0, ims_shape) y1 = tf.image.random_crop(y1, ims_shape) return (traj, (x0, x1, x0a, x1a, y0, y1)), meta # Dataset generates trajectories with shape [B, T, ...] num_steps = frame_stack + 2 with tf.device('/cpu:0'): if image_aug_type == 'random_shifting': dataset = replay_buffer.as_dataset( sample_batch_size=batch_size, num_steps=num_steps).unbatch().filter( utils.filter_invalid_transition).map( random_shifting, num_parallel_calls=3).batch(batch_size).map( utils.replay_summary( 'replay/filtered', order_frame_stack=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval, has_augmentations=True)) elif image_aug_type is None: dataset = replay_buffer.as_dataset( sample_batch_size=batch_size, num_steps=num_steps).unbatch().filter( utils.filter_invalid_transition).batch(batch_size).map( utils.replay_summary( 'replay/filtered', order_frame_stack=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval, has_augmentations=False)) else: raise NotImplementedError iterator_nstep = iter(dataset) def model_train_step(experience): if image_aug_type == 'random_shifting': experience, cropped_frames = experience x0, x1, _, _, y0, y1 = cropped_frames r0, r1, a0, a1 = utils.split_xy(experience, frame_stack, rewards_n_actions_only=True) x0 = x0[:, None, ...] x1 = x1[:, None, ...] y0 = y0[:, None, ...] y1 = y1[:, None, ...] 
elif image_aug_type is None: x0, x1, y0, y1, r0, r1, a0, a1 = utils.split_xy( experience, frame_stack, rewards_n_actions_only=False) else: raise NotImplementedError # Flatten stacked actions action_shape = a0.shape.as_list() a0 = tf.reshape(a0, [action_shape[0], action_shape[1], -1]) a1 = tf.reshape(a1, [action_shape[0], action_shape[1], -1]) if image_summary_interval > 0: utils.replay_summary( 'ceb/x0', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)(x0, None) utils.replay_summary( 'ceb/x1', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)(x1, None) utils.replay_summary( 'ceb/y0', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)(y0, None) utils.replay_summary( 'ceb/y1', g_step, reshape=True, frame_stack=frame_stack, image_summary_interval=image_summary_interval)(y1, None) ceb_loss, feat_x0, zx0 = m_ceb.train(x0, a0, y0, y1, r0, r1, m_vars) if train_next_frame_decoder: # zx0: [B, 1, Z] zx0 = tf.squeeze(zx0, axis=1) # y0: [B, 1, H, W, Cxframe_stack] next_obs = tf.cast(tf.squeeze(y0, axis=1), tf.float32) / 255.0 next_frame_decoder.train(next_obs, tf.stop_gradient(zx0)) if enc_ema_tau is not None: common.soft_variables_update(e_enc.variables, b_enc.variables, tau=enc_ema_tau, tau_non_trainable=enc_ema_tau) def agent_train_step(experience): # preprocess experience if image_aug_type == 'random_shifting': experience, cropped_frames = experience x0, x1, x0a, x1a, y0, y1 = cropped_frames experience = tf.nest.map_structure( lambda t: composite.slice_to(t, axis=1, end=2), experience) time_steps, actions, next_time_steps = ( tf_agent.experience_to_transitions(experience)) # pylint: disable=protected-access elif image_aug_type is None: experience = tf.nest.map_structure( lambda t: composite.slice_to(t, axis=1, end=2), experience) time_steps, actions, next_time_steps = ( tf_agent.experience_to_transitions(experience)) # pylint: disable=protected-access x0 = time_steps.observation['pixels'] x1 = next_time_steps.observation['pixels'] else: raise NotImplementedError tf_agent.train_pix(time_steps, actions, next_time_steps, x0, x1, x0a=x0a if use_augmented_q else None, x1a=x1a if use_augmented_q else None, e_enc=e_enc, e_enc_t=e_enc_t, q_aug=use_augmented_q, use_critic_grad=use_critic_grad) def checkpoint(step): rb_checkpointer.save(global_step=step) train_checkpointer.save(global_step=step) policy_checkpointer.save(global_step=step) model_ckpt.save(global_step=step) if train_next_frame_decoder: next_frame_decoder_ckpt.save(global_step=step) def evaluate(): # Override outer record_if that may be out of sync with respect to the # env_steps.result() value used for the summay step. 
with tf.compat.v2.summary.record_if(True): qj(g_step.numpy(), 'Starting eval at step', tic=1) results = pisac_metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, histograms=eval_histograms, num_episodes=num_eval_episodes, train_step=env_steps.result(), summary_writer=summary_writer, summary_prefix='Eval', use_function=drivers_in_graph, ) if eval_metrics_callback is not None: eval_metrics_callback(results, env_steps.result()) tfa_metric_utils.log_metrics(eval_metrics) qj(s='Finished eval', toc=1) def update_target(): common.soft_variables_update( e_enc.variables, e_enc_t.variables, tau=tf_agent.target_update_tau, tau_non_trainable=tf_agent.target_update_tau) common.soft_variables_update( tf_agent._critic_network_1.variables, # pylint: disable=protected-access tf_agent._target_critic_network_1.variables, # pylint: disable=protected-access tau=tf_agent.target_update_tau, tau_non_trainable=tf_agent.target_update_tau) common.soft_variables_update( tf_agent._critic_network_2.variables, # pylint: disable=protected-access tf_agent._target_critic_network_2.variables, # pylint: disable=protected-access tau=tf_agent.target_update_tau, tau_non_trainable=tf_agent.target_update_tau) if use_tf_functions: if learn_ceb: m_ceb.train = common.function(m_ceb.train) model_train_step = common.function(model_train_step) agent_train_step = common.function(agent_train_step) tf_agent.train_pix = common.function(tf_agent.train_pix) update_target = common.function(update_target) if train_next_frame_decoder: next_frame_decoder.train = common.function( next_frame_decoder.train) if not learn_ceb and initial_feature_step > 0: raise ValueError('Not learning CEB but initial_feature_step > 0') with tf.summary.record_if( lambda: tf.math.equal(g_step % summary_interval, 0)): if learn_ceb and g_step.numpy() < initial_feature_step: qj(initial_feature_step, 'Pretraining CEB...', tic=1) for _ in range(g_step.numpy(), initial_feature_step): with tf.name_scope('LearningRates'): tf.summary.scalar(name='CEB learning rate', data=feature_lr_fn(), step=g_step) experience, _ = next(iterator_nstep) model_train_step(experience) g_step.assign_add(1) qj(s='Done pretraining CEB.', toc=1) first_step = True for _ in range(g_step.numpy(), num_iterations): g_step_val = g_step.numpy() start_time = time.time() with tf.summary.record_if( lambda: tf.math.equal(g_step % summary_interval, 0)): with tf.name_scope('LearningRates'): tf.summary.scalar(name='Actor learning rate', data=actor_lr_fn(), step=g_step) tf.summary.scalar(name='Critic learning rate', data=critic_lr_fn(), step=g_step) tf.summary.scalar(name='Alpha learning rate', data=alpha_lr_fn(), step=g_step) if learn_ceb: tf.summary.scalar(name='CEB learning rate', data=feature_lr_fn(), step=g_step) with tf.name_scope('Train'): tf.summary.scalar(name='StepsVsEnvironmentSteps', data=env_steps.result(), step=g_step) tf.summary.scalar(name='StepsVsAverageReturn', data=average_return.result(), step=g_step) if g_step_val % collect_every == 0: time_step, policy_state = collect_driver.run( time_step=time_step, policy_state=policy_state, ) experience, _ = next(iterator_nstep) agent_train_step(experience) if (g_step_val - initial_feature_step) % tf_agent.target_update_period == 0: update_target() if learn_ceb: model_train_step(experience) time_acc += time.time() - start_time # Increment global step counter. 
g_step.assign_add(1) g_step_val = g_step.numpy() if (g_step_val - initial_feature_step) % log_interval == 0: for train_metric in train_metrics: train_metric.tf_summaries(train_step=env_steps.result()) logging.info('env steps = %d, average return = %f', env_steps.result(), average_return.result()) env_steps_per_sec = (env_steps.result().numpy() - env_steps_before) / time_acc logging.info('%.3f env steps/sec', env_steps_per_sec) tf.compat.v2.summary.scalar(name='env_steps_per_sec', data=env_steps_per_sec, step=env_steps.result()) time_acc = 0 env_steps_before = env_steps.result().numpy() if (g_step_val - initial_feature_step) % eval_interval == 0: eval_start_time = time.time() evaluate() logging.info('eval time %.3f sec', time.time() - eval_start_time) if (g_step_val - initial_feature_step) % checkpoint_interval == 0: checkpoint(g_step_val) # Write gin config to Tensorboard if first_step: summ = utils.Summ(0, root_dir) conf = gin.operative_config_str() conf = ' ' + conf.replace('\n', '\n ') summ.text('gin/config', conf) summ.flush() first_step = False # Final checkpoint. checkpoint(g_step.numpy()) # Final evaluation. evaluate()
def __init__(self,
             time_step_spec,
             action_spec,
             critic_network,
             actor_network,
             actor_optimizer,
             critic_optimizer,
             alpha_optimizer,
             actor_policy_ctor=actor_policy.ActorPolicy,
             squash_actions=True,
             target_update_tau=1.0,
             target_update_period=1,
             td_errors_loss_fn=tf.math.squared_difference,
             gamma=1.0,
             reward_scale_factor=1.0,
             initial_log_alpha=0.0,
             target_entropy=None,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             train_step_counter=None,
             name=None):
    """Creates a SAC Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      critic_network: A function critic_network((observations, actions)) that
        returns the q_values for each observation and action.
      actor_network: A function actor_network(observation, action_spec) that
        returns action distribution.
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      alpha_optimizer: The default optimizer to use for the alpha variable.
      actor_policy_ctor: The policy class to use.
      squash_actions: Whether or not to use tanh to squash actions between
        -1 and 1.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the elementwise TD errors
        loss.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      initial_log_alpha: Initial value for log_alpha.
      target_entropy: The target average policy entropy, for updating alpha.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable
        summaries will be written during training.
      train_step_counter: An optional counter to increment every time the
        train op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)

    self._critic_network1 = critic_network
    self._critic_network2 = critic_network.copy(name='CriticNetwork2')
    self._target_critic_network1 = critic_network.copy(
        name='TargetCriticNetwork1')
    self._target_critic_network2 = critic_network.copy(
        name='TargetCriticNetwork2')
    self._actor_network = actor_network

    policy = actor_policy_ctor(time_step_spec=time_step_spec,
                               action_spec=action_spec,
                               actor_network=self._actor_network)

    self._log_alpha = common.create_variable(
        'initial_log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    # If target_entropy was not passed, set it to negative of the total number
    # of action dimensions.
    if target_entropy is None:
        flat_action_spec = tf.nest.flatten(action_spec)
        target_entropy = -np.sum([
            np.product(single_spec.shape.as_list())
            for single_spec in flat_action_spec
        ])

    self._squash_actions = squash_actions
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_entropy = target_entropy
    self._gradient_clipping = gradient_clipping
    self._debug_summaries = debug_summaries
    self._summarize_grads_and_vars = summarize_grads_and_vars

    super(SacAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy=policy,
        collect_policy=policy,
        train_sequence_length=2,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
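# A small worked sketch of the default target_entropy heuristic used above: the
# negative of the total action dimensionality. The 6-dimensional bounded spec
# below is illustrative, not from the original code.
import numpy as np
import tensorflow as tf
from tf_agents.specs import tensor_spec

action_spec = tensor_spec.BoundedTensorSpec(
    shape=(6,), dtype=tf.float32, minimum=-1.0, maximum=1.0)

flat_action_spec = tf.nest.flatten(action_spec)
target_entropy = -np.sum(
    [np.prod(s.shape.as_list()) for s in flat_action_spec])
print(target_entropy)  # prints -6 for a single 6-dimensional continuous action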
def __init__(self, variable_scope='num_episodes_step_observer'):
    with tf.compat.v1.variable_scope(variable_scope):
        self._num_episodes = common.create_variable(
            'num_episodes', 0, shape=[], dtype=tf.int32)
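# The constructor above only allocates the episode counter. A hypothetical
# __call__ for such a step observer (not part of the original snippet, shown
# only to illustrate how the variable might be driven from trajectories):
def __call__(self, traj):
    # Count completed episodes by summing the is_last() flags in the batch.
    self._num_episodes.assign_add(
        tf.reduce_sum(tf.cast(traj.is_last(), tf.int32)))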
def __init__(self,
             # counter
             train_step_counter,
             # specs
             time_step_spec,
             action_spec,
             # networks
             critic_network,
             actor_network,
             model_network,
             compressor_network,
             # optimizers
             actor_optimizer,
             critic_optimizer,
             alpha_optimizer,
             model_optimizer,
             # target update
             target_update_tau=1.0,
             target_update_period=1,
             # inputs and stop gradients
             critic_input='state',
             actor_input='state',
             critic_input_stop_gradient=True,
             actor_input_stop_gradient=False,
             # model stuff
             model_batch_size=256,  # will round to nearest full trajectory
             ac_batch_size=128,
             # other
             episodes_per_trial=1,
             num_tasks_per_train=1,
             num_batches_per_sampled_trials=1,
             td_errors_loss_fn=tf.math.squared_difference,
             gamma=1.0,
             reward_scale_factor=1.0,
             task_reward_dim=None,
             initial_log_alpha=0.0,
             target_entropy=None,
             gradient_clipping=None,
             control_timestep=None,
             num_images_per_summary=1,
             offline_ratio=None,
             override_reward_func=None,
             ):
    tf.Module.__init__(self)

    self.override_reward_func = override_reward_func
    self.offline_ratio = offline_ratio

    ################
    # critic
    ################

    # networks
    self._critic_network1 = critic_network
    self._critic_network2 = critic_network.copy(name='CriticNetwork2')
    self._target_critic_network1 = critic_network.copy(
        name='TargetCriticNetwork1')
    self._target_critic_network2 = critic_network.copy(
        name='TargetCriticNetwork2')

    # update the target networks
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._update_target = self._get_target_updater(
        tau=self._target_update_tau, period=self._target_update_period)

    ################
    # model
    ################
    self._model_network = model_network
    self.model_input = self._model_network.model_input

    ################
    # compressor
    ################
    self._compressor_network = compressor_network

    ################
    # actor
    ################
    self._actor_network = actor_network

    ################
    # policies
    ################
    self.condition_on_full_latent_dist = (
        actor_input == "latentDistribution"
        and critic_input == "latentDistribution")

    # both policies below share the same actor network
    # but they process latents (to give to actor network) in potentially
    # different ways

    # used for eval
    which_posterior = 'first'
    if self._model_network.sparse_reward_inputs:
        which_rew_input = 'sparse'
    else:
        which_rew_input = 'dense'
    policy = MeldPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        model_network=self._model_network,
        actor_input=actor_input,
        which_posterior=which_posterior,
        which_rew_input=which_rew_input,
    )

    # used for collecting data during training
    # overwrite if specified (eg for double agent)
    which_posterior = 'first'
    if self._model_network.sparse_reward_inputs:
        which_rew_input = 'sparse'
    else:
        which_rew_input = 'dense'
    collect_policy = MeldPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        model_network=self._model_network,
        actor_input=actor_input,
        which_posterior=which_posterior,
        which_rew_input=which_rew_input,
    )

    ################
    # more vars
    ################
    self.num_batches_per_sampled_trials = num_batches_per_sampled_trials
    self.episodes_per_trial = episodes_per_trial
    self._task_reward_dim = task_reward_dim

    self._log_alpha = common.create_variable(
        'initial_log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    # If target_entropy was not passed, set it to negative of the total number
    # of action dimensions.
    if target_entropy is None:
        flat_action_spec = tf.nest.flatten(action_spec)
        target_entropy = -np.sum([
            np.product(single_spec.shape.as_list())
            for single_spec in flat_action_spec
        ])

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._model_optimizer = model_optimizer
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_entropy = target_entropy
    self._gradient_clipping = gradient_clipping
    self._critic_input = critic_input
    self._actor_input = actor_input
    self._critic_input_stop_gradient = critic_input_stop_gradient
    self._actor_input_stop_gradient = actor_input_stop_gradient
    self._model_batch_size = model_batch_size
    self._ac_batch_size = ac_batch_size
    self._control_timestep = control_timestep
    self._num_images_per_summary = num_images_per_summary
    self._actor_time_step_spec = time_step_spec._replace(
        observation=actor_network.input_tensor_spec)
    self._num_tasks_per_train = num_tasks_per_train

    ################
    # init tf agent
    ################
    super(MeldAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy=policy,
        collect_policy=collect_policy,  # used to set self.step_spec
        # train function can accept experience of any length T (i.e., [B, T, ...])
        train_sequence_length=None,
        train_step_counter=train_step_counter)

    self._train_model_fn = common.function_in_tf1()(self._train_model)
    self._train_ac_fn = common.function_in_tf1()(self._train_ac)
def testSequencePreprocessNotBatched(self):
    counter = common.create_variable('test_train_counter')
    n_time_steps = 3
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=1,
        use_gae=False,
        use_td_lambda_return=False,
        compute_value_and_advantage_in_train=False,
        train_step_counter=counter)
    observations = tf.constant([[1, 2], [3, 4], [5, 6]], dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant(
            [mid_time_step_val] * n_time_steps, dtype=tf.int32),
        reward=tf.constant([1] * n_time_steps, dtype=tf.float32),
        discount=tf.constant([1] * n_time_steps, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[0], [1], [1]], dtype=tf.float32)

    old_action_distribution_parameters = {
        'loc': tf.constant([[0.0]] * n_time_steps, dtype=tf.float32),
        'scale': tf.constant([[1.0]] * n_time_steps, dtype=tf.float32),
    }

    value_preds = tf.constant([9., 15., 21.], dtype=tf.float32)
    policy_info = {
        'dist_params': old_action_distribution_parameters,
        'value_prediction': value_preds,
    }
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)

    returned_experience = agent.preprocess_sequence(experience)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    self.assertAllClose(observations, returned_experience.observation)
    self.assertAllClose(actions, returned_experience.action)
    self.assertAllClose(old_action_distribution_parameters,
                        returned_experience.policy_info['dist_params'])
    self.assertEqual(n_time_steps,
                     returned_experience.policy_info['return'].shape)
    self.assertAllClose([40.4821, 30.79],
                        returned_experience.policy_info['return'][:-1])
    self.assertEqual(
        n_time_steps,
        returned_experience.policy_info['normalized_advantage'].shape)
    self.assertAllClose(
        [1., -1.],
        returned_experience.policy_info['normalized_advantage'][:-1])
def create_variable(spec):
    return common.create_variable(
        name=spec.name,
        dtype=spec.dtype,
        shape=[batch_size] + spec.shape.as_list())
def testSaveAction(self, seeded, has_state, distribution_net,
                   has_input_fn_and_spec):
    with tf.compat.v1.Graph().as_default():
        tf.compat.v1.set_random_seed(self._global_seed)
        with tf.compat.v1.Session().as_default():
            global_step = common.create_variable('train_step', initial_value=0)

            if distribution_net:
                network = actor_distribution_network.ActorDistributionNetwork(
                    self._time_step_spec.observation, self._action_spec)
                policy = actor_policy.ActorPolicy(
                    time_step_spec=self._time_step_spec,
                    action_spec=self._action_spec,
                    actor_network=network)
            else:
                if has_state:
                    network = q_rnn_network.QRnnNetwork(
                        input_tensor_spec=self._time_step_spec.observation,
                        action_spec=self._action_spec,
                        lstm_size=(40,))
                else:
                    network = q_network.QNetwork(
                        input_tensor_spec=self._time_step_spec.observation,
                        action_spec=self._action_spec)
                policy = q_policy.QPolicy(
                    time_step_spec=self._time_step_spec,
                    action_spec=self._action_spec,
                    q_network=network)

            action_seed = 98723
            batch_size = 3
            action_inputs = tensor_spec.sample_spec_nest(
                (self._time_step_spec, policy.policy_state_spec),
                outer_dims=(batch_size,),
                seed=4)
            action_input_values = self.evaluate(action_inputs)
            action_input_tensors = tf.nest.map_structure(
                tf.convert_to_tensor, action_input_values)

            action_output = policy.action(*action_input_tensors,
                                          seed=action_seed)
            distribution_output = policy.distribution(*action_input_tensors)
            self.assertIsInstance(distribution_output.action,
                                  tfp.distributions.Distribution)

            self.evaluate(tf.compat.v1.global_variables_initializer())

            action_output_dict = collections.OrderedDict(
                ((spec.name, value) for (spec, value) in zip(
                    tf.nest.flatten(policy.policy_step_spec),
                    tf.nest.flatten(action_output))))

            # Check output of the flattened signature call.
            (action_output_value, action_output_dict) = self.evaluate(
                (action_output, action_output_dict))

            distribution_output_value = self.evaluate(
                _sample_from_distributions(distribution_output))

            input_fn_and_spec = None
            if has_input_fn_and_spec:
                input_fn_and_spec = (
                    _convert_string_vector_to_action_input,
                    tf.TensorSpec((7,), tf.string, name='example'))

            saver = policy_saver.PolicySaver(
                policy,
                batch_size=None,
                use_nest_path_signatures=False,
                seed=action_seed,
                input_fn_and_spec=input_fn_and_spec,
                train_step=global_step)
            path = os.path.join(self.get_temp_dir(), 'save_model_action')
            saver.save(path)

    with tf.compat.v1.Graph().as_default():
        tf.compat.v1.set_random_seed(self._global_seed)
        with tf.compat.v1.Session().as_default():
            reloaded = tf.compat.v2.saved_model.load(path)

            self.assertIn('action', reloaded.signatures)
            reloaded_action = reloaded.signatures['action']
            if has_input_fn_and_spec:
                self._compare_input_output_specs(
                    reloaded_action,
                    expected_input_specs=input_fn_and_spec[1],
                    expected_output_spec=policy.policy_step_spec,
                    batch_input=True)
            else:
                self._compare_input_output_specs(
                    reloaded_action,
                    expected_input_specs=(self._time_step_spec,
                                          policy.policy_state_spec),
                    expected_output_spec=policy.policy_step_spec,
                    batch_input=True)

            # Reload action_input_values as tensors in the new graph.
            action_input_tensors = tf.nest.map_structure(
                tf.convert_to_tensor, action_input_values)

            action_input_spec = (self._time_step_spec,
                                 policy.policy_state_spec)
            function_action_input_dict = collections.OrderedDict(
                (spec.name, value) for (spec, value) in zip(
                    tf.nest.flatten(action_input_spec),
                    tf.nest.flatten(action_input_tensors)))

            # NOTE(ebrevdo): The graph-level seeds for the policy and the
            # reloaded model are equal, which in addition to seeding the call
            # to action() and PolicySaver helps ensure equality of the output
            # of action() in both cases.
            self.assertEqual(reloaded_action.graph.seed, self._global_seed)

            # The seed= argument for the SavedModel action call was given at
            # creation of the PolicySaver.
            if has_input_fn_and_spec:
                action_string_vector = _convert_action_input_to_string_vector(
                    action_input_tensors)
                action_string_vector_values = self.evaluate(
                    action_string_vector)
                reloaded_action_output_dict = reloaded_action(
                    action_string_vector)
                reloaded_action_output = reloaded.action(action_string_vector)
                reloaded_distribution_output = reloaded.distribution(
                    action_string_vector)
                self.assertIsInstance(reloaded_distribution_output.action,
                                      tfp.distributions.Distribution)
            else:
                # This is the flat-signature function.
                reloaded_action_output_dict = reloaded_action(
                    **function_action_input_dict)
                # This is the non-flat function.
                reloaded_action_output = reloaded.action(*action_input_tensors)
                reloaded_distribution_output = reloaded.distribution(
                    *action_input_tensors)
                self.assertIsInstance(reloaded_distribution_output.action,
                                      tfp.distributions.Distribution)

                if not has_state:
                    # Try both cases: one with an empty policy_state and one
                    # with no policy_state.  Compare them.

                    # NOTE(ebrevdo): The first call to .action() must be stored
                    # in reloaded_action_output because this is the version
                    # being compared later against the true action_output and
                    # the values will change after the first call due to
                    # randomness.
                    reloaded_action_output_no_input_state = reloaded.action(
                        action_input_tensors[0])
                    reloaded_distribution_output_no_input_state = (
                        reloaded.distribution(action_input_tensors[0]))
                    # Even with a seed, multiple calls to action will get
                    # different values, so here we just check the signature
                    # matches.
                    self.assertIsInstance(
                        reloaded_distribution_output_no_input_state.action,
                        tfp.distributions.Distribution)
                    tf.nest.map_structure(
                        self.match_dtype_shape,
                        reloaded_action_output_no_input_state,
                        reloaded_action_output)
                    tf.nest.map_structure(
                        self.match_dtype_shape,
                        _sample_from_distributions(
                            reloaded_distribution_output_no_input_state),
                        _sample_from_distributions(
                            reloaded_distribution_output))

            self.evaluate(tf.compat.v1.global_variables_initializer())
            (reloaded_action_output_dict,
             reloaded_action_output_value) = self.evaluate(
                 (reloaded_action_output_dict, reloaded_action_output))

            reloaded_distribution_output_value = self.evaluate(
                _sample_from_distributions(reloaded_distribution_output))

            self.assertAllEqual(action_output_dict.keys(),
                                reloaded_action_output_dict.keys())
            for k in action_output_dict:
                if seeded:
                    self.assertAllClose(
                        action_output_dict[k],
                        reloaded_action_output_dict[k],
                        msg='\nMismatched dict key: %s.' % k)
                else:
                    self.match_dtype_shape(
                        action_output_dict[k],
                        reloaded_action_output_dict[k],
                        msg='\nMismatch dict key: %s.' % k)

            # With non-signature functions, we can check that passing a seed
            # does the right thing the second time.
            if seeded:
                tf.nest.map_structure(self.assertAllClose,
                                      action_output_value,
                                      reloaded_action_output_value)
            else:
                tf.nest.map_structure(self.match_dtype_shape,
                                      action_output_value,
                                      reloaded_action_output_value)

            tf.nest.map_structure(self.assertAllClose,
                                  distribution_output_value,
                                  reloaded_distribution_output_value)

    ## TFLite tests.

    # The converter must run outside of a TF1 graph context, even in eager
    # mode, to ensure the TF2 path is being executed.  Only works in TF2.
    if tf.compat.v1.executing_eagerly_outside_functions():
        tflite_converter = tf.lite.TFLiteConverter.from_saved_model(
            path, signature_keys=['action'])
        tflite_converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            # TODO(b/111309333): Remove this when `has_input_fn_and_spec`
            # is `False` once TFLite has native support for RNG ops, atan, etc.
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        tflite_serialized_model = tflite_converter.convert()

        tflite_interpreter = tf.lite.Interpreter(
            model_content=tflite_serialized_model)

        tflite_runner = tflite_interpreter.get_signature_runner('action')
        tflite_signature = tflite_interpreter.get_signature_list()['action']

        if has_input_fn_and_spec:
            tflite_action_input_dict = {
                'example': action_string_vector_values,
            }
        else:
            tflite_action_input_dict = collections.OrderedDict(
                (spec.name, value) for (spec, value) in zip(
                    tf.nest.flatten(action_input_spec),
                    tf.nest.flatten(action_input_values)))

        self.assertEqual(
            set(tflite_signature['inputs']), set(tflite_action_input_dict))
        self.assertEqual(
            set(tflite_signature['outputs']), set(action_output_dict))

        tflite_output = tflite_runner(**tflite_action_input_dict)
        self.assertAllClose(tflite_output, action_output_dict)
def __init__(self, action_spec: BoundedTensorSpec):
    super().__init__(action_spec)
    self._highest_return = common.create_variable(
        "highest_reward", -inf, dtype=tf.float32)
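# The bare `-inf` above presumes `inf` is imported in the enclosing module
# (e.g. `from math import inf`). A self-contained equivalent, with an
# illustrative update step (the helper name is hypothetical):
import math
import tensorflow as tf
from tf_agents.utils import common

highest_return = common.create_variable(
    'highest_reward', -math.inf, dtype=tf.float32)

def maybe_update(episode_return):
    # Keep a running maximum of observed episode returns.
    highest_return.assign(tf.maximum(highest_return, episode_return))

maybe_update(12.5)
print(highest_return.numpy())  # 12.5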
def testNonScalarInitialValue(self):
    var = common.create_variable('var', [1, 2], shape=None)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(self.evaluate(var), [1, 2])
def __init__(self,
             observation_spec,
             action_spec,
             actor_network: DistributionNetwork,
             critic_network: Network,
             critic_loss=None,
             target_entropy=None,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             gradient_clipping=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """Create a SacAlgorithm.

    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network): The network will be called with
            call(observation, step_type).
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        critic_loss (None|OneStepTDLoss): an object for calculating critic
            loss. If None, a default OneStepTDLoss will be used.
        initial_log_alpha (float): initial value for variable log_alpha.
        target_entropy (float|None): The target average policy entropy, for
            updating alpha.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
        alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
        gradient_clipping (float): Norm length to clip gradients.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_network1 = critic_network
    critic_network2 = critic_network.copy(name='CriticNetwork2')
    log_alpha = tfa_common.create_variable(name='log_alpha',
                                           initial_value=initial_log_alpha,
                                           dtype=tf.float32,
                                           trainable=True)
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=SacState(
            share=SacShareState(actor=actor_network.state_spec),
            actor=SacActorState(critic1=critic_network.state_spec,
                                critic2=critic_network.state_spec),
            critic=SacCriticState(
                critic1=critic_network.state_spec,
                critic2=critic_network.state_spec,
                target_critic1=critic_network.state_spec,
                target_critic2=critic_network.state_spec)),
        optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
        trainable_module_sets=[[actor_network],
                               [critic_network1, critic_network2],
                               [log_alpha]],
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        name=name)

    self._log_alpha = log_alpha
    self._actor_network = actor_network
    self._critic_network1 = critic_network1
    self._critic_network2 = critic_network2
    self._target_critic_network1 = self._critic_network1.copy(
        name='target_critic_network1')
    self._target_critic_network2 = self._critic_network2.copy(
        name='target_critic_network2')
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer

    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss

    flat_action_spec = tf.nest.flatten(self._action_spec)
    self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
    if target_entropy is None:
        target_entropy = np.sum(
            list(
                map(dist_utils.calc_default_target_entropy,
                    flat_action_spec)))
    self._target_entropy = target_entropy

    self._dqda_clipping = dqda_clipping

    self._update_target = common.get_target_updater(
        models=[self._critic_network1, self._critic_network2],
        target_models=[
            self._target_critic_network1, self._target_critic_network2
        ],
        tau=target_update_tau,
        period=target_update_period)
def testSaveAction(self, seeded, has_state, distribution_net,
                   has_input_fn_and_spec):
    with tf.compat.v1.Graph().as_default():
        tf.compat.v1.set_random_seed(self._global_seed)
        with tf.compat.v1.Session().as_default():
            global_step = common.create_variable('train_step', initial_value=0)

            if distribution_net:
                network = actor_distribution_network.ActorDistributionNetwork(
                    self._time_step_spec.observation, self._action_spec)
                policy = actor_policy.ActorPolicy(
                    time_step_spec=self._time_step_spec,
                    action_spec=self._action_spec,
                    actor_network=network)
            else:
                if has_state:
                    network = q_rnn_network.QRnnNetwork(
                        input_tensor_spec=self._time_step_spec.observation,
                        action_spec=self._action_spec,
                        lstm_size=(40,))
                else:
                    network = q_network.QNetwork(
                        input_tensor_spec=self._time_step_spec.observation,
                        action_spec=self._action_spec)
                policy = q_policy.QPolicy(
                    time_step_spec=self._time_step_spec,
                    action_spec=self._action_spec,
                    q_network=network)

            action_seed = 98723
            batch_size = 3
            action_inputs = tensor_spec.sample_spec_nest(
                (self._time_step_spec, policy.policy_state_spec),
                outer_dims=(batch_size,),
                seed=4)
            action_input_values = self.evaluate(action_inputs)
            action_input_tensors = tf.nest.map_structure(
                tf.convert_to_tensor, action_input_values)

            action_output = policy.action(*action_input_tensors,
                                          seed=action_seed)
            distribution_output = policy.distribution(*action_input_tensors)
            self.assertIsInstance(distribution_output.action,
                                  tfp.distributions.Distribution)

            self.evaluate(tf.compat.v1.global_variables_initializer())

            action_output_dict = dict(
                ((spec.name, value) for (spec, value) in zip(
                    tf.nest.flatten(policy.policy_step_spec),
                    tf.nest.flatten(action_output))))

            # Check output of the flattened signature call.
            (action_output_value, action_output_dict) = self.evaluate(
                (action_output, action_output_dict))

            distribution_output_value = self.evaluate(
                _sample_from_distributions(distribution_output))

            input_fn_and_spec = None
            if has_input_fn_and_spec:
                input_fn_and_spec = (
                    self._convert_string_vector_to_action_input,
                    tf.TensorSpec((7,), tf.string, name='example'))

            saver = policy_saver.PolicySaver(
                policy,
                batch_size=None,
                use_nest_path_signatures=False,
                seed=action_seed,
                input_fn_and_spec=input_fn_and_spec,
                train_step=global_step)
            path = os.path.join(self.get_temp_dir(), 'save_model_action')
            saver.save(path)

    with tf.compat.v1.Graph().as_default():
        tf.compat.v1.set_random_seed(self._global_seed)
        with tf.compat.v1.Session().as_default():
            reloaded = tf.compat.v2.saved_model.load(path)

            self.assertIn('action', reloaded.signatures)
            reloaded_action = reloaded.signatures['action']
            if has_input_fn_and_spec:
                self._compare_input_output_specs(
                    reloaded_action,
                    expected_input_specs=input_fn_and_spec[1],
                    expected_output_spec=policy.policy_step_spec,
                    batch_input=True)
            else:
                self._compare_input_output_specs(
                    reloaded_action,
                    expected_input_specs=(self._time_step_spec,
                                          policy.policy_state_spec),
                    expected_output_spec=policy.policy_step_spec,
                    batch_input=True)

            # Reload action_input_values as tensors in the new graph.
            action_input_tensors = tf.nest.map_structure(
                tf.convert_to_tensor, action_input_values)

            action_input_spec = (self._time_step_spec,
                                 policy.policy_state_spec)
            function_action_input_dict = dict(
                (spec.name, value) for (spec, value) in zip(
                    tf.nest.flatten(action_input_spec),
                    tf.nest.flatten(action_input_tensors)))

            # NOTE(ebrevdo): The graph-level seeds for the policy and the
            # reloaded model are equal, which in addition to seeding the call
            # to action() and PolicySaver helps ensure equality of the output
            # of action() in both cases.
            self.assertEqual(reloaded_action.graph.seed, self._global_seed)

            def match_dtype_shape(x, y, msg=None):
                self.assertEqual(x.shape, y.shape, msg=msg)
                self.assertEqual(x.dtype, y.dtype, msg=msg)

            # The seed= argument for the SavedModel action call was given at
            # creation of the PolicySaver.
            if has_input_fn_and_spec:
                action_string_vector = (
                    self._convert_action_input_to_string_vector(
                        action_input_tensors))
                reloaded_action_output_dict = reloaded_action(
                    action_string_vector)
                reloaded_action_output = reloaded.action(action_string_vector)
                reloaded_distribution_output = reloaded.distribution(
                    action_string_vector)
                self.assertIsInstance(reloaded_distribution_output.action,
                                      tfp.distributions.Distribution)
            else:
                # This is the flat-signature function.
                reloaded_action_output_dict = reloaded_action(
                    **function_action_input_dict)
                # This is the non-flat function.
                reloaded_action_output = reloaded.action(*action_input_tensors)
                reloaded_distribution_output = reloaded.distribution(
                    *action_input_tensors)
                self.assertIsInstance(reloaded_distribution_output.action,
                                      tfp.distributions.Distribution)

                if not has_state:
                    # Try both cases: one with an empty policy_state and one
                    # with no policy_state.  Compare them.

                    # NOTE(ebrevdo): The first call to .action() must be stored
                    # in reloaded_action_output because this is the version
                    # being compared later against the true action_output and
                    # the values will change after the first call due to
                    # randomness.
                    reloaded_action_output_no_input_state = reloaded.action(
                        action_input_tensors[0])
                    reloaded_distribution_output_no_input_state = (
                        reloaded.distribution(action_input_tensors[0]))
                    # Even with a seed, multiple calls to action will get
                    # different values, so here we just check the signature
                    # matches.
                    self.assertIsInstance(
                        reloaded_distribution_output_no_input_state.action,
                        tfp.distributions.Distribution)
                    tf.nest.map_structure(
                        match_dtype_shape,
                        reloaded_action_output_no_input_state,
                        reloaded_action_output)
                    tf.nest.map_structure(
                        match_dtype_shape,
                        _sample_from_distributions(
                            reloaded_distribution_output_no_input_state),
                        _sample_from_distributions(
                            reloaded_distribution_output))

            self.evaluate(tf.compat.v1.global_variables_initializer())
            (reloaded_action_output_dict,
             reloaded_action_output_value) = self.evaluate(
                 (reloaded_action_output_dict, reloaded_action_output))

            reloaded_distribution_output_value = self.evaluate(
                _sample_from_distributions(reloaded_distribution_output))

            self.assertAllEqual(action_output_dict.keys(),
                                reloaded_action_output_dict.keys())
            for k in action_output_dict:
                if seeded:
                    self.assertAllClose(action_output_dict[k],
                                        reloaded_action_output_dict[k],
                                        msg='\nMismatched dict key: %s.' % k)
                else:
                    match_dtype_shape(action_output_dict[k],
                                      reloaded_action_output_dict[k],
                                      msg='\nMismatch dict key: %s.' % k)

            # With non-signature functions, we can check that passing a seed
            # does the right thing the second time.
            if seeded:
                tf.nest.map_structure(self.assertAllClose,
                                      action_output_value,
                                      reloaded_action_output_value)
            else:
                tf.nest.map_structure(match_dtype_shape,
                                      action_output_value,
                                      reloaded_action_output_value)

            tf.nest.map_structure(self.assertAllClose,
                                  distribution_output_value,
                                  reloaded_distribution_output_value)
def testTrain(self, num_epochs, use_td_lambda_return,
              compute_value_and_advantage_in_train):
    # Mock the build_train_op to return an op for incrementing this counter.
    counter = common.create_variable('test_train_counter')
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=num_epochs,
        use_gae=use_td_lambda_return,
        use_td_lambda_return=use_td_lambda_return,
        compute_value_and_advantage_in_train=compute_value_and_advantage_in_train,
        train_step_counter=counter)
    observations = tf.constant(
        [
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
        dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                          dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                              dtype=tf.float32)

    policy_info = {
        'dist_params': action_distribution_parameters,
    }
    if not compute_value_and_advantage_in_train:
        policy_info['value_prediction'] = value_preds

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)
    if not compute_value_and_advantage_in_train:
        experience = agent._preprocess(experience)

    if tf.executing_eagerly():
        loss = lambda: agent.train(experience)
    else:
        loss = agent.train(experience)

    # Assert that counter starts out at zero.
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(0, self.evaluate(counter))

    loss_type = self.evaluate(loss)
    loss_numpy = loss_type.loss
    # Assert that loss is not zero as we are training in a non-episodic env.
    self.assertNotEqual(
        loss_numpy,
        0.0,
        msg=('Loss is exactly zero, looks like no training '
             'was performed due to incomplete episodes.'))

    # Assert that train_op ran increment_counter num_epochs times.
    self.assertEqual(num_epochs, self.evaluate(counter))
def testSaveGetInitialState(self): network = q_rnn_network.QRnnNetwork( input_tensor_spec=self._time_step_spec.observation, action_spec=self._action_spec, lstm_size=(40, )) policy = q_policy.QPolicy(time_step_spec=self._time_step_spec, action_spec=self._action_spec, q_network=network) train_step = common.create_variable('train_step', initial_value=0) saver_nobatch = policy_saver.PolicySaver( policy, train_step=train_step, batch_size=None, use_nest_path_signatures=False) path = os.path.join(self.get_temp_dir(), 'save_model_initial_state_nobatch') self.evaluate(tf.compat.v1.global_variables_initializer()) with self.cached_session(): saver_nobatch.save(path) reloaded_nobatch = tf.compat.v2.saved_model.load(path) self.evaluate( tf.compat.v1.initializers.variables( reloaded_nobatch.model_variables)) self.assertIn('get_initial_state', reloaded_nobatch.signatures) reloaded_get_initial_state = ( reloaded_nobatch.signatures['get_initial_state']) self._compare_input_output_specs( reloaded_get_initial_state, expected_input_specs=(tf.TensorSpec(dtype=tf.int32, shape=(), name='batch_size'), ), expected_output_spec=policy.policy_state_spec, batch_input=False, batch_size=None) initial_state = policy.get_initial_state(batch_size=3) initial_state = self.evaluate(initial_state) reloaded_nobatch_initial_state = reloaded_nobatch.get_initial_state( batch_size=3) reloaded_nobatch_initial_state = self.evaluate( reloaded_nobatch_initial_state) tf.nest.map_structure(self.assertAllClose, initial_state, reloaded_nobatch_initial_state) saver_batch = policy_saver.PolicySaver(policy, train_step=train_step, batch_size=3, use_nest_path_signatures=False) path = os.path.join(self.get_temp_dir(), 'save_model_initial_state_batch') with self.cached_session(): saver_batch.save(path) reloaded_batch = tf.compat.v2.saved_model.load(path) self.evaluate( tf.compat.v1.initializers.variables( reloaded_batch.model_variables)) self.assertIn('get_initial_state', reloaded_batch.signatures) reloaded_get_initial_state = reloaded_batch.signatures[ 'get_initial_state'] self._compare_input_output_specs( reloaded_get_initial_state, expected_input_specs=(), expected_output_spec=policy.policy_state_spec, batch_input=False, batch_size=3) reloaded_batch_initial_state = reloaded_batch.get_initial_state() reloaded_batch_initial_state = self.evaluate( reloaded_batch_initial_state) tf.nest.map_structure(self.assertAllClose, initial_state, reloaded_batch_initial_state)
def testStatelessValueNetTrain(self, compute_value_and_advantage_in_train): counter = common.create_variable('test_train_counter') actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork( self._time_step_spec.observation, self._action_spec, input_fc_layer_params=None, output_fc_layer_params=None, lstm_size=(20,)) value_net = value_network.ValueNetwork( self._time_step_spec.observation, fc_layer_params=None) agent = ppo_agent.PPOAgent( self._time_step_spec, self._action_spec, optimizer=tf.compat.v1.train.AdamOptimizer(), actor_net=actor_net, value_net=value_net, num_epochs=1, train_step_counter=counter, compute_value_and_advantage_in_train=compute_value_and_advantage_in_train ) observations = tf.constant([ [[1, 2], [3, 4], [5, 6]], [[1, 2], [3, 4], [5, 6]], ], dtype=tf.float32) mid_time_step_val = ts.StepType.MID.tolist() time_steps = ts.TimeStep( step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32), reward=tf.constant([[1] * 3] * 2, dtype=tf.float32), discount=tf.constant([[1] * 3] * 2, dtype=tf.float32), observation=observations) actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32) action_distribution_parameters = { 'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32), 'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32), } value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]], dtype=tf.float32) policy_info = { 'dist_params': action_distribution_parameters, } if not compute_value_and_advantage_in_train: policy_info['value_prediction'] = value_preds experience = trajectory.Trajectory(time_steps.step_type, observations, actions, policy_info, time_steps.step_type, time_steps.reward, time_steps.discount) if not compute_value_and_advantage_in_train: experience = agent._preprocess(experience) if tf.executing_eagerly(): loss = lambda: agent.train(experience) else: loss = agent.train(experience) self.evaluate(tf.compat.v1.initialize_all_variables()) loss_type = self.evaluate(loss) loss_numpy = loss_type.loss # Assert that loss is not zero as we are training in a non-episodic env. self.assertNotEqual( loss_numpy, 0.0, msg=('Loss is exactly zero, looks like no training ' 'was performed due to incomplete episodes.'))
def __init__(self, data_spec, batch_size, max_length=1000, scope='TFUniformReplayBuffer', device='cpu:*', table_fn=table.Table, dataset_drop_remainder=False, dataset_window_shift=None, stateful_dataset=False): """Creates a TFUniformReplayBuffer. The TFUniformReplayBuffer stores episodes in `B == batch_size` blocks of size `L == max_length`, with total frame capacity `C == L * B`. Storage looks like: ``` block1 ep1 frame1 frame2 ... ep2 frame1 frame2 ... <L frames total> block2 ep1 frame1 frame2 ... ep2 frame1 frame2 ... <L frames total> ... blockB ep1 frame1 frame2 ... ep2 frame1 frame2 ... <L frames total> ``` Multiple episodes may be stored within a given block, up to `max_length` frames total. In practice, new episodes will overwrite old ones as the block rolls over its `max_length`. Args: data_spec: A TensorSpec or a list/tuple/nest of TensorSpecs describing a single item that can be stored in this buffer. batch_size: Batch dimension of tensors when adding to buffer. max_length: The maximum number of items that can be stored in a single batch segment of the buffer. scope: Scope prefix for variables and ops created by this class. device: A TensorFlow device to place the Variables and ops. table_fn: Function to create tables `table_fn(data_spec, capacity)` that can read/write nested tensors. dataset_drop_remainder: If `True`, then when calling `as_dataset` with arguments `single_deterministic_pass=True` and `sample_batch_size is not None`, the final batch will be dropped if it does not contain exactly `sample_batch_size` items. This is helpful for static shape inference as the resulting tensors will always have leading dimension `sample_batch_size` instead of `None`. dataset_window_shift: Window shift used when calling `as_dataset` with arguments `single_deterministic_pass=True` and `num_steps is not None`. This determines how the resulting frames are windowed. If `None`, then there is no overlap created between frames and each frame is seen exactly once. For example, if `max_length=5`, `num_steps=2`, `sample_batch_size=None`, and `dataset_window_shift=None`, then the datasets returned will have frames `{[0, 1], [2, 3], [4]}`. If `dataset_window_shift is not None`, then windows are created with a window overlap of `dataset_window_shift` and you will see each frame up to `num_steps` times. For example, if `max_length=5`, `num_steps=2`, `sample_batch_size=None`, and `dataset_window_shift=1`, then the datasets returned will have windows of shifted repeated frames: `{[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]}`. For more details, see the documentation of `tf.data.Dataset.window`, specifically for the `shift` argument. The default behavior is to not overlap frames (`dataset_window_shift=None`) but users often want to see all combinations of frame sequences, in which case `dataset_window_shift=1` is the appropriate value. stateful_dataset: whether the dataset contains stateful ops or not. 
""" self._batch_size = batch_size self._max_length = max_length capacity = self._batch_size * self._max_length super(TFUniformReplayBuffer, self).__init__( data_spec, capacity, stateful_dataset) self._id_spec = tensor_spec.TensorSpec([], dtype=tf.int64, name='id') self._capacity_value = np.int64(self._capacity) self._batch_offsets = ( tf.range(self._batch_size, dtype=tf.int64) * self._max_length) self._scope = scope self._device = device self._table_fn = table_fn self._dataset_drop_remainder = dataset_drop_remainder self._dataset_window_shift = dataset_window_shift with tf.device(self._device), tf.compat.v1.variable_scope(self._scope): self._capacity = tf.constant(capacity, dtype=tf.int64) self._data_table = table_fn(self._data_spec, self._capacity_value) self._id_table = table_fn(self._id_spec, self._capacity_value) self._last_id = common.create_variable('last_id', -1) self._last_id_cs = tf.CriticalSection(name='last_id')
def __init__(self, time_step_spec, action_spec, optimizer=None, actor_net=None, value_net=None, importance_ratio_clipping=0.0, lambda_value=0.95, discount_factor=0.99, entropy_regularization=0.0, policy_l2_reg=0.0, value_function_l2_reg=0.0, value_pred_loss_coef=0.5, num_epochs=25, use_gae=False, use_td_lambda_return=False, normalize_rewards=True, reward_norm_clipping=10.0, normalize_observations=True, log_prob_clipping=0.0, kl_cutoff_factor=2.0, kl_cutoff_coef=1000.0, initial_adaptive_kl_beta=1.0, adaptive_kl_target=0.01, adaptive_kl_tolerance=0.3, gradient_clipping=None, check_numerics=False, debug_summaries=False, summarize_grads_and_vars=False, train_step_counter=None, name=None): """Creates a PPO Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of BoundedTensorSpec representing the actions. optimizer: Optimizer to use for the agent. actor_net: A function actor_net(observations, action_spec) that returns a tensor of action distribution params for each observation. Takes nested observation and returns nested action. value_net: A function value_net(time_steps) that returns a value tensor from neural net predictions for each observation. Takes nested observation and returns batch of value_preds. importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For more detail, see explanation at the top of the doc. lambda_value: Lambda parameter for TD-lambda computation. discount_factor: Discount factor for return computation. entropy_regularization: Coefficient for entropy regularization loss term. policy_l2_reg: Coefficient for l2 regularization of policy weights. value_function_l2_reg: Coefficient for l2 regularization of value function weights. value_pred_loss_coef: Multiplier for value prediction loss to balance with policy gradient loss. num_epochs: Number of epochs for computing policy updates. use_gae: If True (default False), uses generalized advantage estimation for computing per-timestep advantage. Else, just subtracts value predictions from empirical return. use_td_lambda_return: If True (default False), uses td_lambda_return for training value function. (td_lambda_return = gae_advantage + value_predictions) normalize_rewards: If true, keeps moving variance of rewards and normalizes incoming rewards. reward_norm_clipping: Value above and below to clip normalized reward. normalize_observations: If true, keeps moving mean and variance of observations and normalizes incoming observations. log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN values. Default: no clipping. kl_cutoff_factor: If policy KL changes more than this much for any single timestep, adds a squared KL penalty to loss function. kl_cutoff_coef: Loss coefficient for kl cutoff term. initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive kl penalty. adaptive_kl_target: Desired kl target for policy updates. If actual kl is far from this target, adaptive_kl_beta will be updated. adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above (1 + tol) * adaptive_kl_target, or below (1 - tol) * adaptive_kl_target, will cause adaptive_kl_beta to be updated. gradient_clipping: Norm length to clip gradients. Default: no clipping. check_numerics: If true, adds tf.debugging.check_numerics to help find NaN / Inf values. For debugging only. debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If true, gradient summaries will be written.
train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: ValueError: If the actor_net is not a DistributionNetwork. """ if not isinstance(actor_net, network.DistributionNetwork): raise ValueError( 'actor_net must be an instance of a DistributionNetwork.') tf.Module.__init__(self, name=name) self._optimizer = optimizer self._actor_net = actor_net self._value_net = value_net self._importance_ratio_clipping = importance_ratio_clipping self._lambda = lambda_value self._discount_factor = discount_factor self._entropy_regularization = entropy_regularization self._policy_l2_reg = policy_l2_reg self._value_function_l2_reg = value_function_l2_reg self._value_pred_loss_coef = value_pred_loss_coef self._num_epochs = num_epochs self._use_gae = use_gae self._use_td_lambda_return = use_td_lambda_return self._reward_norm_clipping = reward_norm_clipping self._log_prob_clipping = log_prob_clipping self._kl_cutoff_factor = kl_cutoff_factor self._kl_cutoff_coef = kl_cutoff_coef self._adaptive_kl_target = adaptive_kl_target self._adaptive_kl_tolerance = adaptive_kl_tolerance self._gradient_clipping = gradient_clipping or 0.0 self._check_numerics = check_numerics if initial_adaptive_kl_beta > 0.0: # TODO(kbanoop): Rename create_variable. self._adaptive_kl_beta = common.create_variable( 'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32) else: self._adaptive_kl_beta = None self._reward_normalizer = None if normalize_rewards: self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer( tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward') self._observation_normalizer = None if normalize_observations: self._observation_normalizer = ( tensor_normalizer.StreamingTensorNormalizer( time_step_spec.observation, scope='normalize_observations')) policy = greedy_policy.GreedyPolicy( ppo_policy.PPOPolicy( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=actor_net, value_network=value_net, observation_normalizer=self._observation_normalizer, clip=False, collect=False)) collect_policy = ppo_policy.PPOPolicy( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=actor_net, value_network=value_net, observation_normalizer=self._observation_normalizer, clip=False, collect=True) self._action_distribution_spec = (self._actor_net.output_spec) super(PPOAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy, train_sequence_length=None, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter)
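# For illustration: a standalone sketch of the adaptive-KL rule described in
# the docstring above (beta grows when the mean KL overshoots the target band
# and shrinks when it undershoots). The 1.5x adjustment factor is an
# assumption for this sketch, not necessarily the agent's exact update.
def update_adaptive_kl_beta(mean_kl, beta, kl_target, kl_tolerance):
  """Returns an updated beta coefficient for the adaptive KL penalty."""
  if mean_kl > (1.0 + kl_tolerance) * kl_target:
    return beta * 1.5  # Policy moved too far; penalize KL more next epoch.
  if mean_kl < (1.0 - kl_tolerance) * kl_target:
    return beta / 1.5  # Policy barely moved; relax the KL penalty.
  return beta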
def __init__(self, name='NumberOfEpisodes', prefix='Metrics', dtype=tf.int64): super(NumberOfEpisodes, self).__init__(name=name, prefix=prefix) self.dtype = dtype self.number_episodes = common.create_variable( initial_value=0, dtype=self.dtype, shape=(), name='number_episodes')
def __init__(self, time_step_spec: ts.TimeStep, action_spec: types.NestedTensorSpec, critic_network: network.Network, actor_network: network.Network, actor_optimizer: types.Optimizer, critic_optimizer: types.Optimizer, alpha_optimizer: types.Optimizer, actor_loss_weight: types.Float = 1.0, critic_loss_weight: types.Float = 0.5, alpha_loss_weight: types.Float = 1.0, actor_policy_ctor: Callable[ ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy, critic_network_2: Optional[network.Network] = None, target_critic_network: Optional[network.Network] = None, target_critic_network_2: Optional[network.Network] = None, target_update_tau: types.Float = 1.0, target_update_period: types.Int = 1, td_errors_loss_fn: types.LossFn = tf.math.squared_difference, gamma: types.Float = 1.0, reward_scale_factor: types.Float = 1.0, initial_log_alpha: types.Float = 0.0, use_log_alpha_in_alpha_loss: bool = True, target_entropy: Optional[types.Float] = None, gradient_clipping: Optional[types.Float] = None, debug_summaries: bool = False, summarize_grads_and_vars: bool = False, train_step_counter: Optional[tf.Variable] = None, name: Optional[Text] = None): """Creates a SAC Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of BoundedTensorSpec representing the actions. critic_network: A function critic_network((observations, actions)) that returns the q_values for each observation and action. actor_network: A function actor_network(observation, action_spec) that returns action distribution. actor_optimizer: The optimizer to use for the actor network. critic_optimizer: The default optimizer to use for the critic network. alpha_optimizer: The default optimizer to use for the alpha variable. actor_loss_weight: The weight on actor loss. critic_loss_weight: The weight on critic loss. alpha_loss_weight: The weight on alpha loss. actor_policy_ctor: The policy class to use. critic_network_2: (Optional.) A `tf_agents.network.Network` to be used as the second critic network during Q learning. The weights from `critic_network` are copied if this is not provided. target_critic_network: (Optional.) A `tf_agents.network.Network` to be used as the target critic network during Q learning. Every `target_update_period` train steps, the weights from `critic_network` are copied (possibly with smoothing via `target_update_tau`) to `target_critic_network`. If `target_critic_network` is not provided, it is created by making a copy of `critic_network`, which initializes a new network with the same structure and its own layers and weights. Performing a `Network.copy` does not work when the network instance already has trainable parameters (e.g., has already been built, or when the network is sharing layers with another). In these cases, it is up to you to build a copy having weights that are not shared with the original `critic_network`, so that this can be used as a target network. If you provide a `target_critic_network` that shares any weights with `critic_network`, a warning will be logged but no exception is thrown. target_critic_network_2: (Optional.) Similar network as target_critic_network but for the critic_network_2. See documentation for target_critic_network. Will only be used if 'critic_network_2' is also specified. target_update_tau: Factor for soft update of the target networks. target_update_period: Period for soft update of the target networks. td_errors_loss_fn: A function for computing the elementwise TD errors loss. gamma: A discount factor for future rewards.
reward_scale_factor: Multiplicative scale for the reward. initial_log_alpha: Initial value for log_alpha. use_log_alpha_in_alpha_loss: A boolean, whether using log_alpha or alpha in alpha loss. Certain implementations of SAC use log_alpha as log values are generally nicer to work with. target_entropy: The target average policy entropy, for updating alpha. The default value is negative of the total number of actions. gradient_clipping: Norm length to clip gradients. debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If True, gradient and network variable summaries will be written during training. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name. """ tf.Module.__init__(self, name=name) self._check_action_spec(action_spec) net_observation_spec = time_step_spec.observation critic_spec = (net_observation_spec, action_spec) self._critic_network_1 = critic_network if critic_network_2 is not None: self._critic_network_2 = critic_network_2 else: self._critic_network_2 = critic_network.copy(name='CriticNetwork2') # Do not use target_critic_network_2 if critic_network_2 is None. target_critic_network_2 = None # Wait until critic_network_2 has been copied from critic_network_1 before # creating variables on both. self._critic_network_1.create_variables(critic_spec) self._critic_network_2.create_variables(critic_spec) if target_critic_network: target_critic_network.create_variables(critic_spec) self._target_critic_network_1 = ( common.maybe_copy_target_network_with_checks( self._critic_network_1, target_critic_network, input_spec=critic_spec, name='TargetCriticNetwork1')) if target_critic_network_2: target_critic_network_2.create_variables(critic_spec) self._target_critic_network_2 = ( common.maybe_copy_target_network_with_checks( self._critic_network_2, target_critic_network_2, input_spec=critic_spec, name='TargetCriticNetwork2')) if actor_network: actor_network.create_variables(net_observation_spec) self._actor_network = actor_network policy = actor_policy_ctor(time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, training=False) self._train_policy = actor_policy_ctor( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, training=True) self._log_alpha = common.create_variable( 'initial_log_alpha', initial_value=initial_log_alpha, dtype=tf.float32, trainable=True) if target_entropy is None: target_entropy = self._get_default_target_entropy(action_spec) self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss self._target_update_tau = target_update_tau self._target_update_period = target_update_period self._actor_optimizer = actor_optimizer self._critic_optimizer = critic_optimizer self._alpha_optimizer = alpha_optimizer self._actor_loss_weight = actor_loss_weight self._critic_loss_weight = critic_loss_weight self._alpha_loss_weight = alpha_loss_weight self._td_errors_loss_fn = td_errors_loss_fn self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._target_entropy = target_entropy self._gradient_clipping = gradient_clipping self._debug_summaries = debug_summaries self._summarize_grads_and_vars = summarize_grads_and_vars self._update_target = self._get_target_updater( tau=self._target_update_tau, period=self._target_update_period) train_sequence_length = 2 if not critic_network.state_spec else None 
super(SacAgent, self).__init__( time_step_spec, action_spec, policy=policy, collect_policy=policy, train_sequence_length=train_sequence_length, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter, ) self._as_transition = data_converter.AsTransition( self.data_context, squeeze_time_dim=(train_sequence_length == 2))
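# For illustration: the default target-entropy rule mentioned in the SAC
# docstring above (target_entropy=None falls back to the negative of the
# total number of continuous action dimensions), written as a small helper.
# It mirrors the computation done inline in the older SacAgent constructor
# further below.
import numpy as np
import tensorflow as tf

def default_target_entropy(action_spec):
  """Returns -sum(prod(shape)) over the flattened action spec."""
  flat_action_spec = tf.nest.flatten(action_spec)
  return -np.sum(
      [np.prod(spec.shape.as_list()) for spec in flat_action_spec])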
def __init__(self, name='EnvironmentSteps', prefix='Metrics', dtype=tf.int64): super(EnvironmentSteps, self).__init__(name=name, prefix=prefix) self.dtype = dtype self.environment_steps = common.create_variable( initial_value=0, dtype=self.dtype, shape=(), name='environment_steps')
def __init__(self, time_step_spec: ts.TimeStep, action_spec: types.NestedTensorSpec, critic_network: network.Network, actor_network: network.Network, actor_optimizer: types.Optimizer, critic_optimizer: types.Optimizer, alpha_optimizer: types.Optimizer, actor_loss_weight: types.Float = 1.0, critic_loss_weight: types.Float = 0.5, alpha_loss_weight: types.Float = 1.0, actor_policy_ctor: Callable[ ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy, critic_network_2: Optional[network.Network] = None, target_critic_network: Optional[network.Network] = None, target_critic_network_2: Optional[network.Network] = None, target_update_tau: types.Float = 1.0, target_update_period: types.Int = 1, td_errors_loss_fn: types.LossFn = tf.math.squared_difference, gamma: types.Float = 1.0, sigma: types.Float = 0.9, reward_scale_factor: types.Float = 1.0, initial_log_alpha: types.Float = 0.0, use_log_alpha_in_alpha_loss: bool = True, target_entropy: Optional[types.Float] = None, gradient_clipping: Optional[types.Float] = None, debug_summaries: bool = False, summarize_grads_and_vars: bool = False, train_step_counter: Optional[tf.Variable] = None, name: Optional[Text] = None): tf.Module.__init__(self, name=name) self._check_action_spec(action_spec) net_observation_spec = time_step_spec.observation critic_spec = (net_observation_spec, action_spec) self._critic_network_1 = critic_network if critic_network_2 is not None: self._critic_network_2 = critic_network_2 else: self._critic_network_2 = critic_network.copy(name='CriticNetwork2') # Do not use target_critic_network_2 if critic_network_2 is None. target_critic_network_2 = None # Wait until critic_network_2 has been copied from critic_network_1 before # creating variables on both. self._critic_network_1.create_variables(critic_spec) self._critic_network_2.create_variables(critic_spec) if target_critic_network: target_critic_network.create_variables(critic_spec) self._target_critic_network_1 = ( common.maybe_copy_target_network_with_checks( self._critic_network_1, target_critic_network, input_spec=critic_spec, name='TargetCriticNetwork1')) if target_critic_network_2: target_critic_network_2.create_variables(critic_spec) self._target_critic_network_2 = ( common.maybe_copy_target_network_with_checks( self._critic_network_2, target_critic_network_2, input_spec=critic_spec, name='TargetCriticNetwork2')) if actor_network: actor_network.create_variables(net_observation_spec) self._actor_network = actor_network policy = actor_policy_ctor(time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, training=False) self._train_policy = actor_policy_ctor( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network, training=True) self._log_alpha = common.create_variable( 'initial_log_alpha', initial_value=initial_log_alpha, dtype=tf.float32, trainable=True) if target_entropy is None: target_entropy = self._get_default_target_entropy(action_spec) self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss self._target_update_tau = target_update_tau self._target_update_period = target_update_period self._actor_optimizer = actor_optimizer self._critic_optimizer = critic_optimizer self._alpha_optimizer = alpha_optimizer self._actor_loss_weight = actor_loss_weight self._critic_loss_weight = critic_loss_weight self._alpha_loss_weight = alpha_loss_weight self._td_errors_loss_fn = td_errors_loss_fn self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._target_entropy = target_entropy 
self._gradient_clipping = gradient_clipping self._debug_summaries = debug_summaries self._summarize_grads_and_vars = summarize_grads_and_vars self._update_target = self._get_target_updater( tau=self._target_update_tau, period=self._target_update_period) self.sigma = sigma train_sequence_length = 2 if not critic_network.state_spec else None super(sac_agent.SacAgent, self).__init__(time_step_spec, action_spec, policy=policy, collect_policy=policy, train_sequence_length=train_sequence_length, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter, validate_args=False) self._as_transition = data_converter.AsTransition( self.data_context, squeeze_time_dim=(train_sequence_length == 2))
def testDefaults(self): counter = common.create_variable('counter') self.evaluate(tf.compat.v1.global_variables_initializer()) self.assertEqual(self.evaluate(counter), 0)
def __init__(self, time_step_spec, action_spec, critic_network, actor_network, actor_optimizer, critic_optimizer, alpha_optimizer, actor_policy_ctor=actor_policy.ActorPolicy, critic_network_2=None, target_critic_network=None, target_critic_network_2=None, target_update_tau=1.0, target_update_period=1, td_errors_loss_fn=tf.math.squared_difference, gamma=1.0, reward_scale_factor=1.0, initial_log_alpha=0.0, target_entropy=None, gradient_clipping=None, debug_summaries=False, summarize_grads_and_vars=False, train_step_counter=None, name=None): """Creates a SAC Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of BoundedTensorSpec representing the actions. critic_network: A function critic_network((observations, actions)) that returns the q_values for each observation and action. actor_network: A function actor_network(observation, action_spec) that returns action distribution. actor_optimizer: The optimizer to use for the actor network. critic_optimizer: The default optimizer to use for the critic network. alpha_optimizer: The default optimizer to use for the alpha variable. actor_policy_ctor: The policy class to use. critic_network_2: (Optional.) A `tf_agents.network.Network` to be used as the second critic network during Q learning. The weights from `critic_network` are copied if this is not provided. target_critic_network: (Optional.) A `tf_agents.network.Network` to be used as the target critic network during Q learning. Every `target_update_period` train steps, the weights from `critic_network` are copied (possibly with smoothing via `target_update_tau`) to `target_critic_network`. If `target_critic_network` is not provided, it is created by making a copy of `critic_network`, which initializes a new network with the same structure and its own layers and weights. Performing a `Network.copy` does not work when the network instance already has trainable parameters (e.g., has already been built, or when the network is sharing layers with another). In these cases, it is up to you to build a copy having weights that are not shared with the original `critic_network`, so that this can be used as a target network. If you provide a `target_critic_network` that shares any weights with `critic_network`, a warning will be logged but no exception is thrown. target_critic_network_2: (Optional.) Similar network as target_critic_network but for the critic_network_2. See documentation for target_critic_network. Will only be used if 'critic_network_2' is also specified. target_update_tau: Factor for soft update of the target networks. target_update_period: Period for soft update of the target networks. td_errors_loss_fn: A function for computing the elementwise TD errors loss. gamma: A discount factor for future rewards. reward_scale_factor: Multiplicative scale for the reward. initial_log_alpha: Initial value for log_alpha. target_entropy: The target average policy entropy, for updating alpha. The default value is negative of the total number of actions. gradient_clipping: Norm length to clip gradients. debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If True, gradient and network variable summaries will be written during training. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name.
""" tf.Module.__init__(self, name=name) flat_action_spec = tf.nest.flatten(action_spec) for spec in flat_action_spec: if spec.dtype.is_integer: raise NotImplementedError( 'SacAgent does not currently support discrete actions. ' 'Action spec: {}'.format(action_spec)) self._critic_network_1 = critic_network self._critic_network_1.create_variables() if target_critic_network: target_critic_network.create_variables() self._target_critic_network_1 = ( common.maybe_copy_target_network_with_checks( self._critic_network_1, target_critic_network, 'TargetCriticNetwork1')) if critic_network_2 is not None: self._critic_network_2 = critic_network_2 else: self._critic_network_2 = critic_network.copy(name='CriticNetwork2') # Do not use target_critic_network_2 if critic_network_2 is None. target_critic_network_2 = None self._critic_network_2.create_variables() if target_critic_network_2: target_critic_network_2.create_variables() self._target_critic_network_2 = ( common.maybe_copy_target_network_with_checks( self._critic_network_2, target_critic_network_2, 'TargetCriticNetwork2')) if actor_network: actor_network.create_variables() self._actor_network = actor_network policy = actor_policy_ctor(time_step_spec=time_step_spec, action_spec=action_spec, actor_network=self._actor_network) self._log_alpha = common.create_variable( 'initial_log_alpha', initial_value=initial_log_alpha, dtype=tf.float32, trainable=True) # If target_entropy was not passed, set it to negative of the total number # of action dimensions. if target_entropy is None: flat_action_spec = tf.nest.flatten(action_spec) target_entropy = -np.sum([ np.product(single_spec.shape.as_list()) for single_spec in flat_action_spec ]) self._target_update_tau = target_update_tau self._target_update_period = target_update_period self._actor_optimizer = actor_optimizer self._critic_optimizer = critic_optimizer self._alpha_optimizer = alpha_optimizer self._td_errors_loss_fn = td_errors_loss_fn self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._target_entropy = target_entropy self._gradient_clipping = gradient_clipping self._debug_summaries = debug_summaries self._summarize_grads_and_vars = summarize_grads_and_vars self._update_target = self._get_target_updater( tau=self._target_update_tau, period=self._target_update_period) train_sequence_length = 2 if not critic_network.state_spec else None super(SacAgent, self).__init__(time_step_spec, action_spec, policy=policy, collect_policy=policy, train_sequence_length=train_sequence_length, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter)
def testIncrement(self): counter = common.create_variable('counter', 0) inc_counter = counter.assign_add(1) self.evaluate(tf.compat.v1.global_variables_initializer()) self.assertEqual(self.evaluate(inc_counter), 1)
def __init__( self, time_step_spec, action_spec, optimizer = None, actor_net = None, value_net = None, importance_ratio_clipping = 0.0, lambda_value = 0.95, discount_factor = 0.99, entropy_regularization = 0.0, policy_l2_reg = 0.0, value_function_l2_reg = 0.0, shared_vars_l2_reg = 0.0, value_pred_loss_coef = 0.5, num_epochs = 25, use_gae = False, use_td_lambda_return = False, normalize_rewards = True, reward_norm_clipping = 10.0, normalize_observations = True, log_prob_clipping = 0.0, kl_cutoff_factor = 0.0, kl_cutoff_coef = 0.0, initial_adaptive_kl_beta = 0.0, adaptive_kl_target = 0.0, adaptive_kl_tolerance = 0.0, gradient_clipping = None, value_clipping = None, check_numerics = False, # TODO(b/150244758): Change the default to False once we move # clients onto Reverb. compute_value_and_advantage_in_train = True, update_normalizers_in_train = True, debug_summaries = False, summarize_grads_and_vars = False, train_step_counter = None, name = 'AttentionPPOAgent'): """Creates a PPO Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of `BoundedTensorSpec` representing the actions. optimizer: Optimizer to use for the agent, default to using `tf.compat.v1.train.AdamOptimizer`. actor_net: A `network.DistributionNetwork` which maps observations to action distributions. Commonly, it is set to `actor_distribution_network.ActorDistributionNetwork`. value_net: A `Network` which returns the value prediction for input states, with `call(observation, step_type, network_state)`. Commonly, it is set to `value_network.ValueNetwork`. importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For more detail, see explanation at the top of the doc. lambda_value: Lambda parameter for TD-lambda computation. discount_factor: Discount factor for return computation. Default to `0.99` which is the value used for all environments from (Schulman, 2017). entropy_regularization: Coefficient for entropy regularization loss term. Default to `0.0` because no entropy bonus was used in (Schulman, 2017). policy_l2_reg: Coefficient for L2 regularization of unshared actor_net weights. Default to `0.0` because no L2 regularization was applied on the policy network weights in (Schulman, 2017). value_function_l2_reg: Coefficient for l2 regularization of unshared value function weights. Default to `0.0` because no L2 regularization was applied on the value network weights in (Schulman, 2017). shared_vars_l2_reg: Coefficient for l2 regularization of weights shared between actor_net and value_net. Default to `0.0` because no L2 regularization was applied on the policy network or value network weights in (Schulman, 2017). value_pred_loss_coef: Multiplier for value prediction loss to balance with policy gradient loss. Default to `0.5`, which was used for all environments in the OpenAI baseline implementation. This parameter is irrelevant unless you are sharing part of actor_net and value_net. In that case, you would want to tune this coefficient, whose value depends on the network architecture of your choice. num_epochs: Number of epochs for computing policy updates. (Schulman, 2017) sets this to 10 for Mujoco, 15 for Roboschool and 3 for Atari. use_gae: If True (default False), uses generalized advantage estimation for computing per-timestep advantage. Else, just subtracts value predictions from empirical return. use_td_lambda_return: If True (default False), uses td_lambda_return for training value function; here: `td_lambda_return = gae_advantage + value_predictions`.
`use_gae` must be set to `True` as well to enable TD-lambda returns. If `use_td_lambda_return` is set to True while `use_gae` is False, the empirical return will be used and a warning will be logged. normalize_rewards: If true, keeps moving variance of rewards and normalizes incoming rewards. While not mentioned directly in (Schulman, 2017), reward normalization was implemented in OpenAI baselines and (Ilyas et al., 2018) pointed out that it largely improves performance. You may refer to Figure 1 of https://arxiv.org/pdf/1811.02553.pdf for a comparison with and without reward scaling. reward_norm_clipping: Value above and below to clip normalized reward. Additional optimization proposed in (Ilyas et al., 2018), set to `5` or `10`. normalize_observations: If `True`, keeps moving mean and variance of observations and normalizes incoming observations. Additional optimization proposed in (Ilyas et al., 2018). If true, and the observation spec is not tf.float32 (such as Atari), please manually convert the observation spec received from the environment to tf.float32 before creating the networks. Otherwise, the normalized input to the network (float32) will have a different dtype than what the network expects, resulting in a mismatch error. Example usage: ```python observation_tensor_spec, action_spec, time_step_tensor_spec = ( spec_utils.get_tensor_specs(env)) normalized_observation_tensor_spec = tf.nest.map_structure( lambda s: tf.TensorSpec( dtype=tf.float32, shape=s.shape, name=s.name ), observation_tensor_spec ) actor_net = actor_distribution_network.ActorDistributionNetwork( normalized_observation_tensor_spec, ...) value_net = value_network.ValueNetwork( normalized_observation_tensor_spec, ...) # Note that the agent still uses the original time_step_tensor_spec # from the environment. agent = ppo_clip_agent.PPOClipAgent( time_step_tensor_spec, action_spec, actor_net, value_net, ...) ``` log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN values. Default: no clipping. kl_cutoff_factor: Only meaningful when `kl_cutoff_coef > 0.0`. A multiplier used for calculating the KL cutoff ( = `kl_cutoff_factor * adaptive_kl_target`). If policy KL averaged across the batch changes more than the cutoff, a squared cutoff loss would be added to the loss function. kl_cutoff_coef: kl_cutoff_coef and kl_cutoff_factor are additional params if one wants to use a KL cutoff loss term in addition to the adaptive KL loss term. Default to 0.0 to disable the KL cutoff loss term as this was not used in the paper. kl_cutoff_coef is the coefficient to multiply by the KL cutoff loss term, before adding to the total loss function. initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive KL penalty. This initial value is not important in practice because the algorithm quickly adjusts to it. A common default is 1.0. adaptive_kl_target: Desired KL target for policy updates. If actual KL is far from this target, adaptive_kl_beta will be updated. You should tune this for your environment. 0.01 was found to perform well for Mujoco. adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above `(1 + tol) * adaptive_kl_target`, or below `(1 - tol) * adaptive_kl_target`, will cause `adaptive_kl_beta` to be updated. `0.5` was chosen heuristically in the paper, but the algorithm is not very sensitive to it. gradient_clipping: Norm length to clip gradients. Default: no clipping. value_clipping: Differences between new and old value predictions are clipped to this threshold.
Value clipping could be helpful when training very deep networks. Default: no clipping. check_numerics: If true, adds `tf.debugging.check_numerics` to help find NaN / Inf values. For debugging only. compute_value_and_advantage_in_train: A bool to indicate where value prediction and advantage calculation happen. If True, both happen in agent.train(). If False, value prediction is computed during data collection. This argument must be set to `False` if mini batch learning is enabled. update_normalizers_in_train: A bool to indicate whether normalizers are updated as parts of the `train` method. Set to `False` if mini batch learning is enabled, or if `train` is called on multiple iterations of the same trajectories. In that case, you would need to use `PPOLearner` (which updates all the normalizers outside of the agent). This ensures that normalizers are updated in the same way as (Schulman, 2017). debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If true, gradient summaries will be written. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: TypeError: if `actor_net` or `value_net` is not of type `tf_agents.networks.Network`. """ if not isinstance(actor_net, network.Network): raise TypeError('actor_net must be an instance of a network.Network.') if not isinstance(value_net, network.Network): raise TypeError('value_net must be an instance of a network.Network.') # PPOPolicy validates these, so we skip validation here. actor_net.create_variables(time_step_spec.observation) value_net.create_variables(time_step_spec.observation) tf.Module.__init__(self, name=name) self._optimizer = optimizer self._actor_net = actor_net self._value_net = value_net self._importance_ratio_clipping = importance_ratio_clipping self._lambda = lambda_value self._discount_factor = discount_factor self._entropy_regularization = entropy_regularization self._policy_l2_reg = policy_l2_reg self._value_function_l2_reg = value_function_l2_reg self._shared_vars_l2_reg = shared_vars_l2_reg self._value_pred_loss_coef = value_pred_loss_coef self._num_epochs = num_epochs self._use_gae = use_gae self._use_td_lambda_return = use_td_lambda_return self._reward_norm_clipping = reward_norm_clipping self._log_prob_clipping = log_prob_clipping self._kl_cutoff_factor = kl_cutoff_factor self._kl_cutoff_coef = kl_cutoff_coef self._adaptive_kl_target = adaptive_kl_target self._adaptive_kl_tolerance = adaptive_kl_tolerance self._gradient_clipping = gradient_clipping or 0.0 self._value_clipping = value_clipping or 0.0 self._check_numerics = check_numerics self._compute_value_and_advantage_in_train = ( compute_value_and_advantage_in_train) self.update_normalizers_in_train = update_normalizers_in_train if not isinstance(self._optimizer, tf.keras.optimizers.Optimizer): logging.warning( 'Only tf.keras.optimizers.Optimizers are well supported, got a ' 'non-TF2 optimizer: %s', self._optimizer) self._initial_adaptive_kl_beta = initial_adaptive_kl_beta if initial_adaptive_kl_beta > 0.0: self._adaptive_kl_beta = common.create_variable( 'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32) else: self._adaptive_kl_beta = None self._reward_normalizer = None if normalize_rewards: self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer( tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward') self._observation_normalizer 
= None if normalize_observations: self._observation_normalizer = ( tensor_normalizer.StreamingTensorNormalizer( time_step_spec.observation, scope='normalize_observations')) self._advantage_normalizer = tensor_normalizer.StreamingTensorNormalizer( tensor_spec.TensorSpec([], tf.float32), scope='normalize_advantages') policy = greedy_policy.GreedyPolicy( attention_ppo_policy.AttentionPPOPolicy( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=actor_net, value_network=value_net, observation_normalizer=self._observation_normalizer, clip=False, collect=False)) collect_policy = attention_ppo_policy.AttentionPPOPolicy( time_step_spec=time_step_spec, action_spec=action_spec, actor_network=actor_net, value_network=value_net, observation_normalizer=self._observation_normalizer, clip=False, collect=True, compute_value_and_advantage_in_train=( self._compute_value_and_advantage_in_train), ) if isinstance(self._actor_net, network.DistributionNetwork): # Legacy behavior self._action_distribution_spec = self._actor_net.output_spec else: self._action_distribution_spec = self._actor_net.create_variables( time_step_spec.observation) # Set training_data_spec to collect_data_spec with augmented policy info, # iff return and normalized advantage are saved in preprocess_sequence. if self._compute_value_and_advantage_in_train: training_data_spec = None else: training_policy_info = collect_policy.trajectory_spec.policy_info.copy() training_policy_info.update({ 'value_prediction': collect_policy.trajectory_spec.policy_info['value_prediction'], 'return': tensor_spec.TensorSpec(shape=[], dtype=tf.float32), 'advantage': tensor_spec.TensorSpec(shape=[], dtype=tf.float32), }) training_data_spec = collect_policy.trajectory_spec.replace( policy_info=training_policy_info) super(ppo_agent.PPOAgent, self).__init__( time_step_spec, action_spec, policy, collect_policy, train_sequence_length=None, training_data_spec=training_data_spec, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter) # This must be built after super() which sets up self.data_context. self._collected_as_transition = data_converter.AsTransition( self.collect_data_context, squeeze_time_dim=False) self._as_trajectory = data_converter.AsTrajectory( self.data_context, sequence_length=None)
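# For illustration: a hedged sketch of the KL-cutoff term described in the
# constructor docstring above; a squared penalty that activates only once the
# batch-mean KL exceeds kl_cutoff_factor * adaptive_kl_target. The exact
# reduction may differ from the agent's own kl_cutoff_loss implementation.
import tensorflow as tf

def kl_cutoff_penalty(kl_divergence, kl_cutoff_factor, adaptive_kl_target,
                      kl_cutoff_coef):
  """Returns the squared-hinge KL cutoff penalty for a batch of KL values."""
  kl_cutoff = kl_cutoff_factor * adaptive_kl_target
  mean_kl = tf.reduce_mean(kl_divergence)
  return kl_cutoff_coef * tf.square(tf.maximum(0.0, mean_kl - kl_cutoff))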
def testInitialValueWithShape(self): counter = common.create_variable('counter', 1, shape=(2, )) self.evaluate(tf.compat.v1.global_variables_initializer()) self.assertAllEqual(self.evaluate(counter), [1, 1])
def testTrainWithLagrange(self, use_lagrange_cql_alpha, use_variable_for_cql_alpha, log_cql_alpha_clipping, expected_cql_alpha_step_one, expected_cql_alpha_step_two, expected_cql_loss_step_one, expected_cql_loss_step_two): if use_variable_for_cql_alpha: cql_alpha = tf.Variable(5.0) cql_alpha_var = cql_alpha # Getting around type checking. else: cql_alpha = 5.0 cql_alpha_learning_rate = 0.5 cql_tau = 10 num_cql_samples = 5 actor_net = actor_distribution_network.ActorDistributionNetwork( self._obs_spec, self._action_spec, fc_layer_params=None) critic_net = critic_network.CriticNetwork( (self._obs_spec, self._action_spec), observation_fc_layer_params=(16,), action_fc_layer_params=(16,), joint_fc_layer_params=(16,), kernel_initializer='glorot_uniform', last_kernel_initializer='glorot_uniform') counter = common.create_variable('test_train_counter') optimizer_fn = tf.compat.v1.train.AdamOptimizer agent = cql_sac_agent.CqlSacAgent( self._time_step_spec, self._action_spec, critic_network=critic_net, actor_network=actor_net, actor_optimizer=optimizer_fn(1e-3), critic_optimizer=optimizer_fn(1e-3), alpha_optimizer=optimizer_fn(1e-3), cql_alpha=cql_alpha, num_cql_samples=num_cql_samples, include_critic_entropy_term=False, use_lagrange_cql_alpha=use_lagrange_cql_alpha, cql_alpha_learning_rate=cql_alpha_learning_rate, cql_tau=cql_tau, random_seed=self._random_seed, log_cql_alpha_clipping=log_cql_alpha_clipping, train_step_counter=counter) batch_size = 5 observations = tf.constant( [[[1, 2], [3, 4]]] * batch_size, dtype=tf.float32) actions = tf.constant([[[0], [1]]] * batch_size, dtype=tf.float32) time_steps = ts.TimeStep( step_type=tf.constant([[1] * 2] * batch_size, dtype=tf.int32), reward=tf.constant([[1] * 2] * batch_size, dtype=tf.float32), discount=tf.constant([[1] * 2] * batch_size, dtype=tf.float32), observation=observations) experience = trajectory.Trajectory(time_steps.step_type, observations, actions, (), time_steps.step_type, time_steps.reward, time_steps.discount) # Force variable creation. agent.policy.variables() if not tf.executing_eagerly(): # Get experience first to make sure optimizer variables are created and # can be initialized. experience = agent.train(experience) with self.cached_session() as sess: common.initialize_uninitialized_variables(sess) self.assertEqual(self.evaluate(counter), 0) self.evaluate(experience) self.assertEqual(self.evaluate(counter), 1) else: # Training step one. self.assertEqual(self.evaluate(counter), 0) loss = self.evaluate(agent.train(experience)) self.assertEqual(self.evaluate(counter), 1) self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_one) self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_one) if use_lagrange_cql_alpha: self.assertGreater(loss.extra.cql_alpha_loss, 0) else: self.assertEqual(loss.extra.cql_alpha_loss, 0) # Training step two. if use_variable_for_cql_alpha: cql_alpha_var.assign_add(1) loss = self.evaluate(agent.train(experience)) self.assertEqual(self.evaluate(counter), 2) self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_two) self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_two)