def get_agent(time_step_spec, action_spec, actor_net, value_net, num_epochs,
              step_counter, learning_rate):
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  tf_agent = ppo_clip_agent.PPOClipAgent(
      time_step_spec,
      action_spec,
      optimizer,
      actor_net=actor_net,
      value_net=value_net,
      entropy_regularization=0.0,
      importance_ratio_clipping=0.2,
      normalize_observations=False,
      normalize_rewards=False,
      use_gae=True,
      num_epochs=num_epochs,
      debug_summaries=False,
      summarize_grads_and_vars=False,
      train_step_counter=step_counter)
  return tf_agent
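# Minimal usage sketch for get_agent (not part of the original source). It
# assumes the tf_agents modules imported elsewhere in this file; the
# environment name, layer sizes, and hyperparameters are illustrative
# assumptions.
env = tf_py_environment.TFPyEnvironment(suite_mujoco.load('HalfCheetah-v2'))
actor_net = actor_distribution_network.ActorDistributionNetwork(
    env.observation_spec(), env.action_spec(), fc_layer_params=(200, 100))
value_net = value_network.ValueNetwork(
    env.observation_spec(), fc_layer_params=(200, 100))
global_step = tf.compat.v1.train.get_or_create_global_step()

agent = get_agent(
    env.time_step_spec(),
    env.action_spec(),
    actor_net,
    value_net,
    num_epochs=25,
    step_counter=global_step,
    learning_rate=1e-3)
agent.initialize()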
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    env_load_fn=suite_mujoco.load,
    random_seed=None,
    # TODO(b/127576522): rename to policy_fc_layers.
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    use_rnns=False,
    lstm_size=(20,),
    # Params for collect
    num_environment_steps=25000000,
    collect_episodes_per_iteration=30,
    num_parallel_environments=30,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for train
    num_epochs=25,
    learning_rate=1e-3,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=500,
    # Params for summaries and logging
    train_checkpoint_interval=500,
    policy_checkpoint_interval=500,
    log_interval=50,
    summary_interval=50,
    summaries_flush_secs=1,
    use_tf_functions=True,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """A simple train and eval for PPO."""
  if root_dir is None:
    raise AttributeError('train_eval requires a root_dir.')

  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')
  saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    if random_seed is not None:
      tf.compat.v1.set_random_seed(random_seed)
    eval_tf_env = tf_py_environment.TFPyEnvironment(env_load_fn(env_name))
    tf_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            [lambda: env_load_fn(env_name)] * num_parallel_environments))
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    if use_rnns:
      actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          input_fc_layer_params=actor_fc_layers,
          output_fc_layer_params=None,
          lstm_size=lstm_size)
      value_net = value_rnn_network.ValueRnnNetwork(
          tf_env.observation_spec(),
          input_fc_layer_params=value_fc_layers,
          output_fc_layer_params=None)
    else:
      actor_net = actor_distribution_network.ActorDistributionNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          fc_layer_params=actor_fc_layers,
          activation_fn=tf.keras.activations.tanh)
      value_net = value_network.ValueNetwork(
          tf_env.observation_spec(),
          fc_layer_params=value_fc_layers,
          activation_fn=tf.keras.activations.tanh)

    tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(batch_size=num_parallel_environments),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_parallel_environments),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=eval_policy,
        global_step=global_step)
    saved_model = policy_saver.PolicySaver(eval_policy, train_step=global_step)

    train_checkpointer.initialize_or_restore()

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration)

    def train_step():
      trajectories = replay_buffer.gather_all()
      return tf_agent.train(experience=trajectories)

    if use_tf_functions:
      # TODO(b/123828980): Enable once the cause for slowdown was identified.
      collect_driver.run = common.function(collect_driver.run, autograph=False)
      tf_agent.train = common.function(tf_agent.train, autograph=False)
      train_step = common.function(train_step)

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    while environment_steps_metric.result() < num_environment_steps:
      global_step_val = global_step.numpy()
      if global_step_val % eval_interval == 0:
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )

      start_time = time.time()
      collect_driver.run()
      collect_time += time.time() - start_time

      start_time = time.time()
      total_loss, _ = train_step()
      replay_buffer.clear()
      train_time += time.time() - start_time

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=step_metrics)

      if global_step_val % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step_val, total_loss)
        steps_per_sec = (
            (global_step_val - timed_at_step) / (collect_time + train_time))
        logging.info('%.3f steps/sec', steps_per_sec)
        logging.info('collect_time = %.3f, train_time = %.3f', collect_time,
                     train_time)
        with tf.compat.v2.summary.record_if(True):
          tf.compat.v2.summary.scalar(
              name='global_steps_per_sec', data=steps_per_sec,
              step=global_step)

        if global_step_val % train_checkpoint_interval == 0:
          train_checkpointer.save(global_step=global_step_val)

        if global_step_val % policy_checkpoint_interval == 0:
          policy_checkpointer.save(global_step=global_step_val)
          saved_model_path = os.path.join(
              saved_model_dir, 'policy_' + ('%d' % global_step_val).zfill(9))
          saved_model.save(saved_model_path)

        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0

    # One final eval before exiting.
    metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
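# Hypothetical driver for the eager train_eval above (a sketch, not from the
# original source): the root_dir and the shortened flag values below are
# illustrative assumptions for a quick smoke test.
train_eval(
    root_dir='/tmp/ppo_halfcheetah',
    env_name='HalfCheetah-v2',
    num_environment_steps=100000,
    collect_episodes_per_iteration=4,
    num_parallel_environments=4,
    num_eval_episodes=5,
    eval_interval=100)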
def train_eval(
    root_dir,
    tf_master='',
    env_name='HalfCheetah-v2',
    env_load_fn=suite_mujoco.load,
    random_seed=None,
    # TODO(b/127576522): rename to policy_fc_layers.
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    use_rnns=False,
    # Params for collect
    num_environment_steps=25000000,
    collect_episodes_per_iteration=30,
    num_parallel_environments=30,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for train
    num_epochs=25,
    learning_rate=1e-3,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=500,
    # Params for summaries and logging
    train_checkpoint_interval=500,
    policy_checkpoint_interval=500,
    log_interval=50,
    summary_interval=50,
    summaries_flush_secs=1,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
  """A simple train and eval for PPO."""
  if root_dir is None:
    raise AttributeError('train_eval requires a root_dir.')

  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      batched_py_metric.BatchedPyMetric(
          AverageReturnMetric,
          metric_args={'buffer_size': num_eval_episodes},
          batch_size=num_parallel_environments),
      batched_py_metric.BatchedPyMetric(
          AverageEpisodeLengthMetric,
          metric_args={'buffer_size': num_eval_episodes},
          batch_size=num_parallel_environments),
  ]
  eval_summary_writer_flush_op = eval_summary_writer.flush()

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    if random_seed is not None:
      tf.compat.v1.set_random_seed(random_seed)
    eval_py_env = parallel_py_environment.ParallelPyEnvironment(
        [lambda: env_load_fn(env_name)] * num_parallel_environments)
    tf_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            [lambda: env_load_fn(env_name)] * num_parallel_environments))
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    if use_rnns:
      actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          input_fc_layer_params=actor_fc_layers,
          output_fc_layer_params=None)
      value_net = value_rnn_network.ValueRnnNetwork(
          tf_env.observation_spec(),
          input_fc_layer_params=value_fc_layers,
          output_fc_layer_params=None)
    else:
      actor_net = actor_distribution_network.ActorDistributionNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          fc_layer_params=actor_fc_layers,
          activation_fn=tf.keras.activations.tanh)
      value_net = value_network.ValueNetwork(
          tf_env.observation_spec(),
          fc_layer_params=value_fc_layers,
          activation_fn=tf.keras.activations.tanh)

    tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    environment_steps_count = environment_steps_metric.result()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]
    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(batch_size=num_parallel_environments),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_parallel_environments),
    ]

    # Add to replay buffer and other agent specific observers.
    replay_buffer_observer = [replay_buffer.add_batch]

    collect_policy = tf_agent.collect_policy

    collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=replay_buffer_observer + train_metrics,
        num_episodes=collect_episodes_per_iteration).run()

    trajectories = replay_buffer.gather_all()

    train_op, _ = tf_agent.train(experience=trajectories)

    with tf.control_dependencies([train_op]):
      clear_replay_op = replay_buffer.clear()

    with tf.control_dependencies([clear_replay_op]):
      train_op = tf.identity(train_op)

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=tf_agent.policy,
        global_step=global_step)

    summary_ops = []
    for train_metric in train_metrics:
      summary_ops.append(
          train_metric.tf_summaries(
              train_step=global_step, step_metrics=step_metrics))

    with eval_summary_writer.as_default(), \
         tf.compat.v2.summary.record_if(True):
      for eval_metric in eval_metrics:
        eval_metric.tf_summaries(
            train_step=global_step, step_metrics=step_metrics)

    init_agent_op = tf_agent.initialize()

    with tf.compat.v1.Session(tf_master) as sess:
      # Initialize graph.
      train_checkpointer.initialize_or_restore(sess)
      common.initialize_uninitialized_variables(sess)

      sess.run(init_agent_op)
      sess.run(train_summary_writer.init())
      sess.run(eval_summary_writer.init())

      collect_time = 0
      train_time = 0
      timed_at_step = sess.run(global_step)
      steps_per_second_ph = tf.compat.v1.placeholder(
          tf.float32, shape=(), name='steps_per_sec_ph')
      steps_per_second_summary = tf.compat.v2.summary.scalar(
          name='global_steps_per_sec',
          data=steps_per_second_ph,
          step=global_step)

      while sess.run(environment_steps_count) < num_environment_steps:
        global_step_val = sess.run(global_step)
        if global_step_val % eval_interval == 0:
          metric_utils.compute_summaries(
              eval_metrics,
              eval_py_env,
              eval_py_policy,
              num_episodes=num_eval_episodes,
              global_step=global_step_val,
              callback=eval_metrics_callback,
              log=True,
          )
          sess.run(eval_summary_writer_flush_op)

        start_time = time.time()
        sess.run(collect_op)
        collect_time += time.time() - start_time

        start_time = time.time()
        total_loss, _ = sess.run([train_op, summary_ops])
        train_time += time.time() - start_time

        global_step_val = sess.run(global_step)
        if global_step_val % log_interval == 0:
          logging.info('step = %d, loss = %f', global_step_val, total_loss)
          steps_per_sec = (
              (global_step_val - timed_at_step) / (collect_time + train_time))
          logging.info('%.3f steps/sec', steps_per_sec)
          sess.run(
              steps_per_second_summary,
              feed_dict={steps_per_second_ph: steps_per_sec})
          logging.info('%s', 'collect_time = {}, train_time = {}'.format(
              collect_time, train_time))
          timed_at_step = global_step_val
          collect_time = 0
          train_time = 0

        if global_step_val % train_checkpoint_interval == 0:
          train_checkpointer.save(global_step=global_step_val)

        if global_step_val % policy_checkpoint_interval == 0:
          policy_checkpointer.save(global_step=global_step_val)

      # One final eval before exiting.
      metric_utils.compute_summaries(
          eval_metrics,
          eval_py_env,
          eval_py_policy,
          num_episodes=num_eval_episodes,
          global_step=global_step_val,
          callback=eval_metrics_callback,
          log=True,
      )
      sess.run(eval_summary_writer_flush_op)
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    # Training params
    num_iterations=1600,
    actor_fc_layers=(64, 64),
    value_fc_layers=(64, 64),
    learning_rate=3e-4,
    collect_sequence_length=2048,
    minibatch_size=64,
    num_epochs=10,
    # Agent params
    importance_ratio_clipping=0.2,
    lambda_value=0.95,
    discount_factor=0.99,
    entropy_regularization=0.,
    value_pred_loss_coef=0.5,
    use_gae=True,
    use_td_lambda_return=True,
    gradient_clipping=0.5,
    value_clipping=None,
    # Replay params
    reverb_port=None,
    replay_capacity=10000,
    # Others
    policy_save_interval=5000,
    summary_interval=1000,
    eval_interval=10000,
    eval_episodes=100,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and
      summaries will be written to.
    env_name: Name for the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and
      training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used on the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the
      entire collected sequence will be treated as one batch.
    num_epochs: Number of iterations to repeat over all collected data per
      data collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15
      for Roboschool and 3 for Atari.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
      For more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Default to `0.99`
      which is the value used for all environments from (Schulman, 2017).
    entropy_regularization: Coefficient for entropy regularization loss term.
      Default to `0.0` because no entropy bonus was used in (Schulman, 2017).
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss. Default to `0.5`, which was used for all
      environments in the OpenAI baseline implementation. This parameter is
      irrelevant unless you are sharing part of actor_net and value_net. In
      that case, you would want to tune this coefficient, whose value depends
      on the network architecture of your choice.
    use_gae: If True (default False), uses generalized advantage estimation
      for computing per-timestep advantage. Else, just subtracts value
      predictions from empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return for
      training value function; here:
      `td_lambda_return = gae_advantage + value_predictions`. `use_gae` must
      be set to `True` as well to enable TD-lambda returns. If
      `use_td_lambda_return` is set to True while `use_gae` is False, the
      empirical return will be used and a warning will be logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: Differences between new and old value predictions are
      clipped to this threshold. Value clipping could be helpful when training
      very deep networks. Default: no clipping.
    reverb_port: Port for reverb server, if None, use a randomly chosen unused
      port.
    replay_capacity: The maximum number of elements for the replay buffer.
      Items will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data into Tensorboard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
  collect_env = suite_mujoco.load(env_name)
  eval_env = suite_mujoco.load(env_name)
  num_environments = 1

  observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
      spec_utils.get_tensor_specs(collect_env))

  # TODO(b/172267869): Remove this conversion once TensorNormalizer stops
  # converting float64 inputs to float32.
  observation_tensor_spec = tf.TensorSpec(
      dtype=tf.float32, shape=observation_tensor_spec.shape)

  train_step = train_utils.create_train_step()

  actor_net_builder = ppo_actor_network.PPOActorNetwork()
  actor_net = actor_net_builder.create_sequential_actor_net(
      actor_fc_layers, action_tensor_spec)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec,
      fc_layer_params=value_fc_layers,
      kernel_initializer=tf.keras.initializers.Orthogonal())

  current_iteration = tf.Variable(0, dtype=tf.int64)

  def learning_rate_fn():
    # Linearly decay the learning rate.
    return learning_rate * (1 - current_iteration / num_iterations)

  agent = ppo_clip_agent.PPOClipAgent(
      time_step_tensor_spec,
      action_tensor_spec,
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=learning_rate_fn, epsilon=1e-5),
      actor_net=actor_net,
      value_net=value_net,
      importance_ratio_clipping=importance_ratio_clipping,
      lambda_value=lambda_value,
      discount_factor=discount_factor,
      entropy_regularization=entropy_regularization,
      value_pred_loss_coef=value_pred_loss_coef,
      # This is a legacy argument for the number of times we repeat the data
      # inside of the train function, incompatible with mini batch learning.
      # We set the epoch number from the replay buffer and tf.Data instead.
      num_epochs=1,
      use_gae=use_gae,
      use_td_lambda_return=use_td_lambda_return,
      gradient_clipping=gradient_clipping,
      value_clipping=value_clipping,
      # TODO(b/150244758): Default compute_value_and_advantage_in_train to
      # False after Reverb open source.
      compute_value_and_advantage_in_train=False,
      # Skips updating normalizers in the agent, as it's handled in the
      # learner.
      update_normalizers_in_train=False,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step)
  agent.initialize()

  reverb_server = reverb.Server(
      [
          reverb.Table(  # Replay buffer storing experience for training.
              name='training_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          ),
          reverb.Table(  # Replay buffer storing experience for normalization.
              name='normalization_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          )
      ],
      port=reverb_port)

  # Create the replay buffer.
  reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='training_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)
  reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='normalization_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)

  rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
      reverb_replay_train.py_client,
      ['training_table', 'normalization_table'],
      sequence_length=collect_sequence_length,
      stride_length=collect_sequence_length)

  saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  collect_env_step_metric = py_metrics.EnvironmentSteps()
  learning_triggers = [
      triggers.PolicySavedModelTrigger(
          saved_model_dir,
          agent,
          train_step,
          interval=policy_save_interval,
          metadata_metrics={
              triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
          }),
      triggers.StepPerSecondLogTrigger(train_step, interval=summary_interval),
  ]

  def training_dataset_fn():
    return reverb_replay_train.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  def normalization_dataset_fn():
    return reverb_replay_normalization.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  agent_learner = ppo_learner.PPOLearner(
      root_dir,
      train_step,
      agent,
      experience_dataset_fn=training_dataset_fn,
      normalization_dataset_fn=normalization_dataset_fn,
      num_samples=1,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      shuffle_buffer_size=collect_sequence_length,
      triggers=learning_triggers)

  tf_collect_policy = agent.collect_policy
  collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
      tf_collect_policy, use_tf_function=True)

  collect_actor = actor.Actor(
      collect_env,
      collect_policy,
      train_step,
      steps_per_run=collect_sequence_length,
      observers=[rb_observer],
      metrics=actor.collect_metrics(buffer_size=10) + [collect_env_step_metric],
      reference_metrics=[collect_env_step_metric],
      summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
      summary_interval=summary_interval)

  eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
      agent.policy, use_tf_function=True)

  if eval_interval:
    logging.info('Initial evaluation.')
    eval_actor = actor.Actor(
        eval_env,
        eval_greedy_policy,
        train_step,
        metrics=actor.eval_metrics(eval_episodes),
        reference_metrics=[collect_env_step_metric],
        summary_dir=os.path.join(root_dir, 'eval'),
        episodes_per_run=eval_episodes)
    eval_actor.run_and_log()

  logging.info('Training on %s', env_name)
  last_eval_step = 0
  for i in range(num_iterations):
    collect_actor.run()
    rb_observer.flush()
    agent_learner.run()
    reverb_replay_train.clear()
    reverb_replay_normalization.clear()
    current_iteration.assign_add(1)

    # Eval only if `eval_interval` has been set. Then, eval if the current
    # train step is equal or greater than the `last_eval_step` +
    # `eval_interval` or if this is the last iteration. This logic exists
    # because agent_learner.run() does not return after every train step.
    if (eval_interval and
        (agent_learner.train_step_numpy >= eval_interval + last_eval_step
         or i == num_iterations - 1)):
      logging.info('Evaluating.')
      eval_actor.run_and_log()
      last_eval_step = agent_learner.train_step_numpy

  rb_observer.close()
  reverb_server.stop()
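# Hypothetical entry point for the Reverb-based train_eval above (a sketch,
# not from the original source); the directory and shortened values are
# illustrative assumptions. reverb_port=None lets Reverb pick an unused port,
# and eval_interval=0 disables evaluation for this smoke test.
train_eval(
    root_dir='/tmp/ppo_reverb_halfcheetah',
    env_name='HalfCheetah-v2',
    num_iterations=10,
    collect_sequence_length=512,
    minibatch_size=64,
    num_epochs=10,
    reverb_port=None,
    eval_interval=0)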
def __init__(
    self,
    time_step_spec,
    action_spec,
    # Specific to multi-agent case
    n_agents,
    learning_rate=1e-4,
    # Specific to multi-grid agents
    actor_fc_layers=(32, 32),
    value_fc_layers=(32, 32),
    lstm_size=(128,),
    conv_filters=8,
    conv_kernel=3,
    direction_fc=5,
    # Modifying agents
    inactive_agent_ids=tuple(),
    non_learning_agents=tuple(),
    # PPO Clip agent params
    importance_ratio_clipping=0.0,
    lambda_value=0.95,
    discount_factor=0.99,
    entropy_regularization=0.05,
    policy_l2_reg=0.0,
    value_function_l2_reg=0.0,
    shared_vars_l2_reg=0.0,
    value_pred_loss_coef=0.5,
    num_epochs=25,
    use_gae=False,
    use_td_lambda_return=False,
    normalize_rewards=True,
    reward_norm_clipping=10.0,
    normalize_observations=True,
    log_prob_clipping=0.0,
    gradient_clipping=None,
    check_numerics=False,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=None,
    use_attention_networks=False,
    name='MultiagentPPO'):
  """Creates a centralized controller agent that creates several PPO Agents.

  Note that all architecture params apply to each of the sub-agents created.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    n_agents: The number of agents in this environment.
    learning_rate: Initial learning rate for all agents.
    actor_fc_layers: Number and size of fully-connected layers in the actor.
    value_fc_layers: Number and size of fully-connected layers in the critic.
    lstm_size: Number of cells in the LSTM in the actor and critic.
    conv_filters: Number of convolutional filters.
    conv_kernel: Size of the convolutional kernel.
    direction_fc: Number of fully-connected neurons connecting the one-hot
      direction to the main LSTM.
    inactive_agent_ids: Integer IDs of agents who will not train or act in the
      environment, but will simply return a no-op action.
    non_learning_agents: Integer IDs of agents who will not train, but still
      act in the environment.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
      For more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation.
    entropy_regularization: Coefficient for entropy regularization loss term.
    policy_l2_reg: Coefficient for l2 regularization of unshared policy
      weights.
    value_function_l2_reg: Coefficient for l2 regularization of unshared value
      function weights.
    shared_vars_l2_reg: Coefficient for l2 regularization of weights shared
      between the policy and value functions.
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss.
    num_epochs: Number of epochs for computing policy updates.
    use_gae: If True (default False), uses generalized advantage estimation
      for computing per-timestep advantage. Else, just subtracts value
      predictions from empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return for
      training value function. (td_lambda_return = gae_advantage +
      value_predictions)
    normalize_rewards: If true, keeps moving variance of rewards and
      normalizes incoming rewards.
    reward_norm_clipping: Value above and below which to clip the normalized
      reward.
    normalize_observations: If true, keeps moving mean and variance of
      observations and normalizes incoming observations.
    log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
      values. Default: no clipping.
    gradient_clipping: Norm length to clip gradients. Default: no clipping.
    check_numerics: If true, adds tf.debugging.check_numerics to help find
      NaN / Inf values. For debugging only.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
    train_step_counter: An optional counter to increment every time the train
      op is run. Defaults to the global_step.
    use_attention_networks: Option to use attention network architecture in
      the agent. This architecture requires observations from the previous
      time step.
    name: The name of this agent. All variables in this module will fall under
      that name. Defaults to the class name.

  Raises:
    ValueError: If the actor_net is not a DistributionNetwork.
  """
  self.n_agents = n_agents
  self.inactive_agent_ids = inactive_agent_ids
  self.non_learning_agents = non_learning_agents

  # Get single-agent specs.
  (single_obs_spec, single_time_step_spec,
   single_action_spec) = self.get_single_agent_specs(time_step_spec,
                                                     action_spec)

  # Make baby agents.
  self.agents = [None] * self.n_agents
  self.optimizers = [None] * self.n_agents
  for agent_id in range(self.n_agents):
    with tf.name_scope('agent_' + str(agent_id)):
      self.optimizers[agent_id] = tf.compat.v1.train.AdamOptimizer(
          learning_rate=learning_rate)

      if use_attention_networks:
        network_build_fn = multigrid_networks.construct_attention_networks
      else:
        network_build_fn = multigrid_networks.construct_multigrid_networks

      # Build actor and critic networks.
      actor_net, value_net = network_build_fn(
          single_obs_spec,
          single_action_spec,
          actor_fc_layers=actor_fc_layers,
          value_fc_layers=value_fc_layers,
          lstm_size=lstm_size,
          conv_filters=conv_filters,
          conv_kernel=conv_kernel,
          scalar_fc=direction_fc)

      logging.info('Creating agent %d...', agent_id)
      self.agents[agent_id] = ppo_clip_agent.PPOClipAgent(
          single_time_step_spec,
          single_action_spec,
          self.optimizers[agent_id],
          actor_net=actor_net,
          value_net=value_net,
          entropy_regularization=entropy_regularization,
          importance_ratio_clipping=0.2,
          normalize_observations=False,
          normalize_rewards=False,
          use_gae=True,
          num_epochs=num_epochs,
          debug_summaries=debug_summaries,
          summarize_grads_and_vars=summarize_grads_and_vars,
          train_step_counter=train_step_counter,
          compute_value_and_advantage_in_train=True)
      self.agents[agent_id].initialize()

  with tf.name_scope('meta_agent'):
    # Initialize policies.
    self._policies = [self.agents[a].policy for a in range(self.n_agents)]
    policy = multiagent_ppo_policy.MultiagentPPOPolicy(
        self._policies,
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        clip=False,
        collect=False,
        inactive_agent_ids=inactive_agent_ids)

    self._collect_policies = [
        self.agents[a].collect_policy for a in range(self.n_agents)
    ]
    collect_policy = multiagent_ppo_policy.MultiagentPPOPolicy(
        self._collect_policies,
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        clip=False,
        collect=True,
        inactive_agent_ids=inactive_agent_ids)

  super(MultiagentPPO, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)
  print('Finished constructing multi-agent PPO')
def create_ppo_agent_and_dataset_fn(action_spec, time_step_spec, train_step,
                                    batch_size):
  """Builds and returns a dummy PPO Agent, dataset and dataset function."""
  del action_spec  # Unused.
  del time_step_spec  # Unused.
  del batch_size  # Unused.

  # No arbitrary spec supported.
  obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  ts_spec = ts.time_step_spec(obs_spec)
  act_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      obs_spec,
      act_spec,
      fc_layer_params=(100,),
      activation_fn=tf.keras.activations.tanh)
  value_net = value_network.ValueNetwork(
      obs_spec,
      fc_layer_params=(100,),
      activation_fn=tf.keras.activations.tanh)

  agent = ppo_clip_agent.PPOClipAgent(
      ts_spec,
      act_spec,
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
      actor_net=actor_net,
      value_net=value_net,
      entropy_regularization=0.0,
      importance_ratio_clipping=0.2,
      normalize_observations=False,
      normalize_rewards=False,
      use_gae=False,
      use_td_lambda_return=False,
      num_epochs=1,
      debug_summaries=False,
      summarize_grads_and_vars=False,
      train_step_counter=train_step,
      compute_value_and_advantage_in_train=False)

  def _create_experience(_):
    observations = tf.constant(
        [
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
        dtype=tf.float32)
    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                              dtype=tf.float32)

    policy_info = {
        'dist_params': action_distribution_parameters,
    }
    policy_info['value_prediction'] = value_preds
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)
    return agent._preprocess(experience)  # pylint: disable=protected-access

  dataset = tf.data.Dataset.from_tensor_slices(
      [[i] for i in range(100)]).map(_create_experience)
  dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))
  dataset_fn = lambda: dataset

  return agent, dataset, dataset_fn, agent.training_data_spec
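# Hypothetical exercise of create_ppo_agent_and_dataset_fn (a sketch, not from
# the original source). The helper ignores its spec and batch_size arguments,
# so placeholders are passed; train_utils is assumed to be imported as in the
# surrounding code.
train_step = train_utils.create_train_step()
agent, dataset, dataset_fn, data_spec = create_ppo_agent_and_dataset_fn(
    action_spec=None, time_step_spec=None, train_step=train_step,
    batch_size=None)

# Each dataset element is a (preprocessed experience, counter) pair.
experience, sample_count = next(iter(dataset_fn()))
print(tf.nest.map_structure(lambda t: t.shape, experience))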
def __init__(
    self,
    env,
    global_step,
    root_dir,
    step_metrics,
    name='Agent',
    is_environment=False,
    use_tf_functions=True,
    max_steps=250,
    replace_reward=True,
    non_negative_regret=False,
    id_num=0,
    block_budget_weight=0.,
    # Architecture hparams
    use_rnn=True,
    learning_rate=1e-4,
    actor_fc_layers=(32, 32),
    value_fc_layers=(32, 32),
    lstm_size=(128,),
    conv_filters=8,
    conv_kernel=3,
    scalar_fc=5,
    entropy_regularization=0.,
    xy_dim=None,
    # Training & logging settings
    num_epochs=25,
    num_eval_episodes=5,
    num_parallel_envs=5,
    replay_buffer_capacity=1001,
    debug_summaries=True,
    summarize_grads_and_vars=True,
):
  """Initializes agent, replay buffer, metrics, and checkpointing.

  Args:
    env: An AdversarialTfPyEnvironment with specs and adversary specs.
    global_step: A tf variable tracking the global step.
    root_dir: Path to directory where metrics and checkpoints should be saved.
    step_metrics: A list of tf-agents metrics which represent the x-axis
      during training, such as the number of episodes or the number of
      environment steps.
    name: The name of this agent, e.g. 'Adversary'.
    is_environment: If True, will use the adversary specs from the environment
      and construct a network with additional inputs for the adversary.
    use_tf_functions: If True, will use tf.function to wrap the agent's train
      function.
    max_steps: The maximum number of steps the agent is allowed to interact
      with the environment in every data collection loop.
    replace_reward: If False, will not modify the reward stored in the agent's
      trajectories. This means the agent will be trained with the default
      environment reward rather than regret.
    non_negative_regret: If True, will ensure that the regret reward cannot be
      below 0.
    id_num: The ID number of this agent within the population of agents of the
      same type. I.e. this is adversary agent 3.
    block_budget_weight: Weight to place on the adversary's block budget
      reward. Default is 0 for no block budget.
    use_rnn: If True, will use an RNN within the network architecture.
    learning_rate: The learning rate used to initialize the optimizer for this
      agent.
    actor_fc_layers: The number and size of fully connected layers in the
      policy.
    value_fc_layers: The number and size of fully connected layers in the
      critic / value network.
    lstm_size: The number of LSTM cells in the RNN.
    conv_filters: The number of convolution filters.
    conv_kernel: The width of the convolution kernel.
    scalar_fc: The width of the fully-connected layer which inputs a scalar.
    entropy_regularization: Entropy regularization coefficient.
    xy_dim: Certain adversaries take in the current (x,y) position as a
      one-hot vector. In this case, the maximum value for x or y is required
      to create the one-hot representation.
    num_epochs: Number of epochs for computing PPO policy updates.
    num_eval_episodes: Number of evaluation episodes per eval step, used as
      the batch size to initialize eval metrics.
    num_parallel_envs: Number of parallel environments used in training, used
      as the batch size for training metrics and rewards.
    replay_buffer_capacity: Capacity of this agent's replay buffer.
    debug_summaries: Log additional summaries from the PPO agent.
    summarize_grads_and_vars: If True, logs gradient norms and variances in
      the PPO agent.
  """
  self.name = name
  self.id = id_num
  self.max_steps = max_steps
  self.is_environment = is_environment
  self.replace_reward = replace_reward
  self.non_negative_regret = non_negative_regret
  self.block_budget_weight = block_budget_weight

  with tf.name_scope(self.name):
    self.optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=learning_rate)

    logging.info('\tCalculating specs and building networks...')
    if is_environment:
      self.time_step_spec = env.adversary_time_step_spec
      self.action_spec = env.adversary_action_spec
      self.observation_spec = env.adversary_observation_spec
      (self.actor_net,
       self.value_net) = multigrid_networks.construct_multigrid_networks(
           self.observation_spec,
           self.action_spec,
           use_rnns=use_rnn,
           actor_fc_layers=actor_fc_layers,
           value_fc_layers=value_fc_layers,
           lstm_size=lstm_size,
           conv_filters=conv_filters,
           conv_kernel=conv_kernel,
           scalar_fc=scalar_fc,
           scalar_name='time_step',
           scalar_dim=self.observation_spec['time_step'].maximum + 1,
           random_z=True,
           xy_dim=xy_dim)
    else:
      self.time_step_spec = env.time_step_spec()
      self.action_spec = env.action_spec()
      self.observation_spec = env.observation_spec()
      (self.actor_net,
       self.value_net) = multigrid_networks.construct_multigrid_networks(
           self.observation_spec,
           self.action_spec,
           use_rnns=use_rnn,
           actor_fc_layers=actor_fc_layers,
           value_fc_layers=value_fc_layers,
           lstm_size=lstm_size,
           conv_filters=conv_filters,
           conv_kernel=conv_kernel,
           scalar_fc=scalar_fc)

    self.tf_agent = ppo_clip_agent.PPOClipAgent(
        self.time_step_spec,
        self.action_spec,
        self.optimizer,
        actor_net=self.actor_net,
        value_net=self.value_net,
        entropy_regularization=entropy_regularization,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)
    self.tf_agent.initialize()
    self.eval_policy = self.tf_agent.policy
    self.collect_policy = self.tf_agent.collect_policy

    logging.info('\tAllocating replay buffer ...')
    self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        self.tf_agent.collect_data_spec,
        batch_size=num_parallel_envs,
        max_length=replay_buffer_capacity)
    logging.info('\t\tRB capacity: %i', self.replay_buffer.capacity)
    self.final_reward = tf.zeros(shape=(num_parallel_envs), dtype=tf.float32)
    self.enemy_max = tf.zeros(shape=(num_parallel_envs), dtype=tf.float32)

    # Creates train metrics.
    self.step_metrics = step_metrics
    self.train_metrics = step_metrics + [
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_parallel_envs,
            name=name + '_AverageEpisodeLength')
    ]
    self.eval_metrics = [
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_eval_episodes,
            name=name + '_AverageEpisodeLength')
    ]

    if is_environment:
      self.env_train_metric = adversarial_eval.AdversarialEnvironmentScalar(
          batch_size=num_parallel_envs, name=name + '_AdversaryReward')
      self.env_eval_metric = adversarial_eval.AdversarialEnvironmentScalar(
          batch_size=num_eval_episodes, name=name + '_AdversaryReward')
    else:
      self.train_metrics.append(
          tf_metrics.AverageReturnMetric(
              batch_size=num_parallel_envs, name=name + '_AverageReturn'))
      self.eval_metrics.append(
          tf_metrics.AverageReturnMetric(
              batch_size=num_eval_episodes, name=name + '_AverageReturn'))

    self.metrics_group = metric_utils.MetricsGroup(
        self.train_metrics, name + '_train_metrics')
    self.observers = self.train_metrics + [self.replay_buffer.add_batch]

    self.train_dir = os.path.join(root_dir, 'train', name, str(id_num))
    self.eval_dir = os.path.join(root_dir, 'eval', name, str(id_num))

    self.train_checkpointer = common.Checkpointer(
        ckpt_dir=self.train_dir,
        agent=self.tf_agent,
        global_step=global_step,
        metrics=self.metrics_group,
    )
    self.policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(self.train_dir, 'policy'),
        policy=self.eval_policy,
        global_step=global_step)
    self.saved_model = policy_saver.PolicySaver(
        self.eval_policy, train_step=global_step)
    self.saved_model_dir = os.path.join(root_dir, 'policy_saved_model', name,
                                        str(id_num))

    self.train_checkpointer.initialize_or_restore()

    if use_tf_functions:
      self.tf_agent.train = common.function(
          self.tf_agent.train, autograph=False)

    self.total_loss = None
    self.extra_loss = None
    self.loss_divergence_counter = 0
def main():
  logging.set_verbosity(logging.INFO)
  tf.compat.v1.enable_v2_behavior()

  parser = argparse.ArgumentParser()

  ## Essential parameters
  parser.add_argument("--output_dir", default=None, type=str, required=True,
                      help="The output directory where the model stats and checkpoints will be written.")
  parser.add_argument("--env", default=None, type=str, required=True,
                      help="The environment to train the agent on")
  parser.add_argument("--max_horizon", default=4, type=int)
  parser.add_argument("--atari", default=False, type=bool,
                      help="Gets some data types correctly")

  ## Agent parameters
  parser.add_argument("--reward_scale_factor", default=1.0, type=float)
  parser.add_argument("--debug_summaries", default=False, type=bool)
  parser.add_argument("--summarize_grads_and_vars", default=False, type=bool)

  ## Transformer parameters
  parser.add_argument("--d_model", default=64, type=int)
  parser.add_argument("--num_layers", default=3, type=int)
  parser.add_argument("--dff", default=256, type=int)

  ## Training parameters
  parser.add_argument("--num_iterations", type=int, default=100000,
                      help="steps in the env")
  parser.add_argument("--num_parallel", type=int, default=30,
                      help="how many envs should run in parallel")
  parser.add_argument("--collect_episodes_per_iteration", default=1, type=int)
  parser.add_argument("--num_epochs", type=int, default=25,
                      help="Number of epochs for computing policy updates.")

  ## Other parameters
  parser.add_argument("--num_eval_episodes", default=10, type=int)
  parser.add_argument("--eval_interval", default=1000, type=int)
  parser.add_argument("--log_interval", default=10, type=int)
  parser.add_argument("--summary_interval", default=1000, type=int)
  parser.add_argument("--run_graph_mode", default=True, type=bool)
  parser.add_argument("--checkpoint_interval", default=1000, type=int)
  parser.add_argument("--summary_flush", default=10, type=int)  # what does this exactly do?

  # HP opt params
  # parser.add_argument("--doubleQ", default=True, type=bool, help="Whether to use a DoubleQ agent")
  parser.add_argument("--custom_last_layer", default=True, type=bool)
  parser.add_argument("--custom_layer_init", default=1.0, type=float)
  parser.add_argument("--initial_collect_steps", default=5000, type=int)
  # parser.add_argument("--loss_function", default="element_wise_huber_loss", type=str)
  parser.add_argument("--num_heads", default=4, type=int)
  parser.add_argument("--normalize_env", default=False, type=bool)
  parser.add_argument("--custom_lr_schedule", default="No", type=str,
                      help="whether to use a custom LR schedule")
  # parser.add_argument("--epsilon_greedy", default=0.3, type=float)
  # parser.add_argument("--target_update_period", default=1000, type=int)
  # Dropout rate (might not be used, depending on the Q network). Setting this
  # to 0.0 somehow breaks the code; not relevant though, just select a network
  # without dropout.
  parser.add_argument("--rate", default=0.1, type=float)
  parser.add_argument("--gradient_clipping", default=True, type=bool)
  parser.add_argument("--replay_buffer_max_length", default=1001, type=int)
  # parser.add_argument("--batch_size", default=32, type=int)
  parser.add_argument("--learning_rate", default=1e-4, type=float)
  parser.add_argument("--encoder_type", default=3, type=int,
                      help="Which type of encoder is used for the model")
  parser.add_argument("--layer_type", default=3, type=int,
                      help="Which type of layer is used for the encoder")
  # parser.add_argument("--target_update_tau", default=1, type=float)
  # parser.add_argument("--gamma", default=0.99, type=float)
  args = parser.parse_args()

  global_step = tf.compat.v1.train.get_or_create_global_step()

  baseEnv = gym.make(args.env)
  eval_tf_env = tf_py_environment.TFPyEnvironment(
      PyhistoryWrapper(suite_gym.load(args.env), args.max_horizon, args.atari))
  tf_env = tf_py_environment.TFPyEnvironment(
      parallel_py_environment.ParallelPyEnvironment(
          [lambda: PyhistoryWrapper(
              suite_gym.load(args.env), args.max_horizon, args.atari)]
          * args.num_parallel))

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=(200, 100),
      activation_fn=tf.keras.activations.tanh)
  value_net = value_network.ValueNetwork(
      tf_env.observation_spec(),
      fc_layer_params=(200, 100),
      activation_fn=tf.keras.activations.tanh)

  actor_net = QTransformer(
      tf_env.observation_spec(),
      baseEnv.action_space.n,
      num_layers=args.num_layers,
      d_model=args.d_model,
      num_heads=args.num_heads,
      dff=args.dff,
      rate=args.rate,
      encoderType=args.encoder_type,
      enc_layer_type=args.layer_type,
      max_horizon=args.max_horizon,
      custom_layer=args.custom_layer_init,
      custom_last_layer=args.custom_last_layer)

  if args.custom_lr_schedule == "Transformer":
    # Builds a LR schedule following the original transformer usage.
    learning_rate = CustomSchedule(args.d_model, int(args.num_iterations / 10))
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
  elif args.custom_lr_schedule == "Transformer_low":
    # Same schedule with a lower overall learning rate.
    learning_rate = CustomSchedule(
        int(args.d_model / 2), int(args.num_iterations / 10))
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
  elif args.custom_lr_schedule == "Linear":
    lrs = LinearCustomSchedule(args.learning_rate, args.num_iterations)
    optimizer = tf.keras.optimizers.Adam(
        lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
  else:
    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=args.learning_rate)

  tf_agent = ppo_clip_agent.PPOClipAgent(
      tf_env.time_step_spec(),
      tf_env.action_spec(),
      optimizer,
      actor_net=actor_net,
      value_net=value_net,
      entropy_regularization=0.0,
      importance_ratio_clipping=0.2,
      normalize_observations=False,
      normalize_rewards=False,
      use_gae=True,
      num_epochs=args.num_epochs,
      debug_summaries=args.debug_summaries,
      summarize_grads_and_vars=args.summarize_grads_and_vars,
      train_step_counter=global_step)
  tf_agent.initialize()

  train_eval(
      args.output_dir,
      0,  # ??
      # TODO(b/127576522): rename to policy_fc_layers.
      tf_agent,
      eval_tf_env,
      tf_env,
      # Params for collect
      args.num_iterations,
      args.collect_episodes_per_iteration,
      args.num_parallel,
      args.replay_buffer_max_length,  # Per-environment
      # Params for train
      args.num_epochs,
      args.learning_rate,
      # Params for eval
      args.num_eval_episodes,
      args.eval_interval,
      # Params for summaries and logging
      args.checkpoint_interval,
      args.checkpoint_interval,
      args.checkpoint_interval,
      args.log_interval,
      args.summary_interval,
      args.summary_flush,
      args.debug_summaries,
      args.summarize_grads_and_vars,
      args.run_graph_mode,
      None)

  pickle.dump(args, open(args.output_dir + "/training_args.p", "wb"))
  print("Successfully finished training and evaluation.")
actor_net = (
    actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.keras.activations.tanh))
value_net = (
    value_network.ValueNetwork(
        tf_env.observation_spec(),
        fc_layer_params=value_fc_layers,
        activation_fn=tf.keras.activations.tanh))

tf_agent = ppo_clip_agent.PPOClipAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    entropy_regularization=0.0,
    importance_ratio_clipping=importance_ratio_clipping,
    normalize_observations=False,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=num_epochs,
    debug_summaries=debug_summaries,
    summarize_grads_and_vars=summarize_grads_and_vars,
    train_step_counter=global_step)
tf_agent.initialize()
# -

# ### Replay buffer and initial data collection

eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy