def testAdaptiveKlLoss(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._time_step_spec.observation,
      self._action_spec,
      fc_layer_params=None)
  value_net = value_network.ValueNetwork(
      self._time_step_spec.observation, fc_layer_params=None)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=actor_net,
      value_net=value_net,
      initial_adaptive_kl_beta=1.0,
      adaptive_kl_target=10.0,
      adaptive_kl_tolerance=0.5,
  )

  # Initialize variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Loss should not change if data kl is target kl.
  loss_1 = agent.adaptive_kl_loss([10.0])
  loss_2 = agent.adaptive_kl_loss([10.0])
  self.assertEqual(self.evaluate(loss_1), self.evaluate(loss_2))

  # If data kl is low, kl penalty should decrease between calls.
  loss_1 = self.evaluate(agent.adaptive_kl_loss([1.0]))
  adaptive_kl_beta_update_fn = common.function(agent.update_adaptive_kl_beta)
  self.evaluate(adaptive_kl_beta_update_fn([1.0]))
  loss_2 = self.evaluate(agent.adaptive_kl_loss([1.0]))
  self.assertGreater(loss_1, loss_2)

  # If data kl is high, kl penalty should increase between calls.
  loss_1 = self.evaluate(agent.adaptive_kl_loss([100.0]))
  self.evaluate(adaptive_kl_beta_update_fn([100.0]))
  loss_2 = self.evaluate(agent.adaptive_kl_loss([100.0]))
  self.assertLess(loss_1, loss_2)
def testEntropyRegularizationLoss(self, not_zero):
  ent_reg = 0.1 * not_zero
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._obs_spec, self._action_spec),
      value_net=DummyValueNet(self._obs_spec),
      normalize_observations=False,
      entropy_regularization=ent_reg,
  )

  # Call other loss functions to make sure trainable variables are
  # constructed.
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.float32)
  returns = tf.constant([1.9, 1.0], dtype=tf.float32)
  sample_action_log_probs = tf.constant([[0.9], [0.3]], dtype=tf.float32)
  advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
  weights = tf.ones_like(advantages)
  current_policy_distribution, unused_network_state = DummyActorNet(
      self._obs_spec, self._action_spec)(
          time_steps.observation, time_steps.step_type, ())
  agent.policy_gradient_loss(time_steps, actions, sample_action_log_probs,
                             advantages, current_policy_distribution, weights)
  agent.value_estimation_loss(time_steps, returns, weights)

  # Now request entropy regularization loss.
  # Action stdevs should be ~1.0, and mean entropy ~3.70111.
  expected_loss = -3.70111 * ent_reg
  loss = agent.entropy_regularization_loss(
      time_steps, current_policy_distribution, weights)

  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testValueEstimationLoss(self):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._obs_spec, self._action_spec),
      value_net=DummyValueNet(self._obs_spec),
      value_pred_loss_coef=1.0,
      normalize_observations=False,
  )
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  returns = tf.constant([1.9, 1.0], dtype=tf.float32)
  weights = tf.ones_like(returns)

  expected_loss = 123.205
  loss = agent.value_estimation_loss(time_steps, returns, weights)

  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def get_agent(self, env, params):
    """Returns a TensorFlow PPO-Agent.

    Arguments:
        env {TFAPyEnvironment} -- TensorFlow-Agents PyEnvironment
        params {ParameterServer} -- ParameterServer from BARK

    Returns:
        agent -- tf-agent
    """
    # actor network
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=tuple(
            self._params["ML"]["Agent"]["actor_fc_layer_params"]))

    # critic network
    value_net = value_network.ValueNetwork(
        env.observation_spec(),
        fc_layer_params=tuple(
            self._params["ML"]["Agent"]["critic_fc_layer_params"]))

    # agent
    tf_agent = ppo_agent.PPOAgent(
        env.time_step_spec(),
        env.action_spec(),
        actor_net=actor_net,
        value_net=value_net,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=self._params["ML"]["Agent"]["learning_rate"]),
        train_step_counter=self._ckpt.step,
        num_epochs=self._params["ML"]["Agent"]["num_epochs"],
        name=self._params["ML"]["Agent"]["agent_name"],
        debug_summaries=self._params["ML"]["Agent"]["debug_summaries"])
    tf_agent.initialize()
    return tf_agent
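# A minimal usage sketch for get_agent (hedged: `wrapped_runtime` and the
# ParameterServer contents below are assumptions for illustration, not part of
# this file):
#
#   params = ParameterServer()
#   tf_env = tf_py_environment.TFPyEnvironment(wrapped_runtime)
#   agent = trainer.get_agent(tf_env, params)
#   collect_policy = agent.collect_policy  # stochastic policy for collection
#   eval_policy = agent.policy             # policy used for evaluation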
def testAdaptiveKlLoss(self):
  if tf.executing_eagerly():
    self.skipTest('b/123777119')  # Secondary bug: ('b/123770194')

  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._time_step_spec.observation,
      self._action_spec,
      fc_layer_params=None)
  value_net = value_network.ValueNetwork(
      self._time_step_spec.observation, fc_layer_params=None)
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=actor_net,
      value_net=value_net,
      initial_adaptive_kl_beta=1.0,
      adaptive_kl_target=10.0,
      adaptive_kl_tolerance=0.5,
  )
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Loss should not change if data kl is target kl.
  loss_1 = self.evaluate(agent.adaptive_kl_loss(10.0))
  loss_2 = self.evaluate(agent.adaptive_kl_loss(10.0))
  self.assertEqual(loss_1, loss_2)

  # If data kl is low, kl penalty should decrease between calls.
  loss_1 = self.evaluate(agent.adaptive_kl_loss(1.0))
  self.evaluate(agent.update_adaptive_kl_beta(1.0))
  loss_2 = self.evaluate(agent.adaptive_kl_loss(1.0))
  self.assertGreater(loss_1, loss_2)

  # If data kl is high, kl penalty should increase between calls.
  loss_1 = self.evaluate(agent.adaptive_kl_loss(100.0))
  self.evaluate(agent.update_adaptive_kl_beta(100.0))
  loss_2 = self.evaluate(agent.adaptive_kl_loss(100.0))
  self.assertLess(loss_1, loss_2)
def testComputeAdvantagesNoGae(self):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._obs_spec, self._action_spec),
      value_net=DummyValueNet(self._obs_spec),
      normalize_observations=False,
      use_gae=False)
  rewards = tf.constant([[1.0] * 9, [1.0] * 9])
  discounts = tf.constant([[1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0],
                           [1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0]])
  returns = tf.constant([[5.0, 4.0, 3.0, 2.0, 1.0, 3.439, 2.71, 1.9, 1.0],
                         [3.0, 4.0, 7.0, 2.0, -1.0, 5.439, 2.71, -2.9, 1.0]])
  value_preds = tf.constant([
      [3.0] * 10,
      [3.0] * 10,
  ])  # One extra for final time_step.

  expected_advantages = returns - value_preds[:, :-1]
  advantages = agent.compute_advantages(rewards, returns, discounts,
                                        value_preds)
  self.assertAllClose(expected_advantages, advantages)
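# With use_gae=False the advantage reduces to (returns - value predictions),
# which the expectation above encodes directly. A quick self-contained check
# of that arithmetic (a sketch, independent of tf_agents):
import numpy as np

returns_np = np.array([[5.0, 4.0, 3.0, 2.0, 1.0, 3.439, 2.71, 1.9, 1.0]])
value_preds_np = np.array([[3.0] * 10])  # one extra entry for the final time step
advantages_np = returns_np - value_preds_np[:, :-1]
assert advantages_np[0, 0] == 2.0 and advantages_np[0, 4] == -2.0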
def testUpdateAdaptiveKlBeta(self, strategy_fn):
  with strategy_fn().scope():
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._time_step_spec.observation,
        self._action_spec,
        fc_layer_params=None)
    value_net = value_network.ValueNetwork(
        self._time_step_spec.observation, fc_layer_params=None)
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=actor_net,
        value_net=value_net,
        initial_adaptive_kl_beta=1.0,
        adaptive_kl_target=10.0,
        adaptive_kl_tolerance=0.5,
    )
    agent.initialize()

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # When KL is target kl, beta should not change.
  update_adaptive_kl_beta_fn = common.function(agent.update_adaptive_kl_beta)
  beta_0 = update_adaptive_kl_beta_fn([10.0])
  expected_beta_0 = 1.0
  self.assertEqual(expected_beta_0, self.evaluate(beta_0))

  # When KL is large, beta should increase.
  beta_1 = update_adaptive_kl_beta_fn([100.0])
  expected_beta_1 = 1.5
  self.assertEqual(expected_beta_1, self.evaluate(beta_1))

  # When KL is small, beta should decrease.
  beta_2 = update_adaptive_kl_beta_fn([1.0])
  expected_beta_2 = 1.0
  self.assertEqual(expected_beta_2, self.evaluate(beta_2))
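# The expected betas above (1.0 -> 1.5 -> 1.0) are consistent with the usual
# adaptive-KL heuristic. A standalone sketch of that rule (illustrative only,
# not the tf_agents implementation; the 1.5 factor is inferred from the test):
def _adaptive_kl_beta_update(beta, kl, target=10.0, tolerance=0.5):
  if kl > target * (1.0 + tolerance):  # KL too large: strengthen the penalty.
    return beta * 1.5
  if kl < target * (1.0 - tolerance):  # KL too small: relax the penalty.
    return beta / 1.5
  return beta  # Within the tolerance band: leave beta unchanged.

assert _adaptive_kl_beta_update(1.0, 10.0) == 1.0
assert _adaptive_kl_beta_update(1.0, 100.0) == 1.5
assert _adaptive_kl_beta_update(1.5, 1.0) == 1.0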
def testL2RegularizationLoss(self, not_zero):
  l2_reg = 1e-4 * not_zero
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._action_spec),
      value_net=DummyValueNet(),
      normalize_observations=False,
      policy_l2_reg=l2_reg,
      value_function_l2_reg=l2_reg,
  )

  # Call other loss functions to make sure trainable variables are
  # constructed.
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.float32)
  returns = tf.constant([1.9, 1.0], dtype=tf.float32)
  sample_action_log_probs = tf.constant([[0.9], [0.3]], dtype=tf.float32)
  advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
  current_policy_distribution, unused_network_state = DummyActorNet(
      self._action_spec)(time_steps.observation, time_steps.step_type, ())
  valid_mask = tf.ones_like(advantages)
  agent.policy_gradient_loss(time_steps, actions, sample_action_log_probs,
                             advantages, current_policy_distribution,
                             valid_mask)
  agent.value_estimation_loss(time_steps, returns, valid_mask)

  # Now request L2 regularization loss.
  # Value function weights are [2, 1], actor net weights are [2, 1, 1, 1].
  expected_loss = l2_reg * ((2**2 + 1) + (2**2 + 1 + 1 + 1))
  loss = agent.l2_regularization_loss()

  self.evaluate(tf.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def test_multiple_tf_agents(self):
    env_name = "CartPole-v0"

    # DQN
    env = gym.make(env_name)
    train_env = environment_converter.gym_to_tf(env)
    fc_layer_params = (100,)
    q_net = q_network.QNetwork(
        input_tensor_spec=train_env.observation_spec(),
        action_spec=train_env.action_spec(),
        fc_layer_params=fc_layer_params,
    )
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    dqn_tf_agent = dqn_agent.DqnAgent(
        time_step_spec=train_env.time_step_spec(),
        action_spec=train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
    )
    dqn_tf_agent.initialize()

    # PPO
    env = gym.make(env_name)
    actor_fc_layers = (200, 100)
    value_fc_layers = (200, 100)
    learning_rate = 1e-3
    train_env = environment_converter.gym_to_tf(env)
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=actor_fc_layers,
    )
    value_net = value_network.ValueNetwork(train_env.observation_spec(),
                                           fc_layer_params=value_fc_layers)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    ppo_tf_agent = ppo_agent.PPOAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
    )
    ppo_tf_agent.initialize()

    # REINFORCE
    env = gym.make(env_name)
    train_env = environment_converter.gym_to_tf(env)
    learning_rate = 1e-3
    fc_layer_params = (100,)
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params,
    )
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    train_step_counter = tf.compat.v2.Variable(0)
    reinforce_tf_agent = reinforce_agent.ReinforceAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        optimizer=optimizer,
        normalize_returns=True,
        train_step_counter=train_step_counter,
    )
    reinforce_tf_agent.initialize()

    agents = [dqn_tf_agent, ppo_tf_agent, reinforce_tf_agent]
    agent_names = ["dqn_agent", "ppo_agent", "reinforce_agent"]
    train_multiple(agents, env, 1470, 195, agent_names, 200)

    trained_env = get_saved_environments()[0]
    trained_models = get_trained_model_names(trained_env)
    model_saved = set(agent_names) == set(trained_models)
    shutil.rmtree(save_path)
    self.assertTrue(model_saved)
def __init__(
    self,
    landscape: flexs.Landscape,
    rounds: int,
    sequences_batch_size: int,
    model_queries_per_batch: int,
    starting_sequence: str,
    alphabet: str,
    log_file: Optional[str] = None,
    model: Optional[flexs.Model] = None,
    num_experiment_rounds: int = 10,
    num_model_rounds: int = 1,
):
    """
    Args:
        num_experiment_rounds: Number of experiment-based rounds to run.
            Defaults to 10, the same as the number of sequence proposal
            rounds run.
        num_model_rounds: Number of model-based rounds to run.
    """
    tf.config.run_functions_eagerly(False)

    name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

    if model is None:
        model = DynaPPOEnsemble(
            len(starting_sequence),
            alphabet,
        )
        model.train(
            s_utils.generate_random_sequences(len(starting_sequence), 10,
                                              alphabet),
            [0] * 10,
        )

    super().__init__(
        model,
        name,
        rounds,
        sequences_batch_size,
        model_queries_per_batch,
        starting_sequence,
        log_file,
    )

    self.alphabet = alphabet
    self.num_experiment_rounds = num_experiment_rounds
    self.num_model_rounds = num_model_rounds

    env = DynaPPOEnvMut(
        alphabet=self.alphabet,
        starting_seq=starting_sequence,
        model=model,
        landscape=landscape,
        max_num_steps=model_queries_per_batch,
    )
    validate_py_environment(env, episodes=1)
    self.tf_env = tf_py_environment.TFPyEnvironment(env)

    encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self.tf_env.observation_spec(),
        self.tf_env.action_spec(),
        preprocessing_combiner=encoder_layer,
        fc_layer_params=[128],
    )
    value_net = value_network.ValueNetwork(
        self.tf_env.observation_spec(),
        preprocessing_combiner=encoder_layer,
        fc_layer_params=[128],
    )

    self.agent = ppo_agent.PPOAgent(
        self.tf_env.time_step_spec(),
        self.tf_env.action_spec(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=10,
        summarize_grads_and_vars=False,
    )
    self.agent.initialize()
def train_eval_doom_simple(
        # Params for collect
        num_environment_steps=100000,
        collect_episodes_per_iteration=32,
        num_parallel_environments=1,
        replay_buffer_capacity=301,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=4e-4,
        # Params for eval
        eval_interval=10,
        num_video_episodes=10,
        # Params for summaries and logging
        log_interval=10):
    """A simple train and eval for PPO."""
    # if not os.path.exists(videos_dir):
    #     os.makedirs(videos_dir)
    global terminate

    eval_py_env = CarlaEnv()
    tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    actor_net, value_net = create_networks(tf_env.observation_spec(),
                                           tf_env.action_spec())

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                                 epsilon=1e-5)

    tf_agent = ppo_agent.PPOAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net,
        value_net,
        num_epochs=num_epochs,
        train_step_counter=global_step,
        discount_factor=0.99,
        gradient_clipping=0.5,
        entropy_regularization=1e-2,
        importance_ratio_clipping=0.2,
        use_gae=True,
        use_td_lambda_return=True)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)
    train_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    my_policy = tf_agent.policy
    saver = PolicySaver(my_policy, batch_size=None)

    def train_step():
        trajectories = train_replay_buffer.gather_all()
        return tf_agent.train(experience=trajectories)

    def evaluate(policy, step_count):
        create_video(tf_env, policy, 10,
                     f'agent/behave/imageio_{step_count}.mp4')

    print("collecting samples initial:")
    collect_driver.run()
    train_replay_buffer = copy.deepcopy(replay_buffer)
    replay_buffer.clear()
    print(f"train size {train_replay_buffer.num_frames()} "
          f"buffer size {replay_buffer.num_frames()}")

    while environment_steps_metric.result() < num_environment_steps and not terminate:
        start_time = time.time()
        print("collecting samples")
        collector_thread = threading.Thread(target=collect_driver.run)
        collector_thread.start()

        start_time = time.time()
        count = 0
        # while collector_thread.is_alive() and not terminate:
        #     count = count + 1
        print(f"Training agent {count}")
        total_loss, _ = train_step()
        print()
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print(f'step = {global_step.numpy()}, loss = {total_loss}, '
              f'env_metric = {environment_steps_metric.result()}')
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print()
        train_replay_buffer.clear()
        print("Training agent Finished")

        print("Waiting for collecting samples thread")
        collector_thread.join()
        print("collecting samples Finished")
        collect_time += time.time() - start_time

        train_replay_buffer = copy.deepcopy(replay_buffer)
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()
        print(f"global_step_val:{global_step_val} % "
              f"log_interval:{log_interval} = {global_step_val % log_interval}")
        # if global_step_val % log_interval == 0:
        print()
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print(f'step = {global_step_val}, loss = {total_loss}')
        steps_per_sec = ((global_step_val - timed_at_step) /
                         (collect_time + train_time))
        print(f'{steps_per_sec} steps/sec')
        print(f'collect_time = {collect_time}, train_time = {train_time}')
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print()
        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0

        if global_step_val % eval_interval == 0:
            print("Evaluating!!")
            saver.save(f'agent/saved/policy_ppo_simple_{global_step_val}')
            policy = tf_agent.policy
            evaluate(policy, global_step_val)

    print("Terminated")
    policy = tf_agent.policy
    evaluate(policy, global_step_val)
def testAgentDoesNotFailWhenNestedObservationActionAndDebugSummaries(self):
  summary_writer = tf.compat.v2.summary.create_file_writer(
      FLAGS.test_tmpdir, flush_millis=10000)
  summary_writer.set_as_default()

  nested_obs_spec = (self._obs_spec, self._obs_spec, {
      'a': self._obs_spec,
      'b': self._obs_spec,
  })
  nested_time_spec = ts.time_step_spec(nested_obs_spec)

  nested_act_spec = (self._action_spec, {
      'c': self._action_spec,
      'd': self._action_spec
  })

  class NestedActorNet(network.DistributionNetwork):

    def __init__(self, dummy_model):
      output_spec = (dummy_model.output_spec, {
          'c': dummy_model.output_spec,
          'd': dummy_model.output_spec,
      })
      super(NestedActorNet, self).__init__(
          dummy_model.input_tensor_spec, (),
          output_spec=output_spec,
          name='NestedActorNet')
      self.dummy_model = dummy_model

    def call(self, *args, **kwargs):
      dummy_ans, _ = self.dummy_model(*args, **kwargs)
      return (dummy_ans, {'c': dummy_ans, 'd': dummy_ans}), ()

  dummy_model = DummyActorNet(nested_obs_spec, self._action_spec)
  agent = ppo_agent.PPOAgent(
      nested_time_spec,
      nested_act_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=NestedActorNet(dummy_model),
      value_net=DummyValueNet(nested_obs_spec),
      debug_summaries=True)

  observations = tf.constant([
      [[1, 2], [3, 4], [5, 6]],
      [[1, 2], [3, 4], [5, 6]],
  ], dtype=tf.float32)
  observations = (observations, observations, {
      'a': observations,
      'b': observations,
  })

  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)
  actions = (actions, {
      'c': actions,
      'd': actions,
  })

  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  action_distribution_parameters = (action_distribution_parameters, {
      'c': action_distribution_parameters,
      'd': action_distribution_parameters,
  })
  policy_info = action_distribution_parameters

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  agent.train(experience)
def testTrain(self, num_epochs, use_td_lambda_return):
  # Counter used to verify that training increments the train step once per
  # epoch.
  counter = common.create_variable('test_train_counter')
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=DummyActorNet(
          self._obs_spec,
          self._action_spec,
      ),
      value_net=DummyValueNet(self._obs_spec),
      normalize_observations=False,
      num_epochs=num_epochs,
      use_gae=use_td_lambda_return,
      use_td_lambda_return=use_td_lambda_return,
      train_step_counter=counter)
  observations = tf.constant([
      [[1, 2], [3, 4], [5, 6]],
      [[1, 2], [3, 4], [5, 6]],
  ], dtype=tf.float32)

  mid_time_step_val = ts.StepType.MID.tolist()
  time_steps = ts.TimeStep(
      step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  policy_info = action_distribution_parameters

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  # Assert that counter starts out at zero.
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertEqual(0, self.evaluate(counter))

  loss_type = self.evaluate(loss)
  loss_numpy = loss_type.loss

  # Assert that loss is not zero as we are training in a non-episodic env.
  self.assertNotEqual(
      loss_numpy,
      0.0,
      msg=('Loss is exactly zero, looks like no training '
           'was performed due to incomplete episodes.'))

  # Assert that train_op ran increment_counter num_epochs times.
  self.assertEqual(num_epochs, self.evaluate(counter))
def testTrain(self, num_epochs, use_td_lambda_return):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._action_spec,),
      value_net=DummyValueNet(outer_rank=2),
      normalize_observations=False,
      num_epochs=num_epochs,
      use_gae=use_td_lambda_return,
      use_td_lambda_return=use_td_lambda_return)
  observations = tf.constant([
      [[1, 2], [3, 4], [5, 6]],
      [[1, 2], [3, 4], [5, 6]],
  ], dtype=tf.float32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)
  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  policy_info = action_distribution_parameters

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  # Mock the build_train_op to return an op for incrementing this counter.
  counter = tf.train.get_or_create_global_step()
  zero = tf.constant(0, dtype=tf.float32)
  agent.build_train_op = (
      lambda *_, **__: tf_agent.LossInfo(
          counter.assign_add(1),  # pylint: disable=g-long-lambda
          ppo_agent.PPOLossInfo(*[zero] * 5)))

  train_op = agent.train(experience)

  with self.cached_session() as sess:
    sess.run(tf.global_variables_initializer())

    # Assert that counter starts out at zero.
    counter_ = sess.run(counter)
    self.assertEqual(0, counter_)

    sess.run(train_op)

    # Assert that train_op ran increment_counter num_epochs times.
    counter_ = sess.run(counter)
    self.assertEqual(num_epochs, counter_)
actor_net = actor_distribution_network.ActorDistributionNetwork(
    environment.observation_spec(),
    environment.action_spec(),
    fc_layer_params=(200, 100))
value_net = value_network.ValueNetwork(environment.observation_spec(),
                                       fc_layer_params=(200, 100))
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
train_step_counter = tf.compat.v2.Variable(0)

tf_agent = ppo_agent.PPOAgent(
    time_step_spec=environment.time_step_spec(),
    action_spec=environment.action_spec(),
    actor_net=actor_net,
    value_net=value_net,
    optimizer=optimizer,
    train_step_counter=train_step_counter,
    discount_factor=0.995,
    gradient_clipping=0.5,
    entropy_regularization=1e-2,
    importance_ratio_clipping=0.2,
    use_gae=True,
    use_td_lambda_return=True)
tf_agent.initialize()

eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec, batch_size=1, max_length=20000)

tf_agent.train = common.function(tf_agent.train)
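# A hedged sketch of how the pieces above are typically driven: collect whole
# episodes with a driver, train on everything gathered, then clear the buffer.
# This mirrors the collect/train/clear loops used elsewhere in this file; the
# driver wiring and `num_iterations` below are assumptions for illustration.
from tf_agents.drivers import dynamic_episode_driver

num_iterations = 100  # assumed; tune per task
collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    environment,  # the same environment used above
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=2)

for _ in range(num_iterations):
    collect_driver.run()                       # collect full episodes
    trajectories = replay_buffer.gather_all()  # read everything collected
    train_loss, _ = tf_agent.train(experience=trajectories)
    replay_buffer.clear()                      # PPO is on-policy: drop old data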
tf_env = tf_py_environment.TFPyEnvironment(
    BombermanEnvironment(mode="no_bomb"))
eval_tf_env = tf_py_environment.TFPyEnvironment(
    BombermanEnvironment(mode="no_bomb"))

actor_net, value_net = create_networks(tf_env)

train_step = tf.Variable(0)
update_period = 4
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)  # todo: fine-tune

agent = ppo_agent.PPOAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=25,
    gradient_clipping=0.5,
    entropy_regularization=1e-2,
    importance_ratio_clipping=0.2,
    use_gae=True,
    use_td_lambda_return=True)
agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=10000)  # todo: fine-tune

replay_buffer_observer = replay_buffer.add_batch
def train():
    normalizer = Normalizer(0, 499)
    sae = StateAutoEncoder(
        1, 1, num_state_bits, normalize=True, normalizer=normalizer)
    sae.use_checkpoints(encoder_path)

    train_env, _ = load_env(env_name, sae)

    master_action_spec = array_spec.BoundedArraySpec(
        shape=(num_options,),
        dtype=np.float32,
        minimum=0,
        maximum=1,
        name='master_action')

    options_observation_spec = array_spec.BoundedArraySpec(
        shape=(num_options + num_state_bits,),
        dtype=np.float32,
        minimum=0,
        maximum=1,
        name='option_observation')
    options_action_spec = array_spec.BoundedArraySpec(
        shape=(num_state_bits, 2),
        dtype=np.float32,
        minimum=0,
        maximum=1,
        name='option_action')
    options_time_step_spec = ts.TimeStep(
        step_type=train_env.time_step_spec().step_type,
        reward=train_env.time_step_spec().reward,
        discount=train_env.time_step_spec().discount,
        observation=options_observation_spec)

    num_actions = (train_env.action_spec().maximum -
                   train_env.action_spec().minimum + 1)
    low_level_model, callbacks = setup_model(
        num_actions, num_state_bits, sae, low_level_model_path)

    low_level_env = LowLevelEnv(train_env, low_level_model)

    options_env = OptionsEnv(low_level_env, options_observation_spec,
                             options_action_spec)
    option_train_env = tf_py_environment.TFPyEnvironment(options_env)

    master_env = MasterEnv(low_level_env, master_action_spec)
    master_train_env = tf_py_environment.TFPyEnvironment(master_env)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    master_value_network = value_network.ValueNetwork(
        master_train_env.time_step_spec().observation,
        fc_layer_params=(100,))
    master_actor_network = actor_distribution_network.ActorDistributionNetwork(
        master_train_env.time_step_spec().observation,
        master_train_env.action_spec(),
        fc_layer_params=(100,))
    master_agent = ppo_agent.PPOAgent(
        master_train_env.time_step_spec(),
        master_train_env.action_spec(),
        optimizer=optimizer,
        actor_net=master_actor_network,
        value_net=master_value_network,
        train_step_counter=tf.Variable(0))
    master_agent.initialize()
    master_agent.train = common.function(master_agent.train)

    options_env.set_master_policy(master_agent.policy)

    options_critic_net = critic_network.CriticNetwork(
        (option_train_env.observation_spec(), option_train_env.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(100,),
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')
    options_actor_net = OptionsNetwork(option_train_env.observation_spec(),
                                       option_train_env.action_spec(), 4)
    options_agent = sac_agent.SacAgent(
        option_train_env.time_step_spec(),
        option_train_env.action_spec(),
        actor_network=options_actor_net,
        critic_network=options_critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=tf.Variable(0))
    options_agent.initialize()
    options_agent.train = common.function(options_agent.train)

    master_env.set_options_policy(options_agent.policy)

    master_rb = create_replay_buffer(master_agent, batch_size,
                                     replay_buffer_max_length)
    options_rb = create_replay_buffer(options_agent, batch_size,
                                      replay_buffer_max_length)

    master_ds = master_rb.as_dataset(
        num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2)
    master_iter = iter(master_ds)
    options_ds = options_rb.as_dataset(
        num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2)
    options_iter = iter(options_ds)

    master_checkpointer = create_train_checkpointer(
        checkpoint_dir + "master/", master_agent, master_rb, global_step)
    options_checkpointer = create_train_checkpointer(
        checkpoint_dir + "options/", options_agent, options_rb, global_step)

    master_saver = policy_saver.PolicySaver(master_agent.policy)
    options_saver = policy_saver.PolicySaver(options_agent.policy)

    def check_interval(interval):
        return global_step % interval == 0

    while global_step < num_iterations:
        populate_buffer(master_train_env, master_rb,
                        master_agent.collect_policy,
                        master_agent.time_step_spec, master_collect_steps,
                        batch_size)
        for _ in range(warmup_period):
            experience, unused_info = next(master_iter)
            master_loss = master_agent.train(experience)

        for _ in range(joint_update_period):
            populate_buffer(master_train_env, master_rb,
                            master_agent.collect_policy,
                            master_agent.time_step_spec, 2, batch_size)
            populate_buffer(option_train_env, options_rb,
                            options_agent.collect_policy,
                            options_agent.time_step_spec, 2, batch_size)

            option_exp, unused_info = next(options_iter)
            options_loss = options_agent.train(option_exp)
            master_exp, unused_info = next(master_iter)
            master_loss = master_agent.train(master_exp)

        global_step.assign_add(1)

        if check_interval(log_interval):
            print('step = {0}: master loss = {1}, options loss = {2}'.format(
                global_step.value(), master_loss, options_loss))

        if check_interval(checkpoint_interval):
            master_checkpointer.save(global_step)
            options_checkpointer.save(global_step)
            print('Checkpoint saved!')

        # Reset master here

    master_saver.save(save_dir + "master/")
    options_saver.save(save_dir + "options/")
    print("Policies Saved!")
def train_eval(
        root_dir,
        summary_dir,
        game_config,
        tf_master='',
        env_load_fn=None,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(150, 75),
        value_fc_layers=(150, 75),
        actor_fc_layers_rnn=(150,),
        value_fc_layers_rnn=(150,),
        use_rnns=True,
        # Params for collect
        num_environment_steps=int(3e08),
        collect_episodes_per_iteration=90,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=5000,
        # Params for summaries and logging
        train_checkpoint_interval=2000,
        policy_checkpoint_interval=1000,
        rb_checkpoint_interval=4000,
        log_interval=500,
        summary_interval=500,
        summaries_flush_secs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None,
        eval_py_env=None,
        tf_env=None):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    tf.reset_default_graph()

    # ################################################ #
    # ------------ Create summary-writers ------------ #
    # ################################################ #
    root_dir = os.path.expanduser(root_dir)
    summary_dir = os.path.join(summary_dir, FOLDERNAME)
    train_dir = os.path.join(os.path.join(root_dir, 'train'), FOLDERNAME)
    eval_dir = os.path.join(os.path.join(root_dir, 'eval'), FOLDERNAME)

    train_summary_writer, eval_summary_writer = get_writers_train_eval(
        summary_dir, eval_dir, filename_suffix=FILENAME_SUFFIX)
    eval_metrics = get_metrics_eval(num_parallel_environments,
                                    num_eval_episodes)
    eval_summary_writer_flush_op = eval_summary_writer.flush()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        # ################################################ #
        # ---------------- Create Networks --------------- #
        # ################################################ #
        if use_rnns:
            actor_net, value_net = get_networks(
                tf_env, {
                    "actor_net": actor_fc_layers_rnn,
                    "value_net": value_fc_layers_rnn
                })
        else:
            actor_net, value_net = get_networks(tf_env, {
                "actor_net": actor_fc_layers,
                "value_net": value_fc_layers
            })

        state_pred_net = custom_environment.predictive_models.StatePredictor(
            state_pred_l1, state_pred_l2, num_parallel_environments,
            curiosity_param)
        action_pred_net = custom_environment.predictive_models.ActionPredictor(
            action_pred_l1, action_pred_l2, action_pred_l3)

        #traj = trajectory.Trajectory(
        #    step_type=[],
        #    observation=[],
        #    action=[],
        #    policy_info=[],
        #    next_step_type=[],
        #    reward=[],
        #    discount=[])

        # ################################################ #
        # ---------------- Create PPO Agent -------------- #
        # ################################################ #
        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            entropy_regularization=0,  # 0.1 up to 0.4
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step,
            normalize_observations=False
        )  # because the observations also include the 0-1 mask

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        # ################################################ #
        # ---------------- Create Metrics ---------------- #
        # ################################################ #
        train_metrics, step_metrics, environment_steps_count = (
            get_metrics_train_and_step(num_eval_episodes,
                                       num_parallel_environments))

        # Add to replay buffer and other agent specific observers.
        replay_buffer_observer = [replay_buffer.add_batch]

        # ################################################ #
        # ----------------- Trajectories ----------------- #
        # ################################################ #
        collect_policy = tf_agent.collect_policy
        collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=replay_buffer_observer + train_metrics,
            num_episodes=collect_episodes_per_iteration).run()

        trajectories = replay_buffer.gather_all()
        train_op, _ = tf_agent.train(trajectories)

        # Prediction Implementation OPs
        gather_op = replay_buffer.gather_all()
        clear_op = replay_buffer.clear()

        step_type = tf.placeholder("int32", None)
        state = tf.placeholder("uint8", [None, None])
        info = tf.placeholder("int64", None)
        mask = tf.placeholder("float32", [None, None])
        state2 = tf.placeholder("uint8", [None, None])
        action = tf.placeholder("int64", None)
        logits = tf.placeholder("float32", [None, None])
        next_step_type = tf.placeholder("int32", None)
        reward = tf.placeholder("float32", None)
        discount = tf.placeholder("float32", None)

        traj = trajectory.Trajectory(
            step_type=step_type,
            observation={
                'state': state,
                'mask': mask,
                'info': info,
                'state2': state2
            },
            action=action,
            policy_info={'logits': logits},
            next_step_type=next_step_type,
            reward=reward,
            discount=discount)
        add_batch_op = replay_buffer.add_batch(traj)

        # printing
        #print_op = tf.print(trajectories)
        #with tf.control_dependencies([print_op]):
        #    train_op, _ = tf_agent.train(trajectories)

        with tf.control_dependencies([train_op]):
            clear_replay_op = replay_buffer.clear()
        with tf.control_dependencies([clear_replay_op]):
            train_op = tf.identity(train_op)

        # ################################################ #
        # ------------ Create Checkpointers -------------- #
        # ################################################ #
        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=tf_agent.policy,
            global_step=global_step)
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=replay_buffer)

        # ################################################ #
        # -------------- Create Summary Ops -------------- #
        # ################################################ #
        summary_ops = []
        for train_metric in train_metrics:
            summary_ops.append(
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics))

        with eval_summary_writer.as_default(), \
                tf.compat.v2.summary.record_if(True):
            for eval_metric in eval_metrics:
                eval_metric.tf_summaries(train_step=global_step,
                                         step_metrics=step_metrics)

        init_agent_op = tf_agent.initialize()

        # ################################################ #
        # --------------- Initialize Graph --------------- #
        # ################################################ #
        with tf.compat.v1.Session(tf_master) as sess:
            train_checkpointer.initialize_or_restore(sess)
            rb_checkpointer.initialize_or_restore(sess)
            common.initialize_uninitialized_variables(sess)
            sess.run(init_agent_op)
            sess.run(train_summary_writer.init())
            sess.run(eval_summary_writer.init())

            collect_time = 0
            train_time = 0
            timed_at_step = sess.run(global_step)

            steps_per_second_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(), name='steps_per_sec_ph')
            steps_per_second_summary = tf.compat.v2.summary.scalar(
                name='global_steps_per_sec',
                data=steps_per_second_ph,
                step=global_step)

            # ################################################ #
            # -------------------- Loop ---------------------- #
            # ------------ Collect/Train/Write --------------- #
            # ################################################ #
            while sess.run(environment_steps_count) < num_environment_steps:
                global_step_val = sess.run(global_step)
                if global_step_val % eval_interval == 0:
                    metric_utils.compute_summaries(
                        eval_metrics,
                        eval_py_env,
                        eval_py_policy,
                        num_episodes=num_eval_episodes,
                        global_step=global_step_val,
                        callback=eval_metrics_callback,
                        log=True,
                    )
                    sess.run(eval_summary_writer_flush_op)

                start_time = time.time()
                sess.run(collect_op)
                collect_time += time.time() - start_time

                # ################################################ #
                # -------- Prediction-Implementation Start ------- #
                # ################################################ #
                if statePred or actionPred:
                    # get trajectory and clear Replay-Buffer
                    collectedTrajectory = sess.run(gather_op)
                    if curiosity:  # experimental
                        sess.run(clear_op)
                        # augment reward in trajectory
                        collectedTrajectory = state_pred_net.augmentReward(
                            collectedTrajectory)
                        # write augmented trajectory back into replay buffer
                        for j in range(len(collectedTrajectory[0][0])):
                            i = j - 1
                            sess.run(
                                add_batch_op,
                                feed_dict={
                                    step_type:
                                        collectedTrajectory[0][:, i],
                                    state:
                                        collectedTrajectory[1].get("state")[:, i, :],
                                    info:
                                        collectedTrajectory[1].get("info")[:, i],
                                    mask:
                                        collectedTrajectory[1].get("mask")[:, i, :],
                                    state2:
                                        collectedTrajectory[1].get("state2")[:, i, :],
                                    action:
                                        collectedTrajectory[2][:, i],
                                    logits:
                                        collectedTrajectory[3].get("logits")[:, i, :],
                                    next_step_type:
                                        collectedTrajectory[4][:, i],
                                    reward:
                                        collectedTrajectory[5][:, i],
                                    discount:
                                        collectedTrajectory[6][:, i]
                                })
                    # train prediction network
                    if statePred:
                        state_pred_net.train(collectedTrajectory, True)
                    if actionPred:
                        action_pred_net.train(collectedTrajectory, True)
                # ################################################ #
                # ------ Prediction-Implementation Stop ---------- #
                # ################################################ #

                train_time = 0
                total_loss = -1  # indicates that there was no training
                if train:
                    start_time = time.time()
                    total_loss, _ = sess.run([train_op, summary_ops])
                    train_time += time.time() - start_time

                # ################################################ #
                # ---------- Logging and Checkpointing ----------- #
                # ################################################ #
                if saveModel:
                    global_step_val = sess.run(global_step)
                    if global_step_val % log_interval == 0:
                        logging.info('step = %d, loss = %f', global_step_val,
                                     total_loss)
                        steps_per_sec = ((global_step_val - timed_at_step) /
                                         (collect_time + train_time))
                        logging.info('%.3f steps/sec', steps_per_sec)
                        sess.run(
                            steps_per_second_summary,
                            feed_dict={steps_per_second_ph: steps_per_sec})
                        logging.info(
                            '%s', 'collect_time = {}, train_time = {}'.format(
                                collect_time, train_time))
                        timed_at_step = global_step_val
                        collect_time = 0
                        train_time = 0

                    if global_step_val % train_checkpoint_interval == 0:
                        train_checkpointer.save(global_step=global_step_val)
                    if global_step_val % policy_checkpoint_interval == 0:
                        policy_checkpointer.save(global_step=global_step_val)
                    if global_step_val % rb_checkpoint_interval == 0:
                        rb_checkpointer.save(global_step=global_step_val)

            if saveModel:
                # One final eval before exiting.
                metric_utils.compute_summaries(
                    eval_metrics,
                    eval_py_env,
                    eval_py_policy,
                    num_episodes=num_eval_episodes,
                    global_step=global_step_val,
                    callback=eval_metrics_callback,
                    log=True,
                )
                sess.run(eval_summary_writer_flush_op)

    tf.reset_default_graph()
def testBuildTrainOp(self):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._action_spec,),
      value_net=DummyValueNet(),
      normalize_observations=False,
      normalize_rewards=False,
      value_pred_loss_coef=1.0,
      policy_l2_reg=1e-4,
      value_function_l2_reg=1e-4,
      entropy_regularization=0.1,
      importance_ratio_clipping=10,
  )
  observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                             dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
  returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
  sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                        dtype=tf.float32)
  advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
  valid_mask = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
  sample_action_distribution_parameters = {
      'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]], dtype=tf.float32),
      'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]], dtype=tf.float32),
  }
  train_step = tf.train.get_or_create_global_step()

  (train_op, losses) = agent.build_train_op(
      time_steps,
      actions,
      sample_action_log_probs,
      returns,
      advantages,
      sample_action_distribution_parameters,
      valid_mask,
      train_step,
      summarize_gradients=False,
      gradient_clipping=0.0,
      debug_summaries=False)
  (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
   entropy_reg_loss, kl_penalty_loss) = losses

  # Run train_op once.
  self.evaluate(tf.global_variables_initializer())
  total_loss_, pg_loss_, ve_loss_, l2_loss_, ent_loss_, kl_penalty_loss_ = (
      self.evaluate([
          train_op, policy_gradient_loss, value_estimation_loss,
          l2_regularization_loss, entropy_reg_loss, kl_penalty_loss
      ]))

  # Check that loss values are as expected. The factor of 2/4 is because four
  # timesteps were included in the data, but two were masked out. Reduce_means
  # in the losses will divide by 4, but the computed loss values are for the
  # first 2 timesteps.
  # Value net weights are [2, 1] and actor net weights are [2, 1, 1, 1], so
  # the summed squared weights are (2**2 + 1) + (2**2 + 1 + 1 + 1) = 12.
  expected_pg_loss = -0.0164646133 * 2 / 4
  expected_ve_loss = 123.205 * 2 / 4
  expected_l2_loss = 1e-4 * 12 * 2 / 4
  expected_ent_loss = -0.370111 * 2 / 4
  expected_kl_penalty_loss = 0.0
  self.assertAllClose(
      expected_pg_loss + expected_ve_loss + expected_l2_loss +
      expected_ent_loss + expected_kl_penalty_loss,
      total_loss_,
      atol=0.001,
      rtol=0.001)
  self.assertAllClose(expected_pg_loss, pg_loss_)
  self.assertAllClose(expected_ve_loss, ve_loss_)
  self.assertAllClose(expected_l2_loss, l2_loss_, atol=0.001, rtol=0.001)
  self.assertAllClose(expected_ent_loss, ent_loss_)
  self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss_)

  # Assert that train_step was incremented.
  self.assertEqual(1, self.evaluate(train_step))
def train_eval(
        root_dir,
        random_seed=0,
        num_epochs=1000000,
        # Params for train
        normalize_observations=True,
        normalize_rewards=True,
        discount_factor=1.0,
        lr=1e-5,
        lr_schedule=None,
        num_policy_updates=20,
        initial_adaptive_kl_beta=0.0,
        kl_cutoff_factor=0,
        importance_ratio_clipping=0.2,
        value_pred_loss_coef=0.5,
        gradient_clipping=None,
        entropy_regularization=0.0,
        log_prob_clipping=0.0,
        # Params for log, eval, save
        eval_interval=100,
        save_interval=1000,
        checkpoint_interval=None,
        summary_interval=100,
        do_evaluation=True,
        # Params for data collection
        train_batch_size=10,
        eval_batch_size=100,
        collect_driver=None,
        eval_driver=None,
        replay_buffer_capacity=20000,
        # Policy and value networks
        ActorNet=actor_distribution_network.ActorDistributionNetwork,
        zero_means_kernel_initializer=False,
        init_action_stddev=0.35,
        actor_fc_layers=(),
        value_fc_layers=(),
        use_rnn=True,
        actor_lstm_size=(12,),
        value_lstm_size=(12,),
        **kwargs):
    """A simple train and eval for the PPO agent.

    Args:
        root_dir (str): directory for saving training and evaluation data
        random_seed (int): seed for the random number generator
        num_epochs (int): number of training epochs. At each epoch a batch
            of data is collected according to one stochastic policy, and
            then the policy is updated.
        normalize_observations (bool): flag for normalization of
            observations. Uses StreamingTensorNormalizer which normalizes
            based on the whole history of observations.
        normalize_rewards (bool): flag for normalization of rewards. Uses
            StreamingTensorNormalizer which normalizes based on the whole
            history of rewards.
        discount_factor (float): rewards discount factor, should be in (0, 1]
        lr (float): learning rate for Adam optimizer
        lr_schedule (callable: int -> float, optional): function to schedule
            the learning rate annealing. Takes as argument the int epoch
            number and returns a float value of the learning rate.
        num_policy_updates (int): number of policy gradient steps to do on
            each epoch of training. In PPO this is typically > 1.
        initial_adaptive_kl_beta (float): see tf-agents PPO docs
        kl_cutoff_factor (float): see tf-agents PPO docs
        importance_ratio_clipping (float): clipping value for the importance
            ratio. Should discourage the policy from doing updates that
            significantly change the policy. Should be in (0, 1]
        value_pred_loss_coef (float): weight coefficient for the quadratic
            value estimation loss.
        gradient_clipping (float): gradient clipping coefficient.
        entropy_regularization (float): entropy regularization loss
            coefficient.
        log_prob_clipping (float): +/- value for clipping log probs to
            prevent inf / NaN values. Default: no clipping.
        eval_interval (int): interval between evaluations, counted in epochs.
        save_interval (int): interval between savings, counted in epochs.
            It updates the log file and saves the deterministic policy.
        checkpoint_interval (int): interval between saving checkpoints,
            counted in epochs. Overwrites the previously saved one. Defaults
            to None, in which case checkpoints are not saved.
        summary_interval (int): interval between summary writing, counted in
            epochs. tf-agents takes care of summary writing; results can
            later be displayed in tensorboard.
        do_evaluation (bool): flag to interleave training epochs with
            evaluation epochs.
        train_batch_size (int): training batch size, collected in parallel.
        eval_batch_size (int): batch size for evaluation of the policy.
        collect_driver (Driver): driver for training data collection
        eval_driver (Driver): driver for evaluation data collection
        replay_buffer_capacity (int): how many transition tuples the buffer
            can store. The buffer is emptied and re-populated at each epoch.
        ActorNet (network.DistributionNetwork): a distribution actor network
            to use for training. The default is ActorDistributionNetwork
            from tf-agents, but this can also be customized.
        zero_means_kernel_initializer (bool): flag to initialize the means
            projection network with zeros. If this flag is not set, the
            default tf-agents random initializer is used.
        init_action_stddev (float): initial stddev of the normal action dist.
        actor_fc_layers (tuple): sizes of fully connected layers in actor net.
        value_fc_layers (tuple): sizes of fully connected layers in value net.
        use_rnn (bool): whether to use LSTM units in the neural net.
        actor_lstm_size (tuple): sizes of LSTM layers in actor net.
        value_lstm_size (tuple): sizes of LSTM layers in value net.
    """
    # --------------------------------------------------------------------
    # --------------------------------------------------------------------
    tf.compat.v1.set_random_seed(random_seed)

    # Setup directories within 'root_dir'
    if not os.path.isdir(root_dir):
        os.mkdir(root_dir)
    policy_dir = os.path.join(root_dir, 'policy')
    checkpoint_dir = os.path.join(root_dir, 'checkpoint')
    logfile = os.path.join(root_dir, 'log.hdf5')
    train_dir = os.path.join(root_dir, 'train_summaries')

    # Create tf summary writer
    train_summary_writer = tf.compat.v2.summary.create_file_writer(train_dir)
    train_summary_writer.set_as_default()
    summary_interval *= num_policy_updates

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):

        # Define action and observation specs
        observation_spec = collect_driver.observation_spec()
        action_spec = collect_driver.action_spec()

        # Preprocessing: flatten and concatenate observation components
        preprocessing_layers = {
            obs: tf.keras.layers.Flatten()
            for obs in observation_spec.keys()
        }
        preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)

        # Define actor network and value network
        if use_rnn:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                input_tensor_spec=observation_spec,
                output_tensor_spec=action_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                input_fc_layer_params=None,
                lstm_size=actor_lstm_size,
                output_fc_layer_params=actor_fc_layers)
            value_net = value_rnn_network.ValueRnnNetwork(
                input_tensor_spec=observation_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                input_fc_layer_params=None,
                lstm_size=value_lstm_size,
                output_fc_layer_params=value_fc_layers)
        else:
            npn = actor_distribution_network._normal_projection_net
            normal_projection_net = lambda specs: npn(
                specs,
                zero_means_kernel_initializer=zero_means_kernel_initializer,
                init_action_stddev=init_action_stddev)
            actor_net = ActorNet(
                input_tensor_spec=observation_spec,
                output_tensor_spec=action_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                fc_layer_params=actor_fc_layers,
                continuous_projection_net=normal_projection_net)
            value_net = value_network.ValueNetwork(
                input_tensor_spec=observation_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                fc_layer_params=value_fc_layers)

        # Create PPO agent
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
        tf_agent = ppo_agent.PPOAgent(
            time_step_spec=collect_driver.time_step_spec(),
            action_spec=action_spec,
            optimizer=optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_policy_updates,
            train_step_counter=global_step,
            discount_factor=discount_factor,
            normalize_observations=normalize_observations,
            normalize_rewards=normalize_rewards,
            initial_adaptive_kl_beta=initial_adaptive_kl_beta,
            kl_cutoff_factor=kl_cutoff_factor,
            importance_ratio_clipping=importance_ratio_clipping,
            gradient_clipping=gradient_clipping,
            value_pred_loss_coef=value_pred_loss_coef,
            entropy_regularization=entropy_regularization,
            log_prob_clipping=log_prob_clipping,
            debug_summaries=True)
        tf_agent.initialize()
        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        # Create replay buffer and collection driver
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=tf_agent.collect_data_spec,
            batch_size=train_batch_size,
            max_length=replay_buffer_capacity)

        def train_step():
            experience = replay_buffer.gather_all()
            return tf_agent.train(experience)

        tf_agent.train = common.function(tf_agent.train)

        avg_return_metric = tf_metrics.AverageReturnMetric(
            batch_size=eval_batch_size, buffer_size=eval_batch_size)
        collect_driver.setup(collect_policy, [replay_buffer.add_batch])
        eval_driver.setup(eval_policy, [avg_return_metric])

        # Create a checkpointer and load the saved agent
        train_checkpointer = common.Checkpointer(
            ckpt_dir=checkpoint_dir,
            max_to_keep=1,
            agent=tf_agent,
            policy=tf_agent.policy,
            replay_buffer=replay_buffer,
            global_step=global_step)
        train_checkpointer.initialize_or_restore()
        global_step = tf.compat.v1.train.get_global_step()

        # Saver for the deterministic policy
        saved_model = policy_saver.PolicySaver(eval_policy,
                                               train_step=global_step)

        # Evaluate policy once before training
        if do_evaluation:
            eval_driver.run(0)
            avg_return = avg_return_metric.result().numpy()
            avg_return_metric.reset()
            log = {
                'returns': [avg_return],
                'epochs': [0],
                'policy_steps': [0],
                'experience_time': [0.0],
                'train_time': [0.0]
            }
            print('-------------------')
            print('Epoch 0')
            print(' Policy steps: 0')
            print(' Experience time: 0.00 mins')
            print(' Policy train time: 0.00 mins')
            print(' Average return: %.5f' % avg_return)

        # Save initial random policy
        path = os.path.join(policy_dir, ('0').zfill(6))
        saved_model.save(path)

        # Training loop
        train_timer = timer.Timer()
        experience_timer = timer.Timer()
        for epoch in range(1, num_epochs + 1):
            # Collect new experience
            experience_timer.start()
            collect_driver.run(epoch)
            experience_timer.stop()

            # Update the policy
            train_timer.start()
            if lr_schedule:
                optimizer._lr = lr_schedule(epoch)
            train_loss = train_step()
            replay_buffer.clear()
            train_timer.stop()

            if (epoch % eval_interval == 0) and do_evaluation:
                # Evaluate the policy
                eval_driver.run(epoch)
                avg_return = avg_return_metric.result().numpy()
                avg_return_metric.reset()

                # Print out and log all metrics
                print('-------------------')
                print('Epoch %d' % epoch)
                print(' Policy steps: %d' % (epoch * num_policy_updates))
                print(' Experience time: %.2f mins' %
                      (experience_timer.value() / 60))
                print(' Policy train time: %.2f mins' %
                      (train_timer.value() / 60))
                print(' Average return: %.5f' % avg_return)
                log['epochs'].append(epoch)
                log['policy_steps'].append(epoch * num_policy_updates)
                log['returns'].append(avg_return)
                log['experience_time'].append(experience_timer.value())
                log['train_time'].append(train_timer.value())

                # Save updated log
                save_log(log, logfile, ('%d' % epoch).zfill(6))

            if epoch % save_interval == 0:
                # Save deterministic policy
                path = os.path.join(policy_dir, ('%d' % epoch).zfill(6))
                saved_model.save(path)

            if checkpoint_interval is not None and \
                    epoch % checkpoint_interval == 0:
                # Save training checkpoint
                train_checkpointer.save(global_step)

        collect_driver.finish_training()
        eval_driver.finish_training()
def train_eval_doom_simple(
        # Params for collect
        num_environment_steps=30000000,
        collect_episodes_per_iteration=32,
        num_parallel_environments=32,
        replay_buffer_capacity=301,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=4e-4,
        # Params for eval
        eval_interval=500,
        num_video_episodes=10,
        # Params for summaries and logging
        log_interval=50):
    """A simple train and eval for PPO."""
    # if not os.path.exists(videos_dir):
    #     os.makedirs(videos_dir)

    # eval_py_env = CSGOEnvironment()
    # eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    tf_env = tf_py_environment.TFPyEnvironment(CSGOEnvironment())

    actor_net, value_net = create_networks(tf_env.observation_spec(),
                                           tf_env.action_spec())

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                                 epsilon=1e-5)

    tf_agent = ppo_agent.PPOAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net,
        value_net,
        num_epochs=num_epochs,
        train_step_counter=global_step,
        discount_factor=0.99,
        gradient_clipping=0.5,
        entropy_regularization=1e-2,
        importance_ratio_clipping=0.2,
        use_gae=True,
        use_td_lambda_return=True)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    def train_step():
        trajectories = replay_buffer.gather_all()
        return tf_agent.train(experience=trajectories)

    # def evaluate():
    #     create_video(eval_py_env, eval_tf_env, tf_agent.policy,
    #                  num_episodes=num_video_episodes,
    #                  video_filename=os.path.join(
    #                      videos_dir, "video_%d.mp4" % global_step_val))

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    while environment_steps_metric.result() < num_environment_steps:
        start_time = time.time()
        collect_driver.run()
        collect_time += time.time() - start_time

        start_time = time.time()
        total_loss, _ = train_step()
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()
        if global_step_val % log_interval == 0:
            logging.info('step = %d, loss = %f', global_step_val, total_loss)
            steps_per_sec = ((global_step_val - timed_at_step) /
                             (collect_time + train_time))
            logging.info('%.3f steps/sec', steps_per_sec)
            logging.info('collect_time = {}, train_time = {}'.format(
                collect_time, train_time))
            timed_at_step = global_step_val
            collect_time = 0
            train_time = 0
def train(self):
    """Trains a policy using the gym_env.

    Sets training_losses and training_average_returns, depending on the
    training scheme defined in the TrainingDuration configuration.
    """
    # Create training environment, optimizer and PPO agent
    self._log_agent("Creating environment:")
    train_env = self._create_tfagent_env()
    observation_spec = train_env.observation_spec()
    action_spec = train_env.action_spec()
    timestep_spec = train_env.time_step_spec()

    self._log_agent("Creating agent:")
    self._log_agent("  creating tf.compat.v1.train.AdamOptimizer( ... )")
    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=self._learning_rate)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec, action_spec, fc_layer_params=self.fc_layers)
    value_net = value_network.ValueNetwork(
        observation_spec, fc_layer_params=self.fc_layers)

    self._log_agent("  creating PpoAgent( ... )")
    tf_agent = ppo_agent.PPOAgent(
        timestep_spec,
        action_spec,
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=self._training_duration.num_epochs_per_iteration)
    self._log_agent("  executing tf_agent.initialize()")
    tf_agent.initialize()
    self._trained_policy = tf_agent.policy

    # Data collection
    self._log_agent("Creating data collection:")
    collect_data_spec = tf_agent.collect_data_spec
    self._log_agent("  creating TFUniformReplayBuffer()")
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        collect_data_spec,
        batch_size=1,
        max_length=self._num_training_steps_in_replay_buffer)

    collect_policy = tf_agent.collect_policy
    self._log_agent("  creating DynamicEpisodeDriver()")
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=self._training_duration.num_episodes_per_iteration)

    # Train
    collect_driver.run = common.function(collect_driver.run, autograph=False)
    tf_agent.train = common.function(tf_agent.train, autograph=False)

    self._clear_average_rewards_and_steps_log()
    self._record_average_rewards_and_steps()
    self.training_losses = []
    self._log_agent("Starting training:")
    for step in range(1, self._training_duration.num_iterations + 1):
        msg = f'training {step:4} of {self._training_duration.num_iterations:<4}:'
        self._log_agent(msg + " executing collect_driver.run()")
        collect_driver.run()
        self._log_agent(msg + " executing replay_buffer.gather_all()")
        trajectories = replay_buffer.gather_all()
        self._log_agent(msg + " executing tf_agent.train(...)")
        total_loss, _ = tf_agent.train(experience=trajectories)
        self.training_losses.append(float(total_loss))
        self._log_minimal(
            f'{msg} completed tf_agent.train(...) = {total_loss.numpy():>8.3f} [loss]')
        self._log_agent(msg + " executing replay_buffer.clear()")
        replay_buffer.clear()
        if step % self._training_duration.num_iterations_between_eval == 0:
            self._record_average_rewards_and_steps()
    return
def testDebugSummaries(self):
    logdir = self.get_temp_dir()
    with tf.contrib.summary.create_file_writer(
            logdir,
            max_queue=None,
            flush_millis=None,
            filename_suffix=None,
            name=None).as_default():
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.train.AdamOptimizer(),
            actor_net=DummyActorNet(self._action_spec),
            value_net=DummyValueNet(),
            debug_summaries=True,
        )
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[0], [1]], dtype=tf.float32)
        returns = tf.constant([1.9, 1.0], dtype=tf.float32)
        sample_action_log_probs = tf.constant([0.9, 0.3], dtype=tf.float32)
        advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
        weights = tf.ones_like(advantages)
        sample_action_distribution_parameters = {
            'loc': tf.constant([[9.0], [15.0]], dtype=tf.float32),
            'scale': tf.constant([[8.0], [12.0]], dtype=tf.float32),
        }
        train_step = tf.train.get_or_create_global_step()

        with self.cached_session() as sess:
            tf.contrib.summary.initialize(session=sess)

            (_, _) = agent.build_train_op(
                time_steps,
                actions,
                sample_action_log_probs,
                returns,
                advantages,
                sample_action_distribution_parameters,
                weights,
                train_step,
                summarize_gradients=False,
                gradient_clipping=0.0,
                debug_summaries=False)
            summaries_without_debug = tf.contrib.summary.all_summary_ops()

            (_, _) = agent.build_train_op(
                time_steps,
                actions,
                sample_action_log_probs,
                returns,
                advantages,
                sample_action_distribution_parameters,
                weights,
                train_step,
                summarize_gradients=False,
                gradient_clipping=0.0,
                debug_summaries=True)
            summaries_with_debug = tf.contrib.summary.all_summary_ops()

            self.assertGreater(len(summaries_with_debug),
                               len(summaries_without_debug))
def testSequencePreprocessNotBatched(self, strategy_fn):
    with strategy_fn().scope():
        counter = common.create_variable('test_train_counter')
        n_time_steps = 3
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=DummyActorNet(self._obs_spec, self._action_spec),
            value_net=DummyValueNet(self._obs_spec),
            normalize_observations=False,
            num_epochs=1,
            use_gae=False,
            use_td_lambda_return=False,
            compute_value_and_advantage_in_train=False,
            train_step_counter=counter)
        agent.initialize()

        observations = tf.constant([[1, 2], [3, 4], [5, 6]], dtype=tf.float32)

        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(
            step_type=tf.constant(
                [mid_time_step_val] * n_time_steps, dtype=tf.int32),
            reward=tf.constant([1] * n_time_steps, dtype=tf.float32),
            discount=tf.constant([1] * n_time_steps, dtype=tf.float32),
            observation=observations)
        actions = tf.constant([[0], [1], [1]], dtype=tf.float32)

        old_action_distribution_parameters = {
            'loc': tf.constant([[0.0]] * n_time_steps, dtype=tf.float32),
            'scale': tf.constant([[1.0]] * n_time_steps, dtype=tf.float32),
        }

        value_preds = tf.constant([9., 15., 21.], dtype=tf.float32)
        policy_info = {
            'dist_params': old_action_distribution_parameters,
            'value_prediction': value_preds,
        }
        experience = trajectory.Trajectory(
            time_steps.step_type, observations, actions, policy_info,
            time_steps.step_type, time_steps.reward, time_steps.discount)

        returned_experience = agent.preprocess_sequence(experience)
        self.evaluate(tf.compat.v1.initialize_all_variables())

        self.assertAllClose(observations, returned_experience.observation)
        self.assertAllClose(actions, returned_experience.action)
        self.assertAllClose(old_action_distribution_parameters,
                            returned_experience.policy_info['dist_params'])
        self.assertEqual(n_time_steps,
                         returned_experience.policy_info['return'].shape)
        self.assertAllClose([40.4821, 30.79],
                            returned_experience.policy_info['return'][:-1])
        self.assertEqual(n_time_steps,
                         returned_experience.policy_info['advantage'].shape)
        self.assertAllClose([31.482101, 15.790001],
                            returned_experience.policy_info['advantage'][:-1])
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=500,
        # Params for summaries and logging
        train_checkpoint_interval=500,
        policy_checkpoint_interval=500,
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        eval_tf_env = tf_py_environment.TFPyEnvironment(env_load_fn(env_name))
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [lambda: env_load_fn(env_name)] * num_parallel_environments))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        tf_agent.initialize()

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(
                batch_size=num_parallel_environments),
            tf_metrics.AverageEpisodeLengthMetric(
                batch_size=num_parallel_environments),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=eval_policy,
            global_step=global_step)
        saved_model = policy_saver.PolicySaver(
            eval_policy, train_step=global_step)

        train_checkpointer.initialize_or_restore()

        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        def train_step():
            trajectories = replay_buffer.gather_all()
            return tf_agent.train(experience=trajectories)

        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause for slowdown was identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)
            train_step = common.function(train_step)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            if global_step_val % eval_interval == 0:
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )

            start_time = time.time()
            collect_driver.run()
            collect_time += time.time() - start_time

            start_time = time.time()
            total_loss, _ = train_step()
            replay_buffer.clear()
            train_time += time.time() - start_time

            for train_metric in train_metrics:
                train_metric.tf_summaries(
                    train_step=global_step, step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = {}, train_time = {}'.format(
                    collect_time, train_time))
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(
                        name='global_steps_per_sec',
                        data=steps_per_sec,
                        step=global_step)

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)
                    saved_model_path = os.path.join(
                        saved_model_dir,
                        'policy_' + ('%d' % global_step_val).zfill(9))
                    saved_model.save(saved_model_path)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
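# For reference, a typical invocation of this train_eval. The output
# directory and the reduced step budget are illustrative, not from the
# original:
train_eval(
    root_dir='/tmp/ppo_halfcheetah',   # illustrative output directory
    env_name='HalfCheetah-v2',
    num_environment_steps=100000,      # shortened from the 1e7 default for a quick run
    use_tf_functions=True)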
from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks.value_network import ValueNetwork

actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=actor_fc_layer_params)
value_net = ValueNetwork(train_env.observation_spec())

global_step = tf.compat.v2.Variable(0)
tf_agent = ppo_agent.PPOAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_net=actor_net,
    value_net=value_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    train_step_counter=global_step)
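# A minimal way to drive the agent defined above, assuming train_env is the
# single (batch size 1) TFPyEnvironment used for the specs and that the
# replay-buffer and driver modules are imported as in the other examples
# here. Buffer capacity and episode/iteration counts are illustrative:
tf_agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec, batch_size=1, max_length=1000)
collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    train_env,
    tf_agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=2)

for _ in range(100):  # illustrative number of iterations
    collect_driver.run()                      # gather full episodes
    trajectories = replay_buffer.gather_all()
    loss_info = tf_agent.train(experience=trajectories)
    replay_buffer.clear()                     # PPO is on-policy: discard old data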
def testGetEpochLoss(self):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._obs_spec, self._action_spec),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        normalize_rewards=False,
        value_pred_loss_coef=1.0,
        policy_l2_reg=1e-4,
        value_function_l2_reg=1e-4,
        entropy_regularization=0.1,
        importance_ratio_clipping=10,
    )
    observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                               dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                          dtype=tf.float32)
    advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    weights = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
    sample_action_distribution_parameters = {
        'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]], dtype=tf.float32),
        'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]], dtype=tf.float32),
    }
    train_step = tf.compat.v1.train.get_or_create_global_step()

    loss_info = agent.get_epoch_loss(
        time_steps,
        actions,
        sample_action_log_probs,
        returns,
        advantages,
        sample_action_distribution_parameters,
        weights,
        train_step,
        debug_summaries=False)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    total_loss, extra_loss_info = self.evaluate(loss_info)
    (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
     entropy_reg_loss, kl_penalty_loss) = extra_loss_info

    # Check that loss values are as expected. The factor of 2/4 arises because
    # four timesteps were included in the data but two were masked out: the
    # reduce_means in the losses divide by 4, while the computed loss values
    # come from the first 2 timesteps only.
    expected_pg_loss = -0.0164646133 * 2 / 4
    expected_ve_loss = 123.205 * 2 / 4
    expected_l2_loss = 1e-4 * 12 * 2 / 4
    expected_ent_loss = -0.370111 * 2 / 4
    expected_kl_penalty_loss = 0.0
    self.assertAllClose(
        expected_pg_loss + expected_ve_loss + expected_l2_loss +
        expected_ent_loss + expected_kl_penalty_loss,
        total_loss,
        atol=0.001,
        rtol=0.001)
    self.assertAllClose(expected_pg_loss, policy_gradient_loss)
    self.assertAllClose(expected_ve_loss, value_estimation_loss)
    self.assertAllClose(expected_l2_loss, l2_regularization_loss,
                        atol=0.001, rtol=0.001)
    self.assertAllClose(expected_ent_loss, entropy_reg_loss)
    self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss)
def __init__(
    self,
    landscape: flexs.Landscape,
    rounds: int,
    sequences_batch_size: int,
    model_queries_per_batch: int,
    starting_sequence: str,
    alphabet: str,
    log_file: Optional[str] = None,
    model: Optional[flexs.Model] = None,
    num_experiment_rounds: int = 10,
    num_model_rounds: int = 1,
    env_batch_size: int = 4,
):
    """
    Args:
        num_experiment_rounds: Number of experiment-based rounds to run. This
            defaults to 10, the same as the number of sequence proposal rounds.
        num_model_rounds: Number of model-based rounds to run.
        env_batch_size: Number of episodes to batch together and run in
            parallel.
    """
    tf.config.run_functions_eagerly(False)

    name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

    if model is None:
        model = DynaPPOEnsemble(
            len(starting_sequence),
            alphabet,
        )
        # Some models in the ensemble need to be trained on a dummy dataset
        # before they can predict.
        model.train(
            s_utils.generate_random_sequences(len(starting_sequence), 10, alphabet),
            [0] * 10,
        )

    super().__init__(
        model,
        name,
        rounds,
        sequences_batch_size,
        model_queries_per_batch,
        starting_sequence,
        log_file,
    )

    self.alphabet = alphabet
    self.num_experiment_rounds = num_experiment_rounds
    self.num_model_rounds = num_model_rounds
    self.env_batch_size = env_batch_size

    env = DynaPPOEnv(
        self.alphabet, len(starting_sequence), model, landscape, env_batch_size)
    self.tf_env = tf_py_environment.TFPyEnvironment(env)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self.tf_env.observation_spec(),
        self.tf_env.action_spec(),
        fc_layer_params=[128],
    )
    value_net = value_network.ValueNetwork(
        self.tf_env.observation_spec(), fc_layer_params=[128])

    print(self.tf_env.action_spec())
    self.agent = ppo_agent.PPOAgent(
        time_step_spec=self.tf_env.time_step_spec(),
        action_spec=self.tf_env.action_spec(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=10,
        summarize_grads_and_vars=False,
    )
    self.agent.initialize()
def train_eval(
        root_dir,
        tf_master='',
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=500,
        # Params for summaries and logging
        train_checkpoint_interval=100,
        policy_checkpoint_interval=50,
        rb_checkpoint_interval=200,
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        batched_py_metric.BatchedPyMetric(
            AverageReturnMetric,
            metric_args={'buffer_size': num_eval_episodes},
            batch_size=num_parallel_environments),
        batched_py_metric.BatchedPyMetric(
            AverageEpisodeLengthMetric,
            metric_args={'buffer_size': num_eval_episodes},
            batch_size=num_parallel_environments),
    ]
    eval_summary_writer_flush_op = eval_summary_writer.flush()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        eval_py_env = parallel_py_environment.ParallelPyEnvironment(
            [lambda: env_load_fn(env_name)] * num_parallel_environments)
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [lambda: env_load_fn(env_name)] * num_parallel_environments))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        environment_steps_count = environment_steps_metric.result()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]
        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        # Add to replay buffer and other agent specific observers.
        replay_buffer_observer = [replay_buffer.add_batch]

        collect_policy = tf_agent.collect_policy

        collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=replay_buffer_observer + train_metrics,
            num_episodes=collect_episodes_per_iteration).run()

        trajectories = replay_buffer.gather_all()

        train_op, _ = tf_agent.train(experience=trajectories)

        with tf.control_dependencies([train_op]):
            clear_replay_op = replay_buffer.clear()

        with tf.control_dependencies([clear_replay_op]):
            train_op = tf.identity(train_op)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=tf_agent.policy,
            global_step=global_step)
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=replay_buffer)

        for train_metric in train_metrics:
            train_metric.tf_summaries(
                train_step=global_step, step_metrics=step_metrics)

        with eval_summary_writer.as_default(), \
             tf.compat.v2.summary.record_if(True):
            for eval_metric in eval_metrics:
                eval_metric.tf_summaries(step_metrics=step_metrics)

        init_agent_op = tf_agent.initialize()

        with tf.compat.v1.Session(tf_master) as sess:
            # Initialize graph.
            train_checkpointer.initialize_or_restore(sess)
            rb_checkpointer.initialize_or_restore(sess)
            common.initialize_uninitialized_variables(sess)

            sess.run(init_agent_op)
            sess.run(train_summary_writer.init())
            sess.run(eval_summary_writer.init())

            collect_time = 0
            train_time = 0
            timed_at_step = sess.run(global_step)
            steps_per_second_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(), name='steps_per_sec_ph')
            steps_per_second_summary = tf.contrib.summary.scalar(
                name='global_steps/sec', tensor=steps_per_second_ph)

            while sess.run(environment_steps_count) < num_environment_steps:
                global_step_val = sess.run(global_step)
                if global_step_val % eval_interval == 0:
                    metric_utils.compute_summaries(
                        eval_metrics,
                        eval_py_env,
                        eval_py_policy,
                        num_episodes=num_eval_episodes,
                        global_step=global_step_val,
                        callback=eval_metrics_callback,
                        log=True,
                    )
                    sess.run(eval_summary_writer_flush_op)

                start_time = time.time()
                sess.run(collect_op)
                collect_time += time.time() - start_time
                start_time = time.time()
                total_loss = sess.run(train_op)
                train_time += time.time() - start_time

                global_step_val = sess.run(global_step)
                if global_step_val % log_interval == 0:
                    logging.info('step = %d, loss = %f', global_step_val,
                                 total_loss)
                    steps_per_sec = ((global_step_val - timed_at_step) /
                                     (collect_time + train_time))
                    logging.info('%.3f steps/sec', steps_per_sec)
                    sess.run(
                        steps_per_second_summary,
                        feed_dict={steps_per_second_ph: steps_per_sec})
                    logging.info(
                        '%s', 'collect_time = {}, train_time = {}'.format(
                            collect_time, train_time))
                    timed_at_step = global_step_val
                    collect_time = 0
                    train_time = 0

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)

                if global_step_val % rb_checkpoint_interval == 0:
                    rb_checkpointer.save(global_step=global_step_val)

            # One final eval before exiting.
            metric_utils.compute_summaries(
                eval_metrics,
                eval_py_env,
                eval_py_policy,
                num_episodes=num_eval_episodes,
                global_step=global_step_val,
                callback=eval_metrics_callback,
                log=True,
            )
            sess.run(eval_summary_writer_flush_op)
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(512, 256, 256, 30),
        value_fc_layers=(512, 256, 256, 25),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=NumEpisodes,  # NumEpisodes must be defined elsewhere
        num_parallel_environments=1,
        replay_buffer_capacity=10000,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=5e-4,
        # Params for eval
        num_eval_episodes=5,
        eval_interval=500,
        # Params for summaries and logging
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train6')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        # eval_tf_env = tf_py_environment.TFPyEnvironment(env_load_fn(env_name))
        # tf_env = tf_py_environment.TFPyEnvironment(
        #     parallel_py_environment.ParallelPyEnvironment(
        #         [lambda: env_load_fn(env_name)] * num_parallel_environments))
        env = xSpace()
        if isinstance(env, py_environment.PyEnvironment):
            eval_tf_env = tf_py_environment.TFPyEnvironment(env)
            tf_env = tf_py_environment.TFPyEnvironment(env)
            print("Py Env")
        elif isinstance(env, tf_environment.TFEnvironment):
            eval_tf_env = env
            tf_env = env
            print("TF Env")

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            lambda_value=0.98,
            discount_factor=0.995,
            # value_pred_loss_coef=0.005,
            use_gae=True,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step,
            normalize_observations=False)
        tf_agent.initialize()
        print("************ INITIALIZING **********************")

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]
        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        # this is for TensorBoard
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause for slowdown was identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            eval_tf_env.reset()
            if global_step_val % eval_interval == 0:
                # tf_env.ResetMattData()
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
                eval_tf_env.reset()

            start_time = time.time()
            collect_driver.run()
            collect_time += time.time() - start_time
            print("collect_time: " + str(collect_time))

            start_time = time.time()
            trajectories = replay_buffer.gather_all()
            print('training...')
            total_loss, _ = tf_agent.train(experience=trajectories)
            print('training complete. total loss: ' + str(total_loss))
            replay_buffer.clear()
            train_time += time.time() - start_time

            for train_metric in train_metrics:
                train_metric.tf_summaries(
                    train_step=global_step, step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = {}, train_time = {}'.format(
                    collect_time, train_time))
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(
                        name='global_steps_per_sec',
                        data=steps_per_sec,
                        step=global_step)
                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
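# xSpace is a custom environment not included here; the isinstance branch
# above only requires the TF-Agents PyEnvironment contract. A skeletal
# illustration of that contract (specs and dynamics are placeholders, not the
# real xSpace):
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class MinimalEnv(py_environment.PyEnvironment):
    """Placeholder PyEnvironment showing the interface the snippet assumes."""

    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.ArraySpec(
            shape=(2,), dtype=np.float32, name='observation')

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        return ts.restart(np.zeros(2, dtype=np.float32))

    def _step(self, action):
        # Trivial dynamics: every step terminates with a unit reward.
        return ts.termination(np.zeros(2, dtype=np.float32), reward=1.0)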