def testCreateAgentWithPrebuiltPreprocessingLayers(self):
    dense_layer = tf.keras.Sequential([
        tf.keras.layers.Dense(10),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Reshape([2, 5]),
    ])
    q_net = KerasLayersNet(self._time_step_spec.observation,
                           self._action_spec, dense_layer)
    with self.assertRaisesRegexp(
            ValueError, 'shares weights with the original network'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec,
            categorical_q_network=q_net, optimizer=None)

    # Explicitly share weights between q and target networks.
    # This would be an unusual setup so we check that an error is thrown.
    q_target_net = KerasLayersNet(self._time_step_spec.observation,
                                  self._action_spec, dense_layer)
    with self.assertRaisesRegexp(
            ValueError, 'shares weights with the original network'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec,
            categorical_q_network=q_net, optimizer=None,
            target_categorical_q_network=q_target_net)

def testCreateAgentWithPrebuiltPreprocessingLayers(self):
    dense_layer = tf.keras.layers.Dense(3)
    q_net = KerasLayersNet(self._time_step_spec.observation,
                           self._action_spec, dense_layer)
    with self.assertRaisesRegexp(
            ValueError, 'shares weights with the original network'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec,
            categorical_q_network=q_net, optimizer=None)

    # Explicitly share weights between q and target networks; this is ok.
    q_target_net = KerasLayersNet(self._time_step_spec.observation,
                                  self._action_spec, dense_layer)
    categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        categorical_q_network=q_net, optimizer=None,
        target_categorical_q_network=q_target_net)

    q_bad_target_net = KerasLayersNet(self._time_step_spec.observation,
                                      self._action_spec, dense_layer,
                                      num_atoms=3)
    with self.assertRaisesRegexp(ValueError,
                                 'have different numbers of atoms'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec,
            categorical_q_network=q_net, optimizer=None,
            target_categorical_q_network=q_bad_target_net)

def testCreateAgentDimChecks(self):
    action_spec = tensor_spec.BoundedTensorSpec([1, 2], tf.int32, 0, 1)
    with self.assertRaisesRegex(ValueError, 'Only scalar actions'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, action_spec,
            self._dummy_categorical_net, self._optimizer)

def testInitialize(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._categorical_net, self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)
    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_time_steps = ts.transition(observations, rewards, discounts)
    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    loss_info = agent._loss(experience)
    initialize = agent.initialize()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    losses = self.evaluate(loss_info).loss
    self.assertGreater(losses, 0.0)

    critic_variables = agent._q_network.variables
    target_critic_variables = agent._target_q_network.variables
    self.assertTrue(critic_variables)
    self.assertTrue(target_critic_variables)
    self.evaluate(initialize)
    for s, t in zip(critic_variables, target_critic_variables):
        self.assertAllClose(self.evaluate(s), self.evaluate(t))

def testTrain(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._dummy_categorical_net, self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)
    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)
    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    train_step = agent.train(experience, weights=None)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time.
    expected_loss = 2.19525
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss, _ = self.evaluate(train_step)
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)

def testCriticLossNStep(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._dummy_categorical_net, self._optimizer,
        n_step_update=2)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)
    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)
    third_observations = tf.constant([[9, 10], [11, 12]], dtype=tf.float32)
    third_time_steps = ts.transition(third_observations, rewards, discounts)

    experience1 = trajectory.from_transition(
        time_steps, action_steps, next_time_steps)
    experience2 = trajectory.from_transition(
        next_time_steps, action_steps, third_time_steps)
    experience3 = trajectory.from_transition(
        third_time_steps, action_steps, third_time_steps)
    experience = tf.nest.map_structure(
        lambda x, y, z: tf.stack([x, y, z], axis=1),
        experience1, experience2, experience3)

    loss_info = agent._loss(experience)

    # discounted_returns should evaluate to 10 + 0.9 * 10 = 19 and
    # 20 + 0.9 * 20 = 38.
    evaluated_discounted_returns = self.evaluate(agent._discounted_returns)
    self.assertAllClose(evaluated_discounted_returns, [[19], [38]], atol=1e-4)

    # Both final_value_discount values should be 0.9 * 0.9 = 0.81.
    evaluated_final_value_discount = self.evaluate(agent._final_value_discount)
    self.assertAllClose(evaluated_final_value_discount, [[0.81], [0.81]],
                        atol=1e-4)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time.
    expected_loss = 2.19525
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss = self.evaluate(loss_info).loss
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)

def testCreateAgentNestSizeChecks(self):
    action_spec = [
        tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
    ]
    with self.assertRaisesRegexp(ValueError,
                                 '.*Only one dimensional actions.*'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, action_spec,
            self._dummy_categorical_net, self._optimizer)

def testTrainWithRnn(self):
    action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
    actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.int32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=[observations])

    experience = trajectory.Trajectory(
        step_type=time_steps.step_type,
        observation=observations,
        action=actions,
        policy_info=(),
        next_step_type=time_steps.step_type,
        reward=time_steps.reward,
        discount=time_steps.discount)

    categorical_q_rnn_network = DummyCategoricalQRnnNetwork(
        self._obs_spec,
        action_spec,
        conv_layer_params=None,
        input_fc_layer_params=(16,),
        preprocessing_combiner=None,
        lstm_size=(40,),
        output_fc_layer_params=(16,),
    )

    counter = common.create_variable('test_train_counter')

    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        action_spec,
        categorical_q_rnn_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
    )

    # Force variable creation.
    agent.policy.variables()

    if tf.executing_eagerly():
        loss = lambda: agent.train(experience)
    else:
        loss = agent.train(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual(self.evaluate(counter), 0)
    self.evaluate(loss)

def testPolicy(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._categorical_net, self._optimizer)
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions, _, _ = agent.policy.action(time_steps)
    self.assertEqual(actions.shape, [2])
    self.evaluate(tf.compat.v1.global_variables_initializer())
    actions_ = self.evaluate(actions)
    self.assertTrue(all(actions_ <= self._action_spec.maximum))
    self.assertTrue(all(actions_ >= self._action_spec.minimum))

def testCreateAgentWithPrebuiltPreprocessingLayersDiffAtoms(self):
    dense_layer = tf.keras.layers.Dense(3)
    q_net = KerasLayersNet(self._time_step_spec.observation,
                           self._action_spec, dense_layer)
    dense_layer_target = tf.keras.layers.Dense(3)
    q_bad_target_net = KerasLayersNet(self._time_step_spec.observation,
                                      self._action_spec, dense_layer_target,
                                      num_atoms=3)
    with self.assertRaisesRegexp(ValueError,
                                 'have different numbers of atoms'):
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec,
            categorical_q_network=q_net, optimizer=None,
            target_categorical_q_network=q_bad_target_net)

def testCriticLossWithMaskedActions(self):
    # Observations are now a tuple of the usual observation and an action
    # mask.
    observation_spec_with_mask = (
        self._obs_spec,
        tensor_spec.BoundedTensorSpec([2], tf.int32, 0, 1))
    time_step_spec = ts.time_step_spec(observation_spec_with_mask)
    dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        time_step_spec,
        self._action_spec,
        dummy_categorical_net,
        self._optimizer,
        observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

    # For `observations`, the masks are set up so that only one action is
    # valid for each element in the batch.
    observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                    tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # For `next_observations`, the masks are set up so the opposite actions
    # as before are valid.
    next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                         tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
    next_time_steps = ts.transition(next_observations, rewards, discounts)
    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time. Note this is different from the loss
    # in testCriticLoss above due to previously optimal actions being masked
    # out.
    expected_loss = 5.062895
    loss_info = agent._loss(experience)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss = self.evaluate(loss_info).loss
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)

def testUpdateTarget(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._categorical_net, self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)
    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, time_steps)

    loss_info = agent._loss(experience)
    update_targets = agent._update_target()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    losses = self.evaluate(loss_info).loss
    self.assertGreater(losses, 0.0)
    self.evaluate(update_targets)

def load_agents_and_create_videos(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=NUM_ITERATIONS,
        max_ep_steps=1000,
        train_sequence_length=1,
        # Params for QNetwork
        fc_layer_params=(100,),
        # Params for QRnnNetwork
        input_fc_layer_params=(50,),
        lstm_size=(20,),
        output_fc_layer_params=(20,),
        # Params for collect
        initial_collect_steps=10000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        num_atoms=51,
        min_q_value=-20,
        max_q_value=20,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        use_tf_functions=True,
        # Params for eval
        num_eval_episodes=10,
        num_random_episodes=1,
        eval_interval=1000,
        # Params for checkpoints
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        rb_checkpoint_interval=20000,
        # Params for summaries and logging
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None,
        random_metrics_callback=None):

    # Define the directories to read from.
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    random_dir = os.path.join(root_dir, 'random')

    # Match the writers and metrics used in training.
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    random_summary_writer = tf.compat.v2.summary.create_file_writer(
        random_dir, flush_millis=summaries_flush_secs * 1000)
    random_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Match the environments used in training.
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load(env_name, max_episode_steps=max_ep_steps))
    eval_py_env = suite_gym.load(env_name, max_episode_steps=max_ep_steps)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Match the agent used in training.
    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)
    tf_agent.initialize()

    train_metrics = [
        # tf_metrics.NumberOfEpisodes(),
        # tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_steps=collect_steps_per_iteration)

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=eval_policy,
        global_step=global_step)
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer)

    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()

    if use_tf_functions:
        # To speed up collect use common.function.
        collect_driver.run = common.function(collect_driver.run)
        tf_agent.train = common.function(tf_agent.train)

    random_policy = random_tf_policy.RandomTFPolicy(
        eval_tf_env.time_step_spec(), eval_tf_env.action_spec())

    # Make movies of the trained agent and a random agent.
    date_string = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')

    # Use the restored policy to generate the video of the trained agent.
    trained_filename = 'trainedC51_' + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, tf_agent.policy,
                             trained_filename)

    # And create one with a random agent for comparison.
    random_filename = 'random_' + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, random_policy,
                             random_filename)

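# Example usage (a minimal sketch, not from the original script): assuming a
# previous training run wrote checkpoints under `<root_dir>/train`, the
# function above can be called directly to render videos of the restored and
# random policies. The root directory value here is hypothetical.
if __name__ == '__main__':
    load_agents_and_create_videos(
        root_dir=os.path.expanduser('~/c51_cartpole'),
        env_name='CartPole-v0',
        num_eval_episodes=10)
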
def main(argv):
    tf.compat.v1.enable_v2_behavior()
    logging.config.dictConfig({
        'version': 1,
        # Other configs ...
        'disable_existing_loggers': True
    })
    argv = argv[0]
    evaluate = argv.eval

    # Mostly copied from
    # https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

    # Hyperparameters
    num_iterations = argv.num_iterations
    collect_steps_per_iteration = argv.collect_steps_per_iteration
    replay_buffer_max_length = 100000
    batch_size = argv.batch_size
    learning_rate = 2.5e-5
    log_interval = argv.log_interval
    num_atoms = argv.num_atoms
    min_q_value = argv.min_q_value
    max_q_value = argv.max_q_value
    n_step_update = argv.n_step_update
    gamma = 0.99
    num_eval_episodes = 10
    eval_interval = argv.eval_interval
    save_interval = argv.save_interval
    n_parallels = argv.n_parallels
    train_in_browser = argv.train_in_browser

    # Environment
    train_py_env = Env2048(evaluate) if evaluate else ParallelPyEnvironment(
        [lambda: Env2048(train_in_browser)] * n_parallels,
        start_serially=False)
    eval_py_env = Env2048(evaluate)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Agent
    fc_layer_params = (64, 64, 32)
    conv_layer_params = ((512, (2, 1), (1, 1)), (512, (1, 2), (1, 1)))
    preprocessing_layers = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(512, (1, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (2, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (1, 2), (1, 1), padding='same'),
        tf.keras.layers.Flatten()
    ])
    preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # q_net = q_network.QNetwork(
    #     train_env.observation_spec(),
    #     train_env.action_spec(),
    #     fc_layer_params=fc_layer_params)
    # agent = dqn_agent.DqnAgent(
    #     train_env.time_step_spec(),
    #     train_env.action_spec(),
    #     q_network=q_net,
    #     optimizer=optimizer,
    #     td_errors_loss_fn=common.element_wise_squared_loss,
    #     train_step_counter=global_step)
    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params,
        # conv_layer_params=conv_layer_params,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner)
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)
    agent.initialize()

    # Replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_max_length)

    # Data Collection
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)
    collect_driver.run()

    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # Checkpointer
    checkpoint_dir = os.path.join(os.getcwd(), 'checkpoint')
    train_checkpointer = common.Checkpointer(
        ckpt_dir=checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        replay_buffer=replay_buffer,
        global_step=global_step)
    train_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()

    # Training
    if evaluate:
        avg_return, best_eval_score = compute_avg_return(
            eval_env, agent.policy, num_eval_episodes)
        print(f"Average return: {avg_return}, best score = {best_eval_score}")
        train_env.station.shutdown()
        eval_env.station.shutdown()
    else:
        agent.train = common.function(agent.train)
        # agent.train_step_counter.assign(0)
        avg_return = compute_avg_return(eval_env, agent.policy,
                                        num_eval_episodes)
        returns = [avg_return]
        t = trange(global_step.numpy(), num_iterations, leave=True)
        best_scores = np.array(
            list(map(lambda env: env.best_score, train_env.envs)))

        for _ in t:
            # Collect a few steps using collect_policy and save to the
            # replay buffer.
            collect_driver.run()

            # Sample a batch of data from the buffer and update the agent's
            # network.
            experience, unused_info = next(iterator)
            train_loss = agent.train(experience).loss
            scores = list(map(lambda env: env.score, train_env.envs))
            t.set_description(desc=f"Scores = {scores}")
            step = tf.compat.v1.train.get_global_step().numpy()

            if step % log_interval == 0:
                t.write(f"step = {step}: loss = {train_loss}")
            if step % save_interval == 0:
                train_checkpointer.save(step)
            if step % eval_interval == 0:
                avg_return, best_eval_score = compute_avg_return(
                    eval_env, agent.policy, num_eval_episodes)
                new_best_scores = np.array(
                    list(map(lambda env: env.best_score, train_env.envs)))
                diff = np.subtract(new_best_scores, best_scores)
                best_scores = new_best_scores
                if np.count_nonzero(diff) > 0:
                    t.write(f"step = {step}: Best scores = {best_scores}")
                t.write(
                    f'step = {step}: Average Return = {avg_return}, '
                    f'best score reached in training = '
                    f'{max(list(map(lambda env: env.best_score, train_env.envs)))}'
                    f', best score in eval = {best_eval_score}')
                returns.append(avg_return)

        steps = range(0, num_iterations + 1, eval_interval)
        plt.plot(steps, returns)
        plt.ylabel('Average Return')
        plt.xlabel('Step')

        train_env.close()
        eval_env.close()
        train_py_env.close()

def testCreateAgentDefaultNetwork(self):
    categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec, self._action_spec,
        self._categorical_net, self._optimizer)

    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
    centered=True)

# Computes epsilon for epsilon greedy policy given the training step.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε

agent = categorical_dqn_agent.CategoricalDqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=int(min_q_value),
    max_q_value=int(max_q_value),
    n_step_update=int(n_step_update),
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=discount_factor,
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

# Speed up as a TensorFlow function.
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
def __init__(
        self,
        root_dir,
        env_name,
        num_iterations=200,
        max_episode_frames=108000,  # ALE frames
        terminal_on_life_loss=False,
        conv_layer_params=((32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)),
        fc_layer_params=(512,),
        # Params for collect
        initial_collect_steps=80000,  # ALE frames
        epsilon_greedy=0.01,
        epsilon_decay_period=1000000,  # ALE frames
        replay_buffer_capacity=1000000,
        # Params for train
        train_steps_per_iteration=1000000,  # ALE frames
        update_period=16,  # ALE frames
        target_update_tau=1.0,
        target_update_period=32000,  # ALE frames
        batch_size=32,
        learning_rate=2.5e-4,
        n_step_update=2,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        # Params for eval
        do_eval=True,
        eval_steps_per_iteration=500000,  # ALE frames
        eval_epsilon_greedy=0.001,
        # Params for checkpoints, summaries, and logging
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        eval_metrics_callback=None):
    """A simple Atari train and eval for DQN.

    Args:
      root_dir: Directory to write log files to.
      env_name: Fully-qualified name of the Atari environment (i.e. Pong-v0).
      num_iterations: Number of train/eval iterations to run.
      max_episode_frames: Maximum length of a single episode, in ALE frames.
      terminal_on_life_loss: Whether to simulate an episode termination when a
        life is lost.
      conv_layer_params: Params for convolutional layers of QNetwork.
      fc_layer_params: Params for fully connected layers of QNetwork.
      initial_collect_steps: Number of ALE frames to process before beginning
        to train. Since this is in ALE frames, there will be
        initial_collect_steps/4 items in the replay buffer when training
        starts.
      epsilon_greedy: Final epsilon value to decay to for training.
      epsilon_decay_period: Period over which to decay epsilon, from 1.0 to
        epsilon_greedy (defined above).
      replay_buffer_capacity: Maximum number of items to store in the replay
        buffer.
      train_steps_per_iteration: Number of ALE frames to run through for each
        iteration of training.
      update_period: Run a train operation every update_period ALE frames.
      target_update_tau: Coefficient for soft target network updates (1.0 ==
        hard updates).
      target_update_period: Period, in ALE frames, to copy the live network to
        the target network.
      batch_size: Number of frames to include in each training batch.
      learning_rate: RMS optimizer learning rate.
      n_step_update: The number of steps to consider when computing TD error
        and TD loss. Applies standard single-step updates when set to 1.
      gamma: Discount for future rewards.
      reward_scale_factor: Scaling factor for rewards.
      gradient_clipping: Norm length to clip gradients.
      do_eval: If True, run an eval every iteration. If False, skip eval.
      eval_steps_per_iteration: Number of ALE frames to run through for each
        iteration of evaluation.
      eval_epsilon_greedy: Epsilon value to use for the evaluation policy
        (0 == totally greedy policy).
      log_interval: Log stats to the terminal every log_interval training
        steps.
      summary_interval: Write TF summaries every summary_interval training
        steps.
      summaries_flush_secs: Flush summaries to disk every summaries_flush_secs
        seconds.
      debug_summaries: If True, write additional summaries for debugging (see
        dqn_agent for which summaries are written).
      summarize_grads_and_vars: Include gradients in summaries.
      eval_metrics_callback: A callback function that takes (metric_dict,
        global_step) as parameters. Called after every eval with the results
        of the evaluation.
    """
    self._update_period = update_period / ATARI_FRAME_SKIP
    self._train_steps_per_iteration = (
        train_steps_per_iteration / ATARI_FRAME_SKIP)
    self._do_eval = do_eval
    self._eval_steps_per_iteration = (
        eval_steps_per_iteration / ATARI_FRAME_SKIP)
    self._eval_epsilon_greedy = eval_epsilon_greedy
    self._initial_collect_steps = initial_collect_steps / ATARI_FRAME_SKIP
    self._summary_interval = summary_interval
    self._num_iterations = num_iterations
    self._log_interval = log_interval
    self._eval_metrics_callback = eval_metrics_callback

    with gin.unlock_config():
        gin.bind_parameter(('tf_agents.environments.atari_preprocessing.'
                            'AtariPreprocessing.terminal_on_life_loss'),
                           terminal_on_life_loss)

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()
    self._train_summary_writer = train_summary_writer

    self._eval_summary_writer = None
    if self._do_eval:
        self._eval_summary_writer = tf.compat.v2.summary.create_file_writer(
            eval_dir, flush_millis=summaries_flush_secs * 1000)
        self._eval_metrics = [
            py_metrics.AverageReturnMetric(
                name='PhaseAverageReturn', buffer_size=np.inf),
            py_metrics.AverageEpisodeLengthMetric(
                name='PhaseAverageEpisodeLength', buffer_size=np.inf),
        ]

    self._global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(lambda: tf.math.equal(
            self._global_step % self._summary_interval, 0)):
        self._env = suite_atari.load(
            env_name,
            max_episode_steps=max_episode_frames / ATARI_FRAME_SKIP,
            gym_env_wrappers=suite_atari.
            DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)
        self._env = batched_py_environment.BatchedPyEnvironment([self._env])

        observation_spec = tensor_spec.from_spec(self._env.observation_spec())
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = tensor_spec.from_spec(self._env.action_spec())

        with tf.device('/cpu:0'):
            epsilon = tf.compat.v1.train.polynomial_decay(
                1.0,
                self._global_step,
                epsilon_decay_period / ATARI_FRAME_SKIP / self._update_period,
                end_learning_rate=epsilon_greedy)

        with tf.device('/gpu:0'):
            optimizer = tf.compat.v1.train.RMSPropOptimizer(
                learning_rate=learning_rate,
                decay=0.95,
                momentum=0.0,
                epsilon=0.00001,
                centered=True)
            categorical_q_net = AtariCategoricalQNetwork(
                observation_spec,
                action_spec,
                conv_layer_params=conv_layer_params,
                fc_layer_params=fc_layer_params)
            agent = categorical_dqn_agent.CategoricalDqnAgent(
                time_step_spec,
                action_spec,
                categorical_q_network=categorical_q_net,
                optimizer=optimizer,
                epsilon_greedy=epsilon,
                n_step_update=n_step_update,
                target_update_tau=target_update_tau,
                target_update_period=(target_update_period /
                                      ATARI_FRAME_SKIP / self._update_period),
                gamma=gamma,
                reward_scale_factor=reward_scale_factor,
                gradient_clipping=gradient_clipping,
                debug_summaries=debug_summaries,
                summarize_grads_and_vars=summarize_grads_and_vars,
                train_step_counter=self._global_step)

            self._collect_policy = py_tf_policy.PyTFPolicy(
                agent.collect_policy)

            if self._do_eval:
                self._eval_policy = py_tf_policy.PyTFPolicy(
                    epsilon_greedy_policy.EpsilonGreedyPolicy(
                        policy=agent.policy,
                        epsilon=self._eval_epsilon_greedy))

            py_observation_spec = self._env.observation_spec()
            py_time_step_spec = ts.time_step_spec(py_observation_spec)
            py_action_spec = policy_step.PolicyStep(self._env.action_spec())
            data_spec = trajectory.from_transition(
                py_time_step_spec, py_action_spec, py_time_step_spec)
            self._replay_buffer = py_hashed_replay_buffer.PyHashedReplayBuffer(
                data_spec=data_spec, capacity=replay_buffer_capacity)

        with tf.device('/cpu:0'):
            ds = self._replay_buffer.as_dataset(
                sample_batch_size=batch_size, num_steps=n_step_update + 1)
            ds = ds.prefetch(4)
            ds = ds.apply(tf.data.experimental.prefetch_to_device('/gpu:0'))

        with tf.device('/gpu:0'):
            self._ds_itr = tf.compat.v1.data.make_one_shot_iterator(ds)
            experience = self._ds_itr.get_next()
            self._train_op = agent.train(experience)

            self._env_steps_metric = py_metrics.EnvironmentSteps()
            self._step_metrics = [
                py_metrics.NumberOfEpisodes(),
                self._env_steps_metric,
            ]
            self._train_metrics = self._step_metrics + [
                py_metrics.AverageReturnMetric(buffer_size=10),
                py_metrics.AverageEpisodeLengthMetric(buffer_size=10),
            ]
            # The _train_phase_metrics average over an entire train iteration,
            # rather than the rolling average of the last 10 episodes.
            self._train_phase_metrics = [
                py_metrics.AverageReturnMetric(
                    name='PhaseAverageReturn', buffer_size=np.inf),
                py_metrics.AverageEpisodeLengthMetric(
                    name='PhaseAverageEpisodeLength', buffer_size=np.inf),
            ]
            self._iteration_metric = py_metrics.CounterMetric(
                name='Iteration')

            # Summaries written from python should run every time they are
            # generated.
            with tf.compat.v2.summary.record_if(True):
                self._steps_per_second_ph = tf.compat.v1.placeholder(
                    tf.float32, shape=(), name='steps_per_sec_ph')
                self._steps_per_second_summary = tf.compat.v2.summary.scalar(
                    name='global_steps_per_sec',
                    data=self._steps_per_second_ph,
                    step=self._global_step)

                for metric in self._train_metrics:
                    metric.tf_summaries(train_step=self._global_step,
                                        step_metrics=self._step_metrics)

                for metric in self._train_phase_metrics:
                    metric.tf_summaries(
                        train_step=self._global_step,
                        step_metrics=(self._iteration_metric,))
                self._iteration_metric.tf_summaries(
                    train_step=self._global_step)

                if self._do_eval:
                    with self._eval_summary_writer.as_default():
                        for metric in self._eval_metrics:
                            metric.tf_summaries(
                                train_step=self._global_step,
                                step_metrics=(self._iteration_metric,))

            self._train_checkpointer = common.Checkpointer(
                ckpt_dir=train_dir,
                agent=agent,
                global_step=self._global_step,
                optimizer=optimizer,
                metrics=metric_utils.MetricsGroup(
                    self._train_metrics + self._train_phase_metrics +
                    [self._iteration_metric], 'train_metrics'))
            self._policy_checkpointer = common.Checkpointer(
                ckpt_dir=os.path.join(train_dir, 'policy'),
                policy=agent.policy,
                global_step=self._global_step)
            self._rb_checkpointer = common.Checkpointer(
                ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
                max_to_keep=1,
                replay_buffer=self._replay_buffer)

            self._init_agent_op = agent.initialize()

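# For reference, a small sketch (not part of the trainer above) of how the
# ALE-frame hyperparameters translate into agent/train steps, assuming the
# standard Atari frame skip of 4 implied by the docstring
# ("initial_collect_steps/4 items"), i.e. ATARI_FRAME_SKIP = 4:
ATARI_FRAME_SKIP = 4
update_period = 16 / ATARI_FRAME_SKIP           # train once every 4 agent steps
initial_collect = 80000 / ATARI_FRAME_SKIP      # 20000 transitions before training
target_update = 32000 / ATARI_FRAME_SKIP / update_period  # target copy every 2000 train steps
print(update_period, initial_collect, target_update)  # 4.0 20000.0 2000.0
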
def create_agent(
        agent_class,
        environment,
        fc_layer_params,
        learning_rate,
        decaying_epsilon,
        n_step_update,
        target_update_tau,
        target_update_period,
        gamma,
        reward_scale_factor,
        gradient_clipping,
        debug_summaries,
        summarize_grads_and_vars,
        train_step_counter,
        num_atoms=None,  # Only for categorical_dqn
        min_q_value=None,  # Only for categorical_dqn
        max_q_value=None,  # Only for categorical_dqn
):
    """Creates the Hanabi agent.

    Args:
      agent_class: str, type of agent to construct.
      environment: The environment.
      fc_layer_params: Sizes of the fully connected layers of the Q network.
      learning_rate: The learning rate.
      decaying_epsilon: Epsilon for the epsilon-greedy policy.
      n_step_update: Number of steps to consider when computing the TD loss.
      target_update_tau: Agent parameter.
      target_update_period: Agent parameter.
      gamma: Agent parameter.
      reward_scale_factor: Agent parameter.
      gradient_clipping: Agent parameter.
      debug_summaries: Agent parameter.
      summarize_grads_and_vars: Agent parameter.
      train_step_counter: The train step tf.Variable to be passed to the agent.
      num_atoms: Number of distribution atoms (categorical_dqn only).
      min_q_value: Minimum Q-value of the support (categorical_dqn only).
      max_q_value: Maximum Q-value of the support (categorical_dqn only).

    Returns:
      An agent for playing Hanabi.

    Raises:
      ValueError: if an unknown agent type is requested.
    """
    if agent_class == 'DQN':
        return dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'DDQN':
        return dqn_agent.DdqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'categorical_dqn':
        return categorical_dqn_agent.CategoricalDqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            categorical_q_network=categorical_q_network.CategoricalQNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                num_atoms=num_atoms,
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            min_q_value=min_q_value,
            max_q_value=max_q_value,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    else:
        raise ValueError(
            'Expected valid agent_type, got {}'.format(agent_class))

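# Example call (a hypothetical sketch, not from the original source): building
# a C51 agent for a Hanabi environment `env` with a train step counter. The
# concrete hyperparameter values below are illustrative only.
train_step_counter = tf.Variable(0, dtype=tf.int64)
tf_agent = create_agent(
    agent_class='categorical_dqn',
    environment=env,
    fc_layer_params=(512, 512),
    learning_rate=1e-4,
    decaying_epsilon=0.02,
    n_step_update=1,
    target_update_tau=1.0,
    target_update_period=500,
    gamma=0.99,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=train_step_counter,
    num_atoms=51,
    min_q_value=-25,
    max_q_value=25)
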
def main(arg, pars):
    """ """
    print("load env ..")
    env_name = "Car-v0"
    # env = gym.make("Car-v0")
    env = suite_gym.load(env_name, discount=arg.gamma,
                         max_episode_steps=arg.max_t)
    print_parameter(arg, pars)

    train_py_env = suite_gym.load(env_name, discount=arg.gamma,
                                  max_episode_steps=arg.max_t)
    eval_py_env = suite_gym.load(env_name, discount=arg.gamma,
                                 max_episode_steps=arg.max_t)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    print("env loaded")

    train_dir = os.path.join(arg.root_dir, 'network_weights')
    eval_dir = os.path.join(arg.root_dir, 'eval')

    train_env.reset()
    fc_layer_params = (arg.hidden_size_1,)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=arg.lr)
    train_step_counter = tf.compat.v2.Variable(0)

    categorical_q_net = CategoricalQNetwork(train_env.observation_spec(),
                                            train_env.action_spec(),
                                            fc_layer_params=fc_layer_params)
    # Name the agent `tf_agent` so the references below stay consistent.
    tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        epsilon_greedy=arg.eps_start)

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]
    global_step = tf.compat.v1.train.get_or_create_global_step()
    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))

    if not arg.continue_training:
        tf_agent.initialize()
        if os.path.exists("network_weights/*"):
            os.remove("network_weights/*")
    else:
        print("Continue Training")
        train_checkpointer.initialize_or_restore()

    print("ready to go")
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=arg.buffer_size)

    collect_data(train_env, random_policy, replay_buffer,
                 steps=arg.learn_start, max_t=40)

    print("create dataset")
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=arg.batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # (Optional) Optimize by wrapping some of the code in a graph using
    # TF function.
    tf_agent.train = common.function(tf_agent.train)

    # Reset the train step.
    tf_agent.train_step_counter.assign(0)

    avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                    arg.num_eval_episodes)
    returns = [avg_return]
    returns_average = [avg_return]
    train_loss_average = [1]
    score = 0
    scores_window = deque(maxlen=100)     # last 100 scores
    total_train_loss = deque(maxlen=100)  # last 100 train losses

    train(arg, tf_agent, train_env, eval_env, replay_buffer, iterator,
          train_checkpointer)

categorical_q_net = categorical_q_network.CategoricalQNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    num_atoms=num_atoms,
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=train_step_counter)
agent.initialize()

# Set up the policies.
eval_policy = agent.policy            # The main policy, used for evaluation and deployment.
collect_policy = agent.collect_policy  # A second policy, used for data collection.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# Data Collection
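# A minimal sketch of the data-collection step that typically follows (this
# continuation is assumed, not part of the original snippet): a
# DynamicStepDriver feeds a TFUniformReplayBuffer using `collect_policy`, and
# training samples n_step_update + 1 step transitions from it. The names
# `replay_buffer_capacity`, `collect_steps_per_iteration`, and `batch_size`
# are assumed to be defined with the other hyperparameters above.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)
iterator = iter(dataset)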