def testMasking(self):
  batch_size = 1000
  num_state_dims = 5
  num_actions = 8
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_step = ts.restart(observations, batch_size=batch_size)
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec(
      [1], tf.int32, 0, num_actions - 1)

  mask = [0, 1, 0, 1, 0, 0, 1, 0]
  np_mask = np.array(mask)
  tf_mask = tf.constant([mask for _ in range(batch_size)])
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=input_tensor_spec,
      action_spec=action_spec,
      num_atoms=3,
      mask_split_fn=lambda observation: (observation, tf_mask),
      fc_layer_params=[4])
  policy = categorical_q_policy.CategoricalQPolicy(
      self._min_q_value, self._max_q_value, q_network, action_spec)

  # Force creation of variables before global_variables_initializer.
  policy.variables()
  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Sample a batch of 1000 actions from the policy and ensure that invalid
  # actions are never chosen.
  action_step = policy.action(time_step)
  action = self.evaluate(action_step.action)
  self.assertEqual(action.shape, (batch_size,))
  self.assertAllEqual(np_mask[action], np.ones([batch_size]))
def build_categorical_dqn_agent(self):
  """Build a categorical DQN agent with a CategoricalQNetwork."""
  temp_env = self.build_temp_env()
  if self.dropout_layer_params is not None:
    raise AttributeError(
        'CategoricalQNetwork does not accept dropout layers.')
  q_net = categorical_q_network.CategoricalQNetwork(
      temp_env.observation_spec(),
      temp_env.action_spec(),
      fc_layer_params=self.fc_layer_params)
  optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
  agent = CategoricalDqnAgent(
      temp_env.time_step_spec(),
      temp_env.action_spec(),
      n_step_update=self.n_step_update,
      categorical_q_network=q_net,
      optimizer=optimizer,
      min_q_value=0.0,
      max_q_value=3.0,
      epsilon_greedy=self.epsilon_greedy,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=tf.Variable(0, dtype=tf.int64))
  return q_net, agent
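A minimal usage sketch for the helper above, assuming the surrounding harness defines the attributes it reads (build_temp_env, fc_layer_params, learning_rate, and so on):

# Hypothetical call site inside the harness:
q_net, agent = self.build_categorical_dqn_agent()
agent.initialize()  # create the agent's variables before training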
def testCorrectOutputShape(self):
  batch_size = 3
  num_state_dims = 5
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  num_actions = action_spec.maximum - action_spec.minimum + 1
  self.assertEqual(num_actions, 2)
  observations_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_steps = ts.restart(observations, batch_size)
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=observations_spec,
      action_spec=action_spec,
      fc_layer_params=[3])
  logits, _ = q_network(time_steps.observation)
  self.assertAllEqual(logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])

  self.evaluate(tf.compat.v1.global_variables_initializer())
  eval_logits = self.evaluate(logits)
  self.assertAllEqual(eval_logits.shape,
                      [batch_size, num_actions, q_network._num_atoms])
def __init__(self, input_tensor_spec, action_spec, **kwargs):
  super(AtariCategoricalQNetwork, self).__init__(
      input_tensor_spec, state_spec=())
  input_tensor_spec = tf.TensorSpec(
      dtype=tf.float32, shape=input_tensor_spec.shape)
  self._categorical_q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec, action_spec, **kwargs)
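The constructor above only stores the wrapped network. A hedged sketch of the companion methods that would live in the same AtariCategoricalQNetwork class, assuming the usual Atari convention of uint8 frames normalized at call time:

# Sketch only: plausible companion methods for the wrapper above; not
# confirmed by the snippet itself.
@property
def num_atoms(self):
  return self._categorical_q_network.num_atoms

def call(self, observation, step_type=None, network_state=()):
  # Cast uint8 pixels to float and scale to [0, 1] before the wrapped
  # network; storing frames as uint8s keeps the replay buffer small.
  state = tf.cast(observation, tf.float32) / 255
  return self._categorical_q_network(
      state, step_type=step_type, network_state=network_state)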
def testGinConfig(self):
  batch_size = 3
  num_state_dims = 5
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  num_actions = action_spec.maximum - action_spec.minimum + 1
  self.assertEqual(num_actions, 2)
  observations_spec = tensor_spec.TensorSpec([3, 3, num_state_dims],
                                             tf.float32)
  observations = tf.random.uniform([batch_size, 3, 3, num_state_dims])
  next_observations = tf.random.uniform([batch_size, 3, 3, num_state_dims])
  time_steps = ts.restart(observations, batch_size)
  next_time_steps = ts.restart(next_observations, batch_size)

  gin.parse_config("""
      CategoricalQNetwork.conv_layer_params = [(16, 2, 1), (15, 2, 1)]
      CategoricalQNetwork.fc_layer_params = [4, 3, 5]
  """)

  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=observations_spec,
      action_spec=action_spec)
  logits, _ = q_network(time_steps.observation)
  next_logits, _ = q_network(next_time_steps.observation)
  self.assertAllEqual(logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])
  self.assertAllEqual(next_logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])

  # This time there are six layers: two conv layers, three fc layers, and
  # one final logits layer, for 12 trainable_variables in total.
  self.assertLen(q_network.trainable_variables, 12)
def testBuild(self):
  batch_size = 3
  num_state_dims = 5
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  num_actions = action_spec.maximum - action_spec.minimum + 1
  self.assertEqual(num_actions, 2)
  observations_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_steps = ts.restart(observations, batch_size)
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=observations_spec,
      action_spec=action_spec,
      fc_layer_params=[3])
  logits, _ = q_network(time_steps.observation)
  self.assertAllEqual(logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])

  # There are two trainable layers here: the specified fc_layer and the
  # final logits layer. Each layer has two trainable_variables (kernel and
  # bias), for a total of 4.
  self.assertLen(q_network.trainable_variables, 4)
def testMasking(self):
  batch_size = 1000
  num_state_dims = 5
  num_actions = 8
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_step = ts.restart(observations, batch_size=batch_size)
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec(
      [1], tf.int32, 0, num_actions - 1)

  # We create a fixed mask here for testing purposes. Normally the mask
  # would be part of the observation.
  mask = [0, 1, 0, 1, 0, 0, 1, 0]
  np_mask = np.array(mask)
  tf_mask = tf.constant([mask for _ in range(batch_size)])
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=input_tensor_spec,
      action_spec=action_spec,
      num_atoms=3,
      fc_layer_params=[4])
  policy = categorical_q_policy.CategoricalQPolicy(
      self._time_step_spec,
      action_spec,
      q_network,
      self._min_q_value,
      self._max_q_value,
      observation_and_action_constraint_splitter=(
          lambda observation: (observation, tf_mask)))

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # Sample a batch of 1000 actions from the policy and ensure that actions
  # considered invalid according to the mask are never chosen.
  action_step = policy.action(time_step)
  action = self.evaluate(action_step.action)
  self.assertEqual(action.shape, (batch_size,))
  self.assertAllEqual(np_mask[action], np.ones([batch_size]))
def setUp(self):
  super(CategoricalDqnAgentTest, self).setUp()
  tf.compat.v1.enable_resource_variables()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 1)
  self._categorical_net = categorical_q_network.CategoricalQNetwork(
      self._obs_spec, self._action_spec, fc_layer_params=[4])
  self._dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
  self._optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.01)
def testMultipleActionsRaiseError(self):
  with self.assertRaisesRegexp(
      TypeError, '.*action_spec must be a BoundedTensorSpec.*'):
    # Replace the action_spec for this test.
    action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)] * 2
    q_network = categorical_q_network.CategoricalQNetwork(
        input_tensor_spec=self._obs_spec,
        action_spec=action_spec,
        num_atoms=3,
        fc_layer_params=[4])
    categorical_q_policy.CategoricalQPolicy(
        self._time_step_spec,
        action_spec,
        q_network,
        self._min_q_value,
        self._max_q_value)
def testMasking(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 6
  states = tf.random.uniform([batch_size, num_state_dims])
  input_tensor_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  action_spec = tensor_spec.BoundedTensorSpec(
      [1], tf.int32, 0, num_actions - 1)
  mask = tf.constant([[1, 0, 1, 0, 0, 1] for _ in range(batch_size)])
  network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec,
      action_spec,
      mask_split_fn=lambda observation: (observation, mask))
  self.assertIsNotNone(network.mask_split_fn)

  # Run a pass through the network to catch any shape errors.
  network(states)
def testChangeHiddenLayers(self):
  batch_size = 3
  num_state_dims = 5
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  num_actions = action_spec.maximum - action_spec.minimum + 1
  self.assertEqual(num_actions, 2)
  observations_spec = tensor_spec.TensorSpec([num_state_dims], tf.float32)
  observations = tf.random.uniform([batch_size, num_state_dims])
  time_steps = ts.restart(observations, batch_size)
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=observations_spec,
      action_spec=action_spec,
      fc_layer_params=[3, 3])
  logits, _ = q_network(time_steps.observation)
  self.assertAllEqual(logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])

  # This time there is an extra fc layer, for a total of 6
  # trainable_variables.
  self.assertLen(q_network.trainable_variables, 6)
def testAddConvLayers(self):
  batch_size = 3
  num_state_dims = 5
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
  num_actions = action_spec.maximum - action_spec.minimum + 1
  self.assertEqual(num_actions, 2)
  observations_spec = tensor_spec.TensorSpec([3, 3, num_state_dims],
                                             tf.float32)
  observations = tf.random.uniform([batch_size, 3, 3, num_state_dims])
  time_steps = ts.restart(observations, batch_size)
  q_network = categorical_q_network.CategoricalQNetwork(
      input_tensor_spec=observations_spec,
      action_spec=action_spec,
      conv_layer_params=[(16, 2, 1), (15, 2, 1)])
  logits, _ = q_network(time_steps.observation)
  self.assertAllEqual(logits.shape.as_list(),
                      [batch_size, num_actions, q_network._num_atoms])

  # This time there are two conv layers and one final logits layer, for a
  # total of 6 trainable_variables.
  self.assertLen(q_network.trainable_variables, 6)
if frame_stack is not None:
  board_preprocessing = Sequential([
      keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 2.),
      tf.keras.layers.Permute((4, 2, 3, 1)),
      tf.keras.layers.Lambda(lambda x: x[:, 0, :, :, :])
  ])
else:
  board_preprocessing = Sequential([
      keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 2.),
  ])

# Layer params are specified by local variables obtained from the DataFrame.
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=board_preprocessing,
    fc_layer_params=fc_layer_params,
    conv_layer_params=conv_layer_params,
    num_atoms=int(num_atoms))

## ------------------------------------------------------------------------------

# Create a variable that counts the number of training steps.
train_step = tf.Variable(0)

# Create the optimizer.
optimizer = tf.compat.v1.train.RMSPropOptimizer(
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
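    # The original snippet breaks off inside this optimizer call. A plausible
    # continuation, assuming the call simply closes here and the network is
    # then wired into a C51 agent; min_q_value and max_q_value are assumed
    # to be defined elsewhere in the script.
)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    train_step_counter=train_step)
agent.initialize()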
def create_agent(
    agent_class,
    environment,
    fc_layer_params,
    learning_rate,
    decaying_epsilon,
    n_step_update,
    target_update_tau,
    target_update_period,
    gamma,
    reward_scale_factor,
    gradient_clipping,
    debug_summaries,
    summarize_grads_and_vars,
    train_step_counter,
    num_atoms=None,  # Only for categorical_dqn
    min_q_value=None,  # Only for categorical_dqn
    max_q_value=None,  # Only for categorical_dqn
):
  """Creates the Hanabi agent.

  Args:
    agent_class: str, type of agent to construct.
    environment: The environment.
    fc_layer_params: Fully-connected layer sizes for the Q-network.
    learning_rate: The learning rate.
    decaying_epsilon: Epsilon for the epsilon-greedy policy.
    n_step_update: Number of steps for n-step updates.
    target_update_tau: Agent parameter.
    target_update_period: Agent parameter.
    gamma: Agent parameter.
    reward_scale_factor: Agent parameter.
    gradient_clipping: Agent parameter.
    debug_summaries: Agent parameter.
    summarize_grads_and_vars: Agent parameter.
    train_step_counter: The train step tf.Variable to be passed to the agent.
    num_atoms: Number of distribution atoms (categorical_dqn only).
    min_q_value: Minimum Q-value of the support (categorical_dqn only).
    max_q_value: Maximum Q-value of the support (categorical_dqn only).

  Returns:
    An agent for playing Hanabi.

  Raises:
    ValueError: if an unknown agent type is requested.
  """
  if agent_class == 'DQN':
    return dqn_agent.DqnAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        q_network=q_network.QNetwork(
            environment.time_step_spec().observation['observations'],
            environment.action_spec(),
            fc_layer_params=fc_layer_params),
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        epsilon_greedy=decaying_epsilon,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
  elif agent_class == 'DDQN':
    return dqn_agent.DdqnAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        q_network=q_network.QNetwork(
            environment.time_step_spec().observation['observations'],
            environment.action_spec(),
            fc_layer_params=fc_layer_params),
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        epsilon_greedy=decaying_epsilon,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
  elif agent_class == 'categorical_dqn':
    return categorical_dqn_agent.CategoricalDqnAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        categorical_q_network=categorical_q_network.CategoricalQNetwork(
            environment.time_step_spec().observation['observations'],
            environment.action_spec(),
            num_atoms=num_atoms,
            fc_layer_params=fc_layer_params),
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        epsilon_greedy=decaying_epsilon,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
  else:
    raise ValueError(
        'Expected valid agent_class, got {}'.format(agent_class))
          activation='relu'),
      keras.layers.Flatten()
  ])
else:
  board_preprocessing = Sequential([
      keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32)),
      keras.layers.Flatten()
  ])

health_preprocessing = keras.layers.Flatten()

# Layer params are specified by local variables obtained from the DataFrame.
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=(board_preprocessing, health_preprocessing),
    preprocessing_combiner=tf.keras.layers.Concatenate(axis=-1),
    fc_layer_params=fc_layer_params,
    num_atoms=int(num_atoms))

## ------------------------------------------------------------------------------

# Create a variable that counts the number of training steps.
train_step = tf.Variable(0)

# Create the optimizer.
optimizer = tf.compat.v1.train.RMSPropOptimizer(
    learning_rate=optimizer_learning_rate,
    decay=optimizer_decay,
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
def main(argv):
  tf.compat.v1.enable_v2_behavior()
  logging.config.dictConfig({
      'version': 1,
      # Other configs ...
      'disable_existing_loggers': True
  })
  argv = argv[0]
  evaluate = argv.eval

  # Mostly copied from
  # https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

  # Hyperparameters
  num_iterations = argv.num_iterations
  collect_steps_per_iteration = argv.collect_steps_per_iteration
  replay_buffer_max_length = 100000
  batch_size = argv.batch_size
  learning_rate = 2.5e-5
  log_interval = argv.log_interval
  num_atoms = argv.num_atoms
  min_q_value = argv.min_q_value
  max_q_value = argv.max_q_value
  n_step_update = argv.n_step_update
  gamma = 0.99
  num_eval_episodes = 10
  eval_interval = argv.eval_interval
  save_interval = argv.save_interval
  n_parallels = argv.n_parallels
  train_in_browser = argv.train_in_browser

  # Environment
  train_py_env = Env2048(evaluate) if evaluate else ParallelPyEnvironment(
      [lambda: Env2048(train_in_browser)] * n_parallels,
      start_serially=False)
  eval_py_env = Env2048(evaluate)
  train_env = tf_py_environment.TFPyEnvironment(train_py_env)
  eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

  # Agent
  fc_layer_params = (64, 64, 32)
  conv_layer_params = ((512, (2, 1), (1, 1)), (512, (1, 2), (1, 1)))
  preprocessing_layers = tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(512, (1, 1), (1, 1), padding='same'),
      tf.keras.layers.Conv2D(512, (2, 1), (1, 1), padding='same'),
      tf.keras.layers.Conv2D(512, (1, 2), (1, 1), padding='same'),
      tf.keras.layers.Flatten()
  ])
  preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  global_step = tf.compat.v1.train.get_or_create_global_step()

  # q_net = q_network.QNetwork(
  #     train_env.observation_spec(),
  #     train_env.action_spec(),
  #     fc_layer_params=fc_layer_params)
  # agent = dqn_agent.DqnAgent(
  #     train_env.time_step_spec(),
  #     train_env.action_spec(),
  #     q_network=q_net,
  #     optimizer=optimizer,
  #     td_errors_loss_fn=common.element_wise_squared_loss,
  #     train_step_counter=global_step)

  categorical_q_net = categorical_q_network.CategoricalQNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      num_atoms=num_atoms,
      fc_layer_params=fc_layer_params,
      # conv_layer_params=conv_layer_params
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=preprocessing_combiner)
  agent = categorical_dqn_agent.CategoricalDqnAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      categorical_q_network=categorical_q_net,
      optimizer=optimizer,
      min_q_value=min_q_value,
      max_q_value=max_q_value,
      n_step_update=n_step_update,
      td_errors_loss_fn=common.element_wise_squared_loss,
      gamma=gamma,
      train_step_counter=global_step)
  agent.initialize()

  # Replay buffer
  replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=agent.collect_data_spec,
      batch_size=train_env.batch_size,
      max_length=replay_buffer_max_length)

  # Data collection
  collect_driver = dynamic_step_driver.DynamicStepDriver(
      train_env,
      agent.collect_policy,
      observers=[replay_buffer.add_batch],
      num_steps=collect_steps_per_iteration)
  collect_driver.run()
  dataset = replay_buffer.as_dataset(
      num_parallel_calls=3,
      sample_batch_size=batch_size,
      num_steps=2).prefetch(3)
  iterator = iter(dataset)

  # Checkpointer
  checkpoint_dir = os.path.join(os.getcwd(), 'checkpoint')
  train_checkpointer = common.Checkpointer(
      ckpt_dir=checkpoint_dir,
      max_to_keep=1,
      agent=agent,
      policy=agent.policy,
      replay_buffer=replay_buffer,
      global_step=global_step)
  train_checkpointer.initialize_or_restore()
  global_step = tf.compat.v1.train.get_global_step()

  # Training
  if evaluate:
    avg_return, best_eval_score = compute_avg_return(
        eval_env, agent.policy, num_eval_episodes)
    print(f"Average return: {avg_return}, best score = {best_eval_score}")
    train_env.station.shutdown()
    eval_env.station.shutdown()
  else:
    agent.train = common.function(agent.train)
    # agent.train_step_counter.assign(0)
    avg_return = compute_avg_return(eval_env, agent.policy,
                                    num_eval_episodes)
    returns = [avg_return]
    t = trange(global_step.numpy(), num_iterations, leave=True)
    best_scores = np.array(
        list(map(lambda env: env.best_score, train_env.envs)))

    for _ in t:
      # Collect a few steps using collect_policy and save to the replay
      # buffer.
      collect_driver.run()

      # Sample a batch of data from the buffer and update the agent's
      # network.
      experience, unused_info = next(iterator)
      train_loss = agent.train(experience).loss
      scores = list(map(lambda env: env.score, train_env.envs))
      t.set_description(desc=f"Scores = {scores}")

      step = tf.compat.v1.train.get_global_step().numpy()
      if step % log_interval == 0:
        t.write(f"step = {step}: loss = {train_loss}")
      if step % save_interval == 0:
        train_checkpointer.save(step)
      if step % eval_interval == 0:
        avg_return, best_eval_score = compute_avg_return(
            eval_env, agent.policy, num_eval_episodes)
        new_best_scores = np.array(
            list(map(lambda env: env.best_score, train_env.envs)))
        diff = np.subtract(new_best_scores, best_scores)
        best_scores = new_best_scores
        if np.count_nonzero(diff) > 0:
          t.write(f"step = {step}: Best scores = {best_scores}")
        t.write(
            f'step = {step}: Average Return = {avg_return}, '
            f'best score reached in training = '
            f'{max(list(map(lambda env: env.best_score, train_env.envs)))}'
            f', best score in eval = {best_eval_score}')
        returns.append(avg_return)

    steps = range(0, num_iterations + 1, eval_interval)
    plt.plot(steps, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Step')
    train_env.close()
    eval_env.close()
    train_py_env.close()
# Set up the environments.
train_py_env = FourInARow()
eval_py_env = FourInARow()

# Convert the Python environments to TF environments.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Agent
# Set up the categorical Q-network.
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    num_atoms=num_atoms,
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    n_step_update=n_step_update,
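    # The original snippet is truncated here. A plausible completion of the
    # constructor, following the standard C51 tutorial; gamma is assumed to
    # be defined with the other hyperparameters.
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=train_step_counter)
agent.initialize()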
def load_agents_and_create_videos(
    root_dir,
    env_name='CartPole-v0',
    num_iterations=NUM_ITERATIONS,
    max_ep_steps=1000,
    train_sequence_length=1,
    # Params for QNetwork
    fc_layer_params=(100,),
    # Params for QRnnNetwork
    input_fc_layer_params=(50,),
    lstm_size=(20,),
    output_fc_layer_params=(20,),
    # Params for collect
    initial_collect_steps=10000,
    collect_steps_per_iteration=1,
    epsilon_greedy=0.1,
    replay_buffer_capacity=100000,
    # Params for target update
    target_update_tau=0.05,
    target_update_period=5,
    # Params for train
    train_steps_per_iteration=1,
    batch_size=64,
    learning_rate=1e-3,
    num_atoms=51,
    min_q_value=-20,
    max_q_value=20,
    n_step_update=1,
    gamma=0.99,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    use_tf_functions=True,
    # Params for eval
    num_eval_episodes=10,
    num_random_episodes=1,
    eval_interval=1000,
    # Params for checkpoints
    train_checkpoint_interval=10000,
    policy_checkpoint_interval=5000,
    rb_checkpoint_interval=20000,
    # Params for summaries and logging
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None,
    random_metrics_callback=None):

  # Define the directories to read from.
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')
  random_dir = os.path.join(root_dir, 'random')

  # Match the writers and metrics used in training.
  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  random_summary_writer = tf.compat.v2.summary.create_file_writer(
      random_dir, flush_millis=summaries_flush_secs * 1000)
  random_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
  ]

  global_step = tf.compat.v1.train.get_or_create_global_step()

  # Match the environments used in training.
  tf_env = tf_py_environment.TFPyEnvironment(
      suite_gym.load(env_name, max_episode_steps=max_ep_steps))
  eval_py_env = suite_gym.load(env_name, max_episode_steps=max_ep_steps)
  eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

  # Match the agents used in training.
  categorical_q_net = categorical_q_network.CategoricalQNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      num_atoms=num_atoms,
      fc_layer_params=fc_layer_params)
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

  tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
      tf_env.time_step_spec(),
      tf_env.action_spec(),
      categorical_q_network=categorical_q_net,
      optimizer=optimizer,
      min_q_value=min_q_value,
      max_q_value=max_q_value,
      n_step_update=n_step_update,
      td_errors_loss_fn=common.element_wise_squared_loss,
      gamma=gamma,
      train_step_counter=global_step)
  tf_agent.initialize()

  train_metrics = [
      # tf_metrics.NumberOfEpisodes(),
      # tf_metrics.EnvironmentSteps(),
      tf_metrics.AverageReturnMetric(),
      tf_metrics.AverageEpisodeLengthMetric(),
  ]

  eval_policy = tf_agent.policy
  collect_policy = tf_agent.collect_policy

  replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=tf_agent.collect_data_spec,
      batch_size=tf_env.batch_size,
      max_length=replay_buffer_capacity)

  collect_driver = dynamic_step_driver.DynamicStepDriver(
      tf_env,
      collect_policy,
      observers=[replay_buffer.add_batch] + train_metrics,
      num_steps=collect_steps_per_iteration)

  train_checkpointer = common.Checkpointer(
      ckpt_dir=train_dir,
      agent=tf_agent,
      global_step=global_step,
      metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
  policy_checkpointer = common.Checkpointer(
      ckpt_dir=os.path.join(train_dir, 'policy'),
      policy=eval_policy,
      global_step=global_step)
  rb_checkpointer = common.Checkpointer(
      ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
      max_to_keep=1,
      replay_buffer=replay_buffer)

  train_checkpointer.initialize_or_restore()
  rb_checkpointer.initialize_or_restore()

  if use_tf_functions:
    # To speed up collection, use common.function.
    collect_driver.run = common.function(collect_driver.run)
    tf_agent.train = common.function(tf_agent.train)

  random_policy = random_tf_policy.RandomTFPolicy(
      eval_tf_env.time_step_spec(), eval_tf_env.action_spec())

  # Make movies of the trained agent and a random agent.
  date_string = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')

  # Finally, use the saved policy to generate the video.
  trained_filename = 'trainedC51_' + date_string
  create_policy_eval_video(eval_tf_env, eval_py_env, tf_agent.policy,
                           trained_filename)

  # And create one with a random agent for comparison.
  random_filename = 'random_' + date_string
  create_policy_eval_video(eval_tf_env, eval_py_env, random_policy,
                           random_filename)
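A minimal invocation sketch; the root_dir value is hypothetical and must point at the directory layout the function restores from:

# Hypothetical call: root_dir must contain the train/ checkpoints (and
# train/policy, train/replay_buffer) written during training.
load_agents_and_create_videos('/tmp/c51_cartpole', env_name='CartPole-v0')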