def save_model():
    # Build the specs, networks and PPO agent, restore the latest checkpoint,
    # and export the trained policy as a SavedModel.
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    obs_spec = TensorSpec((7,), dtype=tf.float32, name='observation')
    action_spec = BoundedTensorSpec((1,),
                                    dtype=tf.int32,
                                    minimum=0,
                                    maximum=3,
                                    name='action')
    actor_net = ActorDistributionRnnNetwork(obs_spec,
                                            action_spec,
                                            lstm_size=(100, 100))
    value_net = ValueRnnNetwork(obs_spec)
    agent = ppo_agent.PPOAgent(
        time_step_spec=time_step_spec(obs_spec),
        action_spec=action_spec,
        optimizer=optimizer,
        actor_net=actor_net,
        value_net=value_net,
        normalize_observations=True,
        normalize_rewards=True,
        use_gae=True,
        num_epochs=1,
    )
    checkpointer = Checkpointer(
        ckpt_dir='checkpoints/policy',
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        global_step=tf.compat.v1.train.get_or_create_global_step())
    checkpointer.initialize_or_restore()
    saver = policy_saver.PolicySaver(agent.policy)
    saver.save('final_policy')
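# A minimal sketch (not part of the original snippet) of loading the policy
# exported above with tf.saved_model.load and running it for one episode.
# `eval_env` is an assumed TFPyEnvironment with batch_size 1.
import tensorflow as tf

saved_policy = tf.saved_model.load('final_policy')
time_step = eval_env.reset()
policy_state = saved_policy.get_initial_state(batch_size=eval_env.batch_size)
while not time_step.is_last():
    policy_step = saved_policy.action(time_step, policy_state)
    policy_state = policy_step.state
    time_step = eval_env.step(policy_step.action)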
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4,))
        value_net = ValueRnnNetwork(
            observation_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4,))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(),
            continuous_projection_net=StableNormalProjectionNetwork)
        value_net = ValueNetwork(observation_spec, fc_layer_params=())
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    return PPOAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss=PPOLoss(
            action_spec=action_spec,
            gamma=1.0,
            debug_summaries=DEBUGGING),
        optimizer=optimizer,
        debug_summaries=DEBUGGING)
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4,))
        value_net = ValueRnnNetwork(
            observation_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=(),
            lstm_size=(4,))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=())
        value_net = ValueNetwork(observation_spec, fc_layer_params=())
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    ac_algorithm = ActorCriticAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss=PPOLoss(action_spec=action_spec, gamma=1.0),
        optimizer=optimizer)
    return PPOAlgorithm(ac_algorithm)
def create_networks(observation_spec, action_spec):
    actor_net = ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(16, 8, 4), (32, 4, 2)],
        input_fc_layer_params=(256,),
        lstm_size=(256,),
        output_fc_layer_params=(128,),
        activation_fn=tf.nn.elu)
    value_net = ValueRnnNetwork(
        observation_spec,
        conv_layer_params=[(16, 8, 4), (32, 4, 2)],
        input_fc_layer_params=(256,),
        lstm_size=(256,),
        output_fc_layer_params=(128,),
        activation_fn=tf.nn.elu)
    return actor_net, value_net
def create_networks(tf_env):
    actor_net = ActorDistributionRnnNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        conv_layer_params=[(16, 4, 2), (32, 2, 1)],
        input_fc_layer_params=(256,),
        lstm_size=(256,),
        output_fc_layer_params=(128,))
    value_net = ValueRnnNetwork(
        tf_env.observation_spec(),
        conv_layer_params=[(16, 4, 2), (32, 2, 1)],
        input_fc_layer_params=(256,),
        lstm_size=(256,),
        output_fc_layer_params=(128,),
        activation_fn=tf.nn.elu)
    return actor_net, value_net
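# A minimal usage sketch (not part of the original snippet): networks returned
# by create_networks would typically be handed to a tf-agents PPOAgent.
# `tf_env` is assumed to be a TFPyEnvironment; the optimizer and the
# hyperparameters below are illustrative defaults, not values from the source.
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent

actor_net, value_net = create_networks(tf_env)
agent = ppo_agent.PPOAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    actor_net=actor_net,
    value_net=value_net,
    use_gae=True,
    num_epochs=10)
agent.initialize()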
def _create_ppo_algorithm():
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()
    optimizer = tf.optimizers.Adam(learning_rate=1e-3)
    actor_net = ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        input_fc_layer_params=(),
        output_fc_layer_params=None)
    value_net = ValueRnnNetwork(
        observation_spec,
        input_fc_layer_params=(),
        output_fc_layer_params=None)
    return PPOAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        loss_class=PPOLoss,
        optimizer=optimizer,
        debug_summaries=True)
def create_networks(observation_spec, action_spec):
    preprocessing_combiner = tf.keras.layers.Concatenate()
    actor_net = ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(16, 8, 4), (32, 4, 2)],
        input_fc_layer_params=(256,),
        lstm_size=(256,),
        preprocessing_combiner=preprocessing_combiner,
        output_fc_layer_params=(128,),
        activation_fn=tf.nn.elu)
    value_net = ValueRnnNetwork(
        observation_spec,
        conv_layer_params=[(16, 8, 4), (32, 4, 2)],
        input_fc_layer_params=(256,),
        preprocessing_combiner=preprocessing_combiner,
        lstm_size=(256,),
        output_fc_layer_params=(128,),
        activation_fn=tf.nn.elu)
    return actor_net, value_net
def test_actor_critic_rnn_policy(self):
    batch_size = 100
    steps_per_episode = 5
    gap = 3
    env = RNNPolicyUnittestEnv(batch_size, steps_per_episode, gap)
    # We need to wrap env using TFPyEnvironment because the methods of env
    # have side effects (e.g., env._current_time_step can be changed)
    env = TFPyEnvironment(env)
    action_spec = env.action_spec()
    observation_spec = env.observation_spec()
    algorithm = ActorCriticAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        actor_network=ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=None),
        value_network=ValueRnnNetwork(
            observation_spec,
            input_fc_layer_params=(),
            output_fc_layer_params=None),
        optimizer=tf.optimizers.Adam(learning_rate=1e-2))
    driver = OnPolicyDriver(env, algorithm, train_interval=9)
    eval_driver = OnPolicyDriver(env, algorithm, training=False)
    driver.run = tf.function(driver.run)

    t0 = time.time()
    driver.run(max_num_steps=2000 * batch_size)
    logging.info("time=%s" % (time.time() - t0))

    env.reset()
    time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
    logging.info("reward=%s" % tf.reduce_mean(time_step.reward))
    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(time_step.reward)), delta=5e-2)
def get_ac_networks(conv_layer_params=None,
                    num_embedding_dims=None,
                    fc_layer_params=None,
                    num_state_tiles=None,
                    num_sentence_tiles=None):
    """Generate the actor and value networks.

    Args:
        conv_layer_params (list[int 3 tuple]): optional convolution layer
            parameters, where each item is a length-three tuple indicating
            (filters, kernel_size, stride).
        num_embedding_dims (int): optional number of dimensions of the
            vocabulary embedding space.
        fc_layer_params (list[int]): optional fully-connected layer
            parameters, where each item is the number of units in the layer.
        num_state_tiles (int): optional number of times to repeat the internal
            state tensor before concatenation with other inputs. The rationale
            is to match the number of dimensions of the image input, so that
            the final concatenation will have roughly equal representation
            from the different sources of input. Without this, the image
            input, due to its large size, will typically dominate all other
            low-dimensional inputs.
        num_sentence_tiles (int): optional number of times to repeat the
            sentence embedding tensor before concatenation with other inputs,
            so that the sentence input won't be dominated by other
            high-dimensional inputs like the image observation.
    """
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()

    conv_layers = tf.keras.Sequential(
        tf_agents.networks.utils.mlp_layers(
            conv_layer_params=conv_layer_params))

    preprocessing_layers = {
        'image': conv_layers,
    }
    if common.get_states_shape():
        state_layers = get_identity_layer()
        # [image: (1, 12800), sentence: (1, 16 * 800), states: (1, 16 * 800)]
        # Here, we tile along the last dimension of the input.
        if num_state_tiles:
            state_layers = tf.keras.Sequential([
                tf.keras.layers.Lambda(
                    lambda x: tf.tile(x, multiples=[1, num_state_tiles]))
            ])
        preprocessing_layers['states'] = state_layers

    vocab_size = common.get_vocab_size()
    if vocab_size:
        sentence_layers = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, num_embedding_dims),
            tf.keras.layers.GlobalAveragePooling1D()
        ])
        if num_sentence_tiles:
            sentence_layers.add(
                tf.keras.layers.Lambda(
                    lambda x: tf.tile(x, multiples=[1, num_sentence_tiles])))
        preprocessing_layers['sentence'] = sentence_layers

    preprocessing_combiner = tf.keras.layers.Concatenate()

    actor = ActorDistributionRnnNetwork(
        input_tensor_spec=observation_spec,
        output_tensor_spec=action_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner,
        input_fc_layer_params=fc_layer_params)
    value = ValueRnnNetwork(
        input_tensor_spec=observation_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner,
        input_fc_layer_params=fc_layer_params)

    return actor, value
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment.
        actor_fc_layers (list[int]): list of fc layer parameters for the actor
            network.
        value_fc_layers (list[int]): list of fc layer parameters for the value
            network.
        encoding_conv_layers (list[int]): list of convolution layer parameters
            for the encoding network.
        encoding_fc_layers (list[int]): list of fc layer parameters for the
            encoding network.
        use_rnns (bool): True if RNNs should be used.
        use_icm (bool): True if an intrinsic curiosity module should be used.
        learning_rate (float): learning rate.
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm.
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries).
        debug_summaries (bool): True if debug summaries should be created.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(
            env.observation_spec(),
            input_fc_layer_params=value_fc_layers,
            output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(
            env.observation_spec(), fc_layer_params=value_fc_layers)

    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1],),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(
            env.action_spec(), feature_spec, encoding_net=encoding_net)

    algorithm = algorithm_class(
        action_spec=env.action_spec(),
        actor_network=actor_net,
        value_network=value_net,
        intrinsic_curiosity_module=icm,
        loss_class=loss_class,
        optimizer=optimizer,
        debug_summaries=debug_summaries)

    return algorithm
def main():
    if len(sys.argv) != 2:
        raise ValueError(f"Usage: ./{sys.argv[0]} experiment_name")
    experiment_name = sys.argv[1]

    tf.compat.v1.enable_v2_behavior()

    # Create train and evaluation environments for TensorFlow
    train_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            [Environment.Environment] * num_parallel_environments))
    eval_py_env = Environment.Environment()
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    optimizer = Adam(learning_rate=learning_rate, epsilon=1e-5)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    timed_at_step = global_step.numpy()

    # Initialize actor and value networks
    actor_net = ActorDistributionRnnNetwork(
        input_tensor_spec=train_env.observation_spec(),
        output_tensor_spec=train_env.action_spec(),
        conv_layer_params=[(3, 4, 1), (7, 4, 2), (5, 8, 2)],
        input_fc_layer_params=(128,),
        lstm_size=(128,),
        output_fc_layer_params=(64,),
        activation_fn=tf.nn.elu)
    value_net = ValueRnnNetwork(
        input_tensor_spec=train_env.observation_spec(),
        conv_layer_params=[(3, 4, 1), (7, 4, 2), (5, 8, 2)],
        input_fc_layer_params=(128,),
        lstm_size=(128,),
        output_fc_layer_params=(64,),
        activation_fn=tf.nn.elu)

    agent = ppo_agent.PPOAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        optimizer,
        actor_net,
        value_net,
        num_epochs=num_epochs,
        train_step_counter=global_step,
        discount_factor=0.99,
        gradient_clipping=0.5,
        entropy_regularization=1e-2,
        importance_ratio_clipping=0.2,
        use_gae=True,
        use_td_lambda_return=True)
    agent.initialize()

    # Register the environment-steps metric with the collect driver so that
    # the training loop's termination condition actually advances.
    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric
    ]
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    collect_time = 0
    train_time = 0

    # Reset the train step
    agent.train_step_counter.assign(0)

    saved_model_dir = os.path.join("saved_models", experiment_name)
    checkpoint_dir = os.path.join(saved_model_dir, 'checkpoint')
    train_checkpointer = common.Checkpointer(
        ckpt_dir=checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        # replay_buffer=replay_buffer,
        global_step=global_step)
    train_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()
    print(f"Starting training at step: {global_step.numpy()}")

    while environment_steps_metric.result() < num_environment_steps:
        start_time = time.time()
        collect_driver.run()
        collect_time += time.time() - start_time

        start_time = time.time()
        trajectories = replay_buffer.gather_all()
        total_loss, unused_info = agent.train(experience=trajectories)
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()
        if global_step_val % eval_interval == 0:
            with open(eval_path(saved_model_dir, global_step_val,
                                eval_interval), 'w') as f:
                avg_return = evaluate_perf(f, eval_env, agent.policy,
                                           num_eval_episodes)
            steps_per_sec = ((global_step_val - timed_at_step) /
                             (collect_time + train_time))
            print(f"step = {global_step_val}: loss = {total_loss}, "
                  f"Avg return: {avg_return}, {steps_per_sec:.3f} steps/sec, "
                  f"collect_time = {collect_time}, train_time = {train_time}")
            timed_at_step = global_step_val
            collect_time = 0
            train_time = 0

        if global_step_val % policy_saver_interval == 0:
            train_checkpointer.save(global_step_val)
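# `evaluate_perf` and `eval_path` are referenced above but not defined in this
# snippet. Below is a minimal, hypothetical sketch of what an
# evaluate_perf-style helper might look like, assuming it writes per-episode
# returns to the open file handle and returns their average; it is not the
# original implementation.
def evaluate_perf(f, environment, policy, num_episodes):
    total_return = 0.0
    for episode in range(num_episodes):
        time_step = environment.reset()
        policy_state = policy.get_initial_state(environment.batch_size)
        episode_return = 0.0
        while not time_step.is_last():
            policy_step = policy.action(time_step, policy_state)
            policy_state = policy_step.state
            time_step = environment.step(policy_step.action)
            episode_return += float(time_step.reward)
        f.write(f"episode {episode}: return = {episode_return}\n")
        total_return += episode_return
    return total_return / num_episodes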
def main(_):
    # The environment serves as the dataset in reinforcement learning
    train_env = tf_py_environment.TFPyEnvironment(
        ParallelPyEnvironment([lambda: suite_mujoco.load('HalfCheetah-v2')] *
                              batch_size))
    eval_env = tf_py_environment.TFPyEnvironment(
        suite_mujoco.load('HalfCheetah-v2'))

    # Create the agent
    actor_net = ActorDistributionRnnNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        lstm_size=(100, 100))
    value_net = ValueRnnNetwork(train_env.observation_spec())
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    tf_agent = ppo_agent.PPOAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        optimizer=optimizer,
        actor_net=actor_net,
        value_net=value_net,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=25)
    tf_agent.initialize()

    # Replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=1000000)

    # Policy saver
    saver = policy_saver.PolicySaver(tf_agent.policy)

    # Define the trajectory collector
    train_episode_count = tf_metrics.NumberOfEpisodes()
    train_total_steps = tf_metrics.EnvironmentSteps()
    train_avg_reward = tf_metrics.AverageReturnMetric(
        batch_size=train_env.batch_size)
    train_avg_episode_len = tf_metrics.AverageEpisodeLengthMetric(
        batch_size=train_env.batch_size)
    train_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        train_env,
        tf_agent.collect_policy,  # NOTE: use PPOPolicy to collect episodes
        observers=[
            replay_buffer.add_batch, train_episode_count, train_total_steps,
            train_avg_reward, train_avg_episode_len
        ],  # callbacks invoked when an episode is completely collected
        num_episodes=30,  # how many episodes are collected per iteration
    )

    # Training
    eval_avg_reward = tf_metrics.AverageReturnMetric(buffer_size=30)
    eval_avg_episode_len = tf_metrics.AverageEpisodeLengthMetric(
        buffer_size=30)

    while train_total_steps.result() < 25000000:
        train_driver.run()
        trajectories = replay_buffer.gather_all()
        loss, _ = tf_agent.train(experience=trajectories)
        replay_buffer.clear()  # clear collected episodes right after training

        if tf_agent.train_step_counter.numpy() % 50 == 0:
            print('step = {0}: loss = {1}'.format(
                tf_agent.train_step_counter.numpy(), loss))

        if tf_agent.train_step_counter.numpy() % 500 == 0:
            # Save a policy checkpoint
            saver.save('checkpoints/policy_%d' %
                       tf_agent.train_step_counter.numpy())

            # Evaluate the updated policy
            eval_avg_reward.reset()
            eval_avg_episode_len.reset()
            eval_driver = dynamic_episode_driver.DynamicEpisodeDriver(
                eval_env,
                tf_agent.policy,
                observers=[
                    eval_avg_reward,
                    eval_avg_episode_len,
                ],
                num_episodes=30,  # how many episodes are collected per evaluation
            )
            eval_driver.run()

            print('step = {0}: Average Return = {1} Average Episode Length = {2}'
                  .format(tf_agent.train_step_counter.numpy(),
                          eval_avg_reward.result(),
                          eval_avg_episode_len.result()))

    # Play HalfCheetah with the trained policy 3 times and visualize
    import cv2
    for _ in range(3):
        status = eval_env.reset()
        policy_state = tf_agent.policy.get_initial_state(eval_env.batch_size)
        while not status.is_last():
            action = tf_agent.policy.action(
                status, policy_state)  # NOTE: use the greedy policy to test
            status = eval_env.step(action.action)
            policy_state = action.state
            cv2.imshow('halfcheetah', eval_env.pyenv.envs[0].render())
            cv2.waitKey(25)
def make_networks(env,
                  size=(96, 96),
                  num_frames=1,
                  num_channels=3,
                  conv_params=[(16, 8, 4), (32, 3, 2)],
                  in_fc_params=(256,),
                  out_fc_params=(128,),
                  use_lstm=False,
                  lstm_size=(256,)):
    """Creates the actor and critic neural networks of the PPO agent.

    Function for creating the neural networks for the PPO agent, namely the
    actor and value networks. Source for network params:
    https://www.arconsis.com/unternehmen/blog/reinforcement-learning-doom-with-tf-agents-and-ppo

    Arguments:
        env (TfPyEnvironment): A TensorFlow environment the agent interacts
            with.
        size (tuple): The desired width and height of the observation space.
            Defaults to (96, 96). The input tuple should preserve the original
            observation aspect ratio.
        num_frames (int): Number of frames used in the agent's observation.
            Defaults to 1; num_frames > 1 indicates frame stacking.
        num_channels (int): Number of color channels to include for each
            frame. Defaults to 3 (RGB); 1 denotes grayscale.
        conv_params (list): A list of convolutional layer parameters for the
            PPO agent's actor and critic neural networks.
        in_fc_params (tuple): The number of neurons in the input fully
            connected layer of the actor and critic networks of the agent.
        out_fc_params (tuple): The number of neurons in the output fully
            connected layer of the actor and critic networks of the agent.
        use_lstm (bool): Whether to use LSTM-based actor and critic networks.
        lstm_size (tuple): The number of hidden states inside the LSTM for
            the actor and critic networks of the agent.

    Returns:
        actor_net (ActorDistributionNetwork): A tf-agents Actor Distribution
            Network used for PPO agent action selection.
        value_net (ValueNetwork): A tf-agents Value Network used for PPO
            agent value estimation.
    """
    # Restructure the time step spec to match the expected processed observations
    processed_shape = tuple(size + (num_channels * num_frames,))
    obs_spec = env.observation_spec()  # Get the old observation spec
    obs_spec = tensor_spec.BoundedTensorSpec(
        processed_shape,
        obs_spec.dtype,
        minimum=obs_spec.minimum,
        maximum=obs_spec.maximum,
        name=obs_spec.name)

    if use_lstm:  # LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionRnnNetwork(
            obs_spec,
            env.action_spec(),
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        # Define value network
        value_net = ValueRnnNetwork(
            obs_spec,
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        print("Created Actor and Value Networks with LSTM...")
    else:  # Non-LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionNetwork(
            obs_spec, env.action_spec(), conv_layer_params=conv_params)
        # Define value network
        value_net = ValueNetwork(obs_spec, conv_layer_params=conv_params)

    return actor_net, value_net
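# make_networks builds its networks against a *processed* observation spec
# (resized to `size` with num_channels * num_frames channels), so observations
# must be brought to that shape before reaching the agent. The helper below is
# a hypothetical sketch of such a preprocessing step (not part of the original
# code): it resizes a single frame and optionally converts it to grayscale.
# Note that tf.image.resize returns float32; cast back to the observation
# spec's dtype if the agent expects it.
import tensorflow as tf

def preprocess_frame(frame, size=(96, 96), grayscale=False):
    """Resize (and optionally grayscale) one observation frame."""
    if grayscale:
        frame = tf.image.rgb_to_grayscale(frame)
    return tf.image.resize(frame, size)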