def dyke_dqn_agent(env: TFPyEnvironment, layers: Optional[List[Layer]] = None) -> DqnAgent:
    """
    Prepares a deep Q-network (DQN) agent for use in the dyke maintenance environment.

    :param env: The dyke environment on which to base the DQN agent.
    :param layers: Optional. A list of layers to supply to the DQN agent's network.
    :return: The agent.
    """
    layers = fully_connected_dyke_dqn_agent_network(sizes=(100, 50)) if layers is None else layers
    # prepare the Q-values layer
    action_as: BoundedArraySpec = from_spec(env.action_spec())
    number_actions: int = int(action_as.maximum - action_as.minimum + 1)
    q_values_layer: Layer = Dense(units=number_actions,
                                  activation=None,
                                  kernel_initializer=RandomUniform(minval=-3e-3, maxval=3e-3),
                                  bias_initializer=Constant(-2e-1))
    net = Sequential(layers=layers + [q_values_layer])
    # instantiate and return the agent
    optimizer = Adam(learning_rate=1e-3)
    train_step_counter = Variable(initial_value=0)
    return DqnAgent(time_step_spec=env.time_step_spec(),
                    action_spec=env.action_spec(),
                    q_network=net,
                    optimizer=optimizer,
                    epsilon_greedy=0.1,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
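# Hedged usage sketch for the factory above. `dyke_environment()` is a hypothetical constructor
# for the underlying dyke PyEnvironment (an assumption, not part of the original code).
dyke_env = TFPyEnvironment(dyke_environment())
agent = dyke_dqn_agent(dyke_env)
agent.initialize()
print(agent.policy.action(dyke_env.reset()))  # one action from the freshly initialized agent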
def train_eval( # tensorboard files root_dir, # environment env_name="CartPole-v1", random_seed=0, # Params for collect num_environment_steps=100000, replay_buffer_capacity=1001, # Per-environment # Params for eval num_eval_episodes=30, eval_interval=200, # Params for summaries summary_interval=50, ): tf.compat.v1.set_random_seed(random_seed) environment = TFPyEnvironment(suite_gym.load(env_name)) evaluation_environment = TFPyEnvironment(suite_gym.load(env_name)) actor_net = ActorDistributionNetwork(environment.observation_spec(), environment.action_spec(), fc_layer_params=(200, 100)) value_net = ValueNetwork(environment.observation_spec(), fc_layer_params=(200, 100)) global_step = tf.compat.v1.train.get_or_create_global_step() agent = PPOClipAgent( # should be closer to the paper than PPOAgent... environment.time_step_spec(), environment.action_spec(), optimizer=tf.compat.v1.train.AdamOptimizer( ), # default None does not work actor_net=actor_net, value_net=value_net, importance_ratio_clipping=0.2, normalize_observations=False, normalize_rewards=False, use_gae=True, lambda_value=0.5, discount_factor=0.95, train_step_counter=global_step, ) agent_trainer = OnPolicyModelFreeAgentTrainer(400) experiment_harness = ExperimentHarness( root_dir, environment, evaluation_environment, agent, agent_trainer, replay_buffer_capacity, num_environment_steps, summary_interval, eval_interval, num_eval_episodes, number_of_initial_random_policy_steps=0, use_tf_function=True, ) experiment_harness.run()
def test_incorrect_termination_model(): """ The generic model-based agent should only allow a ConstantFalseTermination model. """ # setup arguments for the model-based agent constructor py_env = suite_gym.load("MountainCarContinuous-v0") tf_env = TFPyEnvironment(py_env) time_step_spec = tf_env.time_step_spec() observation_spec = tf_env.observation_spec() action_spec = tf_env.action_spec() network = LinearTransitionNetwork(observation_spec) transition_model = KerasTransitionModel([network], observation_spec, action_spec) reward_model = MountainCarReward(observation_spec, action_spec) initial_state_distribution_model = MountainCarInitialState(observation_spec) termination_model = MountainCarTermination(observation_spec) policy = RandomTFPolicy(time_step_spec, action_spec) with pytest.raises(AssertionError) as excinfo: ModelBasedAgent( time_step_spec, action_spec, transition_model, reward_model, termination_model, initial_state_distribution_model, policy, policy, ) assert "Only constant false termination supported" in str(excinfo.value)
def test_unknown_transition_model(): """ Pets Agent has prespecified transition model, RuntimeError should raise on unknown model. """ # setup the environment and a prespecified model components py_env = suite_gym.load("MountainCarContinuous-v0") tf_env = TFPyEnvironment(py_env) time_step_spec = tf_env.time_step_spec() observation_spec = tf_env.observation_spec() action_spec = tf_env.action_spec() reward_model = MountainCarReward(observation_spec, action_spec) initial_state_distribution_model = MountainCarInitialState( observation_spec) # trajectory optimiser trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod transition_model_type = "unknown_model" trajectory_sampler_type = TrajectorySamplerType.TS1 # some parameters need to be set correctly ensemble_size = 2 num_elites = 10 learning_rate = 0.9 max_iterations = 5 population_size = num_elites + 10 number_of_particles = 1 horizon = 1 with pytest.raises(RuntimeError) as excinfo: PetsAgent( time_step_spec, action_spec, transition_model_type, 1, 10, tf.nn.relu, ensemble_size, False, 1, 1, [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)], reward_model, initial_state_distribution_model, trajectory_sampler_type, trajectory_optimiser_type, horizon, population_size, number_of_particles, num_elites, learning_rate, max_iterations, ) assert "Unknown transition model" in str(excinfo.value)
def test_ensemble_size_set_correctly(): """ For ensemble transition models ensemble size needs to be larger than 1. """ # setup the environment and a prespecified model components py_env = suite_gym.load("MountainCarContinuous-v0") tf_env = TFPyEnvironment(py_env) time_step_spec = tf_env.time_step_spec() observation_spec = tf_env.observation_spec() action_spec = tf_env.action_spec() reward_model = MountainCarReward(observation_spec, action_spec) initial_state_distribution_model = MountainCarInitialState(observation_spec) # transition model and model-free agent transition_model_type = TransitionModelType.DeterministicEnsemble trajectory_sampler_type = TrajectorySamplerType.TS1 model_free_agent_type = ModelFreeAgentType.Ppo # some parameters need to be set correctly ensemble_size = 1 population_size = 10 horizon = 1 # define agent, many transition model and trajectory optimiser parameters can # be arbitrary with pytest.raises(AssertionError) as excinfo: MbpoAgent( time_step_spec, action_spec, transition_model_type, 1, 10, tf.nn.relu, ensemble_size, False, 1, 1, [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)], reward_model, initial_state_distribution_model, trajectory_sampler_type, horizon, population_size, model_free_agent_type, 1, 10, tf.nn.relu, 2, 1, ) assert "ensemble_size should be > 1" in str(excinfo.value)
def test_unknown_transition_model(): """ Mepo Agent has prespecified transition model, RuntimeError should raise on unknown model. """ # setup the environment and a prespecified model components py_env = suite_gym.load("MountainCarContinuous-v0") tf_env = TFPyEnvironment(py_env) time_step_spec = tf_env.time_step_spec() observation_spec = tf_env.observation_spec() action_spec = tf_env.action_spec() reward_model = MountainCarReward(observation_spec, action_spec) initial_state_distribution_model = MountainCarInitialState(observation_spec) # transition model and model-free agent transition_model_type = "unknown_model" trajectory_sampler_type = TrajectorySamplerType.TS1 model_free_agent_type = ModelFreeAgentType.Ppo # some parameters need to be set correctly ensemble_size = 2 num_elites = 10 population_size = num_elites + 10 horizon = 1 with pytest.raises(RuntimeError) as excinfo: MbpoAgent( time_step_spec, action_spec, transition_model_type, 1, 10, tf.nn.relu, ensemble_size, False, 1, 1, [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)], reward_model, initial_state_distribution_model, trajectory_sampler_type, horizon, population_size, model_free_agent_type, 1, 10, tf.nn.relu, 2, 1, ) assert "Unknown transition model" in str(excinfo.value)
def _create_environment_and_policy(batch_size):
    tf_batched_environment = TFPyEnvironment(
        BatchedPyEnvironment([
            PyEnvironmentMock(final_state=TRAJECTORY_LENGTH)
            for _ in range(batch_size)
        ]))
    policy = TFPolicyMock(
        tf_batched_environment.time_step_spec(),
        tf_batched_environment.action_spec(),
        batch_size=batch_size,
    )
    return tf_batched_environment, policy
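# Hedged usage sketch for the helper above, e.g. inside a driver test. The batch size of 2 and the
# shape assertion are illustrative assumptions about the mock environment, not the original tests.
def test_mock_policy_steps_batched_environment():
    environment, policy = _create_environment_and_policy(batch_size=2)
    time_step = environment.reset()
    policy_state = policy.get_initial_state(batch_size=2)
    action_step = policy.action(time_step, policy_state)
    next_time_step = environment.step(action_step.action)
    assert next_time_step.observation.shape[0] == 2  # one row per batched environment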
def test_planning_policy_batch_environment_model(): """ Ensure that planning policy is operational. """ # number of trajectories for planning and planning horizon population_size = 3 planner_horizon = 5 number_of_particles = 1 # setup the environment and a model of it py_env = suite_gym.load("MountainCar-v0") tf_env = TFPyEnvironment(py_env) reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec()) terminates = MountainCarTermination(tf_env.observation_spec()) network = LinearTransitionNetwork(tf_env.observation_spec()) transition_model = KerasTransitionModel( [network], tf_env.observation_spec(), tf_env.action_spec(), ) initial_state = MountainCarInitialState(tf_env.observation_spec()) environment_model = EnvironmentModel( transition_model=transition_model, reward_model=reward, termination_model=terminates, initial_state_distribution_model=initial_state, ) # setup the trajectory optimiser random_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()) trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy, planner_horizon, population_size, number_of_particles) planning_policy = PlanningPolicy(environment_model, trajectory_optimiser) # test whether it runs collect_driver_planning_policy = DynamicEpisodeDriver(tf_env, planning_policy, num_episodes=1) time_step = tf_env.reset() collect_driver_planning_policy.run(time_step)
def generic_dqn_agent(env: TFPyEnvironment) -> Tuple[dqn_agent.DqnAgent, q_network.QNetwork]:
    """Returns a generic DQN agent.

    Args:
        env (TFPyEnvironment): The environment the agent will live in.

    Returns:
        dqn_agent.DqnAgent: The agent to train.
        q_network.QNetwork: The network used in the agent.
    """
    q_net = q_network.QNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=(20, 20, 20, 20, 20),
        activation_fn=tf.keras.activations.relu)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    agent = dqn_agent.DqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0),
        epsilon_greedy=0.1)

    # Disabled experiment: mask out actions via an observation/action constraint splitter.
    # def observation_and_action_constraint_splitter(observation):
    #     action_mask = [1, 1]
    #     if observation[0][-1] > 5:
    #         action_mask[0] = 1
    #     return observation, tf.convert_to_tensor(action_mask, dtype=np.int32)
    # agent.policy._observation_and_action_constraint_splitter = (
    #     observation_and_action_constraint_splitter)
    # tf_agents.policies.greedy_policy.GreedyPolicy

    agent.initialize()
    return agent, q_net
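# Hedged usage sketch for the factory above: wrap a Gym task in a TFPyEnvironment and build the
# agent. "CartPole-v0" is only an illustrative choice, not implied by the original code.
train_env = TFPyEnvironment(suite_gym.load("CartPole-v0"))
agent, q_net = generic_dqn_agent(train_env)
print(agent.collect_data_spec)  # spec needed when wiring up a replay buffer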
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
fc_layer_params = [512]

q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params)

train_step = tf.Variable(0)
update_period = 4  # train the model every 4 steps
optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000,
    end_learning_rate=0.01)

agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000,
                 td_errors_loss_fn=keras.losses.Huber(reduction='none'),
                 gamma=0.99,
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=850000)
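# Hedged illustration of the epsilon_greedy=lambda pattern above: because the lambda closes over
# `train_step`, the exploration rate the agent reads decays as training advances. The probe step
# values below are arbitrary and only show the shape of the schedule.
for probe_step in (0, 50_000, 125_000, 250_000):
    train_step.assign(probe_step)
    print(probe_step, float(epsilon_fn(train_step)))
train_step.assign(0)  # reset before training starts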
    initial_learning_rate=0.7,
    decay_steps=total_steps,
    end_learning_rate=0.001,
)

# 3. Constructing the DQN Agent.
optimizer = Yogi(learning_rate=0.00025)
loss = Huber()
n_steps = 3
tau = 0.001
gamma = 0.99
min_q = -200
max_q = 200

agent = CategoricalDqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    categorical_q_network=online_q_net,
    optimizer=optimizer,
    min_q_value=min_q,
    max_q_value=max_q,
    epsilon_greedy=lambda: decay_epsilon_greedy(train_step),
    n_step_update=n_steps,
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step)
agent.initialize()
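# Hedged note for the n_step_update=3 setting above: when sampling experience for an n-step
# categorical DQN, the replay dataset must yield n_steps + 1 adjacent time steps per item. The
# buffer capacity and batch size here are illustrative assumptions only.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100_000)

dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=n_steps + 1,  # n-step update needs n + 1 consecutive steps
    num_parallel_calls=3).prefetch(3)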
suite_gym.load(env_name, max_episode_steps=max_episode_steps_eval, gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper])) # create DQN (deep Q-Learning network) q_net = QNetwork(train_env.observation_spec(), train_env.action_spec(), conv_layer_params=conv_layer_params, fc_layer_params=fc_layer_params) optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) train_step_counter = tf.compat.v2.Variable(0) # create deep reinforcement learning agent tf_agent = DqnAgent(train_env.time_step_spec(), train_env.action_spec(), q_network=q_net, optimizer=optimizer, td_errors_loss_fn=element_wise_squared_loss, train_step_counter=train_step_counter) tf_agent.initialize() # create evaluation and data collection policies eval_policy = tf_agent.policy collect_policy = tf_agent.collect_policy # create replay buffer print("Creating replay buffer") replay_buffer = TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec, batch_size=train_env.batch_size,
class DQNAgent: def __init__(self) -> None: """ A class for training a TF-agent based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial """ self.train_env = None # Training environment self.agent = None # The algorithm used to solve an RL problem is represented by a TF-Agent self.replay_buffer = None # The replay buffer keeps track of data collected from the environment self.dataset = None # The agent needs access to the replay buffer via an iterable tf.data.Dataset self.iterator = None # The iterator of self.dataset def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float, replay_buffer_max_length: int, layers: dict) -> None: """ Create the Q-network, agent and policy Args: X_train: A np.ndarray for training samples. y_train: A np.ndarray for the class labels of the training samples. lr: learn rate for the optimizer (default Adam) epsilon: Used for the default epsilon greedy policy for choosing a random action. gamma: The discount factor for learning Q-values imb_ratio: ratio of imbalance. Used to specifiy reward in the environment replay_buffer_max_length: Maximum lenght of replay memory. layers: A dict containing the layers of the Q-Network (eg, conv, dense, rnn, dropout). """ dense_layers = layers.get("dense") conv_layers = layers.get("conv") dropout_layers = layers.get("dropout") self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio)) # create a custom environment q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers, fc_layer_params=dense_layers, dropout_layer_params=dropout_layers) optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr) train_step_counter = tf.Variable(0) self.agent = DqnAgent( self.train_env.time_step_spec(), self.train_env.action_spec(), q_network=q_net, optimizer=optimizer, td_errors_loss_fn=common.element_wise_squared_loss, train_step_counter=train_step_counter, gamma=gamma, epsilon_greedy=epsilon, ) self.agent.initialize() self.replay_buffer = TFUniformReplayBuffer( data_spec=self.agent.collect_data_spec, batch_size=self.train_env.batch_size, max_length=replay_buffer_max_length) def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int, collect_steps_per_episode: int) -> None: """ Starts the training of the Agent. Args: X_train: A np.ndarray for training samples. y_train: A np.ndarray for the class labels of the training samples. epochs: Number of epochs to train Agent batch_size: The Batch Size eval_step: Evaluate Model each 'eval_step' log_step: Monitor results of model each 'log_step' collect_steps_per_episode: Collect a few steps using collect_policy and save to the replay buffer. """ self.dataset = self.replay_buffer.as_dataset( num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3) self.iterator = iter(self.dataset) def collect_step(environment, policy, buffer): time_step = environment.current_time_step() action_step = policy.action(time_step) next_time_step = environment.step(action_step.action) traj = trajectory.from_transition(time_step, action_step, next_time_step) # Add trajectory to the replay buffer buffer.add_batch(traj) def collect_data(env, policy, buffer, steps): for _ in range(steps): collect_step(env, policy, buffer) # (Optional) Optimize by wrapping some of the code in a graph using TF function. 
self.agent.train = common.function(self.agent.train) # Reset the train step self.agent.train_step_counter.assign(0) for _ in range(epochs): #print("epoch: ", _) # Collect a few steps using collect_policy and save to the replay buffer. collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer, collect_steps_per_episode) # Sample a batch of data from the buffer and update the agent's network. experience, _ = next(self.iterator) train_loss = self.agent.train(experience).loss step = self.agent.train_step_counter.numpy() if step % log_step == 0: print('step = {0}: loss = {1}'.format(step, train_loss)) if step % eval_step == 0: metrics = self.compute_metrics(X_train, y_train) print(metrics) def compute_metrics(self, X: np.ndarray, y_true: list) -> dict: """Compute Metrics for Evaluation""" # TODO: apply softmax layer for q logits? q, _ = self.agent._target_q_network (X, training=False) # y_scores = np.max(q.numpy(), axis=1) # predicted scores (Q-Values) y_pred = np.argmax(q.numpy(), axis=1) # predicted class label metrics = custom_metrics(y_true, y_pred) return metrics def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict: """ Evaluation of trained Q-network """ metrics = self.compute_metrics(X, y) print("evaluation: ", metrics) return metrics
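# Hedged end-to-end sketch of how the DQNAgent wrapper above might be driven; the imbalanced toy
# data, layer sizes and hyperparameters are illustrative assumptions, not values from the original.
X = np.random.rand(1000, 20).astype(np.float32)
y = np.random.binomial(1, 0.1, size=1000)  # roughly 10% positive class

model = DQNAgent()
model.compile(X, y, lr=1e-3, epsilon=0.1, gamma=0.99, imb_ratio=0.1,
              replay_buffer_max_length=10_000,
              layers={"dense": (64, 32), "conv": None, "dropout": None})
model.fit(X, y, epochs=500, batch_size=64, eval_step=100, log_step=50,
          collect_steps_per_episode=100)
metrics = model.evaluate(X, y)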
def initialize_tf_agent(model_class: ABCMeta, train_env: TFPyEnvironment) -> TFAgent: optimizer = Adam(learning_rate=1e-3) if model_class in [agents.PPOAgent]: actor_net = actor_distribution_network.ActorDistributionNetwork( train_env.observation_spec(), train_env.action_spec(), fc_layer_params=(200, 100), activation_fn=tf.keras.activations.tanh, ) value_net = value_network.ValueNetwork( train_env.observation_spec(), fc_layer_params=(200, 100), activation_fn=tf.keras.activations.tanh, ) model = model_class( time_step_spec=train_env.time_step_spec(), action_spec=train_env.action_spec(), actor_net=actor_net, value_net=value_net, optimizer=optimizer, ) elif model_class in [agents.DqnAgent]: action_spec = train_env.action_spec() num_actions = action_spec.maximum - action_spec.minimum + 1 q_network = create_feedforward_network(fc_layer_units=(100, ), num_actions=num_actions) model = model_class( time_step_spec=train_env.time_step_spec(), action_spec=train_env.action_spec(), q_network=q_network, optimizer=optimizer, ) elif model_class in [agents.ReinforceAgent]: optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3) actor_net = actor_distribution_network.ActorDistributionNetwork( train_env.time_step_spec().observation, train_env.action_spec(), fc_layer_params=(100, )) model = model_class( time_step_spec=train_env.time_step_spec(), action_spec=train_env.action_spec(), actor_network=actor_net, optimizer=optimizer, ) elif model_class in [agents.SacAgent]: time_step_spec = train_env.time_step_spec() observation_spec = time_step_spec.observation action_spec = train_env.action_spec() critic_joint_fc_layers = (256, 256) actor_net = actor_distribution_network.ActorDistributionNetwork( observation_spec, action_spec, fc_layer_params=(256, 256), continuous_projection_net=TanhNormalProjectionNetwork, ) critic_net = critic_network.CriticNetwork( (observation_spec, action_spec), joint_fc_layer_params=critic_joint_fc_layers, kernel_initializer="glorot_uniform", last_kernel_initializer="glorot_uniform", ) model = agents.SacAgent( time_step_spec, action_spec, actor_network=actor_net, critic_network=critic_net, actor_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4), critic_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4), alpha_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4), ) else: raise ValueError( f"Class of class `{model_class.__name__}` is not supported") model.initialize() return model
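# Hedged usage sketch for initialize_tf_agent; the CartPole environment is an arbitrary choice and
# `agents.DqnAgent` is assumed to be the tf_agents class referenced in the branch above.
train_env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
dqn_model = initialize_tf_agent(agents.DqnAgent, train_env)
print(type(dqn_model).__name__, "initialized; collect policy:", dqn_model.collect_policy)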
def main(_): # Environment env_name = "Breakout-v4" train_num_parallel_environments = 5 max_steps_per_episode = 1000 # Replay buffer replay_buffer_capacity = 50000 init_replay_buffer = 500 # Driver collect_steps_per_iteration = 1 * train_num_parallel_environments # Training train_batch_size = 32 train_iterations = 100000 train_summary_interval = 200 train_checkpoint_interval = 200 # Evaluation eval_num_parallel_environments = 5 eval_summary_interval = 500 eval_num_episodes = 20 # File paths path = pathlib.Path(__file__) parent_dir = path.parent.resolve() folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S") train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint") train_summary_dir = str(parent_dir / folder_name / "train_summary") eval_summary_dir = str(parent_dir / folder_name / "eval_summary") # Parallel training environment tf_env = TFPyEnvironment( ParallelPyEnvironment([ lambda: suite_atari.load( env_name, env_wrappers= [lambda env: TimeLimit(env, duration=max_steps_per_episode)], gym_env_wrappers=[AtariPreprocessing, FrameStack4], ) ] * train_num_parallel_environments)) tf_env.seed([42] * tf_env.batch_size) tf_env.reset() # Parallel evaluation environment eval_tf_env = TFPyEnvironment( ParallelPyEnvironment([ lambda: suite_atari.load( env_name, env_wrappers= [lambda env: TimeLimit(env, duration=max_steps_per_episode)], gym_env_wrappers=[AtariPreprocessing, FrameStack4], ) ] * eval_num_parallel_environments)) eval_tf_env.seed([42] * eval_tf_env.batch_size) eval_tf_env.reset() # Creating the Deep Q-Network preprocessing_layer = keras.layers.Lambda( lambda obs: tf.cast(obs, np.float32) / 255.) conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)] fc_layer_params = [512] q_net = QNetwork(tf_env.observation_spec(), tf_env.action_spec(), preprocessing_layers=preprocessing_layer, conv_layer_params=conv_layer_params, fc_layer_params=fc_layer_params) # Creating the DQN Agent optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0, epsilon=0.00001, centered=True) epsilon_fn = keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=1.0, # initial ε decay_steps=2500000, end_learning_rate=0.01) # final ε global_step = tf.compat.v1.train.get_or_create_global_step() agent = DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, optimizer=optimizer, target_update_period=200, td_errors_loss_fn=keras.losses.Huber(reduction="none"), gamma=0.99, # discount factor train_step_counter=global_step, epsilon_greedy=lambda: epsilon_fn(global_step)) agent.initialize() # Creating the Replay Buffer replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) # Observer: Replay Buffer Observer replay_buffer_observer = replay_buffer.add_batch # Observer: Training Metrics train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size), tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size), ] # Creating the Collect Driver collect_driver = DynamicStepDriver(tf_env, agent.collect_policy, observers=[replay_buffer_observer] + train_metrics, num_steps=collect_steps_per_iteration) # Initialize replay buffer initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()) init_driver = DynamicStepDriver( tf_env, initial_collect_policy, observers=[replay_buffer_observer, ShowProgress()], num_steps=init_replay_buffer) 
final_time_step, final_policy_state = init_driver.run() # Creating the Dataset dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size, num_steps=2, num_parallel_calls=3).prefetch(3) # Optimize by wrapping some of the code in a graph using TF function. collect_driver.run = function(collect_driver.run) agent.train = function(agent.train) print("\n\n++++++++++++++++++++++++++++++++++\n") # Create checkpoint train_checkpointer = Checkpointer( ckpt_dir=train_checkpoint_dir, max_to_keep=1, agent=agent, # replay_buffer=replay_buffer, global_step=global_step, # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics') ) # Restore checkpoint # train_checkpointer.initialize_or_restore() # Summary writers and metrics train_summary_writer = tf.summary.create_file_writer(train_summary_dir) eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir) eval_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes), tf_metrics.AverageEpisodeLengthMetric( batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes) ] # Create evaluate callback function eval_callback = evaluate(eval_metrics=eval_metrics, eval_tf_env=eval_tf_env, eval_policy=agent.policy, eval_num_episodes=eval_num_episodes, train_step=global_step, eval_summary_writer=eval_summary_writer) # Train agent train_agent(tf_env=tf_env, train_iterations=train_iterations, global_step=global_step, agent=agent, dataset=dataset, collect_driver=collect_driver, train_metrics=train_metrics, train_checkpointer=train_checkpointer, train_checkpoint_interval=train_checkpoint_interval, train_summary_writer=train_summary_writer, train_summary_interval=train_summary_interval, eval_summary_interval=eval_summary_interval, eval_callback=eval_callback) print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
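# `ShowProgress`, `evaluate` and `train_agent` are referenced in the script above but not shown in
# this excerpt. A minimal hedged sketch of the progress observer (counting non-boundary steps)
# might look like this; the helpers in the original script may differ.
class ShowProgress:
    def __init__(self, total=None):
        self.counter = 0
        self.total = total

    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 100 == 0:
            print(f"\r{self.counter}/{self.total}", end="")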
if __name__ == '__main__': # Create global step counter global_step = tf.compat.v1.train.get_or_create_global_step() # Create a dummy environment with no policy, just to extract the specs dummy_env = TFPyEnvironment(NineMensMorris(None, discount=DISCOUNT)) # Create Q Network q_net = QNetwork(input_tensor_spec=dummy_env.observation_spec(), action_spec=dummy_env.action_spec(), fc_layer_params=(100, 600, 600, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 600, 600), dropout_layer_params=(None, 0.1, 0.1, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.1, None)) # Create agent agent = DdqnAgent(time_step_spec=dummy_env.time_step_spec(), action_spec=dummy_env.action_spec(), q_network=q_net, optimizer=Adam(learning_rate=1e-4), td_errors_loss_fn=common.element_wise_squared_loss, epsilon_greedy=0.1, train_step_counter=global_step) # Initialize agent agent.initialize() # Wrap the training function in a TF graph agent.train = common.function(agent.train) # Create game environments: training and evaluation train_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT)) eval_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))
class TrainDDQN(): """Wrapper for DDQN training, validation, saving etc.""" def __init__(self, episodes: int, warmup_steps: int, learning_rate: float, gamma: float, min_epsilon: float, decay_episodes: int, model_path: str = None, log_dir: str = None, batch_size: int = 64, memory_length: int = None, collect_steps_per_episode: int = 1, val_every: int = None, target_update_period: int = 1, target_update_tau: float = 1.0, progressbar: bool = True, n_step_update: int = 1, gradient_clipping: float = 1.0, collect_every: int = 1) -> None: """ Wrapper to make training easier. Code is partly based of https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial :param episodes: Number of training episodes :type episodes: int :param warmup_steps: Number of episodes to fill Replay Buffer with random state-action pairs before training starts :type warmup_steps: int :param learning_rate: Learning Rate for the Adam Optimizer :type learning_rate: float :param gamma: Discount factor for the Q-values :type gamma: float :param min_epsilon: Lowest and final value for epsilon :type min_epsilon: float :param decay_episodes: Amount of episodes to decay from 1 to `min_epsilon` :type decay_episodes: int :param model_path: Location to save the trained model :type model_path: str :param log_dir: Location to save the logs, usefull for TensorBoard :type log_dir: str :param batch_size: Number of samples in minibatch to train on each step :type batch_size: int :param memory_length: Maximum size of the Replay Buffer :type memory_length: int :param collect_steps_per_episode: Amount of data to collect for Replay Buffer each episiode :type collect_steps_per_episode: int :param collect_every: Step interval to collect data during training :type collect_every: int :param val_every: Validate the model every X episodes using the `collect_metrics()` function :type val_every: int :param target_update_period: Update the target Q-network every X episodes :type target_update_period: int :param target_update_tau: Parameter for softening the `target_update_period` :type target_update_tau: float :param progressbar: Enable or disable the progressbar for collecting data and training :type progressbar: bool :return: None :rtype: NoneType """ self.episodes = episodes # Total episodes self.warmup_steps = warmup_steps # Amount of warmup steps before training self.batch_size = batch_size # Batch size of Replay Memory self.collect_steps_per_episode = collect_steps_per_episode # Amount of steps to collect data each episode self.collect_every = collect_every # Step interval to collect data during training self.learning_rate = learning_rate # Learning Rate self.gamma = gamma # Discount factor self.min_epsilon = min_epsilon # Minimal chance of choosing random action self.decay_episodes = decay_episodes # Number of episodes to decay from 1.0 to `EPSILON` self.target_update_period = target_update_period # Period for soft updates self.target_update_tau = target_update_tau self.progressbar = progressbar # Enable or disable the progressbar for collecting data and training self.n_step_update = n_step_update self.gradient_clipping = gradient_clipping # Clip the loss self.compiled = False NOW = datetime.now().strftime("%Y%m%d_%H%M%S") if memory_length is not None: self.memory_length = memory_length # Max Replay Memory length else: self.memory_length = warmup_steps if val_every is not None: self.val_every = val_every # Validate the policy every `val_every` episodes else: self.val_every = self.episodes // min( 50, self.episodes ) # Can't validate the model 50 
times if self.episodes < 50 if model_path is not None: self.model_path = model_path else: self.model_path = "./models/" + NOW + ".pkl" if log_dir is None: log_dir = "./logs/" + NOW self.writer = tf.summary.create_file_writer(log_dir) def compile_model(self, X_train, y_train, layers: list = [], imb_ratio: float = None, loss_fn=common.element_wise_squared_loss) -> None: """Initializes the neural networks, DDQN-agent, collect policies and replay buffer. :param X_train: Training data for the model. :type X_train: np.ndarray :param y_train: Labels corresponding to `X_train`. 1 for the positive class, 0 for the negative class. :param y_train: np.ndarray :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer. :type layers: list :param imb_ratio: The imbalance ratio of the data. :type imb_ratio: float :param loss_fn: Callable loss function :type loss_fn: tf.compat.v1.losses :return: None :rtype: NoneType """ if imb_ratio is None: imb_ratio = imbalance_ratio(y_train) self.train_env = TFPyEnvironment( ClassifierEnv(X_train, y_train, imb_ratio)) self.global_episode = tf.Variable( 0, name="global_episode", dtype=np.int64, trainable=False) # Global train episode counter # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339 epsilon_decay = tf.compat.v1.train.polynomial_decay( 1.0, self.global_episode, self.decay_episodes, end_learning_rate=self.min_epsilon) self.q_net = Sequential(layers, self.train_env.observation_spec()) self.agent = DdqnAgent( self.train_env.time_step_spec(), self.train_env.action_spec(), q_network=self.q_net, optimizer=Adam(learning_rate=self.learning_rate), td_errors_loss_fn=loss_fn, train_step_counter=self.global_episode, target_update_period=self.target_update_period, target_update_tau=self.target_update_tau, gamma=self.gamma, epsilon_greedy=epsilon_decay, n_step_update=self.n_step_update, gradient_clipping=self.gradient_clipping) self.agent.initialize() self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(), self.train_env.action_spec()) self.replay_buffer = TFUniformReplayBuffer( data_spec=self.agent.collect_data_spec, batch_size=self.train_env.batch_size, max_length=self.memory_length) self.warmup_driver = DynamicStepDriver( self.train_env, self.random_policy, observers=[self.replay_buffer.add_batch], num_steps=self.warmup_steps) # Uses a random policy self.collect_driver = DynamicStepDriver( self.train_env, self.agent.collect_policy, observers=[self.replay_buffer.add_batch], num_steps=self.collect_steps_per_episode ) # Uses the epsilon-greedy policy of the agent self.agent.train = common.function(self.agent.train) # Optimalization self.warmup_driver.run = common.function(self.warmup_driver.run) self.collect_driver.run = common.function(self.collect_driver.run) self.compiled = True def train(self, *args) -> None: """Starts the training of the model. Includes warmup period, metrics collection and model saving. :param *args: All arguments will be passed to `collect_metrics()`. This can be usefull to pass callables, testing environments or validation data. Overwrite the TrainDDQN.collect_metrics() function to use your own *args. :type *args: Any :return: None :rtype: NoneType, last step is saving the model as a side-effect """ assert self.compiled, "Model must be compiled with model.compile_model(X_train, y_train, layers) before training." # Warmup period, fill memory with random actions if self.progressbar: print( f"\033[92mCollecting data for {self.warmup_steps:_} steps... 
This might take a few minutes...\033[0m" ) self.warmup_driver.run( time_step=None, policy_state=self.random_policy.get_initial_state( self.train_env.batch_size)) if self.progressbar: print( f"\033[92m{self.replay_buffer.num_frames():_} frames collected!\033[0m" ) dataset = self.replay_buffer.as_dataset( sample_batch_size=self.batch_size, num_steps=self.n_step_update + 1, num_parallel_calls=data.experimental.AUTOTUNE).prefetch( data.experimental.AUTOTUNE) iterator = iter(dataset) def _train(): experiences, _ = next(iterator) return self.agent.train(experiences).loss _train = common.function(_train) # Optimalization ts = None policy_state = self.agent.collect_policy.get_initial_state( self.train_env.batch_size) self.collect_metrics(*args) # Initial collection for step 0 pbar = tqdm(total=self.episodes, disable=(not self.progressbar), desc="Training the DDQN") # TQDM progressbar for _ in range(self.episodes): if not self.global_episode % self.collect_every: # Collect a few steps using collect_policy and save to `replay_buffer` if self.collect_steps_per_episode != 0: ts, policy_state = self.collect_driver.run( time_step=ts, policy_state=policy_state) pbar.update( self.collect_every ) # More stable TQDM updates, collecting could take some time # Sample a batch of data from `replay_buffer` and update the agent's network train_loss = _train() if not self.global_episode % self.val_every: with self.writer.as_default(): tf.summary.scalar("train_loss", train_loss, step=self.global_episode) self.collect_metrics(*args) pbar.close() def collect_metrics(self, X_val: np.ndarray, y_val: np.ndarray, save_best: str = None): """Collects metrics using the trained Q-network. :param X_val: Features of validation data, same shape as X_train :type X_val: np.ndarray :param y_val: Labels of validation data, same shape as y_train :type y_val: np.ndarray :param save_best: Saving the best model of all validation runs based on given metric: Choose one of: {Gmean, F1, Precision, Recall, TP, TN, FP, FN} This improves stability since the model at the last episode is not guaranteed to be the best model. :type save_best: str """ y_pred = network_predictions(self.agent._target_q_network, X_val) stats = classification_metrics(y_val, y_pred) avgQ = np.mean(decision_function(self.agent._target_q_network, X_val)) # Max action for each x in X if save_best is not None: if not hasattr(self, "best_score"): # If no best model yet self.best_score = 0.0 if stats.get(save_best) >= self.best_score: # Overwrite best model self.save_network( ) # Saving directly to avoid shallow copy without trained weights self.best_score = stats.get(save_best) with self.writer.as_default(): tf.summary.scalar( "AverageQ", avgQ, step=self.global_episode) # Average Q-value for this epoch for k, v in stats.items(): tf.summary.scalar(k, v, step=self.global_episode) def evaluate(self, X_test, y_test, X_train=None, y_train=None): """ Final evaluation of trained Q-network with X_test and y_test. Optional PR and ROC curve comparison to X_train, y_train to ensure no overfitting is taking place. 
:param X_test: Features of test data, same shape as X_train :type X_test: np.ndarray :param y_test: Labels of test data, same shape as y_train :type y_test: np.ndarray :param X_train: Features of train data :type X_train: np.ndarray :param y_train: Labels of train data :type y_train: np.ndarray """ if hasattr(self, "best_score"): print(f"\033[92mBest score: {self.best_score:6f}!\033[0m") network = self.load_network( self.model_path) # Load best saved model else: network = self.agent._target_q_network # Load latest target model if (X_train is not None) and (y_train is not None): plot_pr_curve(network, X_test, y_test, X_train, y_train) plot_roc_curve(network, X_test, y_test, X_train, y_train) y_pred = network_predictions(network, X_test) return classification_metrics(y_test, y_pred) def save_network(self): """Saves Q-network as pickle to `model_path`.""" with open(self.model_path, "wb") as f: # Save Q-network as pickle pickle.dump(self.agent._target_q_network, f) @staticmethod def load_network(fp: str): """Static method to load Q-network pickle from given filepath. :param fp: Filepath to the saved pickle of the network :type fp: str :returns: The network-object loaded from a pickle file. :rtype: tensorflow.keras.models.Model """ with open(fp, "rb") as f: # Load the Q-network network = pickle.load(f) return network
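# Hedged usage sketch of the TrainDDQN wrapper defined above; the toy data, Keras layer list and
# hyperparameters are placeholders chosen only to show the expected call order
# (constructor -> compile_model -> train -> evaluate).
from tensorflow.keras.layers import Dense

X_train = np.random.rand(5000, 30).astype(np.float32)
y_train = np.random.binomial(1, 0.05, size=5000)
X_test = np.random.rand(1000, 30).astype(np.float32)
y_test = np.random.binomial(1, 0.05, size=1000)

trainer = TrainDDQN(episodes=2000, warmup_steps=1000, learning_rate=1e-3, gamma=0.99,
                    min_epsilon=0.05, decay_episodes=500, batch_size=64, val_every=100)
trainer.compile_model(X_train, y_train,
                      layers=[Dense(64, activation="relu"),
                              Dense(32, activation="relu"),
                              Dense(2, activation=None)])
trainer.train(X_test, y_test)  # *args are forwarded to collect_metrics(X_val, y_val)
print(trainer.evaluate(X_test, y_test))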
initial_learning_rate=0.9, decay_steps=total_steps, end_learning_rate=0.001, ) # 3. Constructing the DQN Agent. optimizer = Yogi(learning_rate=0.00025) loss = Huber() n_steps = 3 tau = 0.001 gamma = 0.99 min_q = -200 max_q = 200 agent = CategoricalDqnAgent( time_step_spec=eval_env.time_step_spec(), action_spec=eval_env.action_spec(), categorical_q_network=online_q_net, optimizer=optimizer, min_q_value=min_q, max_q_value=max_q, epsilon_greedy=lambda: decay_epsilon_greedy(train_step), n_step_update=n_steps, target_categorical_q_network=target_q_net, target_update_tau=tau, target_update_period=1, td_errors_loss_fn=loss, gamma=gamma, train_step_counter=train_step) agent.initialize()
def create_reinforce_agent(
        env: TFPyEnvironment,
        gamma: float = 0.99,
        agent_name: str = 'reinforce_agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> ReinforceAgent:
    """
    Function for creating a REINFORCE agent in line with the TensorFlow Agents implementation.
    This function builds an actor network and uses it to instantiate the agent, which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the REINFORCE agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents REINFORCE agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.

    # First attain the action spec.
    action_spec = env.action_spec()
    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    action_subspace_dimensions = tuple(int(subspace.shape[-1]) for subspace in action_spec)
    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the actor network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64,)))

    # Set up the REINFORCE agent in line with standard tf_agents.
    agent = ReinforceAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_network=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(),
        value_network=agent_params.get('value_network', None),
        value_estimation_loss_coef=agent_params.get('value_estimation_loss_coef', 0.2),
        advantage_fn=agent_params.get('advantage_fn', None),
        use_advantage_loss=agent_params.get('use_advantage_loss', True),
        gamma=gamma,
        normalize_returns=agent_params.get('normalize_returns', True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        entropy_regularization=agent_params.get('entropy_regularization', None),
        train_step_counter=training_step_counter,
        name=agent_name)
    return agent
def create_bellman_pets_agent(
        env: TFPyEnvironment,
        agent_name: str = 'PETS_Agent',
        debug: bool = False,  # REQUIRED?
        reward_model_class: RewardModel = None,
        initial_state_distribution_model_class: InitialStateDistributionModel = None,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PetsAgent:
    """
    Function for creating a Bellman PETS agent in line with the Bellman implementation.
    This function builds the model components and uses them to instantiate the agent, which is
    returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PETS agent.
    :param reward_model_class: CRWRewardModel, dummy variable, currently extracted from env.
    :param initial_state_distribution_model_class: CRWStateInitialiser, dummy variable, currently
        extracted from env.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of Bellman PETS agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    # action_spec = env.action_spec()
    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    # action_subspace_dimensions = tuple(int(subspace.shape[-1]) for subspace in action_spec)
    # Then test if there is only one action subspace.
    # if len(action_spec) == 1:
    #     # Pull out the only action spec.
    #     action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]

    # Initializing the given MDP components.
    # NOTE: hacked for the time being, using quantities directly from the environment for now.
    reward_model = reward_model_class(env.observation_spec(), env.action_spec(), env)
    initial_state_distribution_model = initial_state_distribution_model_class(env)

    # Set up the PETS agent in line with the Bellman toolbox.
    agent = PetsAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        transition_model_type=agent_params.get('transition_model_type',
                                               TransitionModelType.DeterministicEnsemble),
        num_hidden_layers=agent_params.get('num_hidden_layers', 3),
        num_hidden_nodes=agent_params.get('num_hidden_nodes', 250),
        activation_function=agent_params.get('activation_function', tf.nn.relu),
        ensemble_size=agent_params.get('ensemble_size', 5),
        predict_state_difference=agent_params.get('predict_state_difference', True),
        epochs=agent_params.get('epochs', 100),
        training_batch_size=agent_params.get('training_batch_size', 32),
        callbacks=agent_params.get('callbacks', callbacks),
        reward_model=reward_model,
        initial_state_distribution_model=initial_state_distribution_model,
        trajectory_sampler_type=agent_params.get('trajectory_sampler_type',
                                                 TrajectorySamplerType.TS1),
        trajectory_optimization_type=agent_params.get('trajectory_optimization_type',
                                                      TrajectoryOptimizationType.RandomShooting),
        horizon=agent_params.get('horizon', 25),
        population_size=agent_params.get('population_size', 2500),
        number_of_particles=agent_params.get('number_of_particles', 1),
        num_elites=agent_params.get('num_elites', 40),
        learning_rate=agent_params.get('learning_rate', 0.9),
        max_iterations=agent_params.get('max_iterations', 5),
        train_step_counter=training_step_counter,
    )
    return agent
def create_ppo_agent(
        env: TFPyEnvironment,
        num_epochs: int = 10,
        gamma: float = 0.99,
        agent_name: str = 'PPO_Agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PPOAgent:
    """
    Function for creating a Proximal Policy Optimisation agent in line with the TensorFlow Agents
    implementation. This function builds an actor network and a value network and uses them to
    instantiate the agent, which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param num_epochs: Number of epochs for computing policy updates.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PPO agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents PPO agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.

    # First attain the action spec.
    action_spec = env.action_spec()
    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    action_subspace_dimensions = tuple(int(subspace.shape[-1]) for subspace in action_spec)
    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the actor network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64,)))

    # PPO requires a value network; we set one up using the default tf_agents set up.
    value_network = tf_agents.networks.value_network.ValueNetwork(
        env.observation_spec(),
        fc_layer_params=agent_params.get('value_fc_layer_params', (128, 64)),
        activation_fn=agent_params.get('value_net_activation_fn', tf.nn.tanh))

    # Set up the PPO agent in line with standard tf_agents.
    agent = PPOAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_net=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(agent_params.get('learning_rate', 0.001)),
        value_net=value_network,
        importance_ratio_clipping=agent_params.get('importance_ratio_clipping', 0.0),
        lambda_value=agent_params.get('lambda_value', 0.95),
        discount_factor=gamma,
        policy_l2_reg=agent_params.get('policy_l2_reg', 0.0),
        value_function_l2_reg=agent_params.get('value_function_l2_reg', 0.0),
        value_pred_loss_coef=agent_params.get('value_pred_loss_coef', 0.5),
        num_epochs=num_epochs,
        use_gae=agent_params.get('use_gae', False),
        use_td_lambda_return=agent_params.get('use_td_lambda_return', False),
        normalize_rewards=agent_params.get('normalise_rewards', True),
        reward_norm_clipping=agent_params.get('reward_norm_clipping', 10),
        kl_cutoff_factor=agent_params.get('kl_cutoff_factor', 2.0),
        kl_cutoff_coef=agent_params.get('kl_cutoff_coef', 1000),
        initial_adaptive_kl_beta=agent_params.get('initial_adaptive_kl_beta', 1.0),
        adaptive_kl_target=agent_params.get('adaptive_kl_target', 0.01),
        adaptive_kl_tolerance=agent_params.get('adaptive_kl_tolerance', 0.3),
        normalize_observations=agent_params.get('normalize_observations', True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        check_numerics=agent_params.get('check_numerics', False),
        entropy_regularization=agent_params.get('entropy_regularization', 0.0),
        train_step_counter=training_step_counter,
        name=agent_name)
    return agent
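# Hedged usage sketch: the factories above are written against a ControlledRandomWalk environment,
# so `controlled_random_walk_py_env` below stands in for such an environment and is purely an
# assumption made for illustration.
crw_env = TFPyEnvironment(controlled_random_walk_py_env)
ppo = create_ppo_agent(crw_env, num_epochs=10, gamma=0.99,
                       agent_params={'use_gae': True, 'learning_rate': 3e-4})
ppo.initialize()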
target_update_period = 5
learning_rate = 1e-3
n_step_update = 1
gamma = 0.99
gradient_clipping = None
reward_scale_factor = 1.0
debug_summaries = False
summarize_grads_and_vars = False

q_net = q_network.QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    fc_layer_params=fc_layer_params
)

tf_agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    epsilon_greedy=epsilon_greedy,
    n_step_update=n_step_update,
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    debug_summaries=debug_summaries,
    summarize_grads_and_vars=summarize_grads_and_vars,
    train_step_counter=global_step,
)
# epsilon=0.00001, centered=True) train_step = tf.Variable(0) update_period = 4 # run a training step every 4 collect steps optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4, decay=0.95, momentum=0.0, epsilon=0.00001, centered=True) epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=1.0, # initial ε decay_steps=250000 // update_period, # <=> 1,000,000 ALE frames end_learning_rate=0.01) # final ε agent = DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, optimizer=optimizer, target_update_period=2000, # <=> 32,000 ALE frames td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"), gamma=0.99, # discount factor train_step_counter=train_step, epsilon_greedy=lambda: epsilon_fn(train_step)) agent.initialize() replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=1000000)
    [create_env] * 4
)
train_env = TFPyEnvironment(parallel_env)
# train_env = TFPyEnvironment(suite_gym.load(env_name))
eval_env = TFPyEnvironment(suite_gym.load(env_name))

fc_layer_params = (100,)
q_net = QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params
)

train_step_counter = tf.Variable(0)
agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=Adam(learning_rate=LEARNING_RATE),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter
)
agent.initialize()

random_policy = RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec()
)


def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0
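    # The remainder of compute_avg_return is not part of this excerpt; the hedged completion below
    # follows the standard TF-Agents DQN tutorial pattern and is an assumption, not the original.
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]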
# print(trajectory.reward)
prev_lives = tf_env.pyenv.envs[0].ale.lives()


def reset_and_fire_on_life_lost(trajectory):
    global prev_lives
    lives = tf_env.pyenv.envs[0].ale.lives()
    if prev_lives != lives:
        # tf_env.reset()
        tf_env.pyenv.envs[0].step(np.array(1, dtype=np.int32))
        prev_lives = lives


policy_num = sys.argv[1]
# print(type(agent))
saved_policy = tf.compat.v2.saved_model.load(f'policy_{policy_num}')
saved_policy.time_step_spec = tf_env.time_step_spec()
saved_policy.action_spec = tf_env.action_spec()
saved_policy.policy_state_spec = ()  # tf_env.policy_state_spec
saved_policy.info_spec = ()
saved_policy.emit_log_probability = True
saved_policy = EpsilonGreedyPolicy(saved_policy, epsilon=0.005)
# saved_policy = tf_agents.policies.gaussian_policy.GaussianPolicy(saved_policy)
# agent = tf.saved_model.load('policy_100')
# agent = tf.keras.models.load_model('policy_100')
# policy = tf.saved_model.load('')
# print(type(agent))
tf_env.pyenv.envs[0].step(np.array(1, dtype=np.int32))
def test_all_mepo_variants_work(transition_model, trajectory_sampler, model_free_agent_type): """ Mepo Agent has prespecified transition model, trajectory sampler and model-free agent types. Here we check that all combinations execute without errors. """ # setup the environment and a prespecified model components py_env = suite_gym.load("MountainCarContinuous-v0") tf_env = TFPyEnvironment(py_env) time_step_spec = tf_env.time_step_spec() observation_spec = tf_env.observation_spec() action_spec = tf_env.action_spec() reward_model = MountainCarReward(observation_spec, action_spec) initial_state_distribution_model = MountainCarInitialState( observation_spec) # some parameters need to be set correctly ensemble_size = 2 num_elites = 10 population_size = num_elites + 10 horizon = 1 # define agent, many transition model and trajectory optimiser parameters can # be arbitrary agent = MepoAgent( time_step_spec, action_spec, transition_model, 1, 10, tf.nn.relu, ensemble_size, False, 1, 1, [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)], reward_model, initial_state_distribution_model, trajectory_sampler, horizon, population_size, model_free_agent_type, 1, 10, tf.nn.relu, 2, ) # we need some training data random_policy = RandomTFPolicy( time_step_spec, action_spec, info_spec=agent.collect_policy.info_spec, ) model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( random_policy.trajectory_spec, batch_size=1, max_length=1000) collect_driver_random_policy = TFDriver( tf_env, random_policy, observers=[model_training_buffer.add_batch], max_steps=10, disable_tf_function=True, ) initial_time_step = tf_env.reset() collect_driver_random_policy.run(initial_time_step) pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10) tf_training_scheduler = pets_agent_trainer.create_training_scheduler( agent, model_training_buffer) training_losses = tf_training_scheduler.maybe_train( tf.constant(10, dtype=tf.int64)) assert EnvironmentModelComponents.TRANSITION in training_losses # test the agent collect_driver_planning_policy = TFDriver( tf_env, agent.collect_policy, observers=[model_training_buffer.add_batch], max_steps=10, disable_tf_function=True, ) time_step = tf_env.reset() collect_driver_planning_policy.run(time_step)
def breakout_v4(seed=42): env = suite_gym.load("Breakout-v4") env.seed(seed) env.reset() repeating_env = ActionRepeat(env, times=4) for name in dir(tf_agents.environments.wrappers): obj = getattr(tf_agents.environments.wrappers, name) if hasattr(obj, "__base__") and issubclass( obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper): print("{:27s} {}".format(name, obj.__doc__.split("\n")[0])) limited_repeating_env = suite_gym.load( "Breakout-v4", gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)], env_wrappers=[partial(ActionRepeat, times=4)], ) max_episode_steps = 27000 # <=> 108k ALE frames since 1 step = 4 frames environment_name = "BreakoutNoFrameskip-v4" env = suite_atari.load( environment_name, max_episode_steps=max_episode_steps, gym_env_wrappers=[AtariPreprocessing, FrameStack4], ) env.seed(42) env.reset() time_step = env.step(np.array(1)) # FIRE for _ in range(4): time_step = env.step(np.array(3)) # LEFT def plot_observation(obs): # Since there are only 3 color channels, you cannot display 4 frames # with one primary color per frame. So this code computes the delta between # the current frame and the mean of the other frames, and it adds this delta # to the red and blue channels to get a pink color for the current frame. obs = obs.astype(np.float32) img_ = obs[..., :3] current_frame_delta = np.maximum( obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0) img_[..., 0] += current_frame_delta img_[..., 2] += current_frame_delta img_ = np.clip(img_ / 150, 0, 1) plt.imshow(img_) plt.axis("off") plt.figure(figsize=(6, 6)) plot_observation(time_step.observation) plt.tight_layout() plt.savefig("./images/preprocessed_breakout_plot.png", format="png", dpi=300) plt.show() tf_env = TFPyEnvironment(env) preprocessing_layer = keras.layers.Lambda( lambda obs: tf.cast(obs, np.float32) / 255.0) conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)] fc_layer_params = [512] q_net = QNetwork( tf_env.observation_spec(), tf_env.action_spec(), preprocessing_layers=preprocessing_layer, conv_layer_params=conv_layer_params, fc_layer_params=fc_layer_params, ) # see TF-agents issue #113 # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0, # epsilon=0.00001, centered=True) train_step = tf.Variable(0) update_period = 4 # run a training step every 4 collect steps optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4, decay=0.95, momentum=0.0, epsilon=0.00001, centered=True) epsilon_fn = keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=1.0, # initial ε decay_steps=250000 // update_period, # <=> 1,000,000 ALE frames end_learning_rate=0.01, ) # final ε agent = DqnAgent( tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net, optimizer=optimizer, target_update_period=2000, # <=> 32,000 ALE frames td_errors_loss_fn=keras.losses.Huber(reduction="none"), gamma=0.99, # discount factor train_step_counter=train_step, epsilon_greedy=lambda: epsilon_fn(train_step), ) agent.initialize() from tf_agents.replay_buffers import tf_uniform_replay_buffer replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=1000000) replay_buffer_observer = replay_buffer.add_batch class ShowProgress: def __init__(self, total): self.counter = 0 self.total = total def __call__(self, trajectory): if not trajectory.is_boundary(): self.counter += 1 if self.counter % 100 == 0: print("\r{}/{}".format(self.counter, self.total), end="") from tf_agents.metrics import tf_metrics 
train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(), tf_metrics.AverageEpisodeLengthMetric(), ] from tf_agents.eval.metric_utils import log_metrics import logging logging.getLogger().setLevel(logging.INFO) log_metrics(train_metrics) from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver collect_driver = DynamicStepDriver( tf_env, agent.collect_policy, observers=[replay_buffer_observer] + train_metrics, num_steps=update_period, ) # collect 4 steps for each training iteration from tf_agents.policies.random_tf_policy import RandomTFPolicy initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()) init_driver = DynamicStepDriver( tf_env, initial_collect_policy, observers=[replay_buffer.add_batch, ShowProgress(20000)], num_steps=20000, ) # <=> 80,000 ALE frames final_time_step, final_policy_state = init_driver.run()
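# Hedged continuation sketch: build the training dataset and a small loop that alternates
# collection and gradient updates for the driver/agent defined above. The batch size and iteration
# count are arbitrary; the original script may structure this differently.
from tf_agents.utils.common import function

dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,
                                   num_parallel_calls=3).prefetch(3)

collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)


def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss: {:.5f}".format(iteration, train_loss.loss.numpy()), end="")
        if iteration % 1000 == 0:
            log_metrics(train_metrics)


train_agent(10_000)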
def train_eval( # tensorboard files root_dir, # environment env_name="Pendulum-v0", random_seed=0, # Params for collect num_environment_steps=100000, replay_buffer_capacity=1001, # Per-environment # Params for eval num_eval_episodes=30, eval_interval=200, # Params for summaries summary_interval=50, ): tf.compat.v1.set_random_seed(random_seed) environment = TFPyEnvironment(suite_gym.load(env_name)) evaluation_environment = TFPyEnvironment(suite_gym.load(env_name)) critic_network = CriticNetwork( input_tensor_spec=(environment.observation_spec(), environment.action_spec()), observation_fc_layer_params=None, action_fc_layer_params=None, joint_fc_layer_params=(200, 100), ) actor_network = ActorNetwork( input_tensor_spec=environment.observation_spec(), output_tensor_spec=environment.action_spec(), fc_layer_params=(200, 100), ) global_step = tf.compat.v1.train.get_or_create_global_step() agent = DdpgAgent( time_step_spec=environment.time_step_spec(), action_spec=environment.action_spec(), critic_network=critic_network, actor_network=actor_network, actor_optimizer=tf.compat.v1.train.AdamOptimizer(), critic_optimizer=tf.compat.v1.train.AdamOptimizer(), train_step_counter=global_step, ) agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256) experiment_harness = ExperimentHarness( root_dir, environment, evaluation_environment, agent, agent_trainer, replay_buffer_capacity, num_environment_steps, summary_interval, eval_interval, num_eval_episodes, number_of_initial_random_policy_steps=0, use_tf_function=True, ) experiment_harness.run()
# Create a global step
global_step = tf.compat.v1.train.get_or_create_global_step()

# Create the actor network (with the normal distribution)
actor_net = ActorDistributionNetwork(
    input_tensor_spec=train_env.observation_spec(),
    output_tensor_spec=train_env.action_spec(),
    fc_layer_params=(128, 256, 512, 512, 256),
    continuous_projection_net=normal_net)

# Create the value network
value_net = ValueNetwork(input_tensor_spec=train_env.observation_spec(),
                         fc_layer_params=(256, 512, 512))

# Create the PPO agent
ppo_agent = PPOClipAgent(time_step_spec=train_env.time_step_spec(),
                         action_spec=train_env.action_spec(),
                         optimizer=Adam(learning_rate=5e-4),
                         actor_net=actor_net,
                         value_net=value_net,
                         importance_ratio_clipping=0.2,
                         discount_factor=0.95,
                         entropy_regularization=0.0,
                         num_epochs=16,
                         use_gae=True,
                         use_td_lambda_return=True,
                         log_prob_clipping=3,
                         gradient_clipping=0.5,
                         train_step_counter=global_step)

# Initialize the agent
ppo_agent.initialize()
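# Hedged continuation sketch: one way to collect on-policy episodes for the PPO agent above and
# run a single train step. The buffer capacity and episode count are assumptions made only for
# illustration.
replay_buffer = TFUniformReplayBuffer(
    data_spec=ppo_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=10_000)

collect_driver = DynamicEpisodeDriver(
    train_env,
    ppo_agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=4)

collect_driver.run()
trajectories = replay_buffer.gather_all()  # PPO trains on whole trajectories
train_loss = ppo_agent.train(experience=trajectories).loss
replay_buffer.clear()  # on-policy: discard the data after the update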