def dyke_dqn_agent(env: TFPyEnvironment, layers: Optional[List[Layer]] = None) -> DqnAgent:
    """
    Prepares a deep Q-network (DQN) agent for use in the dyke maintenance environment.

    :param env: The dyke environment on which to base the DQN agent.
    :param layers: Optional. A list of layers to supply to the DQN agent's network.
    :return: The agent.
    """
    layers = fully_connected_dyke_dqn_agent_network(sizes=(100, 50)) if layers is None else layers
    # prepare the Q-values layer
    action_as: BoundedArraySpec = from_spec(env.action_spec())
    number_actions: int = int(action_as.maximum - action_as.minimum + 1)
    q_values_layer: Layer = Dense(
        units=number_actions,
        activation=None,
        kernel_initializer=RandomUniform(minval=-3e-3, maxval=3e-3),
        bias_initializer=Constant(-2e-1))
    net = Sequential(layers=layers + [q_values_layer])
    # instantiate and return the agent
    optimizer = Adam(learning_rate=1e-3)
    train_step_counter = Variable(initial_value=0)
    return DqnAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        q_network=net,
        optimizer=optimizer,
        epsilon_greedy=0.1,
        td_errors_loss_fn=element_wise_squared_loss,
        train_step_counter=train_step_counter)
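# Hedged usage sketch (not part of the original source): `make_dyke_py_environment`
# is a hypothetical factory returning the dyke maintenance PyEnvironment. The agent
# returned by `dyke_dqn_agent` still needs to be initialized before use.
train_env = TFPyEnvironment(make_dyke_py_environment())  # hypothetical constructor
agent = dyke_dqn_agent(env=train_env)
agent.initialize()
agent.train = common.function(agent.train)  # optional: wrap train() in a tf.function for speed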
def create_dqn_agent(env, q_net):
    # see TF-Agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                      epsilon=0.00001, centered=True)
    train_step = tf.Variable(0)
    update_period = config.UPDATE_PERIOD  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01)  # final ε
    agent = DqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step))
    return agent
def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float,
            gamma: float, imb_ratio: float, replay_buffer_max_length: int, layers: dict) -> None:
    """
    Create the Q-network, agent and policy.

    Args:
        X_train: A np.ndarray of training samples.
        y_train: A np.ndarray of class labels for the training samples.
        lr: Learning rate for the optimizer (default Adam).
        epsilon: Used by the default epsilon-greedy policy when choosing a random action.
        gamma: The discount factor for learning Q-values.
        imb_ratio: Imbalance ratio. Used to specify the reward in the environment.
        replay_buffer_max_length: Maximum length of the replay memory.
        layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
    """
    dense_layers = layers.get("dense")
    conv_layers = layers.get("conv")
    dropout_layers = layers.get("dropout")

    self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

    q_net = QNetwork(self.train_env.observation_spec(),
                     self.train_env.action_spec(),
                     conv_layer_params=conv_layers,
                     fc_layer_params=dense_layers,
                     dropout_layer_params=dropout_layers)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
    train_step_counter = tf.Variable(0)

    self.agent = DqnAgent(
        self.train_env.time_step_spec(),
        self.train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        gamma=gamma,
        epsilon_greedy=epsilon,
    )
    self.agent.initialize()

    self.replay_buffer = TFUniformReplayBuffer(
        data_spec=self.agent.collect_data_spec,
        batch_size=self.train_env.batch_size,
        max_length=replay_buffer_max_length)
def train_dyke_agent(train_env: TFPyEnvironment,
                     eval_env: TFPyEnvironment,
                     agent: DqnAgent,
                     train_steps: int,
                     steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
    Trains the DQN agent on the dyke maintenance task.

    :param train_env: The training environment.
    :param eval_env: The environment for testing agent performance.
    :param agent: The agent.
    :param train_steps: The number of training steps to use.
    :param steps_per_episode: The number of time steps that can be taken in a single dyke environment episode.
    :param eval_episodes: The number of episodes to use per evaluation.
    :return: A mapping to various metrics pertaining to the training's results.
    """
    losses: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    evaluations: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    train_metrics: Tuple = (AverageReturnMetric,)
    train_metric_results: np.ndarray = np.zeros(shape=(len(train_metrics), train_steps, steps_per_episode))
    for step in range(train_steps):
        # we uniformly sample experiences (single time steps) from one episode per train step
        print('STEP %d/%d' % (step + 1, train_steps))
        train_env.reset()
        rep_buf = _dyke_replay_buffer(train_env, agent, steps_per_episode)
        train_metric_inst: Tuple = tuple([metric() for metric in train_metrics])  # instantiate the metrics
        obs: Tuple = (rep_buf.add_batch,) + train_metric_inst
        _ = DynamicStepDriver(
            env=train_env,
            policy=agent.collect_policy,
            observers=obs,
            num_steps=steps_per_episode).run()  # experience a single episode using the agent's current configuration
        dataset: tf.data.Dataset = rep_buf.as_dataset(
            sample_batch_size=_REP_BUF_BATCH_SIZE,
            num_steps=_REP_BUF_NUM_STEPS)
        iterator = iter(dataset)
        for tr in range(steps_per_episode):
            trajectories, _ = next(iterator)
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent, eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results
    }
def create_pong_agent(train_environment: TFEnvironment,
                      dense_layer_sizes: Sequence[int],
                      learning_rate: float) -> DqnAgent:
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    q_net = QNetwork(input_tensor_spec=train_environment.observation_spec(),
                     action_spec=train_environment.action_spec(),
                     fc_layer_params=dense_layer_sizes)
    agent = DqnAgent(time_step_spec=train_environment.time_step_spec(),
                     action_spec=train_environment.action_spec(),
                     q_network=q_net,
                     optimizer=optimizer,
                     td_errors_loss_fn=element_wise_squared_loss,
                     train_step_counter=global_step)
    agent.initialize()
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)
    return agent
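# Hedged usage sketch (not part of the original source): a RAM-observation Pong
# environment is assumed so that a purely fully connected QNetwork is appropriate;
# the layer sizes and learning rate are illustrative values only.
pong_env = TFPyEnvironment(suite_gym.load("Pong-ram-v4"))
pong_agent = create_pong_agent(train_environment=pong_env,
                               dense_layer_sizes=(128, 64),
                               learning_rate=1e-3)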
def build_dqn_agent(self):
    """Build DQN agent with QNetwork."""
    temp_env = self.build_temp_env()

    q_net = q_network.QNetwork(
        temp_env.observation_spec(),
        temp_env.action_spec(),
        fc_layer_params=self.fc_layer_params,
        dropout_layer_params=self.dropout_layer_params)

    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    agent = DqnAgent(
        temp_env.time_step_spec(),
        temp_env.action_spec(),
        n_step_update=self.n_step_update,
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=self.epsilon_greedy,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0, dtype=tf.int64))

    return q_net, agent
def train_model(self):
    train_step = tf.Variable(0)  # counts the number of training steps
    update_period = 4  # train the model every 4 steps
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01)  # final ε
    agent = DqnAgent(
        self.tf_env.time_step_spec(),
        self.tf_env.action_spec(),
        q_network=self.q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step))
    return agent
class DQNAgent:

    def __init__(self) -> None:
        """
        A class for training a TF-Agent, based on
        https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """
        self.train_env = None  # Training environment
        self.agent = None  # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None  # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None  # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float,
                gamma: float, imb_ratio: float, replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy.

        Args:
            X_train: A np.ndarray of training samples.
            y_train: A np.ndarray of class labels for the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used by the default epsilon-greedy policy when choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Imbalance ratio. Used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
        """
        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(),
                         self.train_env.action_spec(),
                         conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers,
                         dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )
        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int,
            eval_step: int, log_step: int, collect_steps_per_episode: int) -> None:
        """
        Starts the training of the agent.

        Args:
            X_train: A np.ndarray of training samples.
            y_train: A np.ndarray of class labels for the training samples.
            epochs: Number of epochs to train the agent.
            batch_size: The batch size.
            eval_step: Evaluate the model every `eval_step` steps.
            log_step: Log the results of the model every `log_step` steps.
            collect_steps_per_episode: Number of steps to collect with collect_policy and save to the replay buffer.
        """
        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)
            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using tf.function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            # print("epoch: ", _)

            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer,
                         collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute metrics for evaluation."""
        # TODO: apply a softmax layer to the Q-logits?
        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class labels

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """Evaluation of the trained Q-network."""
        metrics = self.compute_metrics(X, y)
        print("evaluation: ", metrics)

        return metrics
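# Hedged usage sketch (not part of the original source): `X_train`/`y_train` and
# `X_test`/`y_test` stand for an imbalanced classification dataset; the layer
# configuration and hyperparameters below are illustrative values only.
model = DQNAgent()
model.compile(X_train, y_train,
              lr=1e-3, epsilon=0.1, gamma=0.9, imb_ratio=0.1,
              replay_buffer_max_length=10000,
              layers={"dense": (64, 32), "conv": None, "dropout": None})
model.fit(X_train, y_train,
          epochs=1000, batch_size=64,
          eval_step=200, log_step=100,
          collect_steps_per_episode=50)
model.evaluate(X_test, y_test)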
train_step = tf.Variable(0)

# Create optimizer
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay,
                                                momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon,
                                                centered=True)

# Computes epsilon for the epsilon-greedy policy given the training step
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.01,  # initial ε
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε

agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=target_update_period,
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=discount_factor,  # discount factor
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))

agent.policy = tf.compat.v2.saved_model.load('../DATA/policy_{}'.format(II))
agent.initialize()

# Speed up as a TensorFlow function
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
update_period = 4

optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=0.00001,
                                     centered=True)

print("Before Epsilon function")
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000,
    end_learning_rate=0.01)

print("Before Agent")
agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000,
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99,
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

if policy is not None:
    agent.policy = policy

print("After Agent.initialize()")

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=100000)
def _init_qagent(self, optimizer, q_net, global_step):
    args = self._get_agent_args(optimizer, q_net, global_step)
    return DqnAgent(**args)
hidden_layers = (100,)

dqn_network = QNetwork(train_env.observation_spec(),
                       train_env.action_spec(),
                       fc_layer_params=hidden_layers)
ddqn_network = QNetwork(train_env.observation_spec(),
                        train_env.action_spec(),
                        fc_layer_params=hidden_layers)

# give each agent its own training-step counter so their step counts do not interleave
dqn_counter = tf.Variable(0)
ddqn_counter = tf.Variable(0)

dqn_agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=dqn_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=dqn_counter)

ddqn_agent = DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=ddqn_network,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=ddqn_counter)

dqn_agent.initialize()
ddqn_agent.initialize()
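# Hedged sketch (not part of the original source): one way to compare the two agents
# is to train them on the same uniformly sampled experience; `iterator` is assumed to
# come from a TFUniformReplayBuffer's as_dataset() as in the surrounding snippets, and
# `num_iterations` is an illustrative value.
num_iterations = 10000
for _ in range(num_iterations):
    experience, _ = next(iterator)
    dqn_loss = dqn_agent.train(experience).loss
    ddqn_loss = ddqn_agent.train(experience).loss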
    gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper]))

# create DQN (deep Q-learning network)
q_net = QNetwork(train_env.observation_spec(),
                 train_env.action_spec(),
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)

# create the deep reinforcement learning agent
tf_agent = DqnAgent(train_env.time_step_spec(),
                    train_env.action_spec(),
                    q_network=q_net,
                    optimizer=optimizer,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
tf_agent.initialize()

# create evaluation and data collection policies
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

# create replay buffer
print("Creating replay buffer")
replay_buffer = TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec,
                                      batch_size=train_env.batch_size,
                                      max_length=replay_buffer_capacity)
update_period = 4  # train the model every 4 steps

optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=0.00001,
                                     centered=True)

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial epsilon
    decay_steps=250000,
    end_learning_rate=0.01)  # final epsilon

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,  # every 32,000 frames
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),  # must return the error per instance
    gamma=0.99,  # discount factor
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

# Create the replay buffer and the observer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000000)

replay_buffer_observer = replay_buffer.add_batch
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the deep Q-network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: replay buffer observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: training metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the collect driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] + train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize the replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer, ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using tf.function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=eval_tf_env.batch_size,
                                              buffer_size=eval_num_episodes)
    ]

    # Create the evaluation callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train the agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
                                                decay=0.95,
                                                momentum=0.0,
                                                epsilon=0.0001,
                                                centered=True)

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=25000 // update_period,  # <=> 1,000,000 ALE frames
    end_learning_rate=0.01)  # final ε

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,  # <=> 32,000 ALE frames
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.95,  # discount factor
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step),
    reward_scale_factor=1.5)
agent.initialize()

# Create the replay buffer and observer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000000)

replay_buffer_observer = replay_buffer.add_batch
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )
    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png", format="png", dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-Agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                      epsilon=0.00001, centered=True)
    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch, ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
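    # Hedged sketch (not part of the original source): the training loop that typically
    # follows this setup, sampling two-step trajectories from the replay buffer and
    # alternating collection and training; the batch size and iteration count are
    # illustrative values.
    from tf_agents.utils.common import function

    dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)
    iterator = iter(dataset)
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    for iteration in range(50000):  # illustrative iteration count
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        if iteration % 1000 == 0:
            log_metrics(train_metrics)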
train_env = TFPyEnvironment(parallel_env)
# train_env = TFPyEnvironment(suite_gym.load(env_name))
eval_env = TFPyEnvironment(suite_gym.load(env_name))

fc_layer_params = (100,)

q_net = QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params
)

train_step_counter = tf.Variable(0)

agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=Adam(learning_rate=LEARNING_RATE),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter
)
agent.initialize()

random_policy = RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec()
)


def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0
                                                learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay,
                                                momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon,
                                                centered=True)

# Computes epsilon for the epsilon-greedy policy given the training step
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=target_update_period,
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=discount_factor,  # discount factor
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

# Speed up as a TensorFlow function
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    # Determines the data spec type
    data_spec=agent.collect_data_spec,
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=1e-5)

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=int(250e3) // update_period,
    end_learning_rate=0.01,
)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=TARGET_UPDATE_PERIOD,
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    gamma=0.99,
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step),
)
agent.initialize()

# %% Create replay buffer
from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=int(REPLAY_BUFFER_MAXLEN),