class ExperienceReplay(object):
    def __init__(self, agent, enviroment, batch_size):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=enviroment.batch_size,
            max_length=100000)

        self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                             enviroment.action_spec())

        self._fill_buffer(enviroment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
            single_deterministic_pass=False).prefetch(3)

        self.iterator = iter(self.dataset)

    def _fill_buffer(self, enviroment, policy, steps):
        for _ in range(steps):
            self.timestamp_data(enviroment, policy)

    def timestamp_data(self, environment, policy):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(
            time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)
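# --- Hedged usage sketch (not from the original source) ---
# Shows one way the ExperienceReplay class above could be exercised end to end with a
# TF-Agents DQN agent on CartPole. All names below (the environment choice, the 64-unit
# Q-network, the learning rate) are illustrative assumptions; the class itself also needs
# the usual TF-Agents imports in scope (TFUniformReplayBuffer, RandomTFPolicy, trajectory).
import tensorflow as tf
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks.q_network import QNetwork

tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
q_net = QNetwork(tf_env.observation_spec(), tf_env.action_spec(), fc_layer_params=(64,))
agent = DqnAgent(tf_env.time_step_spec(), tf_env.action_spec(), q_network=q_net,
                 optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3))
agent.initialize()

replay = ExperienceReplay(agent, tf_env, batch_size=32)
experience, _ = next(replay.iterator)   # (32, 2, ...) trajectories sampled from the buffer
loss = agent.train(experience).loss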
def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the training
    loop runs effectively.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(
        load_scenario(env_name,
                      job_gen_seed=10,
                      override_env_params={'max_episode_length': 25})[1],
        discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Instantiate a REINFORCE agent.
    reinforce_agent = create_reinforce_agent(tf_env, training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(
        data_spec=reinforce_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles a lot of the backend
    # TensorFlow set up and solves previous errors with episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(tf_env,
                                          reinforce_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(tf_env.batch_size)

    # Take a copy of the variables in order to ensure that training does lead to parameter changes.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop to simply test training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(time_step=time_step,
                                                     policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train the
        # agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer and return to play.
        replay_buffer.clear()

    # Check that training has had some effect.
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
def __init__(self, experience_spec, batch_size):
    # TFUniformReplayBuffer does not support list in spec, we have to do
    # some conversion.
    self._experience_spec = experience_spec
    self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
    tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
    self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
    self._data_iter = None
def collect_step(self, env: tf_py_environment.TFPyEnvironment,
                 policy: tf_policy.Base,
                 replay_buffer: TFUniformReplayBuffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
def main():
    env = suite_gym.load('Trajectory-v0', gym_kwargs={
        'num_dimensions': 2,
        'num_observables': 3,
        'max_targets': 100,
        'max_steps': 5000,
        'max_steps_without_target': 5000,
        'max_position': 100.0,
        'max_acceleration': 10.2,
        'max_velocity': 15.0,
        'collision_epsilon': 10.0
    })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(tf_env.time_step_spec(), tf_env.action_spec())

    uniform_replay_buffer = TFUniformReplayBuffer(agent.collect_data_spec, batch_size=1)

    transitions = []
    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],
        transition_observers=[transitions.append],
        num_steps=500
    )

    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)

    dataset = uniform_replay_buffer.as_dataset()

    input_state = []
    input_action = []
    output_state = []
    output_reward = []
    for transition in transitions:
        input_state.append(tf.concat(tf.nest.flatten(transition[0].observation), axis=-1))
        input_action.append(tf.concat(tf.nest.flatten(transition[1].action), axis=-1))
        output_state.append(tf.concat(tf.nest.flatten(transition[2].observation), axis=-1))
        output_reward.append(tf.concat(tf.nest.flatten(transition[2].reward), axis=-1))

    tf_input_state = tf.squeeze(tf.stack(input_state), axis=1)
    tf_input_action = tf.squeeze(tf.stack(input_action), axis=1)
    tf_output_state = tf.squeeze(tf.stack(output_state), axis=1)
    tf_output_reward = tf.stack(output_reward)

    # dataset = (features, labels)
    # (time_step_before, policy_step_action, time_step_after) = transitions[0]
    # observation = time_step_before.observation
    # action = policy_step_action.action
    #
    # (discount_, observation_, reward_, step_type_) = time_step_after
    # observation_ = time_step_after.observation
    pass
def main():
    env = suite_gym.load('Trajectory-v0', gym_kwargs={
        'num_dimensions': 2,
        'num_observables': 15,
        'max_targets': 100,
        'max_steps': 5000,
        'max_steps_without_target': 5000,
        'max_position': 100.0,
        'max_acceleration': 10.2,
        'max_velocity': 15.0,
        'collision_epsilon': 10.0
    })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(time_step_spec=tf_env.time_step_spec(), action_spec=tf_env.action_spec())

    metric = AverageReturnMetric()
    replay_buffer = []
    # uniform_replay_buffer = PyUniformReplayBuffer(data_spec=agent.collect_data_spec, capacity=2000)
    uniform_replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec, batch_size=1)
    # observers = [replay_buffer.append, metric]

    # driver = PyDriver(
    #     env,
    #     policy=RandomPyPolicy(env.time_step_spec(), env.action_spec()),
    #     observers=[replay_buffer.append, metric],
    #     max_steps=2000
    # )

    # driver = TFDriver(
    #     tf_env,
    #     # policy=RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()),
    #     policy=agent.policy,
    #     observers=[uniform_replay_buffer],
    #     max_steps=2000
    # )

    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],  # , metric],
        # transition_observers=None,
        num_steps=1000)

    agent.initialize()

    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)

    dataset = uniform_replay_buffer.as_dataset()
def collect_step(environment: TFEnvironment, policy: TFPyPolicy,
                 replay_buffer: TFUniformReplayBuffer):
    """
    Collects one interaction step with the environment and stores the result.

    :param environment:
    :param policy:
    :param replay_buffer:
    :return:
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
def __init__(self, agent, enviroment):
    self._replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=enviroment.batch_size,
        max_length=50000)

    # Use the environment passed to the constructor rather than a global `train_env`.
    self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                         enviroment.action_spec())

    self._fill_buffer(enviroment, self._random_policy, steps=100)

    self.dataset = self._replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=BATCH_SIZE,
        num_steps=2).prefetch(3)

    self.iterator = iter(self.dataset)
def __init__(self, **kwargs):
    self.batch_size = 1
    self.tf_agent = kwargs["tf_agent"]
    self.replay_buffer = TFUniformReplayBuffer(
        self.tf_agent.collect_data_spec,
        batch_size=self.batch_size,
        max_length=kwargs[MAX_REPLAY_BUFFER_LENGTH])
def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float,
            imb_ratio: float, replay_buffer_max_length: int, layers: dict) -> None:
    """
    Create the Q-network, agent and policy.

    Args:
        X_train: A np.ndarray of training samples.
        y_train: A np.ndarray of class labels for the training samples.
        lr: Learning rate for the optimizer (default Adam).
        epsilon: Used for the default epsilon-greedy policy when choosing a random action.
        gamma: The discount factor for learning Q-values.
        imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
        replay_buffer_max_length: Maximum length of the replay memory.
        layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
    """
    dense_layers = layers.get("dense")
    conv_layers = layers.get("conv")
    dropout_layers = layers.get("dropout")

    self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

    q_net = QNetwork(self.train_env.observation_spec(),
                     self.train_env.action_spec(),
                     conv_layer_params=conv_layers,
                     fc_layer_params=dense_layers,
                     dropout_layer_params=dropout_layers)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

    train_step_counter = tf.Variable(0)

    self.agent = DqnAgent(
        self.train_env.time_step_spec(),
        self.train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        gamma=gamma,
        epsilon_greedy=epsilon,
    )

    self.agent.initialize()

    self.replay_buffer = TFUniformReplayBuffer(
        data_spec=self.agent.collect_data_spec,
        batch_size=self.train_env.batch_size,
        max_length=replay_buffer_max_length)
def __init__(self, agent, enviroment, batch_size):
    self._replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=enviroment.batch_size,
        max_length=50000)

    self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                         enviroment.action_spec())

    self._fill_buffer(enviroment, self._random_policy, steps=100)

    self.dataset = self._replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2,
        single_deterministic_pass=False).prefetch(3)

    self.iterator = iter(self.dataset)
def train(agent: DdqnAgent, train_env: TFEnvironment, replay_buffer: TFUniformReplayBuffer,
          num_episodes: int, replay_buffer_batch_size: int, save_path: str,
          ramdomize_step: int, validate_step: int):
    train_dataset = replay_buffer.as_dataset(
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        num_steps=REPLAY_BUFFER_NUM_STEPS,
        sample_batch_size=replay_buffer_batch_size)
    train_iterator = iter(train_dataset)

    # Savers
    checkpointer, saver = create_savers(save_path, agent, replay_buffer)
    policy_path = os.path.join(save_path, "saved", "policy")
    checkpointer.initialize_or_restore()

    global_step = 0
    episode = 0

    tf.print("Waiting for client connection")

    random_policy = create_random_policy(train_env)
    collect_episode_data(train_env, random_policy, replay_buffer, repeats=3, phase="Random")

    for episode in range(num_episodes):
        tf.print(f"Episode {episode} started")

        # Collect data with the agent's collect policy.
        episode_info = collect_episode_data(train_env, agent.collect_policy, replay_buffer,
                                            repeats=1)

        experience, unused_info = next(train_iterator)
        train_loss = agent.train(experience).loss
        global_step = agent.train_step_counter.numpy()

        tf.print(f"Episode {episode} finished with loss {train_loss} and reward {episode_info['reward']}")

        collect_episode_data(train_env, agent.policy, replay_buffer, phase="Inference")

        # TODO: Save metrics to TensorBoard or a similar tool.

        # Save policy and agent.
        checkpointer.save(global_step=global_step)
        saver.save(policy_path)

    tf.print(f"Training finished at episode {episode}, step {global_step}")
def create_real_replay_buffer(self) -> ReplayBuffer:
    """Create the replay buffer for storing data from the real environment."""
    return TFUniformReplayBuffer(
        self._agent.collect_policy.trajectory_spec,
        batch_size=1,
        max_length=self._real_replay_buffer_capacity,
    )
def _init_replay_buffer(self, batch_size, data_spec):
    self._batch_size = batch_size
    buffer_config = {
        "batch_size": self._batch_size,
        "data_spec": data_spec,
        "max_length": 1
    }
    tf.compat.v2.summary.scalar(name="replay_buffer_size", data=self._batch_size)
    self._replay_buffer = TFReplayBuffer(**buffer_config)
class TFUniformReplayBuffer(TFReplayBufferAbstract):
    def _init_replay_buffer(self, batch_size, data_spec):
        self._batch_size = batch_size
        buffer_config = {
            "batch_size": self._batch_size,
            "data_spec": data_spec,
            "max_length": 1
        }
        tf.compat.v2.summary.scalar(name="replay_buffer_size", data=self._batch_size)
        self._replay_buffer = TFReplayBuffer(**buffer_config)

    def add_batch(self, traj_dict):
        """Add a trajectory to the replay buffer.

        Params:
            traj_dict (dict[dim]: numpy): a dict of tensors representing the trajectory
                to be added to the replay buffer.
        """
        collect_spec_dict = self.collect_data_spec._asdict()
        traj_tf, traj_spec = build_tf_trajectory(traj_dict, collect_spec_dict)

        if not self._replay_buffer:
            batch_size = len(traj_dict["observation"])
            self._init_replay_buffer(batch_size, traj_spec)

        self._replay_buffer.add_batch(traj_tf)

    def get_batch(self, batch_size):
        if batch_size is None:
            batch_size = self._batch_size

        # TODO: convert the replay buffer to a dataset and iterate over it
        traj, metadata = self._replay_buffer.get_next(sample_batch_size=batch_size)
        return traj, metadata
def replay_actions_across_batch_transition_models(env_model, actions) -> Trajectory:
    """
    Use an open loop policy to apply a sequence of actions to the environment model. This returns
    at least one episode per environment batch (the same action sequence is applied to each batch).
    """
    open_loop_policy = TFOpenLoopPolicy(env_model.time_step_spec(), env_model.action_spec(),
                                        actions)
    buffer = TFUniformReplayBuffer(open_loop_policy.trajectory_spec,
                                   batch_size=env_model.batch_size,
                                   max_length=1000)
    driver = TFDriver(
        env_model,
        open_loop_policy,
        observers=[buffer.add_batch],
        max_steps=env_model.batch_size * actions.shape[0],
        disable_tf_function=True,
    )
    driver.run(env_model.reset())

    trajectories = buffer.gather_all()

    return trajectories
def get_replay_buffer(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        max_length: int = 100000
) -> TFUniformReplayBuffer:
    """
    Sets up a replay buffer object for use in training the agent.

    :param env: TensorFlow environment which provides specifications for use in setting up a
        replay buffer.
    :param agent: The agent which provides specifications for use in setting up a replay buffer.
    :param max_length: The maximum length/capacity of the replay buffer.
    :return: A replay buffer (TFUniformReplayBuffer).
    """
    replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=max_length
    )
    return replay_buffer
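# --- Hedged usage sketch (not from the original source) ---
# Illustrates how get_replay_buffer is typically wired into an episode driver for an
# on-policy agent. `tf_env` and `reinforce_agent` are assumed to exist (as in the
# surrounding snippets); DynamicEpisodeDriver comes from
# tf_agents.drivers.dynamic_episode_driver.
replay_buffer = get_replay_buffer(tf_env, reinforce_agent, max_length=10000)
collect_driver = DynamicEpisodeDriver(tf_env,
                                      reinforce_agent.collect_policy,
                                      observers=[replay_buffer.add_batch],
                                      num_episodes=2)
collect_driver.run()
experience = replay_buffer.gather_all()   # on-policy training consumes whole episodes
reinforce_agent.train(experience)
replay_buffer.clear()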
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always
        ignores `env_ids`.
        """
        self._buffer.add_batch(exp)

    def replay(self, sample_batch_size, mini_batch_length):
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)

    def replay_all(self):
        return self._buffer.gather_all()

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
    optimizer,
    actor_net,
    value_net,
    num_epochs=num_epochs,
    train_step_counter=global_step,
    discount_factor=0.995,
    gradient_clipping=0.5,
    entropy_regularization=1e-2,
    importance_ratio_clipping=0.2,
    use_gae=True,
    use_td_lambda_return=True)
agent.initialize()

replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                      batch_size=env.batch_size,
                                      max_length=100000)


def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
def _dyke_replay_buffer(env: TFPyEnvironment, agent: DqnAgent,
                        steps_per_episode: int) -> TFUniformReplayBuffer:
    return TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                 batch_size=env.batch_size,
                                 max_length=steps_per_episode)
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step
)
agent.initialize()

# 4. Constructing the Replay Memory.
memory_size = 20000
batch_size = 64

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=memory_size
)

# Initializing Observer of replay buffer to store experiences (trajectories) to memory.
replay_buffer_observer = replay_buffer.add_batch

# Defining Metrics for measuring training progress.
train_metrics = [
    AverageReturnMetric(),
    AverageEpisodeLengthMetric()
]

# 5. Defining initial policy as random to collect enough examples to fill the memory buffer (training delay).
initial_collect_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec()
)
initial_collect_steps = 2000


class ShowProgress:
def build_replay_buffer(self):
    """Build replay buffer."""
    return TFUniformReplayBuffer(
        data_spec=self.agent.collect_data_spec,
        batch_size=1,
        max_length=self.replay_buffer_max_length)
class DQNAgent:
    def __init__(self) -> None:
        """
        A class for training a TF-agent, based on
        https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """
        self.train_env = None      # Training environment
        self.agent = None          # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None        # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None       # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float,
                imb_ratio: float, replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy.

        Args:
            X_train: A np.ndarray of training samples.
            y_train: A np.ndarray of class labels for the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used for the default epsilon-greedy policy when choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
        """
        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(),
                         self.train_env.action_spec(),
                         conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers,
                         dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int,
            log_step: int, collect_steps_per_episode: int) -> None:
        """
        Starts the training of the agent.

        Args:
            X_train: A np.ndarray of training samples.
            y_train: A np.ndarray of class labels for the training samples.
            epochs: Number of epochs to train the agent.
            batch_size: The batch size.
            eval_step: Evaluate the model every `eval_step` steps.
            log_step: Monitor the results of the model every `log_step` steps.
            collect_steps_per_episode: Collect a few steps using collect_policy and save to the replay buffer.
        """
        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)

            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step.
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            # print("epoch: ", _)

            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer,
                         collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute metrics for evaluation."""
        # TODO: apply softmax layer for q logits?

        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class labels

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """Evaluation of the trained Q-network."""
        metrics = self.compute_metrics(X, y)

        print("evaluation: ", metrics)
        return metrics
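# --- Hedged usage sketch (not from the original source) ---
# Example of driving the DQNAgent wrapper above. The random feature matrix, labels and
# hyper-parameters are stand-ins chosen only for illustration; ClassifyEnv and
# custom_metrics are the same external helpers the class already depends on.
import numpy as np

X_train = np.random.rand(1000, 10).astype(np.float32)
y_train = np.random.randint(0, 2, size=1000)

model = DQNAgent()
model.compile(X_train, y_train, lr=1e-3, epsilon=0.1, gamma=0.9, imb_ratio=0.2,
              replay_buffer_max_length=10000,
              layers={"dense": (64, 64), "conv": None, "dropout": None})
model.fit(X_train, y_train, epochs=100, batch_size=32, eval_step=50, log_step=10,
          collect_steps_per_episode=64)
model.evaluate(X_train, list(y_train))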
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic
        resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path:
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the check point at
        # different training steps. We save the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path,
                                                        max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []

    # Set up a minimal training loop to simply test training mechanics work.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train the
            # agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()

            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )

            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )

        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)

        if i % (num_iters // 100) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")

    # Save the final state of the model (only possible when saving infrastructure was set up).
    if save_model:
        checkpoint_manager.save(num_iters)
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support list in spec, we have to do
        # some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None

    def _list_to_tuple(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_list_to_tuple(exp)
        else:
            return exp

    def _tuple_to_list(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_tuple_to_list(exp, self._experience_spec)
        else:
            return exp

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always
        ignores `env_ids`.
        """
        self._buffer.add_batch(self._list_to_tuple(exp))

    def replay(self, sample_batch_size, mini_batch_length):
        """Get a random batch.

        Args:
            sample_batch_size (int): number of sequences
            mini_batch_length (int): the length of each sequence

        Returns:
            Experience: experience batch in batch major (B, T, ...)
            tf_uniform_replay_buffer.BufferInfo: information about the batch
        """
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        exp, info = next(self._data_iter)
        return self._tuple_to_list(exp), info

    def replay_all(self):
        return self._tuple_to_list(self._buffer.gather_all())

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
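# --- Hedged usage sketch (not from the original source) ---
# Sketch of the SyncUniformExperienceReplayer call pattern. `experience_spec` is assumed
# to be the agent's per-step experience spec and `exp` a batched experience nest matching
# that spec, as produced by a synchronous driver; both are placeholders here.
replayer = SyncUniformExperienceReplayer(experience_spec, batch_size=env_batch_size)
replayer.observe(exp)                                  # add one (env_batch_size, ...) step
batch, info = replayer.replay(sample_batch_size=64,    # sample a (64, 2, ...) minibatch
                              mini_batch_length=2)
all_experience = replayer.replay_all()                 # or take everything collected so far
replayer.clear()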
]
output_layer = Dense(num_actions, activation=None)

cloning_net = Sequential(dense_layers + [output_layer])

optimizer = Adam(learning_rate=learning_rate)

train_step_counter = train_utils.create_train_step()

agent = BehavioralCloningAgent(env.time_step_spec(),
                               env.action_spec(),
                               cloning_network=cloning_net,
                               optimizer=optimizer)

policy = agent.policy

replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                      batch_size=env.batch_size,
                                      max_length=replay_buffer_capacity)

agent.train_step_counter.assign(0)

replay_observer = [replay_buffer.add_batch]

with strategy.scope():
    driver = TFDriver(env, collect_policy, replay_observer, max_episodes=100)

average = AverageReturnMetric()
metrics_observer = [average]
metrics_driver = TFRenderDriver(env, policy,
# Initialize agent
agent.initialize()

# Wrap the training function in a TF graph
agent.train = common.function(agent.train)

# Create game environments: training and evaluation
train_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))
eval_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))

# Random policy for data collection
random_policy = RandomTFPolicy(time_step_spec=train_env.time_step_spec(),
                               action_spec=train_env.action_spec())

# Create replay buffer for data collection
replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                      batch_size=train_env.batch_size,
                                      max_length=BUFFER_LENGTH)

# Create driver for the agent
driver = DynamicStepDriver(env=train_env,
                           policy=agent.collect_policy,
                           observers=[replay_buffer.add_batch],
                           num_steps=STEPS_PER_ITER)
# Wrap the run function in a TF graph
driver.run = common.function(driver.run)

# Create driver for the random policy
random_driver = DynamicStepDriver(env=train_env,
                                  policy=random_policy,
                                  observers=[replay_buffer.add_batch],
                                  num_steps=STEPS_PER_ITER)
# Wrap the run function in a TF graph
    total_return = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    return total_return / num_episodes


# Replay buffer
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=REPLAY_BUFFER_MAX
)

driver = DynamicStepDriver(
    train_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=1
)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=BATCH_SIZE,
    num_steps=2).prefetch(3)

iterator = iter(dataset)
def __init__(self, experience_spec, batch_size):
    self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
    self._data_iter = None
def create_replay_buffer(agent: DdqnAgent, environment: TFEnvironment,
                         max_lenght: int) -> TFUniformReplayBuffer:
    return TFUniformReplayBuffer(agent.collect_data_spec,
                                 environment.batch_size,
                                 max_length=max_lenght)
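# --- Hedged usage sketch (not from the original source) ---
# Shows how the buffer returned by create_replay_buffer is usually turned into a training
# dataset. `ddqn_agent` and `tf_env` are assumed to be a DdqnAgent and TFEnvironment built
# elsewhere; the sample batch size of 64 is arbitrary.
replay_buffer = create_replay_buffer(ddqn_agent, tf_env, max_lenght=50000)
dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                   sample_batch_size=64,
                                   num_steps=2).prefetch(3)
iterator = iter(dataset)
experience, _ = next(iterator)   # valid once the buffer holds at least 2-step sequences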