def run_experiment():
    with mlflow.start_run():
        # num_iterations = 1000
        mlflow.set_tag("agent_type", "dqn")
        mlflow.log_param("num_act_units", fc_layer_params)
        mlflow.log_param("num_iterations", num_iterations)
        mlflow.log_param("initial_collect_steps", initial_collect_steps)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.set_tag("data_set", "initial_dataset_after_pca")
        mlflow.log_param("discount", discount)
        mlflow.log_param("run", 1)

        agent.train = common.function(agent.train)
        agent.train_step_counter.assign(0)
        best_score = 0

        for iteration in range(num_iterations):
            collect_data(train_env, agent.collect_policy, replay_buffer,
                         collect_steps_per_iteration)
            experience, unused_info = next(iterator)
            train_loss = agent.train(experience).loss
            step = agent.train_step_counter.numpy()

            if (step - 1) % log_interval == 0:
                print("step: ", step)
                mlflow.log_metric("loss", train_loss.numpy())

            if iteration % eval_interval == 0:
                t = time.localtime()
                current_time = time.strftime("%H:%M:%S", t)
                print("\n")
                print(iteration, current_time)
                t_eval, u_eval, ratio_of_ones_eval = calculate_u_metric(eval_df)
                print("\n")
                t_train, u_train, ratio_of_ones_train = calculate_u_metric(train)
                mlflow.log_metrics({
                    "t_eval": t_eval,
                    "u_eval": u_eval,
                    "t_train": t_train,
                    "u_train": u_train,
                    "ratio_of_ones_eval": ratio_of_ones_eval,
                    "ratio_of_ones_train": ratio_of_ones_train
                })

                if u_eval > best_score:
                    best_score = u_eval
                    saver = PolicySaver(agent.policy, batch_size=None)
                    saver.save("dqn_policy")
                    subprocess.run(["zip", "-r", "dqn_policy.zip", "dqn_policy"])
                    mlflow.log_artifact("dqn_policy.zip")
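# Possible round trip for the MLflow artifact logged above (the run ID and local
# paths are placeholders, not part of the example): download the zipped policy with
# MlflowClient, unzip it, and restore the SavedModel with tf.saved_model.load.
import zipfile
import mlflow
import tensorflow as tf

client = mlflow.tracking.MlflowClient()
local_zip = client.download_artifacts("<run_id>", "dqn_policy.zip", "/tmp")
with zipfile.ZipFile(local_zip) as zf:
    zf.extractall("/tmp")
restored_policy = tf.saved_model.load("/tmp/dqn_policy")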
def save_policy(self, step):
    """Save the current policy with the tf-agents PolicySaver."""
    print('Saving agent policy.')
    # Save environment params as metadata in order to reconstruct the environment later.
    metadata = py_to_tf(self.env_params)
    saver = PolicySaver(self.agent.policy,
                        train_step=self.agent.train_step_counter,
                        metadata=metadata,
                        batch_size=None)
    dir_name = f'{self.uid.numpy().decode()}-{step}'
    filepath = os.path.join(configs.POLICY_DIR, dir_name)
    saver.save(filepath)
    print('Policy saved.')
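# For reference, a minimal loading sketch (the path is a placeholder, not part of the
# example above): a policy written by PolicySaver is a SavedModel, so it can be
# restored with tf.saved_model.load. When train_step/metadata were passed to the
# saver, the loaded module also exposes get_train_step() and get_metadata().
import tensorflow as tf

loaded = tf.saved_model.load('/path/to/saved/policy')  # placeholder path
train_step = loaded.get_train_step()                   # scalar tensor with the saved step
metadata = loaded.get_metadata()                       # dict of tensors saved as metadata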
def main():
    all_returns = []
    for _ in range(NUM_RUNS):
        agent, train_env, evaluation_env, experience_replay = init()
        returns = training_loop(agent, train_env, evaluation_env, experience_replay)
        all_returns.append(returns)

    # save policy (of the last run)
    PolicySaver(agent.policy).save('policy_saved')

    steps_axis = list(range(0, NUMBER_ITERATION + 1, EVAL_INTERVAL))

    plt.figure()
    for i in range(NUM_RUNS):
        plt.plot(steps_axis, all_returns[i])
    plt.xlabel('Time steps')
    plt.ylabel('Average return')
    plt.title("Rewards over {} runs".format(NUM_RUNS))

    for i in range(NUM_RUNS):
        plt.figure()
        plt.plot(steps_axis, all_returns[i])
        plt.title("Rewards overall")
        plt.xlabel('Time steps')
        plt.ylabel('Average return')

    plt.show()
def train_tf_agent(
    model: typing.Union[TFAgent, typing.Type[TFAgent]],
    env: gym.Env,
    total_timesteps: int,
    model_name: typing.Optional[str] = None,
    maximum_episode_reward: int = 200,
    stop_training_threshold: int = 195,
):
    train_env = environment_converter.gym_to_tf(env)
    environment_name = env.__class__.__name__
    model_dir = f"{kindo.paths.save_path}/{environment_name}/{model_name}"
    Path(model_dir).mkdir(parents=True, exist_ok=True)

    stop_training_callback = callbacks.StopTrainingWhenMean100EpReward(
        reward_threshold=stop_training_threshold)
    history_saving_callback = callbacks.HistorySavingCallback(
        total_timesteps=total_timesteps,
        history_save_dir=model_dir,
        maximum_episode_reward=maximum_episode_reward,
        stop_callback=stop_training_callback,
    )

    # If an agent class (rather than an instance) was passed, instantiate it first.
    if isinstance(model, ABCMeta):
        model = initialize_tf_agent(model_class=model, train_env=train_env)

    if model.__class__ in [
            agents.DqnAgent,
            DdqnAgent,
            agents.DdpgAgent,
            agents.SacAgent,
    ]:
        train_off_policy_tf_agent(model, train_env, total_timesteps,
                                  history_saving_callback)
    elif model.__class__ in [
            agents.PPOAgent,
            agents.ReinforceAgent,
            agents.Td3Agent,
    ]:
        train_on_policy_tf_agent(model, train_env, total_timesteps,
                                 history_saving_callback)
    else:
        raise WrongModelError(
            f"Model of class `{model.__class__.__name__}` is not supported by Kindo API"
        )

    collect_policy = model.collect_policy
    saver = PolicySaver(collect_policy, batch_size=None)
    saver.save(f"{model_dir}/model")
def train_agent(n_iterations):
    saver = PolicySaver(agent.policy, batch_size=tf_env.batch_size)
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in tqdm(range(n_iterations)):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        if iteration % 1000 == 0:
            print("\r{} loss:{:.5f}".format(iteration, train_loss.loss.numpy()), end="")
            log_metrics(train_metrics)
        # Save the policy every 100k iterations.
        if iteration % 100000 == 0:
            saver.save('policy_%d' % iteration)
def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(initial_policy, n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f} done:{:.5f}".format(
            iteration, train_loss.loss.numpy(),
            iteration / n_iterations * 100.0), end="")
        if iteration % 1000 == 0:
            log_metrics(train_metrics)
        if iteration % 10000 == 0 and iteration > 0:
            # keras.saved_model.saved_model(my_policy, 'policy_' + str(iteration))
            # tf.saved_model.save(agent, 'policy_' + str(iteration))
            my_policy = agent.policy
            saver = PolicySaver(my_policy)
            saver.save('policy_' + str(iteration))
def main():
    agent, train_env, evaluation_env, experience_replay = init()
    returns = training_loop(agent, train_env, evaluation_env, experience_replay)

    # save policy
    PolicySaver(agent.policy).save('policy_saved')

    plt.plot(returns)
    plt.title("Rewards overall")
    plt.show()
def main():
    agent, train_env, evaluation_env, experience_replay = init()
    returns = training_loop(agent, train_env, evaluation_env, experience_replay)

    # save policy
    PolicySaver(agent.policy).save('policy_saved')

    plt.plot(list(range(0, NUMBER_ITERATION + 1, EVAL_INTERVAL)), returns)
    plt.title("Rewards overall")
    plt.show()
add_metrics(training_info, train_metrics)

train_agent(n_iterations=n_iterations)


# c) For storing frames
def get_vid_frames(policy, filename, num_episodes=20, fps=2):
    frames = []
    for _ in range(num_episodes):
        time_step = tf_env.reset()
        frames.append(np.abs(env.get_board()) * 100)
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = tf_env.step(action_step.action)
            frames.append(np.abs(env.get_board()) * 100)
    return frames


# Store data
df = pd.DataFrame(np.array(training_info).T,
                  columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])
df.to_csv('../DATA/stats_{}.txt'.format(II), index=False, mode="a")

# Store frames
frames = get_vid_frames(agent.policy, "trained-agent")
with open('../DATA/frames_{}.pkl'.format(II), 'wb') as f:
    pickle.dump(frames, f)

# Store model
my_policy = agent.policy
saver = PolicySaver(my_policy, batch_size=None)
saver.save('../DATA/policy_{}'.format(II))
train_loss = agent.train(experience).loss
step = agent.train_step_counter.numpy()

# Print loss every 200 steps.
if step % 200 == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

# Evaluate the agent's performance every 1000 steps.
if step % 1000 == 0:
    avg_return = compute_avg_return(env, agent.policy, 5)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

pd.DataFrame(returns).plot()
print(returns)
plt.show()

for num in range(10):
    step = env.reset()
    time_step = env.current_time_step()
    while not time_step.is_last().numpy()[0]:
        time_step = env.current_time_step()
        print(time_step.observation.numpy())
        action_step = agent.policy.action(time_step)
        next_time_step = env.step(action_step.action)
    print("end")

PolicySaver(agent.policy).save('temp')
def train_eval_doom_simple(
        # Params for collect
        num_environment_steps=100000,
        collect_episodes_per_iteration=32,
        num_parallel_environments=1,
        replay_buffer_capacity=301,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=4e-4,
        # Params for eval
        eval_interval=10,
        num_video_episodes=10,
        # Params for summaries and logging
        log_interval=10):
    """A simple train and eval for PPO."""
    # if not os.path.exists(videos_dir):
    #     os.makedirs(videos_dir)
    global terminate

    eval_py_env = CarlaEnv()
    tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    actor_net, value_net = create_networks(tf_env.observation_spec(),
                                           tf_env.action_spec())

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                                 epsilon=1e-5)

    tf_agent = ppo_agent.PPOAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net,
        value_net,
        num_epochs=num_epochs,
        train_step_counter=global_step,
        discount_factor=0.99,
        gradient_clipping=0.5,
        entropy_regularization=1e-2,
        importance_ratio_clipping=0.2,
        use_gae=True,
        use_td_lambda_return=True)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)
    train_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    my_policy = tf_agent.policy
    saver = PolicySaver(my_policy, batch_size=None)

    def train_step():
        trajectories = train_replay_buffer.gather_all()
        return tf_agent.train(experience=trajectories)

    def evaluate(policy, step_count):
        create_video(tf_env, policy, 10, f'agent/behave/imageio_{step_count}.mp4')

    print("collecting samples initial:")
    collect_driver.run()
    train_replay_buffer = copy.deepcopy(replay_buffer)
    replay_buffer.clear()
    print(f"train size {train_replay_buffer.num_frames()} buffer size {replay_buffer.num_frames()}")

    while environment_steps_metric.result() < num_environment_steps and not terminate:
        start_time = time.time()
        print("collecting samples")
        collector_thread = threading.Thread(target=collect_driver.run)
        collector_thread.start()

        start_time = time.time()
        count = 0
        # while collector_thread.is_alive() and not terminate:
        #     count = count + 1
        print(f"Training agent {count}")
        total_loss, _ = train_step()
        print()
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print(f'step = {global_step.numpy()}, loss = {total_loss}, env_metric = {environment_steps_metric.result()}')
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print()
        train_replay_buffer.clear()
        print("Training agent finished")

        print("Waiting for the sample-collection thread")
        collector_thread.join()
        print("Collecting samples finished")
        collect_time += time.time() - start_time

        train_replay_buffer = copy.deepcopy(replay_buffer)
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()
        print(f"global_step_val:{global_step_val} % log_interval:{log_interval} = {global_step_val % log_interval}")
        # if global_step_val % log_interval == 0:
        print()
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print(f'step = {global_step_val}, loss = {total_loss}')
        steps_per_sec = (global_step_val - timed_at_step) / (collect_time + train_time)
        print(f'{steps_per_sec} steps/sec')
        print(f'collect_time = {collect_time}, train_time = {train_time}')
        print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
        print()
        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0

        if global_step_val % eval_interval == 0:
            print("Evaluating!!")
            saver.save(f'agent/saved/policy_ppo_simple_{global_step_val}')
            policy = tf_agent.policy
            evaluate(policy, global_step_val)

    print("Terminated")
    policy = tf_agent.policy
    evaluate(policy, global_step_val)
if prev_lives != lives:
    tf_env.reset()
    tf_env.pyenv.envs[0].step(np.array(1))
    prev_lives = lives

watch_driver = DynamicStepDriver(
    tf_env,
    agent.policy,
    observers=[save_frames, reset_and_fire_on_life_lost, ShowProgress(1000)],
    num_steps=1000)
final_time_step, final_policy_state = watch_driver.run()

plot_animation(frames)

# Create updated animated gif of agent in action
image_path = os.path.join(PROJECT_ROOT_DIR, "myAgentPlays.gif")
frame_images = [PIL.Image.fromarray(frame) for frame in frames[:150]]
frame_images[0].save(image_path, format='GIF',
                     append_images=frame_images[1:],
                     save_all=True, duration=30, loop=0)

# Save policy and model
policy_dir = os.path.join(PROJECT_ROOT_DIR, "savedPolicy")
tf_policy_saver = PolicySaver(agent.policy)
tf_policy_saver.save(policy_dir)
def train_and_evaluate_ACagent(tf_agent,
                               train_env=None,
                               eval_env=None,
                               num_iterations=None,
                               batch_size=32,
                               replay_buffer_capacity=1000,
                               name='agent'):
    if train_env is None:
        raise ValueError("train_env is None! Environment should be implemented")
    if eval_env is None:
        raise ValueError("eval_env is None! Environment for evaluation should be implemented")
    if num_iterations is None:
        raise ValueError("Number of iterations should be implemented!")

    tf_agent.initialize()

    initial_collect_steps = 1
    collect_steps_per_iteration = 1
    print('Initial collect step is', initial_collect_steps)
    print('collect steps per iteration', collect_steps_per_iteration)
    print('batch size is ', batch_size)
    print('replay buffer capacity is', replay_buffer_capacity)

    eval_policy = tf_agent.policy
    collect_policy = gaussian_policy.GaussianPolicy(tf_agent.collect_policy)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=initial_collect_steps)
    initial_collect_driver.run()

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)

    tf_agent.train = common.function(tf_agent.train)
    collect_driver.run = common.function(collect_driver.run)

    # Reset the train step
    tf_agent.train_step_counter.assign(0)

    # Evaluate the agent's policy once before training
    avg_return = compute_avg_return(eval_env, eval_policy)
    train_return = compute_avg_return(train_env, eval_policy)
    returns = [(0, avg_return)]
    losses = []
    train_returns = [train_return]

    for _ in range(num_iterations):
        # Collect a few steps using collect_policy and save them to the replay buffer.
        for _ in range(collect_steps_per_iteration):
            collect_driver.run()

        dataset = replay_buffer.as_dataset(batch_size, 2)
        iterator = iter(dataset)

        # Sample a batch of data from the buffer and update the agent's network
        experience, _ = next(iterator)
        train_loss = tf_agent.train(experience)

        step = tf_agent.train_step_counter.numpy()
        log_interval = 50
        eval_interval = 50

        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, train_loss.loss))
            losses.append((step, train_loss.loss))

        if step % eval_interval == 0:
            eval_policy = tf_agent.policy
            avg_return = compute_avg_return(eval_env, eval_policy)
            train_avg_return = compute_avg_return(train_env, eval_policy)
            print('step = {0}: Average Return = {1}'.format(step, avg_return))
            returns.append((step, avg_return))
            train_returns.append(train_avg_return)
            saver = PolicySaver(tf_agent.policy, batch_size=None)
            saver.save(r'C:\Users\DELL\Desktop\Python\\' + name + "policy_%d" % step)

    steps_list = [r[0] for r in returns]
    rewards_list = [r[1] for r in returns]
    loss_steps_list = [l[0] for l in losses]
    loss_list = [l[1] for l in losses]
    return steps_list, rewards_list, name, loss_steps_list, loss_list, train_returns
def train_sac_agent(
    env_factory,
    batch_size=128,
    reward_scale_factor=1.0,
    total_training_steps=1000000,
    eval_callback_rate=None,
    eval_callback=None,
    avg_return_report_rate=1000,
    initial_collect_steps=10000,
    training_iteration_collect_steps=1,
    replay_buffer_size=120000,
    num_eval_episodes=3,
    checkpoint_dir=None,
    latest_policy_dir=None,
    best_policy_dir=None,
    tensorboard_dir=None,
    latest_policy_save_rate=5000,
    checkpoint_save_rate=20000,
):
    train_env = as_tf_env(env_factory())
    eval_env = as_tf_env(env_factory())

    agent = create_sac_agent(train_env, reward_scale_factor)
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    eval_policy = greedy_policy.GreedyPolicy(agent.policy)

    replay_buffer = create_replay_buffer(agent, train_env, replay_buffer_size)

    collect_driver = create_collect_driver(
        train_env, agent, replay_buffer,
        collect_steps=training_iteration_collect_steps)
    collect_driver.run = common.function(collect_driver.run)

    initial_collect_driver = create_collect_driver(
        train_env, agent, replay_buffer, collect_steps=initial_collect_steps)
    initial_collect_driver.run()

    dataset = replay_buffer.as_dataset(num_parallel_calls=2,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(1)
    dataset_iter = iter(dataset)

    if checkpoint_dir is None:
        checkpoint_dir = tempfile.mkdtemp()
    print('Checkpoints will be stored in {0}'.format(checkpoint_dir))
    train_checkpointer = common.Checkpointer(
        ckpt_dir=checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        replay_buffer=replay_buffer,
        global_step=agent.train_step_counter,
    )
    train_checkpointer.initialize_or_restore()

    if latest_policy_dir is None:
        latest_policy_dir = tempfile.mkdtemp()
    print('Learned policies will be stored in {0}'.format(latest_policy_dir))
    latest_policy_saver = PolicySaver(eval_policy)

    if best_policy_dir is None:
        best_policy_dir = tempfile.mkdtemp()
    print('Best policies will be stored in {0}'.format(best_policy_dir))
    best_policy_saver = PolicySaver(eval_policy)

    if tensorboard_dir is None:
        tensorboard_dir = tempfile.mkdtemp()
    print('Tensorboard logs will be stored in {0}'.format(tensorboard_dir))
    writer = tf.summary.create_file_writer(tensorboard_dir)

    with writer.as_default():
        avg_return, avg_num_steps = evaluate_policy(
            eval_env, eval_policy, num_episodes=num_eval_episodes)
        tf.summary.scalar('Average return', avg_return, step=0)
        tf.summary.scalar('Average number of steps', avg_num_steps, step=0)
        best_avg_return = avg_return

        for _ in range(total_training_steps):
            collect_driver.run()
            experience, _ = next(dataset_iter)
            agent.train(experience)

            step = agent.train_step_counter.numpy()

            if step % avg_return_report_rate == 0:
                avg_return, avg_num_steps = evaluate_policy(
                    eval_env, eval_policy, num_episodes=num_eval_episodes)
                tf.summary.scalar('Average return', avg_return, step=step)
                tf.summary.scalar('Average number of steps', avg_num_steps, step=step)
                if avg_return > best_avg_return:
                    best_avg_return = avg_return
                    best_policy_saver.save(best_policy_dir)

            if eval_callback_rate is not None and step % eval_callback_rate == 0:
                eval_callback(eval_env, eval_policy)

            if latest_policy_save_rate is not None and step % latest_policy_save_rate == 0:
                latest_policy_saver.save(latest_policy_dir)

            if checkpoint_save_rate is not None and step % checkpoint_save_rate == 0:
                train_checkpointer.save(agent.train_step_counter)

    return agent
    num_steps=STEPS_PER_ITER)

# Wrap the run function in a TF graph
random_driver.run = common.function(random_driver.run)

# Create a checkpointer
checkpointer = common.Checkpointer(ckpt_dir=os.path.relpath('checkpoint'),
                                   max_to_keep=1,
                                   agent=agent,
                                   policy=agent.policy,
                                   replay_buffer=replay_buffer,
                                   global_step=global_step)
checkpointer.initialize_or_restore()
global_step = tf.compat.v1.train.get_global_step()

# Create a policy saver
policy_saver = PolicySaver(agent.policy)

# Main training loop
time_step, policy_state = None, None
for it in range(N_ITERATIONS):
    if COLLECT_RANDOM:
        print('Running random driver...')
        time_step, policy_state = random_driver.run(time_step, policy_state)
    print('Running agent driver...')
    time_step, policy_state = driver.run(time_step, policy_state)
    print('Training...')
    for train_it in range(BUFFER_LENGTH // BATCH_SIZE):
        experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE,
                                               num_steps=2)
        agent.train(experience)
        if (train_it + 1) % 100 == 0:
            print('{0} training iterations'.format(train_it + 1))
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(20000)],
    num_steps=20000)
final_time_step, final_policy_state = init_driver.run()

dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,
                                   num_parallel_calls=3).prefetch(3)

# collect_driver.run = function(collect_driver.run)
# agent.train = function(agent.train)

my_policy = agent.collect_policy
saver = PolicySaver(my_policy, batch_size=None)


def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(initial_policy, n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f} done:{:.5f}".format(
            iteration, train_loss.loss.numpy(),
            iteration / n_iterations * 100.0), end="")
        if iteration % 1000 == 0:
start_time = time.time()

for _ in range(NUM_ITERATIONS):
    driver.run()
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % 200 == 0:
        print(f'Step {step}: loss = {train_loss}')

    if step % 1000 == 0:
        avg_return = compute_avg_return(eval_env, agent.policy)
        current_time = time.time()
        elapsed_time = current_time - start_time
        print(f'Step {step}, Time: {elapsed_time} : Average Return = {avg_return}')
        PolicySaver(agent.policy).save(f'parallel_policies/step_{step}')
        returns.append(avg_return)

# Graph results
iterations = range(0, NUM_ITERATIONS + 1, 1000)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.show()
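# A small follow-up sketch (directory layout as used above; the selection logic is
# illustrative, not part of the example): pick the most recently saved policy under
# parallel_policies/ and reload it with tf.saved_model.load.
import os
import tensorflow as tf

step_dirs = [d for d in os.listdir('parallel_policies') if d.startswith('step_')]
latest = max(step_dirs, key=lambda d: int(d.split('_')[1]))
latest_policy = tf.saved_model.load(os.path.join('parallel_policies', latest))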
def save_policy(tf_agent, fname='/content/reinforce_cz'):
    my_policy = tf_agent.collect_policy
    # save policy
    PolicySaver(my_policy).save(fname)
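# A minimal usage sketch for the helper above (the path is the default fname from
# save_policy; `eval_env` is an assumed, already-constructed TF environment): the
# reloaded SavedModel policy exposes the same action(time_step) interface.
import tensorflow as tf

loaded_policy = tf.saved_model.load('/content/reinforce_cz')
time_step = eval_env.reset()
while not time_step.is_last():
    action_step = loaded_policy.action(time_step)
    time_step = eval_env.step(action_step.action)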
class PPOTrainer:
    """
    A PPO trainer for tf-agents. Uses PPO agent objects with TensorFlow environments
    to train agents to maximize reward in their environments.

    Arguments:
        1. ppo_agent (PPO agent): A PPO agent used for learning in the environment env.
        2. train_env (tf env): A TensorFlow environment that the agent interacts with
           via the neural networks. Used for creating training trajectories for the
           agent, and for optimizing its networks.
        3. eval_env (tf env): A TensorFlow environment that the agent interacts with
           via the neural networks. Used for evaluating the performance of the agent.
        4. use_tensorboard (bool): Whether or not to plot losses with tensorboard.
        5. add_training_to_video (bool): Whether or not to create videos of the
           agent's training and save them as videos.
    """

    def __init__(self,
                 ppo_agent,
                 train_env,
                 eval_env,
                 use_tensorboard=True,
                 add_training_to_video=True):
        # Environment attributes
        self.train_env = train_env  # Environment for training
        self.eval_env = eval_env  # Environment for testing

        # Agent attributes
        self.agent = ppo_agent  # An instance of a tf-agents agent
        self.actor_net = self.agent._actor_net
        self.value_net = self.agent._value_net
        self.eval_policy = self.agent.policy
        self.collect_policy = self.agent.collect_policy

        # Specifics of training
        self.max_buffer_size = 1000  # Collect entire memory buffer each time
        self.collect_steps_per_iteration = 1000  # Collect entire memory buffer each time
        self.epochs = 10000  # Total number of episodes
        self.total_steps = self.epochs * self.collect_steps_per_iteration
        print("Total steps: {}".format(self.total_steps))

        # Evaluation
        self.num_eval_episodes = 5  # How many episodes we evaluate each time
        self.eval_returns = []  # Keep track of evaluation performance
        self.eval_interval = 100  # Evaluate every <x> epochs
        self.max_eval_episode_steps = 1000  # Most steps we can have in an episode

        # Logging
        self.time_ext = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.log_interval = 1
        self.policy_save_dir = os.path.join(
            os.getcwd(), "logging_{}/".format(self.time_ext))
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist; creating it now".format(
                self.policy_save_dir))
            os.mkdir(self.policy_save_dir)
        self.video_train = []
        self.add_training_to_video = add_training_to_video
        self.video_eval = []

        # Tensorboard
        self.log_dir = "./tb_log_{}".format(self.time_ext)  # Log directory for tensorboard
        self.train_file_writer = tf.summary.create_file_writer(self.log_dir)  # File writer for tf
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.use_tensorboard = use_tensorboard  # Whether or not we use tensorboard for plotting

        # Create a replay buffer
        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.max_buffer_size)

        # Get train and evaluation policy savers
        self.train_saver = PolicySaver(self.collect_policy, batch_size=None)
        self.eval_saver = PolicySaver(self.eval_policy, batch_size=None)

        # Specify directories for training and evaluation policies
        self.policy_save_dir = os.path.join(os.getcwd(), "models", self.time_ext)
        self.save_interval = 500  # Save every 500 epochs
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist;"
                  " creating it now".format(self.policy_save_dir))
            os.makedirs(self.policy_save_dir, exist_ok=True)

    def make_checkpoints(self):
        """Function for creating checkpoints to save the model and track progress."""
        global_step = tf.compat.v1.train.get_or_create_global_step()

        # Create a checkpoint for training
        self.train_checkpointer = common_utils.Checkpointer(
            ckpt_dir=self.policy_save_dir,
            agent=self.agent,
            global_step=global_step)

        # Create a replay-buffer checkpointer
        self.rb_checkpointer = common_utils.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=self.replay_buffer)

    def collect_step(self, add_to_video=False, step=0, epoch=0):
        """
        Function for collecting a single step from the environment. Used for adding
        trajectories to the replay buffer. Resets on the first time step -
        indicating the start of a new episode.

        Arguments:
            1. add_to_video (bool): Whether or not to create a video of the training
               trajectories and save it to the 'logging/' directory.
            2. step (int): The current step of the episode. Important for determining
               whether or not the environment needs to be reset, and for tracking the
               training trajectories in tensorboard (if tensorboard plotting is enabled).
            3. epoch (int): The current epoch of training. Used for tracking the
               training trajectories in tensorboard (if tensorboard plotting is enabled).
        """
        # Get current time step
        if step == 0:  # Reset the environment
            time_step = self.train_env.reset()
        else:  # Take the most recent time step
            time_step = self.train_env.current_time_step()

        # Take action using the collect policy
        action_step = self.collect_policy.action(time_step)

        # Compute the next time step by stepping the training environment
        next_time_step = self.train_env.step(action_step.action)

        # Create trajectory and write it to the replay buffer
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        self.replay_buffer.add_batch(traj)

        # Log to tensorboard, if enabled
        if self.use_tensorboard:
            with self.train_file_writer.as_default():
                tf.summary.image(
                    "Training Trajectories, Epoch {}".format(epoch),
                    time_step.observation,
                    step=step)

        # Add observation to video, if enabled
        if add_to_video:
            # print(time_step.observation.numpy().shape)
            self.video_train.append(time_step.observation.numpy())

    def collect_episode(self, add_to_video=False, epoch=0):
        """
        Function for generating experience data for the replay buffer. Calls
        collect_step() above to add trajectories from the environment to the replay
        buffer in an episodic fashion. Trajectories from the replay buffer are then
        used for training the agent.

        Arguments:
            1. add_to_video (bool): Whether or not to create a video of the training
               trajectories and save it to the 'logging/' directory.
            2. epoch (int): The current epoch of training. Used for tracking the
               training trajectories in tensorboard (if tensorboard plotting is enabled).
        """
        # Iteratively call collect_step() above to add trajectories to the replay buffer
        for i in range(self.collect_steps_per_iteration):
            self.collect_step(add_to_video=add_to_video, step=i, epoch=epoch)

    def compute_avg_reward(self, epoch=None):
        """
        Function for computing the average reward over a series of evaluation
        episodes by creating simulation episodes using the agent's current policies,
        then computing rewards from taking actions using the evaluation (greedy)
        policy and averaging them.

        Arguments:
            1. epoch (int): The current epoch of training. Used for tracking the
               evaluation trajectories in tensorboard (if tensorboard plotting is enabled).

        Returns:
            1. avg_return (float): The average reward over the episodes on which the
               agent's policies are evaluated.
        """
        total_return = 0.0
        for _ in range(self.num_eval_episodes):
            time_step = self.eval_env.reset()

            # Step counter - capped at self.max_eval_episode_steps
            i = 0
            # Accumulated in the loop below
            episode_return = 0.0

            while not time_step.is_last() and i < self.max_eval_episode_steps:
                action_step = self.eval_policy.action(time_step)
                self.video_eval.append(time_step.observation.numpy())  # Add to video frame
                time_step = self.eval_env.step(action_step.action)

                # Log to tensorboard
                if self.use_tensorboard:
                    with self.train_file_writer.as_default():
                        try:
                            tf.summary.image(
                                "Eval Trajectories, Epoch {}".format(epoch),
                                time_step.observation,
                                step=i)
                        except:
                            print("Please provide an input for the epoch number.")

                episode_return += time_step.reward
                if i % 250 == 0:
                    print("Action: {}, Reward: {}".format(
                        action_step.action.numpy(), episode_return))
                i += 1

            print("Steps in episode: {}".format(i))
            total_return += episode_return

        avg_return = total_return / self.num_eval_episodes
        print("Average return: {}".format(avg_return))
        self.eval_returns.append(avg_return)
        return avg_return

    def train_agent(self):
        """
        Function for training a PPO tf-agent using trajectories from the replay
        buffer. Does an initial evaluation of the agent prior to training, and then
        iterates over epochs of the following procedure:

            a. Collect an episode of data, and write the trajectories to the replay buffer.
            b. Train from the trajectories in the replay buffer. Updates the weights
               of the actor and value networks.
            c. Empty the replay buffer.
            d. (If enabled) Save data to disk for tensorboard.
            e. Depending on the epoch number and the evaluation and logging intervals,
               evaluate the agent or log information.

        Returns:
            1. agent (PPO agent): The PPO agent trained during the training process.
        """
        eval_epochs = []

        # Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)
        self.agent.train_step_counter.assign(0)

        avg_return = self.compute_avg_reward(epoch=0)  # Compute pre-training metrics

        # Log average reward to tensorboard
        if self.use_tensorboard:
            with self.train_file_writer.as_default():
                tf.summary.scalar("Avg. Reward", float(avg_return), step=0)
        print("DONE WITH PRELIMINARY EVALUATION...")

        # Append for output plot
        eval_epochs.append(0)
        self.video_eval = []  # Empty to create a new eval video
        returns = [avg_return]

        time_step = self.train_env.reset()

        # Episode counter
        i = 0
        for i in range(self.epochs):
            print("Training epoch: {}".format(i))

            # Collect data and train agent; clear buffer at end
            print("COLLECTING EPISODE")
            # Reset the old training video
            self.video_train = []
            self.collect_episode(add_to_video=self.add_training_to_video, epoch=i)
            self.create_video(mode='train', ext=i)
            print("COLLECTED EPISODE")

            trajectories = self.replay_buffer.gather_all()

            # Old weights
            old_vnet = copy.deepcopy(self.agent._value_net.trainable_variables[0])
            old_anet = copy.deepcopy(self.agent._actor_net.trainable_variables[0])

            # Take training step
            train_loss = self.agent.train(experience=trajectories)

            # Log loss to tensorboard
            if self.use_tensorboard:
                with self.train_file_writer.as_default():
                    tf.summary.scalar("Training Loss", float(train_loss.loss), step=i)

            # Get new weights
            new_vnet = copy.deepcopy(self.agent._value_net.trainable_variables[0])
            new_anet = copy.deepcopy(self.agent._actor_net.trainable_variables[0])

            # Display Frobenius norm of the weight change
            print("VALUE NET Frobenius Norm Difference: {}".format(
                tf.norm(old_vnet - new_vnet)))
            print("ACTOR NET Frobenius Norm Difference: {}".format(
                tf.norm(old_anet - new_anet)))

            # Step the counter, and log/evaluate agent
            step = self.agent.train_step_counter.numpy()
            if i % self.log_interval == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss.loss))

            if (i + 1) % self.eval_interval == 0:
                avg_return = self.compute_avg_reward(epoch=i)
                # Log average reward to tensorboard
                if self.use_tensorboard:
                    with self.train_file_writer.as_default():
                        tf.summary.scalar("Avg. Reward", float(avg_return), step=i)
                eval_epochs.append(i + 1)
                print('epoch = {0}: Average Return = {1}'.format(step, avg_return))
                returns.append(avg_return)
                self.create_video(mode='eval', ext=i)
                self.video_eval = []  # Empty to create a new eval video

            # We should save checkpoints every save_interval epochs
            if i % self.save_interval == 0 and i != 0:
                self.save_policy(epochs_done=i)
            print("Epochs: {}".format(i))

            self.replay_buffer.clear()

        # At the end of training, return the agent
        return self.agent

    def playback_trajectories(self, recdir=None):
        counts = []

        def handle_ep(observations, actions, rewards):
            counts[0] += 1
            counts[1] += observations.shape[0]
            logger.debug(
                'Observations.shape={}, actions.shape={}, rewards.shape={}',
                observations.shape, actions.shape, rewards.shape)

        if recdir is None:
            print("Error: Please specify a recording directory by calling gym_env.directory")
        else:
            scan_recorded_traces(recdir, handle_ep)

    def create_video(self, mode='eval', ext=0):
        if mode == 'eval':
            video = self.video_eval
        elif mode == 'train':
            video = self.video_train

        # Check if video is zero length
        if len(video) == 0:
            raise AssertionError("Video is empty.")
        print("Number of frames in video: {}".format(len(video)))

        obs_size = video[0].shape
        width = np.uint(obs_size[-3])
        height = np.uint(obs_size[-2])
        channels = np.uint(obs_size[-1])
        print("HEIGHT IS: {}, WIDTH IS: {}, CHANNELS IS: {}".format(
            width, height, channels))

        fourcc = cv.VideoWriter_fourcc(*'XVID')
        out = cv.VideoWriter(
            os.path.join(self.policy_save_dir,
                         "trajectories_{}_epoch_{}.avi".format(mode, ext)),
            fourcc, self.FPS, (width, height))
        for i in range(len(video)):
            img_rgb = cv.cvtColor(np.uint8(255 * video[i][0]),
                                  cv.COLOR_BGR2RGB)  # Save as RGB image
            out.write(img_rgb)
        out.release()

    def plot_eval(self):
        xs = [i * self.eval_interval for i in range(len(self.eval_returns))]
        plt.plot(xs, self.eval_returns)
        plt.xlabel("Training epochs")
        plt.ylabel("Average Return")
        plt.title("Average Returns as a Function of Training")
        plt.savefig(os.path.join(self.policy_save_dir, "eval_returns.png"))
        print("CREATED PLOT OF RETURNS...")

    def save_policy(self, epochs_done=0):
        """
        Using the PolicySaver(s) defined in the trainer constructor, this function
        saves the training and evaluation policies according to the policy_save_dir
        attribute and whether multiple PPO agents or a single master PPO agent is used.

        Arguments:
            1. epochs_done (int): The number of epochs completed in the training
               process at the time this save function is called.
        """
        # Save training policy
        train_save_dir = os.path.join(self.policy_save_dir, "train",
                                      "epochs_{}".format(epochs_done))
        if not os.path.exists(train_save_dir):
            os.makedirs(train_save_dir, exist_ok=True)
        self.train_saver.save(train_save_dir)
        print("Training policy saved...")

        # Save eval policy
        eval_save_dir = os.path.join(self.policy_save_dir, "eval",
                                     "epochs_{}".format(epochs_done))
        if not os.path.exists(eval_save_dir):
            os.makedirs(eval_save_dir, exist_ok=True)
        self.eval_saver.save(eval_save_dir)
        print("Eval policy saved...")

    def load_saved_policy(self, eval_model_path=None, train_model_path=None):
        # Load evaluation and/or training policies from the given paths
        if eval_model_path is not None:
            self.eval_policy = tf.saved_model.load(eval_model_path)
            print("Loading evaluation policy from: {}".format(eval_model_path))
        if train_model_path is not None:
            self.collect_policy = tf.saved_model.load(train_model_path)
            print("Loading training policy from: {}".format(train_model_path))
f.close()

train_agent(n_iterations=n_iterations)


# c) For storing frames
def get_vid_frames(policy, filename, num_episodes=20, fps=2):
    frames = []
    for _ in range(num_episodes):
        time_step = tf_env.reset()
        frames.append(np.abs(env.get_board()))
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = tf_env.step(action_step.action)
            frames.append(np.abs(env.get_board()))
    return frames


# Store data
df = pd.DataFrame(np.array(training_info).T,
                  columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])
df.to_csv('../DATA/Single/stats_{}.txt'.format(II), index=False, mode="a")

# Store frames
frames = get_vid_frames(agent.policy, "trained-agent")
with open('../DATA/Single/frames_{}.pkl'.format(II), 'wb') as f:
    pickle.dump(frames, f)

# Store model
my_policy = agent.policy
saver = PolicySaver(my_policy, batch_size=None)
saver.save('../DATA/Single/policy_{}'.format(II))
    train_env.action_spec())

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=config.REPLAY_BUFFER_MAX_LENGTH)

collect_data(train_env, random_policy, replay_buffer, steps=100)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=config.BATCH_SIZE,
    num_steps=2).prefetch(3)

my_policy = tf_agent.collect_policy
saver = PolicySaver(my_policy, batch_size=None)

iterator = iter(dataset)

tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                config.NUM_EVAL_EPISODES)
returns = [avg_return]
iterations = [0]

for _ in tqdm(range(config.NUM_ITERATIONS), total=config.NUM_ITERATIONS):
    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(config.COLLECT_STEPS_PER_ITERATION):