def init_memory_buffer(self, params: Dict) -> PPOReplayBuffer:
    """Construct the PPO replay buffer, filling in the observation/action
    dimensions and discount factor from the agent before forwarding the
    remaining settings to the buffer constructor."""
    params["obs_dim"] = self.state_dim
    params["action_dim"] = self.num_actions
    params["discount_factor"] = self.γ
    return PPOReplayBuffer(**params)
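# Illustrative sketch (not from the original file): one way a caller might drive
# init_memory_buffer. It assumes only that the agent exposes state_dim, num_actions,
# and γ as used above, and that PPOReplayBuffer also accepts a "max_size" keyword;
# "max_size" and "_example_build_buffer" are hypothetical names introduced here purely
# for illustration.
def _example_build_buffer(agent, capacity: int = 100000) -> PPOReplayBuffer:
    buffer_params = {"max_size": capacity}  # assumed buffer-specific setting
    # init_memory_buffer adds obs_dim, action_dim, and discount_factor before
    # forwarding everything to the PPOReplayBuffer constructor.
    return agent.init_memory_buffer(buffer_params)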
def train_PG(
        exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, mini_batch_size,
        max_path_length, learning_rate, num_ppo_updates, num_value_iters,
        animate, logdir, normalize_advantages, nn_critic, seed,
        n_layers, size, gru_size, history, num_tasks, l2reg, recurrent,
        generalized, granularity):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    envs = {'pm': PointEnv,
            'pm-obs': ObservedPointEnv,
            }
    env = envs[env_name](num_tasks)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    task_dim = len(env._goal)  # rude, sorry

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'task_dim': task_dim,
        'size': size,
        'gru_size': gru_size,
        'learning_rate': learning_rate,
        'history': history,
        'num_value_iters': num_value_iters,
        'l2reg': l2reg,
        'recurrent': recurrent,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
        'generalized': generalized,
        'granularity': granularity,
    }

    estimate_return_args = {
        'gamma': gamma,
        'nn_critic': nn_critic,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    def unpack_sample(data):
        '''
        unpack a sample from the replay buffer
        '''
        ob = data["observations"]
        ac = data["actions"]
        re = data["rewards"]
        hi = data["hiddens"]
        ma = 1 - data["terminals"]
        return ob, ac, re, hi, ma

    # construct PPO replay buffer, perhaps rude to do outside the agent
    ppo_buffer = PPOReplayBuffer(agent.replay_buffer)

    total_timesteps = 0
    for itr in range(n_iter):
        # for PPO: flush the replay buffer!
        ppo_buffer.flush()

        # sample trajectories to fill agent's replay buffer
        print("********** Iteration %i ************" % itr)
        stats = []
        for _ in range(num_tasks):
            s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
            total_timesteps += timesteps_this_batch
            stats += s

        # compute the log probs, advantages, and returns for all data in agent's buffer
        # store in ppo buffer for use in multiple ppo updates
        # TODO: should move inside the agent probably
        data = agent.replay_buffer.all_batch()
        ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
        fixed_log_probs = agent.sess.run(
            agent.sy_lp_n,
            feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})
        q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks)

        ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n)

        # update with mini-batches sampled from ppo buffer
        for _ in range(num_ppo_updates):
            data = ppo_buffer.random_batch(mini_batch_size)

            ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
            fixed_log_probs = data["log_probs"]
            adv_n = data["advantages"]
            q_n = data["returns"]

            # log probs under the current policy (not used directly in the update below)
            log_probs = agent.sess.run(
                agent.sy_lp_n,
                feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})

            agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n)

        # compute validation statistics
        print('Validating...')
        val_stats = []
        for _ in range(num_tasks):
            # discard the validation timestep count so the training value of
            # timesteps_this_batch is what gets logged below
            vs, _ = agent.sample_trajectories(
                itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
            val_stats += vs

        # save trajectories for viz
        with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
            pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL)
        agent.val_replay_buffer.flush()

        # Log TRAIN diagnostics
        returns = [sum(s["rewards"]) for s in stats]
        final_rewards = [s["rewards"][-1] for s in stats]
        ep_lengths = [s['ep_len'] for s in stats]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("FinalReward", np.mean(final_rewards))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)

        # Log VAL diagnostics
        val_returns = [sum(s["rewards"]) for s in val_stats]
        val_final_rewards = [s["rewards"][-1] for s in val_stats]
        logz.log_tabular("ValAverageReturn", np.mean(val_returns))
        logz.log_tabular("ValFinalReward", np.mean(val_final_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()
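#========================================================================================#
# Illustrative sketch (not part of the original script): one way train_PG could be
# driven directly, shown mainly to document the call signature. Every value below is an
# assumption chosen for illustration; the repository's real entry point (e.g. an
# argparse-based main()) may use different names, defaults, and flags. The function is
# deliberately never called at import time.
#========================================================================================#
def _example_run():
    train_PG(
        exp_name='pm_debug',
        env_name='pm',                     # or 'pm-obs' for ObservedPointEnv
        n_iter=100,
        gamma=0.99,
        min_timesteps_per_batch=10000,
        mini_batch_size=64,
        max_path_length=100,
        learning_rate=5e-4,
        num_ppo_updates=10,
        num_value_iters=25,
        animate=False,
        logdir='data/pm_debug',
        normalize_advantages=True,
        nn_critic=False,
        seed=1,
        n_layers=2,
        size=64,
        gru_size=32,
        history=1,
        num_tasks=4,
        l2reg=False,
        recurrent=True,
        generalized=False,
        granularity=1,
    )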