def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
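# Usage sketch (an assumption about the intended call pattern, not code from this
# repo): setup_logger is meant to be called at the top of train_PG so that
# locals() still contains exactly the function's arguments, e.g.
#
#     def train_PG(exp_name, env_name, n_iter, gamma, logdir, ...):
#         setup_logger(logdir, locals())
#         ...
#
# inspect.getargspec(train_PG)[0] lists the argument names, and params maps each
# name to its value in locals_ (or None if it is absent), so every hyperparameter
# of the run ends up in the file written by logz.save_params.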
def AC_train(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, num_target_updates,
             num_grad_steps_per_target_update, animate, logdir,
             normalize_advantages, seed, n_layers, size):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    params = locals()
    print(params)
    logz.save_params(params)

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Initialize the actor-critic agent
    network_args = {
        'n_layers': n_layers,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }
    env_args = {
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
    }
    sample_traj_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }
    estimate_return_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = ACAgent(network_args, env_args, sample_traj_args, estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()

    # Start training
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajs(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observations, actions, rewards, next observations and
        # terminals by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        # Fit the critic, then use it to estimate advantages for the actor update
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
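# Interface sketch: AC_train only depends on the handful of ACAgent methods called
# above. The skeleton below (hypothetical name, not the repo's actual ACAgent
# implementation) documents that assumed contract.
class _ACAgentInterface(object):
    def __init__(self, network_args, env_args, sample_traj_args, estimate_return_args):
        self.network_args = network_args
        self.env_args = env_args
        self.sample_traj_args = sample_traj_args
        self.estimate_return_args = estimate_return_args

    def build_computation_graph(self):
        """Define placeholders, the policy and critic networks, and training ops."""
        raise NotImplementedError

    def init_tf_sess(self):
        """Create the tf.Session and initialize all variables."""
        raise NotImplementedError

    def sample_trajs(self, itr, env):
        """Roll out the current policy; return (paths, timesteps_this_batch)."""
        raise NotImplementedError

    def update_critic(self, ob_no, next_ob_no, re_n, terminal_n):
        """Fit V(s) toward bootstrapped targets r + gamma * V(s') * (1 - terminal)."""
        raise NotImplementedError

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        """Return advantages r + gamma * V(s') * (1 - terminal) - V(s)."""
        raise NotImplementedError

    def update_actor(self, ob_no, ac_na, adv_n):
        """Take a policy-gradient step weighted by the estimated advantages."""
        raise NotImplementedError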
parser.add_argument('--tau', type=float, default=0.005)
parser.add_argument('--layer_norm', type=bool, default=True)
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--buffer_size', type=int, default=1000000)
parser.add_argument('--pop_size', type=int, default=16)
parser.add_argument('--elite_size', type=int, default=16)
parser.add_argument('--max_ep_len', type=int, default=1000)
parser.add_argument('--alpha', type=float, default=0.5)
parser.add_argument('--beta', type=float, default=2.)
parser.add_argument('--sigma', type=float, default=0.1)
parser.add_argument('--k', type=float, default=10)
parser.add_argument('--epochs', type=int, default=1000)
parser.add_argument('--save_freq', type=int, default=5)
parser.add_argument('--start_epoch', type=int, default=1)
parser.add_argument('--rl_train_steps', type=int, default=1000)
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--dir_path', type=str, default='results_v3/')
args = parser.parse_args()

output_path = args.dir_path
# Run ten seeds, each logging to its own output directory
for seed in range(1, 11):
    args.seed = seed
    args.dir_path = get_output_folder(output_path, args.env, args.seed)
    logz.configure_output_dir(args.dir_path)
    logz.save_params(vars(args))
    gesrl = GESRL()
    gesrl.train()
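# Note (assumption about a helper defined elsewhere in this repo): get_output_folder
# is expected to return a per-env, per-seed results directory under output_path,
# roughly like the sketch below, so the ten GESRL runs write their logs to separate
# folders instead of overwriting each other.
#
#     def get_output_folder(parent_dir, env_name, seed):
#         folder = os.path.join(parent_dir, '{}-seed{}'.format(env_name, seed))
#         os.makedirs(folder, exist_ok=True)
#         return folder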
def TD3_train(env, logdir='.', actor_critic=actor_critic, iterations=600000,
              replay_size=int(1e6), gamma=0.99, polyak=0.995, actor_lr=1e-3,
              critic_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1,
              target_noise=0.2, noise_clip=0.5, policy_delay=4):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    params = locals()
    print(params)
    logz.save_params(params)

    td3 = TD3Agent(env, actor_critic, gamma, polyak, actor_lr, critic_lr, act_noise)
    td3.build_computation_graph()
    td3.init_tf_sess()
    td3.graph_initialization()

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    replay_buffer = ReplayBuffer(ob_dim, ac_dim, replay_size)

    start_time = time.time()
    ob = env.reset()
    actor_loss = []
    critic_loss = []

    for ii in range(iterations):
        # Act randomly until start_steps to fill the replay buffer, then act
        # with the learned (noisy) policy
        if ii < start_steps:
            ac = env.action_space.sample()
        else:
            ac = td3.sample_action(ob)

        ob_next, rew, done, _ = env.step(ac)
        replay_buffer.store(ob, ac, rew, ob_next, done)
        ob = ob_next
        if done:
            ob = env.reset()

        # Before start_steps, only collect experience into the buffer
        if ii < start_steps:
            continue

        batch = replay_buffer.sample_batch(batch_size=batch_size)

        # Update the critic every step
        c_loss = td3.update_critic(batch['obs1'], batch['obs2'], batch['acts'],
                                   batch['rews'], batch['done'])
        critic_loss.append(c_loss)

        # Delayed actor and target-network updates
        if ii % policy_delay == 0:
            a_loss = td3.update_actor_and_target(batch['obs1'], batch['obs2'],
                                                 batch['acts'], batch['rews'],
                                                 batch['done'])
            actor_loss.append(a_loss)

        if ii % 10000 == 0:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", ii)
            logz.log_tabular("AverageActorLoss", np.mean(np.array(actor_loss)))
            logz.log_tabular("AverageCriticLoss", np.mean(np.array(critic_loss)))
            logz.log_tabular("ActorLossStd", np.std(np.array(actor_loss)))
            logz.log_tabular("CriticLossStd", np.std(np.array(critic_loss)))
            logz.dump_tabular()
            logz.pickle_tf_vars()
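# Replay-buffer contract assumed by TD3_train: ReplayBuffer(ob_dim, ac_dim, size)
# exposes store(obs, act, rew, next_obs, done) and sample_batch(batch_size), the
# latter returning a dict keyed by 'obs1', 'obs2', 'acts', 'rews', 'done'. A
# minimal NumPy sketch with that interface (an illustration, not necessarily the
# repo's implementation) follows.
class _ReplayBufferSketch(object):
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest transition once the buffer is full
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        # Uniformly sample stored transitions for an off-policy update
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])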