def train_agent(
    agent,
    env,
    steps,
    outdir,
    checkpoint_freq=None,
    max_episode_len=None,
    step_offset=0,
    evaluator=None,
    successful_score=None,
    step_hooks=(),
    logger=None,
):
    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()

    t = step_offset
    if hasattr(agent, "t"):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # a_t
            action = agent.act(obs)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1
            reset = episode_len == max_episode_len or info.get("needs_reset", False)
            agent.observe(obs, r, done, reset)

            for hook in step_hooks:
                hook(env, agent, t)

            if done or reset or t == steps:
                logger.info(
                    "outdir:%s step:%s episode:%s R:%s",
                    outdir,
                    t,
                    episode_idx,
                    episode_r,
                )
                logger.info("statistics:%s", agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t=t, episodes=episode_idx + 1)
                    if (
                        successful_score is not None
                        and evaluator.max_score >= successful_score
                    ):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
            if checkpoint_freq and t % checkpoint_freq == 0:
                save_agent(agent, t, outdir, logger, suffix="_checkpoint")

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix="_except")
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix="_finish")
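
# A minimal usage sketch for train_agent, assuming a classic-control Gym env and
# a PFRL DQN agent (any agent exposing act/observe/get_statistics would work the
# same way). Hyperparameters and the output directory name are illustrative only.
def _example_train_agent_usage():
    import gym
    import torch
    import torch.nn as nn
    import pfrl
    from pfrl import explorers, replay_buffers
    from pfrl.q_functions import DiscreteActionValueHead

    env = pfrl.wrappers.CastObservationToFloat32(gym.make("CartPole-v0"))
    obs_size = env.observation_space.low.size
    n_actions = env.action_space.n

    q_func = nn.Sequential(
        nn.Linear(obs_size, 64),
        nn.ReLU(),
        nn.Linear(64, n_actions),
        DiscreteActionValueHead(),
    )
    agent = pfrl.agents.DQN(
        q_func,
        torch.optim.Adam(q_func.parameters()),
        replay_buffers.ReplayBuffer(10 ** 4),
        gamma=0.99,
        explorer=explorers.ConstantEpsilonGreedy(
            0.1, random_action_func=env.action_space.sample
        ),
        replay_start_size=500,
        target_update_interval=100,
    )
    # Run the single-env training loop defined above.
    train_agent(
        agent,
        env,
        steps=10 ** 4,
        outdir="example_results",
        max_episode_len=env.spec.max_episode_steps,
    )
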
def train_agent_batch(
    agent,
    env,
    steps,
    outdir,
    checkpoint_freq=None,
    log_interval=None,
    max_episode_len=None,
    step_offset=0,
    evaluator=None,
    successful_score=None,
    step_hooks=(),
    evaluation_hooks=(),
    return_window_size=100,
    logger=None,
):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        outdir (str): Path to the directory to output things.
        checkpoint_freq (int): Frequency at which agent checkpoints are saved.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value, if not None.
        step_hooks (Sequence): Sequence of callable objects that accepts
            (env, agent, step) as arguments. They are called every step.
            See pfrl.experiments.hooks.
        evaluation_hooks (Sequence): Sequence of callable objects that accepts
            (env, agent, evaluator, step, eval_score) as arguments. They are
            called every evaluation. See pfrl.experiments.evaluation_hooks.
        logger (logging.Logger): Logger used in this function.

    Returns:
        List of evaluation episode stats dict.
    """

    logger = logger or logging.getLogger(__name__)
    recent_returns = deque(maxlen=return_window_size)

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype="i")
    episode_len = np.zeros(num_envs, dtype="i")

    # o_0, r_0
    obss = env.reset()

    t = step_offset
    if hasattr(agent, "t"):
        agent.t = step_offset

    eval_stats_history = []  # List of evaluation episode stats dict

    try:
        while True:
            # a_t
            actions = agent.batch_act(obss)
            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = episode_len == max_episode_len
            resets = np.logical_or(
                resets, [info.get("needs_reset", False) for info in infos]
            )
            # Agent observes the consequences
            agent.batch_observe(obss, rs, dones, resets)

            # Make mask: 0 if done/reset, 1 otherwise
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end
            recent_returns.extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                if checkpoint_freq and t % checkpoint_freq == 0:
                    save_agent(agent, t, outdir, logger, suffix="_checkpoint")

                for hook in step_hooks:
                    hook(env, agent, t)

            if (
                log_interval is not None
                and t >= log_interval
                and t % log_interval < num_envs
            ):
                logger.info(
                    "outdir:{} step:{} episode:{} last_R: {} average_R:{}".format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    )
                )
                logger.info("statistics: {}".format(agent.get_statistics()))
            if evaluator:
                eval_score = evaluator.evaluate_if_necessary(
                    t=t, episodes=np.sum(episode_idx)
                )
                if eval_score is not None:
                    eval_stats = dict(agent.get_statistics())
                    eval_stats["eval_score"] = eval_score
                    eval_stats_history.append(eval_stats)
                    for hook in evaluation_hooks:
                        hook(env, agent, evaluator, t, eval_score)
                    if (
                        successful_score is not None
                        and evaluator.max_score >= successful_score
                    ):
                        break

            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix="_except")
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final model
        save_agent(agent, t, outdir, logger, suffix="_finish")

    return eval_stats_history
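
# A minimal sketch of driving train_agent_batch with a vectorized environment.
# It assumes PFRL's MultiprocessVectorEnv (whose reset() accepts the per-env
# mask used above) and an agent implementing batch_act/batch_observe; `make_env`
# is a hypothetical zero-argument callable returning a fresh Gym env.
def _example_train_agent_batch_usage(agent, make_env, outdir="example_batch_results"):
    import pfrl

    num_envs = 4  # illustrative; any number of parallel envs works
    vec_env = pfrl.envs.MultiprocessVectorEnv([make_env for _ in range(num_envs)])
    return train_agent_batch(
        agent,
        vec_env,
        steps=10 ** 5,
        outdir=outdir,
        log_interval=1000,
        max_episode_len=200,
    )
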
def train_hrl_agent(
    agent: HIROAgent,
    env,
    steps,
    outdir,
    checkpoint_freq=None,
    max_episode_len=None,
    step_offset=0,
    evaluator=None,
    successful_score=None,
    step_hooks=(),
    logger=None,
):
    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    obs_dict = env.reset()
    fg = obs_dict['desired_goal']
    obs = obs_dict['observation']
    # Sample an initial subgoal
    sg = env.subgoal_space.sample()

    t = step_offset
    step = 0
    if hasattr(agent, "t"):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # Get the low-level action
            action = agent.act_low_level(obs, sg)
            # Take a step in the environment
            obs_dict, r, done, info = env.step(action)
            obs = obs_dict['observation']
            n_sg = agent.act_high_level(obs, fg, sg, step, t)

            episode_r += r
            episode_len += 1
            reset = episode_len == max_episode_len or info.get("needs_reset", False)
            agent.observe(obs, fg, n_sg, r, done, reset, step, t)
            sg = n_sg

            t += 1
            step += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or reset or t == steps:
                logger.info(
                    "outdir:%s step:%s episode:%s R:%s",
                    outdir,
                    t,
                    episode_idx,
                    episode_r,
                )
                logger.info("statistics:%s", agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t=t, episodes=episode_idx + 1)
                    if (
                        successful_score is not None
                        and evaluator.max_score >= successful_score
                    ):
                        break
                if t == steps:
                    break
                # Start a new episode, reset the environment and goal
                agent.last_x = obs[0]
                agent.last_y = obs[1]
                agent.last_z = obs[2]
                env.evaluate = False
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                step = 0
                agent.end_episode()
                obs_dict = env.reset()
                fg = obs_dict['desired_goal']
                obs = obs_dict['observation']
                agent.sample_subgoal(obs, fg)
            if checkpoint_freq and t % checkpoint_freq == 0:
                save_agent(agent, t, outdir, logger, suffix="_checkpoint")

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix="_except")
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix="_finish")
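
# train_hrl_agent above assumes a goal-conditioned environment: reset()/step()
# return a dict with 'observation' and 'desired_goal' keys, and the env exposes
# a `subgoal_space` plus an `evaluate` flag toggled at episode boundaries. A
# rough sketch of such an interface around a flat-observation Gym env follows;
# the subgoal bounds, goal dimensionality, and per-episode goal sampling are
# illustrative assumptions, not part of HIROAgent itself.
class _ExampleGoalEnvWrapper(gym.Wrapper):
    def __init__(self, env, goal_dim=2, goal_bound=10.0):
        super().__init__(env)
        self.subgoal_space = gym.spaces.Box(
            low=-goal_bound * np.ones(goal_dim, dtype=np.float32),
            high=goal_bound * np.ones(goal_dim, dtype=np.float32),
        )
        self.evaluate = False
        self._desired_goal = self.subgoal_space.sample()

    def reset(self, **kwargs):
        # Draw a new final goal for the episode.
        self._desired_goal = self.subgoal_space.sample()
        return self._to_dict(self.env.reset(**kwargs))

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return self._to_dict(obs), reward, done, info

    def _to_dict(self, obs):
        return {"observation": obs, "desired_goal": self._desired_goal}
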
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="DClawTurnFixed-v0",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--gpu", type=int, default=-1, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--load", type=str, default="", help="Directory to load agent from."
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before performing gradient updates.",
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size")
    parser.add_argument(
        "--render", action="store_true", help="Render env states in a GUI window."
    )
    parser.add_argument(
        "--demo", action="store_true", help="Just run evaluation, not training."
    )
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument(
        "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor."
    )
    parser.add_argument(
        "--log-level", type=int, default=logging.INFO, help="Level of the root logger."
    )
    parser.add_argument("--gamma", type=float, default=0.9)
    parser.add_argument("--ddpg-training-steps", type=int, default=int(1e3))
    parser.add_argument("--adversary-training-steps", type=int, default=int(1e3))
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    q_func = nn.Sequential(
        ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
    )
    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size),
        BoundByTanh(low=action_space.low, high=action_space.high),
        DeterministicHead(),
    )
    ddpg_opt_a = torch.optim.Adam(policy.parameters())
    ddpg_opt_c = torch.optim.Adam(q_func.parameters())
    ddpg_rbuf = replay_buffers.ReplayBuffer(10 ** 6)
    ddpg_explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high
    )

    def ddpg_burnin_action_func():
        """Select random actions until the model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters from http://arxiv.org/abs/1802.09477
    ddpg_agent = DDPG(
        policy,
        q_func,
        ddpg_opt_a,
        ddpg_opt_c,
        ddpg_rbuf,
        gamma=args.gamma,
        explorer=ddpg_explorer,
        replay_start_size=args.replay_start_size,
        target_update_method="soft",
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=ddpg_burnin_action_func,
    )

    def adversary_random_func():
        # The adversary picks one action dimension to disable
        return np.random.randint(0, action_size)

    # adversary_q = Critic(obs_size, 1, hidden_size=adversary_hidden_size)
    # adversary_action_space = gym.spaces.discrete.Discrete(9)
    # adversary_q = q_functions.FCQuadraticStateQFunction(
    #     obs_size, 1, n_hidden_channels=256, n_hidden_layers=2,
    #     action_space=adversary_action_space,
    # )
    adversary_q = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        # One Q-value per action dimension the adversary can zero out
        nn.Linear(256, action_size),
        DiscreteActionValueHead(),
    )
    adversary_optimizer = torch.optim.Adam(adversary_q.parameters(), lr=1e-3)
    adversary_rbuf_capacity = int(1e6)
    adversary_rbuf = replay_buffers.ReplayBuffer(adversary_rbuf_capacity)
    adversary_explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, 10 ** 4, adversary_random_func
    )

    adversary_agent = DQN(
        adversary_q,
        adversary_optimizer,
        adversary_rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=adversary_explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=1,
        minibatch_size=args.batch_size,
        target_update_method="soft",
        soft_update_tau=5e-3,
    )

    logger = logging.getLogger(__name__)
    eval_env = make_env(test=True)
    evaluator = Evaluator(
        agent=ddpg_agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        max_episode_len=timestep_limit,
        env=eval_env,
        step_offset=0,
        save_best_so_far_agent=True,
        use_tensorboard=True,
        logger=logger,
    )

    episode_reward = 0
    ddpg_episode_idx = 0
    adversary_episode_idx = 0

    # o_0, r_0
    current_state = env.reset()

    t = 0
    ddpg_t = 0
    adversary_t = 0
    episode_len = 0
    try:
        while t < args.max_steps:
            # DDPG phase: train the protagonist while the adversary only acts
            for i in range(args.ddpg_training_steps):
                t += 1
                ddpg_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                # The adversary disables one action dimension
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                episode_reward += reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                ddpg_agent.observe(next_state, reward, done, reset)
                current_state = next_state

                if done or reset or t == args.max_steps:
                    logger.info(
                        "ddpg phase: outdir:%s step:%s episode:%s R:%s",
                        args.outdir,
                        ddpg_t,
                        ddpg_episode_idx,
                        episode_reward,
                    )
                    logger.info("statistics:%s", ddpg_agent.get_statistics())
                    if evaluator is not None:
                        evaluator.evaluate_if_necessary(
                            t=t, episodes=ddpg_episode_idx + 1
                        )
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    ddpg_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()

            episode_reward = 0
            episode_len = 0
            current_state = env.reset()

            print("start adversary training")
            # Adversary phase: train the adversary on the negated reward
            for i in range(args.adversary_training_steps):
                t += 1
                adversary_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                reward = -reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get("needs_reset", False)
                adversary_agent.observe(next_state, reward, done, reset)
                current_state = next_state

                if done or reset or t == args.max_steps:
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    adversary_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_except")
        save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_except")
        raise

    # Save the final models
    save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_finish")
    save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_finish")

    # if args.demo:
    #     eval_env.render()
    #     eval_stats = experiments.eval_performance(
    #         env=eval_env,
    #         agent=ddpg_agent,
    #         n_steps=None,
    #         n_episodes=args.eval_n_runs,
    #         max_episode_len=timestep_limit,
    #     )
    #     print(
    #         "n_runs: {} mean: {} median: {} stdev {}".format(
    #             args.eval_n_runs,
    #             eval_stats["mean"],
    #             eval_stats["median"],
    #             eval_stats["stdev"],
    #         )
    #     )
    # else:
    #     experiments.train_agent_with_evaluation(
    #         agent=ddpg_agent,
    #         env=env,
    #         steps=args.max_steps,
    #         eval_env=eval_env,
    #         eval_n_steps=None,
    #         eval_n_episodes=args.eval_n_runs,
    #         eval_interval=args.eval_interval,
    #         outdir=args.outdir,
    #         train_max_episode_len=timestep_limit,
    #     )

    print("finish")
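
# In the training loop above, the adversary's discrete action picks one
# dimension of the DDPG action to zero out. A self-contained sketch of that
# perturbation step (the function name and copy semantics are illustrative):
def _example_adversarial_mask(ddpg_action, adversary_action):
    """Return a copy of `ddpg_action` with the adversary-chosen index zeroed."""
    import numpy as np

    masked = np.asarray(ddpg_action, dtype=np.float32).copy()
    masked[adversary_action] = 0.0
    return masked
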