def _test_actor_learner_training(self, gpu, steps=100000, require_success=True):
    """Run actor-learner training with hooks attached and verify bookkeeping.

    Args:
        gpu: Device id forwarded to ``self.make_agent``.
        steps (int): Upper bound on global training steps; 0 skips training.
        require_success (bool): If True, the run must reach the successful
            score; otherwise completing all steps is also accepted.
    """
    logging.basicConfig(level=logging.DEBUG)

    test_env, successful_return = self.make_env_and_successful_return(test=True)
    agent = self.make_agent(test_env, gpu)

    # cumulative_steps init to 0
    assert agent.cumulative_steps == 0

    def make_env(process_idx, test):
        # Each actor process builds its own env; the returned score is unused.
        env, _ = self.make_env_and_successful_return(test=test)
        return env

    # Mocks record how often, and with which arguments, each hook fires.
    step_hook = mock.Mock()
    optimizer_step_hook = mock.Mock()

    # Train
    if steps > 0:
        (
            make_actor,
            learner,
            poller,
            exception_event,
        ) = agent.setup_actor_learner_training(
            n_actors=2,
            step_hooks=[step_hook],
            optimizer_step_hooks=[optimizer_step_hook],
        )
        poller.start()
        learner.start()
        train_agent_async(
            processes=2,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            make_env=make_env,
            make_agent=make_actor,
            stop_event=learner.stop_event,
            exception_event=exception_event,
        )
        # Tear down: learner first, then poller (stop before join for each).
        learner.stop()
        learner.join()
        poller.stop()
        poller.join()

    # Test
    # Because in actor-learner training the model can be updated between
    # evaluation and saving, it is difficult to guarantee the learned
    # model successfully passes the test.
    # Thus we only check if the training was successful.

    # As the test can finish before running all the steps,
    # we can only test the range
    assert agent.cumulative_steps > 0
    assert agent.cumulative_steps <= steps + 1

    # Unlike the non-actor-learner cases, the step_hooks and
    # optimizer_step_hooks are only called when the update happens.
    # When we do a fast test, the update may not be triggered due to
    # limited amount of experience; the call_count can be 0 in such a case.
    assert step_hook.call_count >= 0
    assert step_hook.call_count <= steps / agent.update_interval
    assert optimizer_step_hook.call_count == step_hook.call_count

    # Hooks are invoked as hook(env, agent, step); the env argument is None
    # here, and the step argument grows monotonically per invocation.
    for i, call in enumerate(step_hook.call_args_list):
        args, kwargs = call
        assert args[0] is None
        assert args[1] is agent
        assert args[2] == (i + 1) * agent.update_interval
    for i, call in enumerate(optimizer_step_hook.call_args_list):
        args, kwargs = call
        assert args[0] is None
        assert args[1] is agent
        assert args[2] == i + 1

    # train_agent_async leaves a "successful" dir on reaching the score, or a
    # "<steps>_finish" dir when all steps complete.
    successful_path = os.path.join(self.tmpdir, "successful")
    finished_path = os.path.join(self.tmpdir, "{}_finish".format(steps))
    if require_success:
        assert os.path.exists(successful_path)
    else:
        assert os.path.exists(successful_path) or os.path.exists(finished_path)
def main():
    """Train (or demo) an A3C agent on an Atari environment.

    Parses CLI options, builds the env factory, model, shared optimizer and
    agent, then either evaluates a loaded model (--demo) or launches
    asynchronous training across ``--processes`` worker processes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--processes", type=int, default=16)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=8 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, False)
    # First axis of the observation space is the channel count fed to Conv2d.
    obs_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        # Two heads sharing the trunk: policy (softmax) and state value.
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )

    # SharedRMSprop is same as torch.optim.RMSprop except that it initializes
    # its state in __init__, allowing it to be moved to shared memory.
    # Fix: honor --lr here (it was hard-coded to 7e-4, the option's default,
    # so a user-supplied --lr was ignored until the first decay-hook call).
    opt = SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=args.lr, eps=1e-1, alpha=0.99
    )
    assert opt.state_dict()["state"], (
        "To share optimizer state across processes, the state must be"
        " initialized before training."
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40.0,
    )

    if args.load_pretrained:
        raise Exception("Pretrained models are currently unsupported.")

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
        )
        print(
            "n_steps: {} mean: {} median: {} stdev: {}".format(
                args.eval_n_steps,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter
        )

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=True,
        )
def main():
    """Train (or demo) an ACER agent on an Atari environment.

    Parses CLI options, builds the shared model (optionally recurrent with an
    LSTM), shared optimizer, episodic replay buffer and agent, then either
    evaluates (--demo) or launches asynchronous training across the given
    number of worker processes.
    """
    parser = argparse.ArgumentParser()
    # Note: "processes" is positional (required), unlike the other examples.
    parser.add_argument("processes", type=int)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--replay-start-size", type=int, default=10000)
    parser.add_argument("--n-times-replay", type=int, default=4)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--use-lstm", action="store_true")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    # Query the action space size from an unwrapped env instance.
    n_actions = gym.make(args.env).action_space.n

    # Shared convolutional trunk feeding both the policy and Q heads.
    input_to_hidden = nn.Sequential(
        nn.Conv2d(4, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
    )

    head = acer.ACERDiscreteActionHead(
        pi=nn.Sequential(
            nn.Linear(256, n_actions),
            SoftmaxCategoricalHead(),
        ),
        q=nn.Sequential(
            nn.Linear(256, n_actions),
            DiscreteActionValueHead(),
        ),
    )

    if args.use_lstm:
        # Insert an LSTM between the trunk and the heads for recurrence.
        model = pfrl.nn.RecurrentSequential(
            input_to_hidden,
            nn.LSTM(num_layers=1, input_size=256, hidden_size=256),
            head,
        )
    else:
        model = nn.Sequential(input_to_hidden, head)

    model.apply(pfrl.initializers.init_chainer_default)

    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=args.lr, eps=4e-3, alpha=0.99
    )

    # Split the total replay capacity evenly across worker processes.
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=replay_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40,
        recurrent=args.use_lstm,
    )

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter
        )

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    """Train (or demo) a DQN/NAF agent, optionally in actor-learner mode.

    For continuous action spaces a NAF-style quadratic Q-function with
    Ornstein-Uhlenbeck exploration is used; for discrete spaces a plain
    Q-function with epsilon-greedy.  With --actor-learner, sampling runs in
    asynchronous actor processes while a separate learner updates the model.
    """
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--env", type=str, default="Pendulum-v0")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--final-exploration-steps", type=int, default=10**4)
    parser.add_argument("--start-epsilon", type=float, default=1.0)
    parser.add_argument("--end-epsilon", type=float, default=0.1)
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--steps", type=int, default=10**5)
    parser.add_argument("--prioritized-replay", action="store_true")
    parser.add_argument("--replay-start-size", type=int, default=1000)
    parser.add_argument("--target-update-interval", type=int, default=10**2)
    parser.add_argument("--target-update-method", type=str, default="hard")
    parser.add_argument("--soft-update-tau", type=float, default=1e-2)
    parser.add_argument("--update-interval", type=int, default=1)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--eval-interval", type=int, default=10**4)
    parser.add_argument("--n-hidden-channels", type=int, default=100)
    parser.add_argument("--n-hidden-layers", type=int, default=2)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--minibatch-size", type=int, default=None)
    parser.add_argument("--render-train", action="store_true")
    parser.add_argument("--render-eval", action="store_true")
    parser.add_argument("--monitor", action="store_true")
    parser.add_argument("--reward-scale-factor", type=float, default=1e-3)
    parser.add_argument(
        "--actor-learner",
        action="store_true",
        help="Enable asynchronous sampling with asynchronous actor(s)",
    )  # NOQA
    parser.add_argument(
        "--num-envs",
        type=int,
        default=1,
        help=(
            "The number of environments for sampling (only effective with"
            " --actor-learner enabled)"
        ),
    )  # NOQA
    args = parser.parse_args()

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def clip_action_filter(a):
        # Clip actions into the env's valid bounds.  Relies on the
        # `action_space` name bound later in this function (closure).
        return np.clip(a, action_space.low, action_space.high)

    def make_env(idx=0, test=False):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        utils.set_random_seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            utils.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space,
        )
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
        )
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample,
        )

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    opt = optim.Adam(q_func.parameters())

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        # Anneal the prioritization bias over the updates that actually occur.
        betasteps = (args.steps - args.replay_start_size) // args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    elif not args.actor_learner:
        print(
            "WARNING: Since https://github.com/pfnet/pfrl/pull/112 we have started"
            " setting `eval_during_episode=True` in this script, which affects the"
            " timings of evaluation phases."
        )
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
            eval_during_episode=True,
        )
    else:
        # using impala mode when given num of envs

        # When we use multiple envs, it is critical to ensure each env
        # can occupy a CPU core to get the best performance.
        # Therefore, we need to prevent potential CPU over-provision caused by
        # multi-threading in Openmp and Numpy.
        # Disable the multi-threading on Openmp and Numpy.
        os.environ["OMP_NUM_THREADS"] = "1"  # NOQA

        (
            make_actor,
            learner,
            poller,
            exception_event,
        ) = agent.setup_actor_learner_training(args.num_envs)

        poller.start()
        learner.start()

        experiments.train_agent_async(
            processes=args.num_envs,
            make_agent=make_actor,
            make_env=make_env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            stop_event=learner.stop_event,
            exception_event=exception_event,
        )

        poller.stop()
        learner.stop()
        poller.join()
        learner.join()
def _test_actor_learner_training(self, gpu, steps=100000, require_success=True):
    """Run actor-learner training end to end and check that it completed.

    Args:
        gpu: Device id forwarded to ``self.make_agent``.
        steps (int): Upper bound on global training steps; 0 skips training.
        require_success (bool): If True, the run must reach the successful
            score; otherwise finishing all steps is also accepted.
    """
    logging.basicConfig(level=logging.DEBUG)

    eval_env, target_score = self.make_env_and_successful_return(test=True)
    agent = self.make_agent(eval_env, gpu)

    # A freshly built agent has not taken any steps yet.
    assert agent.cumulative_steps == 0

    def build_env(process_idx, test):
        # Actor processes get their own env; the score component is unused.
        env, _ = self.make_env_and_successful_return(test=test)
        return env

    if steps > 0:
        # Train: spin up the poller and learner, then run async actors.
        make_actor, learner, poller, exception_event = (
            agent.setup_actor_learner_training(n_actors=2)
        )
        for worker in (poller, learner):
            worker.start()
        train_agent_async(
            processes=2,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=target_score,
            make_env=build_env,
            make_agent=make_actor,
            stop_event=learner.stop_event,
            exception_event=exception_event,
        )
        for worker in (learner, poller):
            worker.stop()
            worker.join()

    # Because in actor-learner training the model can be updated between
    # evaluation and saving, it is difficult to guarantee the learned
    # model successfully passes the test; we only check that training ran.
    # The run may also stop early once the successful score is reached,
    # so only a range of cumulative steps can be asserted.
    assert 0 < agent.cumulative_steps <= steps + 1

    success_marker = os.path.join(self.tmpdir, "successful")
    finish_marker = os.path.join(self.tmpdir, "{}_finish".format(steps))
    if require_success:
        assert os.path.exists(success_marker)
    else:
        assert os.path.exists(success_marker) or os.path.exists(finish_marker)