def _test_load_rainbow(self, gpu):
    from pfrl.q_functions import DistributionalDuelingDQN

    q_func = DistributionalDuelingDQN(4, 51, -10, 10)
    pnn.to_factorized_noisy(q_func, sigma_scale=0.5)
    explorer = explorers.Greedy()
    opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4)
    rbuf = replay_buffers.ReplayBuffer(100)
    agent = agents.CategoricalDoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=50,
        target_update_interval=32000,
        update_interval=4,
        batch_accumulator="mean",
        phi=lambda x: x,
    )

    downloaded_model, exists = download_model(
        "Rainbow", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type
    )
    agent.load(downloaded_model)
    if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
        assert exists
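# Note: `to_factorized_noisy` above replaces every `nn.Linear` in the network
# with a `FactorizedNoisyLinear`, so exploration comes from learned parameter
# noise and the explorer can be plain `Greedy`. A minimal standalone sketch of
# the effect (illustration only, not part of the test):
import torch.nn as nn

from pfrl import nn as pnn

net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
pnn.to_factorized_noisy(net, sigma_scale=0.5)
# The two Linear modules are now FactorizedNoisyLinear, so forward passes
# during training are stochastic even for a fixed input.
print(net)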
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="SlimeVolley-v0")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.1)
    parser.add_argument("--steps", type=int, default=2 * 10**6)
    parser.add_argument("--replay-start-size", type=int, default=1600)
    parser.add_argument("--eval-n-episodes", type=int, default=1000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--gamma", type=float, default=0.98)
    parser.add_argument("--v-max", type=float, default=1)
    parser.add_argument("--n-step-return", type=int, default=3)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        if "SlimeVolley" in args.env:
            # You need to install slimevolleygym
            import slimevolleygym  # NOQA

        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        if isinstance(env.action_space, gym.spaces.MultiBinary):
            env = MultiBinaryAsDiscreteAction(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    obs_size = env.observation_space.low.size
    n_actions = env.action_space.n
    n_atoms = 51
    v_max = args.v_max
    v_min = -args.v_max
    hidden_size = 512
    q_func = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        DistributionalDuelingHead(hidden_size, n_actions, n_atoms, v_min, v_max),
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32)

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Use the same eps as https://arxiv.org/abs/1710.02298
    opt = torch.optim.Adam(q_func.parameters(), 1e-4, eps=1.5e-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 1
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        10**6,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=args.n_step_return,
        normalize_by_max="memory",
    )

    agent = agents.CategoricalDoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=2000,
        update_interval=update_interval,
        batch_accumulator="mean",
        phi=phi,
        max_grad_norm=10,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_episodes,
        )
        print(
            "n_episodes: {} mean: {} median: {} stdev {}".format(
                eval_stats["episodes"],
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_episodes,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--eval-epsilon", type=float, default=0.0)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.5)
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=2 * 10**4)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--n-best-episodes", type=int, default=200)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Use the same hyperparameters as https://arxiv.org/abs/1710.02298
    opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        10**6,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max="memory",
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator="mean",
        phi=phi,
    )

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model(
                    "Rainbow", args.env, model_type=args.pretrained_type
                )[0]
            )

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
        )
        print(
            "n_episodes: {} mean: {} median: {} stdev {}".format(
                eval_stats["episodes"],
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None,
        )
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
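# The `betasteps` passed to `PrioritizedReplayBuffer` above controls how the
# importance-sampling exponent is annealed from `beta0` to 1 over training,
# per the "Anneal beta" comment. A hypothetical illustration of that linear
# schedule (not code from the script):
beta0 = 0.4
betasteps = 5 * 10**7 / 4  # args.steps / update_interval with the defaults


def beta_at(n_updates):
    """Linearly annealed beta after n_updates gradient updates."""
    return min(1.0, beta0 + (1.0 - beta0) * n_updates / betasteps)


assert beta_at(0) == 0.4
assert beta_at(betasteps) == 1.0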
def main():
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--env", type=str, default="Pendulum-v0")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--final-exploration-steps", type=int, default=10**4)
    parser.add_argument("--start-epsilon", type=float, default=1.0)
    parser.add_argument("--end-epsilon", type=float, default=0.1)
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--steps", type=int, default=10**5)
    parser.add_argument("--prioritized-replay", action="store_true")
    parser.add_argument("--replay-start-size", type=int, default=1000)
    parser.add_argument("--target-update-interval", type=int, default=10**2)
    parser.add_argument("--target-update-method", type=str, default="hard")
    parser.add_argument("--soft-update-tau", type=float, default=1e-2)
    parser.add_argument("--update-interval", type=int, default=1)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--eval-interval", type=int, default=10**4)
    parser.add_argument("--n-hidden-channels", type=int, default=100)
    parser.add_argument("--n-hidden-layers", type=int, default=2)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--minibatch-size", type=int, default=None)
    parser.add_argument("--render-train", action="store_true")
    parser.add_argument("--render-eval", action="store_true")
    parser.add_argument("--monitor", action="store_true")
    parser.add_argument("--reward-scale-factor", type=float, default=1e-3)
    parser.add_argument(
        "--actor-learner",
        action="store_true",
        help="Enable asynchronous sampling with asynchronous actor(s)",
    )  # NOQA
    parser.add_argument(
        "--num-envs",
        type=int,
        default=1,
        help=(
            "The number of environments for sampling (only effective with"
            " --actor-learner enabled)"
        ),
    )  # NOQA
    args = parser.parse_args()

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(idx=0, test=False):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        utils.set_random_seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            utils.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space,
        )
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
        )
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample,
        )

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    opt = optim.Adam(q_func.parameters())

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) // args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps
        )
    else:
        rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    elif not args.actor_learner:
        print(
            "WARNING: Since https://github.com/pfnet/pfrl/pull/112 we have started"
            " setting `eval_during_episode=True` in this script, which affects the"
            " timings of evaluation phases."
        )
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
            eval_during_episode=True,
        )
    else:
        # Use the actor-learner mode with the given number of envs.
        # When we use multiple envs, it is critical to ensure each env
        # can occupy a CPU core to get the best performance.
        # Therefore, we need to prevent potential CPU over-provision caused by
        # multi-threading in OpenMP and NumPy.
        # Disable multi-threading in OpenMP and NumPy.
        os.environ["OMP_NUM_THREADS"] = "1"  # NOQA

        (
            make_actor,
            learner,
            poller,
            exception_event,
        ) = agent.setup_actor_learner_training(args.num_envs)

        poller.start()
        learner.start()

        experiments.train_agent_async(
            processes=args.num_envs,
            make_agent=make_actor,
            make_env=make_env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            stop_event=learner.stop_event,
            exception_event=exception_event,
        )

        poller.stop()
        learner.stop()
        poller.join()
        learner.join()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--final-exploration-frames",
        type=int,
        default=10**6,
        help="Timesteps after which we stop annealing exploration rate",
    )
    parser.add_argument(
        "--final-epsilon",
        type=float,
        default=0.01,
        help="Final value of epsilon during training.",
    )
    parser.add_argument(
        "--eval-epsilon",
        type=float,
        default=0.001,
        help="Exploration epsilon used during eval episodes.",
    )
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument(
        "--arch",
        type=str,
        default="doubledqn",
        choices=["nature", "nips", "dueling", "doubledqn"],
        help="Network architecture to use.",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before performing gradient updates.",
    )
    parser.add_argument(
        "--target-update-interval",
        type=int,
        default=3 * 10**4,
        help="Frequency (in timesteps) at which the target network is updated.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=10**5,
        help="Frequency (in timesteps) of evaluation phase.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=4,
        help="Frequency (in timesteps) of network updates.",
    )
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false")
    parser.add_argument("--num-step-return", type=int, default=1)
    parser.set_defaults(clip_delta=True)
    parser.add_argument(
        "--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate.")
    parser.add_argument(
        "--prioritized",
        action="store_true",
        default=False,
        help="Use prioritized experience replay.",
    )
    parser.add_argument(
        "--checkpoint-frequency",
        type=int,
        default=None,
        help="Frequency at which agents are stored.",
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0,
            args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions),
        )

    # Use the Nature paper's hyperparameters
    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=args.lr,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            10**6,
            alpha=0.6,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.num_step_return,
        )
    else:
        rbuf = replay_buffers.ReplayBuffer(10**6, args.num_step_return)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--final-exploration-frames", type=int, default=10**6)
    parser.add_argument("--final-epsilon", type=float, default=0.01)
    parser.add_argument("--eval-epsilon", type=float, default=0.001)
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument(
        "--arch",
        type=str,
        default="doubledqn",
        choices=["nature", "nips", "dueling", "doubledqn"],
    )
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=5 * 10**4)
    parser.add_argument("--target-update-interval", type=int, default=3 * 10**4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--update-interval", type=int, default=4)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false")
    parser.set_defaults(clip_delta=True)
    parser.add_argument(
        "--agent", type=str, default="DoubleDQN", choices=["DQN", "DoubleDQN", "PAL"]
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate")
    parser.add_argument(
        "--prioritized",
        action="store_true",
        default=False,
        help="Use prioritized experience replay.",
    )
    parser.add_argument("--num-envs", type=int, default=1)
    parser.add_argument("--n-step-return", type=int, default=1)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            frame_stack=False,
        )
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        env.seed(env_seed)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        vec_env = pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test) for idx in range(args.num_envs)]
        )
        vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0,
            args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions),
        )

    # Use the same hyperparameters as the Nature paper
    opt = optim.RMSprop(
        q_func.parameters(),
        lr=args.lr,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            10**6,
            alpha=0.6,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.n_step_return,
        )
    else:
        rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
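# `parse_arch` and `parse_agent` above are helpers defined elsewhere in the
# example file. A plausible minimal sketch of `parse_agent`, assuming only the
# three agent classes exposed by `pfrl.agents` (an illustration, not the
# file's exact definition):
from pfrl import agents


def parse_agent(agent_name):
    # Map the --agent choice to the corresponding PFRL agent class.
    return {
        "DQN": agents.DQN,
        "DoubleDQN": agents.DoubleDQN,
        "PAL": agents.PAL,
    }[agent_name]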
from DQN_model import Net, RNDNet, Embedding_fn, Embedding_full
from IR_modules import NGU_module, RND_module
from utils import RandomSelectionEpsilonGreedy, wrap_env

env = wrap_env(params.env_name, max_frames=params.max_frames, clip_rewards=True)

if params.rainbow:
    q_func = DistributionalDuelingDQN(
        env.action_space.n, params.n_atoms, params.v_min, params.v_max
    )
else:
    q_func = Net(actions=env.action_space.n, dueling=params.dueling)

if params.noisynet:
    pnn.to_factorized_noisy(q_func, sigma_scale=params.noisy_net_sigma)
    explorer = explorers.Greedy()
elif params.explorer_method == 0:
    explorer = explorers.LinearDecayEpsilonGreedy(
        params.epsilon_max,
        params.epsilon_min,
        params.epsilon_steps,
        lambda: np.random.randint(env.action_space.n),
    )
else:
    explorer = RandomSelectionEpsilonGreedy(
        params.epsilon_min,
        params.epsilon_max,
        params.epsilon_num,
        params.epsilon_interval,
        lambda: np.random.randint(env.action_space.n),
    )

optimizer = torch.optim.Adam(
    q_func.parameters(), lr=params.lr, eps=1e-08
)  # eps=1.5*10**-4)

rbuf = replay_buffers.PrioritizedReplayBuffer(
    params.per_size,
    alpha=params.per_alpha,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--eval-epsilon", type=float, default=0.0)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.2)
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=2 * 10**3)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--n-best-episodes", type=int, default=200)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    test_ID = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
    args.outdir = experiments.prepare_output_dir(args, args.outdir, test_ID)
    print("Output files are saved in {}".format(args.outdir))

    env = MapRootEnv()
    eval_env = MapRootEnv()

    n_actions = env.action_space.n
    input_shape = env.input_shape

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = MyDistributionalDuelingDQN(
        n_actions, n_atoms, v_min, v_max, input_shape[2]
    )

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Use the same hyperparameters as https://arxiv.org/abs/1710.02298
    opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        5 * 10**4,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max="batch",
    )

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.80,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator="mean",
    )

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=args.eval_n_steps, n_episodes=None
        )
        print(
            "n_episodes: {} mean: {} median: {} stdev {}".format(
                eval_stats["episodes"],
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
            logger=TBLogger(args.outdir),
        )

        # dir_of_best_network = os.path.join(args.outdir, "best")
        # agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None,
        )
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
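# `MapRootEnv` and `MyDistributionalDuelingDQN` above are project-specific and
# not shown. The script only relies on the env exposing a discrete
# `action_space` and an `input_shape` attribute whose third entry is the
# channel count fed to the Q-network. A hypothetical minimal skeleton
# satisfying that interface (all shapes and dynamics are placeholders):
import gym
import gym.spaces
import numpy as np


class MapRootEnv(gym.Env):
    def __init__(self):
        self.input_shape = (84, 84, 4)  # assumed (H, W, C); C feeds the Q-net
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=self.input_shape, dtype=np.float32
        )

    def reset(self):
        return np.zeros(self.input_shape, dtype=np.float32)

    def step(self, action):
        obs = np.zeros(self.input_shape, dtype=np.float32)
        reward, done, info = 0.0, True, {}
        return obs, reward, done, info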