def _test_load_rainbow(self, gpu):
    q_func = DistributionalDuelingDQN(4, 51, -10, 10)
    links.to_factorized_noisy(q_func, sigma_scale=0.5)
    explorer = explorers.Greedy()
    opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10 ** -4)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(100)
    agent = agents.CategoricalDoubleDQN(
        q_func, opt, rbuf, gpu=gpu, gamma=0.99,
        explorer=explorer, minibatch_size=32,
        replay_start_size=50,
        target_update_interval=32000,
        update_interval=4,
        batch_accumulator='mean',
        phi=lambda x: x,
    )

    model, exists = download_model("Rainbow", "BreakoutNoFrameskip-v4",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_iqn(self, gpu):
    q_func = agents.iqn.ImplicitQuantileQFunction(
        psi=links.Sequence(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            functools.partial(F.reshape, shape=(-1, 3136)),
        ),
        phi=links.Sequence(
            agents.iqn.CosineBasisLinear(64, 3136),
            F.relu,
        ),
        f=links.Sequence(
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, 4),
        ),
    )
    opt = chainer.optimizers.Adam(5e-5, eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(100)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10 ** 6,
        random_action_func=lambda: np.random.randint(4))
    agent = agents.IQN(
        q_func, opt, rbuf, gpu=gpu, gamma=0.99,
        explorer=explorer, replay_start_size=50,
        target_update_interval=10 ** 4,
        update_interval=4,
        batch_accumulator='mean',
        phi=lambda x: x,
        quantile_thresholds_N=64,
        quantile_thresholds_N_prime=64,
        quantile_thresholds_K=32,
    )

    model, exists = download_model("IQN", "BreakoutNoFrameskip-v4",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_a3c(self, gpu):
    model = A3CFF(4)
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    agent = agents.A3C(model, opt, t_max=5, gamma=0.99,
                       beta=1e-2, phi=lambda x: x)

    model, exists = download_model("A3C", "BreakoutNoFrameskip-v4",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_ppo(self, gpu):
    winit = chainerrl.initializers.Orthogonal(1.)
    winit_last = chainerrl.initializers.Orthogonal(1e-2)
    action_size = 3
    policy = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )
    vf = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 1, initialW=winit),
    )
    model = links.Branched(policy, vf)
    opt = chainer.optimizers.Adam(3e-4, eps=1e-5)
    opt.setup(model)
    agent = agents.PPO(
        model, opt,
        obs_normalizer=None,
        gpu=gpu,
        update_interval=2048,
        minibatch_size=64,
        epochs=10,
        clip_eps_vf=None,
        entropy_coef=0,
        standardize_advantages=True,
        gamma=0.995,
        lambd=0.97,
    )

    model, exists = download_model("PPO", "Hopper-v2",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def test_load_trpo(self):
    winit = chainerrl.initializers.Orthogonal(1.)
    winit_last = chainerrl.initializers.Orthogonal(1e-2)
    action_size = 3
    policy = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )
    vf = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 1, initialW=winit),
    )
    vf_opt = chainer.optimizers.Adam()
    vf_opt.setup(vf)
    agent = agents.TRPO(
        policy=policy,
        vf=vf,
        vf_optimizer=vf_opt,
        update_interval=5000,
        max_kl=0.01,
        conjugate_gradient_max_iter=20,
        conjugate_gradient_damping=1e-1,
        gamma=0.995,
        lambd=0.97,
        vf_epochs=5,
        entropy_coef=0,
    )

    model, exists = download_model("TRPO", "Hopper-v2",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_dqn(self, gpu):
    q_func = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, 4),
        DiscreteActionValue)
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(100)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10 ** 6,
        random_action_func=lambda: np.random.randint(4))
    agent = agents.DQN(
        q_func, opt, rbuf, gpu=gpu, gamma=0.99,
        explorer=explorer, replay_start_size=50,
        target_update_interval=10 ** 4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=lambda x: x)

    model, exists = download_model("DQN", "BreakoutNoFrameskip-v4",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
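# The pretrained-model loading tests in this section are excerpted without
# their module header. Below is a minimal sketch of the imports they rely
# on, assuming the standard ChainerRL package layout (exact module paths
# may differ; A3CFF mirrors the model built in the A3C example script
# further down):
import functools
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers

import chainerrl
from chainerrl import agents, explorers, links, policies, replay_buffer
from chainerrl.action_value import DiscreteActionValue
from chainerrl.misc import download_model
from chainerrl.optimizers import rmsprop_async
from chainerrl.q_functions import DistributionalDuelingDQN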
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=10000,
                        help='Minimum replay buffer size before'
                             ' performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=100,
                        help='Minibatch size')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--pretrained-type', type=str, default="best",
                        choices=['best', 'final'])
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)

    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam().setup(policy)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
        name='action')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [q_func1(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph(
        [q_func2(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(
            action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = chainerrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0 or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(misc.download_model(
                "TD3", args.env,
                model_type=args.pretrained_type)[0])

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
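# NOTE: concat_obs_and_action is referenced by make_q_func_with_optimizer
# in the TD3 and SAC scripts but its definition falls outside these
# excerpts. A minimal sketch, assuming it matches the helper defined
# locally in the TD3/DDPG/SAC test methods:
def concat_obs_and_action(obs, action):
    """Concat observation and action to feed the critic."""
    return F.concat((obs, action), axis=-1)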
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--pretrained-type', type=str, default="best",
                        choices=['best', 'final'])
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10 ** 6)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--eval-epsilon', type=float, default=0.001)
    parser.add_argument('--steps', type=int, default=5 * 10 ** 7)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes with 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10 ** 4)
    parser.add_argument('--target-update-interval', type=int, default=10 ** 4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--batch-accumulator', type=str, default='mean',
                        choices=['mean', 'sum'])
    parser.add_argument('--quantile-thresholds-N', type=int, default=64)
    parser.add_argument('--quantile-thresholds-N-prime', type=int, default=64)
    parser.add_argument('--quantile-thresholds-K', type=int, default=32)
    parser.add_argument('--n-best-episodes', type=int, default=200)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            functools.partial(F.reshape, shape=(-1, 3136)),
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
            F.relu,
        ),
        f=chainerrl.links.Sequence(
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
        ),
    )

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros((4, 84, 84), dtype=np.float32)[None]
    fake_taus = np.zeros(32, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(fake_obss)(fake_taus)],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1710.10044
    opt = chainer.optimizers.Adam(5e-5, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.IQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator=args.batch_accumulator,
        phi=phi,
        quantile_thresholds_N=args.quantile_thresholds_N,
        quantile_thresholds_N_prime=args.quantile_thresholds_N_prime,
        quantile_thresholds_K=args.quantile_thresholds_K,
    )

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(misc.download_model(
                "IQN", args.env,
                model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None,
        )
        print('n_steps: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames // 4,  # frames -> agent steps (4-frame skip)
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
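# Sketch of the two-stage call convention the IQN q_func uses above (as in
# the draw_computational_graph call): q_func(obs) returns a callable that
# maps sampled quantile thresholds to action values. Shapes are assumptions
# based on the fake inputs above; greedy_actions is the standard ChainerRL
# ActionValue accessor.
#
#     obs = np.zeros((1, 4, 84, 84), dtype=np.float32)   # batch of stacked frames
#     taus = np.random.uniform(0, 1, size=(1, 32)).astype(np.float32)
#     action_value = q_func(obs)(taus)   # per-action values aggregated over taus
#     best = action_value.greedy_actions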
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--pretrained-type', type=str, default="best",
                        choices=['best', 'final'])
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--steps', type=int, default=5 * 10 ** 7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10 ** 4,
                        help='Minimum replay buffer size before'
                             ' performing gradient updates.')
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--n-best-episodes', type=int, default=30)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=None),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, n_actions),
        DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10 ** 6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=10 ** 4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=phi)

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(misc.download_model(
                "DQN", args.env,
                model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        # (5 min * 60 fps / 4-frame skip = 4500 agent steps)
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str,
                        default='SlimeVolleySurvivalNoFrameskip-v0')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--pretrained-type', type=str, default="best",
                        choices=['best', 'final'])
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--eval-epsilon', type=float, default=0.0)
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)
    parser.add_argument('--steps', type=int, default=200000000)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes with 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=2 * 10 ** 4)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--n-best-episodes', type=int, default=200)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            custom_make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=False)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)

    # Noisy nets
    links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10 ** -4)
    opt.setup(q_func)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6, alpha=0.5, beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max='memory',
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(misc.download_model(
                "Rainbow", args.env,
                model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames // 4,  # frames -> agent steps (4-frame skip)
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
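# Worked example for the beta schedule above, assuming the defaults: with
# --steps 200000000 and update_interval = 4, betasteps = 5 * 10 ** 7, so the
# prioritized-replay importance-sampling exponent anneals linearly from
# beta0 = 0.4 to 1 over fifty million updates.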
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--num-envs', type=int, default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=2 * 10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-interval', type=int, default=100000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--eval-n-runs', type=int, default=100,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval', type=int, default=1000,
                        help='Interval in timesteps between outputting log'
                             ' messages during training')
    parser.add_argument('--update-interval', type=int, default=2048,
                        help='Interval in timesteps between model updates.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs to update model for per PPO'
                             ' iteration.')
    parser.add_argument('--batch-size', type=int, default=64,
                        help='Minibatch size')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    assert isinstance(action_space, gym.spaces.Box)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # While the original paper initialized weights by normal distribution,
    # we use orthogonal initialization as the latest openai/baselines does.
    winit = chainerrl.initializers.Orthogonal(1.)
    winit_last = chainerrl.initializers.Orthogonal(1e-2)

    action_size = action_space.low.size
    policy = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 1, initialW=winit),
    )

    # Combine a policy and a value function into a single model
    model = chainerrl.links.Branched(policy, vf)

    opt = chainer.optimizers.Adam(3e-4, eps=1e-5)
    opt.setup(model)

    agent = PPO(
        model, opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batch_size,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=0,
        standardize_advantages=True,
        gamma=0.995,
        lambd=0.97,
    )

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                misc.download_model("PPO", args.env, model_type="final")[0])

    if args.demo:
        env = make_batch_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
            save_best_so_far_agent=False,
        )
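# chainerrl.links.Branched (used above to pair the policy with the value
# function) feeds the same input to every child link and returns a tuple of
# their outputs, so one forward pass yields (action_distribution, value).
# A rough, hypothetical equivalent for illustration only:
class _BranchedSketch(chainer.ChainList):

    def __call__(self, *args, **kwargs):
        # Apply each child link to the same arguments and collect results.
        return tuple(link(*args, **kwargs) for link in self)

# Usage mirrors the example above: _BranchedSketch(policy, vf)(obs) would
# return (policy(obs), vf(obs)).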
def _test_load_td3(self, gpu):

    def concat_obs_and_action(obs, action):
        """Concat observation and action to feed the critic."""
        return F.concat((obs, action), axis=-1)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    winit = chainer.initializers.LeCunUniform(3 ** -0.5)
    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    action_size = 3
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam().setup(policy)

    rbuf = replay_buffer.ReplayBuffer(100)
    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1., -1., -1.], high=[1., 1., 1.])

    agent = agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=10000,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None)

    model, exists = download_model("TD3", "Hopper-v2",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def _test_load_ddpg(self, gpu):

    def concat_obs_and_action(obs, action):
        return F.concat((obs, action), axis=-1)

    action_size = 3
    winit = chainer.initializers.LeCunUniform(3 ** -0.5)
    q_func = chainer.Sequential(
        concat_obs_and_action,
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, 1, initialW=winit),
    )
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    from chainerrl.agents.ddpg import DDPGModel
    model = DDPGModel(q_func=q_func, policy=policy)

    obs_low = [-np.inf] * 11
    fake_obs = chainer.Variable(
        model.xp.zeros_like(obs_low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        model.xp.zeros_like([-1., -1., -1.], dtype=np.float32)[None],
        name='action')
    policy(fake_obs)
    q_func(fake_obs, fake_action)

    opt_a = optimizers.Adam()
    opt_c = optimizers.Adam()
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1., -1., -1.], high=[1., 1., 1.])

    agent = agents.DDPG(
        model,
        opt_a,
        opt_c,
        replay_buffer.ReplayBuffer(100),
        gamma=0.99,
        explorer=explorer,
        replay_start_size=1000,
        target_update_method='soft',
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None)

    model, exists = download_model("DDPG", "Hopper-v2",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes with 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    # Use args.lr so the --lr flag takes effect; the decay hook below
    # anneals from the same value.
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)

    if args.load or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                misc.download_model("A3C", args.env, model_type="final")[0])

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_steps: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def _test_load_sac(self, gpu):
    winit = chainer.initializers.GlorotUniform()
    winit_policy_output = chainer.initializers.GlorotUniform(1.0)

    def concat_obs_and_action(obs, action):
        """Concat observation and action to feed the critic."""
        return F.concat((obs, action), axis=-1)

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == 3 * 2
        mean, log_scale = F.split_axis(x, 2, axis=1)
        log_scale = F.clip(log_scale, -20., 2.)
        var = F.exp(log_scale * 2)
        return chainerrl.distribution.SquashedGaussianDistribution(
            mean, var=var)

    policy = chainer.Sequential(
        L.Linear(None, 256, initialW=winit),
        F.relu,
        L.Linear(None, 256, initialW=winit),
        F.relu,
        L.Linear(None, 3 * 2, initialW=winit_policy_output),
        squashed_diagonal_gaussian_head,
    )
    policy_optimizer = optimizers.Adam(3e-4).setup(policy)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 256, initialW=winit),
            F.relu,
            L.Linear(None, 256, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam(3e-4).setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    agent = agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        replay_buffer.ReplayBuffer(100),
        gamma=0.99,
        replay_start_size=1000,
        gpu=gpu,
        minibatch_size=256,
        burnin_action_func=None,
        entropy_target=-3,
        temperature_optimizer=optimizers.Adam(3e-4),
    )

    model, exists = download_model("SAC", "Hopper-v2",
                                   model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def main(argv):
    env = FLAGS.env + "NoFrameskip-v4"
    try:
        misc.download_model(FLAGS.alg, env, model_type="final")[0]
    except HTTPError:
        print("ERROR: Could not download %s for %s" % (FLAGS.alg, env))
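# main(argv) above follows the absl style (FLAGS.env, FLAGS.alg) but the
# excerpt omits its scaffolding. A minimal sketch, with flag names taken
# from the attribute accesses above and illustrative defaults:
from urllib.error import HTTPError

from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('alg', 'DQN', 'Algorithm whose pretrained model to check.')
flags.DEFINE_string('env', 'Breakout', 'Atari env name, without "NoFrameskip-v4".')

if __name__ == '__main__':
    app.run(main)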
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--num-envs', type=int, default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=10000,
                        help='Minimum replay buffer size before'
                             ' performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=256,
                        help='Minibatch size')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--pretrained-type', type=str, default="best",
                        choices=['best', 'final'])
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval', type=int, default=1000,
                        help='Interval in timesteps between outputting log'
                             ' messages during training')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--policy-output-scale', type=float, default=1.,
                        help='Weight initialization scale of policy output.')
    parser.add_argument('--debug', action='store_true',
                        help='Debug mode.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.debug:
        chainer.set_debug(True)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        # Normalize action space to [-1, 1]^n
        env = chainerrl.wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.GlorotUniform()
    winit_policy_output = chainer.initializers.GlorotUniform(
        args.policy_output_scale)

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = F.split_axis(x, 2, axis=1)
        log_scale = F.clip(log_scale, -20., 2.)
        var = F.exp(log_scale * 2)
        return chainerrl.distribution.SquashedGaussianDistribution(
            mean, var=var)

    policy = chainer.Sequential(
        L.Linear(None, 256, initialW=winit),
        F.relu,
        L.Linear(None, 256, initialW=winit),
        F.relu,
        L.Linear(None, action_size * 2, initialW=winit_policy_output),
        squashed_diagonal_gaussian_head,
    )
    policy_optimizer = optimizers.Adam(3e-4).setup(policy)

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 256, initialW=winit),
            F.relu,
            L.Linear(None, 256, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam(3e-4).setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    fake_action = chainer.Variable(
        policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
        name='action')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [q_func1(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph(
        [q_func2(fake_obs, fake_action)],
        os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(
            action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in https://arxiv.org/abs/1812.05905 (SAC with
    # automatically tuned temperature)
    agent = chainerrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer=chainer.optimizers.Adam(3e-4),
    )

    if len(args.load) > 0 or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                misc.download_model("SAC", args.env,
                                    model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
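# Like the other example scripts in this section, the SAC script above is
# presumably run as a standalone program; the usual entry-point guard,
# which the excerpts omit, would be:
if __name__ == '__main__':
    main()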