def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--arch', type=str, default='Gaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'Gaussian'))
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor')
    parser.add_argument('--use-gae', action='store_true', default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau', type=float, default=0.95,
                        help='gae parameter')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='maximum gradient norm used for clipping')
    parser.add_argument('--alpha', type=float, default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'Gaussian':
        model = A2CGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A2CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A2CFFMellowmax(obs_space.low.size, action_space.n)

    optimizer = chainer.optimizers.RMSprop(args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(model, optimizer, gamma=args.gamma,
                    gpu=args.gpu,
                    num_processes=args.num_envs,
                    update_steps=args.update_steps,
                    use_gae=args.use_gae,
                    tau=args.tau)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            log_interval=args.log_interval,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
        )
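# Hedged usage sketch for the Gym A2C script above. The module-level imports
# and the __main__ guard are not part of this excerpt, and the script name is
# assumed; a typical invocation with illustrative flag values might be:
#
#   python train_a2c_gym.py --env Pendulum-v0 --arch Gaussian \
#       --num-envs 4 --gpu -1 --outdir results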
def main(args):
    import logging

    logging.basicConfig(level=logging.INFO, filename='log')

    if type(args) is list:
        args = make_args(args)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'Gaussian':
        model = A2CGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A2CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A2CFFMellowmax(obs_space.low.size, action_space.n)

    optimizer = chainer.optimizers.RMSprop(args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(model, optimizer, gamma=args.gamma,
                    gpu=args.gpu,
                    num_processes=args.num_envs,
                    update_steps=args.update_steps,
                    use_gae=args.use_gae,
                    tau=args.tau)
    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            log_interval=args.log_interval,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            log_type=args.log_type)
    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt

        # Roll out a few test episodes and record rendered frames
        frames = []
        env = make_env(process_idx=0, test=True)
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        # Turn the recorded frames into an animation and save it as a video
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0), dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate,
                                       frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
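# Hedged usage sketch for the main(args) variant above: it accepts either an
# already-parsed namespace or a list of CLI-style strings that make_args()
# (defined elsewhere in this repository, not shown here) is assumed to parse.
# The flag names below are inferred from the attributes accessed in main()
# and are illustrative only:
#
#   main(['--env', 'Pendulum-v0', '--arch', 'Gaussian',
#         '--mode', 'train', '--outdir', 'results'])
#   anim = main(['--env', 'Pendulum-v0', '--mode', 'check',
#                '--load-agent', 'results/agent', '--save-mp4', 'out.mp4'])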
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results')
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor')
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--use-gae', action='store_true', default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau', type=float, default=0.95,
                        help='gae parameter')
    parser.add_argument('--alpha', type=float, default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--max-grad-norm', type=float, default=40,
                        help='maximum gradient norm used for clipping')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)
    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
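# phi is passed to a2c.A2C by the Atari mains in this section but is not
# defined in this excerpt. Below is a minimal sketch of a typical feature
# extractor for DeepMind-style Atari wrappers; whether the original module
# defines it exactly this way is an assumption.
def phi(x):
    # Scale uint8 frames to float32 in [0, 1] for the network input
    return np.asarray(x, dtype=np.float32) / 255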
def main(args):
    import logging

    logging.basicConfig(level=logging.INFO, filename='log')

    if type(args) is list:
        args = make_args(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Single env used for recording videos in the 'check' and 'growth' modes
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)
    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )
    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
            log_type=args.log_type,
        )
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(),
                                      agent=agent,
                                      save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(),
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
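# Hedged usage sketch for the modified Atari main(args): the 'train', 'check'
# and 'growth' modes dispatch to batch training and to the tools.make_video
# helpers. Flag names are inferred from the attributes accessed above and
# from make_args(), neither of which is shown here, so treat them as
# assumptions rather than a confirmed CLI:
#
#   main(['--env', 'BreakoutNoFrameskip-v4', '--mode', 'train',
#         '--outdir', 'results', '--num-envs', '4'])
#   main(['--env', 'BreakoutNoFrameskip-v4', '--mode', 'check',
#         '--load-agent', 'results/agent', '--save-mp4', 'breakout.mp4'])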