def __init__(self, obs_space):
    self.seed = 0
    self.rmsprop_epsilon = 1e-5
    self.gamma = 0.99
    self.use_gae = False
    self.tau = 0.95
    self.num_envs = 8
    self.lr = 7e-4
    self.weight_decay = 0.0
    self.max_grad_norm = 0.5
    self.alpha = 0.99
    self.update_steps = 5

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(self.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(self.num_envs) + self.seed * self.num_envs
    assert process_seeds.max() < 2**32

    action_space = 1

    # Switch policy types according to action space types
    model = A2CGaussian(obs_space, action_space)

    optimizer = chainer.optimizers.RMSprop(self.lr,
                                           eps=self.rmsprop_epsilon,
                                           alpha=self.alpha)
    optimizer.setup(model)
    optimizer.add_hook(
        chainer.optimizer.GradientClipping(self.max_grad_norm))
    if self.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(self.weight_decay))

    self.agent = A2C(model, optimizer,
                     gamma=self.gamma,
                     num_processes=self.num_envs,
                     update_steps=self.update_steps,
                     use_gae=self.use_gae,
                     tau=self.tau)
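# Illustrative sketch (not part of the original script): the per-process
# seeding used above is easy to verify in isolation. `derive_process_seeds`
# is a hypothetical helper; it only restates the arithmetic described in the
# comment ("seed=0, processes=4 -> [0, 1, 2, 3]; seed=1 -> [4, 5, 6, 7]").
import numpy as np


def derive_process_seeds(base_seed, num_envs):
    seeds = np.arange(num_envs) + base_seed * num_envs
    # Keep every seed inside the 32-bit range commonly accepted by RNGs.
    assert seeds.max() < 2 ** 32
    return seeds


assert list(derive_process_seeds(0, 4)) == [0, 1, 2, 3]
assert list(derive_process_seeds(1, 4)) == [4, 5, 6, 7]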
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--arch', type=str, default='Gaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'Gaussian'))
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor')
    parser.add_argument('--use-gae', action='store_true', default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau', type=float, default=0.95,
                        help='gae parameter')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='maximum gradient norm for gradient clipping')
    parser.add_argument('--alpha', type=float, default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to action space types
    if args.arch == 'Gaussian':
        model = A2CGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A2CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A2CFFMellowmax(obs_space.low.size, action_space.n)

    optimizer = chainer.optimizers.RMSprop(args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(model, optimizer, gamma=args.gamma, gpu=args.gpu,
                    num_processes=args.num_envs,
                    update_steps=args.update_steps,
                    use_gae=args.use_gae, tau=args.tau)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            log_interval=args.log_interval,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
        )
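# Illustrative sketch (assumption, not ChainerRL code): make_reward_filtered
# above multiplies every reward by reward_scale_factor before the agent sees
# it. A plain gym.RewardWrapper expressing the same idea:
import gym


class ScaledRewardExample(gym.RewardWrapper):
    """Hypothetical stand-in for reward scaling: r -> r * scale."""

    def __init__(self, env, scale):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale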
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--replay-start-size', type=int, default=10000) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = acer.ACERSharedModel( shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) else: model = acer.ACERSharedModel( shared=links.NIPSDQNHead(), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( 
args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
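# Illustrative sketch (assumption): LinearInterpolationHook above calls
# lr_setter with a value interpolated between args.lr and 0 over args.steps.
# The schedule itself is just this:
def linearly_decayed_value(step, total_steps, start, stop):
    frac = min(float(step) / total_steps, 1.0)
    return start + frac * (stop - start)


# Halfway through an 8e7-step run with lr=7e-4, the hook sets lr to 3.5e-4.
assert abs(linearly_decayed_value(4 * 10**7, 8 * 10**7, 7e-4, 0) - 3.5e-4) < 1e-12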
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v1') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--normalize-obs', action='store_true') parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if args.reward_scale_factor and not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, normalize_obs=args.normalize_obs) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly 
decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
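# Illustrative sketch (textbook PPO, not ChainerRL internals): clip_eps, which
# the hook above decays from 0.2 to 0, bounds the policy probability ratio in
# the clipped surrogate objective.
import numpy as np


def clipped_surrogate(ratio, advantage, clip_eps):
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # PPO maximizes the more pessimistic (smaller) of the two terms.
    return np.minimum(unclipped, clipped)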
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--n-times-replay', type=int, default=8)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    n_hidden_channels = 200
    model = acer.ACERSeparateModel(
        pi=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels),
            F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            SoftmaxDistribution),
        q=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels),
            F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            DiscreteActionValue),
    )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    replay_buffer = EpisodicReplayBuffer(10**5 // args.processes)
    agent = acer.DiscreteACER(model, opt, t_max=args.t_max, gamma=0.99,
                              replay_buffer=replay_buffer,
                              n_times_replay=args.n_times_replay,
                              beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        mean, median, stdev = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=timestep_limit)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results')
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor')
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--use-gae', action='store_true', default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau', type=float, default=0.95,
                        help='gae parameter')
    parser.add_argument('--alpha', type=float, default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--max-grad-norm', type=float, default=40,
                        help='maximum gradient norm for gradient clipping')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)
    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
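# Illustrative sketch (assumption about A2C's batching): with num_envs
# parallel environments and update_steps rollout steps, each parameter update
# consumes num_envs * update_steps environment transitions.
def transitions_per_update(num_envs, update_steps):
    return num_envs * update_steps


assert transitions_per_update(8, 5) == 40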
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = A3CLSTM(n_actions) else: model = A3CFF(n_actions) # Draw the computational graph and save it in the output directory. 
fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph([model(fake_obs)], os.path.join( args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
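# Illustrative sketch: the fake observation used above for drawing the graph
# is a stack of 4 grayscale 84x84 frames (the DQN-style head's input), with a
# leading batch axis added by [None].
import numpy as np

fake_frames = np.zeros((4, 84, 84), dtype=np.float32)
assert fake_frames[None].shape == (1, 4, 84, 84)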
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='YAML config file')
    parser.add_argument('outdir', type=str,
                        help='directory to put training log')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger_level', type=int, default=logging.INFO)
    args = parser.parse_args()
    print_args(args)

    # init a logger
    logging.basicConfig(level=args.logger_level)

    # load yaml config file
    with open(args.config) as f:
        config = yaml.load(f)

    # set random seed
    misc.set_random_seed(config['seed'])

    # create directory to put the results
    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    # save config file to outdir
    with open(os.path.join(args.outdir, 'config.yaml'), 'w') as f:
        yaml.dump(config, f, indent=4, default_flow_style=False)

    # define func to create env, target data sampler, and models
    if config['problem'] == 'toy':
        assert config['imsize'] == 3, 'invalid imsize'
        assert config['in_channel'] == 1, 'invalid in_channel'

        def make_env(process_idx, test):
            env = ToyEnv(config['imsize'])
            return env

        gen = SpiralToyModel(config['imsize'], config['conditional'])
        dis = SpiralToyDiscriminator(config['imsize'], config['conditional'])

        if config['conditional']:
            train_patterns = [(1, 4, 7), (0, 1, 2), (3, 4, 5), (2, 5, 8)]
            test_patterns = [(6, 7, 8)]
        else:
            train_patterns = [(1, 4, 7)]
            test_patterns = train_patterns

        dataset = ToyDataset(config['imsize'], train_patterns, test_patterns)
    else:
        # my paint env
        def make_env(process_idx, test):
            env = MyPaintEnv(max_episode_steps=config['max_episode_steps'],
                             imsize=config['imsize'],
                             pos_resolution=config['pos_resolution'],
                             brush_info_file=config['brush_info_file'])
            return env

        # generator and discriminator
        gen = SpiralModel(config['imsize'], config['conditional'])
        dis = SpiralDiscriminator(config['imsize'], config['conditional'])

        if config['problem'] == 'mnist':
            single_label = config['mnist_target_label'] is not None
            dataset = MnistDataset(config['imsize'], single_label,
                                   config['mnist_target_label'],
                                   config['mnist_binarization'])
        elif config['problem'] == 'emnist':
            dataset = EMnistDataset(config['emnist_gz_images'],
                                    config['emnist_gz_labels'],
                                    config['emnist_single_label'])
        elif config['problem'] == 'jikei':
            dataset = JikeiDataset(config['jikei_npz'])
        elif config['problem'] == 'quickdraw':
            dataset = QuickdrawDataset(config['quickdraw_npz'])
        else:
            raise NotImplementedError()

    # initialize optimizers
    gen_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)
    dis_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5)
    gen_opt.setup(gen)
    dis_opt.setup(dis)
    gen_opt.add_hook(chainer.optimizer.GradientClipping(40))
    dis_opt.add_hook(chainer.optimizer.GradientClipping(40))
    if config['weight_decay'] > 0:
        gen_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))
        dis_opt.add_hook(NonbiasWeightDecay(config['weight_decay']))

    # init a SPIRAL agent
    agent = SPIRAL(
        generator=gen,
        discriminator=dis,
        gen_optimizer=gen_opt,
        dis_optimizer=dis_opt,
        dataset=dataset,
        conditional=config['conditional'],
        reward_mode=config['reward_mode'],
        imsize=config['imsize'],
        max_episode_steps=config['max_episode_steps'],
        rollout_n=config['rollout_n'],
        gamma=config['gamma'],
        beta=config['beta'],
        gp_lambda=config['gp_lambda'],
        lambda_R=config['lambda_R'],
        staying_penalty=config['staying_penalty'],
        empty_drawing_penalty=config['empty_drawing_penalty'],
        n_save_final_obs_interval=config['n_save_final_obs_interval'],
        outdir=args.outdir)

    # load from a snapshot
    if args.load:
        agent.load(args.load)

    # training mode
    max_episode_len = config['max_episode_steps'] * config['rollout_n']
    steps = config['processes'] * config['n_update'] * max_episode_len
    save_interval = config['processes'] * config[
        'n_save_interval'] * max_episode_len
    eval_interval = config['processes'] * config[
        'n_eval_interval'] * max_episode_len

    step_hook = SpiralStepHook(config['max_episode_steps'], save_interval,
                               args.outdir)

    if config['processes'] == 1:
        # a single process is easier to debug
        agent.process_idx = 0
        env = make_env(0, False)
        experiments.train_agent_with_evaluation(
            agent=agent,
            outdir=args.outdir,
            env=env,
            steps=steps,
            eval_n_runs=config['eval_n_runs'],
            eval_interval=eval_interval,
            max_episode_len=max_episode_len,
            step_hooks=[step_hook],
            save_best_so_far_agent=False)
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=config['processes'],
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=steps,
                                      eval_n_runs=config['eval_n_runs'],
                                      eval_interval=eval_interval,
                                      max_episode_len=max_episode_len,
                                      global_step_hooks=[step_hook],
                                      save_best_so_far_agent=False)
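# Illustrative sketch (assumption): a minimal YAML config for the 'toy'
# problem would need at least the keys this script reads; the values below
# are placeholders, not recommended settings.
example_toy_config = {
    'seed': 0,
    'problem': 'toy',
    'imsize': 3,            # asserted to be 3 for the toy problem
    'in_channel': 1,        # asserted to be 1 for the toy problem
    'conditional': False,
    'max_episode_steps': 3,
    'rollout_n': 1,
    'lr': 1e-4,
    'weight_decay': 0.0,
    'reward_mode': 'l2',    # placeholder; valid values depend on SPIRAL
    'gamma': 0.99,
    'beta': 1e-2,
    'gp_lambda': 10.0,
    'lambda_R': 1.0,
    'staying_penalty': 0.0,
    'empty_drawing_penalty': 0.0,
    'n_save_final_obs_interval': 100,
    'processes': 1,
    'n_update': 1000,
    'n_save_interval': 100,
    'n_eval_interval': 100,
    'eval_n_runs': 10,
}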
def __init__(self, args, sample_env):
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types according to action space types
    if args.arch == 'FFSoftmax':
        # model = A3CFFSoftmax(obs_space.low.size, action_space.n)
        model = A3CFFSoftmax(obs_space.low.size,
                             sample_env.env_prop.get_softmax_layer_size(),
                             n_hidden_channels=600,
                             beta=cfg.beta)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean,
                              n_hidden_channels=cfg.n_hidden_channels)
    elif args.arch == 'FFParamSoftmax':
        model = A3CFFParamSoftmax(
            obs_space.low.size,
            sample_env.env_prop.get_pre_output_layer_size(),
            sample_env.env_prop.get_parametric_segments(),
            sample_env.env_prop.get_parametric_softmax_segments_sizes(),
            n_hidden_channels=600,
            beta=cfg.beta)
    else:
        raise NotImplementedError

    opt = chainer.optimizers.Adam(alpha=args.adam_lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    # a workaround for saving obs_normalizer
    # see https://github.com/chainer/chainerrl/issues/376
    if 'obs_normalizer' not in PPO.saved_attributes:
        PPO.saved_attributes.append('obs_normalizer')

    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        phi=lambda x: x.astype(np.float32, copy=False),
        gamma=args.ppo_gamma,
        lambd=args.ppo_lambda,
        update_interval=args.ppo_update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    self._agent = agent
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian')) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--frame-buffer-length', type=int, default=4) parser.add_argument('--render-b2w', action='store_true', default=False) # Additional params parser.add_argument('--min_reward', type=float, default=sys.float_info.min) args = parser.parse_args() def f_trim_reward(x, min_reward=args.min_reward): if x < min_reward: x = 0 else: if x != 0: print("XXXXXXXXXXXXX ", x) return x logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: f_trim_reward(x) * args.reward_scale_factor) misc.env_modifiers.make_reward_clipped(env, -1, 1) if args.render and process_idx == 0 and not test: misc.env_modifiers.make_rendered(env) env = env_gym_chainer.GymEnvironment( env, res_width=X_SHAPE, res_height=Y_SHAPE, agent_history_length=args.frame_buffer_length, render=args.render_b2w) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types model = A3CFF(action_space.n) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) # Clipping by gradient norm (changed from 40 to 10) # opt.add_hook(chainer.optimizer.GradientClipping(10)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = SaliencyA3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi) if args.load: agent.load(args.load) if args.demo: with chainer.using_config("train", False): env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: 
experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) # Switch policy types accordingly to action space types if isinstance(action_space, gym.spaces.Discrete): n_actions = action_space.n policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ) elif isinstance(action_space, gym.spaces.Box): action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) else: print("""\ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA return vf = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, 
env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
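# Illustrative sketch: with var_func=lambda x: F.exp(2 * x) above, the
# Gaussian head's raw output is interpreted as the log standard deviation,
# so the variance is exp(2 * log_std) = std**2 and an output of 0 gives
# std = 1.
import numpy as np


def variance_from_log_std(log_std):
    return np.exp(2.0 * log_std)


assert np.isclose(variance_from_log_std(0.0), 1.0)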
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if(type(args) is list): args=make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 31 def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2 ** 31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_env_check(): # Use different random seeds for train and test envs env_seed = args.seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=True, clip_rewards=True) env.seed(int(env_seed)) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv( [functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs))]) sample_env = make_env(0, test=False) n_actions = sample_env.action_space.n model = A2CFF(n_actions) optimizer = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=args.alpha) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm)) if args.weight_decay > 0: optimizer.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a2c.A2C( model, optimizer, gamma=args.gamma, gpu=args.gpu, num_processes=args.num_envs, update_steps=args.update_steps, phi=phi, use_gae=args.use_gae, tau=args.tau, ) if args.load_agent: agent.load(args.load_agent) if (args.mode=='train'): experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), steps=args.steps, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_frequency, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, save_best_so_far_agent=False, log_interval=1000, log_type=args.log_type ) elif (args.mode=='check'): return tools.make_video.check(env=make_env_check(),agent=agent,save_mp4=args.save_mp4) elif (args.mode=='growth'): return tools.make_video.growth(env=make_env_check(),agent=agent,outdir=args.outdir,max_num=args.max_frames,save_mp4=args.save_mp4)
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='TTT-A3C-v0')
    parser.add_argument('--seed', type=int, default=17,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=5 * 10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax',))
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1 * 1e-4)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        # NOTE: uncomment the next line to start from a pretrained agent
        # env.set_agent(gym_ttt.pretrained_agent.get_pretrained_agent("./"))
        return env

    sample_env = gym.make(args.env)
    # number of steps after which an episode is ended
    # (whether the game is over or not)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Initialize the NN and the optimizer
    model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.8, beta=args.beta)
    if args.load:
        agent.load(args.load)

    # draw the policy and state value network
    # (a zero-filled 3x3 board with a leading batch axis)
    fake_board = np.array(
        [np.array([[0. for _ in range(3)] for _ in range(3)],
                  dtype=np.float32)])
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_board)[0]],
        os.path.join(args.outdir, 'model_pi'))
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_board)[1]],
        os.path.join(args.outdir, 'model_v'))

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if (type(args) is list): args = make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. # If you use more than one process (i.e. processes > 1), # the results will be no longer be deterministic # even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 n_actions = gym.make(args.env).action_space.n if args.use_lstm: model = A3CLSTM(n_actions) else: model = A3CFF(n_actions) # Draw the computational graph and save it in the output directory. fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph([model(fake_obs)], os.path.join( args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_env_check(): # Use different random seeds for train and test envs env_seed = args.seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=True, clip_rewards=True) env.seed(int(env_seed)) return env if args.load_agent: agent.load(args.load_agent) if (args.mode == 'train'): # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_frequency, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, log_type=args.log_type) elif (args.mode == 'check'): return tools.make_video.check(env=make_env_check(), agent=agent, save_mp4=args.save_mp4) elif (args.mode == 'growth'): return tools.make_video.growth(env=make_env_check(), agent=agent, outdir=args.outdir, max_num=args.max_frames, save_mp4=args.save_mp4)
# Define agents to be used
model = A3Cagent(obs_size, n_actions, args.nHidden)

# Draw the computational graph and save it in the output directory.
fake_obs = chainer.Variable(np.zeros(obs_size, dtype=np.float32)[None],
                            name='observation')
with chainerrl.recurrent.state_reset(model):
    # The state of the model is reset again after drawing the graph
    chainerrl.misc.draw_computational_graph(
        [model(fake_obs)], os.path.join(args.outdir, 'model'))

opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping))
if args.weight_decay > 0:
    opt.add_hook(NonbiasWeightDecay(args.weight_decay))

phi = lambda x: x.astype(np.float32, copy=False)
agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                beta=args.beta, phi=phi)

lr_decay_hook = experiments.LinearInterpolationHook(args.steps, args.lr, 0,
                                                    lr_setter)
training = experiments.train_agent_async(
    agent=agent,
    outdir=args.outdir,
    processes=args.threads,
    make_env=make_env,
    profile=False,
    steps=args.steps,
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--arch', type=str, default='FFGaussian',
                        choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian'))
    parser.add_argument('--bound-mean', action='store_true')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--standardize-advantages', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--window-size', type=int, default=100)
    parser.add_argument('--update-interval', type=int, default=2048)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--batchsize', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--entropy-coef', type=float, default=0.0)
    args = parser.parse_args()

    # logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # Bind idx via a default argument; a plain `lambda: make_env(idx, test)`
        # would be evaluated lazily and every worker would get the last index.
        return chainerrl.envs.MultiprocessVectorEnv([
            (lambda idx=idx: make_env(idx, test))
            for idx in range(args.num_envs)
        ])

    # Only for getting timesteps, and obs-action spaces
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Switch policy types according to action space types
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5)
    opt.setup(model)
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = PPO(
        model,
        opt,
        obs_normalizer=obs_normalizer,
        gpu=args.gpu,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps_vf=None,
        entropy_coef=args.entropy_coef,
        standardize_advantages=args.standardize_advantages,
    )
    if args.load:
        agent.load(args.load)

    # Linearly decay the learning rate to zero
    def lr_setter(env, agent, value):
        agent.optimizer.alpha = value

    lr_decay_hook = experiments.LinearInterpolationHook(
        args.steps, args.lr, 0, lr_setter)

    # Linearly decay the clipping parameter to zero
    def clip_eps_setter(env, agent, value):
        agent.clip_eps = value

    clip_eps_decay_hook = experiments.LinearInterpolationHook(
        args.steps, 0.2, 0, clip_eps_setter)

    experiments.train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(False),
        eval_env=make_batch_env(True),
        outdir=args.outdir,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        log_interval=args.log_interval,
        return_window_size=args.window_size,
        max_episode_len=timestep_limit,
        save_best_so_far_agent=False,
        step_hooks=[
            lr_decay_hook,
            clip_eps_decay_hook,
        ],
    )
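# Illustrative sketch: the reason make_batch_env above binds the index with a
# default argument (other scripts here use functools.partial) is that a plain
# closure is evaluated lazily, so every worker would see the final loop value.
import functools


def identity(idx):
    return idx


late_bound = [lambda: identity(idx) for idx in range(3)]
assert [f() for f in late_bound] == [2, 2, 2]      # all see the last idx

bound_early = [functools.partial(identity, idx) for idx in range(3)]
assert [f() for f in bound_early] == [0, 1, 2]     # each keeps its own idx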
def main(): parser = argparse.ArgumentParser() parser.add_argument('-e', '--enemy_step_width', default=2, type=int) parser.add_argument('-l', '--level', default=0, type=int, help='difficulty') parser.add_argument('-o', '--out_dir', default=None) parser.add_argument('-p', '--player_step_width', default=4, type=int) parser.add_argument('-r', '--random_seed', default=None, type=int) args = parser.parse_args() print(argv2line(sys.argv)) print() print_args(args) print() if args.out_dir is None: out_dir = 'results_' + dt.now().strftime('%Y%m%d%H%M%S') else: out_dir = args.out_dir if not os.path.exists(out_dir): os.makedirs(out_dir) assert os.path.isdir(out_dir) np.random.seed(args.random_seed) envs = [ DanmakuEnv(level=args.level, random_seed=rs) for rs in np.random.randint(np.iinfo(np.uint32).max, size=N_PROCESSES) ] obs_space = envs[0].observation_space action_space = envs[0].action_space model = Model(obs_space.shape[0], action_space.n) model( np.random.uniform( size=obs_space.shape).astype('float32')[np.newaxis, :, :, :]) opt = RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(GradientClipping(40)) opt.add_hook(NonbiasWeightDecay(1e-4)) agents = [ A3C(model, opt, t_max=10, gamma=0.99, beta=1e-2, process_idx=idx) for idx in range(N_PROCESSES) ] states = [env.reset() for env in envs] rewards = [0.] * N_PROCESSES episode_count = 0 step_count = 0 print('episode: {0:06d}'.format(episode_count + 1)) while step_count < N_STEPS: results = [ train_one_step(idx, env, agent, state, reward) for idx, env, agent, state, reward in zip(range(N_PROCESSES), envs, agents, states, rewards) ] states = [result[1] for result in results] rewards = [result[2] for result in results] step_count += 1 print(dt.now()) print('passed steps: {0:07d}'.format(step_count)) print('statistics: {}'.format(agents[0].get_statistics())) if envs[0].t == 0: episode_count += 1 print('episode: {0:06d}'.format(episode_count + 1)) if (step_count) % SAVE_INTERVAL == 0: save_agent_path = os.path.join( out_dir, 'agent_step_{0:07d}_'.format(step_count) + dt.now().strftime('%Y%m%d%H%M%S')) agents[0].save(save_agent_path) print('save ' + save_agent_path)
def main():
    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-frequency', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
            q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
        )
    # Use the learning rate given on the command line rather than a hard-coded value
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta, phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        mean, median, stdev = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_frequency=args.eval_frequency,
                                      max_episode_len=args.max_episode_len)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian')) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render and process_idx == 0 and not test: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'LSTMGaussian': model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size) elif args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
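The seeding scheme used throughout these scripts derives one integer seed per worker from the base seed and mirrors it into a test-time seed at the top of the seed range, so training and evaluation environments never share a stream. A small sketch of the arithmetic, assuming 4 processes and a base seed of 1:

import numpy as np

processes, seed = 4, 1
train_seeds = np.arange(processes) + seed * processes  # [4, 5, 6, 7]
test_seeds = 2**32 - 1 - train_seeds                   # [4294967291, 4294967290, 4294967289, 4294967288]
assert train_seeds.max() < 2**32
# The Atari examples later in this document cap the seeds at 2**31 instead,
# matching their '[0, 2 ** 31)' help strings.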
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if (type(args) is list): args = make_args(args) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 if not os.path.exists(args.outdir): os.makedirs(args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor and process_idx == 0: env = chainerrl.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render and process_idx == 0 and not test: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(process_idx=0, test=False) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'Gaussian': model = A2CGaussian(obs_space.low.size, action_space.low.size) elif args.arch == 'FFSoftmax': model = A2CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A2CFFMellowmax(obs_space.low.size, action_space.n) optimizer = chainer.optimizers.RMSprop(args.lr, eps=args.rmsprop_epsilon, alpha=args.alpha) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm)) if args.weight_decay > 0: optimizer.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a2c.A2C(model, optimizer, gamma=args.gamma, gpu=args.gpu, num_processes=args.num_envs, update_steps=args.update_steps, use_gae=args.use_gae, tau=args.tau) if args.load_agent: agent.load(args.load_agent) if (args.mode == 'train'): experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(test=False), eval_env=make_batch_env(test=True), steps=args.steps, log_interval=args.log_interval, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_freq, log_type=args.log_type) elif (args.mode == 'check'): from matplotlib import animation import matplotlib.pyplot as plt frames = [] env = make_env(process_idx=0, test=True) for i in range(3): obs = env.reset() done = False R = 0 t = 0 while not done and t < 200: frames.append(env.render(mode='rgb_array')) action = agent.act(obs) obs, r, done, _ = env.step(action) R += r t += 1 print('test episode:', i, 'R:', R) agent.stop_episode() env.close() from IPython.display import HTML plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72) patch = plt.imshow(frames[0]) plt.axis('off') def animate(i): 
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate,
                                   frames=len(frames), interval=50)
    anim.save(args.save_mp4)
    return anim
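If the 'check' branch above is run on a machine without a display, selecting a non-interactive matplotlib backend before pyplot is imported still lets the animation be written to disk; saving to mp4 additionally assumes an ffmpeg writer is available to matplotlib. A minimal sketch:

import matplotlib
matplotlib.use('Agg')  # non-interactive backend; must be selected before importing pyplot
import matplotlib.pyplot as plt
from matplotlib import animation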
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int, default=4)  # increase for more asynchronous workers
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files. If it does not exist, it will be created.')  # set directory to which output files will be written
    parser.add_argument('--env', type=str, default='1DIsing-A3C-v0')  # specify environment to explore
    parser.add_argument('--steps', type=int, default=1 * 10 ** 7)  # maximum number of steps before training ends
    parser.add_argument('--eval-interval', type=int, default=10**4)  # frequency at which the agent will be evaluated
    parser.add_argument('--eval-n-runs', type=int, default=10)  # number of evaluation runs per evaluation
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax',))  # NN to use for policy and state value estimates
    parser.add_argument('--t-max', type=int, default=5)  # increase for later truncation of the sum
    parser.add_argument('--beta', type=float, default=1e-2)  # increase for more exploration
    parser.add_argument('--gamma', type=float, default=0.99)  # increase for less discount of future rewards
    parser.add_argument('--lr', type=float, default=1 * 1e-4)  # decrease for slower learning rate
    parser.add_argument('--weight-decay', type=float, default=0)  # turn on to get weight decay
    parser.add_argument('--seed', type=int, default=17,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)  # set to logging.DEBUG for (much more) information
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2 ** 32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor and process_idx == 0: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render and process_idx == 0 and not test: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.max_episode_steps obs_space = sample_env.observation_space action_space = sample_env.action_space model = A3CFFSoftmax(obs_space.low.size, action_space.n) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
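This script reads the episode limit from sample_env.spec.max_episode_steps, while the older scripts above read spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'); which attribute exists depends on the gym version. A small helper (the name is hypothetical) that tolerates either layout:

def get_timestep_limit(env):
    # Newer gym versions expose the limit directly on the spec ...
    limit = getattr(env.spec, 'max_episode_steps', None)
    if limit is None and getattr(env.spec, 'tags', None):
        # ... older versions kept it in the spec's tags dictionary
        limit = env.spec.tags.get(
            'wrapper_config.TimeLimit.max_episode_steps')
    return limit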
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if (type(args) is list): args = make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 n_actions = gym.make(args.env).action_space.n if args.use_lstm: model = acer.ACERSharedModel( shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) else: model = acer.ACERSharedModel( shared=links.NIPSDQNHead(), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=phi) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_env_check(): # Use different random seeds for train and test envs env_seed = args.seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=True, clip_rewards=True) env.seed(int(env_seed)) return env if args.load_agent: agent.load(args.load_agent) if (args.mode == 'train'): # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, ) elif (args.mode == 'check'): return tools.make_video.check(env=make_env_check(), agent=agent, save_mp4=args.save_mp4) elif (args.mode == 'growth'): return tools.make_video.growth(env=make_env_check(), agent=agent, outdir=args.outdir, max_num=args.max_frames, save_mp4=args.save_mp4)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian')) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render and process_idx == 0 and not test: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'LSTMGaussian': model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size) elif args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = A3CLSTM(n_actions) else: model = A3CFF(n_actions) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('rom', type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) parser.set_defaults(use_sdl=False) args = parser.parse_args() # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=dqn_phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) def make_env(test): # Use different random seeds for train and test envs env_seed = 2**31 - 1 - args.seed if test else args.seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
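The two step hooks above interpolate a value linearly from its initial setting down to zero over the full training budget and pass the result to the given setter at every step. A minimal sketch of that schedule (a stand-alone helper, not the chainerrl LinearInterpolationHook class itself), assuming the interpolation is clamped once training ends:

def linear_schedule(step, total_steps, start_value, stop_value=0.0):
    # Fraction of training completed, clamped to [0, 1]
    frac = min(max(step / total_steps, 0.0), 1.0)
    return start_value + frac * (stop_value - start_value)

# e.g. the PPO clipping parameter above starts at 0.1 and reaches 0 at args.steps
assert linear_schedule(0, 100, 0.1) == 0.1
assert abs(linear_schedule(50, 100, 0.1) - 0.05) < 1e-12
assert linear_schedule(100, 100, 0.1) == 0.0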
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-episode-len', type=int, default=5 * 60 * 60 // 4, # 5 minutes with 60/4 fps help='Maximum number of steps for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=env, eval_env=eval_env, outdir=args.outdir, steps=args.steps, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=args.max_episode_len, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): parser = argparse.ArgumentParser() parser.add_argument('--processes', type=int, default=16) parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=250000) parser.add_argument('--eval-n-steps', type=int, default=125000) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = gym.make(args.env).action_space.n model = A3CFF(n_actions) # Draw the computational graph and save it in the output directory. 
fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph([model(fake_obs)], os.path.join( args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10 ** 6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--load_demo', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO(model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, 
minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': import numpy as np from irl.gail import GAIL from irl.gail import Discriminator demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = GAIL(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) elif args.algo == 'airl': import numpy as np from irl.airl import AIRL as Agent from irl.airl import Discriminator # obs_normalizer = None demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = Agent(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], ) save_agent_demo(make_env(False), agent, args.outdir)
def main(): parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--replay-start-size', type=int, default=10000) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') parser.set_defaults(use_lstm=False) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = gym.make(args.env).action_space.n if args.use_lstm: model = acer.ACERSharedModel( shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) else: model = acer.ACERSharedModel( shared=links.NIPSDQNHead(), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
def main(): parser = argparse.ArgumentParser() parser.add_argument('config', help='YAML config file', default="settings/photo_enhancement.yaml") parser.add_argument('--profile', action='store_true') parser.add_argument('--load_generator', type=str, default='generator_weights_for_demo.npz') parser.add_argument('--logger_level', type=int, default=logging.INFO) parser.add_argument('--file_name', type=str, default='images/demo_2.jpeg') args = parser.parse_args() print_args(args) # init a logger logging.basicConfig(level=args.logger_level) # load yaml config file with open(args.config) as f: config = yaml.load(f) # set random seed misc.set_random_seed(config['seed']) # define func to create env, target data sampler, and models if config['problem'] == 'photo_enhancement': def make_env(process_idx, test): assert test, "error: test should be True" env = PhotoEnhancementEnvDemo(batch_size=1, max_episode_steps=config['max_episode_steps'], imsize=config['imsize'], file_name=args.file_name) return env sample_env = make_env(0, True) gen = SpiralModel(config['imsize'], sample_env.num_parameters, config['L_stages'], config['conditional']) dis = SpiralDiscriminator(config['imsize'], config['conditional']) dataset = PhotoEnhancementDataset() else: raise NotImplementedError() # initialize optimizers gen_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5) dis_opt = chainer.optimizers.Adam(alpha=config['lr'], beta1=0.5) gen_opt.setup(gen) dis_opt.setup(dis) gen_opt.add_hook(chainer.optimizer.GradientClipping(40)) dis_opt.add_hook(chainer.optimizer.GradientClipping(40)) if config['weight_decay'] > 0: gen_opt.add_hook(NonbiasWeightDecay(config['weight_decay'])) dis_opt.add_hook(NonbiasWeightDecay(config['weight_decay'])) # load generator's weight assert args.load_generator, "error: specify the weight of the model" if args.load_generator: serializers.load_npz(args.load_generator, gen) # init an spiral agent agent = SPIRAL( generator=gen, discriminator=dis, gen_optimizer=gen_opt, dis_optimizer=dis_opt, dataset=dataset, conditional=config['conditional'], reward_mode=config['reward_mode'], imsize=config['imsize'], max_episode_steps=config['max_episode_steps'], rollout_n=config['rollout_n'], gamma=config['gamma'], alpha=config['alpha'], beta=config['beta'], L_stages=config['L_stages'], U_update=config['U_update'], gp_lambda=config['gp_lambda'], n_save_final_obs_interval=config['n_save_final_obs_interval'], outdir=None, act_deterministically=True ) # training mode max_episode_len = config['max_episode_steps'] * config['rollout_n'] steps = config['processes'] * config['n_update'] * max_episode_len save_interval = config['processes'] * config['n_save_interval'] * max_episode_len eval_interval = config['processes'] * config['n_eval_interval'] * max_episode_len step_hook = SpiralStepHook(config['max_episode_steps'], save_interval, None) env = make_env(0, True) with chainer.using_config('train', False): eval_stats = experiments.evaluator.run_evaluation_episodes( env=env, agent=agent, n_steps=None, n_episodes=1, max_episode_len=1)
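The SPIRAL demo script above pulls everything except the generator weights and the input image from the YAML config. A minimal sketch of a config providing the keys the script reads; every value below is an illustrative placeholder, not a setting used by the authors:

import yaml

example_config = yaml.safe_load("""
problem: photo_enhancement
seed: 0
imsize: 64
max_episode_steps: 5
L_stages: 1
conditional: true
reward_mode: wgangp
rollout_n: 1
gamma: 0.99
alpha: 1.0
beta: 0.01
U_update: 1
gp_lambda: 10.0
lr: 0.0001
weight_decay: 0.0
processes: 1
n_update: 1
n_save_interval: 100
n_eval_interval: 100
n_save_final_obs_interval: 100
""")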