def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v1') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--normalize-obs', action='store_true') parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if args.reward_scale_factor and not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, normalize_obs=args.normalize_obs) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = A3CLSTM(n_actions) else: model = A3CFF(n_actions) # Draw the computational graph and save it in the output directory. fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph([model(fake_obs)], os.path.join( args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
def main(): parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument( '--max-episode-len', type=int, default=5 * 60 * 60 // 4, # 5 minutes with 60/4 fps help='Maximum number of steps for each episode.') parser.add_argument('--final-exploration-frames', type=int, default=4 * 10**6) parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--profile', action='store_true') parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default=None) parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if test: # Randomize actions like epsilon-greedy in evaluation as well env = chainerrl.wrappers.RandomizeAction(env, 0.05) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = make_env(0, test=False) action_space = sample_env.action_space assert isinstance(action_space, spaces.Discrete) # Define a model and its optimizer q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n), DiscreteActionValue) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99) opt.setup(q_func) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 # Make process-specific agents to diversify exploration def make_agent(process_idx): # Random epsilon assignment described in the original paper rand = random.random() if rand < 0.4: epsilon_target = 0.1 elif rand < 0.7: epsilon_target = 0.01 else: epsilon_target = 0.5 explorer = explorers.LinearDecayEpsilonGreedy( 1, epsilon_target, args.final_exploration_frames, action_space.sample) # Suppress the explorer logger explorer.logger.setLevel(logging.INFO) return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99, i_target=40000, explorer=explorer, phi=phi) if args.demo: env = make_env(0, True) agent = make_agent(0) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( outdir=args.outdir, processes=args.processes, make_env=make_env, make_agent=make_agent, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
chainerrl.misc.draw_computational_graph( [model(fake_obs)], os.path.join(args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) phi = lambda x: x.astype(np.float32, copy=False) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta, phi=phi) lr_decay_hook = experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter) training = experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.threads, make_env=make_env, profile=False, steps=args.steps, eval_interval=args.eval_interval, eval_n_episodes=args.eval_n_runs, max_episode_len=args.max_episode_len, successful_score=args.stop, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, logger=logger,
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4', help='Gym Env ID.') parser.add_argument('--gpu', type=int, default=0, help='GPU device ID. Set to -1 to use CPUs only.') parser.add_argument('--num-envs', type=int, default=8, help='Number of env instances run in parallel.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7, help='Total time steps for training.') parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4, help='Learning rate.') parser.add_argument('--eval-interval', type=int, default=100000, help='Interval (in timesteps) between evaluation' ' phases.') parser.add_argument('--eval-n-runs', type=int, default=10, help='Number of episodes ran in an evaluation phase.') parser.add_argument('--demo', action='store_true', default=False, help='Run demo episodes, not training.') parser.add_argument('--load', type=str, default='', help='Directory path to load a saved agent data from' ' if it is a non-empty string.') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') parser.add_argument('--update-interval', type=int, default=128 * 8, help='Interval (in timesteps) between PPO iterations.') parser.add_argument('--batchsize', type=int, default=32 * 8, help='Size of minibatch (in timesteps).') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs used for each PPO iteration.') parser.add_argument('--log-interval', type=int, default=10000, help='Interval (in timesteps) of printing logs.') parser.add_argument('--recurrent', action='store_true', default=False, help='Use a recurrent model. See the code for the' ' model definition.') parser.add_argument('--flicker', action='store_true', default=False, help='Use so-called flickering Atari, where each' ' screen is blacked out with probability 0.5.') parser.add_argument('--no-frame-stack', action='store_true', default=False, help='Disable frame stacking so that the agent can' ' only see the current screen.') parser.add_argument('--checkpoint-frequency', type=int, default=None, help='Frequency at which agents are stored.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(env_seed) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ (lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(0, test=False) print('Observation space', sample_env.observation_space) print('Action space', sample_env.action_space) n_actions = sample_env.action_space.n winit_last = chainer.initializers.LeCunNormal(1e-2) if args.recurrent: model = chainerrl.links.StatelessRecurrentSequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0), chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) else: model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) # Draw the computational graph and save it in the output directory. fake_obss = np.zeros(sample_env.observation_space.shape, dtype=np.float32)[None] if args.recurrent: fake_out, _ = model(fake_obss, None) else: fake_out = model(fake_obss) chainerrl.misc.draw_computational_graph([fake_out], os.path.join(args.outdir, 'model')) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(0.5)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value step_hooks.append( experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, )
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--replay-start-size', type=int, default=10000) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = acer.ACERSharedModel( shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) else: model = acer.ACERSharedModel( shared=links.NIPSDQNHead(), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=-1) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) # def make_env(process_idx, test): # env = gym.make(args.env) # # Use different random seeds for train and test envs # process_seed = int(process_seeds[process_idx]) # env_seed = 2 ** 32 - 1 - process_seed if test else process_seed # env.seed(env_seed) # # Cast observations to float32 because our model uses float32 # env = chainerrl.wrappers.CastObservationToFloat32(env) # if args.monitor: # env = chainerrl.wrappers.Monitor(env, args.outdir) # if not test: # # Scale rewards (and thus returns) to a reasonable range so that # # training is easier # env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) # if args.render: # env = chainerrl.wrappers.Render(env) # return env def make_env(test): env = gym.make( "DaktyPushingSimulationEnv-v0", level=5, simulation_backend="mujoco", control_frequency_in_hertz=100, state_space_components_to_be_used=None, alternate_env_object=None, discretization_factor_torque_control_space=None, model_as_function_for_pixel_to_latent_space_parsing=(None, None)) # print('\n############\n', env, '\n############\n') env.unwrapped.finger.set_resolution_quality('low') # print('\n############\n', env, '\n############\n') env = gym.wrappers.TimeLimit(env) # print('\n############\n', env, '\n############\n') # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs # env_seed = 2 ** 32 - 1 - args.seed if test else args.seed # env.seed(env_seed) process_seed = 420 env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = make_env(0) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space print('\n\n------------------- obs_space: ', obs_space.shape, '\n\n\n') # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 )) vf = chainer.Sequential( concat_obs_and_action, L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: env = make_env(False) n_episodes = 10000 # pbar = tqdm(total=n_episodes) max_episode_len = 1000 for i in range(1, n_episodes + 1): # pbar.update(1) obs = env.reset() # print('obs inital..............', obs.shape) reward = 0 done = False R = 0 # return (sum of rewards) t = 0 # time step # pbar = tqdm(total=max_episode_len) while not done and t < max_episode_len: # pbar.update(1) # Uncomment to watch the behaviour # env.render() action = agent.act_and_train(obs, reward) # print('action..................', action) obs, reward, done, _ = env.step(action) # print('obs.....................', obs) # print('reward..................', reward) R += reward t += 1 if i % 10 == 0: print('episode:', i, 'R:', R, 'statistics:', agent.get_statistics()) agent.stop_episode_and_train(obs, reward, done) print('Finished.') # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
def main(): # This prevents numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging # logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--use-sdl', action='store_true', default=False) parser.add_argument('--final-exploration-frames', type=int, default=4 * 10**6) parser.add_argument('--outdir', type=str, default='nsq_output') parser.add_argument('--profile', action='store_true') parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default=None) args = parser.parse_args() if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(process_idx, test): env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env sample_env = make_env(0, test=False) action_space = sample_env.action_space assert isinstance(action_space, spaces.Discrete) # Define a model and its optimizer q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n), DiscreteActionValue) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99) opt.setup(q_func) # Make process-specific agents to diversify exploration def make_agent(process_idx): # Random epsilon assignment described in the original paper rand = random.random() if rand < 0.4: epsilon_target = 0.1 elif rand < 0.7: epsilon_target = 0.01 else: epsilon_target = 0.5 explorer = explorers.LinearDecayEpsilonGreedy( 1, epsilon_target, args.final_exploration_frames, action_space.sample) # Suppress the explorer logger explorer.logger.setLevel(logging.INFO) return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99, i_target=40000, explorer=explorer, phi=dqn_phi) if args.demo: env = make_env(0, True) agent = make_agent(0) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample) # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(outdir=args.outdir, processes=args.processes, make_env=make_env, make_agent=make_agent, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, eval_explorer=explorer, global_step_hooks=[lr_decay_hook])
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-episode-len', type=int, default=5 * 60 * 60 // 4, # 5 minutes with 60/4 fps help='Maximum number of steps for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for train and test envs. train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=env, eval_env=eval_env, outdir=args.outdir, steps=args.steps, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=args.max_episode_len, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
gpu=gpu, phi=phi, update_interval=update_interval, minibatch_size=64, epochs=10, clip_eps_vf=None, entropy_coef=0.0, ) # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook(steps, 3e-4, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( steps, 0.2, 0, clip_eps_setter) # Use GPU if any available if gpu >= 0: chainer.cuda.get_device(gpu).use() model.to_gpu(gpu)
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.set_defaults(use_sdl=False) parser.set_defaults(use_lstm=False) args = parser.parse_args() if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions if args.use_lstm: model = A3CLSTM(n_actions) else: model = A3CFF(n_actions) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=dqn_phi) if args.load: agent.load(args.load) def make_env(process_idx, test): env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[lr_decay_hook])
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() #logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ (lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.ppo_update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) # Switch policy types accordingly to action space types if isinstance(action_space, gym.spaces.Discrete): n_actions = action_space.n policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ) elif isinstance(action_space, gym.spaces.Box): action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) else: print("""\ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA return vf = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
def train_or_evaluate( args, agent, env_id, env, seed, render, eval_n_runs, steps, eval_interval, outdir, ): timestep_limit = env.spec.max_episode_steps if args.algo in [AlgoName.CAPG_PPO, AlgoName.CAPG_TRPO]: train_agent_with_evaluation( agent=agent, env=env, eval_env=make_env(args, env_id, seed, render, outdir, is_test=True), outdir=outdir, steps=steps, eval_n_runs=eval_n_runs, eval_interval=eval_interval, max_episode_len=timestep_limit, save_best_so_far_agent=True, ) elif args.algo is AlgoName.CHAINERRL_PPO: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.adam_lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(args, False), eval_env=make_batch_env(args, True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=True, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], use_humans_reward=args.use_humans_reward, humans_reward_interval=args.humans_reward_interval, step_offset=args.offset_steps, )
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('rom', type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) parser.set_defaults(use_sdl=False) args = parser.parse_args() # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=dqn_phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) def make_env(test): # Use different random seeds for train and test envs env_seed = 2**31 - 1 - args.seed if test else args.seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('rom', type=str) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--use-sdl', action='store_true', default=False) parser.add_argument('--final-exploration-frames', type=int, default=4 * 10**6) parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--profile', action='store_true') parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default=None) args = parser.parse_args() # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env sample_env = make_env(0, test=False) action_space = sample_env.action_space assert isinstance(action_space, spaces.Discrete) # Define a model and its optimizer q_func = links.Sequence(links.NIPSDQNHead(), L.Linear(256, action_space.n), DiscreteActionValue) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99) opt.setup(q_func) # Make process-specific agents to diversify exploration def make_agent(process_idx): # Random epsilon assignment described in the original paper rand = random.random() if rand < 0.4: epsilon_target = 0.1 elif rand < 0.7: epsilon_target = 0.01 else: epsilon_target = 0.5 explorer = explorers.LinearDecayEpsilonGreedy( 1, epsilon_target, args.final_exploration_frames, action_space.sample) # Suppress the explorer logger explorer.logger.setLevel(logging.INFO) return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99, i_target=40000, explorer=explorer, phi=dqn_phi) if args.demo: env = make_env(0, True) agent = make_agent(0) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample) # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async(outdir=args.outdir, processes=args.processes, make_env=make_env, make_agent=make_agent, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, eval_explorer=explorer, global_step_hooks=[lr_decay_hook])
def main(): parser = argparse.ArgumentParser() parser.add_argument('--processes', type=int, default=16) parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=250000) parser.add_argument('--eval-n-steps', type=int, default=125000) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = gym.make(args.env).action_space.n model = A3CFF(n_actions) # Draw the computational graph and save it in the output directory. fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph([model(fake_obs)], os.path.join( args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta, phi=phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_steps=args.eval_n_steps, eval_n_episodes=None, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10 ** 6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--load_demo', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO(model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': import numpy as np from irl.gail import GAIL from irl.gail import Discriminator demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = GAIL(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) elif args.algo == 'airl': import numpy as np from irl.airl import AIRL as Agent from irl.airl import Discriminator # obs_normalizer = None demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = Agent(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], ) save_agent_demo(make_env(False), agent, args.outdir)
def main(): parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--t-max', type=int, default=5) parser.add_argument('--replay-start-size', type=int, default=10000) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--use-lstm', action='store_true') parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') parser.set_defaults(use_lstm=False) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. # If you use more than one processes, the results will be no longer # deterministic even with the same random seed. misc.set_random_seed(args.seed) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = gym.make(args.env).action_space.n if args.use_lstm: model = acer.ACERSharedModel( shared=links.Sequence(links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) else: model = acer.ACERSharedModel( shared=links.NIPSDQNHead(), pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, beta=args.beta, phi=phi) if args.load: agent.load(args.load) def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari( args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.lr = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_async( agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, global_step_hooks=[lr_decay_hook], save_best_so_far_agent=False, )
def main(args, train_env): logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) temp = args.outdir.split('/')[-1] dst = args.outdir.strip(temp) def make_env(test): env = gym.make(args.env) if test: episode_length = args.eval_episode_length else: episode_length = args.episode_length env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length=episode_length, n_experts=args.n_experts, n_demos_per_expert=1, n_expert_time_steps=args.length_expert_TS, seed_agent=args.seed_agent, seed_expert=args.seed_expert, adam_days=args.adam_days) # Use different random seeds for train and test envs env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) sample_env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length=args.episode_length, n_experts=args.n_experts, n_demos_per_expert=1, n_expert_time_steps=args.length_expert_TS, seed_agent=args.seed_agent, seed_expert=args.seed_expert, adam_days=args.adam_days) demonstrations = sample_env.generate_expert_trajectories(out_dir=dst, eval=False) timestep_limit = None #sample_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') # This value is None # Generate expert data for evaluation temp_env = gym.make(args.env) temp_env.initialize_environment( case=args.state_rep, n_historical_events=args.n_historical_events, episode_length= 0, # This parameter does not really matter since we create this env only for generating samples n_experts=args.n_experts, n_demos_per_expert=1, # We do not perform any clustering right now # n_demos_per_expert=args.n_demos_per_expert, # How large should the expert cluster be? n_expert_time_steps=args. eval_episode_length, # How long should each expert trajectory be? seed_expert=args.seed_expert, adam_days=args.adam_days) temp_env.generate_expert_trajectories(out_dir=dst, eval=True) obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance if args.state_rep == 1: obs_dim = obs_space.low.size elif args.state_rep == 2 or args.state_rep == 21 or args.state_rep == 22 or args.state_rep == 24 or args.state_rep == 4 or args.state_rep == 221 or args.state_rep == 222 \ or args.state_rep == 71 or args.state_rep == 17 or args.state_rep == 81: obs_dim = obs_space.n elif args.state_rep == 3 or args.state_rep == 11 or args.state_rep == 23 or args.state_rep == 31 or args.state_rep == 7: obs_dim = obs_space.nvec.size else: raise NotImplementedError if args.normalize_obs: obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_dim, clip_threshold=5) # shape: Shape of input values except batch axis else: obs_normalizer = None # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_dim, action_space.n, hidden_sizes=args.G_layers) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=10e-1) opt.setup(model) if args.show_D_dummy: # Let discriminator see dummy input_dim_D = obs_dim + 1 elif not args.show_D_dummy: # Do not let discriminator see dummy if args.state_rep == 21 or args.state_rep == 17: input_dim_D = obs_dim + 1 else: input_dim_D = obs_dim + 1 - args.n_experts if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': from customer_behaviour.algorithms.irl.gail import GAIL as G from customer_behaviour.algorithms.irl.gail import Discriminator as D demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D, hidden_sizes=args.D_layers, loss_type=args.loss_type) agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, args=args) elif args.algo == 'airl': from customer_behaviour.algorithms.irl.airl import AIRL as G from customer_behaviour.algorithms.irl.airl import Discriminator as D # obs_normalizer = None demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D - 1, hidden_sizes=args.D_layers) # AIRL only inputs state to D agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, noise=args.noise, n_experts=args.n_experts, episode_length=args.episode_length, adam_days=args.adam_days, dummy_D=args.show_D_dummy) elif args.algo == 'mmct-gail': from customer_behaviour.algorithms.irl.gail.mmct_gail import MMCTGAIL as G from customer_behaviour.algorithms.irl.gail import Discriminator as D demonstrations = np.load(dst + '/expert_trajectories.npz') D = D(gpu=args.gpu, input_dim=input_dim_D, hidden_sizes=args.D_layers, loss_type=args.loss_type) agent = G(env=sample_env, demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, args=args) if args.load: # By default, not in here agent.load(args.load) if args.demo: # By default, not in here env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) if train_env is None: experiments.train_agent_with_evaluation( agent=agent, env=make_env( False ), # Environment train the agent against (False -> scaled rewards) eval_env=make_env(True), # Environment used for evaluation outdir=args.outdir, steps=args. steps, # Total number of timesteps for training (args.n_training_episodes*args.episode_length) eval_n_steps= None, # Number of timesteps at each evaluation phase eval_n_episodes=args. eval_n_runs, # Number of episodes at each evaluation phase (default: 10) eval_interval=args. eval_interval, # Interval of evaluation (defualt: 10000 steps (?)) train_max_episode_len= timestep_limit, # Maximum episode length during training (is None) save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], checkpoint_freq=args.eval_interval) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=train_env, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, outdir=args.outdir, max_episode_len=timestep_limit, eval_max_episode_len=None, eval_env=make_env(True), step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], save_best_so_far_agent=False, checkpoint_freq=args.eval_interval, log_interval=args.update_interval) save_agent_demo( make_env(True), agent, args.outdir, 10 * args.eval_episode_length ) # originally it was make_env(test=False) which seems strange # Move result files to correct folder and remove empty folder move_dir(args.outdir, dst) os.rmdir(args.outdir) if args.save_results: print('Saving result...') res2.save_data(dst, 10000, 50, N=1) print('Running evaluate policy...') ep.eval_policy(a_dir_path=dst) # else: # if args.n_experts <= 10: # print('Running evaluate policy...') # ep.eval_policy(a_dir_path=dst) # # print('Running evaluate training...') # # ets.eval_training(a_dir_path=dst) # print('Done') if args.save_report_material: print('Saving dataframe...') if args.state_rep == 21: if args.algo == 'gail': folder_name = 'gail' elif args.algo == 'airl': folder_name = 'airl' elif args.state_rep == 22: if args.algo == 'gail': folder_name = 'gail_dummies' elif args.algo == 'airl': folder_name = 'airl_dummies' elif args.state_rep == 81: if args.algo == 'gail': folder_name = 'gail_adams' elif args.algo == 'airl': folder_name = 'airl_adams' elif args.state_rep == 17: folder_name = 'ail' elif args.state_rep == 221: folder_name = 'ail_dummies' elif args.state_rep == 71: folder_name = 'ail_adams' report_material.save_df(dst, folder_name) if args.save_folder is not None: print('Saving result to ' + args.save_folder) os.makedirs(os.path.join(os.getcwd(), args.save_folder), exist_ok=True) from distutils.dir_util import copy_tree copy_tree( os.path.join(os.getcwd(), dst), os.path.join(os.getcwd(), args.save_folder, args.outdir.split('/')[-2]))