def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v1') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--normalize-obs', action='store_true') parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if args.reward_scale_factor and not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, normalize_obs=args.normalize_obs) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly 
decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): # Prevent numpy from using multiple threads os.environ['OMP_NUM_THREADS'] = '1' import logging logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument('rom', type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--use-sdl', action='store_true') parser.add_argument('--max-episode-len', type=int, default=10000) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**6) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) parser.set_defaults(use_sdl=False) args = parser.parse_args() # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) n_actions = ale.ALE(args.rom).number_of_actions model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, gpu=args.gpu, phi=dqn_phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) def make_env(test): # Use different random seeds for train and test envs env_seed = 2**31 - 1 - args.seed if test else args.seed env = ale.ALE(args.rom, use_sdl=args.use_sdl, treat_life_lost_as_terminal=not test, seed=env_seed) if not test: misc.env_modifiers.make_reward_clipped(env, -1, 1) return env if args.demo: env = make_env(True) eval_stats = experiments.eval_performance(env=env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4', help='Gym Env ID.') parser.add_argument('--gpu', type=int, default=0, help='GPU device ID. Set to -1 to use CPUs only.') parser.add_argument('--num-envs', type=int, default=8, help='Number of env instances run in parallel.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7, help='Total time steps for training.') parser.add_argument( '--max-frames', type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help='Maximum number of frames for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4, help='Learning rate.') parser.add_argument('--eval-interval', type=int, default=100000, help='Interval (in timesteps) between evaluation' ' phases.') parser.add_argument('--eval-n-runs', type=int, default=10, help='Number of episodes ran in an evaluation phase.') parser.add_argument('--demo', action='store_true', default=False, help='Run demo episodes, not training.') parser.add_argument('--load', type=str, default='', help='Directory path to load a saved agent data from' ' if it is a non-empty string.') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') parser.add_argument('--update-interval', type=int, default=128 * 8, help='Interval (in timesteps) between PPO iterations.') parser.add_argument('--batchsize', type=int, default=32 * 8, help='Size of minibatch (in timesteps).') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs used for each PPO iteration.') parser.add_argument('--log-interval', type=int, default=10000, help='Interval (in timesteps) of printing logs.') parser.add_argument('--recurrent', action='store_true', default=False, help='Use a recurrent model. See the code for the' ' model definition.') parser.add_argument('--flicker', action='store_true', default=False, help='Use so-called flickering Atari, where each' ' screen is blacked out with probability 0.5.') parser.add_argument('--no-frame-stack', action='store_true', default=False, help='Disable frame stacking so that the agent can' ' only see the current screen.') parser.add_argument('--checkpoint-frequency', type=int, default=None, help='Frequency at which agents are stored.') args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(env_seed) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ (lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs)) ]) sample_env = make_env(0, test=False) print('Observation space', sample_env.observation_space) print('Action space', sample_env.action_space) n_actions = sample_env.action_space.n winit_last = chainer.initializers.LeCunNormal(1e-2) if args.recurrent: model = chainerrl.links.StatelessRecurrentSequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0), chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) else: model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), )) # Draw the computational graph and save it in the output directory. 
fake_obss = np.zeros(sample_env.observation_space.shape, dtype=np.float32)[None] if args.recurrent: fake_out, _ = model(fake_obss, None) else: fake_out = model(fake_obss) chainerrl.misc.draw_computational_graph([fake_out], os.path.join(args.outdir, 'model')) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(0.5)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance( env=make_batch_env(test=True), agent=agent, n_steps=None, n_episodes=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value step_hooks.append( experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0, help='GPU to use, set to -1 if no GPU.') parser.add_argument('--env', type=str, default='Hopper-v2', help='OpenAI Gym MuJoCo env to perform algorithm on.') parser.add_argument('--num-envs', type=int, default=1, help='Number of envs run in parallel.') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=2 * 10 ** 6, help='Total number of timesteps to train the agent.') parser.add_argument('--eval-interval', type=int, default=100000, help='Interval in timesteps between evaluations.') parser.add_argument('--eval-n-runs', type=int, default=100, help='Number of episodes run for each evaluation.') parser.add_argument('--render', action='store_true', help='Render env states in a GUI window.') parser.add_argument('--demo', action='store_true', help='Just run evaluation, not training.') parser.add_argument('--load', type=str, default='', help='Directory to load agent from.') parser.add_argument('--logger-level', type=int, default=logging.INFO, help='Level of the root logger.') parser.add_argument('--monitor', action='store_true', help='Wrap env with gym.wrappers.Monitor.') parser.add_argument('--log-interval', type=int, default=1000, help='Interval in timesteps between outputting log' ' messages during training') parser.add_argument('--update-interval', type=int, default=2048, help='Interval in timesteps between model updates.') parser.add_argument('--epochs', type=int, default=10, help='Number of epochs to update model for per PPO' ' iteration.') parser.add_argument('--batch-size', type=int, default=64, help='Minibatch size') args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv( [functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs))]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space print('Observation space:', obs_space) print('Action space:', action_space) assert isinstance(action_space, gym.spaces.Box) # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # While the original paper initialized weights by normal distribution, # we use orthogonal initialization as the latest openai/baselines does. winit = chainerrl.initializers.Orthogonal(1.) winit_last = chainerrl.initializers.Orthogonal(1e-2) action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) vf = chainer.Sequential( L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 64, initialW=winit), F.tanh, L.Linear(None, 1, initialW=winit), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(3e-4, eps=1e-5) opt.setup(model) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batch_size, epochs=args.epochs, clip_eps_vf=None, entropy_coef=0, standardize_advantages=True, gamma=0.995, lambd=0.97, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, max_episode_len=timestep_limit, save_best_so_far_agent=False, )
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4') parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**7) parser.add_argument( '--max-episode-len', type=int, default=5 * 60 * 60 // 4, # 5 minutes with 60/4 fps help='Maximum number of steps for each episode.') parser.add_argument('--lr', type=float, default=2.5e-4) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logging-level', type=int, default=20, help='Logging level. 10:DEBUG, 20:INFO etc.') parser.add_argument('--render', action='store_true', default=False, help='Render env states in a GUI window.') parser.add_argument('--monitor', action='store_true', default=False, help='Monitor env. Videos and additional information' ' are saved as output files.') # In the original paper, agent runs in 8 environments parallely # and samples 128 steps per environment. # Sample 128 * 8 steps, instead. parser.add_argument('--update-interval', type=int, default=128 * 8) parser.add_argument('--batchsize', type=int, default=32) parser.add_argument('--epochs', type=int, default=3) args = parser.parse_args() import logging logging.basicConfig(level=args.logging_level) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for train and test envs. 
train_seed = args.seed test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print('Output files are saved in {}'.format(args.outdir)) def make_env(test): # Use different random seeds for train and test envs env_seed = test_seed if test else train_seed env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env), episode_life=not test, clip_rewards=not test) env.seed(int(env_seed)) if args.monitor: env = gym.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env env = make_env(test=False) eval_env = make_env(test=True) n_actions = env.action_space.n model = A3CFF(n_actions) opt = chainer.optimizers.Adam(alpha=args.lr) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: eval_stats = experiments.eval_performance(env=eval_env, agent=agent, n_runs=args.eval_n_runs) print('n_runs: {} mean: {} median: {} stdev: {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.1, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=env, eval_env=eval_env, outdir=args.outdir, steps=args.steps, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=args.max_episode_len, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('algo', default='ppo', choices=['ppo', 'gail', 'airl'], type=str) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10 ** 6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--load_demo', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu,)) if not (args.demo and args.load): args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs env_seed = 2 ** 32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) if args.algo == 'ppo': agent = PPO(model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, 
minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) elif args.algo == 'gail': import numpy as np from irl.gail import GAIL from irl.gail import Discriminator demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = GAIL(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) elif args.algo == 'airl': import numpy as np from irl.airl import AIRL as Agent from irl.airl import Discriminator # obs_normalizer = None demonstrations = np.load(args.load_demo) D = Discriminator(gpu=args.gpu) agent = Agent(demonstrations=demonstrations, discriminator=D, model=model, optimizer=opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages,) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) outdir = args.load if args.load else args.outdir save_agent_demo(make_env(False), agent, outdir) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = max(value, 1e-8) clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, train_max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], ) save_agent_demo(make_env(False), agent, args.outdir)
def __init__(self, args, sample_env): obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization( obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': #model = A3CFFSoftmax(obs_space.low.size, action_space.n) model = A3CFFSoftmax(obs_space.low.size, sample_env.env_prop.get_softmax_layer_size(), n_hidden_channels=600, beta=cfg.beta) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean, n_hidden_channels=cfg.n_hidden_channels) elif args.arch == 'FFParamSoftmax': model = A3CFFParamSoftmax( obs_space.low.size, sample_env.env_prop.get_pre_output_layer_size(), sample_env.env_prop.get_parametric_segments(), sample_env.env_prop.get_parametric_softmax_segments_sizes(), n_hidden_channels=600, beta=cfg.beta) else: raise NotImplementedError opt = chainer.optimizers.Adam(alpha=args.adam_lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) # a workaround for saving obs_normalizer # see https://github.com/chainer/chainerrl/issues/376 if 'obs_normalizer' not in PPO.saved_attributes: PPO.saved_attributes.append('obs_normalizer') agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, phi=lambda x: x.astype(np.float32, copy=False), gamma=args.ppo_gamma, lambd=args.ppo_lambda, update_interval=args.ppo_update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) self._agent = agent
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--arch', type=str, default='FFGaussian', choices=('FFSoftmax', 'FFMellowmax', 'FFGaussian')) parser.add_argument('--bound-mean', action='store_true') parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() #logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = gym.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ (lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) # Switch policy types accordingly to action space types if args.arch == 'FFSoftmax': model = A3CFFSoftmax(obs_space.low.size, action_space.n) elif args.arch == 'FFMellowmax': model = A3CFFMellowmax(obs_space.low.size, action_space.n) elif args.arch == 'FFGaussian': model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=args.bound_mean) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.ppo_update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) # Linearly decay the clipping parameter to zero def clip_eps_setter(env, agent, value): agent.clip_eps = value clip_eps_decay_hook = experiments.LinearInterpolationHook( args.steps, 0.2, 0, clip_eps_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, clip_eps_decay_hook, ], )
action_space = env.action_space model = A3CFFGaussian(obs_space.low.size, action_space, bound_mean=False, normalize_obs=False) opt = chainer.optimizers.Adam(alpha=3e-4, eps=1e-5) opt.setup(model) agent = PPO(model, opt, gpu=-1, phi=phi, update_interval=2048, minibatch_size=64, epochs=10, clip_eps_vf=None, entropy_coef=0.0, standardize_advantages=False) agent.load("parameters") ACTION_MEANINGS = { 0: 'Hip1(Torque/Velocity)', 1: 'Knee1(Torque/Velocity)', 2: 'Hip2(Torque/Velocity)', 3: 'Knee2(Torque/Velocity)', } launch_visualizer(agent, env, ACTION_MEANINGS)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if not test: # Scale rewards (and thus returns) to a reasonable range so that # training is easier env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) if args.render: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) # Switch policy types accordingly to action space types if isinstance(action_space, gym.spaces.Discrete): n_actions = action_space.n policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ) elif isinstance(action_space, gym.spaces.Box): action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 ), ) else: print("""\ This example only supports gym.spaces.Box or gym.spaces.Discrete action spaces.""" ) # NOQA return vf = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_batch_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, 
env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=-1) parser.add_argument('--env', type=str, default='Hopper-v2') parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)') parser.add_argument('--outdir', type=str, default='results', help='Directory path to save output files.' ' If it does not exist, it will be created.') parser.add_argument('--steps', type=int, default=10**6) parser.add_argument('--eval-interval', type=int, default=10000) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--standardize-advantages', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=3e-4) parser.add_argument('--weight-decay', type=float, default=0.0) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--window-size', type=int, default=100) parser.add_argument('--update-interval', type=int, default=2048) parser.add_argument('--log-interval', type=int, default=1000) parser.add_argument('--batchsize', type=int, default=64) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--entropy-coef', type=float, default=0.0) args = parser.parse_args() logging.basicConfig(level=args.logger_level) # Set a random seed used in ChainerRL misc.set_random_seed(args.seed, gpus=(args.gpu, )) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) # def make_env(process_idx, test): # env = gym.make(args.env) # # Use different random seeds for train and test envs # process_seed = int(process_seeds[process_idx]) # env_seed = 2 ** 32 - 1 - process_seed if test else process_seed # env.seed(env_seed) # # Cast observations to float32 because our model uses float32 # env = chainerrl.wrappers.CastObservationToFloat32(env) # if args.monitor: # env = chainerrl.wrappers.Monitor(env, args.outdir) # if not test: # # Scale rewards (and thus returns) to a reasonable range so that # # training is easier # env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor) # if args.render: # env = chainerrl.wrappers.Render(env) # return env def make_env(test): env = gym.make( "DaktyPushingSimulationEnv-v0", level=5, simulation_backend="mujoco", control_frequency_in_hertz=100, state_space_components_to_be_used=None, alternate_env_object=None, discretization_factor_torque_control_space=None, model_as_function_for_pixel_to_latent_space_parsing=(None, None)) # print('\n############\n', env, '\n############\n') env.unwrapped.finger.set_resolution_quality('low') # print('\n############\n', env, '\n############\n') env = gym.wrappers.TimeLimit(env) # print('\n############\n', env, '\n############\n') # Unwrap TimeLimit wrapper assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs # env_seed = 2 ** 32 - 1 - args.seed if test else args.seed # env.seed(env_seed) process_seed = 420 env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = chainerrl.wrappers.CastObservationToFloat32(env) if args.monitor: env = chainerrl.wrappers.Monitor(env, args.outdir) if args.render and not test: env = chainerrl.wrappers.Render(env) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv([ functools.partial(make_env, idx, test) for idx, env in enumerate(range(args.num_envs)) ]) # Only for getting timesteps, and obs-action spaces sample_env = make_env(0) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space print('\n\n------------------- obs_space: ', obs_space.shape, '\n\n\n') # Normalize observations based on their empirical mean and variance obs_normalizer = chainerrl.links.EmpiricalNormalization(obs_space.low.size, clip_threshold=5) winit_last = chainer.initializers.LeCunNormal(1e-2) action_size = action_space.low.size policy = chainer.Sequential( L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, action_size, initialW=winit_last), chainerrl.policies.GaussianHeadWithStateIndependentCovariance( action_size=action_size, var_type='diagonal', var_func=lambda x: F.exp(2 * x), # Parameterize log std var_param_init=0, # log std = 0 => std = 1 )) vf = chainer.Sequential( concat_obs_and_action, L.Linear(None, 64), F.tanh, L.Linear(None, 64), F.tanh, L.Linear(None, 1), ) # Combine a policy and a value function into a single model model = chainerrl.links.Branched(policy, vf) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) agent = PPO( model, opt, obs_normalizer=obs_normalizer, gpu=args.gpu, 
update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps_vf=None, entropy_coef=args.entropy_coef, standardize_advantages=args.standardize_advantages, ) if args.load: agent.load(args.load) if args.demo: env = make_env(True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: env = make_env(False) n_episodes = 10000 # pbar = tqdm(total=n_episodes) max_episode_len = 1000 for i in range(1, n_episodes + 1): # pbar.update(1) obs = env.reset() # print('obs inital..............', obs.shape) reward = 0 done = False R = 0 # return (sum of rewards) t = 0 # time step # pbar = tqdm(total=max_episode_len) while not done and t < max_episode_len: # pbar.update(1) # Uncomment to watch the behaviour # env.render() action = agent.act_and_train(obs, reward) # print('action..................', action) obs, reward, done, _ = env.step(action) # print('obs.....................', obs) # print('reward..................', reward) R += reward t += 1 if i % 10 == 0: print('episode:', i, 'R:', R, 'statistics:', agent.get_statistics()) agent.stop_episode_and_train(obs, reward, done) print('Finished.') # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value lr_decay_hook = experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_env(False), eval_env=make_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval, log_interval=args.log_interval, return_window_size=args.window_size, max_episode_len=timestep_limit, save_best_so_far_agent=False, step_hooks=[ lr_decay_hook, ], )
def main(args): import logging logging.basicConfig(level=logging.INFO, filename='log') if(type(args) is list): args=make_args(args) if not os.path.exists(args.outdir): os.makedirs(args.outdir) # Set a random seed used in ChainerRL. misc.set_random_seed(args.seed, gpus=(args.gpu,)) # Set different random seeds for different subprocesses. # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs assert process_seeds.max() < 2 ** 32 def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) env_seed = 2 ** 32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, clip_rewards=not test, flicker=args.flicker, frame_stack=not args.no_frame_stack, ) env.seed(env_seed) if args.monitor: env = chainerrl.wrappers.Monitor( env, args.outdir, mode='evaluation' if test else 'training') if args.render: env = chainerrl.wrappers.Render(env) return env def make_env_check(): # Use different random seeds for train and test envs env_seed = args.seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=True, clip_rewards=True) env.seed(int(env_seed)) return env def make_batch_env(test): return chainerrl.envs.MultiprocessVectorEnv( [(lambda: make_env(idx, test)) for idx, env in enumerate(range(args.num_envs))]) sample_env = make_env(0, test=False) print('Observation space', sample_env.observation_space) print('Action space', sample_env.action_space) n_actions = sample_env.action_space.n winit_last = chainer.initializers.LeCunNormal(1e-2) if args.recurrent: model = chainerrl.links.StatelessRecurrentSequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, L.NStepGRU(1, 512, 512, 0), chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), ) ) else: model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, chainerrl.links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), chainerrl.distribution.SoftmaxDistribution, ), L.Linear(None, 1), ) ) # Draw the computational graph and save it in the output directory. 
fake_obss = np.zeros( sample_env.observation_space.shape, dtype=np.float32)[None] if args.recurrent: fake_out, _ = model(fake_obss, None) else: fake_out = model(fake_obss) chainerrl.misc.draw_computational_graph( [fake_out], os.path.join(args.outdir, 'model')) opt = chainer.optimizers.Adam(alpha=args.lr, eps=1e-5) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(0.5)) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 agent = PPO( model, opt, gpu=args.gpu, phi=phi, update_interval=args.update_interval, minibatch_size=args.batchsize, epochs=args.epochs, clip_eps=0.1, clip_eps_vf=None, standardize_advantages=True, entropy_coef=1e-2, recurrent=args.recurrent, ) if args.load_agent: agent.load(args.load_agent) if (args.mode=='train'): step_hooks = [] # Linearly decay the learning rate to zero def lr_setter(env, agent, value): agent.optimizer.alpha = value step_hooks.append( experiments.LinearInterpolationHook( args.steps, args.lr, 0, lr_setter)) experiments.train_agent_batch_with_evaluation( agent=agent, env=make_batch_env(False), eval_env=make_batch_env(True), outdir=args.outdir, steps=args.steps, eval_n_steps=None, eval_n_episodes=args.eval_n_runs, step_offset=args.step_offset, checkpoint_freq=args.checkpoint_frequency, eval_interval=args.eval_interval, log_interval=args.log_interval, save_best_so_far_agent=False, step_hooks=step_hooks, log_type=args.log_type ) elif (args.mode=='check'): return tools.make_video.check(env=make_env_check(),agent=agent,save_mp4=args.save_mp4) elif (args.mode=='growth'): return tools.make_video.growth(env=make_env_check(),agent=agent,outdir=args.outdir,max_num=args.max_frames,save_mp4=args.save_mp4)
def main(args, train_env):
    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not (args.demo and args.load):
        args.outdir = experiments.prepare_output_dir(args, args.outdir)

    # dst is the parent directory of outdir (used for expert data and results)
    temp = args.outdir.split('/')[-1]
    dst = args.outdir[:-len(temp)]

    def make_env(test):
        env = gym.make(args.env)
        if test:
            episode_length = args.eval_episode_length
        else:
            episode_length = args.episode_length
        env.initialize_environment(
            case=args.state_rep,
            n_historical_events=args.n_historical_events,
            episode_length=episode_length,
            n_experts=args.n_experts,
            n_demos_per_expert=1,
            n_expert_time_steps=args.length_expert_TS,
            seed_agent=args.seed_agent,
            seed_expert=args.seed_expert,
            adam_days=args.adam_days)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    sample_env.initialize_environment(
        case=args.state_rep,
        n_historical_events=args.n_historical_events,
        episode_length=args.episode_length,
        n_experts=args.n_experts,
        n_demos_per_expert=1,
        n_expert_time_steps=args.length_expert_TS,
        seed_agent=args.seed_agent,
        seed_expert=args.seed_expert,
        adam_days=args.adam_days)
    demonstrations = sample_env.generate_expert_trajectories(out_dir=dst, eval=False)
    # sample_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    # is None for this environment, so no timestep limit is imposed.
    timestep_limit = None

    # Generate expert data for evaluation
    temp_env = gym.make(args.env)
    temp_env.initialize_environment(
        case=args.state_rep,
        n_historical_events=args.n_historical_events,
        episode_length=0,  # Does not really matter: this env only generates samples
        n_experts=args.n_experts,
        n_demos_per_expert=1,  # We do not perform any clustering right now
        # n_demos_per_expert=args.n_demos_per_expert,  # How large should the expert cluster be?
        n_expert_time_steps=args.eval_episode_length,  # How long should each expert trajectory be?
        seed_expert=args.seed_expert,
        adam_days=args.adam_days)
    temp_env.generate_expert_trajectories(out_dir=dst, eval=True)

    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Observation dimensionality depends on the chosen state representation
    if args.state_rep == 1:
        obs_dim = obs_space.low.size
    elif args.state_rep in (2, 21, 22, 24, 4, 221, 222, 71, 17, 81):
        obs_dim = obs_space.n
    elif args.state_rep in (3, 11, 23, 31, 7):
        obs_dim = obs_space.nvec.size
    else:
        raise NotImplementedError

    # Normalize observations based on their empirical mean and variance
    if args.normalize_obs:
        obs_normalizer = chainerrl.links.EmpiricalNormalization(
            obs_dim, clip_threshold=5)  # First argument: shape of input values except batch axis
    else:
        obs_normalizer = None

    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_dim, action_space.n, hidden_sizes=args.G_layers)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFGaussian':
        model = A3CFFGaussian(obs_space.low.size, action_space,
                              bound_mean=args.bound_mean)

    opt = chainer.optimizers.Adam(alpha=args.lr, eps=10e-1)  # eps = 1.0
    opt.setup(model)

    if args.show_D_dummy:
        # Let the discriminator see the dummy variable
        input_dim_D = obs_dim + 1
    else:
        # Do not let the discriminator see the dummy variable
        if args.state_rep == 21 or args.state_rep == 17:
            input_dim_D = obs_dim + 1
        else:
            input_dim_D = obs_dim + 1 - args.n_experts

    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    if args.algo == 'ppo':
        agent = PPO(
            model,
            opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            update_interval=args.update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
        )
    elif args.algo == 'gail':
        from customer_behaviour.algorithms.irl.gail import GAIL as G
        from customer_behaviour.algorithms.irl.gail import Discriminator as D
        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu, input_dim=input_dim_D,
              hidden_sizes=args.D_layers, loss_type=args.loss_type)
        agent = G(
            env=sample_env,
            demonstrations=demonstrations,
            discriminator=D,
            model=model,
            optimizer=opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            update_interval=args.update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
            args=args)
    elif args.algo == 'airl':
        from customer_behaviour.algorithms.irl.airl import AIRL as G
        from customer_behaviour.algorithms.irl.airl import Discriminator as D
        # obs_normalizer = None
        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu, input_dim=input_dim_D - 1,
              hidden_sizes=args.D_layers)  # AIRL only feeds the state to D
        agent = G(
            env=sample_env,
            demonstrations=demonstrations,
            discriminator=D,
            model=model,
            optimizer=opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            update_interval=args.update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
            noise=args.noise,
            n_experts=args.n_experts,
            episode_length=args.episode_length,
            adam_days=args.adam_days,
            dummy_D=args.show_D_dummy)
    elif args.algo == 'mmct-gail':
        from customer_behaviour.algorithms.irl.gail.mmct_gail import MMCTGAIL as G
        from customer_behaviour.algorithms.irl.gail import Discriminator as D
        demonstrations = np.load(dst + '/expert_trajectories.npz')
        D = D(gpu=args.gpu, input_dim=input_dim_D,
              hidden_sizes=args.D_layers, loss_type=args.loss_type)
        agent = G(
            env=sample_env,
            demonstrations=demonstrations,
            discriminator=D,
            model=model,
            optimizer=opt,
            obs_normalizer=obs_normalizer,
            gpu=args.gpu,
            update_interval=args.update_interval,
            minibatch_size=args.batchsize,
            epochs=args.epochs,
            clip_eps_vf=None,
            entropy_coef=args.entropy_coef,
            standardize_advantages=args.standardize_advantages,
            args=args)

    if args.load:
        # Not taken by default
        agent.load(args.load)

    if args.demo:
        # Not taken by default
        env = make_env(True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        outdir = args.load if args.load else args.outdir
        save_agent_demo(make_env(False), agent, outdir)
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.alpha = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        # Linearly decay the clipping parameter to zero
        def clip_eps_setter(env, agent, value):
            agent.clip_eps = max(value, 1e-8)

        clip_eps_decay_hook = experiments.LinearInterpolationHook(
            args.steps, 0.2, 0, clip_eps_setter)

        if train_env is None:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=make_env(False),  # Environment to train against (False -> scaled rewards)
                eval_env=make_env(True),  # Environment used for evaluation
                outdir=args.outdir,
                steps=args.steps,  # Total number of training timesteps (args.n_training_episodes * args.episode_length)
                eval_n_steps=None,  # Number of timesteps at each evaluation phase
                eval_n_episodes=args.eval_n_runs,  # Number of episodes at each evaluation phase (default: 10)
                eval_interval=args.eval_interval,  # Interval of evaluation (default: 10000 steps)
                train_max_episode_len=timestep_limit,  # Maximum episode length during training (None here)
                save_best_so_far_agent=False,
                step_hooks=[
                    lr_decay_hook,
                    clip_eps_decay_hook,
                ],
                checkpoint_freq=args.eval_interval)
        else:
            experiments.train_agent_batch_with_evaluation(
                agent=agent,
                env=train_env,
                steps=args.steps,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                outdir=args.outdir,
                max_episode_len=timestep_limit,
                eval_max_episode_len=None,
                eval_env=make_env(True),
                step_hooks=[
                    lr_decay_hook,
                    clip_eps_decay_hook,
                ],
                save_best_so_far_agent=False,
                checkpoint_freq=args.eval_interval,
                log_interval=args.update_interval)

        save_agent_demo(
            make_env(True), agent, args.outdir,
            10 * args.eval_episode_length)  # Originally make_env(test=False), which seems strange

    # Move result files to the correct folder and remove the now-empty folder
    move_dir(args.outdir, dst)
    os.rmdir(args.outdir)

    if args.save_results:
        print('Saving result...')
        res2.save_data(dst, 10000, 50, N=1)
        print('Running evaluate policy...')
        ep.eval_policy(a_dir_path=dst)
        # else:
        #     if args.n_experts <= 10:
        #         print('Running evaluate policy...')
        #         ep.eval_policy(a_dir_path=dst)
        #         # print('Running evaluate training...')
        #         # ets.eval_training(a_dir_path=dst)
        #     print('Done')

    if args.save_report_material:
        print('Saving dataframe...')
        if args.state_rep == 21:
            if args.algo == 'gail':
                folder_name = 'gail'
            elif args.algo == 'airl':
                folder_name = 'airl'
        elif args.state_rep == 22:
            if args.algo == 'gail':
                folder_name = 'gail_dummies'
            elif args.algo == 'airl':
                folder_name = 'airl_dummies'
        elif args.state_rep == 81:
            if args.algo == 'gail':
                folder_name = 'gail_adams'
            elif args.algo == 'airl':
                folder_name = 'airl_adams'
        elif args.state_rep == 17:
            folder_name = 'ail'
        elif args.state_rep == 221:
            folder_name = 'ail_dummies'
        elif args.state_rep == 71:
            folder_name = 'ail_adams'
        report_material.save_df(dst, folder_name)

    if args.save_folder is not None:
        print('Saving result to ' + args.save_folder)
        os.makedirs(os.path.join(os.getcwd(), args.save_folder), exist_ok=True)
        from distutils.dir_util import copy_tree
        copy_tree(
            os.path.join(os.getcwd(), dst),
            os.path.join(os.getcwd(), args.save_folder,
                         args.outdir.split('/')[-2]))
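# Optional sketch, not part of the original script: the folder_name selection in
# main() above can equivalently be written as a lookup table keyed on
# (state_rep, algo). REPORT_FOLDERS and get_report_folder are names introduced
# here purely for illustration.
REPORT_FOLDERS = {
    (21, 'gail'): 'gail',
    (21, 'airl'): 'airl',
    (22, 'gail'): 'gail_dummies',
    (22, 'airl'): 'airl_dummies',
    (81, 'gail'): 'gail_adams',
    (81, 'airl'): 'airl_adams',
}
# For these state representations the folder does not depend on the algorithm.
REPORT_FOLDERS_ANY_ALGO = {17: 'ail', 221: 'ail_dummies', 71: 'ail_adams'}


def get_report_folder(state_rep, algo):
    # Mirrors the if/elif chain in main(): try the (state_rep, algo) pair first,
    # then fall back to the algorithm-independent mapping.
    if (state_rep, algo) in REPORT_FOLDERS:
        return REPORT_FOLDERS[(state_rep, algo)]
    return REPORT_FOLDERS_ANY_ALGO[state_rep]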