def init_agent():
    """Build a SPIRAL agent on a 3x3 toy problem.

    Returns:
        The ``spiral.SPIRAL`` agent wired to a freshly created generator,
        discriminator, their RMSpropAsync optimizers and a toy dataset.
    """
    image_size = 3

    # The environment is instantiated and immediately discarded.
    # NOTE(review): presumably kept for a constructor side effect -- confirm.
    ToyEnv(image_size)

    generator = SpiralToyModel(image_size, False)
    discriminator = SpiralToyDiscriminator(image_size, False)

    gen_optimizer = rmsprop_async.RMSpropAsync()
    dis_optimizer = rmsprop_async.RMSpropAsync()
    gen_optimizer.setup(generator)
    dis_optimizer.setup(discriminator)

    # The same pattern list is passed twice to the dataset.
    patterns = [(1, 4, 7)]
    toy_dataset = ToyDataset(image_size, patterns, patterns)

    hyperparameters = dict(
        conditional=True,
        reward_mode='wgangp',
        imsize=image_size,
        max_episode_steps=3,
        rollout_n=1,
        gamma=0.99,
        beta=0.001,
        gp_lambda=10.0,
        lambda_R=1.0,
        staying_penalty=10.0,
        empty_drawing_penalty=1.0,
        n_save_final_obs_interval=10000,
        outdir='/tmp/chainer_spiral_test',
    )
    return spiral.SPIRAL(
        generator=generator,
        discriminator=discriminator,
        gen_optimizer=gen_optimizer,
        dis_optimizer=dis_optimizer,
        dataset=toy_dataset,
        **hyperparameters)
def get_pretrained_agent(agent_path="./"):
    """Create an A3C agent and restore its state from ``agent_path``.

    Args:
        agent_path: Location handed to ``agent.load``; defaults to ``"./"``.

    Returns:
        The ``a3c.A3C`` agent after ``load`` has been called on it.
    """
    network = pretrained_NN(ndim_obs=9, n_actions=9)

    rmsprop = rmsprop_async.RMSpropAsync()
    rmsprop.setup(network)

    restored = a3c.A3C(network, rmsprop, t_max=5, gamma=0.99, beta=1e-2)
    restored.load(agent_path)
    return restored
def make_a3c_agent(obs_space_dim, action_space_dim):
    """Construct an A3C agent around an ``A3CLSTMGaussian`` model.

    Args:
        obs_space_dim: First argument forwarded to ``A3CLSTMGaussian``.
        action_space_dim: Second argument forwarded to ``A3CLSTMGaussian``.

    Returns:
        The configured ``a3c.A3C`` agent.
    """
    network = A3CLSTMGaussian(obs_space_dim, action_space_dim)

    rmsprop = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    rmsprop.setup(network)
    # Gradients are clipped with a threshold of 40.
    rmsprop.add_hook(chainer.optimizer.GradientClipping(40))

    # `phi` is resolved from the enclosing/module scope.
    return a3c.A3C(network, rmsprop, t_max=5, gamma=1, beta=1e-2, phi=phi)
def make_agent(process_idx):
    """Create the NSQ agent used by the process with index ``process_idx``.

    Relies on ``self``, ``ndim_obs``, ``n_actions`` and ``random_action_func``
    from the enclosing scope.

    Args:
        process_idx: Index of the worker process; with the epsilon-greedy
            explorer the constant epsilon is ``process_idx / 10``.

    Returns:
        An ``nsq.NSQ`` agent.
    """
    hidden_units = 50

    # Recurrent or feed-forward Q-function; both take the same arguments.
    q_func_cls = (FCLSTMStateQFunction if self.use_lstm
                  else FCStateQFunctionWithDiscreteAction)
    q_function = q_func_cls(
        ndim_obs, n_actions,
        n_hidden_channels=hidden_units,
        n_hidden_layers=2)

    rmsprop = rmsprop_async.RMSpropAsync(lr=1e-3, eps=1e-2, alpha=0.99)
    rmsprop.setup(q_function)

    if self.explorer != 'epsilon_greedy':
        exploration = chainerrl.explorers.Boltzmann()
    else:
        exploration = chainerrl.explorers.ConstantEpsilonGreedy(
            process_idx / 10, random_action_func)

    return nsq.NSQ(
        q_function, rmsprop,
        t_max=self.t_max,
        gamma=0.9,
        i_target=100,
        explorer=exploration)
def create_a3c_agent():
    """Build an A3C agent for the architecture selected by ``args.arch``.

    The observation size (8006) and the number of actions (6) are hard-coded
    instead of being read from an environment. Optimizer and agent
    hyper-parameters come from the module-level ``args``.

    Returns:
        The constructed ``a3c.A3C`` agent.

    Raises:
        ValueError: If ``args.arch`` is neither ``'FFSoftmax'`` nor
            ``'FFMellowmax'``.
    """
    obs_size = 8006
    n_actions = 6

    # Switch policy types according to the requested architecture.
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_size, n_actions)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_size, n_actions)
    else:
        # Without this branch `model` would stay unbound and the function
        # would fail further down with a confusing, unrelated error.
        raise ValueError(
            'Unsupported architecture: {!r} '
            "(expected 'FFSoftmax' or 'FFMellowmax')".format(args.arch))

    # No GPU transfer is performed here: the model is used as constructed.

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta)
    return agent
def __init__(self, gpu=False):
    """Create the model, optimizer and A3C agent held by this object.

    Args:
        gpu: Passed to ``A3CFFSoftmax``; when truthy the model is also moved
            to GPU device 0.
    """
    network = A3CFFSoftmax(gpu)
    self.model = network
    if gpu:
        network.to_gpu(0)

    rmsprop = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    self.optimizer = rmsprop

    # NOTE(review): the optimizer is handed to the agent without `setup`
    # being called in this method, and the clipping hook below is only
    # stored, not registered -- presumably done elsewhere; confirm.
    self.agent = a3c.A3C(
        network, rmsprop,
        t_max=5, gamma=0.99, beta=1e-2, phi=phi)

    gradient_clipping = chainer.optimizer.GradientClipping(40)
    self.add_hooks = [gradient_clipping]
def create_async_learner(cfg_name):
    """
    Builds a chainerrl learner for asynchronous training from a named config.

    The network class is looked up in ``models`` and the learner class in
    ``agents``; their keyword arguments come from the NETWORK, OPTIMIZER and
    ALGORITHM sections of the config.

    :param cfg_name: type str, the name of the config
    :return: chainerrl agent specified in config
    """
    config = Config(cfg_name)

    # Network: class name from BASIC/network, kwargs from the NETWORK section.
    network_cls = getattr(models, config.get_str('BASIC', 'network'))
    network = network_cls(**config.get_section('NETWORK'))

    # Optimizer with gradient clipping at BASIC/grad_clip.
    opt = rmsprop_async.RMSpropAsync(**config.get_section('OPTIMIZER'))
    opt.setup(network)
    clip_threshold = config.get_float('BASIC', 'grad_clip')
    opt.add_hook(optimizer.GradientClipping(threshold=clip_threshold))

    # Learner: class name from BASIC/learner, kwargs from ALGORITHM.
    learner_cls = getattr(agents, config.get_str('BASIC', 'learner'))
    return learner_cls(network, opt, **config.get_section('ALGORITHM'))
def _test_load_a3c(self, gpu):
    """Check that a downloaded A3C Breakout model can be loaded into an agent.

    Args:
        gpu: Accepted for signature compatibility; not used in the body.
    """
    network = A3CFF(4)
    rmsprop = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    rmsprop.setup(network)
    agent = agents.A3C(
        network, rmsprop,
        t_max=5, gamma=0.99, beta=1e-2,
        phi=lambda x: x)

    # `download_model` yields what to load plus a flag, named `exists` in
    # the original code, that is asserted below when caching is required.
    downloaded, exists = download_model(
        "A3C", "BreakoutNoFrameskip-v4",
        model_type=self.pretrained_type)
    agent.load(downloaded)

    must_be_cached = os.environ.get(
        'CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED')
    if must_be_cached:
        assert exists
def create_acer_agent(env):
    """Build an ACER agent with separate policy and Q networks for ``env``.

    Args:
        env: Environment providing ``observation_space.shape[0]`` (input
            size) and ``action_space.n`` (number of discrete actions).

    Returns:
        The configured ``acer.ACER`` agent.
    """
    # Dimension of the observation vector and number of available actions.
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    def build_head(output_wrapper):
        # obs_dim -> 1024 -> 512 -> n_actions with ReLU in between; the last
        # element wraps the raw outputs.
        return links.Sequence(
            L.Linear(obs_dim, 1024, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(1024, 512, initialW=LeCunNormal(1e-3)),
            F.relu,
            L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
            output_wrapper)

    # The ACER network consists of pi (the policy) and q (the Q-function).
    model = acer.ACERSeparateModel(
        pi=build_head(SoftmaxDistribution),
        q=build_head(DiscreteActionValue),
    )

    # Optimizer for the model, with gradient clipping hooked in.
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(128)

    def to_float32(obs):
        return obs.astype(np.float32, copy=False)

    return acer.ACER(
        model, opt,
        gamma=0.95,                  # reward discount factor
        t_max=32,                    # update after this many local steps
        replay_buffer=replay_buffer,
        n_times_replay=4,            # replay repetitions per update
        replay_start_size=64,        # minimum buffer size before replay
        disable_online_update=True,  # rely only on the experience buffer
        use_trust_region=True,       # enable trust region optimization
        trust_region_delta=0.1,      # trust region parameter
        truncation_threshold=5.0,    # truncate large importance weights
        beta=1e-2,                   # entropy regularization parameter
        phi=to_float32)
def make_acer_agent(obs_space_dim, action_space_dim):
    """Create an ACER agent for continuous actions bounded to [-1, 1].

    Args:
        obs_space_dim: Number of observation dimensions.
        action_space_dim: Number of action dimensions.

    Returns:
        The configured ``chainerrl.agents.acer.ACER`` agent.
    """
    def phi(obs):
        return obs.astype(np.float32, copy=False)

    # Box spaces spanning -1..1 in every dimension; only their bounds and
    # sizes are used below.
    obs_space = gym.spaces.Box(
        np.array([-1] * obs_space_dim), np.array([1] * obs_space_dim))
    action_space = gym.spaces.Box(
        np.array([-1] * action_space_dim), np.array([1] * action_space_dim))

    hidden_units = 50
    policy = policies.FCGaussianPolicy(
        obs_space.low.size, action_space.low.size,
        n_hidden_channels=hidden_units,
        n_hidden_layers=2,
        bound_mean=True,
        min_action=action_space.low,
        max_action=action_space.high)
    value_function = v_functions.FCVFunction(
        obs_space.low.size,
        n_hidden_channels=hidden_units,
        n_hidden_layers=2)
    # The advantage network uses a quarter of the hidden width.
    advantage = q_functions.FCSAQFunction(
        obs_space.low.size, action_space.low.size,
        n_hidden_channels=hidden_units // 4,
        n_hidden_layers=2)
    model = chainerrl.agents.acer.ACERSDNSeparateModel(
        pi=policy, v=value_function, adv=advantage)

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(10)
    return chainerrl.agents.acer.ACER(
        model, opt,
        t_max=5,
        gamma=1,
        replay_buffer=replay_buffer,
        n_times_replay=1,
        replay_start_size=50,
        disable_online_update=False,
        use_trust_region=True,
        trust_region_delta=0.1,
        truncation_threshold=5,
        beta=0.5,
        phi=phi)
def CREATE_AGENT(env, agent_name):
    """Create the agent identified by ``agent_name`` for ``env``.

    Args:
        env: Environment exposing ``OBSDIM`` and ``action_space_d``.
        agent_name: ``"DoubleDQN"`` or ``"A3CFF"``.

    Returns:
        The requested agent, or ``None`` when ``agent_name`` matches neither
        supported name.
    """
    gamma = 0.9

    if agent_name == "DoubleDQN":
        q_function = QFunction(
            env.OBSDIM * (NUM_EYES), env.action_space_d.n)

        adam = chainer.optimizers.Adam(eps=1e-2)
        adam.setup(q_function)

        # Epsilon decays linearly from 0.5 to 0.1 over 10000 steps.
        exploration = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5,
            end_epsilon=0.1,
            decay_steps=10000,
            random_action_func=env.action_space_d.sample)

        memory = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

        return chainerrl.agents.DoubleDQN(
            q_function, adam, memory, gamma, exploration,
            replay_start_size=500,
            update_interval=1,
            target_update_interval=100)

    if agent_name == "A3CFF":
        network = A3CFF(env.OBSDIM * (NUM_EYES), env.action_space_d.n)

        rmsprop = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.9)
        rmsprop.setup(network)
        rmsprop.add_hook(chainer.optimizer.GradientClipping(40))

        return a3c.A3C(
            network, rmsprop,
            t_max=4, gamma=0.9, beta=1e-2, phi=dqn_phi)

    # Any other name yields no agent.
    return None
def main():
    """Command-line entry point: train or evaluate A3C on a Gym environment.

    Parses the options, builds the model chosen by ``--arch``, wraps it in an
    ``a3c.A3C`` agent and then either evaluates it (``--demo``) or trains it
    asynchronously with ``processes`` workers.
    """
    import logging

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('processes', type=int)
    add('--env', type=str, default='CartPole-v0')
    add('--arch', type=str, default='FFSoftmax',
        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    add('--seed', type=int, default=None)
    add('--outdir', type=str, default=None)
    add('--t-max', type=int, default=5)
    add('--beta', type=float, default=1e-2)
    add('--profile', action='store_true')
    add('--steps', type=int, default=8 * 10**7)
    add('--eval-interval', type=int, default=10**5)
    add('--eval-n-runs', type=int, default=10)
    add('--reward-scale-factor', type=float, default=1e-2)
    add('--rmsprop-epsilon', type=float, default=1e-1)
    add('--render', action='store_true', default=False)
    add('--lr', type=float, default=7e-4)
    add('--weight-decay', type=float, default=0.0)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default='')
    add('--logger-level', type=int, default=logging.DEBUG)
    add('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        """Create the environment for one worker (training or test)."""
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Training agents observe rewards scaled by the given factor.
            misc.env_modifiers.make_reward_filtered(
                env, lambda reward: reward * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    # A throw-away environment supplies the spaces and the step limit.
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # The policy type follows the requested architecture.
    obs_size = obs_space.low.size
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    if not args.demo:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
        return

    # Demo mode: evaluate only and print summary statistics.
    demo_env = make_env(0, True)
    eval_stats = experiments.eval_performance(
        env=demo_env,
        agent=agent,
        n_runs=args.eval_n_runs,
        max_episode_len=timestep_limit)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
def main():
    """Command-line entry point: train or evaluate a PCL agent on Gym.

    Supports synchronous training (Adam, optionally on GPU) and asynchronous
    training (``--train-async``, RMSpropAsync), as well as evaluation only
    (``--demo``).
    """
    import logging

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--processes', type=int, default=8)
    add('--gpu', type=int, default=0)
    add('--env', type=str, default='CartPole-v0')
    add('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)')
    add('--outdir', type=str, default=None)
    add('--batchsize', type=int, default=10)
    add('--rollout-len', type=int, default=10)
    add('--n-hidden-channels', type=int, default=100)
    add('--n-hidden-layers', type=int, default=2)
    add('--n-times-replay', type=int, default=1)
    add('--replay-start-size', type=int, default=10000)
    add('--t-max', type=int, default=None)
    add('--tau', type=float, default=1e-2)
    add('--profile', action='store_true')
    add('--steps', type=int, default=8 * 10**7)
    add('--eval-interval', type=int, default=10**5)
    add('--eval-n-runs', type=int, default=10)
    add('--reward-scale-factor', type=float, default=1e-2)
    add('--render', action='store_true', default=False)
    add('--lr', type=float, default=7e-4)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default='')
    add('--logger-level', type=int, default=logging.DEBUG)
    add('--monitor', action='store_true')
    add('--train-async', action='store_true', default=False)
    add('--prioritized-replay', action='store_true', default=False)
    add('--disable-online-update', action='store_true', default=False)
    add('--backprop-future-values', action='store_true', default=True)
    add('--no-backprop-future-values', action='store_false',
        dest='backprop_future_values')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # With async training (--train-async) results are no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if args.train_async:
        # Each subprocess gets its own seed.
        # seed=0, processes=4 -> [0, 1, 2, 3]
        # seed=1, processes=4 -> [4, 5, 6, 7]
        process_seeds = np.arange(args.processes) + args.seed * args.processes
        assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        """Create a seeded environment for one worker (training or test)."""
        env = gym.make(args.env)
        # Train and test environments use different (mirrored) seeds.
        if args.train_async:
            base_seed = int(process_seeds[process_idx])
        else:
            base_seed = args.seed
        env_seed = 2**32 - 1 - base_seed if test else base_seed
        env.seed(env_seed)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Training agents observe rewards scaled by the given factor.
            misc.env_modifiers.make_reward_filtered(
                env, lambda reward: reward * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    hidden = dict(n_hidden_channels=args.n_hidden_channels,
                  n_hidden_layers=args.n_hidden_layers)

    # The policy type follows the action space type.
    if isinstance(action_space, gym.spaces.Box):
        policy = chainerrl.policies.FCGaussianPolicy(
            obs_space.low.size, action_space.low.size,
            bound_mean=True,
            min_action=action_space.low,
            max_action=action_space.high,
            var_wscale=1e-3,
            var_bias=1,
            var_type='diagonal',
            **hidden)
    else:
        policy = chainerrl.policies.FCSoftmaxPolicy(
            obs_space.low.size, action_space.n, **hidden)
    value_function = chainerrl.v_functions.FCVFunction(
        obs_space.low.size, **hidden)
    model = chainerrl.agents.pcl.PCLSeparateModel(
        pi=policy, v=value_function)

    if not args.train_async and args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    if args.train_async:
        opt = rmsprop_async.RMSpropAsync(lr=args.lr, alpha=0.99)
    else:
        opt = chainer.optimizers.Adam(alpha=args.lr)
    opt.setup(model)

    buffer_capacity = 5 * 10 ** 3
    if args.prioritized_replay:
        replay_buffer = \
            chainerrl.replay_buffer.PrioritizedEpisodicReplayBuffer(
                capacity=buffer_capacity,
                uniform_ratio=0.1,
                default_priority_func=exp_return_of_episode,
                wait_priority_after_sampling=False,
                return_sample_weights=False)
    else:
        replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(
            capacity=buffer_capacity)

    agent = chainerrl.agents.PCL(
        model, opt,
        replay_buffer=replay_buffer,
        t_max=args.t_max,
        gamma=0.99,
        tau=args.tau,
        phi=lambda x: x.astype(np.float32, copy=False),
        rollout_len=args.rollout_len,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        batchsize=args.batchsize,
        train_async=args.train_async,
        disable_online_update=args.disable_online_update,
        backprop_future_values=args.backprop_future_values,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation only: print summary statistics and stop.
        demo_env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=demo_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        return

    if args.train_async:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=make_env(0, test=False),
        eval_env=make_env(0, test=True),
        outdir=args.outdir,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        max_episode_len=timestep_limit)
def __init__(self, ndim_obs, n_actions, hidden_sizes=(50, 50, 50)):
    # NOTE(review): the `self` parameter and the `super()` call indicate this
    # is a method; the enclosing class header is not visible in this excerpt.
    # Policy head: softmax policy over an MLP mapping ndim_obs -> n_actions.
    self.pi = policies.SoftmaxPolicy(model=links.MLP(
        ndim_obs, n_actions, hidden_sizes, nonlinearity=F.tanh))
    # Value head: MLP with a single output and the same hidden sizes.
    self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes,
                       nonlinearity=F.tanh)
    super().__init__(self.pi, self.v)


def pi_and_v(self, state):
    # Returns the outputs of both heads for `state` as a (pi, v) tuple.
    return self.pi(state), self.v(state)


# load trained A3C agent
# NOTE(review): `agent_path` is not defined in this excerpt -- presumably a
# module-level name set elsewhere; confirm.
model = A3CFFSoftmax(ndim_obs=9, n_actions=9)
opt = rmsprop_async.RMSpropAsync()
opt.setup(model)
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99)
agent.load(agent_path)


def check_game(field, action):
    # Handles a move on `field` for the linearized board position `action`.
    # NOTE(review): the function appears to continue beyond this excerpt.
    global player_symbol, CPU_symbol, state
    # action is linearized to 0 to 8: break down into 3 x 3 array
    row, col = int(int(action) / 3), int(action) % 3
    if field["text"] == " ":  # user performed legal move
        # Mark the field with the player's symbol, disable it and record the
        # move as -1 in the board state.
        field["text"] = player_symbol
        field["state"] = "disabled"
        state[row][col] = -1
def main():
    """Command-line entry point: train or evaluate A3C on an Ising environment.

    Parses the options, builds an ``A3CFFSoftmax`` model with an RMSpropAsync
    optimizer, and then either evaluates the agent (``--demo``) or trains it
    asynchronously.

    The positional ``processes`` argument is optional and defaults to 4, and
    ``--arch`` only accepts the exact string ``'FFSoftmax'``.
    """
    import logging

    parser = argparse.ArgumentParser()
    # increase for more asynchronous workers
    # nargs='?' is required for the default of a positional to take effect.
    parser.add_argument('processes', type=int, nargs='?', default=4)
    # set directory to which output files will be written
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    # specify environment to explore
    parser.add_argument('--env', type=str, default='1DIsing-A3C-v0')
    # maximum number of steps before training ends
    parser.add_argument('--steps', type=int, default=1 * 10 ** 7)
    # frequency at which the agent will be evaluated
    parser.add_argument('--eval-interval', type=int, default=10**4)
    # number of evaluation runs per evaluation
    parser.add_argument('--eval-n-runs', type=int, default=10)
    # NN to use for policy and state value estimates
    # The trailing comma makes this a tuple; a bare string would turn the
    # membership test into a substring check.
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax',))
    # increase for later truncation of the sum
    parser.add_argument('--t-max', type=int, default=5)
    # increase for more exploration
    parser.add_argument('--beta', type=float, default=1e-2)
    # increase for less discount of future rewards
    parser.add_argument('--gamma', type=float, default=0.99)
    # decrease for slower learning rate
    parser.add_argument('--lr', type=float, default=1 * 1e-4)
    # turn on to get weight decay
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--seed', type=int, default=17,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    # set to logging.DEBUG for (much more) information
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        """Create a seeded, wrapped environment for one worker."""
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    model = A3CFFSoftmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                    beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
def main():
    """Command-line entry point: train or evaluate n-step Q-learning on Atari.

    Parses the options, builds a shared Q-function and RMSpropAsync
    optimizer, and then either evaluates one agent (``--demo``) or trains
    ``processes`` agents asynchronously with a linearly decaying learning
    rate.

    In demo mode the agent is restored from ``--load`` when that option is
    given, so the evaluation reflects the saved weights.
    """
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--max-episode-len', type=int,
                        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
                        help='Maximum number of steps for each episode.')
    parser.add_argument('--final-exploration-frames', type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        """Create a seeded, wrapped Atari environment for one worker."""
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        """Create an NSQ agent with a randomly chosen final epsilon."""
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        if args.load:
            # Without this the demo would evaluate an untrained network and
            # the --load option would be silently ignored.
            agent.load(args.load)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    """Command-line entry point: train or evaluate n-step Q-learning on ALE.

    Parses the options, builds a shared Q-function and RMSpropAsync
    optimizer, and then either evaluates one agent (``--demo``) or trains
    ``processes`` agents asynchronously with a linearly decaying learning
    rate.

    In demo mode the agent is restored from ``--load`` when that option is
    given, so the evaluation reflects the saved weights.
    """
    # This prevents numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='nsq_output')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        """Create the ALE environment for one worker (training or test)."""
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            # Training rewards are clipped to [-1, 1].
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        """Create an NSQ agent with a randomly chosen final epsilon."""
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        if args.load:
            # Without this the demo would evaluate an untrained network and
            # the --load option would be silently ignored.
            agent.load(args.load)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Evaluation during training uses a constant epsilon of 0.05.
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            eval_explorer=explorer,
            global_step_hooks=[lr_decay_hook])
def main():
    """Command-line entry point: train or demo an ACER agent on a gym env.

    Parses the options, builds per-process environments and an ACER agent
    (a Gaussian-policy model for ``spaces.Box`` action spaces, a softmax
    model otherwise), then either evaluates the agent (``--demo``) or
    trains it with ``experiments.train_agent_async``.
    """
    import logging

    option_specs = [
        (('processes',), dict(type=int)),
        (('--env',), dict(type=str, default='CartPole-v0')),
        (('--seed',), dict(type=int, default=0,
                           help='Random seed [0, 2 ** 32)')),
        (('--outdir',), dict(
            type=str, default='results',
            help='Directory path to save output files.'
                 ' If it does not exist, it will be created.')),
        (('--t-max',), dict(type=int, default=50)),
        (('--n-times-replay',), dict(type=int, default=4)),
        (('--n-hidden-channels',), dict(type=int, default=100)),
        (('--n-hidden-layers',), dict(type=int, default=2)),
        (('--replay-capacity',), dict(type=int, default=5000)),
        (('--replay-start-size',), dict(type=int, default=10**3)),
        (('--disable-online-update',), dict(action='store_true')),
        (('--beta',), dict(type=float, default=1e-2)),
        (('--profile',), dict(action='store_true')),
        (('--steps',), dict(type=int, default=8 * 10**7)),
        (('--eval-interval',), dict(type=int, default=10**5)),
        (('--eval-n-runs',), dict(type=int, default=10)),
        (('--reward-scale-factor',), dict(type=float, default=1e-2)),
        (('--rmsprop-epsilon',), dict(type=float, default=1e-2)),
        (('--render',), dict(action='store_true', default=False)),
        (('--lr',), dict(type=float, default=7e-4)),
        (('--demo',), dict(action='store_true', default=False)),
        (('--load',), dict(type=str, default='')),
        (('--logger-level',), dict(type=int, default=logging.DEBUG)),
        (('--monitor',), dict(action='store_true')),
        (('--truncation-threshold',), dict(type=float, default=5)),
        (('--trust-region-delta',), dict(type=float, default=0.1)),
    ]
    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    args = cli.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Seed ChainerRL. With several processes the run is still not
    # deterministic, even for a fixed seed.
    misc.set_random_seed(args.seed)

    # One distinct seed per subprocess:
    # seed=0, processes=4 -> [0, 1, 2, 3]; seed=1, processes=4 -> [4, 5, 6, 7].
    seeds_per_process = args.seed * args.processes + np.arange(args.processes)
    assert seeds_per_process.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        is_first = process_idx == 0
        env = gym.make(args.env)
        # Train and test envs get different seeds
        base_seed = int(seeds_per_process[process_idx])
        if test:
            env_seed = 2**32 - 1 - base_seed
        else:
            env_seed = base_seed
        env.seed(env_seed)
        # The model works on float32 observations
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if is_first and args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Bring rewards (and hence returns) into a range that is
            # easier to train on
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
            if is_first and args.render:
                env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    max_steps = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    observation_space = sample_env.observation_space
    action_space = sample_env.action_space

    obs_dim = observation_space.low.size
    hidden = args.n_hidden_channels
    depth = args.n_hidden_layers

    if not isinstance(action_space, spaces.Box):
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_dim, hidden),
                F.relu,
                L.Linear(hidden, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_dim, hidden),
                F.relu,
                L.Linear(hidden, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                DiscreteActionValue),
        )
    else:
        act_dim = action_space.low.size
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_dim, act_dim,
                n_hidden_channels=hidden,
                n_hidden_layers=depth,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(
                obs_dim,
                n_hidden_channels=hidden,
                n_hidden_layers=depth),
            adv=q_functions.FCSAQFunction(
                obs_dim, act_dim,
                n_hidden_channels=hidden // 4,
                n_hidden_layers=depth),
        )

    optimizer = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(40))

    episodic_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(
        model, optimizer,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=episodic_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        disable_online_update=args.disable_online_update,
        use_trust_region=True,
        trust_region_delta=args.trust_region_delta,
        truncation_threshold=args.truncation_threshold,
        beta=args.beta)
    if args.load:
        agent.load(args.load)

    if not args.demo:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=max_steps)
        return

    demo_env = make_env(0, True)
    eval_stats = experiments.eval_performance(
        env=demo_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=max_steps)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
def main():
    """Train or evaluate an ACER agent on an ALE ROM from the command line.

    Positional arguments are the number of training processes and the ROM.
    With ``--demo`` the agent is evaluated for ``--eval-n-runs`` runs;
    otherwise it is trained with ``experiments.train_agent_async`` while the
    learning rate is interpolated linearly from ``--lr`` to 0 over
    ``--steps``. ``--load`` restores saved agent parameters first.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = acer.ACERSharedModel(
            shared=links.Sequence(
                links.NIPSDQNHead(),
                L.LSTM(256, 256)),
            pi=links.Sequence(
                L.Linear(256, n_actions),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(256, n_actions),
                DiscreteActionValue),
        )
    else:
        model = acer.ACERSharedModel(
            shared=links.NIPSDQNHead(),
            pi=links.Sequence(
                L.Linear(256, n_actions),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(256, n_actions),
                DiscreteActionValue),
        )
    # Start from the user-supplied learning rate. It was hard-coded to 7e-4,
    # so --lr only affected the decay hook below, not the initial value.
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)
    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta, phi=dqn_phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs.
        # Cast to a plain int so the env never receives a numpy scalar.
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook])
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):
    """Train a PCL agent on the ABC toy env, then run five test episodes.

    The model is an A3C shared model (LSTM trunk) when ``use_lstm`` is set
    and an A3C separate model otherwise, with a softmax policy for discrete
    actions or a Gaussian policy for continuous ones. When
    ``require_success`` is true, each test episode must return ~1.
    """
    nproc = 8

    def make_env(process_idx, test):
        return ABC(size=2,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    # Keyword arguments common to every fully connected head
    fc_kwargs = dict(n_hidden_channels=n_hidden_channels,
                     n_hidden_layers=2,
                     nonlinearity=F.relu)

    # Build links in the same order as a nested constructor call would:
    # shared trunk (if any), then policy, then value function.
    if use_lstm:
        trunk = L.LSTM(obs_space.low.size, n_hidden_channels)
        head_in_size = n_hidden_channels
    else:
        trunk = None
        head_in_size = obs_space.low.size

    if discrete:
        policy = policies.FCSoftmaxPolicy(
            head_in_size, action_space.n,
            last_wscale=1e-2,
            **fc_kwargs)
    else:
        policy = policies.FCGaussianPolicy(
            head_in_size, action_space.low.size,
            var_wscale=1e-2,
            var_bias=1,
            bound_mean=True,
            min_action=action_space.low,
            max_action=action_space.high,
            min_var=1e-1,
            **fc_kwargs)
    value_fn = v_function.FCVFunction(
        head_in_size, last_wscale=1e-2, **fc_kwargs)

    if use_lstm:
        model = a3c.A3CSharedModel(shared=trunk, pi=policy, v=value_fn)
    else:
        model = a3c.A3CSeparateModel(pi=policy, v=value_fn)

    if self.backprop_future_values:
        eps = 1e-8
    else:
        eps = 1e-1
    opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
    opt.setup(model)
    replay_buffer = chainerrl.replay_buffer.EpisodicReplayBuffer(10**5)
    agent = pcl.PCL(model, opt,
                    replay_buffer=replay_buffer,
                    t_max=t_max,
                    gamma=0.5,
                    tau=1e-2,
                    phi=phi,
                    n_times_replay=1,
                    batchsize=self.batchsize,
                    train_async=self.train_async,
                    backprop_future_values=self.backprop_future_values,
                    act_deterministically=True)

    # Settings shared by the async and the single-process training paths
    schedule = dict(outdir=self.outdir,
                    steps=steps,
                    max_episode_len=2,
                    eval_interval=200,
                    eval_n_runs=5,
                    successful_score=1)

    if self.train_async:
        with warnings.catch_warnings(record=True) as warns:
            chainerrl.experiments.train_agent_async(
                processes=nproc, make_env=make_env, agent=agent,
                **schedule)
            assert len(warns) == 0, warns[0]
        # Other processes may keep modifying the parameters after success,
        # so the agent left over from async training is not guaranteed to
        # be the successful one; load the saved successful model instead.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
    else:
        agent.process_idx = 0
        chainerrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            **schedule)

    agent.stop_episode()

    # Test
    env = make_env(0, True)

    def run_episode():
        episode_return = 0
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
        return episode_return

    for _ in range(5):
        total_r = run_episode()
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def main():
    """Train or evaluate an A2C agent on an Atari gym env from the CLI.

    Builds ``--num-envs`` environments wrapped in a
    ``MultiprocessVectorEnv`` and an A2C agent. With ``--demo`` the agent is
    evaluated for ``--eval-n-runs`` episodes; otherwise it is trained with
    ``experiments.train_agent_batch_with_evaluation``. ``--load`` restores
    saved agent parameters first.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results')
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes with 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--update-steps', type=int, default=5)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='discount factor')
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-5)
    parser.add_argument('--use-gae', action='store_true', default=False,
                        help='use generalized advantage estimation')
    parser.add_argument('--tau', type=float, default=0.95,
                        help='gae parameter')
    parser.add_argument('--alpha', type=float, default=0.99,
                        help='RMSprop optimizer alpha')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    # The help text used to read 'value loss coefficient', but this value is
    # only ever passed to GradientClipping below.
    parser.add_argument('--max-grad-norm', type=float, default=40,
                        help='gradient clipping threshold (max L2 norm)')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    # Kept so the argument namespace handed to prepare_output_dir is
    # unchanged, although nothing in this function reads use_lstm.
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        # One env factory per process index; partial binds idx and test so
        # each factory can be called without arguments.
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n

    model = A2CFF(n_actions)
    optimizer = rmsprop_async.RMSpropAsync(lr=args.lr,
                                           eps=args.rmsprop_epsilon,
                                           alpha=args.alpha)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.max_grad_norm))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a2c.A2C(
        model, optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
) logger.info('Gym is set up.') # Define agents to be used model = A3Cagent(obs_size, n_actions, args.nHidden) # Draw the computational graph and save it in the output directory. fake_obs = chainer.Variable(np.zeros(obs_size, dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph chainerrl.misc.draw_computational_graph( [model(fake_obs)], os.path.join(args.outdir, 'model')) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping)) if args.weight_decay > 0: opt.add_hook(NonbiasWeightDecay(args.weight_decay)) phi = lambda x: x.astype(np.float32, copy=False) agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta, phi=phi) lr_decay_hook = experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter) training = experiments.train_agent_async( agent=agent, outdir=args.outdir,
def main():
    """Train or evaluate an NSQ agent on an ALE ROM from the command line.

    Positional arguments are the number of training processes and the ROM.
    Without ``--demo`` the agents are trained through
    ``experiments.train_agent_async``; with ``--demo`` a single agent is
    evaluated for ``--eval-n-runs`` runs and the statistics are printed.
    ``--load`` names saved agent parameters to restore before a demo run.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int,
                        default=4 * 10**6)
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs.
        # Cast to a plain int so the env never receives a numpy scalar.
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    sample_env = make_env(0, test=False)
    action_space = sample_env.action_space
    assert isinstance(action_space, spaces.Discrete)

    # Define a model and its optimizer
    q_func = links.Sequence(links.NIPSDQNHead(),
                            L.Linear(256, action_space.n),
                            DiscreteActionValue)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(q_func)

    # Make process-specific agents to diversify exploration
    def make_agent(process_idx):
        # Random epsilon assignment described in the original paper
        rand = random.random()
        if rand < 0.4:
            epsilon_target = 0.1
        elif rand < 0.7:
            epsilon_target = 0.01
        else:
            epsilon_target = 0.5
        explorer = explorers.LinearDecayEpsilonGreedy(
            1, epsilon_target, args.final_exploration_frames,
            action_space.sample)
        # Suppress the explorer logger
        explorer.logger.setLevel(logging.INFO)
        return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                       i_target=40000,
                       explorer=explorer, phi=dqn_phi)

    if args.demo:
        env = make_env(0, True)
        agent = make_agent(0)
        if args.load:
            # --load used to be parsed but never applied, so a demo run
            # evaluated a freshly initialised network.
            agent.load(args.load)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        explorer = explorers.ConstantEpsilonGreedy(0.05, action_space.sample)

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            make_agent=make_agent,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            eval_explorer=explorer,
            global_step_hooks=[lr_decay_hook])
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True,
              steps=100000, require_success=True):
    """Train an ACER agent on the ABC toy env, then run five test episodes.

    The model is a shared (LSTM trunk) ACER model when ``use_lstm`` is set
    and a separate one otherwise; discrete actions use the pi/q variant and
    continuous actions the SDN pi/v/adv variant. Returns early, without
    training, when replay and online updates are both disabled. When
    ``require_success`` is true, each test episode must return ~1.
    """
    nproc = 8

    def make_env(process_idx, test):
        return ABC(size=2,
                   discrete=discrete,
                   episodic=episodic or test,
                   partially_observable=self.use_lstm,
                   deterministic=test)

    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    def phi(x):
        return x

    n_hidden_channels = 20
    # Keyword arguments common to every fully connected head
    fc_kwargs = dict(n_hidden_channels=n_hidden_channels,
                     n_hidden_layers=1,
                     nonlinearity=F.leaky_relu)

    replay_buffer = EpisodicReplayBuffer(10**4)

    # Build links in the same order as a nested constructor call would:
    # shared trunk first (if any), then the heads in keyword order.
    if use_lstm:
        trunk_kwargs = dict(
            shared=L.LSTM(obs_space.low.size, n_hidden_channels))
        head_in_size = n_hidden_channels
    else:
        trunk_kwargs = {}
        head_in_size = obs_space.low.size

    if discrete:
        heads = dict(
            pi=policies.FCSoftmaxPolicy(
                head_in_size, action_space.n,
                min_prob=1e-1,
                **fc_kwargs),
            q=q_function.FCStateQFunctionWithDiscreteAction(
                head_in_size, action_space.n,
                **fc_kwargs),
        )
        if use_lstm:
            model_cls = acer.ACERSharedModel
        else:
            model_cls = acer.ACERSeparateModel
    else:
        heads = dict(
            pi=policies.FCGaussianPolicy(
                head_in_size, action_space.low.size,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high,
                min_var=1e-1,
                **fc_kwargs),
            v=v_function.FCVFunction(
                head_in_size,
                **fc_kwargs),
            adv=q_function.FCSAQFunction(
                head_in_size, action_space.low.size,
                **fc_kwargs),
        )
        if use_lstm:
            model_cls = acer.ACERSDNSharedModel
        else:
            model_cls = acer.ACERSDNSeparateModel

    model_kwargs = dict(trunk_kwargs)
    model_kwargs.update(heads)
    model = model_cls(**model_kwargs)

    opt = rmsprop_async.RMSpropAsync(lr=1e-3, eps=1e-8, alpha=0.99)
    opt.setup(model)

    if self.disable_online_update and self.n_times_replay == 0:
        # At least one of them must be enabled
        return

    agent = acer.ACER(model, opt,
                      replay_buffer=replay_buffer,
                      t_max=t_max,
                      gamma=0.5,
                      beta=1e-5,
                      phi=phi,
                      n_times_replay=self.n_times_replay,
                      act_deterministically=True,
                      disable_online_update=self.disable_online_update,
                      replay_start_size=100,
                      use_trust_region=self.use_trust_region)

    if episodic:
        max_episode_len = None
    else:
        max_episode_len = 2

    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1)
        assert len(warns) == 0, warns[0]

    # Other processes may keep modifying the parameters after success, so
    # the agent left over from async training is not guaranteed to be the
    # successful one; load the saved successful model instead.
    if require_success:
        agent.load(os.path.join(self.outdir, 'successful'))
    agent.stop_episode()

    # Test
    env = make_env(0, True)

    def run_episode():
        episode_return = 0
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            print('state:', obs, 'action:', action)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
        return episode_return

    for _ in range(5):
        total_r = run_episode()
        if require_success:
            self.assertAlmostEqual(total_r, 1)
        agent.stop_episode()
def main():
    """Train or evaluate a discrete-action ACER agent on a gym env.

    The positional argument is the number of training processes. With
    ``--demo`` the agent is evaluated for ``--eval-n-runs`` runs and the
    mean, median and standard deviation are printed; otherwise it is
    trained with ``experiments.train_agent_async``. ``--load`` restores
    saved agent parameters first.
    """
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--n-times-replay', type=int, default=8)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    n_hidden_channels = 200
    model = acer.ACERSeparateModel(
        pi=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels),
            F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            SoftmaxDistribution),
        q=links.Sequence(
            L.Linear(obs_space.low.size, n_hidden_channels),
            F.relu,
            L.Linear(n_hidden_channels, action_space.n, wscale=1e-3),
            DiscreteActionValue),
    )
    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    replay_buffer = EpisodicReplayBuffer(10**5 // args.processes)
    agent = acer.DiscreteACER(model, opt, t_max=args.t_max, gamma=0.99,
                              replay_buffer=replay_buffer,
                              n_times_replay=args.n_times_replay,
                              beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        mean, median, stdev = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        # The format string had no placeholder after 'stdev', so the
        # standard deviation was silently dropped from the output.
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            max_episode_len=timestep_limit)
def main():
    """Train or evaluate an A3C agent on an ALE ROM.

    Command-line options are parsed with ``argparse``. With ``--demo`` the
    agent (optionally restored via ``--load``) is evaluated and summary
    statistics are printed; otherwise asynchronous training is started with
    a learning rate that is linearly interpolated from ``--lr`` to 0 over
    ``--steps`` steps.
    """
    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    # store_true flags already default to False, so no set_defaults needed.
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    args = parser.parse_args()

    # Seeding is optional in this script: only applied when --seed is given.
    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Start from --lr so the optimizer agrees with the decay hook below,
    # which also begins at args.lr (the default is unchanged: 7e-4).
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        """Create an ALE environment; training envs get clipped rewards."""
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook])
def main():
    """Train or evaluate an A3C agent on an Atari Gym environment.

    Command-line options are parsed with ``argparse``. With ``--demo`` the
    agent (optionally restored via ``--load``) is evaluated for
    ``--eval-n-steps`` steps and summary statistics are printed; otherwise
    asynchronous training is started with a learning rate that is linearly
    interpolated from ``--lr`` to 0 over ``--steps`` steps.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)],
            os.path.join(args.outdir, 'model'))

    # Start from --lr so the optimizer agrees with the decay hook below,
    # which also begins at args.lr (the default is unchanged: 7e-4).
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        """Create a seeded, wrapped Atari environment for one process."""
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        # This script defines --eval-n-steps but no --eval-n-runs option, so
        # the demo is budgeted in steps, mirroring the evaluation settings
        # passed to train_agent_async below.
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_steps: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    """Entry point: train or demo an A3C agent on a Gym environment.

    The model architecture is chosen with ``--arch``. When ``--demo`` is
    given the agent is evaluated for ``--eval-n-runs`` episodes and the
    statistics are printed; otherwise asynchronous training is launched.
    """
    import logging

    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument
    add('processes', type=int)
    add('--env', type=str, default='CartPole-v0')
    add('--arch', type=str, default='FFSoftmax',
        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    add('--seed', type=int, default=0,
        help='Random seed [0, 2 ** 32)')
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--t-max', type=int, default=5)
    add('--beta', type=float, default=1e-2)
    add('--profile', action='store_true')
    add('--steps', type=int, default=8 * 10**7)
    add('--eval-interval', type=int, default=10**5)
    add('--eval-n-runs', type=int, default=10)
    add('--reward-scale-factor', type=float, default=1e-2)
    add('--rmsprop-epsilon', type=float, default=1e-1)
    add('--render', action='store_true', default=False)
    add('--lr', type=float, default=7e-4)
    add('--weight-decay', type=float, default=0.0)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default='')
    add('--logger-level', type=int, default=logging.DEBUG)
    add('--monitor', action='store_true')
    args = arg_parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Seed ChainerRL itself. With several processes the run is still not
    # deterministic, even for a fixed seed.
    misc.set_random_seed(args.seed)

    # Give every subprocess its own seed:
    #   seed=0, processes=4 -> [0, 1, 2, 3]
    #   seed=1, processes=4 -> [4, 5, 6, 7]
    seed_offset = args.seed * args.processes
    process_seeds = np.arange(args.processes) + seed_offset
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        """Build the seeded, wrapped environment for one process."""
        is_first_process = process_idx == 0
        env = gym.make(args.env)
        # Train and test environments must not share a seed.
        process_seed = int(process_seeds[process_idx])
        if test:
            env_seed = 2**32 - 1 - process_seed
        else:
            env_seed = process_seed
        env.seed(env_seed)
        # The model works in float32, so observations are cast to match.
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and is_first_process:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Bring rewards (and therefore returns) into a range that is
            # easier to train on.
            env = chainerrl.wrappers.ScaleReward(
                env, args.reward_scale_factor)
            if args.render and is_first_process:
                env = chainerrl.wrappers.Render(env)
        return env

    probe_env = gym.make(args.env)
    timestep_limit = probe_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = probe_env.observation_space
    action_space = probe_env.action_space

    # Pick the policy type that fits the action space. The factories are
    # lazy so only the attributes needed by the chosen architecture are read.
    obs_size = obs_space.low.size
    model_factories = {
        'LSTMGaussian': lambda: A3CLSTMGaussian(
            obs_size, action_space.low.size),
        'FFSoftmax': lambda: A3CFFSoftmax(obs_size, action_space.n),
        'FFMellowmax': lambda: A3CFFMellowmax(obs_size, action_space.n),
    }
    model = model_factories[args.arch]()

    optimizer = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        optimizer.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, optimizer, t_max=args.t_max, gamma=0.99,
                    beta=args.beta)
    if args.load:
        agent.load(args.load)

    if not args.demo:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
        return

    demo_env = make_env(0, True)
    eval_stats = experiments.eval_performance(
        env=demo_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=timestep_limit)
    summary = 'n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev'])
    print(summary)
def main():
    """Train or evaluate an ACER agent on an Atari Gym environment.

    Command-line options are parsed with ``argparse``. With ``--demo`` the
    agent (optionally restored via ``--load``) is evaluated and summary
    statistics are printed; otherwise asynchronous training is started with
    a learning rate that is linearly interpolated from ``--lr`` to 0 over
    ``--steps`` steps.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    # store_true flags already default to False, so no set_defaults needed.
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    # The two variants differ only in the shared trunk: the LSTM variant
    # appends an LSTM layer after the convolutional head. Links are created
    # in the same order as before (shared, then pi, then q).
    shared = links.NIPSDQNHead()
    if args.use_lstm:
        shared = links.Sequence(shared, L.LSTM(256, 256))
    model = acer.ACERSharedModel(
        shared=shared,
        pi=links.Sequence(L.Linear(256, n_actions), SoftmaxDistribution),
        q=links.Sequence(L.Linear(256, n_actions), DiscreteActionValue),
    )

    # Start from --lr so the optimizer agrees with the decay hook below,
    # which also begins at args.lr (the default is unchanged: 7e-4).
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    # 10**6 is divided evenly by the number of processes
    # (presumably a per-process buffer capacity -- confirm against
    # EpisodicReplayBuffer).
    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      beta=args.beta, phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        """Create a seeded, wrapped Atari environment for one process."""
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
def main():
    """Train or evaluate an A3C agent on an ALE ROM with per-process seeds.

    Command-line options are parsed with ``argparse``. With ``--demo`` the
    agent (optionally restored via ``--load``) is evaluated and summary
    statistics are printed; otherwise asynchronous training is started with
    a learning rate that is linearly interpolated from ``--lr`` to 0 over
    ``--steps`` steps.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    # store_true flags already default to False, so no set_defaults needed.
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)],
            os.path.join(args.outdir, 'model'))

    # Start from --lr so the optimizer agrees with the decay hook below,
    # which also begins at args.lr (the default is unchanged: 7e-4).
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        """Create a seeded ALE environment; training envs clip rewards."""
        # Use different random seeds for train and test envs.
        # Indexing a NumPy array yields a NumPy integer; convert it to a
        # plain int before handing the seed to the environment.
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook])