def create_agent(train=True):
    """Build an ACER agent for the (discrete-action) global `env`.

    Args:
        train: When False, the returned agent is switched to deterministic
            (greedy) action selection for evaluation.

    Returns:
        A configured ACER agent.
    """
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    # Policy head: obs -> 20 hidden units -> softmax over actions.
    policy = Sequence(
        L.Linear(obs_size, 20),
        F.relu,
        L.Linear(20, n_actions, initialW=LeCunNormal(1e-3)),
        SoftmaxDistribution,
    )
    # Q head: same topology, outputs one action value per action.
    q_func = Sequence(
        L.Linear(obs_size, 20),
        F.relu,
        L.Linear(20, n_actions, initialW=LeCunNormal(1e-3)),
        DiscreteActionValue,
    )
    model = ACERSeparateModel(pi=policy, q=q_func)

    opt = RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    opt.add_hook(GradientClipping(40))

    agent = ACER(
        model=model,        # Model to train
        optimizer=opt,      # The optimizer
        gamma=0.99,         # Reward discount factor
        t_max=50,           # The model is updated after this many local steps
        replay_buffer=EpisodicReplayBuffer(5000),  # The replay buffer
        replay_start_size=100,  # Replay buffer won't be used until it has at least this many episodes
        beta=1,             # Entropy regularization parameter
    )
    if not train:
        agent.act_deterministically = True
    return agent
def __init__(self, in_size, out_size, hidden_sizes, nonlinearity=F.relu,
             last_wscale=1):
    """Multi-layer perceptron: in_size -> hidden_sizes... -> out_size.

    The final linear layer is initialized with LeCunNormal scaled by
    `last_wscale`; hidden layers use the framework's default initializer.
    """
    self.in_size = in_size
    self.out_size = out_size
    self.hidden_sizes = hidden_sizes
    self.nonlinearity = nonlinearity
    super().__init__()
    with self.init_scope():
        if hidden_sizes:
            # Consecutive widths [in_size, h0, h1, ...] define the
            # hidden linear layers pairwise.
            widths = [in_size] + list(hidden_sizes)
            self.hidden_layers = chainer.ChainList(
                *[L.Linear(w_in, w_out)
                  for w_in, w_out in zip(widths, widths[1:])])
            self.output = L.Linear(hidden_sizes[-1], out_size,
                                   initialW=LeCunNormal(last_wscale))
        else:
            # No hidden layers: a single linear map from input to output.
            self.output = L.Linear(in_size, out_size,
                                   initialW=LeCunNormal(last_wscale))
def __init__(self, in_size, out_size, hidden_sizes, normalize_input=True,
             normalize_output=False, nonlinearity=F.relu, last_wscale=1):
    """MLP whose hidden layers are batch-normalized (LinearBN).

    Optionally batch-normalizes the raw input and/or the final output.
    """
    self.in_size = in_size
    self.out_size = out_size
    self.hidden_sizes = hidden_sizes
    self.normalize_input = normalize_input
    self.normalize_output = normalize_output
    self.nonlinearity = nonlinearity
    super().__init__()
    with self.init_scope():
        if normalize_input:
            self.input_bn = L.BatchNormalization(in_size)
            # Initialize the running variance to 1 so early forward passes
            # do not rescale the input.
            self.input_bn.avg_var[:] = 1
        if hidden_sizes:
            # Consecutive widths [in_size, h0, h1, ...] define the
            # batch-normalized hidden layers pairwise.
            widths = [in_size] + list(hidden_sizes)
            self.hidden_layers = chainer.ChainList(
                *[LinearBN(w_in, w_out)
                  for w_in, w_out in zip(widths, widths[1:])])
            self.output = L.Linear(hidden_sizes[-1], out_size,
                                   initialW=LeCunNormal(last_wscale))
        else:
            # No hidden layers: single linear map.
            self.output = L.Linear(in_size, out_size,
                                   initialW=LeCunNormal(last_wscale))
        if normalize_output:
            self.output_bn = L.BatchNormalization(out_size)
            self.output_bn.avg_var[:] = 1
def __init__(self, n_input_channels, action_size, var,
             n_hidden_layers=0, n_hidden_channels=None,
             min_action=None, max_action=None, bound_mean=False,
             nonlinearity=F.relu, mean_wscale=1):
    """Gaussian policy head with a fixed (non-learned) variance.

    The network computes only the mean of the action distribution;
    `var` is broadcast to the mean's shape at call time.

    Args:
        n_input_channels: Size of the input (observation) vector.
        action_size: Dimensionality of the action / mean output.
        var: Scalar or per-dimension variance; fixed, never trained.
        n_hidden_layers: Number of hidden layers (0 means a single
            linear map computes the mean).
        n_hidden_channels: Width of each hidden layer.
        min_action: Lower action bound, used when `bound_mean` is True.
        max_action: Upper action bound, used when `bound_mean` is True.
        bound_mean: If True, squash the mean into the bounds via tanh.
        nonlinearity: Activation applied between hidden layers.
        mean_wscale: LeCunNormal scale for the final mean layer.
    """
    self.n_input_channels = n_input_channels
    self.action_size = action_size
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.min_action = min_action
    self.max_action = max_action
    self.bound_mean = bound_mean
    self.nonlinearity = nonlinearity
    if np.isscalar(var):
        # Expand a scalar variance to one value per action dimension.
        self.var = np.full(action_size, var, dtype=np.float32)
    else:
        # Assumed to be array-like with one entry per action dimension
        # — TODO confirm callers' convention.
        self.var = var
    layers = []
    if n_hidden_layers > 0:
        # Input to hidden
        layers.append(L.Linear(n_input_channels, n_hidden_channels))
        layers.append(self.nonlinearity)
        for _ in range(n_hidden_layers - 1):
            # Hidden to hidden
            layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
            layers.append(self.nonlinearity)
        # The last layer is used to compute the mean
        layers.append(
            L.Linear(n_hidden_channels, action_size,
                     initialW=LeCunNormal(mean_wscale)))
    else:
        # There's only one layer for computing the mean
        layers.append(
            L.Linear(n_input_channels, action_size,
                     initialW=LeCunNormal(mean_wscale)))
    if self.bound_mean:
        # Squash the mean into [min_action, max_action].
        layers.append(
            lambda x: bound_by_tanh(x, self.min_action, self.max_action))

    def get_var_array(shape):
        # Lazily move `self.var` to this link's array backend (CPU/GPU)
        # on first use, then broadcast it to the mean's shape.
        self.var = self.xp.asarray(self.var)
        return self.xp.broadcast_to(self.var, shape)

    # Wrap the mean output into a Gaussian with the fixed variance.
    layers.append(lambda x: distribution.GaussianDistribution(
        x, get_var_array(x.shape)))
    super().__init__(*layers)
def __init__(self, n_input_channels, action_size, n_hidden_layers=0,
             n_hidden_channels=None, min_action=None, max_action=None,
             bound_mean=False, var_type='spherical', nonlinearity=F.relu,
             mean_wscale=1, var_wscale=1, var_bias=0, min_var=0):
    """Gaussian policy whose mean and variance are both network outputs.

    A shared stack of hidden layers feeds two linear heads: one for the
    mean and one for the variance ('spherical' -> 1 output, 'diagonal'
    -> one output per action dimension).
    """
    self.n_input_channels = n_input_channels
    self.action_size = action_size
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.min_action = min_action
    self.max_action = max_action
    self.bound_mean = bound_mean
    self.nonlinearity = nonlinearity
    self.min_var = min_var
    # Number of variance outputs depends on the covariance structure.
    var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

    self.hidden_layers = []
    if n_hidden_layers > 0:
        # Shared trunk: input -> hidden -> ... -> hidden.
        widths = [n_input_channels] + [n_hidden_channels] * n_hidden_layers
        for w_in, w_out in zip(widths, widths[1:]):
            self.hidden_layers.append(L.Linear(w_in, w_out))
        head_in = n_hidden_channels
    else:
        # No trunk: both heads read the raw input.
        head_in = n_input_channels

    self.mean_layer = L.Linear(head_in, action_size,
                               initialW=LeCunNormal(mean_wscale))
    self.var_layer = L.Linear(head_in, var_size,
                              initialW=LeCunNormal(var_wscale),
                              initial_bias=var_bias)

    super().__init__(self.mean_layer, self.var_layer, *self.hidden_layers)
def __init__(
        self,
        n_input_channels,
        action_size,
        n_hidden_layers=0,
        n_hidden_channels=None,
        min_action=None,
        max_action=None,
        bound_mean=False,
        var_type='spherical',
        nonlinearity=F.relu,
        mean_wscale=1,
        var_func=F.softplus,
        var_param_init=0,
):
    """Gaussian policy with a state-independent, learned covariance.

    The mean comes from a feed-forward network; the variance comes from a
    free `chainer.Parameter` (passed through `var_func`), independent of
    the input.

    Args:
        n_input_channels: Size of the input (observation) vector.
        action_size: Dimensionality of the action / mean output.
        n_hidden_layers: Number of hidden layers.
        n_hidden_channels: Width of each hidden layer.
        min_action: Lower action bound, used when `bound_mean` is True.
        max_action: Upper action bound, used when `bound_mean` is True.
        bound_mean: If True, squash the mean into the bounds via tanh.
        var_type: 'spherical' (one shared variance) or 'diagonal'
            (one variance per action dimension).
        nonlinearity: Activation applied between hidden layers.
        mean_wscale: LeCunNormal scale for the final mean layer.
        var_func: Function mapping the raw parameter to a positive
            variance (default softplus).
        var_param_init: Initial value of the raw variance parameter.
    """
    self.n_input_channels = n_input_channels
    self.action_size = action_size
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.min_action = min_action
    self.max_action = max_action
    self.bound_mean = bound_mean
    self.nonlinearity = nonlinearity
    self.var_func = var_func
    # 'spherical' -> 1 shared variance; 'diagonal' -> one per dimension.
    var_size = {'spherical': 1, 'diagonal': action_size}[var_type]
    layers = []
    # NOTE(review): unlike the sibling policies, this always builds an
    # input->hidden layer even when n_hidden_layers == 0 (in which case
    # n_hidden_channels defaults to None) — confirm callers always pass
    # n_hidden_layers >= 1.
    layers.append(L.Linear(n_input_channels, n_hidden_channels))
    for _ in range(n_hidden_layers - 1):
        layers.append(self.nonlinearity)
        layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
    layers.append(self.nonlinearity)
    # The last layer is used to compute the mean
    layers.append(
        L.Linear(n_hidden_channels, action_size,
                 initialW=LeCunNormal(mean_wscale)))
    if self.bound_mean:
        # Squash the mean into [min_action, max_action].
        layers.append(
            lambda x: bound_by_tanh(x, self.min_action, self.max_action))
    super().__init__()
    with self.init_scope():
        self.hidden_layers = links.Sequence(*layers)
        # Raw (pre-var_func) variance parameter, learned jointly.
        self.var_param = chainer.Parameter(
            initializer=var_param_init, shape=(var_size, ))
def __init__(self, n_input_channels, n_hidden_layers, n_hidden_channels,
             action_size, min_action=None, max_action=None,
             bound_action=True, nonlinearity=F.relu, last_wscale=1.):
    """Recurrent deterministic policy: MLP trunk -> LSTM -> linear head.

    Args:
        n_input_channels: Size of the input (observation) vector.
        n_hidden_layers: Number of hidden layers in the MLP trunk.
        n_hidden_channels: Width of the hidden layers and the LSTM.
        action_size: Dimensionality of the action output.
        min_action: Lower action bound, used when `bound_action` is True.
        max_action: Upper action bound, used when `bound_action` is True.
        bound_action: If True, squash the output into the bounds via tanh.
        nonlinearity: Activation used in the trunk and after it.
        last_wscale: LeCunNormal scale for the final action layer.
    """
    self.n_input_channels = n_input_channels
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.action_size = action_size
    self.min_action = min_action
    self.max_action = max_action
    self.bound_action = bound_action
    if self.bound_action:
        # Closure reads the bounds from self so later mutation is seen.
        def action_filter(x):
            return bound_by_tanh(
                x, self.min_action, self.max_action)
    else:
        action_filter = None
    model = chainer.Chain(
        fc=MLP(
            self.n_input_channels,
            n_hidden_channels,
            (self.n_hidden_channels, ) * self.n_hidden_layers,
            nonlinearity=nonlinearity,
        ),
        lstm=L.LSTM(n_hidden_channels, n_hidden_channels),
        out=L.Linear(n_hidden_channels, action_size,
                     initialW=LeCunNormal(last_wscale)),
    )

    def model_call(model, x):
        # An extra nonlinearity is applied on top of the MLP output
        # before the recurrent core.
        h = nonlinearity(model.fc(x))
        h = model.lstm(h)
        h = model.out(h)
        return h

    super().__init__(
        model=model,
        model_call=model_call,
        action_filter=action_filter)
def get_weight_initializer(name: Optional[str]):
    """Return a chainer weight-initializer instance selected by `name`.

    Chainer modules are imported lazily, only when a matching name is
    requested.  `None` returns None (use the framework default).

    Raises:
        ValueError: If `name` does not match a known initializer.
    """
    if name is None:
        return None
    if name == "PossibleOrthogonal":
        return PossibleOrthogonal()
    normal_names = ("GlorotNormal", "HeNormal", "LeCunNormal", "Normal")
    uniform_names = ("GlorotUniform", "HeUniform", "LeCunUniform", "Uniform")
    if name in normal_names:
        import chainer.initializers.normal as init_module
    elif name == "Orthogonal":
        import chainer.initializers.orthogonal as init_module
    elif name in uniform_names:
        import chainer.initializers.uniform as init_module
    else:
        raise ValueError(name)
    # The class name matches `name` exactly in each chainer module.
    return getattr(init_module, name)()
def __init__(self, n_dim_obs, n_dim_action, n_hidden_channels,
             n_hidden_layers, nonlinearity=F.relu, last_wscale=1.):
    """Recurrent Q-network over concatenated (obs, action) input.

    Topology: MLP trunk -> LSTM -> linear head producing one scalar.
    """
    in_size = n_dim_obs + n_dim_action
    hidden = n_hidden_channels
    self.n_input_channels = in_size
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = hidden
    self.nonlinearity = nonlinearity
    super().__init__()
    with self.init_scope():
        # Feed-forward trunk over the joint (obs, action) vector.
        self.fc = MLP(
            in_size,
            hidden,
            [hidden] * n_hidden_layers,
            nonlinearity=nonlinearity,
        )
        # Recurrent core, then a scalar Q-value head.
        self.lstm = L.LSTM(hidden, hidden)
        self.out = L.Linear(hidden, 1, initialW=LeCunNormal(last_wscale))
def main():
    """CLI entry point: train (async) or evaluate an ACER agent on a gym env."""
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        # Build one environment for a given worker (train or eval variant).
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    # NOTE(review): spec.tags is an old-gym API; newer gym versions expose
    # max_episode_steps on the spec directly — confirm the pinned gym version.
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        # Continuous actions: ACER with stochastic dueling networks.
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        # Discrete actions: separate softmax policy and Q-function heads.
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation-only mode: run eval_n_runs episodes and print stats.
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Asynchronous training across `processes` worker processes.
        experiments.train_agent_async(agent=agent, outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
def __init__(self):
    """Set up an Orthogonal and a LeCunNormal initializer for later use."""
    super().__init__()
    # Both candidate initializers are constructed up front.
    self.lecun = LeCunNormal()
    self.orthogonal = Orthogonal()
def main(args):
    """Train ('train' mode) or visualize ('check' mode) an ACER agent.

    Args:
        args: Either a parsed argument namespace or a list of CLI tokens
            (a list is converted via `make_args`).
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if (type(args) is list):
        args = make_args(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        # Build one environment for a given worker (train or eval variant).
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    # NOTE(review): spec.tags is an old-gym API; newer gym versions expose
    # max_episode_steps on the spec directly — confirm the pinned gym version.
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        # Continuous actions: ACER with stochastic dueling networks.
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(
                obs_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size, action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        # Discrete actions: separate softmax policy and Q-function heads.
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels),
                F.relu,
                L.Linear(args.n_hidden_channels, action_space.n,
                         initialW=LeCunNormal(1e-3)),
                DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)
    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):
        # Asynchronous training across `processes` worker processes.
        experiments.train_agent_async(agent=agent, outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      step_offset=args.step_offset,
                                      checkpoint_freq=args.checkpoint_freq,
                                      log_type=args.log_type,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
    elif (args.mode == 'check'):
        # Visualization mode: roll out a few episodes, record frames,
        # and save them as an animation.
        from matplotlib import animation
        import matplotlib.pyplot as plt

        env = make_env(0, True)

        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            # Cap each rollout at 200 steps.
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        from IPython.display import HTML
        # Size the figure to the recorded frame dimensions (72 dpi).
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            # Swap in frame i for the animation.
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate,
                                       frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim