def make_a3c_agent(obs_space_dim, action_space_dim):
    model = A3CLSTMGaussian(obs_space_dim, action_space_dim)
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    agent = a3c.A3C(model, opt, t_max=5, gamma=1, beta=1e-2, phi=phi)
    return agent
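# The factory above expects a feature extractor `phi` defined elsewhere in the
# module. A minimal sketch of a common choice (an assumption, not taken from
# this source) is to cast observations to float32 for the Chainer model:
import numpy as np

def phi(obs):
    # Cast raw observations to float32 so they match the model's dtype
    return np.asarray(obs, dtype=np.float32)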
def get_pretrained_agent(agent_path="./"):
    model = pretrained_NN(ndim_obs=9, n_actions=9)
    opt = rmsprop_async.RMSpropAsync()
    opt.setup(model)
    agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2)
    agent.load(agent_path)
    return agent
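# Hedged usage sketch (not from the original source): query the loaded
# tic-tac-toe agent for a single move. The all-zeros 9-dimensional float32
# observation below is an assumed "empty board" encoding.
def demo_pretrained_agent(agent_path="./"):
    import numpy as np
    agent = get_pretrained_agent(agent_path)
    obs = np.zeros(9, dtype=np.float32)  # assumed empty-board observation
    action = agent.act(obs)              # greedy action, no training update
    agent.stop_episode()                 # clear any per-episode agent state
    return action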
def create_a3c_agent():
    # env = gym.make('malware-v0')
    # obs_size = env.observation_space.shape[1]
    # action_space = env.action_space
    # n_actions = action_space.n
    obs_size = 8006
    n_actions = 6
    # Switch policy types according to the action space type
    if args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_size, n_actions)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_size, n_actions)
    if args.gpu:
        pass  # model.to_gpu(0)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta)
    return agent
def __init__(self, gpu=False):
    self.model = A3CFFSoftmax(gpu)
    if gpu:
        self.model.to_gpu(0)
    self.optimizer = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    self.agent = a3c.A3C(self.model, self.optimizer, t_max=5, gamma=0.99,
                         beta=1e-2, phi=phi)
    self.add_hooks = [chainer.optimizer.GradientClipping(40)]
def CREATE_AGENT(env, agent_name):
    gamma = 0.9
    if agent_name == "DoubleDQN":
        q_func = QFunction(env.OBSDIM * (NUM_EYES), env.action_space_d.n)
        # q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
        #     env.OBSDIM*(NUM_EYES), env.action_space_d.n,
        #     n_hidden_layers=2, n_hidden_channels=50)
        # q_func.to_gpu(0)
        optimizer = chainer.optimizers.Adam(eps=1e-2)
        optimizer.setup(q_func)
        # explorer = chainerrl.explorers.ConstantEpsilonGreedy(
        #     epsilon=0.3, random_action_func=env.action_space_d.sample)
        explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5, end_epsilon=0.1, decay_steps=10000,
            random_action_func=env.action_space_d.sample)
        replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
        agent = chainerrl.agents.DoubleDQN(
            q_func, optimizer, replay_buffer, gamma, explorer,
            replay_start_size=500, update_interval=1,
            target_update_interval=100)
        return agent
    if agent_name == "A3CFF":
        # n_actions = ale.ALE(str(env.action_space_d.n)).number_of_actions
        # model = A3CFF(n_actions)
        model = A3CFF(env.OBSDIM * (NUM_EYES), env.action_space_d.n)
        optimizer = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.9)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(40))
        agent = a3c.A3C(model, optimizer, t_max=4, gamma=0.9, beta=1e-2,
                        phi=dqn_phi)
        return agent
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True, steps=1000000): nproc = 8 def make_env(process_idx, test): size = 2 return ABC(size=size, discrete=discrete, episodic=episodic or test, partially_observable=self.use_lstm, deterministic=test) sample_env = make_env(0, False) action_space = sample_env.action_space obs_space = sample_env.observation_space def phi(x): return x n_hidden_channels = 20 if use_lstm: if discrete: model = a3c.A3CSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCSoftmaxPolicy( n_hidden_channels, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), v=v_function.FCVFunction( n_hidden_channels, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), ) else: model = a3c.A3CSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCGaussianPolicy( n_hidden_channels, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_function.FCVFunction( n_hidden_channels, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), ) else: if discrete: model = a3c.A3CSeparateModel( pi=policies.FCSoftmaxPolicy( obs_space.low.size, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), v=v_function.FCVFunction( obs_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), ) else: model = a3c.A3CSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_function.FCVFunction( obs_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2), ) eps = 1e-1 if discrete else 1e-2 opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99) opt.setup(model) gamma = 0.9 beta = 1e-2 agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta, phi=phi, act_deterministically=True) max_episode_len = None if episodic else 2 train_agent_async(outdir=self.outdir, processes=nproc, make_env=make_env, agent=agent, steps=steps, max_episode_len=max_episode_len, eval_interval=500, eval_n_runs=5, successful_score=1) # The agent returned by train_agent_async is not guaranteed to be # successful because parameters could be modified by other processes # after success. Thus here the successful model is loaded explicitly. agent.load(os.path.join(self.outdir, 'successful')) agent.stop_episode() # Test env = make_env(0, True) n_test_runs = 5 for _ in range(n_test_runs): total_r = 0 obs = env.reset() done = False reward = 0.0 while not done: action = agent.act(obs) print('state:', obs, 'action:', action) obs, reward, done, _ = env.step(action) total_r += reward self.assertAlmostEqual(total_r, 1) agent.stop_episode()
def main():
    import logging

    parser = argparse.ArgumentParser()
    # Increase for more asynchronous workers.
    parser.add_argument('processes', type=int, default=4)
    # Directory to which output files will be written.
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    # Environment to explore.
    parser.add_argument('--env', type=str, default='1DIsing-A3C-v0')
    # Maximum number of steps before training ends.
    parser.add_argument('--steps', type=int, default=1 * 10 ** 7)
    # Frequency at which the agent is evaluated.
    parser.add_argument('--eval-interval', type=int, default=10 ** 4)
    # Number of evaluation runs per evaluation.
    parser.add_argument('--eval-n-runs', type=int, default=10)
    # NN to use for the policy and state-value estimates.
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax',))
    # Increase for later truncation of the return sum.
    parser.add_argument('--t-max', type=int, default=5)
    # Increase for more exploration.
    parser.add_argument('--beta', type=float, default=1e-2)
    # Increase for less discounting of future rewards.
    parser.add_argument('--gamma', type=float, default=0.99)
    # Decrease for a slower learning rate.
    parser.add_argument('--lr', type=float, default=1 * 1e-4)
    # Set to a positive value to enable weight decay.
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--seed', type=int, default=17,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    # Set to logging.DEBUG for (much) more information.
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    model = A3CFFSoftmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                    beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10 ** 6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook])
def make_chainer_a3c(obs_size, action_size):
    model = A3CLSTMGaussian(obs_size, action_size)
    opt = optimizers.Adam(eps=1e-2)
    opt.setup(model)
    agent = a3c.A3C(model, opt, t_max=10 ** 5, gamma=0.9)
    return agent
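# Hedged usage sketch (not from the original source): drive the agent returned
# by make_chainer_a3c through one episode of a Gym-style continuous-control
# environment. The env argument and float32 casts are assumptions.
def run_one_episode(env):
    import numpy as np
    agent = make_chainer_a3c(env.observation_space.low.size,
                             env.action_space.low.size)
    obs = env.reset().astype(np.float32)
    reward, done = 0.0, False
    while not done:
        # Sample an action and store the transition for the next update
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        obs = obs.astype(np.float32)
    # Flush the final transition and reset the recurrent state
    agent.stop_episode_and_train(obs, reward, done)
    return agent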
fake_obs = chainer.Variable(
    np.zeros(obs_size, dtype=np.float32)[None],
    name='observation')
with chainerrl.recurrent.state_reset(model):
    # The state of the model is reset again after drawing the graph
    chainerrl.misc.draw_computational_graph(
        [model(fake_obs)], os.path.join(args.outdir, 'model'))

opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping))
if args.weight_decay > 0:
    opt.add_hook(NonbiasWeightDecay(args.weight_decay))

phi = lambda x: x.astype(np.float32, copy=False)

agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                beta=args.beta, phi=phi)

lr_decay_hook = experiments.LinearInterpolationHook(
    args.steps, args.lr, 0, lr_setter)

training = experiments.train_agent_async(
    agent=agent,
    outdir=args.outdir,
    processes=args.threads,
    make_env=make_env,
    profile=False,
    steps=args.steps,
    eval_interval=args.eval_interval,
    eval_n_episodes=args.eval_n_runs,
    max_episode_len=args.max_episode_len,
    successful_score=args.stop,
    global_step_hooks=[lr_decay_hook],
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
def main(): import argparse parser = argparse.ArgumentParser() # Simulation params parser.add_argument('--name', type=str, default='EIIE_A3C') parser.add_argument('--processes', type=int, default=4, help="number of environment instances to use") parser.add_argument('--seed', type=int, default=42, help="random seed") parser.add_argument( '--obs_steps', type=int, default=64, help= "Observation steps, number of candles required by the agent for calculations" ) parser.add_argument( '--period', type=int, default=30, help="Observation period in minutes, also trading frequency") parser.add_argument('--max_episode_len', type=int, default=5, help="Max timesteps per episode") parser.add_argument('--steps', type=int, default=1e6, help="Training steps") parser.add_argument('--eval-interval', type=int, default=None) parser.add_argument('--eval-n-runs', type=int, default=10) # Learning params parser.add_argument('--n_filters_in', type=int, default=8, help="number of input filters heads to train") parser.add_argument('--n_filters_out', type=int, default=256, help="number of pattern recognition neurons to train") parser.add_argument('--t_max', type=int, default=5, help="Timesteps before update main model") parser.add_argument('--grad_noise', type=float, default=0.0, help="gradient noise to apply") parser.add_argument('--lr', type=float, default=1e-4, help="learning rate") parser.add_argument('--lr_decay', type=float, default=1000, help="learning rate linear decay rate") parser.add_argument( '--alpha', type=float, default=0.99, help= "Exponential decay rate of the second order moment for rmsprop optimizer" ) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1, help="fuzz factor") parser.add_argument('--clip_grad', type=float, default=100, help="Clip gradient norm") parser.add_argument('--reward-scale-factor', type=float, default=1., help="scale environment reward") # Regularization parser.add_argument('--beta', type=float, default=1e-3, help="entropy regularization weight for policy") parser.add_argument('--beta_decay', type=float, default=1000, help="entropy regularization decay rate") parser.add_argument('--l2_reg', type=float, default=0, help="l2 regularization coefficient") parser.add_argument('--l1_reg', type=float, default=0, help="l1 regularization coefficient") parser.add_argument('--gamma', type=float, default=0.999, help="discount factor") # Misc parser.add_argument('--profile', action='store_true') parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--monitor', action='store_true', default=False) parser.add_argument('--download', action='store_true', default=False) parser.add_argument('--datafeed', action='store_true', default=False) # Dir options parser.add_argument('--data_dir', type=str, default='./data') parser.add_argument('--out_dir', type=str, default='./save') parser.add_argument('--load_dir', type=str, default='./save') parser.add_argument('--log_dir', type=str, default='./logs') parser.add_argument('--logger-level', type=int, default=logging.ERROR) args = parser.parse_args() logging.getLogger().setLevel(args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.log_dir) # Simulation Params # Universe pairs = [ 'BTC_ETH', 'BTC_BCH', 'BTC_XRP', 'BTC_STR', 'BTC_LTC', 'BTC_DASH', 'BTC_XMR', 'BTC_ETC', 'BTC_ZEC', 'BTC_BTS', 'BTC_LSK', 'BTC_XEM', 'BTC_VTC', 'BTC_STRAT', 'BTC_EMC2', 'BTC_NXT', 'BTC_OMG' ] # Universe, some survivor bias here... 
fiat_symbol = 'BTC' # Quote symbol init_funds = make_balance( crypto=0.0, fiat=100.0, pairs=pairs) # Initial equally distributed portfolio # NN params timesteps = args.obs_steps - 1 n_filters_in = args.n_filters_in n_filters_out = args.n_filters_out if args.load_dir: try: last_save = np.argmax([ int(d.split('_')[0]) for d in listdir(args.load_dir) if '.' not in d ]) load_dir = args.load_dir + "/" + listdir(args.load_dir)[last_save] global_t = int( listdir(args.load_dir)[int(last_save)].split('_')[0]) except ValueError as e: load_dir = False global_t = 0 else: load_dir = None global_t = 0 # Make environment function if args.datafeed: papi = DataFeed(args.period, pairs, 'polo_test', 'ipc://feed.ipc') else: papi = BacktestDataFeed(Poloniex(), args.period, pairs) if args.download: print("Downloading data...") papi.download_data( end=datetime.timestamp(datetime.utcnow() - timedelta(days=50)), start=datetime.timestamp(datetime.utcnow() - timedelta(days=300))) papi.save_data(args.data_dir + '/train') def make_env(process_idx, test): tapi = BacktestDataFeed(papi, args.period, pairs=pairs, balance=init_funds, load_dir=args.data_dir) tapi.load_data('/train') # Environment setup env = BacktestEnvironment(args.period, args.obs_steps, tapi, fiat_symbol, args.name) env.setup() if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) return env # Model declaration print("Instantiating model") model = cn_agents.A3CEIIE(timesteps, len(pairs) + 1, n_filters_in, n_filters_out) #.to_gpu(0) # Optimizer opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=args.alpha) opt.setup(model) if args.clip_grad: opt.add_hook(cn.optimizer.GradientClipping(args.clip_grad)) if args.grad_noise: opt.add_hook(cn.optimizer.GradientNoise(args.grad_noise)) if args.l2_reg: opt.add_hook(cn.optimizer.WeightDecay(args.l2_reg)) if args.l1_reg: opt.add_hook(cn.optimizer.Lasso(args.l1_reg)) # Agent print("Building agent") agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta, phi=phi, normalize_grad_by_t_max=True, act_deterministically=False, v_loss_coef=1.0) # Load information if load_dir: agent.load(load_dir) print("Model loaded from %s" % (load_dir)) # Training hooks pp = PrintProgress(time()) def lr_setter(env, agent, value): agent.optimizer.lr = value def beta_setter(env, agent, value): agent.beta = value lr_decay = LinearInterpolationHook(int(args.steps), args.lr, args.lr / args.lr_decay, lr_setter) beta_decay = LinearInterpolationHook(int(3 * args.steps / 4), args.beta, args.beta / args.beta_decay, beta_setter) # Training session try: print("Training starting...\n") with np.errstate(divide='ignore'): experiments.train_agent_async( agent=agent, outdir=args.out_dir, processes=args.processes, make_env=make_env, profile=args.profile, steps=int(args.steps), eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=args.max_episode_len, global_step_hooks=[pp, lr_decay, beta_decay], resume_step=global_t) except KeyboardInterrupt: print("\nThx for the visit. Good bye.")
def _test_abc(self, t_max, use_lstm, discrete=True, episodic=True, steps=100000, require_success=True): nproc = 8 def make_env(process_idx, test): size = 2 return ABC(size=size, discrete=discrete, episodic=episodic or test, partially_observable=self.use_lstm, deterministic=test) sample_env = make_env(0, False) action_space = sample_env.action_space obs_space = sample_env.observation_space def phi(x): return x n_hidden_channels = 20 if use_lstm: if discrete: model = a3c.A3CSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCSoftmaxPolicy( n_hidden_channels, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), v=v_function.FCVFunction( n_hidden_channels, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), ) else: model = a3c.A3CSharedModel( shared=L.LSTM(obs_space.low.size, n_hidden_channels), pi=policies.FCGaussianPolicy( n_hidden_channels, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, mean_wscale=1e-1, ), v=v_function.FCVFunction( n_hidden_channels, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), ) else: if discrete: model = a3c.A3CSeparateModel( pi=policies.FCSoftmaxPolicy( obs_space.low.size, action_space.n, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), v=v_function.FCVFunction( obs_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), ) else: model = a3c.A3CSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, mean_wscale=1e-1, ), v=v_function.FCVFunction( obs_space.low.size, n_hidden_channels=n_hidden_channels, n_hidden_layers=2, nonlinearity=F.tanh, last_wscale=1e-1, ), ) opt = chainer.optimizers.Adam() opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(1)) gamma = 0.8 beta = 1e-2 agent = a3c.A3C(model, opt, t_max=t_max, gamma=gamma, beta=beta, phi=phi, act_deterministically=True) max_episode_len = None if episodic else 2 with warnings.catch_warnings(record=True) as warns: train_agent_async(outdir=self.outdir, processes=nproc, make_env=make_env, agent=agent, steps=steps, max_episode_len=max_episode_len, eval_interval=500, eval_n_steps=None, eval_n_episodes=5, successful_score=1) assert len(warns) == 0, warns[0] # The agent returned by train_agent_async is not guaranteed to be # successful because parameters could be modified by other processes # after success. Thus here the successful model is loaded explicitly. if require_success: agent.load(os.path.join(self.outdir, 'successful')) agent.stop_episode() # Test env = make_env(0, True) n_test_runs = 5 for _ in range(n_test_runs): total_r = 0 obs = env.reset() done = False reward = 0.0 while not done: action = agent.act(obs) print('state:', obs, 'action:', action) obs, reward, done, _ = env.step(action) total_r += reward if require_success: self.assertAlmostEqual(total_r, 1) agent.stop_episode()
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=True)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        print('in make env', process_idx, test)
        # env = gym.make(args.env)
        env = env_vrep.Simu_env(10000 + process_idx)
        if processor_status[process_idx] == 0:
            env.connect_vrep()
            processor_status[process_idx] = 1
        return env

    # sample_env = gym.make(args.env)
    # timestep_limit = sample_env.spec.tags.get(
    #     'wrapper_config.TimeLimit.max_episode_steps')
    # obs_space = sample_env.observation_space
    # action_space = sample_env.action_space
    obs_space = env_vrep.state_size
    action_space = env_vrep.action_size
    timestep_limit = 200

    model = A3CFFSoftmax(obs_space, action_space)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    # opt.add_hook(chainer.optimizer.GradientClipping(40))
    # if args.weight_decay > 0:
    #     opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)
    if args.load:
        agent.load(args.load)

    experiments.train_agent_async(
        agent=agent,
        outdir=args.outdir,
        processes=args.processes,
        make_env=make_env,
        profile=args.profile,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        max_episode_len=timestep_limit)
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if type(args) is list:
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process (i.e. processes > 1),
    # the results will no longer be deterministic
    # even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use the base random seed for the check/video env
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=True,
            clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(env=make_env_check(), agent=agent,
                                      save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(env=make_env_check(), agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
def __init__(self, alg, env, model_path): self.alg = alg seed = 0 n_actions = gym.make(env).action_space.n gpus = [-1] gpu = None misc.set_random_seed(seed, gpus=gpus) if alg == "DQN-C": model = links.Sequence( links.NatureDQNHead(), L.Linear(512, n_actions), DiscreteActionValue) if alg == "PPO": winit_last = chainer.initializers.LeCunNormal(1e-2) model = chainer.Sequential( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, L.Linear(None, 512), F.relu, links.Branched( chainer.Sequential( L.Linear(None, n_actions, initialW=winit_last), SoftmaxDistribution, ), L.Linear(None, 1), ) ) if alg == "C51": n_atoms = 51 v_max = 10 v_min = -10 model = links.Sequence( links.NatureDQNHead(), DistributionalFCStateQFunctionWithDiscreteAction( None, n_actions, n_atoms, v_min, v_max, n_hidden_channels=0, n_hidden_layers=0), ) if alg == "ACER": model = agents.acer.ACERSharedModel( shared=links.Sequence( links.NIPSDQNHead(), L.LSTM(256, 256)), pi=links.Sequence( L.Linear(256, n_actions), SoftmaxDistribution), q=links.Sequence( L.Linear(256, n_actions), DiscreteActionValue), ) if alg == "A3C": model = A3CFF(n_actions) if alg == "Rainbow": n_atoms = 51 v_max = 10 v_min = -10 model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max) links.to_factorized_noisy(model, sigma_scale=0.5) if alg == "IQN": model = agents.iqn.ImplicitQuantileQFunction( psi=chainerrl.links.Sequence( L.Convolution2D(None, 32, 8, stride=4), F.relu, L.Convolution2D(None, 64, 4, stride=2), F.relu, L.Convolution2D(None, 64, 3, stride=1), F.relu, functools.partial(F.reshape, shape=(-1, 3136)), ), phi=chainerrl.links.Sequence( chainerrl.agents.iqn.CosineBasisLinear(64, 3136), F.relu, ), f=chainerrl.links.Sequence( L.Linear(None, 512), F.relu, L.Linear(None, n_actions), ), ) if alg in ["A3C"]: fake_obs = chainer.Variable( np.zeros((4, 84, 84), dtype=np.float32)[None], name='observation') with chainerrl.recurrent.state_reset(model): # The state of the model is reset again after drawing the graph variables = misc.collect_variables([model(fake_obs)]) chainer.computational_graph.build_computational_graph(variables) elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]: variables = misc.collect_variables([model(np.zeros((4, 84, 84), dtype=np.float32)[None])]) chainer.computational_graph.build_computational_graph(variables) else: fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None] fake_taus = np.zeros(32, dtype=np.float32)[None] variables = misc.collect_variables([model(fake_obs)(fake_taus)]) def phi(x): # Feature extractor return np.asarray(x, dtype=np.float32) / 255 opt = optimizers.RMSpropGraves() opt.setup(model) rbuf = replay_buffer.ReplayBuffer(1) if alg == "IQN": self.agent = agents.IQN(model, opt, rbuf, gpu=gpu, gamma=0.99, act_deterministically=True, explorer=None, replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4, phi=phi) if alg == "A3C": self.agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, phi=phi, act_deterministically=True) if alg == "Rainbow": self.agent = agents.CategoricalDoubleDQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4, phi=phi) if alg == "DQN-C": self.agent = agents.DQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4, phi=phi) if alg == "C51": 
self.agent = agents.CategoricalDQN( model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4, phi=phi, ) if alg == "ACER": self.agent = agents.acer.ACER(model, opt, t_max=5, gamma=0.99, replay_buffer=rbuf, n_times_replay=4, replay_start_size=1, act_deterministically=True, phi=phi ) if alg == "PPO": self.agent = agents.PPO(model, opt, gpu=gpu, phi=phi, update_interval=4, minibatch_size=1, clip_eps=0.1, recurrent=False, act_deterministically=True) self.agent.load(os.path.join(model_path, 'chainer', alg, env.replace("NoFrameskip-v4", ""), 'final'))
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='TTT-A3C-v0')
    parser.add_argument('--seed', type=int, default=17,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=5 * 10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--arch', type=str, default='FFSoftmax',
                        choices=('FFSoftmax',))
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1 * 1e-4)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        # NOTE: uncomment the next line to start from a pretrained agent
        # env.set_agent(gym_ttt.pretrained_agent.get_pretrained_agent("./"))
        return env

    sample_env = gym.make(args.env)
    # Number of steps after which an episode is ended (whether the game is
    # over or not)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Initialize the NN and the optimizer
    model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.8, beta=args.beta)
    if args.load:
        agent.load(args.load)

    # Draw the policy and state-value networks for an empty 3 x 3 board
    empty_board = np.array(
        [np.array([[0. for _ in range(3)] for _ in range(3)],
                  dtype=np.float32)])
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(empty_board)[0]],
        os.path.join(args.outdir, 'model_pi'))
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(empty_board)[1]],
        os.path.join(args.outdir, 'model_v'))

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
            ndim_obs, n_actions, hidden_sizes, nonlinearity=F.tanh))
        self.v = links.MLP(ndim_obs, 1, hidden_sizes=hidden_sizes,
                           nonlinearity=F.tanh)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        return self.pi(state), self.v(state)


# Load the trained A3C agent
model = A3CFFSoftmax(ndim_obs=9, n_actions=9)
opt = rmsprop_async.RMSpropAsync()
opt.setup(model)
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99)
agent.load(agent_path)


def check_game(field, action):
    global player_symbol, CPU_symbol, state
    # The action is linearized to 0..8: break it down into a 3 x 3 array
    row, col = int(int(action) / 3), int(action) % 3
    if field["text"] == " ":
        # The user performed a legal move
        field["text"] = player_symbol
        field["state"] = "disabled"
        state[row][col] = -1
        # player won
def main():
    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10 ** 6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom, use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=args.max_episode_len,
            global_step_hooks=[lr_decay_hook])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)], os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99,
                    beta=args.beta, phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        # Evaluate over a fixed number of steps; the parser above defines
        # --eval-n-steps (there is no --eval-n-runs argument).
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_steps: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
        clip_rewards=False)
    env.seed(seed)
    return env


seed = 0
env_name = 'BreakoutNoFrameskip-v4'
misc.set_random_seed(seed)

env = make_env()
n_actions = env.action_space.n

model = A3CFF(n_actions)
opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(40))
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2, phi=phi)
agent.load('parameters')

ACTION_MEANINGS = {
    0: 'NOOP',
    1: 'FIRE',
    2: 'RIGHT',
    3: 'LEFT',
}

launch_visualizer(agent, env, ACTION_MEANINGS, raw_image_input=True)