def create_agent(self, env):
    model = create_state_q_function_for_env(env)
    rbuf = replay_buffer.ReplayBuffer(10 ** 5)
    opt = optimizers.Adam()
    opt.setup(model)
    explorer = explorers.ConstantEpsilonGreedy(
        0.2, random_action_func=lambda: env.action_space.sample())
    return agents.DQN(model, opt, rbuf, gamma=0.99, explorer=explorer)
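# Hypothetical usage sketch (not part of the original code): given an object
# `runner` exposing create_agent above and any gym-style environment, a short
# training run can be driven with ChainerRL's experiments utilities, as in the
# rest of this file. The step counts and output directory are illustrative
# assumptions only.
def _example_train(runner, env, outdir='results'):
    agent = runner.create_agent(env)
    experiments.train_agent_with_evaluation(
        agent=agent, env=env, steps=10 ** 4,
        eval_n_runs=5, eval_interval=10 ** 3,
        outdir=outdir)
    return agent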
def _test_load_dqn(self, gpu):
    q_func = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, 4),
        DiscreteActionValue)
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(100)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0, end_epsilon=0.1, decay_steps=10 ** 6,
        random_action_func=lambda: np.random.randint(4))
    agent = agents.DQN(
        q_func, opt, rbuf, gpu=gpu, gamma=0.99, explorer=explorer,
        replay_start_size=50, target_update_interval=10 ** 4,
        clip_delta=True, update_interval=4,
        batch_accumulator='sum', phi=lambda x: x)
    model, exists = download_model(
        "DQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
    agent.load(model)
    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
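# Hypothetical rollout sketch (an assumption, not part of the original test):
# with the pretrained weights loaded, the agent can be evaluated greedily on a
# matching Atari env supplied by the caller. Note that this test's phi is the
# identity, so observations must already be preprocessed frame stacks.
def _example_rollout(agent, env):
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = agent.act(obs)  # greedy action from the loaded Q-function
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    agent.stop_episode()  # clear any per-episode state kept by the agent
    return total_reward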
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames', type=int,
                        default=10 ** 5,
                        help='Timesteps after which we stop '
                             'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--max-episode-len', type=int,
                        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--target-update-interval', type=int,
                        default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which '
                             'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)
    n_actions = env.action_space.n

    q_func = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue)

    # Use the same hyperparameters as the Nature paper.
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum', phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'],
            eval_stats['median'], eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
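# Entry-point guard, assuming this example is run as a standalone script (the
# imports and make_env used above are defined elsewhere in the file). An
# illustrative invocation with a hypothetical filename:
#
#     python train_dqn_example.py --gpu -1 --steps 100000 --eval-n-runs 5
if __name__ == '__main__':
    main()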
def __init__(self, alg, env, model_path):
    self.alg = alg
    seed = 0
    n_actions = gym.make(env).action_space.n
    gpus = [-1]
    gpu = None
    misc.set_random_seed(seed, gpus=gpus)

    if alg == "DQN-C":
        model = links.Sequence(
            links.NatureDQNHead(),
            L.Linear(512, n_actions),
            DiscreteActionValue)
    if alg == "PPO":
        winit_last = chainer.initializers.LeCunNormal(1e-2)
        model = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            L.Linear(None, 512),
            F.relu,
            links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            )
        )
    if alg == "C51":
        n_atoms = 51
        v_max = 10
        v_min = -10
        model = links.Sequence(
            links.NatureDQNHead(),
            DistributionalFCStateQFunctionWithDiscreteAction(
                None, n_actions, n_atoms, v_min, v_max,
                n_hidden_channels=0, n_hidden_layers=0),
        )
    if alg == "ACER":
        model = agents.acer.ACERSharedModel(
            shared=links.Sequence(
                links.NIPSDQNHead(),
                L.LSTM(256, 256)),
            pi=links.Sequence(
                L.Linear(256, n_actions),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(256, n_actions),
                DiscreteActionValue),
        )
    if alg == "A3C":
        model = A3CFF(n_actions)
    if alg == "Rainbow":
        n_atoms = 51
        v_max = 10
        v_min = -10
        model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
        links.to_factorized_noisy(model, sigma_scale=0.5)
    if alg == "IQN":
        model = agents.iqn.ImplicitQuantileQFunction(
            psi=chainerrl.links.Sequence(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                functools.partial(F.reshape, shape=(-1, 3136)),
            ),
            phi=chainerrl.links.Sequence(
                chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                F.relu,
            ),
            f=chainerrl.links.Sequence(
                L.Linear(None, 512),
                F.relu,
                L.Linear(None, n_actions),
            ),
        )

    if alg in ["A3C"]:
        fake_obs = chainer.Variable(
            np.zeros((4, 84, 84), dtype=np.float32)[None],
            name='observation')
        with chainerrl.recurrent.state_reset(model):
            # The state of the model is reset again after drawing the graph
            variables = misc.collect_variables([model(fake_obs)])
            chainer.computational_graph.build_computational_graph(variables)
    elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]:
        variables = misc.collect_variables(
            [model(np.zeros((4, 84, 84), dtype=np.float32)[None])])
        chainer.computational_graph.build_computational_graph(variables)
    else:
        fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
        fake_taus = np.zeros(32, dtype=np.float32)[None]
        variables = misc.collect_variables([model(fake_obs)(fake_taus)])

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    opt = optimizers.RMSpropGraves()
    opt.setup(model)
    rbuf = replay_buffer.ReplayBuffer(1)

    if alg == "IQN":
        self.agent = agents.IQN(
            model, opt, rbuf, gpu=gpu, gamma=0.99,
            act_deterministically=True, explorer=None,
            replay_start_size=1, minibatch_size=1,
            target_update_interval=None, clip_delta=True,
            update_interval=4, phi=phi)
    if alg == "A3C":
        self.agent = a3c.A3C(
            model, opt, t_max=5, gamma=0.99, phi=phi,
            act_deterministically=True)
    if alg == "Rainbow":
        self.agent = agents.CategoricalDoubleDQN(
            model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
            replay_start_size=1, minibatch_size=1,
            target_update_interval=None, clip_delta=True,
            update_interval=4, phi=phi)
    if alg == "DQN-C":
        self.agent = agents.DQN(
            model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
            replay_start_size=1, minibatch_size=1,
            target_update_interval=None, clip_delta=True,
            update_interval=4, phi=phi)
    if alg == "C51":
        self.agent = agents.CategoricalDQN(
            model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
            replay_start_size=1, minibatch_size=1,
            target_update_interval=None, clip_delta=True,
            update_interval=4, phi=phi,
        )
    if alg == "ACER":
        self.agent = agents.acer.ACER(
            model, opt, t_max=5, gamma=0.99, replay_buffer=rbuf,
            n_times_replay=4, replay_start_size=1,
            act_deterministically=True, phi=phi)
    if alg == "PPO":
        self.agent = agents.PPO(
            model, opt, gpu=gpu, phi=phi, update_interval=4,
            minibatch_size=1, clip_eps=0.1, recurrent=False,
            act_deterministically=True)

    self.agent.load(os.path.join(
        model_path, 'chainer', alg,
        env.replace("NoFrameskip-v4", ""), 'final'))
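# Hypothetical usage sketch (an assumption, not part of the original class):
# once __init__ has loaded the pretrained weights, a greedy action can be
# drawn from a preprocessed observation stack via the agent attribute, e.g.
#
#     loader = PretrainedAgentLoader("DQN-C", "BreakoutNoFrameskip-v4",
#                                    model_path)
#     action = loader.agent.act(obs)  # obs: stacked 84x84 Atari frames
#
# The class name PretrainedAgentLoader is illustrative; only self.agent and
# the constructor signature come from the code above.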
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames', type=int,
                        default=10 ** 5,
                        help='Timesteps after which we stop '
                             'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--arch', type=str, default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--max-episode-len', type=int,
                        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--target-update-interval', type=int,
                        default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which '
                             'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    def make_env(render=False, env_seed=0):
        join_tokens = marlo.make(
            "MarLo-FindTheGoal-v0",
            params=dict(
                allowContinuousMovement=["move", "turn"],
                videoResolution=[84, 84],
                kill_clients_after_num_rounds=500))
        env = marlo.init(join_tokens[0])
        obs = env.reset()
        if render:
            env.render(mode="rgb_array")
        action = env.action_space.sample()
        obs, r, done, info = env.step(action)
        env.seed(int(env_seed))
        return env

    env = make_env(render=args.render, env_seed=args.seed)
    n_actions = env.action_space.n

    q_func = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((3, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.out_dir, 'model'))

    # Use the same hyperparameters as the Nature paper.
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99, explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum', phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'],
            eval_stats['median'], eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
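# As in the previous example, an entry-point guard is assumed so the MarLo
# script can be run standalone; the filename in this illustrative invocation
# is hypothetical:
#
#     python train_dqn_marlo.py --gpu -1 --demo --load path/to/agent
if __name__ == '__main__':
    main()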