def parse_arch(arch, n_actions):
    """Build the Q-function network selected by ``arch``.

    Every convolutional head is created with ``n_input_channels=3``.

    Args:
        arch: Architecture name; one of ``'nature'``, ``'doubledqn'``,
            ``'nips'`` or ``'dueling'``.
        n_actions: Number of discrete actions (size of the output layer).

    Returns:
        The constructed model for the requested architecture.

    Raises:
        RuntimeError: If ``arch`` is not one of the supported names.
    """
    if arch == 'nature':
        head = links.NatureDQNHead(n_input_channels=3)
        output = L.Linear(512, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'doubledqn':

        class SingleSharedBias(chainer.Chain):
            """Single shared bias used in the Double DQN paper.

            You can add this link after a Linear layer with nobias=True to
            implement a Linear layer with a single shared bias parameter.

            See http://arxiv.org/abs/1509.06461.
            """

            def __init__(self):
                super().__init__()
                with self.init_scope():
                    self.bias = chainer.Parameter(0, shape=1)

            def __call__(self, x):
                # Broadcast the single scalar bias over the whole input.
                return x + F.broadcast_to(self.bias, x.shape)

        head = links.NatureDQNHead(n_input_channels=3)
        output = L.Linear(512, n_actions, nobias=True)
        return links.Sequence(
            head, output, SingleSharedBias(), DiscreteActionValue)

    if arch == 'nips':
        head = links.NIPSDQNHead(n_input_channels=3)
        output = L.Linear(256, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'dueling':
        return DuelingDQN(n_actions, n_input_channels=3)

    raise RuntimeError('Not supported architecture: {}'.format(arch))
def parse_arch(arch, n_actions):
    """Return the Q-function model matching the architecture name.

    Args:
        arch: Architecture name; one of ``'nature'``, ``'doubledqn'``,
            ``'nips'`` or ``'dueling'``.
        n_actions: Number of discrete actions (size of the output layer).

    Returns:
        The constructed model for the requested architecture.

    Raises:
        RuntimeError: If ``arch`` is not one of the supported names.
    """
    if arch == 'nature':
        head = links.NatureDQNHead()
        output = L.Linear(512, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'doubledqn':
        # Bias-free linear layer followed by one bias shared by all actions.
        head = links.NatureDQNHead()
        output = L.Linear(512, n_actions, nobias=True)
        return links.Sequence(
            head, output, SingleSharedBias(), DiscreteActionValue)

    if arch == 'nips':
        head = links.NIPSDQNHead()
        output = L.Linear(256, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'dueling':
        return DuelingDQN(n_actions)

    raise RuntimeError('Not supported architecture: {}'.format(arch))
def main():
    """Load a saved DQN agent and export its network as an ONNX file.

    Command-line arguments:
        --model: Directory the agent is loaded from (required).
        --out: Path of the ONNX file to write (required).
        --gpu: GPU id handed to the agent (default 0).

    The agent is rebuilt with fixed hyperparameters and four actions, its
    weights are loaded from ``--model``, and the model returned by
    ``convert_to_compatible_model`` is exported with a zero-filled dummy
    input of shape ``(1, 4, 84, 84)``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        help='Model directory path.')
    parser.add_argument('--out', type=str, required=True,
                        help='ONNX file output path.')
    parser.add_argument('--gpu', type=int, default=0, help='GPU id.')
    args = parser.parse_args()

    # Predefined parameters.
    n_actions = 4  # env.action_space.n
    replay_start_size = 5 * 10**4

    # Load the model.
    q_func = links.Sequence(links.NatureDQNHead(),
                            L.Linear(512, n_actions),
                            DiscreteActionValue)
    opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4, alpha=0.95,
                                           momentum=0.0, eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(10**6)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0, end_epsilon=0.1, decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer, replay_start_size=replay_start_size,
                  target_update_interval=10**4, clip_delta=True,
                  update_interval=4, batch_accumulator='sum', phi=phi)
    agent.load(args.model)

    # Extract core links from the model and export these links as an ONNX
    # format.
    onnx_compat_model = convert_to_compatible_model(agent)

    # Build the dummy input on the host and move it to the GPU only when a
    # GPU id was requested. A negative id is assumed to mean CPU execution
    # (the usual Chainer convention), in which case a CuPy array would not
    # match the device of the model.
    x = np.zeros((1, 4, 84, 84), dtype=np.float32)
    if args.gpu >= 0:
        x = cp.array(x)

    onnx_chainer.export(onnx_compat_model, x,
                        input_names='input', output_names='action',
                        return_named_inout=True, filename=args.out)
def parse_arch(arch, n_actions, activation):
    """Return the Q-function model matching the architecture name.

    Args:
        arch: Architecture name; one of ``'nature'``, ``'nips'`` or
            ``'dueling'``.
        n_actions: Number of discrete actions (size of the output layer).
        activation: Activation function forwarded to the ``'nature'`` and
            ``'nips'`` heads. It is not passed on for ``'dueling'``.

    Returns:
        The constructed model for the requested architecture.

    Raises:
        RuntimeError: If ``arch`` is not one of the supported names.
    """
    if arch == 'nature':
        head = links.NatureDQNHead(activation=activation)
        output = L.Linear(512, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'nips':
        head = links.NIPSDQNHead(activation=activation)
        output = L.Linear(256, n_actions)
        return links.Sequence(head, output, DiscreteActionValue)

    if arch == 'dueling':
        return DuelingDQN(n_actions)

    raise RuntimeError('Not supported architecture: {}'.format(arch))
def __init__(self, modelpath, n_actions=4, n_stack_frames=4):
    """Rebuild a CPU DQN agent and load its saved state.

    Args:
        modelpath: Path passed to the agent's ``load`` method.
        n_actions: Number of discrete actions of the Q-function output.
        n_stack_frames: Maximum length of the internal frame deque.
    """

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    # Load the model.
    q_network = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, n_actions),
        DiscreteActionValue,
    )
    optimizer = chainer.optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_network)
    buffer = replay_buffer.ReplayBuffer(10**6)
    epsilon_greedy = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions),
    )

    # Predefined parameters are passed inline; gpu=-1 is hard-coded here.
    self._agent = agents.DQN(
        q_network,
        optimizer,
        buffer,
        gpu=-1,
        gamma=0.99,
        explorer=epsilon_greedy,
        replay_start_size=5 * 10**4,
        target_update_interval=10**4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=phi,
    )
    self._agent.load(modelpath)

    # Bounded deque of frames and the last chosen action.
    self._state = deque([], maxlen=n_stack_frames)
    self._action = 0
def _test_load_dqn(self, gpu):
    """Download the pretrained Breakout DQN model and load it into an agent.

    Args:
        gpu: GPU id handed to the agent constructor.

    When the ``CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED`` environment
    variable is set to a non-empty value, also asserts the ``exists`` flag
    returned by ``download_model``.
    """
    n_actions = 4
    q_network = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, n_actions),
        DiscreteActionValue,
    )
    optimizer = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_network)
    buffer = replay_buffer.ReplayBuffer(100)
    epsilon_greedy = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(4),
    )

    agent = agents.DQN(
        q_network,
        optimizer,
        buffer,
        gpu=gpu,
        gamma=0.99,
        explorer=epsilon_greedy,
        replay_start_size=50,
        target_update_interval=10**4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=lambda x: x,
    )

    model, exists = download_model(
        "DQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
    agent.load(model)

    if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
        assert exists
def main():
    """Train a DQN agent on an Atari env, or only evaluate it with --demo.

    In training mode the best-so-far agent is saved during training, then
    reloaded from ``<outdir>/best`` and evaluated for ``--n-best-episodes``
    episodes; the resulting statistics are written to
    ``<outdir>/bestscores.json`` and printed.
    """
    import logging

    cli = argparse.ArgumentParser()
    cli.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                     help='OpenAI Atari domain to perform algorithm on.')
    cli.add_argument('--outdir', type=str, default='results',
                     help='Directory path to save output files.'
                          ' If it does not exist, it will be created.')
    cli.add_argument('--seed', type=int, default=0,
                     help='Random seed [0, 2 ** 31)')
    cli.add_argument('--gpu', type=int, default=0,
                     help='GPU to use, set to -1 if no GPU.')
    cli.add_argument('--demo', action='store_true', default=False)
    cli.add_argument('--load', type=str, default=None)
    cli.add_argument('--logging-level', type=int, default=20,
                     help='Logging level. 10:DEBUG, 20:INFO etc.')
    cli.add_argument('--render', action='store_true', default=False,
                     help='Render env states in a GUI window.')
    cli.add_argument('--monitor', action='store_true', default=False,
                     help='Monitor env. Videos and additional information'
                          ' are saved as output files.')
    cli.add_argument('--steps', type=int, default=5 * 10**7,
                     help='Total number of timesteps to train the agent.')
    cli.add_argument('--replay-start-size', type=int, default=5 * 10**4,
                     help='Minimum replay buffer size before '
                          'performing gradient updates.')
    cli.add_argument('--eval-n-steps', type=int, default=125000)
    cli.add_argument('--eval-interval', type=int, default=250000)
    cli.add_argument('--n-best-episodes', type=int, default=30)
    args = cli.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Create a wrapped Atari env for training or for evaluation."""
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        raw_env = atari_wrappers.make_atari(args.env, max_frames=None)
        wrapped = atari_wrappers.wrap_deepmind(
            raw_env, episode_life=not test, clip_rewards=not test)
        wrapped.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            wrapped = chainerrl.wrappers.RandomizeAction(wrapped, 0.05)
        if args.monitor:
            monitor_mode = 'evaluation' if test else 'training'
            wrapped = chainerrl.wrappers.Monitor(
                wrapped, args.outdir, mode=monitor_mode)
        if args.render:
            wrapped = chainerrl.wrappers.Render(wrapped)
        return wrapped

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, n_actions),
        DiscreteActionValue,
    )

    # Draw the computational graph and save it in the output directory.
    dummy_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions),
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=10**4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator='sum',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent,
            n_steps=args.eval_n_steps, n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'],
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev']))
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        eval_n_steps=args.eval_n_steps,
        eval_n_episodes=None,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        save_best_so_far_agent=True,
        eval_env=eval_env,
    )

    # Reload the best network saved during training before the final
    # evaluation.
    best_dir = os.path.join(args.outdir, "best")
    agent.load(best_dir)

    # run 30 evaluation episodes, each capped at 5 mins of play
    stats = experiments.evaluator.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.n_best_episodes,
        max_episode_len=4500,
        logger=None)

    scores_path = os.path.join(args.outdir, 'bestscores.json')
    with open(scores_path, 'w') as f:
        json.dump(stats, f)

    print("The results of the best scoring network:")
    for stat in stats:
        print(str(stat) + ":" + str(stats[stat]))
def main():
    """Train a DQN agent on an Atari env, or only evaluate it with --demo.

    Hyperparameters come from the command line. When ``--noisy-net-sigma``
    is given, the Q-function is converted to factorized noisy layers with
    that sigma scale and epsilon-greedy exploration is replaced by a greedy
    explorer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    # NOTE(review): --arch is parsed but never read below; q_func is always
    # built from NatureDQNHead. Confirm whether it should select the network.
    parser.add_argument('--arch', type=str, default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps', type=int, default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames', type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta', action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(),
                            L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Default exploration schedule. It is created before the noisy-net
    # check so that the greedy explorer selected there is not overwritten.
    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    if args.noisy_net_sigma is not None:
        # Forward the requested sigma so the command-line value is used.
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    # NOTE(review): the two calls below use different keyword conventions
    # (n_runs vs. eval_n_episodes); confirm both match the installed
    # ChainerRL version.
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
def main():
    """Train a DQN agent on the env from ``make_env``, or evaluate with --demo.

    Observations are transposed with ``(2, 0, 1)`` and divided by 255
    before being fed to the 3-channel Nature DQN head. The same env is used
    for training and evaluation.
    """
    import logging

    cli = argparse.ArgumentParser()
    cli.add_argument('--out_dir', type=str, default='results',
                     help='Directory path to save output files.'
                          ' If it does not exist, it will be created.')
    cli.add_argument('--seed', type=int, default=0,
                     help='Random seed [0, 2 ** 31)')
    cli.add_argument('--gpu', type=int, default=0,
                     help='GPU to use, set to -1 if no GPU.')
    cli.add_argument('--demo', action='store_true', default=False)
    cli.add_argument('--load', type=str, default=None)
    cli.add_argument('--final-exploration-frames', type=int, default=10 ** 5,
                     help='Timesteps after which we stop '
                          'annealing exploration rate')
    cli.add_argument('--final-epsilon', type=float, default=0.1,
                     help='Final value of epsilon during training.')
    cli.add_argument('--eval-epsilon', type=float, default=0.05,
                     help='Exploration epsilon used during eval episodes.')
    cli.add_argument('--steps', type=int, default=10 ** 6,
                     help='Total number of timesteps to train the agent.')
    cli.add_argument('--max-episode-len', type=int,
                     default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                     help='Maximum number of timesteps for each episode.')
    cli.add_argument('--replay-start-size', type=int, default=1000,
                     help='Minimum replay buffer size before '
                          'performing gradient updates.')
    cli.add_argument('--target-update-interval', type=int,
                     default=1 * 10 ** 4,
                     help='Frequency (in timesteps) at which '
                          'the target network is updated.')
    cli.add_argument('--eval-interval', type=int, default=10 ** 5,
                     help='Frequency (in timesteps) of evaluation phase.')
    cli.add_argument('--update-interval', type=int, default=4,
                     help='Frequency (in timesteps) of network updates.')
    cli.add_argument('--eval-n-runs', type=int, default=10)
    cli.add_argument('--logging-level', type=int, default=20,
                     help='Logging level. 10:DEBUG, 20:INFO etc.')
    cli.add_argument('--lr', type=float, default=2.5e-4,
                     help='Learning rate.')
    args = cli.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)

    n_actions = env.action_space.n

    head = links.NatureDQNHead(n_input_channels=3)
    output_layer = L.Linear(512, n_actions)
    q_func = links.Sequence(head, output_layer, DiscreteActionValue)

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x.transpose(2, 0, 1), dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs,
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev']))
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.out_dir,
        save_best_so_far_agent=False,
        max_episode_len=args.max_episode_len,
        eval_env=env,
    )
def main():
    """Load a trained DQN agent and collect demonstrations from it.

    The agent given by the required ``--load`` argument acts in an Atari env
    whose actions are randomized with probability 0.01, for ``--steps``
    timesteps. Demonstrations are written under ``--outdir`` by
    ``experiments.collect_demonstrations``.
    """
    import logging

    cli = argparse.ArgumentParser()
    cli.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                     help='OpenAI Atari domain to perform algorithm on.')
    cli.add_argument('--outdir', type=str, default='results',
                     help='Directory path to save output files.'
                          ' If it does not exist, it will be created.')
    cli.add_argument('--seed', type=int, default=0,
                     help='Random seed [0, 2 ** 31)')
    cli.add_argument('--gpu', type=int, default=0,
                     help='GPU to use, set to -1 if no GPU.')
    cli.add_argument('--load', type=str, default=None, required=True)
    cli.add_argument('--logging-level', type=int, default=20,
                     help='Logging level. 10:DEBUG, 20:INFO etc.')
    cli.add_argument('--render', action='store_true', default=False,
                     help='Render env states in a GUI window.')
    cli.add_argument('--monitor', action='store_true', default=False,
                     help='Monitor env. Videos and additional information'
                          ' are saved as output files.')
    cli.add_argument('--steps', type=int, default=5 * 10**7,
                     help='Total number of demo timesteps to collect')
    args = cli.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env():
        """Create the wrapped, seeded Atari env used for collection."""
        raw_env = atari_wrappers.make_atari(args.env, max_frames=None)
        wrapped = atari_wrappers.wrap_deepmind(
            raw_env, episode_life=False, clip_rewards=False)
        wrapped.seed(int(args.seed))
        # Randomize actions like epsilon-greedy
        wrapped = chainerrl.wrappers.RandomizeAction(wrapped, 0.01)
        if args.monitor:
            wrapped = chainerrl.wrappers.Monitor(
                wrapped, args.outdir, mode='evaluation')
        if args.render:
            wrapped = chainerrl.wrappers.Render(wrapped)
        return wrapped

    env = make_env()

    n_actions = env.action_space.n
    q_func = links.Sequence(
        links.NatureDQNHead(),
        L.Linear(512, n_actions),
        DiscreteActionValue,
    )

    # Draw the computational graph and save it in the output directory.
    dummy_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    # The optimizer and replay buffer are dummy variables required by agent
    opt = optimizers.RMSpropGraves()
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(1)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=None,
        replay_start_size=1,
        minibatch_size=1,
        target_update_interval=None,
        clip_delta=True,
        update_interval=4,
        phi=phi,
    )

    agent.load(args.load)

    # saves demos to outdir/demos.pickle
    experiments.collect_demonstrations(
        agent=agent,
        env=env,
        steps=args.steps,
        episodes=None,
        outdir=args.outdir,
        max_episode_len=None)
def __init__(self, alg, env, model_path):
    """Build the model and agent for ``alg`` and load its saved weights.

    Args:
        alg: Algorithm name. Models are defined for ``"DQN-C"``, ``"PPO"``,
            ``"C51"``, ``"ACER"``, ``"A3C"``, ``"Rainbow"`` and ``"IQN"``.
        env: Gym environment id. It is used to read the number of actions
            and, with ``"NoFrameskip-v4"`` removed, as part of the load path.
        model_path: Root directory; the agent is loaded from
            ``<model_path>/chainer/<alg>/<env name>/final``.
    """
    self.alg = alg

    n_actions = gym.make(env).action_space.n
    device = None
    misc.set_random_seed(0, gpus=[-1])

    # --- Model definition -------------------------------------------------
    if alg == "DQN-C":
        model = links.Sequence(
            links.NatureDQNHead(),
            L.Linear(512, n_actions),
            DiscreteActionValue)
    elif alg == "PPO":
        winit_last = chainer.initializers.LeCunNormal(1e-2)
        model = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            L.Linear(None, 512),
            F.relu,
            links.Branched(
                chainer.Sequential(
                    L.Linear(None, n_actions, initialW=winit_last),
                    SoftmaxDistribution,
                ),
                L.Linear(None, 1),
            )
        )
    elif alg == "C51":
        n_atoms = 51
        v_max = 10
        v_min = -10
        model = links.Sequence(
            links.NatureDQNHead(),
            DistributionalFCStateQFunctionWithDiscreteAction(
                None, n_actions, n_atoms, v_min, v_max,
                n_hidden_channels=0, n_hidden_layers=0),
        )
    elif alg == "ACER":
        model = agents.acer.ACERSharedModel(
            shared=links.Sequence(
                links.NIPSDQNHead(),
                L.LSTM(256, 256)),
            pi=links.Sequence(
                L.Linear(256, n_actions),
                SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(256, n_actions),
                DiscreteActionValue),
        )
    elif alg == "A3C":
        model = A3CFF(n_actions)
    elif alg == "Rainbow":
        n_atoms = 51
        v_max = 10
        v_min = -10
        model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
        links.to_factorized_noisy(model, sigma_scale=0.5)
    elif alg == "IQN":
        model = agents.iqn.ImplicitQuantileQFunction(
            psi=chainerrl.links.Sequence(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                functools.partial(F.reshape, shape=(-1, 3136)),
            ),
            phi=chainerrl.links.Sequence(
                chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                F.relu,
            ),
            f=chainerrl.links.Sequence(
                L.Linear(None, 512),
                F.relu,
                L.Linear(None, n_actions),
            ),
        )

    # --- Forward pass on a zero observation -------------------------------
    # Each branch calls the model once on a (1, 4, 84, 84) zero input.
    if alg in ["A3C"]:
        fake_obs = chainer.Variable(
            np.zeros((4, 84, 84), dtype=np.float32)[None],
            name='observation')
        with chainerrl.recurrent.state_reset(model):
            # The state of the model is reset again after drawing the graph
            variables = misc.collect_variables([model(fake_obs)])
            chainer.computational_graph.build_computational_graph(variables)
    elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]:
        zero_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
        variables = misc.collect_variables([model(zero_obs)])
        chainer.computational_graph.build_computational_graph(variables)
    else:
        fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
        fake_taus = np.zeros(32, dtype=np.float32)[None]
        variables = misc.collect_variables([model(fake_obs)(fake_taus)])

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    opt = optimizers.RMSpropGraves()
    opt.setup(model)
    rbuf = replay_buffer.ReplayBuffer(1)

    # Keyword arguments shared by the DQN-family agents below.
    dqn_family_kwargs = dict(
        gpu=device,
        gamma=0.99,
        explorer=None,
        replay_start_size=1,
        minibatch_size=1,
        target_update_interval=None,
        clip_delta=True,
        update_interval=4,
        phi=phi,
    )

    # --- Agent construction -----------------------------------------------
    if alg == "IQN":
        self.agent = agents.IQN(
            model, opt, rbuf,
            act_deterministically=True,
            **dqn_family_kwargs)
    elif alg == "A3C":
        self.agent = a3c.A3C(
            model, opt, t_max=5, gamma=0.99,
            phi=phi, act_deterministically=True)
    elif alg == "Rainbow":
        self.agent = agents.CategoricalDoubleDQN(
            model, opt, rbuf, **dqn_family_kwargs)
    elif alg == "DQN-C":
        self.agent = agents.DQN(model, opt, rbuf, **dqn_family_kwargs)
    elif alg == "C51":
        self.agent = agents.CategoricalDQN(
            model, opt, rbuf, **dqn_family_kwargs)
    elif alg == "ACER":
        self.agent = agents.acer.ACER(
            model, opt, t_max=5, gamma=0.99,
            replay_buffer=rbuf,
            n_times_replay=4,
            replay_start_size=1,
            act_deterministically=True,
            phi=phi)
    elif alg == "PPO":
        self.agent = agents.PPO(
            model, opt,
            gpu=device,
            phi=phi,
            update_interval=4,
            minibatch_size=1,
            clip_eps=0.1,
            recurrent=False,
            act_deterministically=True)

    env_name = env.replace("NoFrameskip-v4", "")
    self.agent.load(
        os.path.join(model_path, 'chainer', alg, env_name, 'final'))
def main():
    """Train a DQN agent on the MarLo FindTheGoal env, or evaluate with --demo.

    Observations are transposed with ``(2, 0, 1)`` and divided by 255
    before being fed to the 3-channel Nature DQN head. The same env is used
    for training and evaluation.
    """
    import logging

    cli = argparse.ArgumentParser()
    cli.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4',
                     help='OpenAI Atari domain to perform algorithm on.')
    cli.add_argument('--out_dir', type=str, default='results',
                     help='Directory path to save output files.'
                          ' If it does not exist, it will be created.')
    cli.add_argument('--seed', type=int, default=0,
                     help='Random seed [0, 2 ** 31)')
    cli.add_argument('--gpu', type=int, default=0,
                     help='GPU to use, set to -1 if no GPU.')
    cli.add_argument('--demo', action='store_true', default=False)
    cli.add_argument('--load', type=str, default=None)
    cli.add_argument('--final-exploration-frames', type=int, default=10**5,
                     help='Timesteps after which we stop '
                          'annealing exploration rate')
    cli.add_argument('--final-epsilon', type=float, default=0.1,
                     help='Final value of epsilon during training.')
    cli.add_argument('--eval-epsilon', type=float, default=0.05,
                     help='Exploration epsilon used during eval episodes.')
    cli.add_argument('--arch', type=str, default='doubledqn',
                     choices=['nature', 'nips', 'dueling', 'doubledqn'],
                     help='Network architecture to use.')
    cli.add_argument('--steps', type=int, default=10**6,
                     help='Total number of timesteps to train the agent.')
    cli.add_argument(
        '--max-episode-len', type=int,
        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
        help='Maximum number of timesteps for each episode.')
    cli.add_argument('--replay-start-size', type=int, default=1000,
                     help='Minimum replay buffer size before '
                          'performing gradient updates.')
    cli.add_argument('--target-update-interval', type=int,
                     default=1 * 10**4,
                     help='Frequency (in timesteps) at which '
                          'the target network is updated.')
    cli.add_argument('--eval-interval', type=int, default=10**5,
                     help='Frequency (in timesteps) of evaluation phase.')
    cli.add_argument('--update-interval', type=int, default=4,
                     help='Frequency (in timesteps) of network updates.')
    cli.add_argument('--eval-n-runs', type=int, default=100)
    cli.add_argument('--logging-level', type=int, default=20,
                     help='Logging level. 10:DEBUG, 20:INFO etc.')
    cli.add_argument('--render', action='store_true', default=False,
                     help='Render env states in a GUI window.')
    cli.add_argument('--lr', type=float, default=2.5e-4,
                     help='Learning rate.')
    args = cli.parse_args()

    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    def make_env(render=False, env_seed=0):
        """Create the MarLo env, take one random step, then seed it."""
        mission_params = dict(
            allowContinuousMovement=["move", "turn"],
            videoResolution=[84, 84],
            kill_clients_after_num_rounds=500)
        join_tokens = marlo.make("MarLo-FindTheGoal-v0",
                                 params=mission_params)
        env = marlo.init(join_tokens[0])

        obs = env.reset()
        if render:
            env.render(mode="rgb_array")
        action = env.action_space.sample()
        obs, r, done, info = env.step(action)
        env.seed(int(env_seed))
        return env

    env = make_env(render=args.render, env_seed=args.seed)

    n_actions = env.action_space.n

    head = links.NatureDQNHead(n_input_channels=3)
    output_layer = L.Linear(512, n_actions)
    q_func = links.Sequence(head, output_layer, DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    dummy_obs = np.zeros((3, 84, 84), dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.out_dir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x.transpose(2, 0, 1), dtype=np.float32) / 255

    agent = agents.DQN(
        q_func, opt, rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs,
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev']))
        return

    experiments.train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.out_dir,
        save_best_so_far_agent=False,
        max_episode_len=args.max_episode_len,
        eval_env=env,
    )