def test_append_and_sample(self):
    rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
        capacity=self.capacity,
        normalize_by_max=self.normalize_by_max,
        default_priority_func=self.default_priority_func,
        uniform_ratio=self.uniform_ratio,
        wait_priority_after_sampling=self.wait_priority_after_sampling,
        return_sample_weights=self.return_sample_weights)

    # Add 9 episodes of lengths 10, 15 and 5, i.e. 90 transitions in total
    for n in [10, 15, 5] * 3:
        transs = [dict(state=i,
                       action=100 + i,
                       reward=200 + i,
                       next_state=i + 1,
                       next_action=101 + i,
                       is_state_terminal=(i == n - 1))
                  for i in range(n)]
        for trans in transs:
            rbuf.append(**trans)

    self.assertEqual(len(rbuf), 90)
    self.assertEqual(rbuf.n_episodes, 9)

    # Sample individual transitions
    for k in [10, 30, 90]:
        s = rbuf.sample(k)
        self.assertEqual(len(s), k)

    # Sample whole episodes
    for k in [1, 3, 9]:
        ret = rbuf.sample_episodes(k)
        if self.return_sample_weights:
            s, wt = ret
            self.assertEqual(len(s), k)
            self.assertEqual(len(wt), k)
        else:
            s = ret
            self.assertEqual(len(s), k)
        if self.wait_priority_after_sampling:
            rbuf.update_errors([1.0] * k)

        # Sample episodes truncated to at most 10 transitions
        ret = rbuf.sample_episodes(k, max_len=10)
        if self.return_sample_weights:
            s, wt = ret
            self.assertEqual(len(s), k)
            self.assertEqual(len(wt), k)
        else:
            s = ret
        if self.wait_priority_after_sampling:
            rbuf.update_errors([1.0] * k)
        for ep in s:
            self.assertLessEqual(len(ep), 10)
            # Each sampled episode must consist of consecutive transitions
            for t0, t1 in zip(ep, ep[1:]):
                self.assertEqual(t0['next_state'], t1['state'])
                self.assertEqual(t0['next_action'], t1['action'])
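# A minimal standalone sketch of the buffer API exercised by the test above.
# Everything here follows from that test; the capacity and episode contents
# are made-up illustration values.
from chainerrl import replay_buffer

rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
    capacity=100,
    wait_priority_after_sampling=True,
    return_sample_weights=True)

# Append one five-step episode; is_state_terminal marks the episode end.
for i in range(5):
    rbuf.append(state=i, action=0, reward=1.0, next_state=i + 1,
                next_action=0, is_state_terminal=(i == 4))

# Sample an episode (truncated to 4 transitions) together with its
# importance weight, then report errors back so priorities are updated;
# with wait_priority_after_sampling=True this call is mandatory before
# the next sample.
episodes, weights = rbuf.sample_episodes(1, max_len=4)
rbuf.update_errors([1.0] * len(episodes))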
def create_value_based_learner(cfg_name):
    """Create a learner that can be used with value-based algorithms from chainerrl.

    :param cfg_name: str, the name of the config
    :return: chainerrl agent specified in the config
    """
    vb_config = Config(cfg_name)

    network = getattr(models, vb_config.get_str('BASIC', 'network'))(
        **vb_config.get_section('NETWORK'))
    q_func = q_functions.SingleModelStateQFunctionWithDiscreteAction(
        model=network)

    opt = getattr(optimizers, vb_config.get_str('BASIC', 'optimizer'))(
        **vb_config.get_section('OPTIMIZER'))
    opt.setup(q_func)
    opt.add_hook(optimizer.GradientClipping(
        threshold=vb_config.get_float('BASIC', 'grad_clip')))

    rep_buf = replay_buffer.PrioritizedEpisodicReplayBuffer(
        capacity=vb_config.get_int('MEMORY_BUFFER', 'episodic_buffer_size'),
        wait_priority_after_sampling=vb_config.get_bool(
            'MEMORY_BUFFER', 'wait_priority_after_sampling'))

    # np.random.randint's upper bound is exclusive, so this draws actions
    # uniformly from [0, output_dim - 1] (replaces the deprecated
    # np.random.random_integers).
    explorer = explorers.LinearDecayEpsilonGreedy(
        random_action_func=lambda: np.random.randint(
            0, vb_config.get_int('NETWORK', 'output_dim')),
        **vb_config.get_section('EXPLORER'))

    try:
        learner = getattr(agents, vb_config.get_str('BASIC', 'learner'))(
            q_function=q_func,
            optimizer=opt,
            replay_buffer=rep_buf,
            phi=lambda x: x,
            explorer=explorer,
            **vb_config.get_section('ALGORITHM'))
        if vb_config.get_str('BASIC', 'load_path'):
            learner.load(os.path.join(
                get_results_path(), vb_config.get_str('BASIC', 'load_path')))
    except AttributeError as e:
        logger.log(msg='Cannot find model {} in chainerrl.agents'.format(
            vb_config.get_str('BASIC', 'learner')), level=logging.ERROR)
        raise e

    logger.log(msg='Created learner {}'.format(learner.__class__.__name__),
               level=logging.INFO)
    logger.log(msg='Model parameters {}'.format(' '.join(
        [name + ':' + str(value) for name, value
         in vb_config.get_section('EXPERIMENT').items()])),
        level=logging.INFO)
    logger.log(msg='Explorer parameters {}'.format(' '.join(
        [name + ':' + str(value) for name, value
         in vb_config.get_section('EXPLORER').items()])),
        level=logging.INFO)
    return learner
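# Hypothetical call to the factory above; the config name is an assumption,
# not part of the original code.
# learner = create_value_based_learner('dqn_config')
# learner.act(observation)  # chainerrl agents expose act()/act_and_train()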
def test(self):
    if self.replay_buffer_type == 'EpisodicReplayBuffer':
        rbuf = replay_buffer.EpisodicReplayBuffer(capacity=None)
    elif self.replay_buffer_type == 'PrioritizedEpisodicReplayBuffer':
        rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(capacity=None)
    else:
        assert False

    # 2 transitions for env_id=0
    for _ in range(2):
        trans1 = dict(state=0, action=1, reward=2, next_state=3,
                      next_action=4, is_state_terminal=False)
        rbuf.append(env_id=0, **trans1)

    # 4 transitions for env_id=1 with a terminal state
    for i in range(4):
        trans1 = dict(state=0, action=1, reward=2, next_state=3,
                      next_action=4, is_state_terminal=(i == 3))
        rbuf.append(env_id=1, **trans1)

    # 9 transitions for env_id=2
    for _ in range(9):
        trans1 = dict(state=0, action=1, reward=2, next_state=3,
                      next_action=4, is_state_terminal=False)
        rbuf.append(env_id=2, **trans1)

    # It should have 4 transitions from env_id=1
    self.assertEqual(len(rbuf), 4)

    # env_id=0 episode ends
    rbuf.stop_current_episode(env_id=0)

    # Now it should have 4 + 2 = 6 transitions
    self.assertEqual(len(rbuf), 6)

    # env_id=2 episode ends
    rbuf.stop_current_episode(env_id=2)

    # Finally it should have 4 + 2 + 9 = 15 transitions
    self.assertEqual(len(rbuf), 15)
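# A condensed sketch of the behaviour the test above asserts: transitions
# appended under an env_id only count toward len(rbuf) once their episode
# is closed, either by a terminal transition or by stop_current_episode().
from chainerrl import replay_buffer

rbuf = replay_buffer.EpisodicReplayBuffer(capacity=None)
rbuf.append(env_id=0, state=0, action=1, reward=2, next_state=3,
            next_action=4, is_state_terminal=False)
assert len(rbuf) == 0  # the env_id=0 episode is still open
rbuf.stop_current_episode(env_id=0)
assert len(rbuf) == 1  # now flushed into the buffer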
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=1000)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**8)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=50)
    parser.add_argument('--target-update-interval', type=int, default=100)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--n-hidden-channels', type=int, default=12)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = experiments.prepare_output_dir(args, args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    # Support of the categorical value distribution
    n_atoms = 51
    v_max = 500
    v_min = 0

    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size, n_actions, n_atoms, v_min, v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.CategoricalDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit)
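# Hypothetical invocation of the script above (the file name is an
# assumption; the flags are the ones defined by its argument parser):
#   python train_categorical_dqn_gym.py --env CartPole-v1 \
#       --episodic-replay --prioritized-replay
# With --episodic-replay, CategoricalDQN updates on whole episodes
# (truncated to episodic_update_len=16 transitions) drawn from the
# PrioritizedEpisodicReplayBuffer instead of on independent transitions.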
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='dqn_out')
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args, args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(for_eval):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not for_eval:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and for_eval) or
                (args.render_train and not for_eval)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(for_eval=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps, action_space.sample)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
                explorer=explorer, replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                phi=phi, minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(for_eval=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps', type=int, default=10 ** 4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=50000)
    parser.add_argument('--prioritized-replay', action='store_true',
                        default=False)
    parser.add_argument('--episodic-replay', action='store_true',
                        default=False)
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10 ** 2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=50)
    parser.add_argument('--eval-interval', type=int, default=10 ** 3)
    parser.add_argument('--n-hidden-channels', type=int, default=512)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true', default=True)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        ENV_NAME = 'malware-test-v0' if test else 'malware-v0'
        env = gym.make(ENV_NAME)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # if not test:
        #     misc.env_modifiers.make_reward_filtered(
        #         env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    timestep_limit = 80
    obs_space = env.observation_space
    obs_size = obs_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    # Build the dummy input from obs_space.low: the Space object itself is
    # not an array, so np.zeros_like must be given one of its bounds.
    if args.gpu < 0:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                      explorer=explorer,
                      replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      phi=phi, minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      episodic_update=args.episodic_replay,
                      episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        q_hook = PlotHook('Average Q Value')
        loss_hook = PlotHook('Average Loss', plot_index=1)

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit,
            step_hooks=[q_hook, loss_hook],
            successful_score=7)
def create_ddqn_agent(env, args):
    obs_size = env.observation_space.shape[0]
    action_space = env.action_space
    n_actions = action_space.n

    # q_func = q_functions.FCStateQFunctionWithDiscreteAction(
    #     obs_size, n_actions,
    #     n_hidden_channels=args.n_hidden_channels,
    #     n_hidden_layers=args.n_hidden_layers)
    q_func = QFunction(obs_size, n_actions)
    # args.gpu is a device id, so compare it against 0 rather than testing
    # truthiness (device 0 is falsy and the CPU sentinel -1 is truthy).
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Draw the computational graph and save it in the output directory
    # (CPU only). The dummy input is built from the space's lower bound,
    # since the Space object itself is not an array.
    if not args.test and args.gpu < 0:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(env.observation_space.low,
                                  dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)
    # explorer = explorers.Boltzmann()
    # explorer = explorers.ConstantEpsilonGreedy(
    #     epsilon=0.3, random_action_func=env.action_space.sample)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 3
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    # Chainer only accepts numpy.float32 by default, so use a converter
    # as the feature extractor function phi.
    phi = lambda x: x.astype(np.float32, copy=False)

    agent = chainerrl.agents.DoubleDQN(
        q_func, opt, rbuf, gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        phi=phi, minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay, episodic_update_len=16)

    return agent
def main():
    import logging
    logging.basicConfig(level=logging.WARNING)

    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args, args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    option2id, all_guesses = load_quizbowl()
    train_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD], option2id,
                                  batch_size=1, make_vector=dense_vector)

    env = BuzzingGame(train_iter)

    timestep_limit = 300
    obs_size = env.observation_size
    action_space = env.action_space
    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.replay_start_size is None:
            args.replay_start_size = 10
        if args.prioritized_replay:
            betasteps = ((args.steps
                          - timestep_limit * args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.replay_start_size is None:
            args.replay_start_size = 1000
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
                explorer=explorer, replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                phi=phi, minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = BuzzingGame(train_iter)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit)

    serializers.save_npz('dqn.npz', q_func)

    dev_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD], option2id,
                                batch_size=128, make_vector=dense_vector)
    dev_buzzes = get_buzzes(q_func, dev_iter)
    dev_buzzes_dir = 'output/buzzer/rl/dev_buzzes.pkl'
    with open(dev_buzzes_dir, 'wb') as f:
        pickle.dump(dev_buzzes, f)
    print('Dev buzz {} saved to {}'.format(len(dev_buzzes), dev_buzzes_dir))

    report(dev_buzzes_dir)
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str,
                        default='MsPacman-ramNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=5 * 10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--eval-interval', type=int, default=5 * 10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    # Always use prioritized replay, regardless of the command line flag
    args.prioritized_replay = True

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # args.outdir = experiments.prepare_output_dir(
    #     args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon,
            args.final_exploration_steps, action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-3)
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.DoubleDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        env.next_agent_num = 1

        # Save a snapshot of the agent every 5 * 10**5 steps
        def hook(env, agent, step):
            if step > env.next_agent_num * 5 * 10**5:
                agent.save('results/agent_' + str(env.next_agent_num))
                env.next_agent_num += 1

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None, eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval, outdir=args.outdir,
            eval_env=eval_env, train_max_episode_len=timestep_limit,
            step_hooks=(hook,))