def train_agent_with_evaluation(agent, env, steps, eval_n_runs,
                                eval_frequency, outdir,
                                max_episode_len=None, step_offset=0,
                                eval_explorer=None,
                                eval_max_episode_len=None, eval_env=None,
                                successful_score=None, render=False,
                                logger=None):
    """Train a DQN-like agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_runs (int): Number of runs for each time of evaluation.
        eval_frequency (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_explorer: Explorer used for evaluation.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value if not None.
        render (bool): Render environment states in a GUI window.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_frequency=eval_frequency,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(agent, env, steps, outdir,
                max_episode_len=max_episode_len,
                step_offset=step_offset,
                evaluator=evaluator,
                successful_score=successful_score,
                logger=logger)
def train_agent_with_evaluation(agent, env, steps, eval_n_runs,
                                eval_interval, outdir,
                                max_episode_len=None, step_offset=0,
                                eval_explorer=None,
                                eval_max_episode_len=None, eval_env=None,
                                successful_score=None, step_hooks=(),
                                logger=None):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_runs (int): Number of runs for each time of evaluation.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_explorer: Explorer used for evaluation.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_interval=eval_interval,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(agent, env, steps, outdir,
                max_episode_len=max_episode_len,
                step_offset=step_offset,
                evaluator=evaluator,
                successful_score=successful_score,
                step_hooks=step_hooks,
                logger=logger)
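# --- Usage sketch (not part of the original source) ------------------------
# A minimal, hedged example of calling train_agent_with_evaluation above,
# following the standard ChainerRL quickstart-style DQN setup on CartPole.
# All hyperparameter values are illustrative placeholders, not recommendations.
def _example_train_with_evaluation():
    import chainer
    import chainerrl
    import gym
    import numpy as np

    env = gym.make('CartPole-v0')
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions, n_hidden_layers=2, n_hidden_channels=50)
    optimizer = chainer.optimizers.Adam(eps=1e-2)
    optimizer.setup(q_func)
    explorer = chainerrl.explorers.ConstantEpsilonGreedy(
        epsilon=0.3, random_action_func=env.action_space.sample)
    replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 5)
    agent = chainerrl.agents.DQN(
        q_func, optimizer, replay_buffer, gamma=0.99, explorer=explorer,
        replay_start_size=500, update_interval=1,
        target_update_interval=100,
        phi=lambda x: x.astype(np.float32, copy=False))

    train_agent_with_evaluation(
        agent=agent,
        env=env,
        steps=20000,              # total training timesteps
        eval_n_runs=10,           # evaluation episodes per phase
        eval_interval=1000,       # evaluate every 1000 steps
        outdir='results_example',
        max_episode_len=200)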
def train_agent_batch_with_evaluation(agent,
                                      env,
                                      steps,
                                      eval_n_steps,
                                      eval_n_episodes,
                                      eval_interval,
                                      outdir,
                                      max_episode_len=None,
                                      step_offset=0,
                                      eval_max_episode_len=None,
                                      return_window_size=100,
                                      eval_env=None,
                                      log_interval=None,
                                      successful_score=None,
                                      step_hooks=(),
                                      save_best_so_far_agent=True,
                                      logger=None,
                                      ):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to
            estimate the average returns of the current agent.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation,
            if the score (= mean return of evaluation episodes) exceeds
            the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(
        agent=agent,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        save_best_so_far_agent=save_best_so_far_agent,
        logger=logger,
    )

    train_agent_batch(agent, env, steps, outdir,
                      max_episode_len=max_episode_len,
                      step_offset=step_offset,
                      eval_interval=eval_interval,
                      evaluator=evaluator,
                      successful_score=successful_score,
                      return_window_size=return_window_size,
                      log_interval=log_interval,
                      step_hooks=step_hooks,
                      logger=logger)
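# --- Usage sketch (not part of the original source) ------------------------
# A minimal, hedged example of calling train_agent_batch_with_evaluation
# above with a vectorized environment. The agent factory passed in,
# `_make_cartpole_dqn_agent`, is a hypothetical helper (e.g. the DQN setup
# from the previous sketch wrapped in a function); values are placeholders.
def _example_train_batch_with_evaluation(_make_cartpole_dqn_agent):
    import gym
    from chainerrl.envs import MultiprocessVectorEnv

    num_envs = 4
    # One independent CartPole instance per worker process.
    vec_env = MultiprocessVectorEnv(
        [lambda: gym.make('CartPole-v0') for _ in range(num_envs)])
    eval_env = gym.make('CartPole-v0')

    agent = _make_cartpole_dqn_agent()

    train_agent_batch_with_evaluation(
        agent=agent,
        env=vec_env,             # vectorized training environments
        steps=100000,            # total timesteps summed over all envs
        eval_n_steps=None,       # evaluate by episodes rather than steps
        eval_n_episodes=10,
        eval_interval=5000,
        outdir='results_batch_example',
        log_interval=1000,
        eval_env=eval_env)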
def train_agent_with_evaluation(agent,
                                env,
                                steps,
                                eval_n_steps,
                                eval_n_episodes,
                                eval_interval,
                                outdir,
                                train_max_episode_len=None,
                                step_offset=0,
                                eval_max_episode_len=None,
                                eval_env=None,
                                successful_score=None,
                                step_hooks=(),
                                save_best_so_far_agent=True,
                                logger=None,
                                ):
    """Train an agent while periodically evaluating it.

    Args:
        agent: A chainerrl.agent.Agent.
        env: Environment to train the agent against.
        steps (int): Total number of timesteps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output data.
        train_max_episode_len (int): Maximum episode length during training.
        step_offset (int): Time step from which training starts.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If None, train_max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation
            phase, if the score (= mean return of evaluation episodes)
            exceeds the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = train_max_episode_len

    env.unwrapped.set_agent(agent)  # added by FR
    eval_env.unwrapped.set_agent(agent)  # added by FR

    evaluator = Evaluator(agent=agent,
                          n_steps=eval_n_steps,
                          n_episodes=eval_n_episodes,
                          eval_interval=eval_interval,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          env=eval_env,
                          step_offset=step_offset,
                          save_best_so_far_agent=save_best_so_far_agent,
                          logger=logger,
                          )

    train_agent(
        agent, env, steps, outdir,
        max_episode_len=train_max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        step_hooks=step_hooks,
        logger=logger)
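# --- Illustration (not part of the original source) -------------------------
# The two `set_agent` calls above (marked "added by FR") assume that the
# *unwrapped* training and evaluation environments expose a `set_agent`
# method; this is not part of the standard Gym API. The toy class below is a
# hypothetical sketch of the minimum such an environment would need.
import gym


class SelfAwareEnv(gym.Env):
    """Toy environment that keeps a reference to the agent acting in it."""

    def __init__(self):
        self.agent = None
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.Discrete(2)

    def set_agent(self, agent):
        # Called once by train_agent_with_evaluation before training starts.
        self.agent = agent

    def reset(self):
        return 0

    def step(self, action):
        # Dummy one-step dynamics: constant observation, zero reward, done.
        return 0, 0.0, True, {}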
def main():
    """Parse arguments and run the example."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env', type=str, default='MineRLTreechop-v0',
        choices=[
            'MineRLTreechop-v0',
            'MineRLNavigate-v0', 'MineRLNavigateDense-v0',
            'MineRLNavigateExtreme-v0', 'MineRLNavigateExtremeDense-v0',
            'MineRLObtainIronPickaxe-v0', 'MineRLObtainIronPickaxeDense-v0',
            'MineRLObtainDiamond-v0', 'MineRLObtainDiamondDense-v0',
            'MineRLNavigateDenseFixed-v0',  # for debug use
        ],
        help='MineRL environment identifier')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--final-exploration-frames', type=int,
                        default=10**6,
                        help='Timesteps after which we stop '
                             'annealing the exploration rate.')
    parser.add_argument('--final-epsilon', type=float, default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before '
                             'performing gradient updates.')
    parser.add_argument('--target-update-interval', type=int, default=10**4,
                        help='Frequency (in timesteps) at which '
                             'the target network is updated.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta', dest='clip_delta',
                        action='store_false')
    parser.add_argument('--error-max', type=float, default=1.0)
    parser.add_argument('--num-step-return', type=int, default=10)
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--logging-filename', type=str, default=None)
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor the env. Videos and additional '
                             'information are saved as output files '
                             'during evaluation.')
    # parser.add_argument('--render', action='store_true', default=False,
    #                     help='Render env states in a GUI window.')
    parser.add_argument('--optimizer', type=str, default='rmsprop',
                        choices=['rmsprop', 'adam'])
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate')
    parser.add_argument('--replay-buffer-size', type=int, default=10**6,
                        help='Size of replay buffer '
                             '(excluding demonstrations)')
    parser.add_argument('--minibatch-size', type=int, default=32)
    parser.add_argument('--batch-accumulator', type=str, default='sum')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--save-demo-trajectories', action='store_true',
                        default=False)

    # DQfD specific parameters for loading and pretraining.
    parser.add_argument('--n-experts', type=int, default=10)
    parser.add_argument('--expert-demo-path', type=str, default=None)
    parser.add_argument('--n-pretrain-steps', type=int, default=750000)
    parser.add_argument('--demo-supervised-margin', type=float, default=0.8)
    parser.add_argument('--loss-coeff-l2', type=float, default=1e-5)
    parser.add_argument('--loss-coeff-nstep', type=float, default=1.0)
    parser.add_argument('--loss-coeff-supervised', type=float, default=1.0)
    parser.add_argument('--bonus-priority-agent', type=float, default=0.001)
    parser.add_argument('--bonus-priority-demo', type=float, default=1.0)

    # Action branching architecture
    parser.add_argument('--gradient-clipping', action='store_true',
                        default=False)
    parser.add_argument('--gradient-rescaling', action='store_true',
                        default=False)

    # NoisyNet parameters
    parser.add_argument('--use-noisy-net', type=str, default=None,
                        choices=['before-pretraining', 'after-pretraining'])
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)

    # Parameters for state/action handling
    parser.add_argument('--frame-stack', type=int, default=None,
                        help='Number of frames stacked (None to disable).')
    parser.add_argument('--frame-skip', type=int, default=None,
                        help='Number of frames skipped (None to disable).')
    parser.add_argument('--camera-atomic-actions', type=int, default=10)
    parser.add_argument('--max-range-of-camera', type=float, default=10.)
    parser.add_argument('--use-full-observation', action='store_true',
                        default=False)

    args = parser.parse_args()

    assert args.expert_demo_path is not None, \
        "DQfD needs collected expert demonstrations"

    import logging
    if args.logging_filename is not None:
        logging.basicConfig(filename=args.logging_filename, filemode='w',
                            level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)
    logger = logging.getLogger(__name__)

    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    chainerrl.misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    logger.info('Output files are saved in {}'.format(args.outdir))

    if args.env == 'MineRLTreechop-v0':
        branch_sizes = [9, 16, args.camera_atomic_actions,
                        args.camera_atomic_actions]
    elif args.env in ['MineRLNavigate-v0', 'MineRLNavigateDense-v0',
                      'MineRLNavigateExtreme-v0',
                      'MineRLNavigateExtremeDense-v0']:
        branch_sizes = [9, 16, args.camera_atomic_actions,
                        args.camera_atomic_actions, 2]
    elif args.env in ['MineRLObtainIronPickaxe-v0',
                      'MineRLObtainIronPickaxeDense-v0',
                      'MineRLObtainDiamond-v0',
                      'MineRLObtainDiamondDense-v0']:
        branch_sizes = [9, 16, args.camera_atomic_actions,
                        args.camera_atomic_actions, 32]
    else:
        raise Exception("Unknown environment")

    def make_env(env, test):
        # Wrap env: observation...
        # NOTE: wrapping order matters!
        if args.use_full_observation:
            env = FullObservationSpaceWrapper(env)
        elif args.env.startswith('MineRLNavigate'):
            env = PoVWithCompassAngleWrapper(env)
        else:
            env = ObtainPoVWrapper(env)
        if test and args.monitor:
            env = gym.wrappers.Monitor(
                env, os.path.join(args.outdir, 'monitor'),
                mode='evaluation' if test else 'training',
                video_callable=lambda episode_id: True)
        if args.frame_skip is not None:
            env = FrameSkip(env, skip=args.frame_skip)
        # Convert observations from HWC to CHW as Chainer requires
        env = MoveAxisWrapper(env, source=-1, destination=0,
                              use_tuple=args.use_full_observation)
        # env = ScaledFloatFrame(env)
        if args.frame_stack is not None:
            env = FrameStack(env, args.frame_stack, channel_order='chw',
                             use_tuple=args.use_full_observation)

        # Wrap env: action...
        env = BranchedActionWrapper(env, branch_sizes,
                                    args.camera_atomic_actions,
                                    args.max_range_of_camera)
        if test:
            env = BranchedRandomizedAction(env, branch_sizes,
                                           args.eval_epsilon)

        env_seed = test_seed if test else train_seed
        env.seed(int(env_seed))
        return env

    core_env = gym.make(args.env)
    env = make_env(core_env, test=False)
    eval_env = make_env(core_env, test=True)

    # Q function
    if args.env.startswith('MineRLNavigate'):
        if args.use_full_observation:
            base_channels = 3  # RGB
        else:
            base_channels = 4  # RGB + compass
    elif args.env.startswith('MineRLObtain'):
        base_channels = 3  # RGB
    else:
        base_channels = 3  # RGB

    if args.frame_stack is None:
        n_input_channels = base_channels
    else:
        n_input_channels = base_channels * args.frame_stack

    q_func = CNNBranchingQFunction(
        branch_sizes,
        n_input_channels=n_input_channels,
        gradient_rescaling=args.gradient_rescaling,
        use_tuple=args.use_full_observation)

    def phi(x):
        # observation -> NN input
        if args.use_full_observation:
            pov = np.asarray(x[0], dtype=np.float32)
            others = np.asarray(x[1], dtype=np.float32)
            return (pov / 255, others)
        else:
            return np.asarray(x, dtype=np.float32) / 255

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.array([np.random.randint(n) for n in branch_sizes]))

    # Draw the computational graph and save it in the output directory.
    if args.use_full_observation:
        sample_obs = tuple([x[None] for x in env.observation_space.sample()])
    else:
        sample_obs = env.observation_space.sample()[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(phi(sample_obs))], os.path.join(args.outdir, 'model'))

    if args.optimizer == 'rmsprop':
        opt = chainer.optimizers.RMSpropGraves(args.lr, alpha=0.95,
                                               momentum=0.0, eps=1e-2)
    elif args.optimizer == 'adam':
        opt = chainer.optimizers.Adam(args.lr)

    if args.use_noisy_net is None:
        opt.setup(q_func)
        if args.gradient_rescaling:
            opt.add_hook(ScaleGradHook(1 / (1 + len(q_func.branch_sizes))))
        if args.gradient_clipping:
            opt.add_hook(chainer.optimizer_hooks.GradientClipping(10.0))

    # Calculate corresponding `steps` and `eval_interval` according to
    # the frame skip.
    maximum_frames = 8640000  # 1440 episodes if an episode is 6000 frames
    if args.frame_skip is None:
        steps = maximum_frames
        # (approx.) every 100 episodes ("1 episode = 6000 steps")
        eval_interval = 6000 * 100
    else:
        steps = maximum_frames // args.frame_skip
        # (approx.) every 100 episodes ("1 episode = 6000 steps")
        eval_interval = 6000 * 100 // args.frame_skip
    # Anneal beta from beta0 to 1 throughout training
    betasteps = steps / args.update_interval
    replay_buffer = PrioritizedDemoReplayBuffer(
        args.replay_buffer_size, alpha=0.4, beta0=0.6, betasteps=betasteps,
        error_max=args.error_max, num_steps=args.num_step_return)

    # Fill the demo buffer with expert transitions
    if not args.demo:
        chosen_dirs = choose_top_experts(args.expert_demo_path,
                                         args.n_experts, logger=logger)
        fill_buffer(args.env, chosen_dirs, replay_buffer, args.frame_skip,
                    args.frame_stack, args.camera_atomic_actions,
                    args.max_range_of_camera, args.use_full_observation,
                    logger=logger)
        logger.info("Demo buffer loaded with {} transitions".format(
            len(replay_buffer)))

    def reward_transform(x):
        return np.sign(x) * np.log(1 + np.abs(x))

    if (args.use_noisy_net is not None
            and args.use_noisy_net == 'before-pretraining'):
        chainerrl.links.to_factorized_noisy(
            q_func, sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()
        opt.setup(q_func)

    agent = DQfD(q_func, opt, replay_buffer,
                 gamma=0.99,
                 explorer=explorer,
                 n_pretrain_steps=args.n_pretrain_steps,
                 demo_supervised_margin=args.demo_supervised_margin,
                 bonus_priority_agent=args.bonus_priority_agent,
                 bonus_priority_demo=args.bonus_priority_demo,
                 loss_coeff_nstep=args.loss_coeff_nstep,
                 loss_coeff_supervised=args.loss_coeff_supervised,
                 loss_coeff_l2=args.loss_coeff_l2,
                 gpu=args.gpu,
                 replay_start_size=args.replay_start_size,
                 target_update_interval=args.target_update_interval,
                 clip_delta=args.clip_delta,
                 update_interval=args.update_interval,
                 batch_accumulator=args.batch_accumulator,
                 phi=phi,
                 reward_transform=reward_transform,
                 minibatch_size=args.minibatch_size)

    if (args.use_noisy_net is not None
            and args.use_noisy_net == 'after-pretraining'):
        chainerrl.links.to_factorized_noisy(
            q_func, sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()
        if args.optimizer == 'rmsprop':
            opt = chainer.optimizers.RMSpropGraves(args.lr, alpha=0.95,
                                                   momentum=0.0, eps=1e-2)
        elif args.optimizer == 'adam':
            opt = chainer.optimizers.Adam(args.lr)
        opt.setup(q_func)
        opt.add_hook(
            chainer.optimizer_hooks.WeightDecay(args.loss_coeff_l2))
        agent.optimizer = opt
        agent.target_model = None
        agent.sync_target_network()

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=None,
            n_episodes=args.eval_n_runs)
        logger.info('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        agent.pretrain()

        evaluator = Evaluator(agent=agent,
                              n_steps=None,
                              n_episodes=args.eval_n_runs,
                              eval_interval=eval_interval,
                              outdir=args.outdir,
                              max_episode_len=None,
                              env=eval_env,
                              step_offset=0,
                              save_best_so_far_agent=True,
                              logger=logger)

        # Evaluate the agent BEFORE training begins
        evaluator.evaluate_and_update_max_score(t=0, episodes=0)

        experiments.train_agent(agent=agent,
                                env=env,
                                steps=steps,
                                outdir=args.outdir,
                                max_episode_len=None,
                                step_offset=0,
                                evaluator=evaluator,
                                successful_score=None,
                                step_hooks=[])

    env.close()
step_offset = 0
eval_max_episode_len = None
successful_score = None
step_hooks = ()
save_best_so_far_agent = True

os.makedirs(outdir, exist_ok=True)

eval_max_episode_len = train_max_episode_len

# Evaluators that save the best-so-far agent, one per agent.
evaluator1 = Evaluator(agent=tom,
                       n_steps=eval_n_steps,
                       n_episodes=eval_n_episodes,
                       eval_interval=eval_interval,
                       outdir=outdir,
                       max_episode_len=eval_max_episode_len,
                       env=env,
                       step_offset=step_offset,
                       save_best_so_far_agent=save_best_so_far_agent,
                       logger=logger,
                       )
evaluator2 = Evaluator(agent=jerry,
                       n_steps=eval_n_steps,
                       n_episodes=eval_n_episodes,
                       eval_interval=eval_interval,
                       outdir=outdir,
                       max_episode_len=eval_max_episode_len,
                       env=env,
                       step_offset=step_offset,
                       save_best_so_far_agent=save_best_so_far_agent,
                       logger=logger,
                       )