def train_agent_with_evaluation(agent, env, steps, eval_n_runs,
                                eval_frequency, outdir, max_episode_len=None,
                                step_offset=0, eval_explorer=None,
                                eval_max_episode_len=None, eval_env=None,
                                successful_score=None, render=False,
                                logger=None):
    """Run a DQN-like agent.

    Args:
        agent: Agent.
        env: Environment.
        steps (int): Number of total time steps for training.
        eval_n_runs (int): Number of runs for each time of evaluation.
        eval_frequency (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_explorer: Explorer used for evaluation.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        render (bool): Render the environment if True.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_frequency=eval_frequency,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(agent, env, steps, outdir,
                max_episode_len=max_episode_len,
                step_offset=step_offset,
                evaluator=evaluator,
                successful_score=successful_score,
                logger=logger)
def train_agent_with_evaluation(
        agent, env, steps, eval_n_runs, eval_interval, outdir,
        max_episode_len=None, step_offset=0, eval_explorer=None,
        eval_max_episode_len=None, eval_env=None, successful_score=None,
        step_hooks=(), logger=None):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_runs (int): Number of runs for each time of evaluation.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_explorer: Explorer used for evaluation.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (sequence): Sequence of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_interval=eval_interval,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(
        agent, env, steps, outdir,
        max_episode_len=max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        step_hooks=step_hooks,
        logger=logger)
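
# Minimal usage sketch for the function above. Assumptions: `make_dqn_agent`
# is a hypothetical placeholder for whatever agent constructor you actually
# use; any ChainerRL agent works the same way.
import gym

env = gym.make('CartPole-v0')
agent = make_dqn_agent(env)  # hypothetical helper, not a real API
train_agent_with_evaluation(
    agent=agent,
    env=env,
    steps=10 ** 5,          # total number of training timesteps
    eval_n_runs=10,         # evaluation episodes per evaluation phase
    eval_interval=1000,     # evaluate every 1,000 training steps
    outdir='results',
    successful_score=195,   # optional early stop on mean eval return
)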
def save(self, dirname):
    """Save internal states."""
    makedirs(dirname, exist_ok=True)
    for attr in self.saved_attributes:
        assert hasattr(self, attr)
        attr_value = getattr(self, attr)
        if isinstance(attr_value, AttributeSavingMixin):
            assert attr_value is not self, "Avoid an infinite loop"
            attr_value.save(os.path.join(dirname, attr))
        else:
            serializers.save_npz(
                os.path.join(dirname, '{}.npz'.format(attr)), attr_value)
def __save(self, dirname, ancestors):
    makedirs(dirname, exist_ok=True)
    ancestors.append(self)
    for attr in self.saved_attributes:
        assert hasattr(self, attr)
        attr_value = getattr(self, attr)
        if isinstance(attr_value, AttributeSavingMixin):
            assert not any(attr_value is ancestor
                           for ancestor in ancestors), \
                "Avoid an infinite loop"
            attr_value.__save(os.path.join(dirname, attr), ancestors)
        else:
            serializers.save_npz(
                os.path.join(dirname, '{}.npz'.format(attr)), attr_value)
    ancestors.pop()
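
# Illustrative sketch of how these save methods compose. Assumptions: the
# public save() delegates to __save(dirname, []), as in chainerrl's
# AttributeSavingMixin, and the class names below are made up. It shows why
# __save() threads an `ancestors` list: nested mixins recurse into
# subdirectories, while any reference cycle trips the assertion instead of
# recursing forever.
import chainer.links as L

class SubModule(AttributeSavingMixin):
    saved_attributes = ('model',)

    def __init__(self):
        self.model = L.Linear(4, 2)  # a plain Link, saved via save_npz

class Agent(AttributeSavingMixin):
    saved_attributes = ('sub',)

    def __init__(self):
        self.sub = SubModule()  # a nested mixin, saved recursively

agent = Agent()
agent.save('checkpoint')  # expected layout: checkpoint/sub/model.npz
# If SubModule also listed a reference back to Agent in saved_attributes,
# the "Avoid an infinite loop" assertion would fire instead of looping.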
def train_agent_batch_with_evaluation(agent,
                                      env,
                                      steps,
                                      eval_n_steps,
                                      eval_n_episodes,
                                      eval_interval,
                                      outdir,
                                      max_episode_len=None,
                                      step_offset=0,
                                      eval_max_episode_len=None,
                                      return_window_size=100,
                                      eval_env=None,
                                      log_interval=None,
                                      successful_score=None,
                                      step_hooks=(),
                                      save_best_so_far_agent=True,
                                      logger=None,
                                      ):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        return_window_size (int): Number of training episodes used to
            estimate the average returns of the current agent.
        eval_env: Environment used for evaluation.
        log_interval (int): Interval of logging.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (sequence): Sequence of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation,
            if the score (= mean return of evaluation episodes) exceeds
            the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(
        agent=agent,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        save_best_so_far_agent=save_best_so_far_agent,
        logger=logger,
    )

    train_agent_batch(
        agent, env, steps, outdir,
        max_episode_len=max_episode_len,
        step_offset=step_offset,
        eval_interval=eval_interval,
        evaluator=evaluator,
        successful_score=successful_score,
        return_window_size=return_window_size,
        log_interval=log_interval,
        step_hooks=step_hooks,
        logger=logger)
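
# Usage sketch for the batch variant. Assumptions: `make_a2c_agent` is a
# hypothetical placeholder; the vectorized env mirrors the make_batch_env
# helper that appears later in this file.
import functools
import gym
import chainerrl

vec_env = chainerrl.envs.MultiprocessVectorEnv(
    [functools.partial(gym.make, 'CartPole-v0') for _ in range(4)])
agent = make_a2c_agent(vec_env)  # hypothetical placeholder, not a real API
train_agent_batch_with_evaluation(
    agent=agent,
    env=vec_env,
    steps=10 ** 6,
    eval_n_steps=None,      # evaluate by episode count rather than steps
    eval_n_episodes=10,
    eval_interval=10000,
    outdir='results',
    log_interval=1000,
)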
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str,
                        choices=['Pendulum-v0',
                                 'AntBulletEnv-v0',
                                 'HalfCheetahBulletEnv-v0',
                                 'HumanoidBulletEnv-v0',
                                 'HopperBulletEnv-v0',
                                 'Walker2DBulletEnv-v0'],
                        help='OpenAI Gym or PyBullet (Roboschool) env'
                             ' to perform the algorithm on.')
    parser.add_argument('--num-envs', type=int, default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load', type=str, default='',
                        help='Directory to load agent from.')
    parser.add_argument('--expert-num-episode', type=int, default=0,
                        help='Number of expert episodes to collect.'
                             ' If 0, train normally instead of collecting'
                             ' demonstrations.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval', type=int, default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size', type=int, default=10000,
                        help='Minimum replay buffer size before'
                             ' performing gradient updates.')
    parser.add_argument('--batch-size', type=int, default=256,
                        help='Minibatch size')
    parser.add_argument('--render', action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo', action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor', action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval', type=int, default=1000,
                        help='Interval in timesteps between outputting log'
                             ' messages during training')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--policy-output-scale', type=float, default=1.,
                        help='Weight initialization scale of policy output.')
    parser.add_argument('--debug', action='store_true',
                        help='Debug mode.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.debug:
        chainer.set_debug(True)

    if args.expert_num_episode == 0:
        args.outdir = experiments.prepare_output_dir(
            args, args.outdir, argv=sys.argv,
            time_format=f'{args.env}_{args.seed}')
    else:
        args.outdir = experiments.prepare_output_dir(
            args, args.outdir, argv=sys.argv,
            time_format=f'{args.env}_{args.expert_num_episode}expert')
        # Make the replay-buffer threshold unreachable so that no gradient
        # updates happen while collecting expert demonstrations.
        args.replay_start_size = 10 ** 8
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap the TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        if isinstance(env.observation_space, Box):
            # Cast observations to float32 because our model uses float32
            env = chainerrl.wrappers.CastObservationToFloat32(env)
        else:
            env = atari_wrappers.wrap_deepmind(
                atari_wrappers.make_atari(args.env, max_frames=None),
                episode_life=not test,
                clip_rewards=not test)
        if isinstance(env.action_space, Box):
            # Normalize action space to [-1, 1]^n
            env = wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    if isinstance(obs_space, Box):
        head = network.FCHead()
        phi = lambda x: x
    else:
        head = network.CNNHead(n_input_channels=4)
        phi = lambda x: np.asarray(x, dtype=np.float32) / 255

    if isinstance(action_space, Box):
        action_size = action_space.low.size
        policy = network.GaussianPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSAFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSAFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            """Select random actions until the model is updated once or more."""
            return np.random.uniform(
                action_space.low, action_space.high).astype(np.float32)
    else:
        action_size = action_space.n
        policy = network.SoftmaxPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            return np.random.randint(0, action_size)

    policy_optimizer = optimizers.Adam(3e-4).setup(policy)
    q_func1_optimizer = optimizers.Adam(3e-4).setup(q_func1)
    q_func2_optimizer = optimizers.Adam(3e-4).setup(q_func2)

    # Draw the computational graph and save it in the output directory.
    # fake_obs = chainer.Variable(
    #     policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
    #     name='observation')
    # fake_action = chainer.Variable(
    #     policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
    #     name='action')
    # chainerrl.misc.draw_computational_graph(
    #     [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func1(fake_obs, fake_action)],
    #     os.path.join(args.outdir, 'q_func1'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func2(fake_obs, fake_action)],
    #     os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = sac.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        is_discrete=isinstance(action_space, Discrete),
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        phi=phi,
        burnin_action_func=burnin_action_func,
        entropy_target=(-action_size
                        if isinstance(action_space, Box)
                        else -np.log(1.0 / action_size) * 0.98),
        temperature_optimizer=chainer.optimizers.Adam(3e-4),
    )

    if len(args.load) > 0:
        agent.load(args.load, args.expert_num_episode == 0)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_env(process_idx=0, test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    elif args.expert_num_episode > 0:
        # Collect expert demonstrations and save the resulting replay buffer.
        episode_r = 0
        env = sample_env
        episode_len = 0
        t = 0
        logger = logging.getLogger(__name__)
        episode_results = []
        try:
            for ep in range(args.expert_num_episode):
                obs = env.reset()
                r = 0
                while True:
                    # a_t
                    action = agent.act_and_train(obs, r)
                    # o_{t+1}, r_{t+1}
                    obs, r, done, info = env.step(action)
                    t += 1
                    episode_r += r
                    episode_len += 1
                    reset = (episode_len == timestep_limit
                             or info.get('needs_reset', False))
                    if done or reset:
                        agent.stop_episode_and_train(obs, r, done=done)
                        logger.info('outdir:%s step:%s episode:%s R:%s',
                                    args.outdir, t, ep, episode_r)
                        episode_results.append(episode_r)
                        episode_r = 0
                        episode_len = 0
                        break
            logger.info('mean: %s',
                        sum(episode_results) / len(episode_results))
        except (Exception, KeyboardInterrupt):
            raise

        # Save the collected demonstrations
        save_name = os.path.join(
            'demos', f'{args.expert_num_episode}_episode', args.env)
        makedirs(save_name, exist_ok=True)
        agent.replay_buffer.save(os.path.join(save_name, 'replay'))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(process_idx=0, test=False),
            eval_env=make_env(process_idx=0, test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            # log_interval=args.log_interval,
            train_max_episode_len=timestep_limit,
            eval_max_episode_len=timestep_limit,
        )
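
# Standalone sketch of the seeding scheme used in main() above: training
# envs receive seeds [seed * num_envs, ..., seed * num_envs + num_envs - 1],
# and each test env receives the complementary seed 2**32 - 1 - process_seed,
# so the train and test random streams never collide.
import numpy as np

seed, num_envs = 1, 4
process_seeds = np.arange(num_envs) + seed * num_envs
print(process_seeds.tolist())                         # [4, 5, 6, 7]
print([2 ** 32 - 1 - int(s) for s in process_seeds])  # test-env seeds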
def train_agent_with_evaluation(agent,
                                env,
                                steps,
                                eval_n_steps,
                                eval_n_episodes,
                                eval_interval,
                                outdir,
                                train_max_episode_len=None,
                                step_offset=0,
                                eval_max_episode_len=None,
                                eval_env=None,
                                successful_score=None,
                                step_hooks=(),
                                save_best_so_far_agent=True,
                                logger=None,
                                ):
    """Train an agent while periodically evaluating it.

    Args:
        agent: A chainerrl.agent.Agent
        env: Environment to train the agent against.
        steps (int): Total number of timesteps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output data.
        train_max_episode_len (int): Maximum episode length during training.
        step_offset (int): Time step from which training starts.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If None, train_max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (sequence): Sequence of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation
            phase, if the score (= mean return of evaluation episodes)
            exceeds the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = train_max_episode_len

    env.unwrapped.set_agent(agent)  # added by FR
    eval_env.unwrapped.set_agent(agent)  # added by FR

    evaluator = Evaluator(agent=agent,
                          n_steps=eval_n_steps,
                          n_episodes=eval_n_episodes,
                          eval_interval=eval_interval,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          env=eval_env,
                          step_offset=step_offset,
                          save_best_so_far_agent=save_best_so_far_agent,
                          logger=logger,
                          )

    train_agent(
        agent, env, steps, outdir,
        max_episode_len=train_max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        step_hooks=step_hooks,
        logger=logger)
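
# The set_agent() calls above are not part of the Gym or ChainerRL API; they
# assume a custom environment exposing that hook. A minimal illustrative
# stub of such an environment (name and docstring are assumptions):
import gym

class AgentAwareEnv(gym.Env):
    """Env whose dynamics or rewards may inspect the acting agent."""

    def set_agent(self, agent):
        self.agent = agent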
def save(self, dirname):
    """Save internal states."""
    makedirs(dirname, exist_ok=True)
    for attr in self.saved_attributes:
        serializers.save_npz(
            os.path.join(dirname, '{}.npz'.format(attr)),
            getattr(self, attr))
def parallel_train_agent_batch_with_evaluation(
        start_weighted_size,
        all_agents,
        env,
        steps,
        eval_n_steps,
        eval_n_episodes,
        eval_interval,
        outdir,
        max_episode_len=None,
        step_offset=0,
        eval_max_episode_len=None,
        return_window_size=100,
        eval_env=None,
        log_interval=None,
        successful_score=None,
        step_hooks=(),
        save_best_so_far_agent=True,
        logger=None,
        step_callback=None,
        schedule_args=None,
        eval_before_distill=False,
):
    """Train multiple agents in parallel while regularly evaluating them.

    If eval_before_distill is True, an additional evaluator scores the
    agents before distillation, writing its results with the suffix
    '-before-distillation'.
    """
    logger = logger or logging.getLogger(__name__)
    if schedule_args is None:
        schedule_args = {}

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = MultipleAgentEvaluator(
        all_agents=all_agents,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        save_best_so_far_agent=save_best_so_far_agent,
        logger=logger,
    )

    if eval_before_distill:
        before_evaluator = MultipleAgentEvaluator(
            all_agents=all_agents,
            n_steps=eval_n_steps,
            n_episodes=eval_n_episodes,
            eval_interval=eval_interval,
            outdir=outdir,
            max_episode_len=eval_max_episode_len,
            env=eval_env,
            step_offset=step_offset,
            save_best_so_far_agent=save_best_so_far_agent,
            logger=logger,
            suffix='-before-distillation')
    else:
        before_evaluator = None

    parallel_train_agent_batch(
        start_weighted_size, all_agents, env, steps, outdir,
        max_episode_len=max_episode_len,
        step_offset=step_offset,
        eval_interval=eval_interval,
        evaluator=evaluator,
        before_evaluator=before_evaluator,
        successful_score=successful_score,
        return_window_size=return_window_size,
        log_interval=log_interval,
        step_hooks=step_hooks,
        logger=logger,
        step_callback=step_callback,
        schedule_args=schedule_args)
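
# Hypothetical usage sketch for the function above. Assumptions: `all_agents`
# and `vec_env` are built elsewhere; every other name comes from the
# signature itself.
parallel_train_agent_batch_with_evaluation(
    start_weighted_size=1000,
    all_agents=all_agents,
    env=vec_env,
    steps=10 ** 6,
    eval_n_steps=None,
    eval_n_episodes=10,
    eval_interval=10000,
    outdir='results',
    eval_before_distill=True,  # also score agents before distillation
)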