Example #1
def train_agent_with_evaluation(agent,
                                env,
                                steps,
                                eval_n_runs,
                                eval_frequency,
                                outdir,
                                max_episode_len=None,
                                step_offset=0,
                                eval_explorer=None,
                                eval_max_episode_len=None,
                                eval_env=None,
                                successful_score=None,
                                render=False,
                                logger=None):
    """Run a DQN-like agent.

    Args:
      agent: Agent.
      env: Environment.
      steps (int): Number of total time steps for training.
      eval_n_runs (int): Number of runs for each evaluation phase.
      eval_frequency (int): Interval of evaluation.
      outdir (str): Path to the directory to output things.
      max_episode_len (int): Maximum episode length.
      step_offset (int): Time step from which training starts.
      eval_explorer: Explorer used for evaluation.
      eval_env: Environment used for evaluation.
      successful_score (float): Finish training if the mean score is greater
          than or equal to this value, if not None.
    """

    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_frequency=eval_frequency,
                          outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(agent,
                env,
                steps,
                outdir,
                max_episode_len=max_episode_len,
                step_offset=step_offset,
                evaluator=evaluator,
                successful_score=successful_score,
                logger=logger)
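A minimal usage sketch for the function above, assuming an agent and a gym environment have already been constructed elsewhere. make_agent() and make_env() are hypothetical helpers; only the keyword names mirror the signature shown in Example #1.

import logging

logging.basicConfig(level=logging.INFO)

agent = make_agent()               # hypothetical: any chainerrl-compatible agent
train_env = make_env(test=False)   # hypothetical: environment for training
test_env = make_env(test=True)     # hypothetical: environment for evaluation

train_agent_with_evaluation(
    agent,
    train_env,
    steps=10**5,            # total training timesteps
    eval_n_runs=10,         # episodes per evaluation phase
    eval_frequency=5000,    # evaluate every 5000 training steps
    outdir='results',
    eval_env=test_env,
    max_episode_len=200,
)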
Example #2
def train_agent_with_evaluation(
        agent, env, steps, eval_n_runs, eval_interval,
        outdir, max_episode_len=None, step_offset=0, eval_explorer=None,
        eval_max_episode_len=None, eval_env=None, successful_score=None,
        step_hooks=[], logger=None):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_runs (int): Number of runs for each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        eval_explorer: Explorer used for evaluation.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value, if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(agent=agent,
                          n_runs=eval_n_runs,
                          eval_interval=eval_interval, outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          explorer=eval_explorer,
                          env=eval_env,
                          step_offset=step_offset,
                          logger=logger)

    train_agent(
        agent, env, steps, outdir,
        max_episode_len=max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        step_hooks=step_hooks,
        logger=logger)
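The step_hooks argument documented above accepts any callables taking (env, agent, step). A small custom hook might look like the following sketch; LogStepHook is not part of chainerrl, it only illustrates the expected interface.

import logging


class LogStepHook:
    """Log the global training step every `interval` environment steps."""

    def __init__(self, interval=10000):
        self.interval = interval
        self.logger = logging.getLogger(__name__)

    def __call__(self, env, agent, step):
        # Called by the training loop after every environment step.
        if step % self.interval == 0:
            self.logger.info('reached training step %d', step)


# e.g. train_agent_with_evaluation(..., step_hooks=[LogStepHook(5000)])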
Example #3
def save(self, dirname):
    """Save internal states."""
    makedirs(dirname, exist_ok=True)
    for attr in self.saved_attributes:
        assert hasattr(self, attr)
        attr_value = getattr(self, attr)
        if isinstance(attr_value, AttributeSavingMixin):
            assert attr_value is not self, "Avoid an infinite loop"
            attr_value.save(os.path.join(dirname, attr))
        else:
            serializers.save_npz(
                os.path.join(dirname, '{}.npz'.format(attr)),
                getattr(self, attr))
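The save method above writes each saved attribute either recursively (for nested AttributeSavingMixin instances) or as an .npz file. A mirror-image load would look roughly like the sketch below; this illustrates the pattern and is not necessarily identical to chainerrl's actual AttributeSavingMixin.load.

def load(self, dirname):
    """Restore internal states written by `save` (sketch)."""
    for attr in self.saved_attributes:
        assert hasattr(self, attr)
        attr_value = getattr(self, attr)
        if isinstance(attr_value, AttributeSavingMixin):
            # Nested mixins restore themselves from their own subdirectory.
            attr_value.load(os.path.join(dirname, attr))
        else:
            # Plain chainer links/optimizers are read back from .npz files.
            serializers.load_npz(
                os.path.join(dirname, '{}.npz'.format(attr)),
                getattr(self, attr))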
Example #4
def __save(self, dirname, ancestors):
    makedirs(dirname, exist_ok=True)
    ancestors.append(self)
    for attr in self.saved_attributes:
        assert hasattr(self, attr)
        attr_value = getattr(self, attr)
        if isinstance(attr_value, AttributeSavingMixin):
            assert not any(
                attr_value is ancestor
                for ancestor in ancestors), "Avoid an infinite loop"
            attr_value.__save(os.path.join(dirname, attr), ancestors)
        else:
            serializers.save_npz(
                os.path.join(dirname, '{}.npz'.format(attr)),
                getattr(self, attr))
    ancestors.pop()
Example #5
def train_agent_batch_with_evaluation(
    agent,
    env,
    steps,
    eval_n_steps,
    eval_n_episodes,
    eval_interval,
    outdir,
    max_episode_len=None,
    step_offset=0,
    eval_max_episode_len=None,
    return_window_size=100,
    eval_env=None,
    log_interval=None,
    successful_score=None,
    step_hooks=[],
    save_best_so_far_agent=True,
    logger=None,
):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value, if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation,
            if the score (= mean return of evaluation episodes) exceeds
            the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(
        agent=agent,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        save_best_so_far_agent=save_best_so_far_agent,
        logger=logger,
    )

    train_agent_batch(agent,
                      env,
                      steps,
                      outdir,
                      max_episode_len=max_episode_len,
                      step_offset=step_offset,
                      eval_interval=eval_interval,
                      evaluator=evaluator,
                      successful_score=successful_score,
                      return_window_size=return_window_size,
                      log_interval=log_interval,
                      step_hooks=step_hooks,
                      logger=logger)
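The batch variant above expects a vectorized environment rather than a single one. A minimal sketch of wiring it up with chainerrl.envs.MultiprocessVectorEnv (the same class used in Example #6) could look like this; the agent construction is assumed to have happened elsewhere, and Pendulum-v0 is only an example task.

import functools

import chainerrl
import gym


def make_single_env(idx):
    # One gym environment per subprocess; Pendulum-v0 is just an example.
    env = gym.make('Pendulum-v0')
    env.seed(idx)
    return env


vec_env = chainerrl.envs.MultiprocessVectorEnv(
    [functools.partial(make_single_env, idx) for idx in range(4)])

train_agent_batch_with_evaluation(
    agent=agent,            # hypothetical: a batch-capable chainerrl agent
    env=vec_env,
    steps=10**6,
    eval_n_steps=None,      # evaluate by episode count rather than by steps
    eval_n_episodes=10,
    eval_interval=10000,
    outdir='results',
    log_interval=1000,
)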
Example #6
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument(
        '--env',
        type=str,
        choices=[
            'Pendulum-v0', 'AntBulletEnv-v0', 'HalfCheetahBulletEnv-v0',
            'HumanoidBulletEnv-v0', 'HopperBulletEnv-v0',
            'Walker2DBulletEnv-v0'
        ],
        help='OpenAI Gym or PyBullet (Roboschool) env'
             ' to run the algorithm on.')
    parser.add_argument('--num-envs',
                        type=int,
                        default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory to load agent from.')
    parser.add_argument(
        '--expert-num-episode',
        type=int,
        default=0,
        help='Number of expert episodes to collect.'
             ' If 0, demo-creation mode is disabled.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=10000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=256,
                        help='Minibatch size')
    parser.add_argument('--render',
                        action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo',
                        action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor',
                        action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--log-interval',
                        type=int,
                        default=1000,
                        help='Interval in timesteps between outputting log'
                        ' messages during training')
    parser.add_argument('--logger-level',
                        type=int,
                        default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--policy-output-scale',
                        type=float,
                        default=1.,
                        help='Weight initialization scale of policy output.')
    parser.add_argument('--debug', action='store_true', help='Debug mode.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.debug:
        chainer.set_debug(True)
    if args.expert_num_episode == 0:
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=f'{args.env}_{args.seed}')
    else:
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=f'{args.env}_{args.expert_num_episode}expert')
        args.replay_start_size = 1e8
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap the TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)

        if isinstance(env.observation_space, Box):
            # Cast observations to float32 because our model uses float32
            env = chainerrl.wrappers.CastObservationToFloat32(env)
        else:
            env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
                args.env, max_frames=None),
                                               episode_life=not test,
                                               clip_rewards=not test)

        if isinstance(env.action_space, Box):
            # Normalize action space to [-1, 1]^n
            env = wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    if isinstance(obs_space, Box):
        head = network.FCHead()
        phi = lambda x: x

    else:
        head = network.CNNHead(n_input_channels=4)
        phi = lambda x: np.asarray(x, dtype=np.float32) / 255

    if isinstance(action_space, Box):
        action_size = action_space.low.size
        policy = network.GaussianPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSAFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSAFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            """Select random actions until model is updated one or more times."""
            return np.random.uniform(action_space.low,
                                     action_space.high).astype(np.float32)

    else:
        action_size = action_space.n

        policy = network.SoftmaxPolicy(copy.deepcopy(head), action_size)
        q_func1 = network.QSFunction(copy.deepcopy(head), action_size)
        q_func2 = network.QSFunction(copy.deepcopy(head), action_size)

        def burnin_action_func():
            return np.random.randint(0, action_size)

    policy_optimizer = optimizers.Adam(3e-4).setup(policy)
    q_func1_optimizer = optimizers.Adam(3e-4).setup(q_func1)
    q_func2_optimizer = optimizers.Adam(3e-4).setup(q_func2)

    # Draw the computational graph and save it in the output directory.
    # fake_obs = chainer.Variable(
    #     policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
    #     name='observation')
    # fake_action = chainer.Variable(
    #     policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
    #     name='action')
    # chainerrl.misc.draw_computational_graph(
    #     [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func1(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func1'))
    # chainerrl.misc.draw_computational_graph(
    #     [q_func2(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10**6)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = sac.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        is_discrete=isinstance(action_space, Discrete),
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        phi=phi,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size if isinstance(action_space, Box) else
        -np.log((1.0 / action_size)) * 0.98,
        temperature_optimizer=chainer.optimizers.Adam(3e-4),
    )

    if len(args.load) > 0:
        agent.load(args.load, args.expert_num_episode == 0)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_env(process_idx=0, test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    elif args.expert_num_episode > 0:
        episode_r = 0
        env = sample_env
        episode_len = 0
        t = 0
        logger = logging.getLogger(__name__)
        episode_results = []
        try:
            for ep in range(args.expert_num_episode):
                obs = env.reset()
                r = 0
                while True:
                    # a_t
                    action = agent.act_and_train(obs, r)
                    # o_{t+1}, r_{t+1}
                    obs, r, done, info = env.step(action)
                    t += 1
                    episode_r += r
                    episode_len += 1
                    reset = (episode_len == timestep_limit
                             or info.get('needs_reset', False))
                    if done or reset:
                        agent.stop_episode_and_train(obs, r, done=done)
                        logger.info('outdir:%s step:%s episode:%s R:%s',
                                    args.outdir, t, ep, episode_r)
                        episode_results.append(episode_r)
                        episode_r = 0
                        episode_len = 0
                        break
            logger.info('mean: %s',
                        sum(episode_results) / len(episode_results))
        except (Exception, KeyboardInterrupt):
            raise

        # Save
        save_name = os.path.join(
            os.path.join('demos', f'{args.expert_num_episode}_episode'),
            args.env)
        makedirs(save_name, exist_ok=True)
        agent.replay_buffer.save(os.path.join(save_name, 'replay'))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(process_idx=0, test=False),
            eval_env=make_env(process_idx=0, test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            # log_interval=args.log_interval,
            train_max_episode_len=timestep_limit,
            eval_max_episode_len=timestep_limit,
        )
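In the expert-demo branch above, the agent's replay buffer is saved under demos/{N}_episode/{env}/replay. A later run could reload those transitions roughly as sketched below, assuming chainerrl's ReplayBuffer.load() reads what ReplayBuffer.save() wrote; the concrete path components are only example values.

import os

from chainerrl import replay_buffer

# Example path mirroring the `save_name` built in main() for 10 expert
# episodes on Pendulum-v0.
demo_path = os.path.join('demos', '10_episode', 'Pendulum-v0', 'replay')

rbuf = replay_buffer.ReplayBuffer(10**6)
rbuf.load(demo_path)  # restore the transitions collected by the expert agent
print('loaded {} transitions'.format(len(rbuf)))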
Example #7
def train_agent_with_evaluation(agent,
                                env,
                                steps,
                                eval_n_steps,
                                eval_n_episodes,
                                eval_interval,
                                outdir,
                                train_max_episode_len=None,
                                step_offset=0,
                                eval_max_episode_len=None,
                                eval_env=None,
                                successful_score=None,
                                step_hooks=[],
                                save_best_so_far_agent=True,
                                logger=None,
                                ):
    """Train an agent while periodically evaluating it.

    Args:
        agent: A chainerrl.agent.Agent
        env: Environment train the agent against.
        steps (int): Total number of timesteps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output data.
        train_max_episode_len (int): Maximum episode length during training.
        step_offset (int): Time step from which training starts.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If None, train_max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value if not None
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        save_best_so_far_agent (bool): If set to True, after each evaluation
            phase, if the score (= mean return of evaluation episodes) exceeds
            the best-so-far score, the current agent is saved.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = train_max_episode_len

    env.unwrapped.set_agent(agent)  # added by FR
    eval_env.unwrapped.set_agent(agent)  # added by FR

    evaluator = Evaluator(agent=agent,
                          n_steps=eval_n_steps,
                          n_episodes=eval_n_episodes,
                          eval_interval=eval_interval, outdir=outdir,
                          max_episode_len=eval_max_episode_len,
                          env=eval_env,
                          step_offset=step_offset,
                          save_best_so_far_agent=save_best_so_far_agent,
                          logger=logger,
                          )

    train_agent(
        agent, env, steps, outdir,
        max_episode_len=train_max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        step_hooks=step_hooks,
        logger=logger)
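This variant calls env.unwrapped.set_agent(agent) (the lines marked "added by FR"), which presumes the innermost environment exposes a set_agent method. A hypothetical gym.Env satisfying that contract might be sketched as follows.

import gym


class AgentAwareEnv(gym.Env):
    """Sketch of an env that keeps a reference to the agent acting in it."""

    def __init__(self):
        self.agent = None

    def set_agent(self, agent):
        # Called once before training so that reward shaping, rendering,
        # or logging inside the env can inspect the agent.
        self.agent = agent

    def reset(self):
        raise NotImplementedError  # environment dynamics omitted in sketch

    def step(self, action):
        raise NotImplementedError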
Example #8
def save(self, dirname):
    """Save internal states."""
    makedirs(dirname, exist_ok=True)
    for attr in self.saved_attributes:
        serializers.save_npz(os.path.join(dirname, '{}.npz'.format(attr)),
                             getattr(self, attr))
Example #9
def parallel_train_agent_batch_with_evaluation(
    start_weighted_size,
    all_agents,
    env,
    steps,
    eval_n_steps,
    eval_n_episodes,
    eval_interval,
    outdir,
    max_episode_len=None,
    step_offset=0,
    eval_max_episode_len=None,
    return_window_size=100,
    eval_env=None,
    log_interval=None,
    successful_score=None,
    step_hooks=[],
    save_best_so_far_agent=True,
    logger=None,
    step_callback=None,
    schedule_args={},
    eval_before_distill=False,
):
    logger = logger or logging.getLogger(__name__)

    makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = MultipleAgentEvaluator(
        all_agents=all_agents,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        save_best_so_far_agent=save_best_so_far_agent,
        logger=logger,
    )

    if eval_before_distill:
        before_evaluator = MultipleAgentEvaluator(
            all_agents=all_agents,
            n_steps=eval_n_steps,
            n_episodes=eval_n_episodes,
            eval_interval=eval_interval,
            outdir=outdir,
            max_episode_len=eval_max_episode_len,
            env=eval_env,
            step_offset=step_offset,
            save_best_so_far_agent=save_best_so_far_agent,
            logger=logger,
            suffix='-before-distillation')
    else:
        before_evaluator = None

    parallel_train_agent_batch(start_weighted_size,
                               all_agents,
                               env,
                               steps,
                               outdir,
                               max_episode_len=max_episode_len,
                               step_offset=step_offset,
                               eval_interval=eval_interval,
                               evaluator=evaluator,
                               before_evaluator=before_evaluator,
                               successful_score=successful_score,
                               return_window_size=return_window_size,
                               log_interval=log_interval,
                               step_hooks=step_hooks,
                               logger=logger,
                               step_callback=step_callback,
                               schedule_args=schedule_args)