Example #1
def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    try:
        make_env = pkg.import_module('env', algo_name, place=-1).make_env
    except Exception:  # no algorithm-specific env module; fall back to the default factory
        make_env = None
    env_config.pop('reward_clip', False)
    env = create_env(env_config, env_fn=make_env)
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(env,
                                      agent,
                                      n,
                                      record=record,
                                      size=size,
                                      video_len=video_len)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
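
Every example on this page calls an `evaluate` helper whose implementation is not shown. The sketch below is only an inferred stand-in that matches the call sites (it returns per-episode scores, episode lengths, and optionally the recorded frames) and assumes a Gym-style single-environment interface; the helper name and details are assumptions, not the repository's actual code.

import numpy as np

def evaluate_sketch(env, agent, n=1, record=False, size=(128, 128),
                    video_len=1000, step_fn=None):
    """Inferred stand-in for `evaluate`: run n episodes, optionally recording frames."""
    scores, epslens, frames = [], [], []
    for _ in range(n):
        obs = env.reset()
        score, epslen, done = 0., 0, False
        while not done:
            action = agent(obs)                  # assumed: the agent is callable in eval mode
            next_obs, reward, done, _ = env.step(action)
            if step_fn is not None:              # optional hook, e.g. to fill a replay buffer
                step_fn(obs=obs, action=action, reward=reward,
                        discount=1 - float(done), next_obs=next_obs)
            if record and len(frames) < video_len:
                frames.append(env.render(mode='rgb_array'))  # the real helper resizes to `size`
            obs = next_obs
            score += reward
            epslen += 1
        scores.append(score)
        epslens.append(epslen)
    video = np.array(frames) if record else None
    return scores, epslens, video
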
Example #2
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    _, step = replay.count_episodes()
    step = max(agent.env_step, step)

    runner = Runner(env, agent, step=step)

    def random_actor(*args, **kwargs):
        # A function attribute remembers the previous action so it can be
        # returned alongside each freshly sampled random action.
        prev_action = random_actor.prev_action
        random_actor.prev_action = action = env.random_action()
        return action, {'prev_action': prev_action}
    random_actor.prev_action = np.zeros_like(env.random_action()) \
        if isinstance(env.random_action(), np.ndarray) else 0
    while not replay.good_to_learn():
        step = runner.run(action_selector=random_actor, step_fn=collect)

    to_log = Every(agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < int(agent.MAX_STEPS):
        start_step = step
        start_t = time.time()
        agent.learn_log(step)
        step = runner.run(step_fn=collect, nsteps=agent.TRAIN_PERIOD)
        duration = time.time() - start_t
        agent.store(fps=(step - start_step) / duration,
                    tps=agent.N_UPDATES / duration)

        if to_eval(step):
            with TempStore(agent.get_states, agent.reset_states):
                score, epslen, video = evaluate(eval_env,
                                                agent,
                                                record=agent.RECORD,
                                                size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=score, eval_epslen=epslen)

        if to_log(step):
            agent.log(step)
            agent.save()
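
The `to_log`/`to_eval` triggers above come from an `Every` utility that ships with the repository but is not shown here. The minimal sketch below only reproduces the behaviour the call sites rely on (return True at most once per `period` steps, optionally starting at `start`); the real class may differ.

class Every:
    """Inferred stand-in for the periodic trigger used as to_log/to_eval."""
    def __init__(self, period, start=0):
        self._period = period
        self._next = start

    def __call__(self, step):
        if self._period is None:
            return False
        if step >= self._next:
            self._next = step + self._period
            return True
        return False
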
Example #3
def train(agent, env, eval_env, buffer):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, buffer)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    exp_buffer = get_expert_data(f'{buffer.DATA_PATH}-{env.name}')

    if step == 0 and agent.is_obs_normalized:
        print('Start to initialize running stats...')
        for _ in range(10):
            runner.run(action_selector=env.random_action, step_fn=collect)
            agent.update_obs_rms(np.concatenate(buffer['obs']))
            agent.update_reward_rms(buffer['reward'], buffer['discount'])
            buffer.reset()
        buffer.clear()
        agent.save(print_terminal_info=True)

    runner.step = step
    # print("Initial running stats:", *[f'{k:.4g}' for k in agent.get_running_stats() if k])
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        agent.before_run(env)
        with rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())
        buffer.reshape_to_sample()
        agent.disc_learn_log(exp_buffer)
        buffer.compute_reward_with_func(agent.compute_reward)
        buffer.reshape_to_store()

        # NOTE: normalizing rewards here may introduce some inconsistency
        # if the normalized rewards are fed as an input to the network.
        # One can reconcile this by moving the normalization into `collect`,
        # or by feeding the network unnormalized rewards; the latter is what
        # our implementation does. However, the update below does not
        # currently keep a copy of the unnormalized rewards.
        agent.update_reward_rms(buffer['reward'], buffer['discount'])
        buffer.update('reward',
                      agent.normalize_reward(buffer['reward']),
                      field='all')
        agent.record_last_env_output(runner.env_output)
        value = agent.compute_value()
        buffer.finish(value)

        start_train_step = agent.train_step
        with tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step) or step > agent.MAX_STEPS:
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    eval_score, eval_epslen, video = evaluate(
                        eval_env,
                        agent,
                        n=agent.N_EVAL_EPISODES,
                        record=agent.RECORD,
                        size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            with lt:
                agent.store(
                    **{
                        'train_step': agent.train_step,
                        'time/run': rt.total(),
                        'time/train': tt.total(),
                        'time/eval': et.total(),
                        'time/log': lt.total(),
                        'time/run_mean': rt.average(),
                        'time/train_mean': tt.average(),
                        'time/eval_mean': et.average(),
                        'time/log_mean': lt.average(),
                    })
                agent.log(step)
                agent.save()
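
Examples #3, #4, and #6 time each phase with a `Timer` that is used both as a context manager and as a named accumulator (rt, tt, et, lt). The real class is part of the repository; this sketch implements only the methods the loops call (`last`, `total`, `average`) and is an assumption about its behaviour.

import time

class Timer:
    """Inferred stand-in: accumulate wall-clock time over repeated `with` blocks."""
    def __init__(self, name):
        self._name = name
        self._last = 0.
        self._total = 0.
        self._count = 0

    def __enter__(self):
        self._start = time.time()
        return self

    def __exit__(self, *exc):
        self._last = time.time() - self._start
        self._total += self._last
        self._count += 1

    def last(self):
        return self._last

    def total(self):
        return self._total

    def average(self):
        return self._total / max(self._count, 1)
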
Example #4
def train(agent, env, eval_env, buffer):
    def collect(env, step, reset, next_obs, **kwargs):
        buffer.add(**kwargs)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    # Uniform random action selector, used only to warm up the observation
    # running statistics below.
    def actsel(*args, **kwargs):
        return np.random.randint(0, env.action_dim, size=env.n_envs)
    if not agent.rnd_rms_restored():
        print('Start to initialize observation running stats...')
        for _ in range(50):
            runner.run(action_selector=actsel, step_fn=collect)
            agent.update_obs_rms(buffer['obs'])
            buffer.reset()
        buffer.clear()
        agent.save()
        runner.step = step

    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        with Timer('env') as rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())

        agent.record_last_env_output(runner.env_output)
        value_int, value_ext = agent.compute_value()
        obs = buffer.get_obs(runner.env_output.obs)
        assert obs.shape[:2] == (env.n_envs, agent.N_STEPS + 1)
        assert obs.dtype == np.uint8
        agent.update_obs_rms(obs[:, :-1])
        norm_obs = agent.normalize_obs(obs)
        # compute intrinsic reward from the next normalized obs
        reward_int = agent.compute_int_reward(norm_obs[:, 1:])
        agent.update_int_reward_rms(reward_int)
        reward_int = agent.normalize_int_reward(reward_int)
        buffer.finish(reward_int, norm_obs[:, :-1], value_int, value_ext)
        agent.store(
            reward_int_max=np.max(reward_int),
            reward_int_min=np.min(reward_int),
            reward_int=np.mean(reward_int),
            reward_int_std=np.std(reward_int),
        )

        start_train_step = agent.train_step
        with Timer('train') as tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step):
            with TempStore(agent.get_states, agent.reset_states):
                scores, epslens, video = evaluate(eval_env,
                                                  agent,
                                                  record=True,
                                                  video_len=4500)
                video_summary(f'{agent.name}/sim', video, step=step)
                if eval_env.n_envs == 1:
                    rews_int, rews_ext = agent.retrieve_eval_rewards()
                    assert len(rews_ext) == len(rews_int) == video.shape[1], (
                        len(rews_ext), len(rews_int), video.shape[1])
                    n = 10
                    idxes_int = rews_int.argsort()[::-1][:n]
                    idxes_ext = rews_ext.argsort()[::-1][:n]
                    assert idxes_int.shape == idxes_ext.shape, (
                        idxes_int.shape, idxes_ext.shape)

                    imgs_int = video[0, idxes_int]
                    imgs_ext = video[0, idxes_ext]
                    rews_int = rews_int[idxes_int]
                    rews_ext = rews_ext[idxes_ext]
                    terms = {
                        **{
                            f'eval/reward_int_{i}': rews_int[i]
                            for i in range(0, n)
                        },
                        **{
                            f'eval/reward_ext_{i}': rews_ext[i]
                            for i in range(0, n)
                        },
                    }
                    agent.store(**terms)
                    imgs = np.concatenate([imgs_int[:n], imgs_ext[:n]], 0)
                    image_summary(f'{agent.name}/img', imgs, step=step)

                    # info = eval_env.info()[0]
                    # episode = info.get('episode', {'visited_rooms': 1})
                    # agent.store(visited_rooms_max=len(episode['visited_rooms']))
                    agent.histogram_summary(
                        {'eval/action': agent.retrieve_eval_actions()},
                        step=step)
                agent.store(eval_score=scores, eval_epslen=epslens)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            agent.store(
                **{
                    'episodes': runner.episodes,
                    'train_step': agent.train_step,
                    'time/run': rt.total(),
                    'time/train': tt.total()
                })
            agent.log(step)
            agent.save()
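
`TempStore(agent.get_states, agent.reset_states)` appears in several loops to keep evaluation from clobbering the agent's recurrent state. The repository's implementation is not shown; the assumed behaviour below (capture the states on entry, hand them back on exit) is inferred purely from the usage.

class TempStore:
    """Inferred stand-in: snapshot some state on entry and restore it on exit."""
    def __init__(self, get_fn, set_fn):
        self._get_fn = get_fn
        self._set_fn = set_fn

    def __enter__(self):
        self._state = self._get_fn()
        return self._state

    def __exit__(self, *exc):
        self._set_fn(self._state)
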
Example #5
def _run(self, record):
    score, epslen, video = evaluate(self.env,
                                    self,
                                    record=record,
                                    n=self.N_EVALUATION)
    self.store(score, epslen, video)
Example #6
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    env_step = agent.env_step
    runner = Runner(env, agent, step=env_step, nsteps=agent.TRAIN_PERIOD)
    while not replay.good_to_learn():
        env_step = runner.run(
            # NOTE: random action below makes a huge difference for Mujoco tasks
            # by default, we don't use it as it's not a conventional practice.
            # action_selector=env.random_action,
            step_fn=collect)

    to_eval = Every(agent.EVAL_PERIOD)
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_record = Every(agent.EVAL_PERIOD * 10)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while env_step <= int(agent.MAX_STEPS):
        with rt:
            env_step = runner.run(step_fn=collect)
        with tt:
            agent.learn_log(env_step)

        if to_eval(env_step):
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    record = agent.RECORD and to_record(env_step)
                    eval_score, eval_epslen, video = evaluate(
                        eval_env,
                        agent,
                        n=agent.N_EVAL_EPISODES,
                        record=record,
                        size=(64, 64))
                    if record:
                        video_summary(f'{agent.name}/sim',
                                      video,
                                      step=env_step)
                    agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(env_step):
            with lt:
                fps = agent.TRAIN_PERIOD / rt.average()
                tps = agent.N_UPDATES / tt.average()

                agent.store(
                    env_step=agent.env_step,
                    train_step=agent.train_step,
                    fps=fps,
                    tps=tps,
                )
                agent.store(
                    **{
                        'train_step': agent.train_step,
                        'time/run': rt.total(),
                        'time/train': tt.total(),
                        'time/eval': et.total(),
                        'time/log': lt.total(),
                        'time/run_mean': rt.average(),
                        'time/train_mean': tt.average(),
                        'time/eval_mean': et.average(),
                        'time/log_mean': lt.average(),
                    })
                agent.log(env_step)
                agent.save()
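
Examples #2, #3, and #6 resolve a per-algorithm `collect` function via `pkg.import_module('agent', algo=...).collect` and bind the replay/buffer with `functools.partial`. What that function does varies by algorithm; the replay-based variant below is an illustrative guess that mirrors the step-function signature visible in Example #4, not the repository's actual code.

def collect(replay, env, step, reset, obs, action, reward, discount,
            next_obs, **kwargs):
    # Store one transition per environment step; extra keyword arguments
    # (e.g. logpi or prev_action) are simply ignored here.
    replay.add(obs=obs, action=action, reward=reward,
               discount=discount, next_obs=next_obs)
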
Example #7
def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1

    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)

    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(obs=obs,
                       action=action,
                       reward=reward,
                       discount=discount,
                       next_obs=next_obs,
                       logpi=logpi)
    else:

        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(env,
                                      agent,
                                      n,
                                      record=record,
                                      size=size,
                                      video_len=video_len,
                                      step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
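
When `record=True`, the collected frames are written out with `save_video`, another repository utility not shown on this page. The sketch below is an illustrative stand-in built on imageio; the output directory and the accepted frame layouts ((T, H, W, 3) or (n_envs, T, H, W, 3)) are assumptions.

import os
import imageio
import numpy as np

def save_video_sketch(name, video, fps=30, out_dir='results'):
    video = np.asarray(video)
    if video.ndim == 5:          # (n_envs, T, H, W, 3): keep the first environment only
        video = video[0]
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f'{name}.mp4')
    imageio.mimwrite(path, video.astype(np.uint8), fps=fps)
    return path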