Example #1
    def test_buffer_op(self):
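        # Fill a replay created via create_replay and a plain ReplayBuffer with
        # identical transitions, then check that, under the same global seed,
        # both yield the same samples. `config` is assumed to be provided by
        # the surrounding test fixture.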
        replay = create_replay(config)
        simp_replay = ReplayBuffer(config)

        env = gym.make('BipedalWalkerHardcore-v3')

        s = env.reset()
        for i in range(10000):
            a = env.action_space.sample()
            ns, r, d, _ = env.step(a)
            if d:
                ns = env.reset()
            replay.add(obs=s.astype(np.float32),
                       action=a.astype(np.float32),
                       reward=np.float32(r),
                       next_obs=ns.astype(np.float32),
                       done=d)
            simp_replay.add(obs=s, action=a, reward=r, next_obs=ns, done=d)
            s = ns

            if i > 1000:
                set_global_seed(i)
                sample1 = replay.sample()
                set_global_seed(i)
                sample2 = simp_replay.sample()

                for k in sample1.keys():
                    np.testing.assert_allclose(sample1[k],
                                               sample2[k],
                                               err_msg=f'{k}')
Example #2
    def test_sper(self):
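        # Exercise sequential prioritized replay ('seqper') against a dummy
        # environment, sweeping the number of envs and burn-in sizes, and
        # verify that the stored recurrent states (h, c) line up with the
        # rewards and observations recovered from sampled batches.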
        config = dict(
            replay_type='seqper',  # sequential PER; other options: 'per' or 'uniform'
            precision=32,
            # arguments for PER
            beta0=0.4,
            to_update_top_priority=False,

            # arguments for general replay
            batch_size=2,
            sample_size=7,
            burn_in_size=2,
            min_size=2,
            capacity=10000,
            state_keys=['h', 'c', 'prev_reward'],
            extra_keys=['obs', 'action', 'mu', 'mask'])
        env_config = dict(n_envs=1, name='dummy')
        from env.dummy import DummyEnv
        from env import wrappers
        from env.func import create_env

        def mkenv(config):
            env = DummyEnv(**config)
            env = wrappers.post_wrap(env, config)
            return env

        for n_envs in np.arange(2, 3):
            config['n_envs'] = n_envs
            env_config['n_envs'] = n_envs
            for burn_in_size in np.arange(0, config['sample_size']):
                config['burn_in_size'] = burn_in_size
                replay = create_replay(config)
                env = create_env(env_config, mkenv)
                out = env.output()
                o, prev_reward, d, reset = out
                for i in range(1, 10000):
                    a = np.random.randint(0, 10, n_envs)
                    no, r, d, reset = env.step(a)
                    if n_envs == 1:
                        h = np.ones(2) * r
                        c = np.ones(2) * r
                    else:
                        h = np.ones((n_envs, 2)) * r[:, None]
                        c = np.ones((n_envs, 2)) * r[:, None]
                    replay.add(obs=o,
                               reward=r,
                               discount=d,
                               h=h,
                               c=c,
                               mask=1 - reset,
                               prev_reward=prev_reward)
                    if replay.good_to_learn():
                        data = replay.sample()
                        np.testing.assert_equal(data['reward'][:, 0],
                                                data['h'][:, 0])
                        np.testing.assert_equal(data['obs'][:, 0, 0],
                                                data['c'][:, 0])
                    o = no
                    prev_reward = r
Example #3
def main(env_config, model_config, agent_config, replay_config):
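    # Training entry point: build the training and evaluation environments,
    # a replay buffer sized to the number of parallel envs, a dataset that
    # feeds the agent, and finally run the train loop.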
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config)
    eval_env_config = env_config.copy()
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    reward_keys = [k for k in eval_env_config.keys() if 'reward' in k]
    for k in reward_keys:
        eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    agent_config['N_UPDATES'] *= env_config['n_workers'] * env_config['n_envs']
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    n_workers = env_config.get('n_workers', 1)
    n_envs = env_config.get('n_envs', 1)
    replay_config['n_envs'] = n_workers * n_envs
    replay_config['seqlen'] = env.max_episode_steps
    if getattr(models, 'state_keys', ()):
        replay_config['state_keys'] = list(models.state_keys)
    replay = create_replay(replay_config)
    replay.load_data()

    am = pkg.import_module('agent', config=agent_config)
    data_format = am.get_data_format(env=env,
                                     replay_config=replay_config,
                                     agent_config=agent_config,
                                     model=models)
    dataset = create_dataset(replay, env, data_format=data_format)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             replay=replay_config))

    train(agent, env, eval_env, replay)

    if use_ray:
        ray.shutdown()
Example #4
def get_expert_data(data_path):
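    # Load previously saved expert transitions from data_path into a uniform
    # replay buffer so they can be used as a demonstration dataset.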
    data_path = data_path.lower()
    config = dict(dir=data_path,
                  replay_type='uniform',
                  capacity=int(1e6),
                  n_steps=1,
                  min_size=1,
                  batch_size=64,
                  has_next_obs=True)
    print(f'Loading data from {data_path}')
    exp_buffer = create_replay(config)
    exp_buffer.load_data()
    print(f'Expert buffer size: {len(exp_buffer)}')
    return exp_buffer
Example #5
def main(env_config, model_config, agent_config, replay_config):
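    # Training entry point for an agent whose dataset is preprocessed with the
    # environment: observations rescaled to [-0.5, 0.5] and actions one-hot
    # encoded before being fed to the learner.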
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, make_env, force_envvec=True)
    eval_env_config = env_config.copy()
    eval_env_config['n_envs'] = 1
    eval_env_config['n_workers'] = 1
    eval_env = create_env(eval_env_config, make_env)

    replay_config['dir'] = agent_config['root_dir'].replace('logs', 'data')
    replay = create_replay(replay_config)
    replay.load_data()
    dtype = global_policy().compute_dtype
    data_format = pkg.import_module(
        'agent', config=agent_config).get_data_format(
            env=env,
            batch_size=agent_config['batch_size'],
            sample_size=agent_config['sample_size'],
            dtype=dtype)
    process = functools.partial(process_with_env,
                                env=env,
                                obs_range=[-.5, .5],
                                one_hot_action=True,
                                dtype=dtype)
    dataset = Dataset(replay, data_format, process)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(
        dict(env=env_config,
             model=model_config,
             agent=agent_config,
             replay=replay_config))

    train(agent, env, eval_env, replay)
Example #6
        def __init__(self,
                    name, 
                    model_fn,
                    config, 
                    model_config,
                    env_config, 
                    replay_config):
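            # Learner setup: pin CPU affinity, configure threads, GPU, and
            # precision, build a single env to infer shapes and dtypes, wrap
            # the models in an Ensemble, and attach a replay-backed,
            # prefetching Dataset.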
            cpu_affinity('Learner')
            silence_tf_logs()
            configure_threads(config['n_cpus'], config['n_cpus'])
            configure_gpu()
            configure_precision(config['precision'])
            self._dtype = global_policy().compute_dtype

            self._envs_per_worker = env_config['n_envs']
            env_config['n_envs'] = 1
            env = create_env(env_config)
            assert env.obs_dtype == np.uint8, \
                f'Expect image observation of type uint8, but get {env.obs_dtype}'
            self._action_shape = env.action_shape
            self._action_dim = env.action_dim
            self._frame_skip = getattr(env, 'frame_skip', 1)

            self.models = Ensemble(
                model_fn=model_fn,
                config=model_config, 
                obs_shape=env.obs_shape,
                action_dim=env.action_dim, 
                is_action_discrete=env.is_action_discrete
            )

            super().__init__(
                name=name, 
                config=config, 
                models=self.models,
                dataset=None,
                env=env)

            replay_config['dir'] = config['root_dir'].replace('logs', 'data')
            self.replay = create_replay(replay_config)
            data_format = get_data_format(env, replay_config)
            process = functools.partial(process_with_env, env=env)
            self.dataset = Dataset(self.replay, data_format, process, prefetch=10)

            self._env_step = self.env_step()
Example #7
def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
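    # Evaluation entry point: run the trained agent for n episodes, optionally
    # recording a video and, when save=True, storing the collected transitions
    # in a uniform replay buffer for later use.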
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1

    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)

    models = create_model(model_config, env)

    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(obs=obs,
                       action=action,
                       reward=reward,
                       discount=discount,
                       next_obs=next_obs,
                       logpi=logpi)
    else:

        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(env,
                                      agent,
                                      n,
                                      record=record,
                                      size=size,
                                      video_len=video_len,
                                      step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()