def test_buffer_op(self):
    replay = create_replay(config)
    simp_replay = ReplayBuffer(config)

    env = gym.make('BipedalWalkerHardcore-v3')

    s = env.reset()
    for i in range(10000):
        a = env.action_space.sample()
        ns, r, d, _ = env.step(a)
        if d:
            ns = env.reset()
        # Feed identical transitions to both buffer implementations.
        replay.add(obs=s.astype(np.float32), action=a.astype(np.float32),
                   reward=np.float32(r), next_obs=ns.astype(np.float32), done=d)
        simp_replay.add(obs=s, action=a, reward=r, next_obs=ns, done=d)
        s = ns
        if i > 1000:
            # With the same global seed, samples drawn from the two buffers must match.
            set_global_seed(i)
            sample1 = replay.sample()
            set_global_seed(i)
            sample2 = simp_replay.sample()
            for k in sample1.keys():
                np.testing.assert_allclose(sample1[k], sample2[k], err_msg=f'{k}')
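# A minimal sketch (assumption, not the repo's actual values) of the module-level
# `config` referenced by test_buffer_op above; the keys mirror the uniform-replay
# config used by get_expert_data later in this section.
_example_buffer_config = dict(
    replay_type='uniform',   # plain uniform replay, comparable to the simple ReplayBuffer
    n_steps=1,               # single-step transitions
    batch_size=64,
    min_size=1000,           # sampling starts only after the i > 1000 warm-up in the test
    capacity=int(1e5),
)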
def test_sper(self):
    config = dict(
        replay_type='seqper',               # per or uniform
        precision=32,
        # arguments for PER
        beta0=0.4,
        to_update_top_priority=False,
        # arguments for general replay
        batch_size=2,
        sample_size=7,
        burn_in_size=2,
        min_size=2,
        capacity=10000,
        state_keys=['h', 'c', 'prev_reward'],
        extra_keys=['obs', 'action', 'mu', 'mask'])
    env_config = dict(n_envs=1, name='dummy')
    from env.dummy import DummyEnv
    from env import wrappers
    from env.func import create_env

    def mkenv(config):
        env = DummyEnv(**config)
        env = wrappers.post_wrap(env, config)
        return env

    for n_envs in np.arange(2, 3):
        config['n_envs'] = n_envs
        env_config['n_envs'] = n_envs
        # Exercise every burn-in length from 0 up to sample_size - 1.
        for burn_in_size in np.arange(0, config['sample_size']):
            config['burn_in_size'] = burn_in_size
            replay = create_replay(config)
            env = create_env(env_config, mkenv)
            out = env.output()
            o, prev_reward, d, reset = out
            for i in range(1, 10000):
                a = np.random.randint(0, 10, n_envs)
                no, r, d, reset = env.step(a)
                # Fill the recurrent states h and c with the reward so the sampled
                # sequences can be checked against it below.
                if n_envs == 1:
                    h = np.ones(2) * r
                    c = np.ones(2) * r
                else:
                    h = np.ones((n_envs, 2)) * r[:, None]
                    c = np.ones((n_envs, 2)) * r[:, None]
                replay.add(obs=o, reward=r, discount=d,
                           h=h, c=c, mask=1 - reset, prev_reward=prev_reward)
                if replay.good_to_learn():
                    data = replay.sample()
                    np.testing.assert_equal(data['reward'][:, 0], data['h'][:, 0])
                    np.testing.assert_equal(data['obs'][:, 0, 0], data['c'][:, 0])
                o = no
                prev_reward = r
def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config)
    eval_env_config = env_config.copy()
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    # Drop reward-related keys so the evaluation env is created without them.
    reward_keys = [k for k in eval_env_config.keys() if 'reward' in k]
    for k in reward_keys:
        eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    agent_config['N_UPDATES'] *= env_config['n_workers'] * env_config['n_envs']
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    n_workers = env_config.get('n_workers', 1)
    n_envs = env_config.get('n_envs', 1)
    replay_config['n_envs'] = n_workers * n_envs
    replay_config['seqlen'] = env.max_episode_steps
    if getattr(models, 'state_keys', ()):
        replay_config['state_keys'] = list(models.state_keys)
    replay = create_replay(replay_config)
    replay.load_data()

    am = pkg.import_module('agent', config=agent_config)
    data_format = am.get_data_format(
        env=env,
        replay_config=replay_config,
        agent_config=agent_config,
        model=models)
    dataset = create_dataset(replay, env, data_format=data_format)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        replay=replay_config))

    train(agent, env, eval_env, replay)

    if use_ray:
        ray.shutdown()
def get_expert_data(data_path):
    data_path = data_path.lower()
    config = dict(
        dir=data_path,
        replay_type='uniform',
        capacity=int(1e6),
        n_steps=1,
        min_size=1,
        batch_size=64,
        has_next_obs=True)
    print(f'Loading data from {data_path}')
    exp_buffer = create_replay(config)
    exp_buffer.load_data()
    print(f'Expert buffer size: {len(exp_buffer)}')
    return exp_buffer
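# Hypothetical usage sketch of get_expert_data above; the path is illustrative only.
# It assumes the buffer exposes a sample() method returning a dict of arrays, as the
# replay tests earlier in this section suggest.
if __name__ == '__main__':
    exp_buffer = get_expert_data('data/sac-walker2d')    # illustrative path
    batch = exp_buffer.sample()                          # dict of arrays, batch_size=64
    print({k: v.shape for k, v in batch.items()})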
def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, make_env, force_envvec=True)
    eval_env_config = env_config.copy()
    eval_env_config['n_envs'] = 1
    eval_env_config['n_workers'] = 1
    eval_env = create_env(eval_env_config, make_env)

    replay_config['dir'] = agent_config['root_dir'].replace('logs', 'data')
    replay = create_replay(replay_config)
    replay.load_data()
    dtype = global_policy().compute_dtype
    data_format = pkg.import_module(
        'agent', config=agent_config).get_data_format(
            env=env,
            batch_size=agent_config['batch_size'],
            sample_size=agent_config['sample_size'],
            dtype=dtype)
    process = functools.partial(process_with_env,
        env=env, obs_range=[-.5, .5], one_hot_action=True, dtype=dtype)
    dataset = Dataset(replay, data_format, process)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        replay=replay_config))

    train(agent, env, eval_env, replay)
def __init__(self,
             name,
             model_fn,
             config,
             model_config,
             env_config,
             replay_config):
    cpu_affinity('Learner')
    silence_tf_logs()
    configure_threads(config['n_cpus'], config['n_cpus'])
    configure_gpu()
    configure_precision(config['precision'])
    self._dtype = global_policy().compute_dtype

    self._envs_per_worker = env_config['n_envs']
    env_config['n_envs'] = 1
    env = create_env(env_config)
    assert env.obs_dtype == np.uint8, \
        f'Expect image observation of type uint8, but get {env.obs_dtype}'
    self._action_shape = env.action_shape
    self._action_dim = env.action_dim
    self._frame_skip = getattr(env, 'frame_skip', 1)

    self.models = Ensemble(
        model_fn=model_fn,
        config=model_config,
        obs_shape=env.obs_shape,
        action_dim=env.action_dim,
        is_action_discrete=env.is_action_discrete)

    super().__init__(
        name=name,
        config=config,
        models=self.models,
        dataset=None,
        env=env)

    replay_config['dir'] = config['root_dir'].replace('logs', 'data')
    self.replay = create_replay(replay_config)

    data_format = get_data_format(env, replay_config)
    process = functools.partial(process_with_env, env=env)
    self.dataset = Dataset(self.replay, data_format, process, prefetch=10)

    self._env_step = self.env_step()
def main(env_config, model_config, agent_config, replay_config,
         n, record=False, size=(128, 128), video_len=1000,
         fps=30, save=False):
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1

    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        # Record every transition seen during evaluation into the replay buffer.
        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(obs=obs, action=action, reward=reward,
                       discount=discount, next_obs=next_obs, logpi=logpi)
    else:
        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(
        env, agent, n, record=record, size=size,
        video_len=video_len, step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()
    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
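# Note (assumption): when the evaluation above runs with save=True, the transitions
# are saved under f'data/{agent.name.lower()}-{env.name.lower()}'. Passing that same
# directory to get_expert_data earlier in this section should reload them, e.g.:
#
#     exp_buffer = get_expert_data(f'data/{algo_name}-{env_name}')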