def main(env_config, model_config, agent_config, buffer_config, train=train):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])
    create_model, Agent = pkg.import_agent(config=agent_config)
    Buffer = pkg.import_module('buffer', config=agent_config).Buffer

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, force_envvec=True)
    eval_env_config = env_config.copy()
    if 'num_levels' in eval_env_config:
        eval_env_config['num_levels'] = 0
    if 'seed' in eval_env_config:
        eval_env_config['seed'] += 1000
    eval_env_config['n_workers'] = 1
    for k in list(eval_env_config.keys()):
        # pop reward hacks
        if 'reward' in k:
            eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    def sigint_handler(sig, frame):
        signal.signal(sig, signal.SIG_IGN)
        env.close()
        eval_env.close()
        sys.exit(0)
    signal.signal(signal.SIGINT, sigint_handler)

    models = create_model(model_config, env)

    buffer_config['n_envs'] = env.n_envs
    buffer_config['state_keys'] = models.state_keys
    buffer = Buffer(buffer_config)

    agent = Agent(config=agent_config, models=models, dataset=buffer, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        buffer=buffer_config))

    train(agent, env, eval_env, buffer)

    if use_ray:
        env.close()
        eval_env.close()
        ray.shutdown()

def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config)
    eval_env_config = env_config.copy()
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    # pop reward hacks so evaluation uses the raw reward
    reward_keys = [k for k in eval_env_config.keys() if 'reward' in k]
    for k in reward_keys:
        eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    n_workers = env_config.get('n_workers', 1)
    n_envs = env_config.get('n_envs', 1)
    agent_config['N_UPDATES'] *= n_workers * n_envs
    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)

    replay_config['n_envs'] = n_workers * n_envs
    replay_config['seqlen'] = env.max_episode_steps
    if getattr(models, 'state_keys', ()):
        replay_config['state_keys'] = list(models.state_keys)
    replay = create_replay(replay_config)
    replay.load_data()

    am = pkg.import_module('agent', config=agent_config)
    data_format = am.get_data_format(
        env=env, replay_config=replay_config,
        agent_config=agent_config, model=models)
    dataset = create_dataset(replay, env, data_format=data_format)

    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        replay=replay_config))

    train(agent, env, eval_env, replay)

    if use_ray:
        ray.shutdown()

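# For orientation: a hypothetical invocation of an entry point like the one
# above. The key names mirror the ones the function actually reads; the
# values, and any keys beyond those, are assumptions, not taken from this
# codebase.
if __name__ == '__main__':
    env_config = dict(name='atari_pong', n_workers=1, n_envs=4)
    model_config = dict()  # consumed by create_model; contents are model-specific
    agent_config = dict(algorithm='ppo', precision=32, N_UPDATES=1)
    replay_config = dict(replay_type='uniform', min_size=100, capacity=10000)
    main(env_config, model_config, agent_config, replay_config)
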
def test_Env(self):
    for name in [
            'atari_pong', 'atari_breakout', 'BipedalWalkerHardcore-v3']:
        for life_done in [False, True]:
            for _ in range(2):
                config = default_config.copy()
                config['name'] = name
                config['n_envs'] = 1
                config['life_done'] = life_done
                env = create_env(config)
                cr = 0
                n = 0
                re = 0
                for i in range(2000):
                    a = env.random_action()
                    s, r, d, re = env.step(a)
                    cr += r
                    if r != 0:
                        print(name, i, r, cr, env.score())
                    n += env.info().get('frame_skip', 1)
                    np.testing.assert_equal(cr, env.score())
                    np.testing.assert_equal(n, env.epslen())
                    if env.info().get('game_over'):
                        cr = 0
                        n = 0

def __init__(self,
             *,
             config,
             name='Evaluator',
             model_config,
             env_config,
             model_fn):
    config_actor(name, config)

    env_config.pop('reward_clip', False)
    self.env = env = create_env(env_config)

    model = model_fn(config=model_config, env=env)

    super().__init__(
        name=name,
        config=config,
        models=model,
        dataset=None,
        env=env,
    )

    # the names of network modules that should be in sync with the learner
    if not hasattr(self, '_pull_names'):
        self._pull_names = [
            k for k in self.model.keys() if 'target' not in k]

    # used for recording evaluator side info
    self._info = collections.defaultdict(list)

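# The _pull_names filter above recurs across these actors. Below is a
# method-shaped sketch of how such a name list might be used to sync weights
# from a learner; learner.get_weights and self.model.set_weights are
# assumptions for illustration, not APIs shown in these snippets.
def _pull_weights(self, learner):
    # learner is assumed to be a Ray actor handle exposing get_weights;
    # only modules whose names lack 'target' are refreshed.
    weights = ray.get(learner.get_weights.remote(name=self._pull_names))
    self.model.set_weights(weights)
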
def __init__(self,
             model_fn,
             replay,
             config,
             model_config,
             env_config,
             replay_config):
    config_actor('Learner', config)

    env = create_env(env_config)

    model = model_fn(config=model_config, env=env)

    am = pkg.import_module('agent', config=config, place=-1)
    data_format = am.get_data_format(
        env=env, replay_config=replay_config,
        agent_config=config, model=model)
    dataset = create_dataset(
        replay, env, data_format=data_format, use_ray=True)

    super().__init__(
        name='Learner',
        config=config,
        models=model,
        dataset=dataset,
        env=env,
    )

def __init__(self, name, worker_id, env_config):
    cpu_affinity(f'Worker_{worker_id}')
    self.name = name
    self._id = worker_id

    self._n_envs = env_config['n_envs']
    env_config['n_workers'] = env_config['n_envs'] = 1
    self._envs = [create_env(env_config) for _ in range(self._n_envs)]

def __init__(self, worker_id, config, env_config, buffer_config):
    config_attr(self, config)
    cpu_affinity(f'Worker_{worker_id}')
    self._id = worker_id

    self._n_envvecs = env_config.pop('n_envvecs')
    env_config.pop('n_workers', None)
    self._envvecs = [
        create_env(env_config, force_envvec=True)
        for _ in range(self._n_envvecs)]

    collect_fn = pkg.import_module('agent', config=config, place=-1).collect
    self._collect = functools.partial(
        collect_fn, env=None, step=None, reset=None)

    buffer_config['force_envvec'] = True
    self._buffs = {
        eid: create_local_buffer(buffer_config)
        for eid in range(self._n_envvecs)}

    self._obs = {
        eid: e.output().obs for eid, e in enumerate(self._envvecs)}
    self._info = collections.defaultdict(list)

def test_RayEnvVec(self):
    for name in [
            'atari_pong', 'atari_breakout', 'BipedalWalkerHardcore-v3']:
        for _ in range(3):
            config = default_config.copy()
            config['name'] = name
            ray.init()
            config['n_envs'] = 2
            config['n_workers'] = 2
            env = create_env(config)
            cr = np.zeros(env.n_envs)
            n = np.zeros(env.n_envs)
            for _ in range(2000):
                a = env.random_action()
                s, r, d, re = env.step(a)
                cr += r
                n += np.array([i.get('frame_skip', 1) for i in env.info()])
                np.testing.assert_allclose(cr, env.score())
                np.testing.assert_equal(n, env.epslen())
                if np.any(re):
                    info = env.info()
                    for k, i in enumerate(info):
                        if i.get('game_over'):
                            cr[k] = 0
                            n[k] = 0
            ray.shutdown()

def test_sper(self):
    config = dict(
        replay_type='seqper',   # per or uniform
        precision=32,
        # arguments for PER
        beta0=0.4,
        to_update_top_priority=False,
        # arguments for general replay
        batch_size=2,
        sample_size=7,
        burn_in_size=2,
        min_size=2,
        capacity=10000,
        state_keys=['h', 'c', 'prev_reward'],
        extra_keys=['obs', 'action', 'mu', 'mask'])
    env_config = dict(n_envs=1, name='dummy')
    from env.dummy import DummyEnv
    from env import wrappers
    from env.func import create_env

    def mkenv(config):
        env = DummyEnv(**config)
        env = wrappers.post_wrap(env, config)
        return env

    for n_envs in np.arange(2, 3):
        config['n_envs'] = n_envs
        env_config['n_envs'] = n_envs
        for burn_in_size in np.arange(0, config['sample_size']):
            config['burn_in_size'] = burn_in_size
            replay = create_replay(config)
            env = create_env(env_config, mkenv)
            out = env.output()
            o, prev_reward, d, reset = out
            for i in range(1, 10000):
                a = np.random.randint(0, 10, n_envs)
                no, r, d, reset = env.step(a)
                if n_envs == 1:
                    h = np.ones(2) * r
                    c = np.ones(2) * r
                else:
                    h = np.ones((n_envs, 2)) * r[:, None]
                    c = np.ones((n_envs, 2)) * r[:, None]
                replay.add(
                    obs=o,
                    reward=r,
                    discount=d,
                    h=h,
                    c=c,
                    mask=1 - reset,
                    prev_reward=prev_reward)
                if replay.good_to_learn():
                    data = replay.sample()
                    np.testing.assert_equal(
                        data['reward'][:, 0], data['h'][:, 0])
                    np.testing.assert_equal(
                        data['obs'][:, 0, 0], data['c'][:, 0])
                o = no
                prev_reward = r

def main(env_config, model_config, agent_config, replay_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, make_env, force_envvec=True)
    eval_env_config = env_config.copy()
    eval_env_config['n_envs'] = 1
    eval_env_config['n_workers'] = 1
    eval_env = create_env(eval_env_config, make_env)

    replay_config['dir'] = agent_config['root_dir'].replace('logs', 'data')
    replay = create_replay(replay_config)
    replay.load_data()

    dtype = global_policy().compute_dtype
    data_format = pkg.import_module(
        'agent', config=agent_config).get_data_format(
            env=env,
            batch_size=agent_config['batch_size'],
            sample_size=agent_config['sample_size'],
            dtype=dtype)
    process = functools.partial(
        process_with_env,
        env=env,
        obs_range=[-.5, .5],
        one_hot_action=True,
        dtype=dtype)
    dataset = Dataset(replay, data_format, process)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=dataset, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        replay=replay_config))

    train(agent, env, eval_env, replay)

def test_sequential_buffer_random(self):
    config = dict(
        replay_type='seqper',   # per or uniform
        # arguments for general replay
        n_envs=32,
        seqlen=16,
        reset_shift=2,
        state_keys=['h', 'c', 'prev_reward'],
        extra_keys=['obs', 'action', 'mu', 'mask'])
    env_config = dict(n_envs=1, name='dummy')
    from env.dummy import DummyEnv
    from env import wrappers
    from env.func import create_env

    def mkenv(config):
        env = DummyEnv(**config)
        env = wrappers.post_wrap(env, config)
        return env

    for n_envs in np.arange(2, 3):
        config['n_envs'] = n_envs
        env_config['n_envs'] = n_envs
        for burn_in_size in np.arange(0, config['seqlen']):
            config['burn_in_size'] = burn_in_size
            buff = create_local_buffer(config)
            env = create_env(env_config, mkenv)
            out = env.output()
            o, prev_reward, d, reset = out
            for i in range(1, 1000):
                a = np.random.randint(0, 10, n_envs)
                no, r, d, reset = env.step(a)
                if n_envs == 1:
                    h = np.ones(2) * r
                    c = np.ones(2) * r
                else:
                    h = np.ones((n_envs, 2)) * r[:, None]
                    c = np.ones((n_envs, 2)) * r[:, None]
                buff.add(
                    obs=o,
                    reward=r,
                    discount=d,
                    h=h,
                    c=c,
                    mask=1 - reset,
                    prev_reward=prev_reward)
                if buff.is_full():
                    data_list = buff.sample()
                    if n_envs == 1:
                        data_list = [data_list]
                    for data in data_list:
                        np.testing.assert_equal(
                            data['reward'][0], data['h'][0])
                        np.testing.assert_equal(
                            data['obs'][0, 0], data['c'][0])
                    buff.reset()
                prev_reward = r
                o = no

def main(env_config, model_config, agent_config, buffer_config):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config['precision'])
    create_model, Agent = pkg.import_agent(config=agent_config)
    Buffer = pkg.import_module('buffer', config=agent_config).Buffer

    use_ray = env_config.get('n_workers', 1) > 1
    if use_ray:
        import ray
        from utility.ray_setup import sigint_shutdown_ray
        ray.init()
        sigint_shutdown_ray()

    env = create_env(env_config, force_envvec=True)
    eval_env_config = env_config.copy()
    if 'seed' in eval_env_config:
        eval_env_config['seed'] += 1000
    eval_env_config['n_workers'] = 1
    eval_env_config['n_envs'] = 1
    for k in list(eval_env_config.keys()):
        # pop reward hacks
        if 'reward' in k:
            eval_env_config.pop(k)
    eval_env = create_env(eval_env_config, force_envvec=True)

    models = create_model(model_config, env)

    buffer_config['n_envs'] = env.n_envs
    buffer = Buffer(buffer_config)

    agent = Agent(config=agent_config, models=models, dataset=buffer, env=env)

    agent.save_config(dict(
        env=env_config,
        model=model_config,
        agent=agent_config,
        buffer=buffer_config))

    train(agent, env, eval_env, buffer)

    if use_ray:
        ray.shutdown()

def __init__(self,
             *,
             worker_id,
             config,
             model_config,
             env_config,
             buffer_config,
             model_fn,
             buffer_fn):
    config_actor(f'Worker_{worker_id}', config)

    self._id = worker_id

    self.env = create_env(env_config)
    buffer_config['n_envs'] = self.env.n_envs
    if 'seqlen' not in buffer_config:
        buffer_config['seqlen'] = self.env.max_episode_steps
    self.buffer = buffer_fn(buffer_config)

    models = model_fn(config=model_config, env=self.env)

    super().__init__(
        name=f'Worker_{worker_id}',
        config=config,
        models=models,
        dataset=self.buffer,
        env=self.env)

    # setup runner
    import importlib
    em = importlib.import_module(
        f'env.{env_config["name"].split("_")[0]}')
    info_func = em.info_func if hasattr(em, 'info_func') else None
    self._run_mode = getattr(self, '_run_mode', RunMode.NSTEPS)
    assert self._run_mode in [RunMode.NSTEPS, RunMode.TRAJ]
    self.runner = Runner(
        self.env, self,
        nsteps=self.SYNC_PERIOD if self._run_mode == RunMode.NSTEPS else None,
        run_mode=self._run_mode,
        record_envs=getattr(self, '_record_envs', None),
        info_func=info_func)

    # worker side prioritization
    self._worker_side_prioritization = getattr(
        self, '_worker_side_prioritization', False)
    self._return_stats = self._worker_side_prioritization \
        or buffer_config.get('max_steps', 0) > buffer_config.get('n_steps', 1)

    # setup self._collect using the <collect> function from the algorithm module
    collect_fn = pkg.import_module(
        'agent', algo=self._algorithm, place=-1).collect
    self._collect = functools.partial(collect_fn, self.buffer)

    # the names of network modules that should be in sync with the learner
    if not hasattr(self, '_pull_names'):
        self._pull_names = [
            k for k in self.model.keys() if 'target' not in k]

    # used for recording worker side info
    self._info = collections.defaultdict(list)

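# The _worker_side_prioritization flag above implies priorities are computed
# where the data is generated. Below is a minimal sketch of one common scheme,
# the R2D2-style mixture of max and mean absolute TD error over a sequence;
# the function name and shapes are assumptions, not taken from this codebase.
import numpy as np

def compute_priorities(td_errors, eta=0.9):
    # td_errors: [n_envs, n_steps] TD errors for collected sequences
    abs_td = np.abs(td_errors)
    return eta * abs_td.max(-1) + (1 - eta) * abs_td.mean(-1)
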
def __init__(self,
             actor_id,
             model_fn,
             config,
             model_config,
             env_config):
    config_actor('Actor', config)

    self._id = actor_id

    self._n_envvecs = env_config['n_envvecs']
    self._n_envs = env_config['n_envs']
    env = create_env(env_config)

    models = model_fn(model_config, env)

    super().__init__(
        name=f'Actor_{actor_id}',
        config=config,
        models=models,
        dataset=None,
        env=env)

    # number of workers per actor
    self._wpa = self._n_workers // self._n_actors

    self._action_batch = int(
        self._n_workers * self._n_envvecs * self._action_frac)

    if 'act_eps' in config:
        act_eps = compute_act_eps(
            config['act_eps_type'],
            config['act_eps'],
            None,
            config['n_workers'],
            self._n_envvecs * self._n_envs)
        self._act_eps_mapping = act_eps.reshape(
            config['n_workers'], self._n_envvecs, self._n_envs)
        print(self.name, self._act_eps_mapping)
    else:
        self._act_eps_mapping = None

    # agent's state
    if 'rnn' in self.model:
        self._state_mapping = collections.defaultdict(
            lambda: self.model.get_initial_state(
                batch_size=env.n_envs, dtype=self._dtype))
        self._prev_action_mapping = collections.defaultdict(
            lambda: tf.zeros((env.n_envs, *self._action_shape), self._dtype))

    if not hasattr(self, '_pull_names'):
        self._pull_names = [
            k for k in self.model.keys() if 'target' not in k]

    self._to_sync = Every(self.SYNC_PERIOD) \
        if getattr(self, 'SYNC_PERIOD', None) else None

def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    try:
        make_env = pkg.import_module('env', algo_name, place=-1).make_env
    except Exception:
        make_env = None
    env_config.pop('reward_clip', False)
    env = create_env(env_config, env_fn=make_env)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(
        env, agent, n, record=record, size=size, video_len=video_len)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()

def __init__(self,
             name,
             model_fn,
             config,
             model_config,
             env_config,
             replay_config):
    cpu_affinity('Learner')
    silence_tf_logs()
    configure_threads(config['n_cpus'], config['n_cpus'])
    configure_gpu()
    configure_precision(config['precision'])
    self._dtype = global_policy().compute_dtype

    self._envs_per_worker = env_config['n_envs']
    env_config['n_envs'] = 1
    env = create_env(env_config)
    assert env.obs_dtype == np.uint8, \
        f'Expect image observation of type uint8, but get {env.obs_dtype}'
    self._action_shape = env.action_shape
    self._action_dim = env.action_dim
    self._frame_skip = getattr(env, 'frame_skip', 1)

    self.models = Ensemble(
        model_fn=model_fn,
        config=model_config,
        obs_shape=env.obs_shape,
        action_dim=env.action_dim,
        is_action_discrete=env.is_action_discrete)

    super().__init__(
        name=name,
        config=config,
        models=self.models,
        dataset=None,
        env=env)

    replay_config['dir'] = config['root_dir'].replace('logs', 'data')
    self.replay = create_replay(replay_config)
    data_format = get_data_format(env, replay_config)
    process = functools.partial(process_with_env, env=env)
    self.dataset = Dataset(self.replay, data_format, process, prefetch=10)

    self._env_step = self.env_step()

def __init__(self,
             name,
             model_fn,
             config,
             model_config,
             env_config):
    cpu_affinity('Actor')
    silence_tf_logs()
    configure_threads(1, 1)
    configure_gpu()
    configure_precision(config['precision'])
    self._dtype = global_policy().compute_dtype

    self._envs_per_worker = env_config['n_envs']
    env_config['n_envs'] = config['action_batch']
    # assign the env to self before it is referenced below
    self.env = create_env(env_config)
    assert self.env.obs_dtype == np.uint8, \
        f'Expect image observation of type uint8, but get {self.env.obs_dtype}'
    self._action_shape = self.env.action_shape
    self._action_dim = self.env.action_dim

    self.models = Ensemble(
        model_fn=model_fn,
        config=model_config,
        obs_shape=self.env.obs_shape,
        action_dim=self.env.action_dim,
        is_action_discrete=self.env.is_action_discrete)

    super().__init__(
        name=name,
        config=config,
        models=self.models,
        dataset=None,
        env=self.env)

    # cache for episodes
    self._cache = collections.defaultdict(list)

    # agent's state
    self._state = collections.defaultdict(
        lambda: self.rssm.get_initial_state(batch_size=1, dtype=self._dtype))
    self._prev_action = collections.defaultdict(
        lambda: tf.zeros((1, self._action_dim), self._dtype))

def main(env_config,
         model_config,
         agent_config,
         replay_config,
         n,
         record=False,
         size=(128, 128),
         video_len=1000,
         fps=30,
         save=False):
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1

    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(
                obs=obs,
                action=action,
                reward=reward,
                discount=discount,
                next_obs=next_obs,
                logpi=logpi)
    else:
        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(
        env, agent, n,
        record=record,
        size=size,
        video_len=video_len,
        step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()
    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()

        self._output = EnvOutput(obs, reward, discount, reset)
        # assert np.all(done) == info.get('game_over', False), (reset, info['game_over'])
        # assert np.all(reset) == info.get('game_over', False), (reset, info['game_over'])
        return self._output


def get_wrapper_by_name(env, classname):
    currentenv = env
    while True:
        if classname == currentenv.__class__.__name__:
            return currentenv
        elif hasattr(currentenv, 'env'):
            currentenv = currentenv.env
        else:
            # don't raise an error here; just return None
            return None


if __name__ == '__main__':
    from env.func import create_env

    env = create_env(dict(name='smac_3s5z', seed=0))
    for i in range(10000):
        a = env.random_action()
        out = env.step(a)
        print(out[2:])
        if np.all(out.reset):
            info = env.info()
            print(info['score'], info['epslen'])

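# A usage sketch for get_wrapper_by_name, assuming a gym-style chain where
# each wrapper keeps its inner environment in .env; the TimeLimit lookup is
# just an illustration.
import gym

env = gym.make('CartPole-v1')  # classic-control envs come wrapped in TimeLimit
time_limit = get_wrapper_by_name(env, 'TimeLimit')
if time_limit is not None:
    print(time_limit._max_episode_steps)  # 500 for CartPole-v1
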
import time

import ray

from env import procgen
from env.func import create_env

if __name__ == '__main__':
    config = dict(
        name='procgen_coinrun',
        n_envs=10,
    )

    def make_env(config):
        env = procgen.make_procgen_env(config)
        return env

    ray.init()
    env = create_env(config, make_env)
    print('Env', env)

    def run(env):
        st = time.time()
        for _ in range(10000):
            a = env.random_action()
            env.step(a)
        return time.time() - st

    print("Ray env:", run(env))
    ray.shutdown()