def make_vec_env(make_env, num_env, init_seed):
    r"""Create a vectorized environment, each sub-environment associated with a different random seed.

    Example::

        >>> import gym
        >>> make_vec_env(lambda: gym.make('CartPole-v1'), 3, 0)
        <VecEnv: 3, CartPole-v1>

    Args:
        make_env (function): a function to create an environment
        num_env (int): number of environments to create.
        init_seed (int): initial seed for :class:`Seeder` to sample random seeds.

    Returns:
        VecEnv: created vectorized environment
    """
    # Generate different seeds for each environment
    seeder = Seeder(init_seed=init_seed)
    seeds = seeder(size=num_env)

    def f(seed):
        env = make_env()
        env.seed(seed)
        env.observation_space.seed(seed)
        env.action_space.seed(seed)
        return env

    # Use partial to generate a list of argument-free make_env functions, each with a different seed
    list_make_env = [partial(f, seed=seed) for seed in seeds]
    return VecEnv(list_make_env)
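# A minimal usage sketch (not part of the library), assuming `gym`, `numpy` and the
# `make_vec_env`/`Seeder` defined above are importable. It shows that the vectorized
# environment is stepped with one action per sub-environment and that re-creating it
# with the same init_seed reproduces the same initial observations.
import gym
import numpy as np

env1 = make_vec_env(lambda: gym.make('CartPole-v1'), num_env=3, init_seed=0)
env2 = make_vec_env(lambda: gym.make('CartPole-v1'), num_env=3, init_seed=0)
assert np.allclose(env1.reset(), env2.reset())  # identical seeds -> identical resets
observations, rewards, dones, infos = env1.step([0, 1, 0])  # one action per sub-environment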
def test_make_vec_env(env_id, num_env, init_seed):
    def make_env():
        return gym.make(env_id)

    env = make_vec_env(make_env, num_env, init_seed)
    assert isinstance(env, VecEnv)
    # Each sub-environment creator should carry a seed drawn from the same Seeder sequence
    seeds = [x.keywords['seed'] for x in env.list_make_env]
    seeder = Seeder(init_seed)
    assert seeds == seeder(num_env)
def test_episode_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)

    assert D.N == 3
    assert D.maxT == max(D.Ts)

    # Re-create identically seeded environments and replay the sticky action to verify the collected data
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                # After termination the remaining slots stay zero-padded
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
def test_seeder():
    seeder = Seeder(init_seed=0)
    assert seeder.rng.get_state()[1][0] == 0
    assert np.random.get_state()[1][20] != seeder.rng.get_state()[1][20]

    # Single list of seeds
    seeds = seeder(size=1)
    assert len(seeds) == 1
    seeds = seeder(size=5)
    assert len(seeds) == 5

    # Batched seeds
    seeds = seeder(size=[1, 3])
    assert np.alltrue(np.array(seeds).shape == (1, 3))
    seeds = seeder(size=[2, 3])
    assert np.alltrue(np.array(seeds).shape == (2, 3))
def test_make_envs(self):
    list_make_env = make_envs(make_env=make_gym_env,
                              env_id='Pendulum-v0',
                              num_env=3,
                              init_seed=1)
    assert len(list_make_env) == 3
    assert list_make_env[0] != list_make_env[1] and list_make_env[0] != list_make_env[2]

    # Test if the seedings are correct
    seeder = Seeder(init_seed=1)
    seeds = seeder(3)
    for make_env, seed in zip(list_make_env, seeds):
        assert make_env.keywords['seed'] == seed
    env = list_make_env[0]()
    raw_env = gym.make('Pendulum-v0')
    raw_env.seed(seeds[0])
    assert np.allclose(env.reset(), raw_env.reset())
def test_rolling_segment_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = RollingSegmentRunner(None, agent, env)
    D = runner(T)

    assert D.N == 3
    assert D.T == T

    # Re-create identically seeded environments and replay the sticky action to verify the collected data
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # Rolling segments reset the environment and record the terminal observation in info
                info['terminal_observation'] = obs
                obs = ev.reset()
            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])
def algorithm(config, seed, device):
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Create one environment constructor per seed and wrap them into a standardized vectorized environment
    seeder = Seeder(seed)
    seeds = seeder(size=config['env.count'])
    env_constructors = []
    for env_seed in seeds:
        env_constructors.append(partial(CraftingEnv, env_seed))
    env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
    env_spec = EnvSpec(env)

    agent = Agent(config, env_spec, device)
    runner = RollingSegmentRunner(config, agent, env)
    engine = Engine(agent, runner, env)

    for i in range(config['train.iter']):
        training_result = engine.train(i)
        print(f'Training iteration {i} complete.')
        if i % config['log.interval'] == 0:
            logs = engine.log_train(training_result)
            pickle_dump(obj=logs, f=logdir / f'iter_{i}_train_logs', ext='.pkl')

    # Save the final policy parameters after training
    torch.save(engine.agent.policy.state_dict(), logdir / 'trained_params')
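# A hypothetical configuration for `algorithm` above, illustrating only the keys the
# function reads ('log.dir', 'ID', 'env.count', 'train.iter', 'log.interval'); the
# concrete values are made up and not taken from the project's experiment files.
config = {
    'log.dir': 'logs-3',   # root directory for logs and checkpoints
    'ID': 0,               # experiment ID, becomes a sub-directory of log.dir
    'env.count': 8,        # number of CraftingEnv instances in the vectorized environment
    'train.iter': 1000,    # number of training iterations
    'log.interval': 10,    # dump training logs every N iterations
}
# algorithm(config, seed=0, device=torch.device('cpu'))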
def make_envs(make_env, env_id, num_env, init_seed, **kwargs):
    r"""Create a list of argument-free make_env() functions based on the given settings.

    .. note::

        Each make_env function in the list uses a different random seed generated by :class:`Seeder`.

    Example::

        >>> make_envs(make_env=make_gym_env, env_id='CartPole-v1', num_env=3, init_seed=0)
        [functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=209652396),
         functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=398764591),
         functools.partial(<function make_gym_env at 0x7f2127b5ce18>, env_id='CartPole-v1', seed=924231285)]

    Args:
        make_env (function): a function to create an environment
        env_id (str): environment ID, e.g. 'Pendulum-v0', 'Ant-v2'
        num_env (int): number of environments to create.
        init_seed (int): initial seed for :class:`Seeder` to sample random seeds.
        **kwargs: keyword arguments used to specify other options for make_env.

    Returns:
        list: a list of argument-free make_env() functions, each associated with a different random seed.
    """
    # Generate different seeds for each environment
    seeder = Seeder(init_seed=init_seed)
    seeds = seeder(size=num_env)

    # Use partial to generate a list of argument-free make_env functions, each with a different seed
    list_make_env = [partial(make_env, env_id=env_id, seed=seed, **kwargs) for seed in seeds]

    return list_make_env
import torch
import pickle
from functools import partial

from dial_control_rl import agent
from dial_control_rl.env import CraftingEnv
from lagom.utils import Seeder
from lagom.envs import EnvSpec
from lagom.envs.vec_env import SerialVecEnv, VecStandardize

# Load the trained policy parameters saved by `algorithm` above
params = torch.load('logs-3/0/1013845395/trained_params')

# Rebuild the same (standardized, serial) vectorized environment used during training
env = CraftingEnv()
seeder = Seeder(0)
seeds = seeder(size=1)
env_constructors = []
for seed in seeds:
    env_constructors.append(partial(CraftingEnv, seed))
env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
env_spec = EnvSpec(env)

policy = agent.Policy({'algo.rl': 0}, env_spec, torch.device('cpu'))
policy.load_state_dict(params)
policy = policy.double()


def V(x):
    # Query the policy for its state-value estimate of observation x
    out = policy(torch.tensor(x), ['V'])
    return out['V'][0]
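# A minimal usage sketch of the value-function helper above. That `env.reset()` returns a
# batch containing a single observation and that `V` accepts one raw observation are
# assumptions based on the surrounding code, not verified against dial_control_rl.
obs = env.reset()
print(V(obs[0]))  # estimated value of the initial (standardized) observation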
def test_batch_episode(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32

    # Collect 30 steps with the same sticky action in every sub-environment
    obs = env.reset()
    D.add_observation(obs)
    for t in range(30):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(obs)
        D.add_action(action)
        D.add_reward(reward)
        D.add_done(done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})
        [D.set_completed(n) for n, d in enumerate(done) if d]

    assert D.N == 3
    assert len(D.Ts) == 3
    assert D.maxT == max(D.Ts)
    assert all([isinstance(x, np.ndarray)
                for x in [D.numpy_observations, D.numpy_actions, D.numpy_rewards,
                          D.numpy_dones, D.numpy_masks]])
    assert all([x.dtype == np.float32
                for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]])
    assert all([x.shape == (3, D.maxT)
                for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert D.numpy_actions.dtype == action_dtype
    assert D.numpy_dones.dtype == np.bool
    assert D.numpy_observations.shape == (3, D.maxT + 1) + env_spec.observation_space.shape
    assert D.numpy_actions.shape == (3, D.maxT) + action_shape
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == 30
    assert np.allclose([0.1 * (x + 1) for x in range(30)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(30)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(30)],
                       [info['V'][2] for info in D.batch_infos])

    # Re-create identically seeded environments and replay the sticky action to verify the collected data
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(30):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                # After termination the remaining slots stay zero-padded
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
def test_batch_segment(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    T = 30
    D = BatchSegment(env_spec, T)

    if env_id == 'CartPole-v1':
        sticky_action = 1
        action_shape = ()
        action_dtype = np.int32
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
        action_shape = env_spec.action_space.shape
        action_dtype = np.float32

    # Collect T steps with the same sticky action in every sub-environment
    obs = env.reset()
    D.add_observation(0, obs)
    for t in range(T):
        action = [sticky_action] * env.num_env
        obs, reward, done, info = env.step(action)
        D.add_observation(t + 1, obs)
        D.add_action(t, action)
        D.add_reward(t, reward)
        D.add_done(t, done)
        D.add_info(info)
        D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]})

    assert D.N == 3
    assert D.T == T
    assert all([isinstance(x, np.ndarray)
                for x in [D.numpy_observations, D.numpy_actions, D.numpy_rewards,
                          D.numpy_dones, D.numpy_masks]])
    assert all([x.dtype == np.float32
                for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks]])
    assert D.numpy_actions.dtype == action_dtype
    assert D.numpy_dones.dtype == np.bool
    assert D.numpy_observations.shape[:2] == (3, T + 1)
    assert D.numpy_actions.shape == (3, T) + action_shape
    assert all([x.shape == (3, T)
                for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks]])
    assert isinstance(D.batch_infos, list) and len(D.batch_infos) == T
    assert np.allclose([0.1 * (x + 1) for x in range(T)],
                       [info['V'][0] for info in D.batch_infos])
    assert np.allclose([1 * (x + 1) for x in range(T)],
                       [info['V'][1] for info in D.batch_infos])
    assert np.allclose([10 * (x + 1) for x in range(T)],
                       [info['V'][2] for info in D.batch_infos])

    # Re-create identically seeded environments and replay the sticky action to verify the collected data
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # Segments reset the environment and record the terminal observation in info
                info['terminal_observation'] = obs
                obs = ev.reset()
            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(info['terminal_observation'],
                                   D.infos[n][t]['terminal_observation'])