def check(agent_name, env_name):
    """Sanity-check TrajectoryRunner with the given agent/environment pair."""
    # Single-environment vectorized env plus its specification.
    env_makers = make_envs(make_env=make_gym_env,
                           env_id=env_name,
                           num_env=1,
                           init_seed=0)
    env = SerialVecEnv(list_make_env=env_makers)
    env_spec = EnvSpec(env)

    # Build the requested agent; unknown names are rejected.
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')

    # TrajectoryRunner must refuse a vectorized env with more than one copy.
    with pytest.raises(AssertionError):
        env_makers_two = make_envs(make_env=make_gym_env,
                                   env_id=env_name,
                                   num_env=2,
                                   init_seed=0)
        env_two = SerialVecEnv(list_make_env=env_makers_two)
        runner2 = TrajectoryRunner(agent=agent, env=env_two, gamma=1.0)

    runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)

    # Small batch: 3 trajectories, 4 steps each.
    D = runner(N=3, T=4)
    assert len(D) == 3
    assert all(isinstance(d, Trajectory) for d in D)
    assert all(d.T == 4 for d in D)
    assert all(d.gamma == 1.0 for d in D)

    # Non-random agents must record their action log-probability.
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info

    # Each transition's state must equal the previous transition's next state.
    for d in D:
        for prev_tr, next_tr in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(prev_tr.s_next, next_tr.s)

    # Long horizon: trajectories that terminate early must end with done=True.
    D = runner(N=3, T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1] == True
def test_sanity_check_env():
    """Step two sanity environments (periods 2 and 3) in lockstep and verify
    observations, rewards, done flags and terminal-observation bookkeeping."""
    env = SerialVecEnv(make_sanity_envs([2, 3]))
    assert isinstance(env.observation_space, Box)
    assert isinstance(env.action_space, Discrete)

    # Both sub-envs reset to the same initial observation.
    obs = env.reset()
    assert np.allclose(obs, [[0.01], [0.01]])

    # Step 1: neither env terminates.
    obs, reward, done, info = env.step([0, 1])
    assert np.allclose(obs, [[1.01], [1.01]])
    assert np.allclose(reward, [0.1, 0.1])
    assert np.allclose(done, [False, False])
    assert all(len(i) == 0 for i in info)

    # Step 2: first env terminates and auto-resets; its true final
    # observation is stashed under 'terminal_observation'.
    obs, reward, done, info = env.step([1, 0])
    assert np.allclose(obs, [[0.01], [2.01]])
    assert np.allclose(reward, [0.2, 0.2])
    assert np.allclose(done, [True, False])
    assert info[0]['terminal_observation'] == [2.01]
    assert len(info[1]) == 0

    # Step 3: now the second env terminates.
    obs, reward, done, info = env.step([1, 1])
    assert np.allclose(obs, [[1.01], [0.01]])
    assert np.allclose(reward, [0.1, 0.3])
    assert np.allclose(done, [False, True])
    assert len(info[0]) == 0
    assert info[1]['terminal_observation'] == [3.01]

    # Step 4: first env terminates again.
    obs, reward, done, info = env.step([0, 0])
    assert np.allclose(obs, [[0.01], [1.01]])
    assert np.allclose(reward, [0.2, 0.1])
    assert np.allclose(done, [True, False])
    assert info[0]['terminal_observation'] == [2.01]
    assert len(info[1]) == 0
def check(agent_name, env_name):
    """Sanity-check SegmentRunner with the given agent/environment pair."""
    # Two serially-vectorized copies of the environment.
    env_makers = make_envs(make_env=make_gym_env,
                           env_id=env_name,
                           num_env=2,
                           init_seed=0)
    env = SerialVecEnv(list_make_env=env_makers)
    env_spec = EnvSpec(env)
    assert env.num_env == 2

    # Build the requested agent; unknown names are rejected.
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')

    runner = SegmentRunner(agent=agent, env=env, gamma=1.0)

    # Small batch: one 3-step segment per environment copy.
    D = runner(T=3, reset=False)
    assert len(D) == 2
    assert all(isinstance(d, Segment) for d in D)
    assert all(d.T == 3 for d in D)
    assert all(d.gamma == 1.0 for d in D)

    # Non-random agents must record their action log-probability.
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info

    # Each transition's state must equal the previous transition's next state.
    for d in D:
        for prev_tr, next_tr in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(prev_tr.s_next, next_tr.s)

    # Rolling behavior: without a reset, the next segment must start exactly
    # where the previous one ended.
    D2 = runner(T=1, reset=False)
    assert len(D2) == 2
    assert all(d.T == 1 for d in D2)
    for d, d2 in zip(D, D2):
        assert np.allclose(d2.all_s[0][0], d.transitions[-1].s_next)

    # Long horizon after a reset: segments always have identical length.
    D = runner(T=200, reset=True)
    assert all(d.T == 200 for d in D)

    # CartPole episodes are shorter than 200 steps, so every segment must
    # contain at least one done=True and therefore multiple trajectories.
    if env_name == 'CartPole-v1':
        assert any(True in d.all_done for d in D)
        assert all(len(d.trajectories) > 1 for d in D)
def make_env_spec(self):
    """Build an EnvSpec over three serially-vectorized CartPole-v1 copies
    with rolling enabled."""
    constructors = make_envs(make_env=make_gym_env,
                             env_id='CartPole-v1',
                             num_env=3,
                             init_seed=0)
    vec_env = SerialVecEnv(list_make_env=constructors, rolling=True)
    return EnvSpec(vec_env)
def make_env_spec(self):
    """Build an EnvSpec over a single serially-vectorized Pendulum-v0 copy."""
    constructors = make_envs(make_env=make_gym_env,
                             env_id='Pendulum-v0',
                             num_env=1,
                             init_seed=0)
    vec_env = SerialVecEnv(list_make_env=constructors)
    return EnvSpec(vec_env)
def algorithm(config, seed, device):
    """Train an agent on CraftingEnv for ``config['train.iter']`` iterations.

    Args:
        config: experiment configuration dict; reads 'log.dir', 'ID',
            'env.count', 'train.iter', 'log.interval'.
        seed: master random seed; also names the per-seed log directory.
        device: torch device handed to the Agent.

    Side effects: pickles training logs every 'log.interval' iterations and
    saves the trained policy parameters under the log directory.
    """
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Derive one independent seed per environment copy from the master seed.
    seeder = Seeder(seed)
    env_seeds = seeder(size=config['env.count'])

    # FIX: the original loop reused the name `seed` as its loop variable,
    # shadowing (and clobbering) the function parameter. Renamed.
    env_constructors = [partial(CraftingEnv, env_seed) for env_seed in env_seeds]

    # Vectorized env with standardized observations/rewards; rewards are
    # clipped to +/-100 after standardization.
    env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
    env_spec = EnvSpec(env)

    agent = Agent(config, env_spec, device)
    runner = RollingSegmentRunner(config, agent, env)
    engine = Engine(agent, runner, env)

    for i in range(config['train.iter']):
        training_result = engine.train(i)
        print(f'Training iteration {i} complete.')
        # Persist logs only on the logging interval.
        if i % config['log.interval'] == 0:
            logs = engine.log_train(training_result)
            pickle_dump(obj=logs, f=logdir / f'iter_{i}_train_logs', ext='.pkl')

    # Save final policy weights.
    torch.save(engine.agent.policy.state_dict(), logdir / 'trained_params')
# Script: load a trained Policy for CraftingEnv and expose V/Q helpers.
from dial_control_rl import agent
from dial_control_rl.env import CraftingEnv
from lagom.utils import Seeder
from lagom.envs.vec_env import SerialVecEnv, VecStandardize
from lagom.envs import EnvSpec
from functools import partial

# NOTE(review): this first instance is immediately rebound below and never
# used — looks dead; confirm before removing.
env = CraftingEnv()

# One derived seed for a single environment copy.
seeder = Seeder(0)
seeds = seeder(size=1)
env_constructors = []
for seed in seeds:
    env_constructors.append(partial(CraftingEnv, seed))

# Vectorized, standardized env (rewards clipped at 100) and its spec,
# mirroring the training-time setup so the policy sees matching inputs.
env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
env_spec = EnvSpec(env)

# NOTE(review): `torch` and `params` are not defined/imported in this
# visible span — presumably provided earlier in the file or an enclosing
# session; confirm.
policy = agent.Policy({'algo.rl': 0}, env_spec, torch.device('cpu'))
policy.load_state_dict(params)
policy = policy.double()


def V(x):
    # State-value estimate for observation x (first element of the 'V' head).
    out = policy(torch.tensor(x), ['V'])
    return out['V'][0]


def Q(x):
    # Query the action distribution head for observation x.
    # NOTE(review): this function appears truncated here — it computes the
    # action distribution but has no return statement in the visible span.
    out = policy(torch.tensor(x), ['action_dist'])
    out = out['action_dist']
def __call__(self, config):
    """Run A2C training for one configuration: build env/policy/agent,
    train for 'train:iter' iterations, periodically evaluate, and save
    train/eval logs under the experiment's log directory."""
    # Seed PyTorch, numpy.random and random for reproducibility.
    set_global_seeds(seed=config['seed'])

    # Vectorized training environment and its specification.
    env_makers = make_envs(make_env=make_gym_env,
                           env_id=config['env:id'],
                           num_env=config['train:N'],
                           init_seed=config['seed'] * 2)
    env = SerialVecEnv(list_make_env=env_makers)
    env_spec = EnvSpec(env)

    # Compute device.
    device = torch.device(
        f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')

    # Categorical policy over an MLP backbone, optimized with Adam.
    network = MLP(config=config).to(device)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo:lr'])

    # Optional linear learning-rate decay over the training iterations.
    if config['algo:use_lr_scheduler']:
        max_epoch = config['train:iter']  # total number of decay steps

        def decay_factor(epoch):
            # Linearly anneal the lr multiplier from 1 toward 0.
            return 1 - epoch / max_epoch

        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                   lr_lambda=decay_factor)

    # Assemble the agent; the scheduler is passed through only when enabled.
    agent_kwargs = {'device': device}
    if config['algo:use_lr_scheduler']:
        agent_kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(policy=policy,
                     optimizer=optimizer,
                     config=config,
                     **agent_kwargs)

    # Runner collects segments; engine drives training/evaluation.
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo:gamma'])
    engine = Engine(agent=agent, runner=runner, config=config, logger=None)

    train_logs = []
    eval_logs = []
    for i in range(config['train:iter']):
        train_output = engine.train(i)
        # Log on the first iteration and every 'log:interval' thereafter.
        if i == 0 or (i + 1) % config['log:interval'] == 0:
            train_logger = engine.log_train(train_output)
            train_logs.append(train_logger.logs)

            eval_output = engine.eval(i)
            eval_logger = engine.log_eval(eval_output)
            eval_logs.append(eval_logger.logs)

    # Persist the collected loggings.
    np.save(Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
    np.save(Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)

    return None