def test_make_vec_env(self):
    """Serial and parallel vec envs built from the same seed must be equivalent."""
    venv1 = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 5, 1, True)
    venv2 = make_vec_env(ParallelVecEnv, make_gym_env, 'CartPole-v1', 5, 1, True)
    # Factory returns the requested VecEnv subclass
    assert isinstance(venv1, VecEnv) and isinstance(venv1, SerialVecEnv)
    assert isinstance(venv2, VecEnv) and isinstance(venv2, ParallelVecEnv)
    assert venv1.num_env == venv2.num_env
    # EnvSpec mirrors num_env of the wrapped vec env
    env_spec1 = EnvSpec(venv1)
    assert env_spec1.num_env == venv1.num_env
    env_spec2 = EnvSpec(venv2)
    assert env_spec2.num_env == venv2.num_env
    # Both variants must expose identical spaces and episode metadata
    assert venv1.observation_space == venv2.observation_space
    assert venv1.action_space == venv2.action_space
    assert venv1.reward_range == venv2.reward_range
    assert venv1.T == venv2.T
    o1 = venv1.reset()
    o2 = venv2.reset()
    # Two environments should have same random seeds, then same results under same actions
    assert np.allclose(o1, o2)
    a = [1] * 5
    o1, r1, d1, _ = venv1.step(a)
    o2, r2, d2, _ = venv2.step(a)
    assert np.allclose(o1, o2)
    assert np.allclose(r1, r2)
    assert np.allclose(d1, d2)
    # Closing flips the `closed` flag exactly once per env
    assert not venv1.closed
    venv1.close()
    assert venv1.closed
    assert not venv2.closed
    venv2.close()
    assert venv2.closed
def test_make_gym_env(self):
    """make_gym_env wraps a gym env into the project's Env/Wrapper interface."""
    env = make_gym_env(env_id='CartPole-v1', seed=0, monitor=False)
    assert isinstance(env, Env)
    # NOTE(review): the wrapper is asserted NOT to be a gym.Env — presumably the
    # project defines its own Env/Wrapper hierarchy distinct from gym's; confirm.
    assert not isinstance(env, gym.Env)
    assert isinstance(env, Wrapper)
    assert isinstance(env.observation_space, Box)
    assert isinstance(env.action_space, Discrete)
    env_spec = EnvSpec(env)
    assert env_spec.control_type == 'Discrete'
    # CartPole-v1: horizon 500, reward threshold 475
    assert env_spec.T == 500
    assert env_spec.max_episode_reward == 475.0
    assert env_spec.reward_range == (-float('inf'), float('inf'))
    assert not env_spec.is_vec_env
    # num_env is only defined for vectorized envs
    with pytest.raises(TypeError):
        env_spec.num_env
    assert env.reset().shape == (4, )
    assert len(env.step(env.action_space.sample())) == 4
    del env
    del env_spec
    # Pendulum, continuous
    # do not test redundant part
    env = make_gym_env('Pendulum-v0', seed=0)
    assert isinstance(env, Env)
    env_spec = EnvSpec(env)
    assert isinstance(env_spec.action_space, Box)
    assert env_spec.T == 200
    assert env_spec.control_type == 'Continuous'
    assert env.reset().shape == (3, )
    assert len(env.step(env.action_space.sample())) == 4
def test_random_agent():
    """RandomAgent returns a valid 'action' entry for single and vectorized envs."""
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    agent = RandomAgent(None, env_spec)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1,)
    # Vectorized case: 3 CartPole envs
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = RandomAgent(None, env_spec)
    # Bug fix: feed the vectorized env's observations; the original passed
    # env.reset() from the old single Pendulum env by mistake.
    out = agent.choose_action(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
def test_random_policy():
    """RandomPolicy returns a valid 'action' entry for single and vectorized envs."""
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    policy = RandomPolicy(None, env_spec)
    out = policy(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and out['action'].shape == (1, )
    # Vectorized case: 3 CartPole envs
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v0', 3, 0, False)
    env_spec = EnvSpec(venv)
    policy = RandomPolicy(None, env_spec)
    # Bug fix: call the policy on the vectorized env's observations; the
    # original passed env.reset() from the old single Pendulum env by mistake.
    out = policy(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(
        out['action'][0], int)
def test_terminal_state_from_episode():
    """terminal_state_from_episode extracts terminal observations of done episodes."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    # Only BatchEpisode input is accepted
    with pytest.raises(AssertionError):
        terminal_state_from_episode([1, 2, 3])
    D = BatchEpisode(env_spec)
    # Episode 0: terminates, terminal obs stored in info
    D.obs[0] = [0.1, 0.2, 1.3]
    D.done[0] = [False, False, True]
    D.info[0] = [{}, {}, {'terminal_observation': 0.3}]
    # Episode 1: never terminates -> contributes no terminal state
    D.obs[1] = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    D.done[1] = [False] * 9
    # Episode 2: terminates with terminal obs 20
    D.obs[2] = [10, 15]
    D.done[2] = [False, True]
    D.info[2] = [{}, {'terminal_observation': 20}]
    terminal_states = terminal_state_from_episode(D)
    # Two terminated episodes -> two terminal states
    assert terminal_states.shape == (2, ) + env_spec.observation_space.shape
    assert np.allclose(terminal_states[0], 0.3)
    assert np.allclose(terminal_states[1], 20)
    # With no episode terminated, the function returns None
    D.done[0][-1] = False
    D.done[2][-1] = False
    assert terminal_state_from_episode(D) is None
    # Segment variant rejects episode batches
    with pytest.raises(AssertionError):
        terminal_state_from_segment(D)
def test_returns_from_episode():
    """returns_from_episode computes discounted returns, zero-padded to maxT."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)
    D.r[0] = [1, 2, 3]
    D.done[0] = [False, False, True]
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, False, False, False]
    D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8]
    D.done[2] = [False, False, False, False, False, False, False, True]
    # gamma = 1.0: plain cumulative sums from the right
    out = returns_from_episode(D, 1.0)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [6, 5, 3, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [15, 14, 12, 9, 5, 0, 0, 0])
    assert np.allclose(out[2], [36, 35, 33, 30, 26, 21, 15, 8])
    del out
    # gamma = 0.1: heavy discounting, digits of later rewards shift right
    out = returns_from_episode(D, 0.1)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [1.23, 2.3, 3, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [1.2345, 2.345, 3.45, 4.5, 5, 0, 0, 0])
    assert np.allclose(
        out[2], [1.2345678, 2.345678, 3.45678, 4.5678, 5.678, 6.78, 7.8, 8])
    # Segment variant rejects episode batches
    with pytest.raises(AssertionError):
        returns_from_segment(D, 0.1)
def test_segment_runner(env_id):
    """SegmentRunner collects fixed-length segments from a rolling vec env."""
    env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, True)
    env_spec = EnvSpec(env)
    agent = RandomAgent(None, env_spec)
    runner = SegmentRunner(None, agent, env)
    D = runner(4)
    # One segment per env, each exactly T=4 steps long
    assert len(D) == 3
    assert all([isinstance(d, Segment) for d in D])
    assert all([d.T == 4 for d in D])
    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)
    # Long horizon
    D = runner(T=1000)
    for d in D:
        assert d.T == 1000
    # SegmentRunner requires a rolling vec env (last make_vec_env arg True)
    with pytest.raises(AssertionError):
        env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, False)
        SegmentRunner(None, agent, env)
def test_bootstrapped_returns_from_segment():
    """Bootstrapped returns add last-state V only where the segment did not end."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchSegment(env_spec, 5)
    # Row 0: no termination -> fully bootstrapped
    D.r[0] = [1, 2, 3, 4, 5]
    D.done[0] = [False, False, False, False, False]
    # Row 1: termination mid-segment cuts the return at t=2
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, True, False, False]
    # Row 2: termination at start and end
    D.r[2] = [1, 2, 3, 4, 5]
    D.done[2] = [True, False, False, False, True]
    # V-estimates of the state following each segment, shape (3, 1)
    last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1)
    out = bootstrapped_returns_from_segment(D, last_Vs, 1.0)
    assert out.shape == (3, 5)
    assert np.allclose(out[0], [25, 24, 22, 19, 15])
    assert np.allclose(out[1], [6, 5, 3, 29, 25])
    # Row 2 ends with done=True -> last_V (30) is masked out
    assert np.allclose(out[2], [1, 14, 12, 9, 5])
    del out
    out = bootstrapped_returns_from_segment(D, last_Vs, 0.1)
    assert out.shape == (3, 5)
    assert np.allclose(out[0], [1.2346, 2.346, 3.46, 4.6, 6])
    assert np.allclose(out[1], [1.23, 2.3, 3, 4.7, 7])
    assert np.allclose(out[2], [1, 2.345, 3.45, 4.5, 5])
    # Episode variant rejects segment batches
    with pytest.raises(AssertionError):
        bootstrapped_returns_from_episode(D, last_Vs, 0.1)
def test_returns_from_segment():
    """returns_from_segment restarts the discounted sum after each done flag."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchSegment(env_spec, 5)
    # Row 0: one uninterrupted trajectory
    D.r[0] = [1, 2, 3, 4, 5]
    D.done[0] = [False, False, False, False, False]
    # Row 1: done at t=2 splits the segment into two trajectories
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, True, False, False]
    # Row 2: done at t=0 and t=4
    D.r[2] = [1, 2, 3, 4, 5]
    D.done[2] = [True, False, False, False, True]
    out = returns_from_segment(D, 1.0)
    assert out.shape == (3, 5)
    assert np.allclose(out[0], [15, 14, 12, 9, 5])
    assert np.allclose(out[1], [6, 5, 3, 9, 5])
    assert np.allclose(out[2], [1, 14, 12, 9, 5])
    del out
    out = returns_from_segment(D, 0.1)
    assert out.shape == (3, 5)
    assert np.allclose(out[0], [1.2345, 2.345, 3.45, 4.5, 5])
    assert np.allclose(out[1], [1.23, 2.3, 3, 4.5, 5])
    assert np.allclose(out[2], [1, 2.345, 3.45, 4.5, 5])
    # Episode variant rejects segment batches
    with pytest.raises(AssertionError):
        returns_from_episode(D, 0.1)
def test_bootstrapped_returns_from_episode():
    """Bootstrapped episode returns use last_V only for incomplete episodes."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)
    # Episode 0: completed -> its last_V (10) must be ignored
    D.r[0] = [1, 2, 3]
    D.done[0] = [False, False, True]
    D.completed[0] = True
    # Episode 1: truncated -> bootstrapped with last_V = 20
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, False, False, False]
    D.completed[1] = False
    # Episode 2: completed -> its last_V (30) must be ignored
    D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8]
    D.done[2] = [False, False, False, False, False, False, False, True]
    D.completed[2] = True
    last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1)
    out = bootstrapped_returns_from_episode(D, last_Vs, 1.0)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [6, 5, 3, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [35, 34, 32, 29, 25, 0, 0, 0])
    assert np.allclose(out[2], [36, 35, 33, 30, 26, 21, 15, 8])
    del out
    out = bootstrapped_returns_from_episode(D, last_Vs, 0.1)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [1.23, 2.3, 3, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [1.2347, 2.347, 3.47, 4.7, 7, 0, 0, 0])
    assert np.allclose(
        out[2], [1.2345678, 2.345678, 3.45678, 4.5678, 5.678, 6.78, 7.8, 8])
    # Segment variant rejects episode batches
    with pytest.raises(AssertionError):
        bootstrapped_returns_from_segment(D, last_Vs, 0.1)
def test_final_state_from_segment():
    """final_state_from_segment returns each env's final state; terminal obs is
    used only when done occurs at the very last time step of the segment."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    # Only BatchSegment input is accepted
    with pytest.raises(AssertionError):
        final_state_from_segment([1, 2, 3])
    D = BatchSegment(env_spec, 4)
    D.obs = np.random.randn(*D.obs.shape)
    D.done.fill(False)
    # Row 0: done at the last step -> final state is the terminal observation
    D.done[0, -1] = True
    D.info[0] = [{}, {}, {}, {'terminal_observation': [0.1, 0.2, 0.3, 0.4]}]
    # Row 1: done mid-segment -> final state is still the last stored obs
    D.done[1, 2] = True
    D.info[1] = [{}, {}, {'terminal_observation': [1, 2, 3, 4]}, {}]
    # Row 2: done at the last step
    D.done[2, -1] = True
    D.info[2] = [{}, {}, {}, {'terminal_observation': [10, 20, 30, 40]}]
    final_states = final_state_from_segment(D)
    assert final_states.shape == (3, ) + env_spec.observation_space.shape
    assert np.allclose(final_states[0], [0.1, 0.2, 0.3, 0.4])
    assert np.allclose(final_states[1], D.numpy_observations[1, -1, ...])
    # Mid-segment terminal obs must NOT leak into the final state
    assert not np.allclose(final_states[1], [1, 2, 3, 4])
    assert np.allclose(final_states[2], [10, 20, 30, 40])
    # Episode variant rejects segment batches
    with pytest.raises(AssertionError):
        final_state_from_episode(D)
def __init__(self, config, agent, env):
    """Initialize the runner and derive the env specification.

    The two buffers hold rolling state between successive calls:
    observations to resume from, and done flags used for masking.
    """
    super().__init__(config, agent, env)
    self.env_spec = EnvSpec(self.env)
    self.obs_buffer = self.done_buffer = None
def test_sticky_agent():
    """StickyAgent always emits the configured sticky action."""
    sticky_action = 0
    env = make_gym_env('CartPole-v1', 0)
    env_spec = EnvSpec(env)
    agent = StickyAgent(None, env_spec, sticky_action)
    out = agent.choose_action(env.reset())
    assert isinstance(out, dict)
    assert 'action' in out and isinstance(out['action'], int)
    assert out['action'] == sticky_action
    # Vectorized case: one sticky action per sub-env
    venv = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(venv)
    agent = StickyAgent(None, env_spec, sticky_action)
    # Bug fix: feed the vectorized env's observations; the original passed
    # env.reset() from the single env created above by mistake.
    out = agent.choose_action(venv.reset())
    assert isinstance(out, dict)
    assert 'action' in out and len(out['action']) == 3 and isinstance(out['action'][0], int)
    assert np.allclose(out['action'], [0, 0, 0])
def test_categorical_head():
    """CategoricalHead demands a discrete action space and yields Categorical."""
    # Continuous control (Pendulum) must be rejected at construction time
    with pytest.raises(AssertionError):
        env = make_gym_env('Pendulum-v0', 0)
        env_spec = EnvSpec(env)
        CategoricalHead(None, None, 30, env_spec)
    # Discrete control (CartPole, 2 actions)
    spec = EnvSpec(make_gym_env('CartPole-v1', 0))
    head = CategoricalHead(None, None, 30, spec)
    assert head.feature_dim == 30
    layer = head.action_head
    assert isinstance(layer, nn.Linear)
    assert layer.in_features == 30 and layer.out_features == 2
    # Forward pass on a batch of 3 feature vectors
    distribution = head(torch.randn(3, 30))
    assert isinstance(distribution, Categorical)
    assert list(distribution.batch_shape) == [3]
    assert list(distribution.probs.shape) == [3, 2]
    sampled = distribution.sample()
    assert sampled.shape == (3, )
def check(agent_name, env_name):
    """Exercise SegmentRunner with a given agent over 2 serial envs."""
    # Create environment
    list_make_env = make_envs(make_env=make_gym_env, env_id=env_name, num_env=2, init_seed=0)
    env = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(env)
    assert env.num_env == 2
    # Create agent
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')
    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=1.0)
    # Small batch
    D = runner(T=3, reset=False)
    assert len(D) == 2
    assert all([isinstance(d, Segment) for d in D])
    assert all([d.T == 3 for d in D])
    assert all([d.gamma == 1.0 for d in D])
    # Check additional information (non-random agents record log-probs)
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info
    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)
    # Take one more step, test rolling effect, i.e. first state should be same as last state in previous D
    D2 = runner(T=1, reset=False)
    assert len(D2) == 2
    assert all([d.T == 1 for d in D2])
    for d, d2 in zip(D, D2):
        assert np.allclose(d2.all_s[0][0], d.transitions[-1].s_next)
    # Long horizon
    D = runner(T=200, reset=True)
    # Segment with identical time steps
    assert all([d.T == 200 for d in D])
    # For CartPole, 200 time steps, should be somewhere done=True
    if env_name == 'CartPole-v1':
        assert any([True in d.all_done for d in D])
        assert all([len(d.trajectories) > 1 for d in D])
def make_env_spec(self):
    """Return an EnvSpec over 3 serial CartPole-v1 envs with rolling enabled."""
    env_fns = make_envs(make_env=make_gym_env, env_id='CartPole-v1', num_env=3, init_seed=0)
    return EnvSpec(SerialVecEnv(list_make_env=env_fns, rolling=True))
def make_env_spec(self):
    """Return an EnvSpec over a single serial Pendulum-v0 env."""
    env_fns = make_envs(make_env=make_gym_env, env_id='Pendulum-v0', num_env=1, init_seed=0)
    return EnvSpec(SerialVecEnv(list_make_env=env_fns))
def check(agent_name, env_name):
    """Exercise TrajectoryRunner with a given agent over a single serial env."""
    # Create environment
    list_make_env = make_envs(make_env=make_gym_env, env_id=env_name, num_env=1, init_seed=0)
    env = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(env)
    # Create agent
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')
    # Test: not allowed more than one environment for TrajectoryRunner
    with pytest.raises(AssertionError):
        list_make_env2 = make_envs(make_env=make_gym_env, env_id=env_name, num_env=2, init_seed=0)
        env2 = SerialVecEnv(list_make_env=list_make_env2)
        runner2 = TrajectoryRunner(agent=agent, env=env2, gamma=1.0)
    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)
    # Small batch
    D = runner(N=3, T=4)
    assert len(D) == 3
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([d.T == 4 for d in D])
    assert all([d.gamma == 1.0 for d in D])
    # Check additional information (non-random agents record log-probs)
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info
    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)
    # Long horizon: trajectories shorter than T must have terminated
    D = runner(N=3, T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1] == True
def test_diag_gaussian_head():
    """DiagGaussianHead builds an Independent Normal over continuous actions,
    with several styles of (log-)std parameterization."""
    # Discrete control (CartPole) must be rejected
    with pytest.raises(AssertionError):
        env = make_gym_env('CartPole-v1', 0)
        env_spec = EnvSpec(env)
        DiagGaussianHead(None, None, 30, env_spec)
    env = make_gym_env('Pendulum-v0', 0)
    env_spec = EnvSpec(env)
    head = DiagGaussianHead(None, None, 30, env_spec)
    assert head.feature_dim == 30
    assert isinstance(head.mean_head, nn.Linear)
    assert isinstance(head.logstd_head, nn.Parameter)
    # Pendulum has a 1-D action space
    assert head.mean_head.in_features == 30 and head.mean_head.out_features == 1
    assert list(head.logstd_head.shape) == [1]
    # Default initial log-std; value presumably corresponds to std ~ 0.6
    assert torch.eq(head.logstd_head, torch.tensor(-0.510825624))
    dist = head(torch.randn(3, 30))
    assert isinstance(dist, Independent) and isinstance(dist.base_dist, Normal)
    assert list(dist.batch_shape) == [3]
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    # softplus-style std parameterization
    head = DiagGaussianHead(None, None, 30, env_spec, std_style='softplus')
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    assert torch.eq(head.logstd_head, torch.tensor(-0.19587036834631966))
    # sigmoidal-style std parameterization
    head = DiagGaussianHead(None, None, 30, env_spec, std_style='sigmoidal')
    assert torch.eq(head.logstd_head, torch.tensor(-0.871222446472449))
    # State-dependent std
    head = DiagGaussianHead(None, None, 30, env_spec, std_state_dependent=True)
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    # Constant (non-trainable) std
    head = DiagGaussianHead(None, None, 30, env_spec, constant_std=0.3)
    dist = head(torch.randn(3, 30))
    action = dist.sample()
    assert list(action.shape) == [3, 1]
    assert not head.logstd_head.requires_grad
    assert torch.eq(head.logstd_head, torch.tensor([-1.2039728]))
def __call__(self, config):
    """Run one actor-critic training experiment for the given config dict.

    Builds env, policy, optimizer and engine, trains, and saves returns
    to 'logs/returns_ActorCritic'. Always returns None.
    """
    # Set random seeds: PyTorch, numpy.random, random
    set_global_seeds(seed=config['seed'])
    # Create environment and seed it
    env = make_env(seed=config['seed'], monitor=False, monitor_dir=None)
    # Create environment specification
    env_spec = EnvSpec(env)  # TODO: integrate within make_env globally
    # Create device
    device = torch.device('cuda' if config['cuda'] else 'cpu')
    # Create logger
    logger = Logger(name='logger')
    # Create policy
    network = MLP(config=config)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)
    policy.network = policy.network.to(device)
    # Create optimizer
    optimizer = optim.Adam(policy.network.parameters(), lr=config['lr'])
    # Learning rate scheduler
    max_epoch = config['train_iter']  # Max number of lr decay, Note where lr_scheduler put
    lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
    # Create agent (alternative: REINFORCEAgent)
    agent_class = ActorCriticAgent
    agent = agent_class(policy=policy, optimizer=optimizer, config=config, lr_scheduler=lr_scheduler, device=device)
    # Create runner
    runner = Runner(agent=agent, env=env, gamma=config['gamma'])
    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, logger=logger)
    # Training
    train_output = engine.train()
    np.save('logs/returns_ActorCritic', train_output)
    return None
def test_constraint_action():
    """constraint_action clips actions into Pendulum's valid [-2, 2] range."""
    spec = EnvSpec(make_gym_env('Pendulum-v0', 0))
    # (raw action, expected clipped action)
    for raw, expected in [(1.5, 1.5), (3.0, 2.0), (-10.0, -2.0)]:
        result = constraint_action(spec, torch.tensor([raw]))
        assert torch.eq(result, torch.tensor([expected]))
def test_env_spec(self):
    """EnvSpec reports CartPole-v1's spaces and episode metadata correctly."""
    wrapped = GymWrapper(gym.make('CartPole-v1'))
    wrapped.seed(0)
    spec = EnvSpec(wrapped)
    # Spaces
    assert isinstance(spec.observation_space, Box)
    assert isinstance(spec.action_space, Discrete)
    assert spec.control_type == 'Discrete'
    # Episode metadata: horizon 500, reward threshold 475
    assert spec.T == 500
    assert spec.max_episode_reward == 475.0
    assert spec.reward_range == (-float('inf'), float('inf'))
def test_gae_from_episode():
    """gae_from_episode computes GAE(gamma, lambda) advantages per episode."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)
    # Episode 0: completed after 3 steps
    D.r[0] = [1, 2, 3]
    D.done[0] = [False, False, True]
    D.completed[0] = True
    # Episode 1: truncated after 5 steps -> bootstrapped with last_V
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, False, False, False]
    D.completed[1] = False
    # Episode 2: completed after 8 steps
    D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8]
    D.done[2] = [False, False, False, False, False, False, False, True]
    D.completed[2] = True
    # Per-step V estimates, one (3, 1) tensor per time step up to maxT
    all_Vs = [
        torch.tensor([[0.1], [0.5], [1.0]]),
        torch.tensor([[1.1], [1.5], [2.0]]),
        torch.tensor([[2.1], [2.5], [3.0]]),
        torch.tensor([[3.1], [3.5], [4.0]]),
        torch.tensor([[4.1], [4.5], [5.0]]),
        torch.tensor([[5.1], [5.5], [6.0]]),
        torch.tensor([[6.1], [6.5], [7.0]]),
        torch.tensor([[7.1], [7.5], [8.0]])
    ]
    # V estimates of the state after each episode's last step, shape (3, 1)
    last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1)
    # Stack to shape (3, maxT, 1)
    all_Vs = torch.stack(all_Vs, 1)
    out = gae_from_episode(D, all_Vs, last_Vs, 1.0, 0.5)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [3.725, 3.45, 0.9, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [6.40625, 8.8125, 11.625, 15.25, 20.5, 0, 0, 0])
    assert np.allclose(out[2], [5.84375, 7.6875, 9.375, 10.75, 11.5, 11., 8, 0.])
    del out
    out = gae_from_episode(D, all_Vs, last_Vs, 0.1, 0.2)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [1.03256, 1.128, 0.9, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [0.665348, 0.7674, 0.87, 1, 2.5, 0, 0, 0])
    assert np.allclose(out[2], [
        0.206164098, 0.308204915, 0.410245728, 0.5122864, 0.61432, 0.716, 0.8, 0
    ])
    # Segment variant rejects episode batches
    with pytest.raises(AssertionError):
        gae_from_segment(D, all_Vs, last_Vs, 0.1, 0.2)
def test_episode_runner(vec_env, env_id):
    """EpisodeRunner's batch must exactly replay independently-seeded envs
    driven by the same sticky action, including padding after termination."""
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    # Pick an action valid for the env's action space
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)
    assert D.N == 3
    assert D.maxT == max(D.Ts)
    # Rebuild the three sub-envs with the same seeds the runner used
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        # Step-by-step replay against the recorded batch
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            # mask = 1 - done
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                # Everything after termination is zero-padded / done-padded
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
def test_td0_error_from_episode():
    """td0_error_from_episode computes r + gamma*V(s') - V(s) per step."""
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0)
    env_spec = EnvSpec(env)
    D = BatchEpisode(env_spec)
    # Episode 0: completed after 3 steps
    D.r[0] = [1, 2, 3]
    D.done[0] = [False, False, True]
    D.completed[0] = True
    # Episode 1: truncated -> last step bootstrapped with last_V
    D.r[1] = [1, 2, 3, 4, 5]
    D.done[1] = [False, False, False, False, False]
    D.completed[1] = False
    # Episode 2: completed after 8 steps
    D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8]
    D.done[2] = [False, False, False, False, False, False, False, True]
    D.completed[2] = True
    # Per-step V estimates, one (3, 1) tensor per time step up to maxT
    all_Vs = [
        torch.tensor([[0.1], [0.5], [1.0]]),
        torch.tensor([[1.1], [1.5], [2.0]]),
        torch.tensor([[2.1], [2.5], [3.0]]),
        torch.tensor([[3.1], [3.5], [4.0]]),
        torch.tensor([[4.1], [4.5], [5.0]]),
        torch.tensor([[5.1], [5.5], [6.0]]),
        torch.tensor([[6.1], [6.5], [7.0]]),
        torch.tensor([[7.1], [7.5], [8.0]])
    ]
    # V estimates of the state after each episode's last step, shape (3, 1)
    last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1)
    # Stack to shape (3, maxT, 1)
    all_Vs = torch.stack(all_Vs, 1)
    out = td0_error_from_episode(D, all_Vs, last_Vs, 1.0)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [2.0, 3, 0.9, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [2, 3, 4, 5, 20.5, 0, 0, 0])
    assert np.allclose(out[2], [2, 3, 4, 5, 6, 7, 8, 0])
    del out
    out = td0_error_from_episode(D, all_Vs, last_Vs, 0.1)
    assert out.shape == (3, D.maxT)
    assert np.allclose(out[0], [1.01, 1.11, 0.9, 0, 0, 0, 0, 0])
    assert np.allclose(out[1], [0.65, 0.75, 0.85, 0.95, 2.5, 0, 0, 0])
    assert np.allclose(out[2], [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0])
    # Segment variant rejects episode batches
    with pytest.raises(AssertionError):
        td0_error_from_segment(D, all_Vs, last_Vs, 0.1)
def init(self, seed, config):
    """Build the env, network, policy and agent for one worker.

    Note: policy construction is dispatched on control type; control types
    other than 'Discrete'/'Continuous' leave self.policy unset.
    """
    # Make environment
    # Remember to seed it in each working function !
    self.env = make_vec_env(vec_env_class=SerialVecEnv, make_env=make_gym_env, env_id=config['env.id'], num_env=1, init_seed=seed)
    self.env_spec = EnvSpec(self.env)
    # Make agent
    self.network = Network(config=config, env_spec=self.env_spec)
    if self.env_spec.control_type == 'Discrete':
        self.policy = CategoricalPolicy(config=config, network=self.network, env_spec=self.env_spec)
    elif self.env_spec.control_type == 'Continuous':
        self.policy = GaussianPolicy(config=config, network=self.network, env_spec=self.env_spec)
    self.agent = Agent(policy=self.policy, config=config)
def _prepare(self, config):
    """Construct env (with action clipping and optional obs standardization),
    env spec, device and agent from the config."""
    self.device = torch.device('cpu')
    venv = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'], config['train.N'], 0)
    venv = VecClipAction(venv)
    if config['env.standardize']:
        # Standardize observations only; rewards pass through unchanged
        venv = VecStandardize(venv,
                              use_obs=True,
                              use_reward=False,
                              clip_obs=10.0,
                              clip_reward=10.0,
                              gamma=0.99,
                              eps=1e-08)
    self.env = venv
    self.env_spec = EnvSpec(self.env)
    self.agent = Agent(config, self.env_spec, self.device)
def test_vec_monitor(env_id, vec_env_class, num_env, seed, runner_class):
    """VecMonitor attaches an 'episode' summary to the info dict at episode end."""
    env = make_vec_env(vec_env_class, make_gym_env, env_id, num_env, seed)
    env_spec = EnvSpec(env)
    env = VecMonitor(env)
    agent = RandomAgent(None, env_spec)
    runner = runner_class(None, agent, env)
    # Run long enough to guarantee episode terminations
    D = runner(1050)
    for infos in D.infos:
        for info in infos:
            # Episode stats only appear where an episode terminated
            if 'terminal_observation' in info:
                assert 'episode' in info
                assert 'return' in info['episode'] and isinstance(
                    info['episode']['return'], np.float32)
                assert 'horizon' in info['episode'] and isinstance(
                    info['episode']['horizon'], np.int32)
                assert 'time' in info['episode'] and isinstance(
                    info['episode']['time'], float)
def test_rolling_segment_runner(vec_env, env_id):
    """RollingSegmentRunner's batch must exactly replay independently-seeded
    envs, with auto-reset: after done, the stored obs is the reset obs and
    the terminal obs is recorded in info."""
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)
    # Pick an action valid for the env's action space
    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]
    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = RollingSegmentRunner(None, agent, env)
    D = runner(T)
    assert D.N == 3
    assert D.T == T
    # Rebuild the three sub-envs with the same seeds the runner used
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            if done:
                # Mimic auto-reset: keep terminal obs aside, continue from reset
                info['terminal_observation'] = obs
                obs = ev.reset()
            assert np.allclose(obs, D.numpy_observations[n, t + 1, ...])
            assert np.allclose(sticky_action, D.numpy_actions[n, t, ...])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(info['terminal_observation'], D.infos[n][t]['terminal_observation'])
def test_vec_env(self, vec_env_class):
    """A vec env exposes spaces, metadata, and 5 distinctly-seeded sub-envs."""
    # unpack class
    v_id, vec_env_class = vec_env_class
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    assert isinstance(venv, VecEnv)
    assert v_id in [0, 1]
    if v_id == 0:
        # Bug fix: the isinstance result was previously computed but never
        # asserted, so the serial branch checked nothing.
        assert isinstance(venv, SerialVecEnv)
    elif v_id == 1:
        assert isinstance(venv, ParallelVecEnv)
    assert venv.num_env == 5
    assert not venv.closed and venv.viewer is None
    assert venv.unwrapped is venv
    assert isinstance(venv.observation_space, Box)
    assert isinstance(venv.action_space, Discrete)
    # CartPole-v1 metadata
    assert venv.T == 500
    assert venv.max_episode_reward == 475.0
    assert venv.reward_range == (-float('inf'), float('inf'))
    obs = venv.reset()
    assert len(obs) == 5
    assert np.asarray(obs).shape == (5, 4)
    # Different seeds -> different initial observations
    assert all([not np.allclose(obs[0], obs[i]) for i in [1, 2, 3, 4]])
    a = [1] * 5
    obs, rewards, dones, infos = venv.step(a)
    assert all([len(item) == 5 for item in [obs, rewards, dones, infos]])
    assert all([not np.allclose(obs[0], obs[i]) for i in [1, 2, 3, 4]])
    # EnvSpec
    env_spec = EnvSpec(venv)
    assert isinstance(env_spec.action_space, Discrete)
    assert isinstance(env_spec.observation_space, Box)
    assert env_spec.control_type == 'Discrete'
    assert env_spec.T == 500
    assert env_spec.max_episode_reward == 475.0
    assert env_spec.reward_range == (-float('inf'), float('inf'))
    assert env_spec.is_vec_env
    venv.close()
    assert venv.closed