import gym
import numpy as np
import torch
import visdom
from pytest import approx
from torch import Tensor

# NOTE: the import below assumes the agent class lives in a module named
# ppo_agent; adjust the path to wherever PPOAgent is actually defined.
from ppo_agent import PPOAgent


def test_save_multiple_trajectories():
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=10)
    assert len(agent._data_buffer) == 0
    for i in range(3 * 10):
        agent._add_data_into_buffer(observation=[0], reward=0, action=[0], is_done=False)
    assert len(agent._data_buffer) == 3
    assert len(agent._data_buffer[0]) == 10

def test_add_sample_into_trajectory():
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=100)
    assert len(agent._data_buffer) == 0
    agent._add_data_into_buffer(observation=[0], reward=1, action=[2], is_done=False)
    assert len(agent._trajectory) == 1
    assert agent._trajectory[0].observation == [0]
    assert agent._trajectory[0].reward == 1
    assert agent._trajectory[0].action == [2]
    assert agent._trajectory[0].is_done is False

def test_agent_step():
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=10,
                     dim_observation=2, dim_action=2)
    action = agent.step(observation=[5.2, 3.1], reward=1, is_done=False)
    assert action.size()[0] == 2

def test_vf_update():
    len_traj = 5
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=len_traj,
                     dim_observation=2, lr_value=0.05, iter_op_vf=100)
    # Data for value estimation: with discount 1.0 and a single reward of 1 at
    # the final step, every non-terminal state should be valued close to one
    # and the terminal state close to zero.
    for i in range(3):
        for t in range(len_traj):
            r = 1 if t == len_traj - 1 else 0
            is_done = t == len_traj - 1
            agent._add_data_into_buffer(
                observation=[t / float(len_traj), 1 - t / float(len_traj)],
                reward=r,
                action=[0, 0],
                is_done=is_done,
            )
    advantage, reward_to_go = agent._get_advantage()
    agent._update_vf(reward_to_go)
    for i in range(3):
        for t in range(len_traj):
            val = agent._evaluate_vf(
                observation=Tensor([t / float(len_traj), 1. - t / float(len_traj)])
            ).detach()
            if t == len_traj - 1:
                assert 0 == approx(val, abs=0.2)
            else:
                assert 1 == approx(val, abs=0.2)
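
# test_vf_update regresses the value network onto the reward-to-go targets
# returned by _get_advantage.  In these tests the reward stored at index t is
# the reward received upon *entering* observation t, so the target for state t
# sums the rewards strictly after step t (see test_get_advantage below, where
# ten rewards of 1 yield a target of 9 - t plus a bootstrap value).  The helper
# below is only a hypothetical sketch of that quantity for a single finished
# trajectory; it is not taken from PPOAgent itself.
def _reward_to_go_sketch(rewards, gamma):
    rtg = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        rtg[t] = running                      # rewards strictly after step t
        running = rewards[t] + gamma * running
    return rtg


# With gamma = 1.0 and a single reward of 1 at the final step, every
# non-terminal target is 1 and the terminal target is 0, matching the values
# asserted in test_vf_update above.
assert _reward_to_go_sketch([0, 0, 0, 0, 1], gamma=1.0) == [1.0, 1.0, 1.0, 1.0, 0.0]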

def test_save_multiple_trajectories_with_done():
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=10)
    assert len(agent._data_buffer) == 0
    for i in range(3):
        for j in range(3 * (i + 1)):
            is_done = j == 3 * (i + 1) - 1
            agent._add_data_into_buffer(observation=[0], reward=0, action=[0], is_done=is_done)
        assert len(agent._data_buffer) == i + 1
    assert len(agent._data_buffer[0]) == 3
    assert len(agent._data_buffer[1]) == 6
    assert len(agent._data_buffer[2]) == 9
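
# The buffer tests above pin down how samples flow from the per-episode
# trajectory into the data buffer: samples accumulate in `_trajectory`, and the
# trajectory is flushed into `_data_buffer` once `is_done` is True or
# `max_time_steps` samples have been collected.  The class below is a minimal,
# hypothetical sketch of that bookkeeping written only to document the expected
# behaviour; it is not the actual PPOAgent implementation.
from collections import namedtuple

Sample = namedtuple("Sample", ["observation", "reward", "action", "is_done"])


class _BufferSketch:
    def __init__(self, max_time_steps):
        self.max_time_steps = max_time_steps
        self._trajectory = []
        self._data_buffer = []

    def _add_data_into_buffer(self, observation, reward, action, is_done):
        self._trajectory.append(Sample(observation, reward, action, is_done))
        if is_done or len(self._trajectory) == self.max_time_steps:
            # Flush the finished (or full) trajectory into the buffer.
            self._data_buffer.append(self._trajectory)
            self._trajectory = []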

def test_reward_normalizer():
    agent = PPOAgent(reward_discount=0.9, n_trajectory=3, max_time_steps=10, dim_observation=2)
    data = [np.random.randn(1) for i in range(5)]
    for x in data:
        agent.step(observation=[0, 1], reward=x, is_done=False, is_test=True)
    assert 10. / np.std(data) == approx(agent._normalize_reward(10.), abs=0.01)
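
# test_reward_normalizer expects rewards to be divided by the standard
# deviation of the rewards observed so far (note: no mean subtraction, so the
# sign of the reward is preserved).  The function below is a hypothetical
# illustration of that contract using plain numpy over the full reward history;
# PPOAgent presumably keeps running statistics instead, so treat this only as
# a sketch of what the test checks.
def _normalize_reward_sketch(reward, seen_rewards):
    return reward / np.std(seen_rewards)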

def test_policy_update():
    agent = PPOAgent(reward_discount=0.0, n_trajectory=3, max_time_steps=10,
                     dim_observation=2, lr_value=0.1, iter_op_vf=3,
                     lr_policy=0.1, iter_op_policy=3)
    # Feed three short trajectories; a reward of 1 arrives only at the final step.
    for i in range(3):
        for t in range(10):
            r = 1 if t == 9 else 0
            is_done = t == 9
            agent.step(
                observation=[t / 9., 1 - (t / 9.)],
                reward=r,
                is_done=is_done,
                is_test=True,
            )
    # Smoke test: only check that the policy update runs without error.
    advantage, reward_to_go = agent._get_advantage()
    agent._policy_update(advantage)
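
# test_policy_update is only a smoke test.  For reference, a PPO policy step
# typically maximizes the clipped surrogate objective; the function below is a
# generic, self-contained sketch of that loss (per-sample log probabilities and
# advantages as tensors), not PPOAgent's actual code.  The clip range
# corresponds to the eps_policy_clip argument used in the training script
# further down.
def _ppo_clip_loss_sketch(log_prob_new, log_prob_old, advantage, clip_eps=0.2):
    ratio = torch.exp(log_prob_new - log_prob_old)
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # Negated because optimizers minimize; PPO maximizes the surrogate.
    return -torch.min(unclipped, clipped).mean()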

def test_observation_normalizer():
    agent = PPOAgent(reward_discount=0.9, n_trajectory=3, max_time_steps=10, dim_observation=2)
    data = 1 + np.random.randn(1000, 2)
    mean = np.mean(data, 0).astype(np.float32)
    std = np.std(data, 0).astype(np.float32)
    for x in data:
        agent.step(observation=x.tolist(), reward=1, is_done=False, is_test=True)
    obs = agent._normalize_observation(Tensor([0, 0])).float()
    expected = (Tensor([0, 0]).float() - mean) / std
    assert torch.allclose(obs, expected, rtol=0.2, atol=0.2)
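
# test_observation_normalizer expects observations to be standardized with the
# mean and standard deviation of everything the agent has seen so far.  The
# class below is a hypothetical running-statistics sketch (Welford's update)
# that would satisfy the same check without storing every observation;
# PPOAgent's internal bookkeeping may be organised differently.
class _RunningNormSketch:
    def __init__(self, dim):
        self.n = 0
        self.mean = np.zeros(dim)
        self.m2 = np.zeros(dim)

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, x):
        std = np.sqrt(self.m2 / max(self.n, 1))
        return (np.asarray(x) - self.mean) / np.maximum(std, 1e-8)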

def test_get_advantage():
    agent = PPOAgent(reward_discount=1.0, n_trajectory=3, max_time_steps=10, dim_observation=2)
    for i in range(3):
        for j in range(10):
            obs = [0, 0]
            agent._add_data_into_buffer(observation=obs, reward=1, action=[0], is_done=False)
    advantage, reward_to_go = agent._get_advantage()
    assert reward_to_go.size() == (3, 10)
    for n, traj in enumerate(agent._data_buffer):
        for t, expr in enumerate(traj):
            # The trajectory is cut off (never done), so the reward-to-go is
            # bootstrapped with the value estimate of the final observation.
            obs_final = Tensor(traj[-1].observation)
            value_t = agent._evaluate_vf(obs_final).detach()[0]
            assert reward_to_go[n, t].tolist() == approx((9 + value_t - t).tolist())
            obs = Tensor(expr.observation)
            assert advantage[n, t].tolist() == approx(
                (9 - t + value_t - agent._evaluate_vf(obs)).tolist()
            )
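
# test_get_advantage ties the two quantities together: the reward-to-go target
# (bootstrapped with the value of the last observation when the trajectory is
# cut off) and the advantage, which subtracts the value baseline V(s_t).  The
# helper below is an illustrative sketch of that relation under the same
# "rewards after step t" convention noted earlier; `values` holds V(s_t) per
# step and `value_last` is the bootstrap value.  It is not PPOAgent's code.
def _advantage_sketch(rewards, values, value_last, gamma):
    rtg, running = [0.0] * len(rewards), float(value_last)
    for t in reversed(range(len(rewards))):
        rtg[t] = running
        running = rewards[t] + gamma * running
    adv = [r - v for r, v in zip(rtg, values)]
    return adv, rtg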

vis = visdom.Visdom()

env_name = "BipedalWalker-v3"
# env_name = 'Pendulum-v0'
# env_name = "MountainCarContinuous-v0"
env = gym.make(env_name)
print(f"dim_obs: {len(env.observation_space.high)}")
print(f"dim_action: {len(env.action_space.high)}")

agent = PPOAgent(reward_discount=0.95,
                 n_trajectory=20,
                 max_time_steps=50,
                 dim_observation=len(env.observation_space.high),
                 dim_action=len(env.action_space.high),
                 lr_value=0.001,
                 iter_op_vf=3,
                 iter_op_policy=3,
                 lr_policy=0.001,
                 eps_policy_clip=0.2)

if __name__ == '__main__':
    episode = 0
    rew_sum_list = []
    while True:
        t = 0
        reward = None
        done = False
        observation = env.reset()
        rew_sum = 0
        while True: