def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
    """
    Tests whether the multi-GPU strategy can learn successfully on a multi-GPU system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_ppo_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Assume we have learned something.
    # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
    self.assertGreater(results["mean_episode_reward"], -1.0)

def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole Env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    #self.assertGreaterEqual(results["mean_episode_reward"], 23)
    #self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)

def test_ppo_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = PPOAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("Results =", results)

def test_ppo_on_2x2_grid_world(self):
    """
    Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
    """
    env = GridWorld(world="2x2")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
        state_space=GridWorld.grid_world_2x2_flattened_state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=15),
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print(results)

    # Assume we have learned something.
    self.assertGreater(results["mean_episode_reward"], -0.2)

def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
            "episode return {}; steps={}".format(episode_return, timesteps)
        )
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], 40.0)

def test_post_processing(self):
    """
    Tests external batch post-processing for the PPO agent.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    num_samples = 200
    states = agent.preprocessed_state_space.sample(num_samples)
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    sequence_indices_space = BoolBox(add_batch_rank=True)

    # GAE is separately tested, just testing if this API method returns results.
    pg_advantages = agent.post_process(dict(
        states=states,
        rewards=reward_space.sample(num_samples),
        terminals=terminal_space.sample(num_samples, fill_value=0),
        sequence_indices=sequence_indices_space.sample(num_samples, fill_value=0)
    ))

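# For reference, the following is a minimal, self-contained NumPy sketch of generalized
# advantage estimation (GAE), the quantity the post_process() call above is expected to
# compute. It is NOT RLgraph's implementation: the helper name, its signature, the default
# gamma/lambda values, and the omission of terminal/sequence handling are assumptions made
# purely for illustration.
def _gae_advantages_sketch(rewards, values, bootstrap_value, gamma=0.99, gae_lambda=0.95):
    """
    Computes A_t = sum_l (gamma * lambda)^l * delta_{t+l}, where
    delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    Ignores terminals/sequence indices for simplicity (single, unbroken trajectory).
    """
    import numpy as np
    # Append the bootstrap value so V(s_{t+1}) exists for the last step.
    values_ext = np.append(values, bootstrap_value)
    # One-step TD errors.
    deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]
    advantages = np.zeros(len(deltas))
    running = 0.0
    # Accumulate the discounted, lambda-weighted sum of TD errors backwards in time.
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * gae_lambda * running
        advantages[t] = running
    return advantages
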
def test_value_function_weights(self):
    """
    Tests changing of value function weights.
    """
    env = OpenAIGymEnv("Pong-v0")
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    weights = agent.get_weights()
    assert "value_function_weights" in weights
    assert "policy_weights" in weights

    policy_weights = weights["policy_weights"]
    value_function_weights = weights["value_function_weights"]

    # Just change vf weights.
    for key, weight in value_function_weights.items():
        value_function_weights[key] = weight + 0.01
    agent.set_weights(policy_weights, value_function_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(
        new_actual_weights["value_function_weights"],
        value_function_weights
    )

def test_ppo_compilation(self):
    """
    Tests PPO agent compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    print("Compiled {}".format(agent))

def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_episodes(500, use_exploration=True)
    print(results)

def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows,
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
            "episode return {}; steps={}".format(episode_return, timesteps)
        )
    )
    results = worker.execute_episodes(5000, use_exploration=True)
    print(results)

def test_ppo_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
    """
    # -----
    # |^|H|
    # -----
    # | |G|    ^=start, looking up
    # -----
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = PPOAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space
    )

    time_steps = 5000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], -2.0)

def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(self):
    """
    Tests stability of PPO on an extreme env producing strange container states and large rewards
    and requiring container actions.
    """
    env = RandomEnv(
        state_space=Dict({
            "F_position": FloatBox(shape=(2,), low=0.01, high=0.02)
        }),
        action_space=Dict({
            "F_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "F_forward_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "B_jump": BoolBox()
        }),
        reward_space=FloatBox(low=-1000.0, high=-100000.0),  # hugely negative rewards
        terminal_prob=0.0000001
    )
    agent_config = config_from_path("configs/ppo_agent_for_random_env_with_container_spaces.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        preprocessing_spec=None,
        worker_executes_preprocessing=True,
        #episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
        #print("episode return {}; steps={}".format(episode_return, timesteps))
    )
    results = worker.execute_timesteps(num_timesteps=int(1e6), use_exploration=True)
    print(results)