def test_batched_backend_equivalence(self):
    """
    Tests if the Python and TensorFlow backends return the same output
    for a standard DQN-style preprocessing stack.
    """
    # NOTE: Early return - the test body below is currently disabled.
    return
    env_spec = dict(
        type="openai",
        gym_env="Pong-v0",
        frameskip=4,
        max_num_noops=30,
        episodic_life=True
    )
    # Test with batching because we assume vector environments to be the normal case going forward.
    env = SequentialVectorEnv(num_envs=4, env_spec=env_spec, num_background_envs=2)
    in_space = env.state_space

    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    preprocessing_spec = deepcopy(agent_config["preprocessing_spec"])

    # Set up the Python preprocessor.
    scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
    # Set backend to python.
    for spec in preprocessing_spec:
        spec["backend"] = "python"
    python_processor = PreprocessorStack(*preprocessing_spec, backend="python")
    for sub_comp_scope in scopes:
        python_processor.sub_components[sub_comp_scope].create_variables(dict(preprocessing_inputs=in_space))
    python_processor.reset()

    # To cover the use case we considered so far, use the agent interface for the TF backend.
    agent_config.pop("type")
    agent = ApexAgent(state_space=env.state_space, action_space=env.action_space, **agent_config)

    # Generate a few states from random set points. Test if preprocessed states are almost equal.
    states = np.asarray(env.reset_all())
    actions, agent_preprocessed_states = agent.get_action(
        states=states, use_exploration=False, extra_returns="preprocessed_states")
    print("TensorFlow preprocessed shape: {}".format(np.asarray(agent_preprocessed_states).shape))
    python_preprocessed_states = python_processor.preprocess(states)
    print("Python preprocessed shape: {}".format(np.asarray(python_preprocessed_states).shape))
    print("Asserting (almost) equal values:")
    for tf_state, python_state in zip(agent_preprocessed_states, python_preprocessed_states):
        flat_tf = np.ndarray.flatten(tf_state)
        flat_python = np.ndarray.flatten(python_state)
        for x, y in zip(flat_tf, flat_python):
            recursive_assert_almost_equal(x, y, decimals=3)

    states, _, _, _ = env.step(actions)
    actions, agent_preprocessed_states = agent.get_action(
        states=states, use_exploration=False, extra_returns="preprocessed_states")
    print("TensorFlow preprocessed shape: {}".format(np.asarray(agent_preprocessed_states).shape))
    python_preprocessed_states = python_processor.preprocess(states)
    print("Python preprocessed shape: {}".format(np.asarray(python_preprocessed_states).shape))
    print("Asserting (almost) equal values:")
    recursive_assert_almost_equal(agent_preprocessed_states, python_preprocessed_states, decimals=3)
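# For orientation, a minimal sketch of what the "preprocessing_spec" list loaded above usually looks
# like: a list of dicts, each with a preprocessor "type" and a unique "scope" (the scopes are what the
# test collects to address the stack's sub-components). The concrete entries below are illustrative
# assumptions; the real spec lives in configs/ray_apex_for_pong.json.
#
# example_preprocessing_spec = [
#     dict(type="grayscale", scope="grayscale"),
#     dict(type="image_resize", width=84, height=84, scope="image_resize"),
#     dict(type="sequence", sequence_length=4, scope="sequence")
# ]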
def __init__(self, agent, env_spec=None, num_envs=1, frameskip=1, render=False,
             worker_executes_exploration=True, exploration_epsilon=0.1, episode_finish_callback=None):
    """
    Args:
        agent (Agent): Agent to execute the environment on.
        env_spec (Optional[Union[callable, dict]]): Either an environment spec or a callable returning a new
            environment.
        num_envs (int): How many single Environments should be run in parallel in a SequentialVectorEnv.
        frameskip (int): How often actions are repeated after retrieving them from the agent. This setting
            can be overwritten in the single calls to the different `execute_..` methods.
        render (bool): Whether to render the environment after each action. Default: False.
        worker_executes_exploration (bool): Whether the worker executes exploration by sampling.
        exploration_epsilon (Optional[float]): Epsilon to use if the worker executes exploration.
        episode_finish_callback (Optional[callable]): Callback to invoke whenever an episode finishes.
    """
    super(Worker, self).__init__()
    self.num_environments = num_envs
    self.logger = logging.getLogger(__name__)

    if env_spec is not None:
        self.env_ids = ["env_{}".format(i) for i in range_(self.num_environments)]
        self.vector_env = SequentialVectorEnv(env_spec=env_spec, num_envs=self.num_environments)
    else:
        self.env_ids = []
        self.vector_env = None

    self.agent = agent
    self.frameskip = frameskip
    self.render = render

    # Update schedule if the worker is performing updates.
    self.updating = None
    self.steps_before_update = None
    self.update_interval = None
    self.update_steps = None
    self.sync_interval = None
    self.episodes_since_update = 0

    # Default val or None?
    self.update_mode = "time_steps"

    self.worker_executes_exploration = worker_executes_exploration
    self.exploration_epsilon = exploration_epsilon
    self.episode_finish_callback = episode_finish_callback
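# Usage sketch (illustrative, not part of this class): Workers are typically instantiated via a concrete
# subclass such as SingleThreadedWorker; the env_spec dict is replicated num_envs times inside a
# SequentialVectorEnv. The agent construction and config values below are assumptions for illustration.
#
# agent = Agent.from_spec(
#     config_from_path("configs/dqn_vector_env.json"),
#     state_space=..., action_space=...
# )
# worker = SingleThreadedWorker(
#     agent=agent,
#     env_spec=dict(type="openai", gym_env="Pong-v0", frameskip=4),
#     num_envs=4,
#     worker_executes_exploration=True,
#     exploration_epsilon=0.1
# )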
def test_sequential_vector_env(self):
    vector_env = SequentialVectorEnv(
        num_environments=self.num_vector_envs,
        env_spec=self.env_spec,
        num_background_envs=2
    )
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        config_from_path("configs/dqn_vector_env.json"),
        state_space=vector_env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=vector_env.action_space
    )

    states = vector_env.reset_all()
    start = time.monotonic()
    ep_lengths = [0 for _ in range_(self.num_vector_envs)]

    for _ in range_(int(self.samples / self.num_vector_envs)):
        # Sample all envs at once.
        actions, preprocessed_states = agent.get_action(states, extra_returns="preprocessed_states")
        states, rewards, terminals, infos = vector_env.step(actions)
        ep_lengths = [ep_length + 1 for ep_length in ep_lengths]
        for i, terminal in enumerate(terminals):
            if terminal:
                print("reset env {} after {} states".format(i, ep_lengths[i]))
                vector_env.reset(i)
                ep_lengths[i] = 0

    runtime = time.monotonic() - start
    tp = self.samples / runtime

    print("Testing vector env {} performance:".format(self.env_spec["gym_env"]))
    print("Ran {} steps, throughput: {} states/s, total time: {} s".format(self.samples, tp, runtime))
def __init__(self, agent, env_spec=None, num_environments=1, frameskip=1, render=False,
             worker_executes_exploration=True, exploration_epsilon=0.1, episode_finish_callback=None,
             max_timesteps=None):
    """
    Args:
        agent (Agent): Agent to execute the environment on.
        env_spec (Optional[Union[callable, dict]]): Either an environment spec or a callable returning a new
            environment.
        num_environments (int): How many single Environments should be run in parallel in a
            SequentialVectorEnv.
        frameskip (int): How often actions are repeated after retrieving them from the agent. This setting
            can be overwritten in the single calls to the different `execute_..` methods.
        render (bool): Whether to render the environment after each action. Default: False.
        worker_executes_exploration (bool): Whether the worker executes exploration by sampling.
        exploration_epsilon (Optional[float]): Epsilon to use if the worker executes exploration.
        episode_finish_callback (Optional[callable]): Callback to invoke whenever an episode finishes.
        max_timesteps (Optional[int]): A max number of time steps this Worker expects to perform.
            This is not a forced limit, but serves to calculate the `time_percentage` value passed
            into the Agent for time-dependent (decay) parameter calculations.
            If None, the Worker will try to infer this value automatically.
    """
    super(Worker, self).__init__()
    self.num_environments = num_environments
    self.logger = logging.getLogger(__name__)

    # A VectorEnv was passed in directly -> Use that one.
    if isinstance(env_spec, VectorEnv):
        self.vector_env = env_spec
        self.num_environments = self.vector_env.num_environments
        self.env_ids = ["env_{}".format(i) for i in range_(self.num_environments)]
    # `env_spec` describes the single envs inside a SequentialVectorEnv.
    elif env_spec is not None:
        self.vector_env = SequentialVectorEnv(env_spec=env_spec, num_environments=self.num_environments)
        self.env_ids = ["env_{}".format(i) for i in range_(self.num_environments)]
    # No env_spec given.
    else:
        self.vector_env = None
        self.env_ids = []

    self.agent = agent
    self.frameskip = frameskip
    self.render = render

    # Update schedule if the worker is performing updates.
    self.updating = None
    self.steps_before_update = None
    self.update_interval = None
    self.update_steps = None
    self.sync_interval = None
    self.episodes_since_update = 0

    self.max_timesteps = max_timesteps

    # Default val or None?
    self.update_mode = "time_steps"

    self.worker_executes_exploration = worker_executes_exploration
    self.exploration_epsilon = exploration_epsilon
    self.episode_finish_callback = episode_finish_callback
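# Usage sketch (illustrative): with the VectorEnv passthrough above, a pre-built vector env can be passed
# in directly instead of an env_spec dict; num_environments is then taken from the passed env. The
# SingleThreadedWorker subclass and the concrete values below are assumptions for illustration.
#
# vector_env = SequentialVectorEnv(
#     num_environments=4,
#     env_spec=dict(type="openai", gym_env="Pong-v0", frameskip=4)
# )
# worker = SingleThreadedWorker(agent=agent, env_spec=vector_env, max_timesteps=1000000)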
def test_sequential_vector_env(self):
    num_envs = 4
    env = SequentialVectorEnv(num_environments=num_envs, env_spec={"type": "gridworld", "world": "2x2"})

    # Simple test runs with fixed actions. X=player's position.
    # State indices in the 2x2 world, as asserted below: 0=start, 1=field below start, 2=hole, 3=goal.
    s = env.reset(index=0)  # ["XH", " G"]
    self.assertTrue(s == 0)
    s = env.reset_all()
    self.assertTrue(all(s_ == 0 for s_ in s))

    s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
    self.assertTrue(all(s_ == 1 for s_ in s))
    self.assertTrue(all(r_ == -1.0 for r_ in r))
    self.assertTrue(all(not t_ for t_ in t))

    s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" H", " X"] -> goal reached
    self.assertTrue(all(s_ == 3 for s_ in s))
    self.assertTrue(all(r_ == 1.0 for r_ in r))
    self.assertTrue(all(t_ for t_ in t))

    for i in range(num_envs):
        env.reset(index=i)  # ["XH", " G"]
    s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" X", " G"] -> in the hole
    self.assertTrue(all(s_ == 2 for s_ in s))
    self.assertTrue(all(r_ == -5.0 for r_ in r))
    self.assertTrue(all(t_ for t_ in t))

    # Run against a wall.
    env.reset_all()  # ["XH", " G"]
    s, r, t, _ = env.step([3 for _ in range(num_envs)])  # left: ["XH", " G"]
    self.assertTrue(all(s_ == 0 for s_ in s))
    self.assertTrue(all(r_ == -1.0 for r_ in r))
    self.assertTrue(all(not t_ for t_ in t))

    s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
    self.assertTrue(all(s_ == 1 for s_ in s))
    self.assertTrue(all(r_ == -1.0 for r_ in r))
    self.assertTrue(all(not t_ for t_ in t))

    s, r, t, _ = env.step([0 for _ in range(num_envs)])  # up: ["XH", " G"]
    self.assertTrue(all(s_ == 0 for s_ in s))
    self.assertTrue(all(r_ == -1.0 for r_ in r))
    self.assertTrue(all(not t_ for t_ in t))