def test_update_from_demos(self):
    """
    Tests the separate API method to update from demos.
    """
    env = OpenAIGymEnv.from_spec(self.env_spec)
    agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
    agent = DQFDAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)

    # Two fixed states, each paired with a fixed "correct" action.
    state_1 = agent.preprocessed_state_space.with_batch_rank().sample(1)
    action_1 = [1]
    state_2 = agent.preprocessed_state_space.with_batch_rank().sample(1)
    action_2 = [0]

    # Insert the two fixed state/action pairs repeatedly, with random
    # rewards, next-states and terminals.
    for _ in range(10):
        agent.observe_demos(
            preprocessed_states=state_1,
            actions=action_1,
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )
        agent.observe_demos(
            preprocessed_states=state_2,
            actions=action_2,
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )

    # Update from the demo memory.
    agent.update_from_demos(num_updates=1000, batch_size=8)

    # Test that the fixed states now map to their fixed actions.
    action = agent.get_action(states=state_1, apply_preprocessing=False, use_exploration=False)
    self.assertEqual(action, action_1)
    action = agent.get_action(states=state_2, apply_preprocessing=False, use_exploration=False)
    self.assertEqual(action, action_2)
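# Note: the snippets in this section are shown without their import blocks.
# A sketch of the imports they rely on follows; the rlgraph module paths are
# assumptions inferred from the class names used here, not verified against
# the actual repo layout.
import json
import os
import sys
import time
import unittest
from logging import DEBUG

import numpy as np

# Assumed rlgraph-style import paths (commented out, as they are unverified):
# from rlgraph.agents import (Agent, ApexAgent, DQFDAgent, DQNAgent,
#                             IMPALAAgent, PPOAgent, RandomAgent, SACAgent)
# from rlgraph.environments import OpenAIGymEnv
# from rlgraph.execution import SingleThreadedWorker
# from rlgraph.spaces import BoolBox, FloatBox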
def test_dqn_compilation(self):
    """
    Tests DQN Agent compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
def test_multi_gpu_apex_agent_compilation(self):
    """
    Tests if the multi-GPU strategy can compile successfully on a multi-GPU
    system, but also runs on a CPU-only system using fake-GPU logic for
    testing purposes.
    """
    root_logger.setLevel(DEBUG)
    agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled Apex agent")
def test_multi_gpu_apex_agent_compilation(self):
    """
    Tests if the multi-GPU strategy can compile successfully on a multi-GPU
    system. THIS TEST REQUIRES A MULTI GPU SYSTEM.
    """
    root_logger.setLevel(DEBUG)
    agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled Apex agent")
def test_dqn_compilation(self):
    """
    Tests DQN Agent compilation from a PyTorch test config on the Pong env
    (no Runner is involved).
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/dqn_pytorch_test.json")
    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
def test_apex_compilation(self):
    """
    Tests agent compilation without Ray to ease debugging on Windows.
    """
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")

    # TODO: remove after memory types are unified across backends.
    if get_backend() == "pytorch":
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"

    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled apex agent")
def test_cartpole_with_worker(self):
    env = OpenAIGymEnv("CartPole-v0")
    agent_config = config_from_path("configs/backend_performance_dqn_cartpole.json")

    # Test CPU settings for batching here: disable agent-internal updates.
    agent_config["update_spec"] = None

    agent = DQNAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
        agent=agent,
        frameskip=1,
        num_envs=1,
        worker_executes_preprocessing=False
    )
    result = worker.execute_timesteps(1000)
    print(result)
class TestSingleThreadedWorker(unittest.TestCase):

    environment = OpenAIGymEnv(gym_env='CartPole-v0')

    def test_timesteps(self):
        """
        Simply tests if the timestep execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )
        result = worker.execute_timesteps(100)
        self.assertEqual(result['timesteps_executed'], 100)
        self.assertGreater(result['episodes_executed'], 0)
        self.assertLessEqual(result['episodes_executed'], 100)
        self.assertGreaterEqual(result['env_frames'], 100)
        self.assertGreaterEqual(result['runtime'], 0.0)

    def test_episodes(self):
        """
        Simply tests if the episode execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )
        result = worker.execute_episodes(5, max_timesteps_per_episode=10)
        # At most 5 episodes * 10 timesteps each.
        self.assertLessEqual(result['timesteps_executed'], 50)
        self.assertEqual(result['episodes_executed'], 5)
        self.assertLessEqual(result['env_frames'], 50)
        self.assertGreaterEqual(result['runtime'], 0.0)
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_episodes(500, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })
    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyper-parameters and might
    # collapse after reaching the maximum reward. In practice, it is recommended
    # to stop training once a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
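# The main() entry points in this section reference an absl-flags preamble
# that is not shown. A minimal sketch follows, assuming absl.flags; the flag
# names are inferred from the FLAGS attributes these scripts actually read
# (config, env, render, visualize, summary_regexp), while the defaults and
# help strings are made up for illustration.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", None, "Path to an agent config JSON file.")
flags.DEFINE_string("env", None, "OpenAI Gym environment ID, e.g. CartPole-v0.")
flags.DEFINE_boolean("render", False, "Render the environment during training.")
# One script below compares FLAGS.visualize against -1, so it treats this
# flag as an integer rather than a boolean:
flags.DEFINE_integer("visualize", -1, "Number of environments to visualize.")
flags.DEFINE_string("summary_regexp", None, "Regexp selecting variables to summarize.")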
def test_dqn_on_pong(self):
    """
    Creates a DQNAgent and runs it via a Runner on an openAI Pong env.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    time_steps = 4000000
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, render=False)
    results = worker.execute_timesteps(time_steps, use_exploration=True)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)
    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation:
    worker.execute_episodes(100, use_exploration=True)
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
def test_sac_on_cartpole(self):
    """
    Creates an SAC-Agent and runs it on CartPole.
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    time_steps = 10000
    results = worker.execute_timesteps(time_steps)
    print(results)
def test_post_processing(self):
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ray_apex_for_pong.json")

    # Test CPU settings for batching here.
    agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
    agent_config["execution_spec"]["torch_num_threads"] = 1
    agent_config["execution_spec"]["OMP_NUM_THREADS"] = 1

    agent = ApexAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    samples = 200
    rewards = np.random.random(size=samples)
    states = list(agent.preprocessed_state_space.sample(samples))
    actions = agent.action_space.sample(samples)
    terminals = np.zeros(samples, dtype=np.uint8)
    # Shift states by one to obtain next-states; append one fresh sample.
    next_states = states[1:]
    next_states.extend([agent.preprocessed_state_space.sample(1)])
    next_states = np.asarray(next_states)
    states = np.asarray(states)
    weights = np.ones_like(rewards)

    for _ in range(1):
        start = time.perf_counter()
        _, loss_per_item = agent.post_process(
            dict(
                states=states,
                actions=actions,
                rewards=rewards,
                terminals=terminals,
                next_states=next_states,
                importance_weights=weights
            )
        )
        print("post process time = {}".format(time.perf_counter() - start))
    profile = Component.call_times
    print_call_chain(profile, False, 0.003)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override openAI gym env per command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)

    # Override number of visualized envs per command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
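# This script (and the IMPALA test further below) calls a _calc_mean_return()
# helper that is not shown. A minimal sketch follows; it assumes the agent's
# update() return value carries the processed sample batch at index 3, with
# flat "rewards" and "terminals" arrays. Both the index and the keys are
# assumptions, not confirmed by the snippets above.
import numpy as np

def _calc_mean_return(records):
    # Flatten per-step rewards and terminal flags from the update records.
    size = records[3]["rewards"].size
    rewards = records[3]["rewards"].reshape((size,))
    terminals = records[3]["terminals"].reshape((size,))
    # Sum rewards within each completed episode.
    returns, episode_return = [], 0.0
    for reward, terminal in zip(rewards, terminals):
        episode_return += reward
        if terminal:
            returns.append(episode_return)
            episode_return = 0.0
    # NaN if no episode finished in this batch (callers use np.nanmean).
    return np.mean(returns) if returns else np.nan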
def test_image_value_functions(self):
    """
    Tests if actions and states are successfully merged on image inputs
    to compute Q(s,a).
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pong.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    # Test updating from an image batch.
    batch = dict(
        states=agent.preprocessed_state_space.sample(32),
        actions=env.action_space.sample(32),
        rewards=np.ones((32,)),
        terminals=np.zeros((32,)),
        next_states=agent.preprocessed_state_space.sample(32),
    )
    print(agent.update(batch))
def test_apex_weight_syncing(self):
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = Agent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )

    weights = agent.get_weights()["policy_weights"]
    print("type weights = ", type(weights))

    # Note: += on the ndarray values modifies `weights` in place.
    for variable, value in weights.items():
        print("Type value = ", type(value))
        value += 0.01

    agent.set_weights(weights)
    new_weights = agent.get_weights()["policy_weights"]
    recursive_assert_almost_equal(weights, new_weights)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def test_update_from_external(self):
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = Agent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    batch = {
        "states": agent.preprocessed_state_space.sample(200),
        "actions": environment.action_space.sample(200),
        "rewards": np.zeros(200, dtype=np.float32),
        "terminals": [False] * 200,
        "next_states": agent.preprocessed_state_space.sample(200),
        "importance_weights": np.ones(200, dtype=np.float32)
    }
    agent.update(batch)
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
            print("episode return {}; steps={}".format(episode_return, timesteps))
    )
    results = worker.execute_episodes(5000, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def test_policy_sync(self):
    """
    Tests weight syncing of the policy (and only the policy, not the Q-functions).
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    weights = agent.get_weights()
    print("weights =", weights.keys())

    new_weights = {}
    for key, value in weights["policy_weights"].items():
        new_weights[key] = value + 0.01

    agent.set_weights(policy_weights=new_weights, value_function_weights=None)
    updated_weights = agent.get_weights()["policy_weights"]
    recursive_assert_almost_equal(updated_weights, new_weights)
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:TestEnv-v0'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
        # ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        # action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs, time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3
            # action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
            # next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
            # agent_obs.observe(ob_obs, action, None, reward, next_ob_obs, done)

            ob_obs = next_ob_obs
            if done:
                # print(envObs.gym_env.rewards)
                improvements.append(envObs.gym_env.rewards)
                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
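# The two evaluate() variants in this section repeat the same inline
# observation scaling. A small helper (hypothetical, not part of the original
# scripts) makes the (ob - offset) / scale pattern explicit and harder to
# mistype:
import numpy as np

def normalize_obs(ob, offset, scale):
    # Element-wise shift and scale of an observation vector.
    return (np.asarray(ob, dtype=np.float64) - np.asarray(offset)) / np.asarray(scale)

# Usage matching the TestEnv script above:
# ob_obs = normalize_obs(ob_obs, [-4.5, 0, -5, 0, 0, 1.5, 25, 0],
#                        [9, 1, 10, 1, 1, 3, 50, 1])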
def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
def test_impala_on_breakout(self):
    """
    Creates an IMPALAAgent and runs it via direct update calls on an
    openAI Breakout env.
    """
    env = OpenAIGymEnv(
        "Breakout-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False
    )
    config_ = config_from_path("configs/impala_agent_for_breakout.json")
    agent = IMPALAAgent.from_spec(
        config_,
        state_space=env.state_space,
        action_space=env.action_space,
    )
    learn_updates = 4000000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    time.sleep(3)
    agent.terminate()
    time.sleep(3)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    # Pendulum-v0 episodes are fixed at 200 timesteps each.
    self.assertEqual(results["timesteps_executed"], episodes * 200)
    self.assertEqual(results["episodes_executed"], episodes)
    self.assertGreater(results["mean_episode_reward"], -800)
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs, time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3
            action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
            ob_obs = next_ob_obs

            if done:
                improvements.append(envObs.gym_env.info['improvementPerChild'])
                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
def test_ppo_on_continuous_action_environment(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0", seed=652)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    time_steps = 100000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    # self.assertGreaterEqual(results["mean_episode_reward"], 23)
    # self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
def test_pong_with_worker(self):
    env_spec = dict(
        type="openai",
        gym_env="PongNoFrameskip-v4",
        # The frameskip in the agent config will trigger worker skips; this
        # frameskip is used for the internal env.
        frameskip=4,
        max_num_noops=30,
        episodic_life=False
    )
    env = OpenAIGymEnv.from_spec(env_spec)
    agent_config = config_from_path("configs/backend_performance_dqn_pong.json")

    # Test CPU settings for batching here.
    if get_backend() == "pytorch":
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
    agent_config["update_spec"] = None

    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        frameskip=1,
        num_envs=1,
        preprocessing_spec=agent_config["preprocessing_spec"],
        worker_executes_preprocessing=True
    )
    result = worker.execute_timesteps(1000)
    print(result)
def test_openai_atari_env(self):
    env = OpenAIGymEnv("Pong-v0")

    # Simple test runs with fixed actions.
    s = env.reset()
    # Assert we have pixels.
    self.assertGreaterEqual(np.mean(s), 0)
    self.assertLessEqual(np.mean(s), 255)

    accum_reward = 0.0
    for _ in range(100):
        s, r, t, _ = env.step(env.action_space.sample())
        assert isinstance(r, np.ndarray)
        assert r.dtype == np.float32
        assert isinstance(t, bool)
        self.assertGreaterEqual(np.mean(s), 0)
        self.assertLessEqual(np.mean(s), 255)
        accum_reward += r

    print("Accumulated Reward: {}".format(accum_reward))
    env.terminate()