Example #1
    def test_update_from_demos(self):
        """
        Tests the separate API method to update from demos.
        """
        env = OpenAIGymEnv.from_spec(self.env_spec)
        agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=env.state_space,
                                    action_space=env.action_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)
        state_1 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_1 = [1]
        state_2 = agent.preprocessed_state_space.with_batch_rank().sample(1)
        action_2 = [0]

        # Insert the two fixed state/action pairs repeatedly (with random rewards, next-states and terminals).
        for _ in range(10):
            # State with correct action
            agent.observe_demos(
                preprocessed_states=state_1,
                actions=action_1,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )
            agent.observe_demos(
                preprocessed_states=state_2,
                actions=action_2,
                rewards=rewards.sample(1),
                next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
                terminals=terminals.sample(1),
            )

        # Update.
        agent.update_from_demos(num_updates=1000, batch_size=8)

        # Test if fixed states and actions map.
        action = agent.get_action(states=state_1,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_1)

        action = agent.get_action(states=state_2,
                                  apply_preprocessing=False,
                                  use_exploration=False)
        self.assertEqual(action, action_2)
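
The test and script excerpts in this collection omit their imports. A minimal sketch of what they appear to assume is below; the exact module paths are an assumption and may differ between RLgraph versions.

# Assumed imports for the snippets in this collection (a sketch; module paths are a
# best guess and may differ between RLgraph versions).
import numpy as np

from rlgraph.agents import Agent, ApexAgent, DQFDAgent, DQNAgent, PPOAgent, SACAgent
from rlgraph.environments import OpenAIGymEnv
from rlgraph.execution import SingleThreadedWorker
from rlgraph.spaces import BoolBox, FloatBox
from rlgraph.tests.test_util import config_from_path, recursive_assert_almost_equal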
Example #2
    def test_dqn_compilation(self):
        """
        Tests DQN Agent compilation.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)
Example #3
    def test_multi_gpu_apex_agent_compilation(self):
        """
        Tests if the multi-GPU strategy can compile successfully on a multi-GPU system; it
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        root_logger.setLevel(DEBUG)
        agent_config = config_from_path(
            "configs/multi_gpu_ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = ApexAgent.from_spec(agent_config,
                                    state_space=environment.state_space,
                                    action_space=environment.action_space)
        print("Compiled Apex agent")
Example #4
    def test_multi_gpu_apex_agent_compilation(self):
        """
        Tests if the multi-GPU strategy can compile successfully on a multi-GPU system.

        THIS TEST REQUIRES A MULTI-GPU SYSTEM.
        """
        root_logger.setLevel(DEBUG)
        agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = ApexAgent.from_spec(
            agent_config, state_space=environment.state_space, action_space=environment.action_space
        )
        print("Compiled Apex agent")
Example #5
    def test_dqn_compilation(self):
        """
        Tests DQNAgent compilation with the PyTorch test config.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/dqn_pytorch_test.json")
        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)
Example #6
    def test_apex_compilation(self):
        """
        Tests agent compilation without Ray to ease debugging on Windows.
        """
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        # TODO remove after unified.
        if get_backend() == "pytorch":
            agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = ApexAgent.from_spec(agent_config,
                                    state_space=environment.state_space,
                                    action_space=environment.action_space)
        print('Compiled apex agent')
Example #7
    def test_cartpole_with_worker(self):
        env = OpenAIGymEnv("CartPole-v0")
        agent_config = config_from_path(
            "configs/backend_performance_dqn_cartpole.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
            agent=agent,
            frameskip=1,
            num_envs=1,
            worker_executes_preprocessing=False)

        result = worker.execute_timesteps(1000)
        print(result)
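
The result dict printed above is not unpacked here; other examples in this collection assert on fields such as timesteps_executed, episodes_executed, env_frames, runtime, and mean_episode_reward. A small sketch of inspecting those keys (the exact key set may vary by RLgraph version):

# Inspect the worker stats asserted on elsewhere in this collection; assumes `result`
# behaves like a plain dict and that these keys exist in the installed RLgraph version.
for key in ("timesteps_executed", "episodes_executed", "env_frames",
            "runtime", "mean_episode_reward"):
    print("{}: {}".format(key, result.get(key)))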
Example #8
class TestSingleThreadedWorker(unittest.TestCase):

    environment = OpenAIGymEnv(gym_env='CartPole-v0')

    def test_timesteps(self):
        """
        Simply tests if timestep execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )

        result = worker.execute_timesteps(100)
        self.assertEqual(result['timesteps_executed'], 100)
        self.assertGreater(result['episodes_executed'], 0)
        self.assertLessEqual(result['episodes_executed'], 100)
        self.assertGreaterEqual(result['env_frames'], 100)
        self.assertGreaterEqual(result['runtime'], 0.0)

    def test_episodes(self):
        """
        Simply tests if episode execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )

        result = worker.execute_episodes(5, max_timesteps_per_episode=10)
        # Max 5 * 10.
        self.assertLessEqual(result['timesteps_executed'], 50)
        self.assertEqual(result['episodes_executed'], 5)
        self.assertLessEqual(result['env_frames'], 50)
        self.assertGreaterEqual(result['runtime'], 0.0)
Example #9
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_episodes(500, use_exploration=True)

        print(results)
Example #10
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(episode_returns), episode_return,
                         np.mean(episode_returns[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training; set it to False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyperparameters and may collapse after reaching the maximum
    # reward. In practice, it is recommended to stop training once a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
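
The main(argv) scripts in this collection reference a module-level FLAGS object that is not shown. A hedged sketch of how those flags are presumably declared with absl.flags follows; the flag names are inferred from their usage in these scripts, while the types and defaults are placeholders, and individual scripts may declare them differently.

# Hypothetical flag declarations inferred from the FLAGS.config, FLAGS.env, FLAGS.render
# and FLAGS.visualize usages in these scripts; types and defaults are placeholders.
import sys

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", None, "Path to the agent config JSON.")
flags.DEFINE_string("env", "CartPole-v0", "OpenAI gym environment id.")
flags.DEFINE_boolean("render", False, "Render the environment during execution.")
flags.DEFINE_integer("visualize", -1, "How many environments to visualize (-1 = config default).")

if __name__ == "__main__":
    main(sys.argv)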
Example #11
    def test_dqn_on_pong(self):
        """
        Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
        """
        env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        time_steps = 4000000
        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    worker.execute_episodes(100, use_exploration=True)

    # use_exploration=True for training; set it to False for evaluation:
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
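
The commented-out call above hints at a separate evaluation pass. Below is a minimal sketch of that train-then-evaluate pattern with the same worker API; the mean_episode_reward key is taken from other examples in this collection and is an assumption here.

# Train with exploration enabled, then evaluate greedily with exploration disabled.
worker.execute_episodes(100, use_exploration=True)
eval_results = worker.execute_episodes(10, use_exploration=False)
# "mean_episode_reward" is one of the result keys asserted on elsewhere in this collection.
print("Eval mean reward: {:.2f}".format(eval_results["mean_episode_reward"]))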
Example #13
    def test_sac_on_cartpole(self):
        """
        Creates an SAC-Agent and runs it on CartPole.
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)

        time_steps = 10000
        results = worker.execute_timesteps(time_steps)

        print(results)
Example #14
    def test_post_processing(self):
        env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")

        # Test cpu settings for batching here.
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        agent_config["execution_spec"]["torch_num_threads"] = 1
        agent_config["execution_spec"]["OMP_NUM_THREADS"] = 1

        agent = ApexAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )
        samples = 200
        rewards = np.random.random(size=samples)
        states = list(agent.preprocessed_state_space.sample(samples))
        actions = agent.action_space.sample(samples)
        terminals = np.zeros(samples, dtype=np.uint8)
        next_states = states[1:]
        next_states.extend([agent.preprocessed_state_space.sample(1)])
        next_states = np.asarray(next_states)
        states = np.asarray(states)
        weights = np.ones_like(rewards)

        for _ in range(1):
            start = time.perf_counter()
            _, loss_per_item = agent.post_process(
                dict(
                    states=states,
                    actions=actions,
                    rewards=rewards,
                    terminals=terminals,
                    next_states=next_states,
                    importance_weights=weights
                )
            )
            print("post process time = {}".format(time.perf_counter() - start))
        profile = Component.call_times
        print_call_chain(profile, False, 0.003)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override openAI gym env per command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)
    # Override number of visualized envs per command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
    def test_image_value_functions(self):
        """
        Tests if actions and states are successfully merged on image inputs to compute Q(s,a).
        """
        env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pong.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        # Test updating from image batch.
        batch = dict(
            states=agent.preprocessed_state_space.sample(32),
            actions=env.action_space.sample(32),
            rewards=np.ones((32,)),
            terminals=np.zeros((32,)),
            next_states=agent.preprocessed_state_space.sample(32),
        )
        print(agent.update(batch))
Example #17
    def test_apex_weight_syncing(self):
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        weights = agent.get_weights()["policy_weights"]
        print("type weights = ", type(weights))
        for variable, value in weights.items():
            print("Type value = ", type(value))
            # In-place add on the ndarray so the change is reflected in the `weights` dict.
            value += 0.01
        agent.set_weights(weights)

        new_weights = agent.get_weights()["policy_weights"]
        recursive_assert_almost_equal(weights, new_weights)
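
recursive_assert_almost_equal is an RLgraph test utility that is not shown in these excerpts. Below is a minimal illustrative stand-in (not the library's implementation) that captures the idea of recursively comparing nested weight structures.

import numpy as np


def recursive_almost_equal_sketch(a, b, decimals=7):
    """Illustrative stand-in: recursively compare dicts/lists/arrays almost-equally."""
    if isinstance(a, dict):
        assert set(a.keys()) == set(b.keys())
        for key in a:
            recursive_almost_equal_sketch(a[key], b[key], decimals)
    elif isinstance(a, (list, tuple)):
        assert len(a) == len(b)
        for x, y in zip(a, b):
            recursive_almost_equal_sketch(x, y, decimals)
    else:
        np.testing.assert_almost_equal(a, b, decimal=decimals)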
Example #18
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
Example #19
    def test_update_from_external(self):
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        batch = {
            "states": agent.preprocessed_state_space.sample(200),
            "actions": environment.action_space.sample(200),
            "rewards": np.zeros(200, dtype=np.float32),
            "terminals": [False] * 200,
            "next_states": agent.preprocessed_state_space.sample(200),
            "importance_weights":  np.ones(200, dtype=np.float32)
        }

        agent.update(batch)
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  # self.is_windows,
            episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
                print("episode return {}; steps={}".format(episode_return, timesteps)))
        results = worker.execute_episodes(5000, use_exploration=True)

        print(results)
Example #21
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
    def test_policy_sync(self):
        """
        Tests weight syncing of policy (and only policy, not Q-functions).
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        print("weights =", weights.keys())

        new_weights = {}
        for key, value in weights["policy_weights"].items():
            new_weights[key] = value + 0.01

        agent.set_weights(policy_weights=new_weights,
                          value_function_weights=None)

        updated_weights = agent.get_weights()["policy_weights"]
        recursive_assert_almost_equal(updated_weights, new_weights)
Example #23
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:TestEnv-v0'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
        # ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        # action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs,
                                          time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3

            # action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
            # next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]

            # agent_obs.observe(ob_obs, action, None, reward, next_ob_obs, done)
            ob_obs = next_ob_obs

            if done:
                # print(envObs.gym_env.rewards)
                improvements.append(envObs.gym_env.rewards)

                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
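
The hard-coded lists above implement a fixed per-feature observation normalization, (obs - offset) / scale. Below is a small helper that expresses the same transform with named constants; the helper and constant names are hypothetical, while the values are copied from the code above.

import numpy as np

# Hypothetical helper for the per-feature observation normalization used above.
TESTENV_OFFSET = np.array([-4.5, 0, -5, 0, 0, 1.5, 25, 0], dtype=np.float32)
TESTENV_SCALE = np.array([9, 1, 10, 1, 1, 3, 50, 1], dtype=np.float32)


def normalize_obs(obs, offset=TESTENV_OFFSET, scale=TESTENV_SCALE):
    """Shift and scale a raw observation into roughly unit range."""
    return (np.asarray(obs, dtype=np.float32) - offset) / scale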
Example #24
    def test_ppo_on_cart_pole(self):
        """
        Creates a PPO Agent and runs it via a Runner on the CartPole env.
        """
        env = OpenAIGymEnv("CartPole-v0", seed=36)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        time_steps = 3000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)
    def test_impala_on_outbreak(self):
        """
        Creates an IMPALAAgent and trains it via repeated update() calls on an OpenAI Breakout Env.
        """
        env = OpenAIGymEnv("Breakout-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
        config_ = config_from_path("configs/impala_agent_for_breakout.json")
        agent = IMPALAAgent.from_spec(
            config_,
            state_space=env.state_space,
            action_space=env.action_space,
        )

        learn_updates = 4000000
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

        time.sleep(3)
        agent.terminate()
        time.sleep(3)
Example #26
    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        # Note: SAC is more computationally expensive.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward"], -800)
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
    })

    improvements = []
    for i in range(0, nChildren):
        ob_obs = envObs.reset()
        ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
        action_list_obs = []

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs,
                                          time_percentage=time_percentage_obs)
            # action = np.random.randint(0, 4)
            # action = 3

            action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]

            ob_obs = next_ob_obs

            if done:
                improvements.append(envObs.gym_env.info['improvementPerChild'])

                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
Example #28
    def test_ppo_on_continuous_action_environment(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum Env.
        """
        env = OpenAIGymEnv("Pendulum-v0", seed=652)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        time_steps = 100000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], 23)
        #self.assertGreaterEqual(results["max_episode_reward"], 100.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)
Example #29
    def test_pong_with_worker(self):
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config will trigger worker frame-skips; this
            # one is used for the internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=False)

        env = OpenAIGymEnv.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/backend_performance_dqn_pong.json")

        # Test cpu settings for batching here.
        if get_backend() == "pytorch":
            agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=env_spec,
            agent=agent,
            frameskip=1,
            num_envs=1,
            preprocessing_spec=agent_config["preprocessing_spec"],
            worker_executes_preprocessing=True)

        result = worker.execute_timesteps(1000)
        print(result)
Example #30
    def test_openai_atari_env(self):
        env = OpenAIGymEnv("Pong-v0")

        # Simple test runs with fixed actions.
        s = env.reset()
        # Assert we have pixels.
        self.assertGreaterEqual(np.mean(s), 0)
        self.assertLessEqual(np.mean(s), 255)
        accum_reward = 0.0
        for _ in range(100):
            s, r, t, _ = env.step(env.action_space.sample())
            assert isinstance(r, np.ndarray)
            assert r.dtype == np.float32
            assert isinstance(t, bool)
            self.assertGreaterEqual(np.mean(s), 0)
            self.assertLessEqual(np.mean(s), 255)
            accum_reward += r

        print("Accumulated Reward: ".format(accum_reward))

        env.terminate()
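
For contrast, the same smoke test against the raw gym API (pre-0.26, 4-tuple step signature) would yield a plain float reward rather than the np.float32 array asserted on above. A sketch, assuming a gym installation with the Atari extras:

import gym
import numpy as np

# Equivalent smoke test without the RLgraph wrapper (old gym 4-tuple step API).
env = gym.make("Pong-v0")
s = env.reset()
assert 0 <= np.mean(s) <= 255
accum_reward = 0.0
for _ in range(100):
    s, r, done, _ = env.step(env.action_space.sample())
    accum_reward += r
    if done:
        s = env.reset()
print("Accumulated Reward: {}".format(accum_reward))
env.close()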