Example #1
    def test_ppo_on_2x2_grid_world(self):
        """
        Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
        """
        env = GridWorld(world="2x2")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=15),
        )

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=True,
            preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        # Assume we have learned something.
        self.assertGreater(results["mean_episode_reward"], -0.2)
Example #2
    def test_cartpole_with_worker(self):
        env = OpenAIGymEnv("CartPole-v0")
        agent_config = config_from_path("configs/backend_performance_dqn_cartpole.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
            agent=agent,
            frameskip=1,
            num_environments=1,
            worker_executes_preprocessing=False
        )

        result = worker.execute_timesteps(1000)
        print(result)
Example #3
    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=self.is_windows
        )
        # Note: SAC is more computationally expensive.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward"], -800)
Example #4
    def test_sac_on_cartpole(self):
        """
        Creates an SAC-Agent and runs it on CartPole.
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  # self.is_windows,
            episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs:
            print("episode: return={} ts={}".format(episode_return, timesteps))
        )

        time_steps = 5000
        results = worker.execute_timesteps(time_steps)

        print(results)

        self.assertTrue(results["timesteps_executed"] == time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps / 20)
        self.assertGreater(results["mean_episode_reward"], 40.0)
        self.assertGreater(results["max_episode_reward"], 100.0)
        self.assertGreater(results["mean_episode_reward_last_10_episodes"], 100.0)
Example #5
    def test_dqn_on_cart_pole(self):
        """
        Creates a DQNAgent and runs it via a Runner on the CartPole Env.
        """
        dummy_env = OpenAIGymEnv("CartPole-v0")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_cartpole.json"),
            double_q=False,
            dueling_q=False,
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            execution_spec=dict(seed=15),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=64),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=15),
            agent=agent,
            render=self.is_windows,
            worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 25)
        self.assertGreaterEqual(results["max_episode_reward"], 100.0)
        self.assertLessEqual(results["episodes_executed"], 200)
Example #6
    def test_sac_2x2_grid_world_with_container_actions(self):
        """
        Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = SACAgent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(4,)),
            action_space=dummy_env.action_space,
        )

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=False,
            render=False
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)
Example #7
    def test_ppo_on_cart_pole(self):
        """
        Creates a PPO Agent and runs it via a Runner on the CartPole env.
        """
        env = OpenAIGymEnv("CartPole-v0", seed=36)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  #self.is_windows
            episode_finish_callback=lambda episode_return, duration, timesteps,
            env_num: print("episode return {}; steps={}".format(
                episode_return, timesteps)))
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)
        # Assume we have learned something.
        self.assertGreaterEqual(results["mean_episode_reward"], 40.0)
Example #8
    def test_double_dueling_dqn_on_cart_pole(self):
        """
        Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
        """
        dummy_env = OpenAIGymEnv("CartPole-v0")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_cartpole.json"),
            double_q=True,
            dueling_q=True,
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=200),
            execution_spec=dict(seed=156),
            update_spec=dict(update_interval=4,
                             batch_size=64,
                             sync_interval=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=10),
            agent=agent,
            render=self.is_windows,
            worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 15)
        self.assertGreaterEqual(results["max_episode_reward"], 160.0)
        self.assertLessEqual(results["episodes_executed"], 100)
Example #9
    def test_pong_with_worker(self):
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config will trigger worker skips; this one
            # is used for the internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=False
        )

        env = OpenAIGymEnv.from_spec(env_spec)
        agent_config = config_from_path("configs/backend_performance_dqn_pong.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=env_spec,
            agent=agent,
            frameskip=1,
            preprocessing_spec=agent_config["preprocessing_spec"],
            worker_executes_preprocessing=True
        )

        result = worker.execute_timesteps(1000)
        print(result)
Example #10
    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  # self.is_windows
            episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs:
            print("episode: return={} ts={}".format(episode_return, timesteps))
        )
        # Note: SAC is more computationally expensive.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward_last_10_episodes"], -700)
        self.assertGreater(results["max_episode_reward"], -100)
Example #11
    def test_dqn_on_pong(self):
        """
        Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True,
                           visualize=False)
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        preprocessing_spec = agent_config.pop("preprocessor_spec")
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=self.pong_preprocessed_state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        time_steps = 4000000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      render=True,
                                      preprocessing_spec=preprocessing_spec,
                                      worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
Example #12
    def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
        """
        Tests how DQN learns a mapping from a single integer action to multiple sub-actions (as opposed to using
        container actions). A minimal decomposition sketch follows this example.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        action_space = IntBox(0, 18)
        agent = DQNAgent.from_spec(agent_config,
                                   huber_loss=True,
                                   double_q=True,
                                   dueling_q=True,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=action_space,
                                   store_last_q_table=True)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)
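
A brief aside on the flat action space used above: the test maps a single IntBox(0, 18) integer to what would otherwise be a container of forward/turn/jump sub-actions. As a minimal illustration of that idea (not part of the original test; the 3 x 3 x 2 factorization and the helper names below are assumptions for illustration, not GridWorld's actual internal encoding), a flat index can be decomposed and recomposed with divmod:

# Hypothetical factorization: 18 flat actions = 3 (forward) * 3 (turn) * 2 (jump).
FORWARD_CHOICES = 3
TURN_CHOICES = 3
JUMP_CHOICES = 2

def flat_to_container(flat_action):
    # Split the flat integer in [0, 18) into (forward, turn, jump) sub-actions.
    flat_action, jump = divmod(flat_action, JUMP_CHOICES)
    forward, turn = divmod(flat_action, TURN_CHOICES)
    return dict(forward=forward, turn=turn, jump=jump)

def container_to_flat(forward, turn, jump):
    # Inverse mapping back to the single integer the DQN agent outputs.
    return (forward * TURN_CHOICES + turn) * JUMP_CHOICES + jump

# Round-trip check over the whole flat space.
assert all(container_to_flat(**flat_to_container(a)) == a for a in range(18))
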
Example #13
    def test_actor_critic_on_cart_pole(self):
        """
        Creates an Actor-critic and runs it via a Runner on the CartPole Env.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        visualize=False)  #self.is_windows)
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = ActorCriticAgent.from_spec(
            config_from_path("configs/actor_critic_agent_for_cartpole.json"),
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space)

        time_steps = 20000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 20)
        self.assertGreaterEqual(results["max_episode_reward"], 100.0)
Example #14
    def test_ppo_on_cart_pole(self):
        """
        Creates a PPO Agent and runs it via a Runner on the CartPole Env.
        """
        env = OpenAIGymEnv("CartPole-v0", seed=36)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=self.is_windows
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], 23)
        #self.assertGreaterEqual(results["max_episode_reward"], 100.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)
Example #15
    def test_double_dqn_on_2x2_grid_world(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
        """
        env_spec = dict(world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            dueling_q=False,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
            execution_spec=dict(seed=10),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 1000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #16
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(episode_returns), episode_return,
                         np.mean(episode_returns[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyper-parameters and might collapse after reaching the maximum
    # reward. In practice, it is recommended to stop training once a reward threshold is reached (a minimal sketch of
    # such a stopping loop follows this example).
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
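
The two comments in main() above suggest training only until a reward threshold is reached and then evaluating with exploration switched off. Below is a minimal sketch of that workflow, reusing the worker built in main() above; the threshold (REWARD_THRESHOLD), chunk size, chunk budget and evaluation episode count are hypothetical values, not taken from the original script:

REWARD_THRESHOLD = 195.0  # hypothetical target mean return; pick one suited to FLAGS.env
CHUNK = 1000              # train in chunks so the results dict can be checked in between

for _ in range(20):  # at most 20 * 1000 = 20000 timesteps, matching the original call
    results = worker.execute_timesteps(CHUNK, use_exploration=True)
    if results["mean_episode_reward"] >= REWARD_THRESHOLD:
        print("Reward threshold reached (mean={:.2f}), stopping training.".format(
            results["mean_episode_reward"]))
        break

# Evaluate without exploration and without learning updates.
eval_results = worker.execute_episodes(10, use_exploration=False, update_spec=None)
print("Evaluation mean reward: {:.2f}".format(eval_results["mean_episode_reward"]))
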
Example #17
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_episodes(500, use_exploration=True)

        print(results)
Example #18
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
                                  episode_finish_callback=episode_finished_callback)
    print("Starting workload, this will take some time for the agents to build.")

    worker.execute_episodes(100, use_exploration=True)

    # use_exploration=True for training, False for evaluation.

    #worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
Example #19
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)
    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps,
                                  **kwargs):
        episode_returns.append(episode_return)
        finished_episodes = len(episode_returns)
        if finished_episodes % 4 == 0:
            print(
                "Episode {} finished in {:d}sec: total avg. reward={:.2f}; last 10 episodes={:.2f}; last "
                "100 episodes={:.2f}".format(
                    finished_episodes, int(duration), np.mean(episode_returns),
                    np.mean(episode_returns[-min(finished_episodes, 10):]),
                    np.mean(episode_returns[-min(finished_episodes, 100):])))

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(500000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])))
Example #20
    def test_sac_on_cartpole(self):
        """
        Creates an SAC-Agent and runs it on CartPole.
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)

        time_steps = 10000
        results = worker.execute_timesteps(time_steps)

        print(results)
Example #21
    def test_double_dueling_dqn_on_cart_pole(self):
        """
        Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
        """
        gym_env = "CartPole-v0"
        dummy_env = OpenAIGymEnv(gym_env)
        config_ = config_from_path("configs/dqn_agent_for_cartpole.json")
        # Add dueling config to agent.
        config_["policy_spec"] = {
            "units_state_value_stream": 3,
            "action_adapter_spec": {
                "pre_network_spec": [{
                    "type": "dense",
                    "units": 3
                }]
            }
        }
        agent = DQNAgent.from_spec(config_,
                                   double_q=True,
                                   dueling_q=True,
                                   state_space=dummy_env.state_space,
                                   action_space=dummy_env.action_space,
                                   execution_spec=dict(seed=13),
                                   update_spec=dict(update_interval=4,
                                                    batch_size=64,
                                                    sync_interval=16),
                                   optimizer_spec=dict(type="adam",
                                                       learning_rate=0.01),
                                   store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv(gym_env, seed=10),
            agent=agent,
            render=self.is_windows,
            worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 25)
        self.assertLessEqual(results["episodes_executed"], 150)
Example #22
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
Example #23
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  #self.is_windows,
            episode_finish_callback=lambda episode_return, duration, timesteps,
            env_num: print("episode return {}; steps={}".format(
                episode_return, timesteps)))
        results = worker.execute_episodes(5000, use_exploration=True)

        print(results)
Example #24
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
Example #25
    def test_ppo_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # -----
        # |^|H|
        # -----
        # | |G|  ^=start, looking up
        # -----

        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = PPOAgent.from_spec(agent_config,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space)

        time_steps = 5000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps)
        # Assume we have learned something.
        self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
Example #26
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()

    agent = Agent.from_spec(agent_config,
                            state_space=env.state_space,
                            action_space=env.action_space)
    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        #synchronous_reset=True,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(100000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
Example #27
    def test_sac_learning_on_gaussian_density_as_reward_env(self):
        """
        Creates an SAC-Agent and runs it via a Runner on the GaussianDensityAsRewardEnv.
        """
        env = GaussianDensityAsRewardEnv(episode_length=5)
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_gaussian_density_env.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent)
        worker.execute_episodes(num_episodes=500)
        rewards = worker.finished_episode_rewards[0]  # 0=1st env in vector-env
        self.assertTrue(np.mean(rewards[:100]) < np.mean(rewards[-100:]))

        worker.execute_episodes(num_episodes=100, use_exploration=False, update_spec=None)
        rewards = worker.finished_episode_rewards[0]
        self.assertTrue(len(rewards) == 100)
        evaluation_score = np.mean(rewards)
        self.assertTrue(.5 * env.get_max_reward() < evaluation_score <= env.get_max_reward())
Example #28
    def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = DQNAgent.from_spec(agent_config,
                                   double_q=True,
                                   dueling_q=False,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space,
                                   execution_spec=dict(seed=15),
                                   store_last_q_table=True)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("LAST q-table:\n{}".format(agent.last_q_table))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -7)
        self.assertGreaterEqual(results["max_episode_reward"], -1.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 3)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (0., 0., -1., 0.): {
                "forward": (-5.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., -1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
        }
        for state, q_values_forward, q_values_jump in zip(
                agent.last_q_table["states"],
                agent.last_q_table["q_values"]["forward"],
                agent.last_q_table["q_values"]["jump"]):
            state, q_values_forward, q_values_jump = tuple(state), tuple(
                q_values_forward), tuple(q_values_jump)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(
                q_values_forward,
                expected_q_values_per_state[state]["forward"],
                decimals=0)
            recursive_assert_almost_equal(
                q_values_jump,
                expected_q_values_per_state[state]["jump"],
                decimals=0)
Example #29
episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, *args,
                              **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
              format(len(episode_returns), episode_return,
                     np.mean(episode_returns[-100:])))


worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback)

# use_exploration=True for training, False for evaluation.
worker.execute_episodes(1000, use_exploration=True)


def plotting(Baseline, PPO, quit, finished, quitBaseline, finishedBaseline,
             actionInfo):
    ax1 = plt.subplot(311)
    #ax1.set_title('Scenario 4: average reward of last 100 children without quitting penalty')
    ax1.margins(0.05)
    #ax1.set_xlabel('Number of children')
    ax1.set_title('Average reward of last 100 children')
    ax1.plot(PPO, 'r', label='PPO')