Example #1
    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=self.is_windows
        )
        # Note: SAC is more computationally expensive than simpler agents, so keep the episode count small.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward"], -800)
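
The test snippets in this listing call SACAgent, PPOAgent, Agent, OpenAIGymEnv, SingleThreadedWorker and config_from_path without showing the corresponding imports. A minimal header sketch, assuming RLgraph's usual module layout (the exact import paths may differ between versions):

import numpy as np

# Assumed RLgraph imports for the snippets in this listing; verify the paths
# against the installed version before relying on them.
from rlgraph.agents import Agent, PPOAgent, SACAgent
from rlgraph.environments import OpenAIGymEnv
from rlgraph.execution import SingleThreadedWorker
from rlgraph.tests.test_util import config_from_path
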
Example #2
    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  # self.is_windows
            episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs: print(
                "episode: return={} ts={}".format(episode_return, timesteps)
            )
        )
        # Note: SAC is more computationally expensive than simpler agents, so keep the episode count small.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward_last_10_episodes"], -700)
        self.assertGreater(results["max_episode_reward"], -100)
Example #3
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    worker.execute_episodes(100, use_exploration=True)

    # use_exploration=True is used for training; set it to False for evaluation:
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
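
The main() examples assume that FLAGS.config, FLAGS.env and FLAGS.render are defined elsewhere in the script. A minimal sketch of those definitions, assuming absl-style flags (catching flags.Error is compatible with absl.flags as well as the older python-gflags package); the flag names are taken from the snippet, the defaults are placeholders only:

import sys

from absl import flags

FLAGS = flags.FLAGS
# Flag names mirror the FLAGS attributes used in main(); defaults are illustrative.
flags.DEFINE_string("config", None, "Path to the agent's JSON config file.")
flags.DEFINE_string("env", "Pendulum-v0", "OpenAI Gym environment id.")
flags.DEFINE_boolean("render", False, "Whether to render the environment.")

if __name__ == "__main__":
    main(sys.argv)
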
Example #4
    def test_sac_learning_on_gaussian_density_as_reward_env(self):
        """
        Creates an SAC-Agent and runs it via a Runner on the GaussianDensityAsRewardEnv.
        """
        env = GaussianDensityAsRewardEnv(episode_length=5)
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_gaussian_density_env.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent)
        worker.execute_episodes(num_episodes=500)
        rewards = worker.finished_episode_rewards[0]  # 0=1st env in vector-env
        self.assertTrue(np.mean(rewards[:100]) < np.mean(rewards[-100:]))

        worker.execute_episodes(num_episodes=100, use_exploration=False, update_spec=None)
        rewards = worker.finished_episode_rewards[0]
        self.assertTrue(len(rewards) == 100)
        evaluation_score = np.mean(rewards)
        self.assertTrue(.5 * env.get_max_reward() < evaluation_score <= env.get_max_reward())
Example #5
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_episodes(500, use_exploration=True)

        print(results)
Example #6
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, render=FLAGS.render, worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=False,  # self.is_windows
            episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
                "episode return {}; steps={}".format(episode_return, timesteps)
            )
        )
        results = worker.execute_episodes(5000, use_exploration=True)

        print(results)
Example #8
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
        action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
                  format(len(rewards), reward, np.mean(rewards[-10:])))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback)
    print(
        "Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])))
Example #9
episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
              format(len(episode_returns), episode_return,
                     np.mean(episode_returns[-100:])))


worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback)

# use_exploration=True for training; set it to False for evaluation.
worker.execute_episodes(1000, use_exploration=True)


def plotting(Baseline, PPO, quit, finished, quitBaseline, finishedBaseline,
             actionInfo):
    ax1 = plt.subplot(311)
    #ax1.set_title('Scenario 4: average reward of last 100 children without quitting penalty')
    ax1.margins(0.05)
    #ax1.set_xlabel('Number of children')
    ax1.set_title('Average reward of last 100 children')
    ax1.plot(PPO, 'r', label='PPO')
    ax1.plot(Baseline, 'b', label='Baseline')
    ax1.legend()

    ax4 = plt.subplot(312)
    ax4.set_title('Actions taken')

# Define the number of children (episodes) to simulate.
episode_count = 1000

episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, *args,
                              **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".
              format(len(episode_returns), episode_return,
                     np.mean(episode_returns[-100:])))


# create the worker
worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback)

# use_exploration=True for training; set it to False for evaluation.
worker.execute_episodes(episode_count, use_exploration=True)

# Make the plots.
env.gym_env.render()
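
The comment above notes that use_exploration=True is for training and False for evaluation, but the script stops after the training run. A possible follow-up evaluation pass, sketched along the lines of Example #4 (the finished_episode_rewards attribute and the update_spec=None argument are taken from that example and assumed to behave the same way here):

# Evaluation: deterministic actions (no exploration) and no learning updates.
worker.execute_episodes(100, use_exploration=False, update_spec=None)
eval_rewards = worker.finished_episode_rewards[0]  # 0 = first env in the vector-env
print("Evaluation mean reward over {} episodes: {:.2f}".format(
    len(eval_rewards), np.mean(eval_rewards)))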