def main():
    env = gym.envs.make("MountainCar-v0")

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    estimator = Estimator(env, scaler, featurizer)

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
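
The Estimator class and the q_learning routine called above come from the surrounding project and are not shown in this excerpt. As a rough, hypothetical sketch of an estimator compatible with the call Estimator(env, scaler, featurizer): one SGDRegressor per discrete action, trained on the scaled RBF features (the real implementation may differ).

import numpy as np
import sklearn.linear_model


class Estimator:
    """Hypothetical linear Q(s, a) approximator over the RBF features above."""

    def __init__(self, env, scaler, featurizer):
        self.scaler = scaler
        self.featurizer = featurizer
        # One linear model per discrete action.
        self.models = []
        for _ in range(env.action_space.n):
            model = sklearn.linear_model.SGDRegressor(learning_rate="constant")
            # Prime the model with a dummy target so predict() works immediately.
            model.partial_fit([self.featurize_state(env.reset())], [0.0])
            self.models.append(model)

    def featurize_state(self, state):
        scaled = self.scaler.transform([state])
        return self.featurizer.transform(scaled)[0]

    def predict(self, state):
        features = self.featurize_state(state)
        return np.array([m.predict([features])[0] for m in self.models])

    def update(self, state, action, td_target):
        features = self.featurize_state(state)
        self.models[action].partial_fit([features], [td_target])

Because the regressor weights start near zero, the initial Q estimates are roughly 0, which is optimistic for MountainCar (every step earns reward -1); that optimism is what makes epsilon=0.0 sufficient, as the note above says.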
Code example #2
def main():
    env = gym.make('MountainCar-v0')
    outdir = './experiment-results'
    # env = wrappers.Monitor(env, directory=outdir, force=True)

    # Keeps track of useful statistics
    num_episodes = 300
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    agent = Agent(env.action_space.n,
                  scaler,
                  featurizer,
                  env.observation_space.sample(),
                  epsilon=0,
                  gamma=1)

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{} ({})".format(
            i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]),
              end="")
        sys.stdout.flush()

        state = env.reset()
        action = agent.set_initial_state(state)

        for t in itertools.count():
            next_state, reward, done, info = env.step(action)
            action = agent.act(next_state, reward)

            # book-keeping
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

    env.close()
    # gym.upload(outdir, api_key='sk_9YxUhFDaT5XSahcLut47w')

    plotting.plot_cost_to_go_mountain_car(env, agent.Q)
    plotting.plot_episode_stats(stats, smoothing_window=25)
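
The Agent class is also defined elsewhere in the project. A hypothetical sketch that matches the constructor arguments and the set_initial_state / act interface used in the loop above, with the Q-learning update folded into act(); note that the loop never passes done to act(), so this sketch, like the excerpt, bootstraps through terminal transitions.

import numpy as np
import sklearn.linear_model


class Agent:
    """Hypothetical online Q-learning agent matching the interface above."""

    def __init__(self, n_actions, scaler, featurizer, init_state,
                 epsilon=0.0, gamma=1.0):
        self.n_actions = n_actions
        self.scaler = scaler
        self.featurizer = featurizer
        self.epsilon = epsilon
        self.gamma = gamma
        features = self._featurize(init_state)
        # One linear model per discrete action, primed with a dummy target.
        self.models = []
        for _ in range(n_actions):
            model = sklearn.linear_model.SGDRegressor(learning_rate="constant")
            model.partial_fit([features], [0.0])
            self.models.append(model)

    def _featurize(self, state):
        return self.featurizer.transform(self.scaler.transform([state]))[0]

    def _q_values(self, state):
        features = self._featurize(state)
        return np.array([m.predict([features])[0] for m in self.models])

    def _select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self._q_values(state)))

    def set_initial_state(self, state):
        self.state = state
        self.action = self._select_action(state)
        return self.action

    def act(self, next_state, reward):
        # Q-learning update for the stored (state, action) transition.
        td_target = reward + self.gamma * np.max(self._q_values(next_state))
        self.models[self.action].partial_fit(
            [self._featurize(self.state)], [td_target])
        self.state, self.action = next_state, self._select_action(next_state)
        return self.action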
Code example #3
def main():
    matplotlib.style.use('ggplot')

    env = gym.envs.make("MountainCar-v0")

    num_episodes = 100

    estimator_q_learning = tile_coding_estimator.Estimator(env)
    statistics_q_learning = plotting.EpisodeStats(
        "q_learning",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    q_learning_tile_coding.q_learning(env,
                                      estimator_q_learning,
                                      num_episodes,
                                      statistics_q_learning,
                                      epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_q_learning)

    estimator_sarsa = tile_coding_estimator.Estimator(env)
    statistics_sarsa = plotting.EpisodeStats(
        "sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    sarsa_tile_coding.sarsa(env,
                            estimator_sarsa,
                            num_episodes,
                            statistics_sarsa,
                            epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_sarsa)

    estimator_expected_sarsa = tile_coding_estimator.Estimator(env)
    statistics_expected_sarsa = plotting.EpisodeStats(
        "expected_sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    expected_sarsa_tile_coding.expected_sarsa(env,
                                              estimator_expected_sarsa,
                                              num_episodes,
                                              statistics_expected_sarsa,
                                              epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_expected_sarsa)

    plotting.plot_episode_stats(
        [statistics_q_learning, statistics_sarsa, statistics_expected_sarsa],
        smoothing_window=25)
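
The tile_coding_estimator, q_learning_tile_coding, sarsa_tile_coding and expected_sarsa_tile_coding modules are not shown in this excerpt. As a rough, hypothetical illustration of a tile-coding estimator for the 2-D MountainCar state: several offset grids (tilings) each activate exactly one tile per state, and Q(s, a) is the sum of the active tiles' weights.

import numpy as np


class TileCodingEstimator:
    """Hypothetical tile-coding Q(s, a) estimator for a 2-D observation space."""

    def __init__(self, env, num_tilings=8, tiles_per_dim=8, alpha=0.1):
        self.low = env.observation_space.low
        self.high = env.observation_space.high
        self.num_tilings = num_tilings
        self.tiles_per_dim = tiles_per_dim
        self.n_actions = env.action_space.n
        # The step size is shared by all tilings, so scale it down accordingly.
        self.alpha = alpha / num_tilings
        self.weights = np.zeros(
            (self.n_actions, num_tilings, tiles_per_dim, tiles_per_dim))

    def _active_tiles(self, state):
        # Normalize the state to [0, 1] and shift each tiling by a small offset.
        scaled = (state - self.low) / (self.high - self.low)
        for tiling in range(self.num_tilings):
            offset = tiling / (self.num_tilings * self.tiles_per_dim)
            idx = np.minimum(((scaled + offset) * self.tiles_per_dim).astype(int),
                             self.tiles_per_dim - 1)
            yield tiling, idx[0], idx[1]

    def predict(self, state):
        q_values = np.zeros(self.n_actions)
        for tiling, i, j in self._active_tiles(state):
            q_values += self.weights[:, tiling, i, j]
        return q_values

    def update(self, state, action, td_target):
        td_error = td_target - self.predict(state)[action]
        for tiling, i, j in self._active_tiles(state):
            self.weights[action, tiling, i, j] += self.alpha * td_error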
Code example #4
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)
            #plt.figure()
            plt.clf()
            plt.imshow(env.render(mode='rgb_array'))

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward), end="")
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats


estimator = Estimator()

# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
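
Only a commented-out, SARSA-style target (bootstrapping from the sampled next_action) is visible in this fragment; the td_target passed to estimator.update() is computed just above the excerpt. For reference, a hedged sketch of the Q-learning target it most likely uses (hypothetical helper name; estimator and discount_factor come from the surrounding code):

import numpy as np


def q_learning_td_target(estimator, next_state, reward, discount_factor):
    """Off-policy Q-learning target: bootstrap from the greedy next action."""
    q_values_next = estimator.predict(next_state)
    return reward + discount_factor * np.max(q_values_next)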
Code example #5
        fig = None
        final_stats = None
        for episode, t, stats in q_learning(
                env,
                q_estimator=q_estimator,
                target_estimator=target_estimator,
                update_target_estimator_every=10000,
                num_episodes=5000,
                epsilon_start=1,
                epsilon_end=0,
                epsilon_decay_steps=500000):
            final_stats = stats
            if episode % 50 == 0:
                if fig is not None:
                    plt.close()
                fig = plotting.plot_cost_to_go_mountain_car(env,
                                                            q_estimator,
                                                            block=False)

            if episode % 500 == 0:
                q_estimator.save(save_directory)

            run_episode(env, q_estimator, render=False)
        log_episode_stats(get_empty_data_file("stats.csv"), final_stats)

        # plotting.plot_cost_to_go_mountain_car(env, q_estimator)
        # plotting.plot_episode_stats(final_stats, smoothing_window=25)

        while True:
            run_episode(env, q_estimator, render=True)
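
The epsilon_start=1, epsilon_end=0, epsilon_decay_steps=500000 arguments suggest a linearly annealed exploration schedule inside q_learning. A minimal sketch of such a schedule (hypothetical function name; the actual implementation is not shown here):

import numpy as np


def epsilon_for_step(step, epsilon_start=1.0, epsilon_end=0.0,
                     epsilon_decay_steps=500000):
    """Linear anneal from epsilon_start to epsilon_end, then hold."""
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    return epsilons[min(step, epsilon_decay_steps - 1)]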
Code example #6
    def plot(self, stats):
        plotting.plot_cost_to_go_mountain_car(self.env, self.estimator)
        plotting.plot_episode_stats(stats, smoothing_window=25)

            # update current state
            state = new_state

            if terminated:
                break

    return stats


# In[122]:


estimator = Estimator()


# In[123]:


# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)


# In[124]:


plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)

Code example #8
            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print(
                f'Step {t} @ Episode {i_episode + 1}/{num_episodes} ({last_reward})'
            )

            if done:
                break

            state = next_state

    return stats


if __name__ == "__main__":
    estimator = Estimator()

    # Mountain car does not need epsilon > 0
    # Initial estimate for all states is too "optimistic"
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    # Plotting
    plotting.plot_cost_to_go_mountain_car(env,
                                          estimator,
                                          name='q_learning_fn_estimator')
    plotting.plot_episode_stats(stats,
                                name='q_learning_fn_estimator',
                                smoothing_window=25,
                                noshow=True)