def main():
    env = gym.envs.make("MountainCar-v0")

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    estimator = Estimator(env, scaler, featurizer)

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic", which
    # leads to the exploration of all states.
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
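# A minimal sketch of the Estimator constructed above; this is hypothetical,
# not the original class. It assumes one sklearn SGDRegressor per action,
# trained on the scaled, RBF-featurized state. The single dummy partial_fit
# call gives each model initial weights so predict() works before the first
# real update.
import numpy as np
from sklearn.linear_model import SGDRegressor


class Estimator:
    def __init__(self, env, scaler, featurizer):
        self.scaler = scaler
        self.featurizer = featurizer
        init_features = self.featurize_state(env.reset())
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([init_features], [0])
            self.models.append(model)

    def featurize_state(self, state):
        # Scale to zero mean / unit variance, then project onto the RBF features
        return self.featurizer.transform(self.scaler.transform([state]))[0]

    def predict(self, s, a=None):
        features = self.featurize_state(s)
        if a is None:
            return np.array([m.predict([features])[0] for m in self.models])
        return self.models[a].predict([features])[0]

    def update(self, s, a, target):
        # One SGD step pulling Q(s, a) toward the TD target
        self.models[a].partial_fit([self.featurize_state(s)], [target])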
def main():
    env = gym.make('MountainCar-v0')
    outdir = './experiment-results'
    # env = wrappers.Monitor(env, directory=outdir, force=True)

    # Keeps track of useful statistics
    num_episodes = 300
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    agent = Agent(env.action_space.n, scaler, featurizer,
                  env.observation_space.sample(), epsilon=0, gamma=1)

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{} ({})".format(
            i_episode + 1, num_episodes,
            stats.episode_rewards[i_episode - 1]), end="")
        sys.stdout.flush()

        state = env.reset()
        action = agent.set_initial_state(state)

        for t in itertools.count():
            next_state, reward, done, info = env.step(action)
            action = agent.act(next_state, reward)

            # book-keeping
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

    env.close()
    # gym.upload(outdir, api_key='sk_9YxUhFDaT5XSahcLut47w')

    plotting.plot_cost_to_go_mountain_car(env, agent.Q)
    plotting.plot_episode_stats(stats, smoothing_window=25)
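# A minimal sketch of the Agent driving the loop above; this is hypothetical,
# not the original class, but it matches the calls made there:
# set_initial_state() returns the first action, and act(next_state, reward)
# applies a Q-learning update to the stored transition before choosing the
# next action. The Q attribute plotted at the end only needs a predict()
# method, so here it is simply the agent itself. Assumes one sklearn
# SGDRegressor per action over the scaled RBF features, as in the Estimator
# sketch further above.
import numpy as np
from sklearn.linear_model import SGDRegressor


class Agent:
    def __init__(self, n_actions, scaler, featurizer, init_state,
                 epsilon=0, gamma=1):
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.scaler = scaler
        self.featurizer = featurizer
        init_features = self._featurize(init_state)
        self.models = []
        for _ in range(n_actions):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([init_features], [0])  # enables predict() immediately
            self.models.append(model)
        self.Q = self  # predict() is all plot_cost_to_go_mountain_car needs

    def _featurize(self, state):
        return self.featurizer.transform(self.scaler.transform([state]))[0]

    def predict(self, state):
        features = self._featurize(state)
        return np.array([m.predict([features])[0] for m in self.models])

    def _choose_action(self, state):
        # Epsilon-greedy; purely greedy when epsilon=0, as above
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.predict(state)))

    def set_initial_state(self, state):
        self.state = state
        self.action = self._choose_action(state)
        return self.action

    def act(self, next_state, reward):
        # Q-learning: bootstrap from the greedy next action
        td_target = reward + self.gamma * np.max(self.predict(next_state))
        self.models[self.action].partial_fit(
            [self._featurize(self.state)], [td_target])
        self.state = next_state
        self.action = self._choose_action(next_state)
        return self.action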
def main():
    matplotlib.style.use('ggplot')

    env = gym.envs.make("MountainCar-v0")
    num_episodes = 100

    estimator_q_learning = tile_coding_estimator.Estimator(env)
    statistics_q_learning = plotting.EpisodeStats(
        "q_learning",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic", which
    # leads to the exploration of all states.
    q_learning_tile_coding.q_learning(env, estimator_q_learning, num_episodes,
                                      statistics_q_learning, epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_q_learning)

    estimator_sarsa = tile_coding_estimator.Estimator(env)
    statistics_sarsa = plotting.EpisodeStats(
        "sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Same note as above: epsilon can stay at 0.0 thanks to the optimistic
    # initial estimates.
    sarsa_tile_coding.sarsa(env, estimator_sarsa, num_episodes,
                            statistics_sarsa, epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_sarsa)

    estimator_expected_sarsa = tile_coding_estimator.Estimator(env)
    statistics_expected_sarsa = plotting.EpisodeStats(
        "expected_sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # Same note as above.
    expected_sarsa_tile_coding.expected_sarsa(env, estimator_expected_sarsa,
                                              num_episodes,
                                              statistics_expected_sarsa,
                                              epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_expected_sarsa)

    plotting.plot_episode_stats(
        [statistics_q_learning, statistics_sarsa, statistics_expected_sarsa],
        smoothing_window=25)
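# A minimal sketch of a tile-coding estimator like the tile_coding_estimator
# module used above; this is hypothetical, not the original. Several offset
# grid "tilings" discretize the 2-D Mountain Car state, and Q(s, a) is the
# sum of one learned weight per active tile. Zero-initialized weights are
# optimistic here (every Mountain Car reward is -1), which is why epsilon
# can stay at 0.0 above.
import numpy as np


class TileCodingEstimator:
    def __init__(self, env, num_tilings=8, bins=8, alpha=0.1):
        self.low = env.observation_space.low
        self.high = env.observation_space.high
        self.bins = bins
        # Step size is split across the tilings
        self.alpha = alpha / num_tilings
        # Each tiling is shifted by a different fraction of one tile width
        self.offsets = [i / num_tilings for i in range(num_tilings)]
        self.weights = np.zeros((env.action_space.n, num_tilings, bins, bins))

    def _active_tiles(self, state):
        # Rescale the 2-D state into tile units, then index each offset grid
        scaled = (state - self.low) / (self.high - self.low) * self.bins
        for tiling, offset in enumerate(self.offsets):
            x, y = np.clip((scaled + offset).astype(int), 0, self.bins - 1)
            yield tiling, x, y

    def predict(self, state):
        # Q(s, .) is the sum of one weight per active tile
        q = np.zeros(self.weights.shape[0])
        for tiling, x, y in self._active_tiles(state):
            q += self.weights[:, tiling, x, y]
        return q

    def update(self, state, action, target):
        # Semi-gradient SGD step toward the TD target
        error = target - self.predict(state)[action]
        for tiling, x, y in self._active_tiles(state):
            self.weights[action, tiling, x, y] += self.alpha * error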
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            # plt.figure()
            plt.clf()
            plt.imshow(env.render(mode='rgb_array'))

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward), end="")
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats


estimator = Estimator()

# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic", which
# leads to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
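# For context, a sketch of the kind of epsilon-greedy helper that would
# produce the next_action_probs referenced in the commented-out lines above.
# The function name is hypothetical; only estimator.predict() is assumed
# from the snippet.
import numpy as np


def make_epsilon_greedy_policy(estimator, epsilon, num_actions):
    def policy_fn(state):
        # Spread epsilon mass uniformly, put the remainder on the greedy action
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        best_action = np.argmax(estimator.predict(state))
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn


# Q-learning (the active code) bootstraps off-policy from the greedy action:
#     td_target = reward + discount_factor * np.max(q_values_next)
# whereas the commented-out SARSA-style variant samples next_action from
# next_action_probs and bootstraps from that sampled action instead.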
fig = None
final_stats = None
for episode, t, stats in q_learning(
        env,
        q_estimator=q_estimator,
        target_estimator=target_estimator,
        update_target_estimator_every=10000,
        num_episodes=5000,
        epsilon_start=1,
        epsilon_end=0,
        epsilon_decay_steps=500000):
    final_stats = stats

    if episode % 50 == 0:
        if fig is not None:
            plt.close()
        fig = plotting.plot_cost_to_go_mountain_car(env, q_estimator,
                                                    block=False)
    if episode % 500 == 0:
        q_estimator.save(save_directory)
        run_episode(env, q_estimator, render=False)

log_episode_stats(get_empty_data_file("stats.csv"), final_stats)

# plotting.plot_cost_to_go_mountain_car(env, q_estimator)
# plotting.plot_episode_stats(final_stats, smoothing_window=25)

while True:
    run_episode(env, q_estimator, render=True)
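# For context on the q_estimator / target_estimator pair driven above
# (DQN-style): the online q_estimator is updated every step, while every
# update_target_estimator_every steps the target estimator is overwritten
# with a copy of its parameters, so the bootstrap target
#     reward + gamma * np.max(target_estimator.predict(next_state))
# moves slowly and stabilizes training. A minimal sketch, assuming both
# estimators keep their parameters in a plain numpy array named weights
# (a hypothetical attribute, not confirmed by the snippet):
import numpy as np


def copy_estimator_parameters(q_estimator, target_estimator):
    # Snapshot the online network into the slowly-moving target network
    target_estimator.weights = q_estimator.weights.copy()


# epsilon_start=1, epsilon_end=0, epsilon_decay_steps=500000 suggest a linear
# exploration schedule over global steps, e.g.:
epsilons = np.linspace(1.0, 0.0, 500000)
# epsilon at global step t: epsilons[min(t, 500000 - 1)]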
    def plot(self, stats):
        plotting.plot_cost_to_go_mountain_car(self.env, self.estimator)
        plotting.plot_episode_stats(stats, smoothing_window=25)
        # update current state
        state = new_state

        if terminated:
            break

    return stats


# In[122]:


estimator = Estimator()


# In[123]:


# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic", which
# leads to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)


# In[124]:


plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print(
                f'Step {t} @ Episode {i_episode + 1}/{num_episodes} ({last_reward})'
            )

            if done:
                break

            state = next_state

    return stats


if __name__ == "__main__":
    estimator = Estimator()

    # Mountain car does not need epsilon > 0
    # Initial estimate for all states is too "optimistic"
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    # Plotting
    plotting.plot_cost_to_go_mountain_car(env, estimator,
                                          name='q_learning_fn_estimator')
    plotting.plot_episode_stats(stats,
                                name='q_learning_fn_estimator',
                                smoothing_window=25,
                                noshow=True)