Example #1

import gym

# QLearning and Plot are helpers defined elsewhere in this project;
# a plausible QLearning sketch follows this example.


def q_learning_greedy_probability_policy():

    env = gym.make('FrozenLake-v0')
    q_learning = QLearning(env.action_space.n,
                           env.observation_space.n,
                           epsilon=0.1,
                           learning_rate=0.1)
    # Optimistic initial values: action 1 (DOWN in FrozenLake) starts highest,
    # so the greedy choice is biased toward it before any learning happens.
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])
    episode_rewards = []
    overall_reward = 0.0
    for i_episode in range(7000):

        # Starting a new episode: reset the environment and the episode stats
        observation = env.reset()
        accumulated_reward = 0.0

        for t in range(100):

            # Show current state
            # env.render()

            # Choose action based on current experience
            action = q_learning.greedy_probability_policy(observation)

            # Remember the previous state, then step the environment with the chosen action
            previous_observation = observation
            observation, reward, done, info = env.step(action)

            # Accumulate more reward
            accumulated_reward += reward

            # Train algorithm based on new experience
            q_learning.update_state_action_function(previous_observation,
                                                    action, reward,
                                                    observation)

            # The episode ends when the agent reaches the goal or falls into a hole
            if done:
                print "Episode finished after {} timesteps".format(t + 1)
                print "Total reward for episode %i: %i" % (i_episode,
                                                           accumulated_reward)
                overall_reward += accumulated_reward
                episode_rewards.append(accumulated_reward)
                break

    plot = Plot()
    plot.plot_evolution(episode_rewards)
    print(q_learning.q_table)
    q_learning.write_q_function_dump()
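
Neither example shows where QLearning or Plot comes from; they are helpers
defined elsewhere in the project. Below is a minimal sketch of what such a
QLearning class could look like, assuming greedy_probability_policy performs
epsilon-greedy action selection and update_state_action_function applies the
standard tabular Q-learning update. The constructor signature is inferred from
Example #1's keyword arguments; under that reading, the positional 0.9 in
Example #2 would be a (very high) exploration rate, though in the original it
may just as well be a discount factor. Plot is omitted here; any line plot of
episode_rewards (e.g. with matplotlib) would do.

import numpy as np


class QLearning(object):
    """Minimal tabular Q-learning agent; a sketch, not the original class."""

    def __init__(self, n_actions, n_states, epsilon=0.1,
                 learning_rate=0.1, discount_factor=0.99):
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((n_states, n_actions))

    def set_general_state_action_values(self, values):
        # Give every state the same initial action values, e.g.
        # [0.5, 1, 0.5, 0.5] to bias the greedy choice toward action 1.
        self.q_table[:] = np.asarray(values, dtype=float)

    def greedy_probability_policy(self, state):
        # Epsilon-greedy: explore with probability epsilon, else exploit.
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q_table[state]))

    def update_state_action_function(self, state, action, reward, next_state):
        # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = np.max(self.q_table[next_state])
        td_error = (reward + self.discount_factor * best_next
                    - self.q_table[state, action])
        self.q_table[state, action] += self.learning_rate * td_error

    def write_q_function_dump(self, path='q_table.npy'):
        # Persist the learned Q-table to disk (the path is hypothetical).
        np.save(path, self.q_table)
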
Example #2

import gym

# As in Example #1, QLearning is a project-local helper (see the sketch above).


def greedy_probability_policy():

    env = gym.make('FrozenLake-v0')
    # The meaning of the third positional argument (0.9) depends on the
    # QLearning constructor; with the sketch above it would be epsilon.
    q_learning = QLearning(env.action_space.n, env.observation_space.n, 0.9)
    q_learning.set_general_state_action_values([0.5, 1, 0.5, 0.5])

    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = q_learning.greedy_probability_policy(observation)
            observation, reward, done, info = env.step(action)
            if done:
                print "Episode finished after {} timesteps".format(t + 1)
                break
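
A porting note, not part of the original examples: both snippets target the
legacy gym API, where env.reset() returns only the observation and env.step()
returns four values. On gym >= 0.26 (or gymnasium), the equivalent calls are:

observation, info = env.reset()
observation, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated

FrozenLake-v0 has also been superseded by FrozenLake-v1 in newer releases.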