Example #1
import itertools
import sys

import utility  # project helper module providing the epsilon-greedy policy and action-selection helpers


def sarsa(env,
          estimator,
          num_episodes,
          statistics,
          discount_factor=1.0,
          epsilon=0.1,
          epsilon_decay=1.0):
    """
    sarsa algorithm for on-policy TD control using Function Approximation.
    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        discount_factor: Lambda time discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor

    Returns:

    """

    for i_episode in range(num_episodes):
        # The policy we're following
        e_greedy_policy = utility.make_epsilon_greedy_policy_with_fa(
            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)

        # Track the previous episode's reward for the progress line printed
        # inside the step loop below.
        last_reward = statistics.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        observation = env.reset()
        action = utility.make_decision(e_greedy_policy, observation)
        for t in itertools.count():
            next_observation, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            next_action = utility.make_decision(e_greedy_policy,
                                                next_observation)
            q_values_next = estimator.predict(next_observation, next_action)

            # Do not bootstrap from a terminal state; its value is zero by definition.
            td_target = reward if done else reward + discount_factor * q_values_next

            # Update the function approximator using our target
            estimator.update(observation, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward),
                  end="")

            if done:
                break
            action = next_action
            observation = next_observation
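A minimal usage sketch for sarsa above, which updates estimator and statistics in place rather than returning anything. The ToyEstimator class, the EpisodeStats namedtuple, and the MountainCar-v0 environment below are illustrative assumptions, not part of the original project; any object exposing the predict(state, action) / update(state, action, target) interface that sarsa calls will do. Running it still assumes the project's utility helpers imported above are available.

import collections

import gym
import numpy as np


class ToyEstimator:
    """Illustrative estimator over coarsely discretized observations.

    It only models the predict/update interface used by sarsa(); the original
    project presumably plugs in a real function approximator here.
    """

    def __init__(self, n_actions, alpha=0.1):
        self.q = collections.defaultdict(float)
        self.n_actions = n_actions
        self.alpha = alpha

    def _key(self, observation, action):
        return (tuple(np.round(observation, 1)), action)

    def predict(self, observation, action=None):
        if action is None:
            # Some policy helpers ask for the values of all actions at once.
            return np.array([self.q[self._key(observation, a)]
                             for a in range(self.n_actions)])
        return self.q[self._key(observation, action)]

    def update(self, observation, action, target):
        key = self._key(observation, action)
        self.q[key] += self.alpha * (target - self.q[key])


EpisodeStats = collections.namedtuple("EpisodeStats",
                                      ["episode_lengths", "episode_rewards"])

env = gym.make("MountainCar-v0")
num_episodes = 200
stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                     episode_rewards=np.zeros(num_episodes))

sarsa(env, ToyEstimator(env.action_space.n), num_episodes, stats,
      discount_factor=1.0, epsilon=0.1, epsilon_decay=0.99)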
Example #2
import itertools
import sys
from collections import defaultdict

import numpy as np

import utility  # project helper module providing the epsilon-greedy policy and action-selection helpers


def q_learning(env, num_episodes, statistics, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Lambda time discount factor.
        alpha: TD learning rate.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.

    Returns:
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    e_greedy_policy = utility.make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment

        for t in itertools.count():
            # Take a step
            action = utility.make_decision(e_greedy_policy, state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * \
                Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q
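The tabular examples on this page rely on two helpers from the project's utility module that are not shown here. The sketch below is a minimal guess at what they look like, based only on how they are called above; the actual signatures in the original repository may differ. The make_epsilon_greedy_policy_with_fa variant used in Example #1 is assumed to do the same thing, except that it reads action values from estimator.predict(state) instead of indexing Q[state].

import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function mapping a state to epsilon-greedy action probabilities."""
    def policy_fn(state):
        # Give every action a base probability of epsilon / nA ...
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        # ... and the remaining (1 - epsilon) mass to the greedy action.
        best_action = np.argmax(Q[state])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn


def make_decision(policy, state):
    """Sample an action index from the policy's probabilities for this state."""
    action_probs = policy(state)
    return np.random.choice(len(action_probs), p=action_probs)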
Example #3
import itertools
import sys
from collections import defaultdict

import numpy as np

import utility  # project helper module providing the epsilon-greedy policy and action-selection helpers


def run_episode(env, greedy_policy):
    observation = env.reset()
    for t in itertools.count():
        env.render()
        action = utility.make_decision(greedy_policy, observation)
        ob, reward, done, info = env.step(action)
        if done:
            break
        observation = ob
def mc_control_epsilon_greedy(env, num_episodes, statistics, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo control using epsilon-greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI Gym environment.
        num_episodes: Number of episodes to sample.
        statistics: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        Q, a dictionary mapping state -> action values. The epsilon-greedy policy
        is improved implicitly as Q changes, and the statistics object is updated in place.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    policy = utility.make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        episode = []
        state = env.reset()
        for t in itertools.count():
            action = utility.make_decision(policy, state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Find all (state, action) pairs we've visited in this episode.
        # Each pair is stored as a tuple so it can be used as a dict key.
        state_action_in_episode = set([(x[0], x[1]) for x in episode])
        for state, action in state_action_in_episode:
            state_action = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0] == state and x[1] == action)
            # Sum up all discounted rewards from the first occurrence onwards
            G = sum([x[2] * (discount_factor**i)
                     for i, x in enumerate(episode[first_occurrence_idx:])])
            # Calculate the average return for this (state, action) pair over all sampled episodes
            returns_sum[state_action] += G
            returns_count[state_action] += 1.0
            Q[state][action] = returns_sum[state_action] / \
                returns_count[state_action]

        # The policy is improved implicitly as the Q dictionary changes

    return Q
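# Worked illustration of the first-visit return computed in the loop above
# (the numbers are illustrative, not taken from the original project): if a
# (state, action) pair first occurs at an index whose remaining rewards are
# [0, 0, 1] and discount_factor is 0.9, then
#
#     G = sum(r * 0.9**i for i, r in enumerate([0, 0, 1]))  # == 0.81
#
# and Q[state][action] becomes the running average of such returns across
# all sampled episodes, via returns_sum and returns_count.
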
def expected_sarsa(env, num_episodes, statistics, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    Expected SARSA algorithm: on-policy TD control. Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI Gym environment.
        num_episodes: Number of episodes to run for.
        statistics: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        q, the optimal action-value function: a dictionary mapping state -> action values.
        The statistics object is updated in place.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    e_greedy_policy = utility.make_epsilon_greedy_policy(q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action
        observation = env.reset()

        # One step in the environment
        for t in itertools.count():
            # Take a step
            action = utility.make_decision(e_greedy_policy, observation)
            next_observation, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            # Expected SARSA target: expectation of Q over the epsilon-greedy
            # action distribution in the next state. Use a dedicated loop
            # variable so the action taken this step is not overwritten.
            expected_next_q = 0.0
            next_action_probs = e_greedy_policy(next_observation)
            for next_action, next_action_prob in enumerate(next_action_probs):
                expected_next_q += next_action_prob * q[next_observation][next_action]

            td_target = reward + discount_factor * expected_next_q

            td_delta = td_target - q[observation][action]
            q[observation][action] += alpha * td_delta

            if done:
                break

            observation = next_observation

    return q
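A brief usage sketch for the tabular functions above. The EpisodeStats namedtuple and the FrozenLake-v0 environment are illustrative assumptions rather than part of the original project; any Gym environment with hashable observations and a discrete action space fits the defaultdict-based Q table.

import collections

import gym
import numpy as np

EpisodeStats = collections.namedtuple("EpisodeStats",
                                      ["episode_lengths", "episode_rewards"])


def new_stats(num_episodes):
    # One entry per episode; filled in incrementally by the learning functions.
    return EpisodeStats(episode_lengths=np.zeros(num_episodes),
                        episode_rewards=np.zeros(num_episodes))


env = gym.make("FrozenLake-v0")
num_episodes = 5000

q_expected = expected_sarsa(env, num_episodes, new_stats(num_episodes),
                            discount_factor=0.99, alpha=0.5, epsilon=0.1)
q_learned = q_learning(env, num_episodes, new_stats(num_episodes),
                       discount_factor=0.99, alpha=0.5, epsilon=0.1)

# Either table can then back a greedy policy, e.g. np.argmax(q_expected[state]).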