Example #1
def least_squares_td(env, policy, epsilon, alpha, gamma, n_episodes, tile_coder):
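    '''Least-squares TD (LSTD) prediction with tile-coded linear features;
    the inverse of A is updated incrementally via the Sherman-Morrison formula.'''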
    # Initialization.
    n = tile_coder.total_n_tiles
    A = (1/epsilon) * np.eye(n)
    b = np.zeros((n,1))
    d = np.zeros((n,1))

    for episode in range(n_episodes):
        done = False
        obs = env.reset()

        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            x = np.array(tile_coder.get_tile_code(obs)).reshape(-1,1)
            x_prime = np.array(tile_coder.get_tile_code(obs_prime)).reshape(-1,1)
            b = b + reward * x
            d = (x - gamma * x_prime)
            A = A + x @ d.T
            if env.steps == 2:
                inv_A = np.linalg.inv(A)
            else:
                t = np.eye(n) - (((x @ d.T)/(1 + ((d.T @ inv_A) @ x))) @ inv_A)
                inv_A = inv_A @ t
            theta = inv_A @ b
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    v = LinearValueFunction(tile_coder.total_n_tiles)
    v.weights = theta.flatten()
    return v
Example #2
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th, lambda_w, \
                                    gamma, n_episodes):
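    '''Actor-critic with eligibility traces in the average-reward setting
    (softmax actor, linear state-value critic).'''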

    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(
                obs_vec)
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
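    '''One-step actor-critic with a softmax actor and a linear state-value critic.'''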
    policy = ExponentialSoftmax(env.observation_space_size*env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                       env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I  * delta * obs_vec
            policy.weights += alpha_th  * I * delta * policy.eligibility_vector(a,
            sa_pairs)
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
Example #4
def gradient_mc_prediction(env, policy, alpha, n_episodes, tile_coder):
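    '''Gradient Monte Carlo prediction with a linear value function over
    tile-coded features.'''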
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        # Reset the episode history and store the feature vector
        # representation of the starting state.
        states = [tile_coder.get_tile_code(obs)]
        rewards = [None]
        feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                          env.action_space_size)
        a = policy.greedy_action(feature_vectors)

        while not done:
            obs, reward, done = env.step(a)
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            rewards.append(reward)
            states.append(tile_coder.get_tile_code(obs))

        for i in range(len(states)):
            G = np.sum(rewards[i + 1:])
            # Update weights.
            v.weights += alpha * np.dot((G - v.evaluate(states[i])), states[i])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v
Example #5
def sarsa(env, gamma, alpha, epsilon, n_episodes):
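    '''Tabular Sarsa (on-policy TD control) with a decaying epsilon-greedy policy.'''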
    # Create iterators.
    sa_pairs = product(range(70), range(4))
    # Initialize state-action value function.
    Q = dict.fromkeys(sa_pairs, 0.0)

    epsilon_start = epsilon
    decay = lambda x: x - (10/n_episodes)*epsilon_start if \
            x - (10/n_episodes)*epsilon_start > 1e-4 else 1e-4

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(action)
            action_prime = eps_greedy_policy(Q, obs_prime, epsilon, \
                                             env.action_space_size)
            # Update state-action value estimate.
            Q[obs,action] += alpha * (reward + gamma * \
                             (Q[obs_prime, action_prime]) - Q[obs, action])
            obs = obs_prime
            action = action_prime

        # Decay epsilon.
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q
def off_policy_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes):
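    '''Off-policy n-step Sarsa: a uniformly random behaviour policy generates
    experience; the greedy target policy is learned via importance sampling.'''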
    # Initialize target policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
                       range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    states = np.zeros(n)
    actions = np.zeros(n)
    rewards = np.zeros(n)

    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = np.random.randint(4)
        states[0] = obs
        actions[0] = action
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % n] = obs_prime
                rewards[(t + 1) % n] = reward
                if done:
                    T = t + 1
                else:
                    action = np.random.randint(4)
                    actions[(t + 1) % n] = action
            tau = t - n + 1
            if tau > -1:
                p = 1
                for i in range(tau + 1, min(tau + n, T)):
                    s = states[i % n]
                    a = actions[i % n]
                    policy_proba = eps_greedy_proba(policy, s, a, epsilon)
                    # 0.25 constant used as behaviour policy acts randomly.
                    p *= policy_proba / 0.25
                G = np.sum([gamma**(i-tau-1)*rewards[i%n] for i in \
                            range(tau+1, min(tau+n,T))])
                if tau + n < T:
                    s = states[(tau + n) % n]
                    a = actions[(tau + n) % n]
                    G += gamma**n * Q[s, a]
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value estimate of the target policy.
                Q[s, a] += alpha * p * (G - Q[s, a])
                # Make target policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
Example #7
def REINFORCE(env, alpha, gamma, n_episodes):
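    '''REINFORCE (Monte Carlo policy gradient) with an exponential softmax policy.'''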
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
        env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            G_t = sum(rewards[t + 1:])
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha * (gamma ** t) * G_t * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def semi_gradient_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes, \
                               tile_coder, action_len, stop_threshold):
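    '''Episodic semi-gradient n-step Sarsa with a linear action-value function
    over tile-coded features.'''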
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    states = [None] * n
    actions = np.zeros(n)
    rewards = np.zeros(n)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T-1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t+1)%n] = reward
                states[(t+1)%n] = obs_prime
                if done:
                    T = t+1
                else:
                    a = eps_greedy_func_policy(q, obs_prime, epsilon, \
                        tile_coder, env.action_space_size)
                    actions[(t+1)%n] = a
            tau = t-n+1
            if tau > -1:
                # Calculate n-step return.
                G = np.sum([gamma**(i-tau-1)*rewards[i%n] \
                    for i in range(tau+1, min(tau+n,T))])
                if tau + n < T:
                    s = states[(tau+n)%n]
                    a = actions[(tau+n)%n]
                    x = tile_coder.get_feature_vector(s, a)
                    G += gamma**n * q.evaluate(x)
                s = states[tau%n]
                a = actions[tau%n]
                x = tile_coder.get_feature_vector(s, a)
                # Update weights.
                q.weights += alpha * (np.dot((G - q.evaluate(x)),x))
            t += 1
        print_episode(episode, n_episodes)
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
def n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes):
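    '''Tabular on-policy n-step Sarsa with an epsilon-greedy policy.'''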
    # Initialize state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
                       range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    states = np.zeros(n)
    actions = np.zeros(n)
    rewards = np.zeros(n)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        t = 0
        T = np.inf
        states[t] = obs
        a = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        actions[t] = a

        tau = -1
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t + 1) % n] = reward
                states[(t + 1) % n] = obs_prime
                if done:
                    T = t + 1
                else:
                    a = eps_greedy_policy(Q, obs_prime, epsilon,\
                                          env.action_space_size)
                    actions[(t + 1) % n] = a
            tau = t - n + 1
            if tau > -1:
                G = np.sum([
                    gamma**(i - tau - 1) * rewards[i % n]
                    for i in range(tau + 1, min(tau + n, T))
                ])
                if tau + n < T:
                    state = states[(tau + n) % n]
                    action = actions[(tau + n) % n]
                    G += gamma**n * Q[state, action]
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value estimate.
                Q[s, a] += alpha * (G - Q[s, a])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q
def differential_semi_gradient_n_step_sarsa(env, n, alpha, beta, epsilon, \
                        n_episodes, tile_coder, action_vec_dim, stop_threshold):
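    '''Differential semi-gradient n-step Sarsa for the average-reward setting.'''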
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim,
                     env.action_space_size)
    r_bar = 0
    states = [None] * n
    actions = np.zeros(n)
    rewards = np.zeros(n)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1

        while not done:
            obs, reward, done = env.step(a)
            states[(t + 1) % n] = obs
            rewards[(t + 1) % n] = reward
            a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                       env.action_space_size)
            actions[(t + 1) % n] = a
            tau = t - n + 1
            if tau > -1:
                x = tile_coder.get_feature_vector(states[tau % n],
                                                  actions[tau % n])
                x_n = tile_coder.get_feature_vector(states[(tau+n)%n], \
                                                    actions[(tau+n)%n])
                summ = np.sum(
                    [rewards[i % n] - r_bar for i in range(tau + 1, tau + n)])
                delta = summ + q.evaluate(x_n) - q.evaluate(x)
                r_bar += beta * delta
                q.weights += alpha * delta * x
            t += 1
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
Example #11
def off_policy_mc(env, gamma, b_policy, n_episodes):
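    '''Off-policy Monte Carlo control with weighted importance sampling for a
    Blackjack-style environment.'''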
    # Create required iterators.
    n_hands, n_dealer, usable = tuple(
        [env.observation_space[i].n for i in range(3)])
    state_space = product(range(n_hands), range(n_dealer), [True, False])
    it_states1, it_states2 = tee(state_space)
    action_space = range(2)
    sa_pairs = product(it_states1, action_space)
    it_pairs1, it_pairs2 = tee(sa_pairs)

    # Initialization
    Q = dict.fromkeys(it_pairs1, 0.0)
    C = dict.fromkeys(it_pairs2, 0.0)
    target = dict.fromkeys(it_states2, 0)

    # Solving for optimal policy.
    for episode in range(n_episodes):
        if episode % 10000 == 0:
            print_episode(episode, n_episodes)
        done = False
        obs = env.reset()
        states = []
        actions = []
        rewards = []

        # Generate an episode.
        while not done:
            action = b_policy(obs)
            states.append(obs)
            obs, reward, done, info = env.step(action)
            actions.append(action)
            rewards.append(reward)

        G = 0
        W = 1

        # Update action-value function.
        for t in range(len(states) - 1, -1, -1):
            G = gamma * G + rewards[t]
            s, a = states[t], actions[t]
            C[(s, a)] += W
            Q[(s, a)] += (W / C[(s, a)]) * (G - Q[(s, a)])
            action_values = [Q[s, i] for i in range(env.action_space.n)]
            target[s] = np.argmax(action_values)
            if a == target[s]:
                W *= (1 / 0.5)
            else:
                break
    print_episode(n_episodes, n_episodes)
    return target
def op_mc_control(env, n_episodes):
    '''On-policy first-visit Monte Carlo control algorithm.'''
    obs_space = product(range(1, 32), range(1, 11), [True, False])
    states = list(obs_space)
    sa_pairs = product(states, range(2))
    keys = list(sa_pairs)

    # Initialization.
    Q = {s: np.zeros((2)) for s in states}
    returns = {pair: [] for pair in keys}
    policy = {s[0]: 1 for s in keys}
    epsilon = 1.0

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        pairs = []

        # Generate an episode.
        while not done:
            action = policy[obs]
            pairs.append([obs, action])
            obs, reward, done, info = env.step(action)
        pairs.append((obs, policy[obs]))

        # Store returns for each state-action pair visited.
        for s, a in pairs:
            returns[s, a].append(reward)

        # Average returns for each state-action pair.
        for (s, a), G in returns.items():
            if len(G) > 0:
                Q[s][a] = np.mean(G)

        # Update policy (epsilon-greedy w.r.t action-value function).
        for s, _ in pairs:
            opt_a = np.argmax(Q[s])
            if np.random.uniform() < epsilon:
                policy[s] = env.action_space.sample()
            else:
                policy[s] = opt_a

        # Decay epsilon.
        epsilon = epsilon - 3 / n_episodes if epsilon > 0.1 else 0.1

        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def td_pred(env, policy, alpha, gamma, n_episodes):
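    '''Tabular TD(0) prediction of the state-value function for a fixed policy.'''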
    # Initialize state-value function.
    V = np.zeros(70)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            action = policy[obs]
            obs_prime, reward, done = env.step(action)
            # Update state-value estimate.
            V[obs] += alpha * (reward + gamma * V[obs_prime] - V[obs])
            obs = obs_prime
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V
Example #14
def mc_pred(env, policy, n_episodes):
    '''First-visit Monte Carlo prediction algorithm.'''
    hands = range(12, 22)
    dealer = range(1, 11)
    usable = [True, False]
    obs_space = product(hands, dealer, usable)

    # Initialization.
    keys = list(obs_space)
    V = dict.fromkeys(keys, 0)
    returns = {key:[] for key in keys}

    # For all hands less than 12 the player will hit to attain a hand in
    # the interval [12, 21]. Function prevents these states from being tracked
    # as optimal action already known (hit).
    is_valid = lambda x: True if x[0] > 11 and x[0] < 22 else False

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states = []
        if is_valid(obs):
            states.append(obs)

        # Generate an episode using given policy.
        while not done:
            action = policy[obs]
            obs, reward, done, info = env.step(action)
            if obs not in states and is_valid(obs):
                states.append(obs)

        # Append return that follows first occurrence of each state visited.
        for state in states:
            returns[state].append(reward)

        # Update state-value function.
        for state, G in returns.items():
            if len(G) > 0:
                V[state] = np.mean(G)

        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V
Example #15
def Q_learning(env, alpha, gamma, epsilon, n_episodes):
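    '''Tabular Q-learning (off-policy TD control) over (position, velocity)
    states with nine actions.'''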
    # Initialize state-action value function.
    Q = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            sa_pairs = product(states, range(9))
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in sa_pairs:
                Q[pair] = 0
        curr_row += row

    # Store rewards for plot.
    rewards = []
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1

    for episode in range(n_episodes):
        done = False
        val = 0
        obs = env.reset()

        while not done:
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            val += reward
            action_values = [Q[obs_prime, i] for i in range(9)]
            opt_a = np.argmax(action_values)
            # Update state-action value estimate.
            Q[obs,action] += alpha * (reward + gamma * Q[obs_prime,opt_a] \
                             - Q[obs,action])
            obs = obs_prime
        epsilon = decay(epsilon)
        rewards.append(val)
        if episode % 10 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)

    # Plot rewards over training process.
    create_line_plot(range(len(rewards)), rewards, 'Episode number:', \
                    'Return:', 'Agent returns over training:')
    return Q
Example #16
def sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
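    '''Sarsa(lambda) with binary state-action features and accumulating traces.'''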
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0, \
                     env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy_bin_features(q, obs, epsilon, \
                 env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(action)
            delta = reward
            sa_vec = encode_sa_pair(obs, action, env.observation_space_size, \
                                    env.action_space_size)
            idx_active = np.argwhere(sa_vec == 1)
            delta -= np.sum(q.weights[idx_active])
            # Accumulating traces.
            z[idx_active] += 1

            if done:
                # Update weights.
                q.weights += alpha * delta * z
            else:
                action_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                               env.observation_space_size, env.action_space_size)
                sa_prime_vec = encode_sa_pair(obs_prime, action_prime, \
                               env.observation_space_size, env.action_space_size)
                idx_active = np.argwhere(sa_prime_vec == 1)
                delta += gamma * np.sum(q.weights[idx_active])
                # Update weights.
                q.weights += alpha * delta * z
                # Update accumulating traces.
                z = gamma * lamda * z
                obs = obs_prime
                action = action_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q
Example #17
def double_Q(env, alpha, gamma, epsilon, n_episodes):
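    '''Tabular double Q-learning: two action-value estimates updated on
    alternating steps to reduce maximization bias.'''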
    # Initialize state-action value functions.
    Q_1 = {}
    Q_2 = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in product(states, range(9)):
                Q_1[pair] = 0
                Q_2[pair] = 0
        curr_row += row

    decay = lambda i,x: x/(i+1)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            a = double_Q_eps_greedy_policy(obs, Q_1, Q_2, epsilon)
            obs_prime, reward, done = env.step(a)
            # Update state-action value estimate.
            if np.random.uniform() < 0.5:
                action_vals = [Q_1[obs_prime, i] for i in range(9)]
                a_prime = np.argmax(action_vals)
                Q_1[obs, a] += alpha * (reward +gamma*Q_2[obs_prime, a_prime]\
                                        - Q_1[obs,a])
            else:
                action_vals = [Q_2[obs_prime, i] for i in range(9)]
                a_prime = np.argmax(action_vals)
                Q_2[obs, a] += alpha * (reward + gamma*Q_1[obs_prime, a_prime]\
                                        - Q_2[obs,a])
            obs = obs_prime
        epsilon = decay(episode, epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    Q = {key: Q_1[key] + Q_2[key] for key in Q_1}
    return Q
Example #18
def semi_gradient_td_zero(env, policy, alpha, gamma, n_episodes, tile_coder):
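    '''Semi-gradient TD(0) prediction with a linear value function over
    tile-coded features.'''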
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            s = tile_coder.get_tile_code(obs)
            s_prime = tile_coder.get_tile_code(obs_prime)
            # Update weights.
            v.weights += alpha * (np.dot((reward + gamma*v.evaluate(s_prime)- \
                                  v.evaluate(s)), s))
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
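    '''REINFORCE with a learned linear state-value baseline.'''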
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
        env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            G_t = sum(rewards[t + 1:])
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha_th * (gamma ** t) * delta * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))
Example #20
def semi_gradient_n_step_td(env, policy, n, alpha, gamma, n_episodes,
                            tile_coder):
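    '''Semi-gradient n-step TD prediction with a linear value function over
    tile-coded features.'''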
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    states = [None] * n
    rewards = np.zeros(n)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = tile_coder.get_tile_code(obs)
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T - 1:
            if t < T:
                feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                                  env.action_space_size)
                a = policy.greedy_action(feature_vectors)
                obs, reward, done = env.step(a)
                states[(t + 1) % n] = tile_coder.get_tile_code(obs)
                rewards[(t + 1) % n] = reward
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                # Calculate n-step return.
                G = np.sum([gamma**(i-tau-1)*rewards[i%n] for i in range(tau+1, \
                            min(tau+n, T))])
                if tau + n < T:
                    G += gamma**n * v.evaluate(states[(tau + n) % n])
                # Update weights.
                v.weights += alpha * np.dot((G-v.evaluate(states[tau%n])), \
                                             states[tau%n])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v
def n_step_td_pred(env, policy, n, alpha, gamma, n_episodes):
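    '''Tabular n-step TD prediction of the state-value function for a fixed policy.'''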
    # Initialize state-value function.
    V = np.zeros(env.observation_space_size)
    states = np.zeros(n)
    rewards = np.zeros(n)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        tau = -1
        t = 0
        T = np.inf

        while not done or tau != T - 1:
            if t < T:
                action = policy(obs)
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % n] = obs_prime
                rewards[(t + 1) % n] = reward
                obs = obs_prime
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                G = np.sum([gamma ** (i-tau-1) * rewards[i % n] for i in \
                            range(tau + 1, min(tau+n,T))])
                if tau + n < T:
                    state = int(states[(tau + n) % n])
                    G += gamma**n * V[state]
                state = int(states[tau % n])
                # Update state-value estimate.
                V[state] += alpha * (G - V[state])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V
def online_sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
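    '''True online Sarsa(lambda) with binary state-action features and dutch traces.'''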
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0,
                     env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_policy_bin_features(q, obs, epsilon, env.observation_space_size, \
                                           env.action_space_size)
        x = encode_sa_pair(obs, a, env.observation_space_size,
                           env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        Q_old = 0

        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                      env.observation_space_size, env.action_space_size)
            x_prime = encode_sa_pair(obs_prime, a_prime, env.observation_space_size, \
                                     env.action_space_size)
            Q = q.evaluate(x)
            # Value of the terminal next state is defined to be zero.
            Q_prime = 0 if done else q.evaluate(x_prime)
            delta = reward + gamma * Q_prime - Q
            # Update eligibility traces.
            z = gamma * lamda * z + (1 -
                                     alpha * gamma * lamda * np.dot(z, x)) * x
            # Update weights.
            q.weights += alpha * (delta + Q - Q_old) * z - alpha * (Q -
                                                                    Q_old) * x
            Q_old = Q
            x = x_prime
            a = a_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q
def semi_gradient_sarsa(env, alpha, gamma, epsilon, n_episodes, tile_coder,
                        action_len):
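    '''Episodic semi-gradient Sarsa with a linear action-value function over
    tile-coded features.'''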
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len,
                     env.action_space_size)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(a)
            x = tile_coder.get_feature_vector(obs, a)
            if done:
                # Update weights.
                q.weights += alpha * np.dot((reward - q.evaluate(x)), x)
            else:
                a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, \
                          tile_coder, env.action_space_size)
                x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
                # Update weights.
                q.weights += alpha * np.dot((reward + \
                             gamma * q.evaluate(x_prime) - q.evaluate(x)), x)
                obs = obs_prime
                a = a_prime
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance over training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
Example #24
def tabular_dyna_Q(env, alpha, gamma, epsilon, n_episodes, n):
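    '''Tabular Dyna-Q: direct Q-learning updates combined with n planning
    updates per real step using a learned deterministic model.'''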
    # Create iterators.
    sa_pairs = product(range(env.observation_space_size), \
                       range(env.action_space_size))
    pairs_one, pairs_two = tee(sa_pairs)

    # Initialize state-action value function and model.
    Q = dict.fromkeys(pairs_one, 0)
    model = {pair:(-1,-1) for pair in pairs_two}

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            # Acting, model-learning and direct RL.
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            max_Q = np.argmax([Q[obs_prime, i] for i in range(4)])
            Q[obs, action] += alpha * (reward + gamma * Q[obs_prime, max_Q] - Q[obs, action])
            model[obs, action] = (reward, obs_prime)
            obs = obs_prime

            # Q-planning algorithm.
            for i in range(n):
                possible_pairs = [(s,a) for s,a in list(model.keys()) if \
                                  (model[s,a] != (-1,-1))]
                idx = np.random.choice(len(possible_pairs))
                pair = possible_pairs[idx]
                s = pair[0]
                a = pair[1]
                r, s_prime = model[s,a]
                max_Q = np.argmax([Q[s, x] for x in range(4)])
                Q[s,a] += alpha * (r + gamma * Q[s_prime, max_Q] - Q[s,a])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q
def differential_semi_gradient_sarsa(env, alpha, beta, epsilon, n_episodes,\
                                     tile_coder, action_vec_dim, stop_threshold):
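    '''Differential semi-gradient Sarsa for the average-reward setting.'''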
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, tile_coder,\
                                       env.action_space_size)
            x = tile_coder.get_feature_vector(obs, a)
            x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
            delta = reward - r_bar + q.evaluate(x_prime) - q.evaluate(x)
            r_bar += beta * delta
            # Update weights.
            q.weights += alpha * delta * x
            obs = obs_prime
            a = a_prime
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
Example #26
def n_step_Q_sigma(env, n, alpha, gamma, epsilon, sigma, n_episodes):
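    '''Off-policy n-step Q(sigma) with a uniformly random behaviour policy and
    a greedy target policy.'''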
    # Initialize policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
                       range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    states = np.zeros(n)
    actions = np.zeros(n)
    Qs = np.zeros(n)
    deltas = np.zeros(n)
    pis = np.zeros(n)
    ratios = np.zeros(n)

    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = np.random.randint(4)
        states[0] = obs
        actions[0] = action
        Qs[0] = Q[obs, action]
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % n] = obs_prime
                if done:
                    T = t + 1
                    deltas[t % n] = reward - Qs[t % n]
                else:
                    action = np.random.randint(4)
                    actions[(t + 1) % n] = action
                    Qs[(t + 1) % n] = Q[obs_prime, action]
                    sample = gamma * Qs[(t + 1) % n]
                    expectation = gamma*np.sum([eps_greedy_proba(policy, \
                    obs_prime,i,epsilon)*Q[obs_prime, i] for i in range(4)])
                    deltas[t%n] = reward + sigma*sample + (1-sigma) *  \
                                  expectation - Qs[t%n]
                    pis[(t+1)%n] = eps_greedy_proba(policy, obs_prime, \
                                   action, epsilon)
                    ratios[(t + 1) % n] = pis[(t + 1) % n] / 0.25
            tau = t - n + 1
            if tau > -1:
                p = 1
                Z = 1
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % n] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % n])
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
Example #27
def mc_control(env, n_episodes):
    '''Monte Carlo control with Exploring Starts.'''
    # Create required iterators and lists.
    obs_space = product(range(12,22), range(1,11), [True, False])
    states = list(obs_space)
    sa_pairs = product(states, range(2))
    keys = list(sa_pairs)

    # Initialization.
    Q = {s:np.zeros((2)) for s in states}
    returns = {pair:[] for pair in keys}
    starting_sa_pairs = list(returns.keys())
    policy = get_init_policy()

    # Don't track hands where optimal action is known (action = 1 if hand < 12).
    is_valid = lambda x: True if x[0] > 11 and x[0] < 22 else False
    player = lambda x: [x,0]

    for episode in range(n_episodes):
        env.reset()
        # Select random starting state and action.
        rand = np.random.randint(len(starting_sa_pairs))
        (x, y, usable),a = starting_sa_pairs[rand]
        # Configure the environment to use exploring starts.
        env.player = player(x)
        env.dealer = player(y)
        # Used to store result of episode.
        done = a == 0
        episode_data = [starting_sa_pairs[rand]] if not done else []
        obs = starting_sa_pairs[rand][0]

        # Query environment if hold chosen as starting action.
        if done:
            obs, reward, done, info = env.step(a)
            episode_data.append((obs,a))
        else:
            # Hit chosen as starting action.
            while not done:
                a = policy[obs]
                obs, reward, done, info = env.step(a)
                if obs not in episode_data and is_valid(obs):
                    episode_data.append((obs,a))

        # Append return that follows first occurrence of each state-action pair.
        for obs, a in episode_data:
            returns[obs,a].append(reward)

        # Update action-value function.
        for pair, G in returns.items():
            if len(G) > 0:
                s,a = pair
                Q[s][a] = np.mean(G)

        # Update policy, make greedy w.r.t. action-value function.
        for s,ls in Q.items():
            policy[s] = np.argmax(ls)

        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def n_step_tree_backup(env, n, alpha, gamma, epsilon, n_episodes):
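    '''n-step tree backup: off-policy control that backs up expected values of
    untaken actions, requiring no importance sampling.'''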
    # Initialize policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size),\
                       range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    states = np.zeros(n)
    actions = np.zeros(n)
    Qs = np.zeros(n)
    deltas = np.zeros(n)
    pis = np.zeros(n)

    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        states[0] = obs
        actions[0] = action
        Qs[0] = Q[obs, action]
        t = -1
        tau = -1
        T = np.inf

        while not done or tau != T - 1:
            t += 1
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % n] = obs_prime
                if done:
                    T = t + 1
                    deltas[t % n] = reward - Qs[t % n]
                else:
                    deltas[t%n] = reward + gamma * \
                    np.sum([policy_proba(policy, obs_prime, i, epsilon) * \
                    Q[obs_prime, i] for i in range(4)]) - Qs[t%n]
                    action = eps_greedy_policy(Q, obs_prime, epsilon, \
                                               env.action_space_size)
                    Qs[(t + 1) % n] = Q[obs_prime, action]
                    pis[(t + 1) % n] = policy_proba(policy, obs_prime, action,
                                                    epsilon)
            tau = t - n + 1
            if tau > -1:
                Z = 1
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z *= gamma * pis[(k + 1) % n]
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * (G - Q[s, a])
                # Make policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
Example #29
def prioritized_sweeping(env, alpha, gamma, epsilon, theta, n_episodes, n):
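    '''Prioritized sweeping: planning updates are applied to state-action pairs
    in order of the magnitude of their expected value change.'''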
    # Create iterators.
    sa_pairs = product(range(env.observation_space_size), \
                       range(env.action_space_size))
    it_one, it_two = tee(sa_pairs)

    # Initialize state-action value function and model.
    Q = dict.fromkeys(it_one, 0)
    model = {pair: (0, 0) for pair in it_two}

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        q = []

        while not done:
            obs_prime, reward, done = env.step(action)
            model[obs, action] = (reward, obs_prime)
            opt_a = np.argmax([Q[obs_prime, i] for i in range(4)])
            P = abs(reward + gamma * Q[obs_prime, opt_a] - Q[obs, action])
            # Maintain priority queue of each state-action pair whose estimated
            # value changes nontrivially. Prioritized by size of change.
            if P > theta:
                # Negative P used to allow a min binary heap to be used.
                q.append((-P, (obs, action)))
            obs = obs_prime
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)

        counter = 0
        heapq.heapify(q)
        while len(q) > 0 and counter < n:
            counter += 1
            _, (s, a) = heapq.heappop(q)
            r, s_prime = model[s, a]
            opt_a = np.argmax([Q[s_prime, i] for i in range(4)])
            Q[s, a] += alpha * (r + gamma * Q[s_prime, opt_a] - Q[s, a])

            # Determine the effect the change of value has on predecessor state-
            # action pairs' values.
            for s_, a_ in env.get_predecessor_states(s):
                r_, _ = model[s_, a_]
                opt_a = np.argmax([Q[s, i] for i in range(4)])
                P = abs(r_ + gamma * Q[s, opt_a] - Q[s_, a_])

                # Add predecessor state-action pairs to priority queue if change
                # causes their value to change nontrivially.
                if P > theta:
                    # If state-action pair already in queue, keep only the
                    # higher priority entry.
                    ls = [i for i in q if i[1] == (s_, a_)]
                    if len(ls) > 0:
                        if ls[0][0] > -P:
                            q.remove(ls[0])
                            heapq.heapify(q)
                            heapq.heappush(q, ((-P, (s_, a_))))
                    else:
                        heapq.heappush(q, ((-P, (s_, a_))))

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q