Example no. 1
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:

    nA = env_spec.nA  # Number of actions
    nS = env_spec.nS  # Number of states

    gamma = env_spec.gamma  # Gamma

    Q = initQ.copy()  # start from the provided initial Q values
    C = np.zeros(shape=(nS, nA))

    for episode in range(len(trajs)):
        G = 0.0
        W = 1.0

        for t in range(len(trajs[episode]))[::-1]:

            state, action, reward, next_state = trajs[episode][t]
            G = gamma * G + reward
            C[state, action] += W
            Q[state, action] += (W / C[state, action]) * (G - Q[state, action])

            # Update the importance sampling ratio W for the preceding time step
            W = W * (pi.action_prob(state, action) /
                     bpi.action_prob(state, action))

            if W == 0:
                break

    return Q
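For reference, a minimal usage sketch (not part of the original assignment): ToyEnvSpec and UniformPolicy are hypothetical stand-ins for the course's EnvSpec and Policy classes, exposing only the attributes the function above relies on (nS, nA, gamma, action_prob).

import numpy as np

class ToyEnvSpec:
    # Hypothetical stand-in for EnvSpec with the attributes used above.
    def __init__(self, nS, nA, gamma):
        self.nS, self.nA, self.gamma = nS, nA, gamma

class UniformPolicy:
    # Hypothetical stand-in for Policy: equal probability for every action.
    def __init__(self, nA):
        self.nA = nA
    def action_prob(self, state, action):
        return 1.0 / self.nA
    def action(self, state):
        return np.random.randint(self.nA)

spec = ToyEnvSpec(nS=2, nA=2, gamma=0.9)
behavior = UniformPolicy(spec.nA)
target = UniformPolicy(spec.nA)
# Two short trajectories of (s_t, a_t, r_{t+1}, s_{t+1}) tuples.
trajs = [[(0, 1, 1, 1), (1, 0, 0, 0)], [(0, 0, 1, 1)]]
Q = off_policy_mc_prediction_weighted_importance_sampling(
    spec, trajs, behavior, target, np.zeros((spec.nS, spec.nA)))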
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################
    c = np.zeros([env_spec.nS, env_spec.nA])
    for episode in trajs:
        g = 0
        w = 1
        for t in range(len(episode) - 1, -1, -1):
            if w != 0:
                st, at, rt1, st1 = episode[t]
                g = env_spec.gamma * g + rt1
                c[st, at] += w
                initQ[st, at] += w / c[st, at] * (g - initQ[st, at])
                w *= pi.action_prob(st, at) / bpi.action_prob(st, at)
            else:
                break
    return initQ
Example no. 3
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    TD = env.TD
    R = env.R
    Q = np.zeros((env.spec.nS, env.spec.nA))

    while True:
        delta = 0
        for state in range(env.spec.nS):
            v = initV[state]
            sumV, sumQ = 0, 0
            for action in range(env.spec.nA):
                sumQ = sum(TD[state, action, next_state] *
                           (R[state, action, next_state] +
                            env.spec.gamma * initV[next_state])
                           for next_state in range(env.spec.nS))
                Q[state, action] = sumQ
                sumV += pi.action_prob(state, action) * sumQ
            initV[state] = sumV

            delta = max(delta, abs(v - initV[state]))
        if delta < theta:
            break

    return initV, Q
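The nested loops above implement the Bellman expectation backup. A vectorized single-sweep sketch of the same backup is shown below, assuming the same env.TD / env.R tensor shapes ([nS, nA, nS]) and pi.action_prob interface; value_prediction_sweep is a name introduced here, not part of the assignment. Note this is a synchronous (Jacobi) sweep, whereas the loop above updates the value estimates in place.

import numpy as np

def value_prediction_sweep(env, pi, V):
    # One synchronous Bellman expectation backup over all states (a sketch).
    nS, nA, gamma = env.spec.nS, env.spec.nA, env.spec.gamma
    # Q[s, a] = sum_s' TD[s, a, s'] * (R[s, a, s'] + gamma * V[s'])
    Q = np.sum(env.TD * (env.R + gamma * V[None, None, :]), axis=2)
    # pi_probs[s, a] = pi(a | s)
    pi_probs = np.array([[pi.action_prob(s, a) for a in range(nA)] for s in range(nS)])
    V_new = np.sum(pi_probs * Q, axis=1)
    return V_new, Q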
Example no. 4
def off_policy_mc_prediction_weighted_importance_sampling(
    env_spec:EnvSpec,
    trajs:Iterable[Iterable[Tuple[int,int,int,int]]],
    bpi:Policy,
    pi:Policy,
    initQ:np.array
) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################

    Q = np.array(initQ)
    C = np.zeros_like(Q)
    for traj in trajs:
        G = 0.0
        W = 1.0
        for step in reversed(traj):
            s = step[0]
            a = step[1]
            r = step[2]
            G = env_spec.gamma*G + r
            if W == 0:
                break
            C[s][a] += W
            Q[s][a] += (W/C[s][a])*(G-Q[s][a])
            W = W*pi.action_prob(s,a)/bpi.action_prob(s,a)
    
    return Q
def Glearning(env: EnvWithModel, rho: Policy, alpha: float, initG: np.array,
              beta: float, num_episodes: int):
    env_spec = env.spec
    nS, nA, gamma = env_spec.nS, env_spec.nA, env_spec.gamma
    G = initG.copy()
    pol = np.zeros((nS, nA))
    for state in range(nS):
        elts = [0 for i in range(nA)]
        for action in range(nA):
            elts[action] = rho.action_prob(state, action) * np.exp(
                -beta * G[state, action])
        pol[state] = np.array(elts) / np.sum(np.array(elts))
    pi = ArbitraryPolicy(nS, nA, pol)
    rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        totalReward = 0
        t = 0
        while not done:
            action = pi.action(state)
            new_state, reward, done = env.step(action)
            cost = -reward
            new_amount = 0
            for i in range(nA):
                new_amount += rho.action_prob(new_state, i) * np.exp(
                    -beta * G[new_state][i])
            update = cost - (gamma / beta) * np.log(new_amount)
            G[state][action] = (1 - alpha) * G[state][action] + alpha * update
            elts = [0 for action in range(nA)]
            for i in range(nA):
                elts[i] = rho.action_prob(state, i) * np.exp(
                    -beta * G[state][i])
            pol[state] = np.array(elts) / np.sum(np.array(elts))
            pi = ArbitraryPolicy(nS, nA, pol)
            state = new_state
            totalReward += reward * (gamma**t)
            t += 1
        rewards.append(totalReward)
    return G, pi, rewards
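Glearning assumes an ArbitraryPolicy class and an env.step that returns (next_state, reward, done); neither is shown in this collection. A minimal stand-in consistent with how ArbitraryPolicy is called here (a fixed [nS, nA] probability table with action_prob and action methods) might look like the sketch below; the actual course class may differ.

import numpy as np

class ArbitraryPolicy:
    # Hypothetical sketch: a tabular stochastic policy defined by an [nS, nA]
    # matrix of action probabilities, matching the calls made in Glearning.
    def __init__(self, nS, nA, prob_table):
        self.nS = nS
        self.nA = nA
        self.p = np.asarray(prob_table)

    def action_prob(self, state, action):
        return self.p[state, action]

    def action(self, state):
        return np.random.choice(self.nA, p=self.p[state])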
Example no. 6
def off_policy_mc_prediction_ordinary_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using ordinary importance
    # sampling (Hint: Sutton Book p. 109, every-visit implementation is fine)
    #####################

    Q = initQ.copy()
    N = np.zeros((env_spec.nS, env_spec.nA))
    for e in trajs:
        rho = 1
        G = 0

        # This loop moves backwards from the final step (s_{T-1}, a_{T-1}, r_T, s_T) to t = 0.
        # rho is the importance sampling ratio accumulated from step t+1 onwards.
        for t in range(len(e) - 1, -1, -1):
            e_t = e[t]  # transition at time step t of the current episode e
            s, a, r, s_prime = e_t
            G = r + env_spec.gamma * G
            N[s, a] += 1
            # No need to guard against a zero denominator: N[s, a] is always >= 1 here.
            Q[s, a] += (rho * G - Q[s, a]) / N[s, a]
            rho = rho * pi.action_prob(s, a) / bpi.action_prob(s, a)
    return Q
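The only difference from the weighted version is the denominator of the update: ordinary importance sampling divides by the visit count, weighted importance sampling by the accumulated ratios. A toy illustration with made-up numbers:

import numpy as np

# Returns observed under the behavior policy and their importance ratios rho
# (illustrative values only, not taken from the assignment).
G = np.array([1.0, 0.0, 2.0])
rho = np.array([0.5, 2.0, 1.5])

ordinary_estimate = np.sum(rho * G) / len(G)       # ordinary IS: divide by the sample count
weighted_estimate = np.sum(rho * G) / np.sum(rho)  # weighted IS: divide by the sum of the weights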
Example no. 7
def off_policy_mc_prediction_ordinary_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using ordinary importance
    # sampling (Hint: Sutton Book p. 109, every-visit implementation is fine)
    #####################

    nS = env_spec.nS
    nA = env_spec.nA
    gamma = env_spec.gamma
    Q = initQ
    tau = np.zeros((nS, nA))

    for eps_tr in trajs:
        G = 0
        W = 1
        for step in reversed(eps_tr):
            G = gamma * G + step[2]
            s = step[0]
            a = step[1]
            tau[s, a] = tau[s, a] + 1
            Q[s, a] = Q[s, a] + (W * G - Q[s, a]) / tau[s, a]
            W = W * pi.action_prob(s, a) / bpi.action_prob(s, a)

    return Q
Example no. 8
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################

    Q = initQ
    C = np.zeros((env_spec.nS, env_spec.nA))
    for epi in trajs:

        G = 0
        W = 1

        for (state, action, reward, next_state) in reversed(epi):

            if W != 0:
                G = (env_spec.gamma * G) + reward
                C[state][action] += W
                Q[state][action] += ((W / C[state][action]) *
                                     (G - Q[state][action]))
                W = W * (pi.action_prob(state, action) /
                         bpi.action_prob(state, action))

    return Q
Example no. 9
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec, trajs: Iterable[Iterable[Tuple[int, int, int,
                                                          int]]], bpi: Policy,
        pi: Policy, initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################

    C = np.zeros((env_spec.nS, env_spec.nA))
    Q = initQ.copy()
    for e in trajs:
        G = 0
        W = 1
        # This loop moves backwards from the final step to t = 0
        for t in range(len(e) - 1, -1, -1):
            if W == 0:  #If W is zero then Q is no longer updated
                break
            e_t = e[t]  #current time step t of current episode e
            s, a, r, s_prime = e_t
            G = r + env_spec.gamma * G
            C[s, a] += W  #C sums the weights used to update Q[s,a]
            Q[s, a] += (W / C[s, a]) * (G - Q[s, a])
            W = W * pi.action_prob(s, a) / bpi.action_prob(s, a)
    return Q
Example no. 10
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int,
                                                           int]]], bpi: Policy,
                            n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    Q = np.array(initQ)
    eps = 0.1
    pi = egreedy_policy(Q, eps)
    for traj in trajs:
        T = len(traj)
        for t in range(T + n - 1):
            tau = t - n + 1
            if tau >= 0:
                rho = 1
                G = 0
                for i in range(tau + 1, min(tau + n, T - 1) + 1):
                    rho *= pi.action_prob(traj[i][0],
                                          traj[i][1]) / bpi.action_prob(
                                              traj[i][0], traj[i][1])
                for i in range(tau + 1, min(tau + n, T) + 1):
                    G += env_spec.gamma**(i - tau - 1) * traj[i - 1][2]
                if tau + n < T:
                    G += env_spec.gamma**n * Q[traj[tau + n][0]][traj[tau + n][1]]
                Q[traj[tau][0]][traj[tau][1]] += alpha * rho * (
                    G - Q[traj[tau][0]][traj[tau][1]])
                pi = egreedy_policy(Q, eps)

    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm with importance
    # sampling (Hint: Sutton Book p. 149)
    #####################
    assignment = np.zeros(env_spec.nS)
    for i in range(env_spec.nS):
        assignment[i] = np.argmax(Q[i])
    return Q, optimal_policy(assignment)
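This solution relies on egreedy_policy(Q, eps) and optimal_policy(assignment) helpers that are not included in the collection. The sketches below are hypothetical stand-ins matching how they are called (both exposing action_prob and action); the actual course implementations would normally subclass the provided Policy base class.

import numpy as np

class egreedy_policy:
    # Hypothetical sketch: epsilon-greedy with respect to a fixed Q table.
    def __init__(self, Q, eps):
        self.Q = np.asarray(Q)
        self.eps = eps
        self.nA = self.Q.shape[1]

    def action_prob(self, state, action):
        greedy = np.argmax(self.Q[state])
        prob = self.eps / self.nA
        if action == greedy:
            prob += 1.0 - self.eps
        return prob

    def action(self, state):
        if np.random.rand() < self.eps:
            return np.random.randint(self.nA)
        return int(np.argmax(self.Q[state]))

class optimal_policy:
    # Hypothetical sketch: deterministic policy built from a per-state action assignment.
    def __init__(self, assignment):
        self.assignment = np.asarray(assignment, dtype=int)

    def action_prob(self, state, action):
        return 1.0 if action == self.assignment[state] else 0.0

    def action(self, state):
        return int(self.assignment[state])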
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    n_s = env.spec.nS
    n_a = env.spec.nA
    trans_mat = env.TD
    reward_matrix = env.R

    delta = theta
    v = initV
    q = np.zeros((n_s, n_a))
    while delta >= theta:
        delta = 0
        for s in range(n_s):
            current_state_val = v[s]
            result = 0
            for a in range(n_a):

                trans = trans_mat[s][a]
                sum_val = 0

                for i in range(len(trans)):

                    next_state = i
                    prob = trans[i]

                    sum_val += (prob * (reward_matrix[s][a][next_state] +
                                        (env.spec.gamma * v[next_state])))
                q[s][a] = sum_val

                result += pi.action_prob(s, a) * sum_val

            v[s] = result
            delta = max(delta, abs(v[s] - current_state_val))

    V = v
    Q = q
    return V, Q
Example no. 12
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]:

    nA = env.spec.nA        # Number of actions
    nS = env.spec.nS        # Number of states
    
    V = np.zeros(nS)

    R = env.R               # Reward function
    T = env.TD              # State transition function
    gamma = env.spec.gamma  # Gamma

    """
    Iteratively sweep through all states, 
    """
    while True:
        delta = 0
        for s in range(nS):
            value = 0
            for a in range(nA):
                prob = pi.action_prob(s, a)
                for sp in range(nS):
                    value += (prob * (T[s, a, sp] * (R[s, a, sp] + gamma * initV[sp])))
                V[s] = value
            delta = max(delta, abs(initV[s] - V[s]))
        
        """ Check for convergence """
        if delta < theta:
            break 
        
        initV = V.copy()        # Must make an explicit copy
    
    """ 
    With the values function having converged, we can now extract
    the optimal action probabilities for each state, using the 
    Bellman optimality equation.
    """
    Q = np.zeros(shape=(nS, nA))
    for s in range(nS):
        for a in range(nA):
            for sp in range(nS): 
                Q[s, a] += T[s,a,sp] * (R[s,a,sp] + gamma * V[sp])
    
    return V, Q
Example no. 13
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    
    nS = env.spec.nS
    nA = env.spec.nA
    gamma = env.spec.gamma
    TD = env.TD
    R = env.R
    V = initV
    Q = np.zeros((nS,nA))
    
    while True:
        delta = 0
        for i in range(nS):
            prevVal = V[i]
            action_sum_temp = 0
            for j in range(nA):
                act_pr = pi.action_prob(i,j)
                action_sum_temp = action_sum_temp + act_pr * sum(TD[i,j,:] * (R[i,j,:] + gamma*V))
            V[i] = action_sum_temp
            delta = max(delta, abs(V[i]-prevVal) )
            
        if delta<theta:
            break
    
    for s in range(nS):
        for a in range(nA):
            Q[s,a] = sum(TD[s,a,:] * (R[s,a,:] + gamma*V))
    
    return V, Q
Example no. 14
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inputs:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    V = initV.copy()  #Arbitrary except terminal states must be 0

    delta = theta
    while delta >= theta:
        delta = 0
        for s in range(env.spec.nS):
            v = V[s]
            # pi.action_prob(s, a): probability of action a in state s under pi
            # env.TD[s, a, :]: transition probabilities to each next state (size nS)
            # env.R[s, a, :]: rewards for each (s, a, next state) transition
            update = 0
            for a in range(env.spec.nA):
                update += pi.action_prob(s, a) * np.sum(
                    env.TD[s, a, :] * (env.R[s, a, :] + env.spec.gamma * V))
            V[s] = update
            delta = max(delta, np.abs(v - V[s]))
    Q = np.zeros((env.spec.nS, env.spec.nA))
    for s in range(env.spec.nS):
        for a in range(env.spec.nA):
            Q[s, a] = np.sum(env.TD[s, a, :] *
                             (env.R[s, a, :] + env.spec.gamma * V))
            #Note: summing over the actions in pi(a|s)*Q(s,a) gives V(s)
    return V, Q
Example no. 15
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """

    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    num_states = env.spec.nS
    num_actions = env.spec.nA
    V = np.array(initV)
    Q = np.zeros((num_states,num_actions))
    R = env.R
    TD = env.TD

    change = theta + 1
    while change > theta:
        change = 0 
        for i in range(num_states):
            old_v = V[i]
            new_v = 0
            for j in range(num_actions):
                sum_a = 0
                for k in range(num_states):
                    sum_a += TD[i,j,k]*(R[i,j,k]+env.spec.gamma*V[k])
                new_v += pi.action_prob(i,j)*sum_a
            change = max(change, abs(new_v-old_v))
            V[i] = new_v

    for i in range(num_states):
        for j in range(num_actions):
            for k in range(num_states):
                Q[i][j] += TD[i,j,k]*(R[i,j,k]+env.spec.gamma*V[k])     
    return V, Q
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int,
                                                           int]]], bpi: Policy,
                            n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """

    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm with importance
    # sampling (Hint: Sutton Book p. 149)
    #####################

    class PiStar(Policy):
        def __init__(self, optActionProb, optPolicy):
            self.optActionProb = optActionProb
            self.optPolicy = optPolicy

        def action_prob(self, state, action):
            return self.optActionProb[state, action]

        def action(self, state):
            return self.optPolicy[state]

    Q = initQ.copy()
    for e in trajs:  #Now we're inside the episode, a list of transitions (list of tuples)

        T = np.inf
        tau = -1
        R = np.zeros(len(e) + 1)  # index 0 unused since R_0 doesn't exist
        A = np.zeros(len(e), dtype=int)  # exactly one action per transition tuple
        S = np.zeros(len(e) + 1, dtype=int)
        t = 0
        while tau < (T - 1):  #Now we iterate each transition in this episode
            if t < T:
                #Extract transition information
                s, a, r, s_prime = e[t]
                S[t] = int(s)
                A[t] = int(a)
                R[t + 1] = r
                S[t + 1] = int(s_prime)
                #Check if S_t+1 is terminal
                if t == (len(e) - 1):
                    T = t + 1
                else:
                    A[t + 1] = e[t + 1][1]  #Store next action
            tau = int(t - n + 1)
            if tau >= 0:
                #Calculate rho (importance ratio) and G (return estimate)
                i = tau + 1
                rho = 1
                while i <= min(tau + n, T - 1):
                    #Calculate target policy pi probability (greedy policy = 1 if Q[S_i,] is max)
                    piprob = int(Q[S[i], A[i]] == np.max(Q[S[i], :]))
                    rho = rho * piprob / bpi.action_prob(S[i], A[i])
                    i += 1
                i = tau + 1
                G = 0
                while i <= min(
                        tau + n,
                        T):  #Compute sum of discounted rewards nsteps ahead
                    G += env_spec.gamma**(i - tau - 1) * R[i]
                    i += 1
                # Add the estimated Q value n steps ahead if termination is not reached within n steps
                if tau + n < T:
                    G += (env_spec.gamma**n) * Q[S[tau + n], A[tau + n]]
                Q[S[tau], A[tau]] += alpha * rho * (G - Q[S[tau], A[tau]])
            t += 1
    optActionProb = np.zeros((env_spec.nS, env_spec.nA))
    optPolicy = np.zeros(env_spec.nS)

    for s in range(env_spec.nS):
        a = np.argmax(Q[s, :])
        optActionProb[s, a] = 1
        optPolicy[s] = a

    pi = PiStar(optActionProb, optPolicy)

    return Q, pi
Example no. 17
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """

    Q = np.zeros((env.spec.nS, env.spec.nA))
    delta2 = float('inf')

    while delta2 >= theta:
        delta = 0.0

        for state in range(env.spec.nS):
            v = initV[state]
            piActionState = 0

            for action in range(env.spec.nA):
                probActionState = pi.action_prob(state, action)

                for sPrime in range(env.spec.nS):
                    # current reward for state
                    r = env.R[state, action, sPrime]
                    # bellman equation
                    piActionState += probActionState * env.TD[
                        state, action,
                        sPrime] * (r + env.spec.gamma * initV[sPrime])

            # update value prediction
            initV[state] = piActionState

            # update delta
            delta = max(delta, abs(v - initV[state]))
        delta2 = delta

    # update Q values
    delta2 = float('inf')
    while delta2 >= theta:
        delta = 0.0

        for state in range(env.spec.nS):
            for action in range(env.spec.nA):

                # old q value
                q = Q[state][action]
                stateActionValue = 0

                for sPrime in range(env.spec.nS):
                    r = env.R[state, action, sPrime]
                    stateActionValue += env.TD[state, action, sPrime] * (
                        r + env.spec.gamma * initV[sPrime])

                Q[state][action] = stateActionValue

                delta = max(delta, abs(q - Q[state][action]))
        delta2 = delta

    V = initV

    return V, Q
Example no. 18
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int,
                                                           int]]], bpi: Policy,
                            n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """

    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm with importance
    # sampling (Hint: Sutton Book p. 149)
    #####################

    class OptimalPolicy(Policy):
        def __init__(self, OptActionProb, OptAction):
            self.OptActionProb = OptActionProb
            self.OptAction = OptAction

        def action_prob(self, state, action):
            return self.OptActionProb[state, action]

        def action(self, state):
            return self.OptAction[state]

    nS = env_spec.nS
    nA = env_spec.nA
    gamma = env_spec.gamma
    Q = initQ
    gamma_vec = np.zeros(n + 1)
    gamma_vec[0] = 1
    OptActionProb = (1 / nA) * np.ones((nS, nA))
    OptAction = np.zeros(nS)
    for i in range(n):
        gamma_vec[i + 1] = gamma_vec[i] * gamma

    for eps_tr in trajs:
        T = len(eps_tr) - 1
        for tau, step_tr in enumerate(eps_tr):
            rho = 1
            if tau + n <= T:
                rewards = np.asarray(eps_tr[tau:tau + n])[:, 2]
                Qs_prime = Q[eps_tr[tau + n][0], eps_tr[tau + n][1]]
                G = sum(gamma_vec * np.append(rewards, Qs_prime))
                for i in range(n):
                    s = eps_tr[tau + i + 1][0]
                    a = eps_tr[tau + i + 1][1]
                    rho = rho * OptActionProb[s, a] / bpi.action_prob(s, a)
            else:
                rewards = np.asarray(eps_tr[tau:tau + n])[:, 2]
                rewlen = len(rewards)
                gamma_vec_modf = gamma_vec[0:rewlen]
                G = sum(gamma_vec_modf * rewards)
                for i in range(rewlen - 1):
                    s = eps_tr[tau + i + 1][0]
                    a = eps_tr[tau + i + 1][1]
                    rho = rho * OptActionProb[s, a] / bpi.action_prob(s, a)

            Q[step_tr[0],
              step_tr[1]] = Q[step_tr[0], step_tr[1]] + alpha * rho * (
                  G - Q[step_tr[0], step_tr[1]])
            s_a_vals = Q[step_tr[0], :]
            OptAction[step_tr[0]] = s_a_vals.argmax()
            #best_action = OptAction[s].astype(int)
            OptActionProb[step_tr[0], :] = 0
            OptActionProb[step_tr[0], OptAction[step_tr[0]].astype(int)] = 1

    pi = OptimalPolicy(OptActionProb, OptAction)

    return Q, pi
Example no. 19
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int,
                                                           int]]], bpi: Policy,
                            n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """

    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm with importance
    # sampling (Hint: Sutton Book p. 149)
    #####################
    Q = initQ
    policy = QPolicy(Q)

    for epi in trajs:
        T = float('inf')
        tau = 0
        t = 0
        reward = []
        state = []
        action = []
        state.append(epi[t][0])
        action.append(epi[t][1])
        while tau != T - 1:

            if t < T:

                reward.append(epi[t][2])
                state.append(epi[t][3])

                if t == len(epi) - 1:
                    T = t + 1
                else:
                    action.append(epi[t + 1][1])

            tau = t - n + 1
            if tau >= 0:
                G = 0
                rho = 1
                for i in range(tau + 1, min(tau + n, T) + 1):
                    G += (env_spec.gamma**(i - tau - 1)) * reward[i - 1]

                for j in range(tau + 1, min(tau + n, T - 1) + 1):
                    rho = rho * (policy.action_prob(state[j], action[j]) /
                                 bpi.action_prob(state[j], action[j]))

                if tau + n < T:
                    G += ((env_spec.gamma**n) *
                          Q[state[tau + n]][action[tau + n]])

                Q[state[tau]][action[tau]] += (
                    alpha * rho * (G - Q[state[tau]][action[tau]]))

            t += 1

    pi = policy
    return Q, pi
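QPolicy is likewise an external class not shown here. A minimal greedy stand-in consistent with the policy.action_prob(state, action) calls above is sketched below (hypothetical, not the course-provided implementation). Because it keeps a reference to Q, it automatically reflects the in-place Q updates made in the loop.

import numpy as np

class QPolicy:
    # Hypothetical sketch: greedy policy with respect to a shared Q table.
    def __init__(self, Q):
        self.Q = Q

    def action_prob(self, state, action):
        return 1.0 if action == np.argmax(self.Q[state]) else 0.0

    def action(self, state):
        return int(np.argmax(self.Q[state]))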
Example no. 20
def off_policy_n_step_sarsa(
    env_spec:EnvSpec,
    trajs:Iterable[Iterable[Tuple[int,int,int,int]]],
    bpi:Policy,
    n:int,
    alpha:float,
    initQ:np.array
) -> Tuple[np.array,Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """

    # greedy policy
    pi = GreedyPolicy(env_spec.nS, env_spec.nA)

    # loop for each episode
    for episode in trajs:

        # (S_t, A_t, R_{t+1}) triples for the whole episode, available offline
        sar = [(s, a, r) for (s, a, r, _) in episode]

        T = len(episode)

        # t runs far enough that tau reaches the last updatable step T - 1
        for t in range(T + n - 1):

            # time step whose estimate is being updated
            tau = t - n + 1

            # reached n-step
            if tau >= 0:

                # importance sampling ratio rho_{tau+1 : min(tau+n, T-1)}
                rho = np.array([pi.action_prob(sar[i][0], sar[i][1]) / bpi.action_prob(sar[i][0], sar[i][1])
                                for i in range(tau + 1, min(tau + n, T - 1) + 1)]).prod()

                # n-step return: discounted rewards R_{tau+1} ... R_{min(tau+n, T)}
                g = np.array([pow(env_spec.gamma, i - tau - 1) * sar[i - 1][2]
                              for i in range(tau + 1, min(tau + n, T) + 1)]).sum()

                # bootstrap with Q(S_{tau+n}, A_{tau+n}) unless the episode ends within n steps
                if tau + n < T:
                    tauState = sar[tau + n][0]
                    tauAction = sar[tau + n][1]

                    g = g + pow(env_spec.gamma, n) * initQ[tauState, tauAction]

                tauState = sar[tau][0]
                tauAction = sar[tau][1]
                initQ[tauState, tauAction] = initQ[tauState, tauAction] + (alpha * rho) * (g - initQ[tauState, tauAction])

                # keep the target policy greedy with respect to the updated Q values
                pi.p[tauState, :] = 0.0
                pi.p[tauState, np.argmax(initQ[tauState, :])] = 1.0

    Q = initQ

    return Q, pi
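GreedyPolicy(nS, nA) is also assumed rather than defined in this collection. Given that the code above writes to pi.p and calls pi.action_prob, a plausible stand-in is sketched below; this is a guess at the interface, not the original class.

import numpy as np

class GreedyPolicy:
    # Hypothetical sketch: tabular policy backed by an explicit probability
    # table p, which the n-step SARSA loop above overwrites to stay greedy.
    def __init__(self, nS, nA):
        self.p = np.full((nS, nA), 1.0 / nA)  # start uniform

    def action_prob(self, state, action):
        return self.p[state, action]

    def action(self, state):
        return int(np.argmax(self.p[state]))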