Example no. 1
import numpy as np

def extract_policy(env, v, gamma):
    """ 
    Extract the optimal policy given the optimal value function.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                    number of states
            env.nA: int
                    number of actions
    v: numpy.ndarray
        value function
    gamma: float
        Discount factor. Number in range [0, 1)
    
    Outputs:
    policy: numpy.ndarray
    """

    policy = np.zeros(env.nS, dtype=np.int32)
    ############################
    # Act greedily with respect to the Q-values: one argmax per state (row).
    q = action_evaluation(env, gamma, v)
    policy = np.argmax(q, axis=1)
    ############################

    return policy
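
Note: every snippet in this section calls a helper action_evaluation(env, gamma, v) that is not shown here. From the call sites it must return an (nS, nA) array of Q-values computed by a one-step lookahead on env.P. Below is a minimal sketch of such a helper, assuming the (probability, nextstate, reward, terminal) transition layout documented above and that terminal transitions carry no bootstrapped future value; the body is an assumption, not the original implementation.

import numpy as np

def action_evaluation(env, gamma, v):
    """Q(s, a) = sum over transitions of p * (r + gamma * V(s'))."""
    q = np.zeros((env.nS, env.nA))
    for s in range(env.nS):
        for a in range(env.nA):
            for probability, nextstate, reward, terminal in env.P[s][a]:
                # Assumption: cut off the bootstrap term on terminal transitions.
                future = 0.0 if terminal else gamma * v[nextstate]
                q[s, a] += probability * (reward + future)
    return q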
Example no. 2
def value_iteration(env, gamma, max_iteration, theta):
    """
    Implement the value iteration algorithm.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    the transition probabilities of the environment
                    P[state][action] is a list of tuples. Each tuple contains (probability, nextstate, reward, terminal).
            env.nS: int
                    number of states
            env.nA: int
                    number of actions
    gamma: float
            Discount factor.
    max_iteration: int
            The maximum number of iterations to run before stopping.
    theta: float
            The threshold of convergence.
    
    Outputs:
    V: numpy.ndarray
    policy: numpy.ndarray
    numIterations: int
            Number of iterations
    """

    V = np.zeros(env.nS)
    numIterations = 0

    # Implement the loop part here
    ############################
    # YOUR CODE STARTS HERE
    while True:
        delta = 0
        q = action_evaluation(env, gamma, V)
        # Bellman optimality backup: V(s) <- max_a Q(s, a).
        for i in range(len(V)):
            old_v = V[i]
            V[i] = np.max(q[i])
            delta = max(delta, np.abs(old_v - V[i]))
        numIterations += 1
        if delta < theta or numIterations >= max_iteration:
            break

    # YOUR CODE ENDS HERE
    ############################

    # Extract the "optimal" policy from the value function
    policy = extract_policy(env, V, gamma)

    return V, policy, numIterations
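
To sanity-check value_iteration without a Gym install, any object exposing nS, nA, and P in the documented format will do. The TinyEnv class below is a hypothetical 2-state, 2-action deterministic MDP invented purely for illustration; it also relies on the action_evaluation sketch above and on extract_policy from Example no. 1.

class TinyEnv:
    # P[state][action] -> [(probability, nextstate, reward, terminal)]
    nS, nA = 2, 2
    P = {
        0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
        1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]},
    }

V, policy, n = value_iteration(TinyEnv(), gamma=0.9, max_iteration=100, theta=1e-8)
print(V, policy, n)  # action 1 (the rewarding, terminating one) should win in state 0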
Example no. 3
def value_iteration(env, gamma, max_iteration, theta):
    """
    Implement the value iteration algorithm.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    the transition probabilities of the environment
                    P[state][action] is a list of tuples. Each tuple contains (probability, nextstate, reward, terminal).
            env.nS: int
                    number of states
            env.nA: int
                    number of actions
    gamma: float
            Discount factor.
    max_iteration: int
            The maximum number of iterations to run before stopping.
    theta: float
            The threshold of convergence.
    
    Outputs:
    V: numpy.ndarray
    policy: numpy.ndarray
    numIterations: int
            Number of iterations
    """

    V = np.zeros(env.nS)
    numIterations = 0

    # Implement the loop part here
    ############################
    # YOUR CODE STARTS HERE
    iteration = True
    while iteration and numIterations < max_iteration:
        numIterations += 1
        diff = 0
        q = action_evaluation(env, gamma, V)
        for i, s in enumerate(V):
            temp = max(q[i])
            # Track the largest absolute change over all states this sweep.
            diff = max(diff, abs(temp - s))
            V[i] = temp
        if diff < theta:
            iteration = False
    # YOUR CODE ENDS HERE
    ############################

    # Extract the "optimal" policy from the value function
    policy = extract_policy(env, V, gamma)

    return V, policy, numIterations
Example no. 4
def policy_improvement(env, value_from_policy, policy, gamma):
    """
    Given the value function computed from a policy, improve the policy.

    Inputs:
    env: OpenAI Gym environment
            env.P: dictionary
                    P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                    number of states
            env.nA: int
                    number of actions

    value_from_policy: numpy.ndarray
            The value calculated from the policy
    policy: numpy.ndarray
            The previous policy.
    gamma: float
            Discount factor.

    Outputs:
    new policy: numpy.ndarray
            An array of integers. Each integer is the optimal action to take
            in that state according to the environment dynamics and the
            given value function.
    policy_stable: boolean
            True if the "optimal" policy is found, otherwise false
    """
    ############################
    # YOUR CODE STARTS HERE
    policy_stable = True
    q = action_evaluation(env, gamma, value_from_policy)
    for i in range(len(policy)):
        old_action = policy[i]
        new_action = np.argmax(q[i])
        # Any change in the greedy action means the policy has not converged yet.
        if old_action != new_action:
            policy_stable = False
        policy[i] = new_action
    # YOUR CODE ENDS HERE
    ############################

    return policy, policy_stable
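
Taken together, policy_improvement and the policy_evaluation variants in Examples no. 5 and 6 below are the two halves of policy iteration. A minimal driver loop, assuming those two functions and an all-zeros initial policy (this wrapper is a sketch, not part of the original exercise):

def policy_iteration(env, gamma, theta):
    # Alternate full evaluation and greedy improvement until the policy is stable.
    policy = np.zeros(env.nS, dtype=np.int32)
    policy_stable = False
    while not policy_stable:
        V = policy_evaluation(env, policy, gamma, theta)
        policy, policy_stable = policy_improvement(env, V, policy, gamma)
    return V, policy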
Example no. 5
def policy_evaluation(env, policy, gamma, theta):
    """
    Evaluate the value function from a given policy.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
            env.nS: int
                    number of states
            env.nA: int
                    number of actions

    gamma: float
            Discount factor.
    policy: numpy.ndarray
            The policy to evaluate. Maps states to actions.
    theta: float
            The threshold of convergence.
    
    Outputs:
    V: numpy.ndarray
            The value function from the given policy.
    """
    ############################
    # YOUR CODE STARTS HERE
    V = np.zeros(env.nS)
    iteration = True
    while iteration:
        q = action_evaluation(env, gamma, V)
        diff = 0
        for s in range(len(V)):
            # Back up the value of the action the fixed policy takes in s.
            temp = q[s, policy[s]]
            diff = max(abs(temp - V[s]), diff)
            V[s] = temp
        if diff < theta:
            iteration = False
    # YOUR CODE ENDS HERE
    ############################

    return V
Example no. 6
def policy_evaluation(env, policy, gamma, theta):
    """
    Evaluate the value function from a given policy.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
            env.nS: int
                    number of states
            env.nA: int
                    number of actions

    gamma: float
            Discount factor.
    policy: numpy.ndarray
            The policy to evaluate. Maps states to actions.
    theta: float
            The threshold of convergence.
    
    Outputs:
    V: numpy.ndarray
            The value function from the given policy.
    """
    ############################
    # YOUR CODE STARTS HERE
    V = np.zeros(env.nS)
    while True:
        delta = 0
        q = action_evaluation(env, gamma, V)
        for i in range(len(policy)):
            old_v = V[i]
            # Expected return of following the fixed policy from state i.
            V[i] = q[i][policy[i]]
            delta = max(delta, np.abs(old_v - V[i]))
        if delta < theta:
            break
    # YOUR CODE ENDS HERE
    ############################

    return V
Example no. 7
def extract_policy(env, v, gamma):
    """ 
    Extract the optimal policy given the optimal value function.

    Inputs:
    env: OpenAI Gym environment.
            env.P: dictionary
                    P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                    number of states
            env.nA: int
                    number of actions
    v: numpy.ndarray
        value function
    gamma: float
        Discount factor. Number in range [0, 1)
    
    Outputs:
    policy: numpy.ndarray
    """

    policy = np.zeros(env.nS, dtype=np.int32)
    ############################
    # YOUR CODE STARTS HERE
    q = action_evaluation(env, gamma, v)
    for i in range(len(policy)):
        # Greedy action with respect to the given value function.
        policy[i] = np.argmax(q[i])
    # YOUR CODE ENDS HERE
    ############################

    return policy