Example #1
def max_action(state, V, gamma=0.9, debug=False):
    """ Compute the best (action, value) pair from a state 
    
    Returns a tuple: (action, value)
    """
    a_max = (0, 0, 0)
    v_max = 0

    # Loop through all possible actions to determine max value
    for a in actions(state):
        # Choose action a and reach afterstate sa
        sa = afterstate(state, a)

        # Milk cows for reward
        r = reward(state, a)

        # Calculate the value of afterstate
        vn = value(sa, r, V, gamma)

        if vn > v_max:
            v_max = vn
            a_max = a

    return (a_max, v_max)
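Both examples on this page lean on helpers defined elsewhere in the module and not shown here: states(), actions(state), afterstate(state, a), reward(state, a), and value(sa, r, V, gamma). The stubs below are a minimal sketch of the interfaces those calls imply; the bodies are placeholders and assumptions, not the original implementations.

# Hypothetical stubs -- signatures inferred from the calls above, bodies are
# placeholders only. The real module defines these elsewhere.

def states():
    """Yield every state in the problem."""
    yield (0, 0, 0)  # placeholder state

def actions(state):
    """Yield every action tuple legal from `state`."""
    yield (0, 0, 0)  # placeholder action

def afterstate(state, a):
    """Return the state reached by taking action `a` in `state`."""
    return state  # placeholder

def reward(state, a):
    """Return the immediate reward for taking action `a` in `state`."""
    return 0.0  # placeholder

def value(sa, r, V, gamma):
    """One-step return: immediate reward plus the discounted value of the
    afterstate, read from the value table V (assumed form)."""
    return r + gamma * V.get(sa, 0.0)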
Example #2
import copy

def policy_iteration(gamma=0.9, theta=0.01, sweeps=None, value_list=None):
    """ Policy iteration
    
    gamma -- discount factor
    theta -- stop when delta < theta
    sweeps -- stop after N sweeps
    value_list -- passing a list here will populate it with the value functions
                  generated after each policy evaluation step
    
    Returns a tuple (V*, pi*) where
        V*[s] = value
        pi*[s] = action
    """

    # Initialize value function to 0
    V = {s: 0 for s in states()}

    # Initialize policy to (0, 0, 0)
    pi = {s: (0, 0, 0) for s in states()}

    # The policy starts out unstable so the loop below runs at least once
    policy_stable = False

    while not policy_stable:
        #
        # Policy Evaluation
        #
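        # Iterative policy evaluation: sweep the states in place, backing up
        # each V[s] from its afterstate via value(), until the largest
        # per-state change (delta) drops below theta or the sweep budget
        # runs out.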
        print "Policy Evaluation..."
        sweep = 0
        while True:
            sweep += 1
            delta = 0

            # Report progress!
            print('\tSweep', sweep, '...', end=' ', flush=True)

            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]

                # Act according to policy
                sa = afterstate(s, pi[s])
                V[s] = value(sa, reward(s, pi[s]), V, gamma)

                # Update delta
                delta = max(delta, abs(v - V[s]))

            print('delta =', delta)

            #input('Hit enter to continue')

            if theta and delta < theta:
                break

            if sweeps and sweep == sweeps:
                break

        if isinstance(value_list, list):
            value_list.append(copy.deepcopy(V))

        #
        # Policy Improvement
        #
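        # Greedy improvement: re-pick each state's action as the argmax of
        # the one-step lookahead (max_action); if no action changes anywhere,
        # the policy is stable and the outer loop ends.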
        print "Policy Improvement..."

        policy_stable = True

        # Go through every state
        for s in states():
            b = pi[s]

            an, _ = max_action(s, V, gamma)
            pi[s] = an

            #print("pi[%s] = %s" % (s, pi[s]))

            if b != pi[s]:
                policy_stable = False

    # Return the value function and policy
    return V, pi
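A quick smoke test, assuming the helper stubs sketched after Example #1 (or the real module-level helpers) are in scope. The parameter values and the history list are illustrative, not from the original snippet; note the function returns (V, pi) in that order.

# Illustrative usage -- values below are assumptions, not from the source.
history = []
V, pi = policy_iteration(gamma=0.9, theta=0.01, value_list=history)

s = next(iter(pi))  # inspect an arbitrary state
print('pi[%s] = %s, V[%s] = %s' % (s, pi[s], s, V[s]))
print('Recorded', len(history), 'value functions, one per evaluation phase')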