def test_probs(self):
    """Check that probs() yields a valid distribution for every state.

    For each state produced by states(), the values of probs(state) must
    sum to 1 (within 1e-10) and the components of every key must sum to
    at most H.
    """
    for state in states():
        # Echo the state currently being checked
        print(state)
        p = probs(state)
        total = sum(p.values())
        # Compare the absolute error: a one-sided check
        # (total - 1 <= 1e-10) would silently accept totals below 1.
        assert abs(total - 1) <= 1e-10, total
        assert all(sum(v) <= H for v in p.keys())
Exemple #2
0
 def test_probs(self):
     """Check that probs() yields a valid distribution for every state.

     For each state produced by states(), the values of probs(state) must
     sum to 1 (within 1e-10) and the components of every key must sum to
     at most H.
     """
     for state in states():
         # Echo the state currently being checked
         print(state)
         p = probs(state)
         total = sum(p.values())
         # Compare the absolute error: a one-sided check
         # (total - 1 <= 1e-10) would silently accept totals below 1.
         assert abs(total - 1) <= 1e-10, total
         assert all(sum(v) <= H for v in p.keys())
Exemple #3
0
def value_iteration(gamma=0.9, theta=0.01, sweeps=None):
    """ Value iteration algorithm
    
    gamma -- discount factor
    theta -- stop when delta < theta
    sweeps -- stop after N sweeps
    
    Returns a dictionary V[s] = value 
    """

    # Initialize value function to 0
    V = dict((s, 0) for s in states())

    sweep = 0
    while True:
        sweep += 1
        delta = 0

        # Report progress!
        print 'Sweep', sweep, '...',
        sys.stdout.flush()

        # Loop through every possible state
        for s in states():
            # Store old value of state
            v = V[s]

            an, vn = max_action(s, V, gamma)
            V[s] = vn

            # Update delta
            delta = max(delta, abs(v - V[s]))

        print 'delta =', delta

        #raw_input('Hit enter to continue')

        if theta and delta < theta:
            break

        if sweeps and sweep == sweeps:
            break

    return V
Exemple #4
0
def value_iteration(gamma = 0.9, theta = 0.01, sweeps = None):
    """ Value iteration algorithm
    
    gamma -- discount factor
    theta -- stop when delta < theta
    sweeps -- stop after N sweeps
    
    Returns a dictionary V[s] = value 
    """
    
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    
    sweep = 0
    while True:
        sweep += 1
        delta = 0
        
        # Report progress!
        print 'Sweep', sweep, '...',
        sys.stdout.flush()
        
        # Loop through every possible state
        for s in states():
            # Store old value of state
            v = V[s]
            
            an, vn = max_action(s, V, gamma)
            V[s] = vn
            
            # Update delta
            delta = max(delta, abs(v - V[s]))
        
        print 'delta =', delta
        
        #raw_input('Hit enter to continue')
        
        if theta and delta < theta:
            break
        
        if sweeps and sweep == sweeps:
            break
        
    return V
Exemple #5
0
def policy_iteration(gamma=0.9, theta=0.01, sweeps=None, value_list=None):
    """ Policy iteration
    
    gamma -- discount factor
    theta -- stop when delta < theta
    sweeps -- stop after N sweeps
    value_list -- passing a list here will populate it with the value functions
                  generated after each policy evaluation step
    
    Returns a tuple (pi*, V*) where
        pi*[s] = action
        V*[s] = value
    """

    # Initialize value function to 0
    V = dict((s, 0) for s in states())

    # Initialize policy to (0, 0, 0)
    pi = dict((s, (0, 0, 0)) for s in states())

    # Assume a stable policy
    policy_stable = False

    while not policy_stable:
        #
        # Policy Evaluation
        #
        print "Policy Evaluation..."
        sweep = 0
        while True:
            sweep += 1
            delta = 0

            # Report progress!
            print '\tSweep', sweep, '...',
            sys.stdout.flush()

            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]

                # Act according to policy
                sa = afterstate(s, pi[s])
                V[s] = value(sa, reward(s, pi[s]), V, gamma)

                # Update delta
                delta = max(delta, abs(v - V[s]))

            print 'delta =', delta

            #raw_input('Hit enter to continue')

            if theta and delta < theta:
                break

            if sweeps and sweep == sweeps:
                break

        if isinstance(value_list, list):
            value_list.append(copy.deepcopy(V))

        #
        # Policy Improvement
        #
        print "Policy Improvement..."

        policy_stable = True

        # Go through every state
        for s in states():
            b = pi[s]

            an, vn = max_action(s, V, gamma)
            pi[s] = an

            #print "pi[%s] = %s" % (s, pi[s])

            if b != pi[s]:
                policy_stable = False

    # Return the value function and policy
    return V, pi
Exemple #6
0
def policy_iteration(gamma = 0.9, theta = 0.01, sweeps = None, value_list = None):
    """ Policy iteration
    
    gamma -- discount factor
    theta -- stop when delta < theta
    sweeps -- stop after N sweeps
    value_list -- passing a list here will populate it with the value functions
                  generated after each policy evaluation step
    
    Returns a tuple (pi*, V*) where
        pi*[s] = action
        V*[s] = value
    """
    
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    
    # Initialize policy to (0, 0, 0)
    pi = dict((s, (0, 0, 0)) for s in states())
    
    # Assume a stable policy
    policy_stable = False
    
    while not policy_stable:
        # 
        # Policy Evaluation
        #
        print "Policy Evaluation..."
        sweep = 0
        while True:
            sweep += 1
            delta = 0
            
            # Report progress!
            print '\tSweep', sweep, '...',
            sys.stdout.flush()
            
            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]
                
                # Act according to policy
                sa = afterstate(s, pi[s])
                V[s] = value(sa, reward(s, pi[s]), V, gamma)
                
                # Update delta
                delta = max(delta, abs(v - V[s]))
            
            print 'delta =', delta
            
            #raw_input('Hit enter to continue')
            
            if theta and delta < theta:
                break
            
            if sweeps and sweep == sweeps:
                break
        
        
        if isinstance(value_list, list):
            value_list.append(copy.deepcopy(V))
        
        #
        # Policy Improvement
        #
        print "Policy Improvement..."
        
        policy_stable = True
        
        # Go through every state
        for s in states():
            b = pi[s]
            
            an, vn = max_action(s, V, gamma)
            pi[s] = an
            
            #print "pi[%s] = %s" % (s, pi[s])
            
            if b != pi[s]:
                policy_stable = False
        
        
    # Return the value function and policy
    return V, pi