def test_probs(self):
    """Check that probs() yields a valid distribution for every state.

    For each state produced by states(), the values of probs(state) must
    sum to 1 (within 1e-10) and the components of every key must sum to
    at most H.
    """
    for state in states():
        print(state)
        p = probs(state)
        s = sum(p.values())
        # Two-sided tolerance: the previous `s - 1 <= 1e-10` also accepted
        # totals far below 1, so an under-counted distribution went unnoticed.
        assert abs(s - 1) <= 1e-10, s
        assert all(sum(v) <= H for v in p)
Exemple #2
 def test_probs(self):
     """Check that probs() yields a valid distribution for every state.

     For each state produced by states(), the values of probs(state) must
     sum to 1 (within 1e-10) and the components of every key must sum to
     at most H.
     """
     for state in states():
         print(state)
         p = probs(state)
         s = sum(p.values())
         # Two-sided tolerance: the previous `s - 1 <= 1e-10` also accepted
         # totals far below 1, so an under-counted distribution went unnoticed.
         assert abs(s - 1) <= 1e-10, s
         assert all(sum(v) <= H for v in p)
Exemple #3
def value_iteration(gamma=0.9, theta=0.01, sweeps=None):
    """Value iteration algorithm

    gamma -- discount factor
    theta -- stop when delta < theta (a falsy value disables this check)
    sweeps -- stop after N sweeps (a falsy value disables this check)

    Returns a dictionary V[s] = value

    If both theta and sweeps are falsy there is no stopping condition and
    the loop never terminates.
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())

    sweep = 0
    while True:
        sweep += 1
        delta = 0

        # Loop through every possible state
        for s in states():
            # Store old value of state
            v = V[s]

            # Only the value returned by max_action is needed here; the
            # action it returns alongside is discarded.
            _, vn = max_action(s, V, gamma)
            V[s] = vn

            # Track the largest change seen during this sweep
            delta = max(delta, abs(v - V[s]))

        # Report progress: one line per sweep, written as a single string
        # so the statement is valid under both Python 2 and Python 3.
        print('Sweep %s ... delta = %s' % (sweep, delta))

        # Converged: no state changed by theta or more in this sweep
        if theta and delta < theta:
            break

        # Sweep budget exhausted
        if sweeps and sweep >= sweeps:
            break

    return V
Exemple #4
def value_iteration(gamma=0.9, theta=0.01, sweeps=None):
    """Value iteration algorithm

    gamma -- discount factor
    theta -- stop when delta < theta (a falsy value disables this check)
    sweeps -- stop after N sweeps (a falsy value disables this check)

    Returns a dictionary V[s] = value

    If both theta and sweeps are falsy there is no stopping condition and
    the loop never terminates.
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    sweep = 0
    while True:
        sweep += 1
        delta = 0
        # Loop through every possible state
        for s in states():
            # Store old value of state
            v = V[s]
            # Only the value returned by max_action is needed here; the
            # action it returns alongside is discarded.
            _, vn = max_action(s, V, gamma)
            V[s] = vn
            # Track the largest change seen during this sweep
            delta = max(delta, abs(v - V[s]))
        # Report progress: one line per sweep, written as a single string
        # so the statement is valid under both Python 2 and Python 3.
        print('Sweep %s ... delta = %s' % (sweep, delta))
        # Converged: no state changed by theta or more in this sweep
        if theta and delta < theta:
            break
        # Sweep budget exhausted
        if sweeps and sweep >= sweeps:
            break
    return V
Exemple #5
def policy_iteration(gamma=0.9, theta=0.01, sweeps=None, value_list=None):
    """Policy iteration

    Alternates policy evaluation and policy improvement until an
    improvement pass leaves every state's action unchanged.

    gamma -- discount factor
    theta -- stop evaluating when delta < theta (falsy disables this check)
    sweeps -- stop evaluating after N sweeps (falsy disables this check)
    value_list -- passing a list here will populate it with the value functions
                  generated after each policy evaluation step

    Returns a tuple (V, pi) where
        V[s] = value
        pi[s] = action

    If both theta and sweeps are falsy the evaluation loop has no stopping
    condition and never terminates.
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())

    # Initialize policy to (0, 0, 0)
    pi = dict((s, (0, 0, 0)) for s in states())

    # Start out "not stable" so the loop below runs at least once
    policy_stable = False

    while not policy_stable:
        # Policy Evaluation
        print("Policy Evaluation...")
        sweep = 0
        while True:
            sweep += 1
            delta = 0

            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]

                # Act according to policy
                action = pi[s]
                sa = afterstate(s, action)
                V[s] = value(sa, reward(s, action), V, gamma)

                # Track the largest change seen during this sweep
                delta = max(delta, abs(v - V[s]))

            # Report progress: one line per sweep, written as a single
            # string so the statement is valid under Python 2 and 3.
            print('\tSweep %s ... delta = %s' % (sweep, delta))

            # Converged: no state changed by theta or more in this sweep
            if theta and delta < theta:
                break

            # Sweep budget exhausted
            if sweeps and sweep >= sweeps:
                break

        if isinstance(value_list, list):
            # NOTE(review): this branch had no body in the source; storing
            # a snapshot follows the value_list description in the docstring.
            # A copy is stored because V keeps being updated in place.
            value_list.append(dict(V))

        # Policy Improvement
        print("Policy Improvement...")

        policy_stable = True

        # Go through every state
        for s in states():
            b = pi[s]

            # Only the action returned by max_action is needed here
            an, _ = max_action(s, V, gamma)
            pi[s] = an

            # Any changed action means another evaluation round is needed
            if b != pi[s]:
                policy_stable = False

    # Return the value function and policy
    return V, pi
Exemple #6
def policy_iteration(gamma=0.9, theta=0.01, sweeps=None, value_list=None):
    """Policy iteration

    Alternates policy evaluation and policy improvement until an
    improvement pass leaves every state's action unchanged.

    gamma -- discount factor
    theta -- stop evaluating when delta < theta (falsy disables this check)
    sweeps -- stop evaluating after N sweeps (falsy disables this check)
    value_list -- passing a list here will populate it with the value functions
                  generated after each policy evaluation step

    Returns a tuple (V, pi) where
        V[s] = value
        pi[s] = action

    If both theta and sweeps are falsy the evaluation loop has no stopping
    condition and never terminates.
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    # Initialize policy to (0, 0, 0)
    pi = dict((s, (0, 0, 0)) for s in states())
    # Start out "not stable" so the loop below runs at least once
    policy_stable = False
    while not policy_stable:
        # Policy Evaluation
        print("Policy Evaluation...")
        sweep = 0
        while True:
            sweep += 1
            delta = 0
            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]
                # Act according to policy
                action = pi[s]
                sa = afterstate(s, action)
                V[s] = value(sa, reward(s, action), V, gamma)
                # Track the largest change seen during this sweep
                delta = max(delta, abs(v - V[s]))
            # Report progress: one line per sweep, written as a single
            # string so the statement is valid under Python 2 and 3.
            print('\tSweep %s ... delta = %s' % (sweep, delta))
            # Converged: no state changed by theta or more in this sweep
            if theta and delta < theta:
                break
            # Sweep budget exhausted
            if sweeps and sweep >= sweeps:
                break
        if isinstance(value_list, list):
            # NOTE(review): this branch had no body in the source; storing
            # a snapshot follows the value_list description in the docstring.
            # A copy is stored because V keeps being updated in place.
            value_list.append(dict(V))
        # Policy Improvement
        print("Policy Improvement...")
        policy_stable = True
        # Go through every state
        for s in states():
            b = pi[s]
            # Only the action returned by max_action is needed here
            an, _ = max_action(s, V, gamma)
            pi[s] = an
            # Any changed action means another evaluation round is needed
            if b != pi[s]:
                policy_stable = False
    # Return the value function and policy
    return V, pi