import copy
import sys

def test_probs(self):
    for state in states():
        print state
        p = probs(state)
        s = sum(p.values())
        # The probabilities must sum to 1 (within floating-point tolerance);
        # take abs() so a sum well below 1 also fails the check.
        assert abs(s - 1) <= 1e-10, s
        # No outcome tuple may total more than H
        assert all(sum(v) <= H for v in p.keys())
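# From the assertions above, probs(state) is expected to return a dict
# mapping outcome tuples to probabilities that sum to 1, with each tuple's
# components totalling at most H. test_probs never touches `self`, so it
# can also be driven outside a TestCase (a minimal sketch, not original code):
def check_probs():
    test_probs(None)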
def value_iteration(gamma=0.9, theta=0.01, sweeps=None):
    """
    Value iteration algorithm

    gamma  -- discount factor
    theta  -- stop when delta < theta
    sweeps -- stop after N sweeps

    Returns a dictionary V[s] = value
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    sweep = 0
    while True:
        sweep += 1
        delta = 0
        # Report progress!
        print 'Sweep', sweep, '...',
        sys.stdout.flush()
        # Loop through every possible state
        for s in states():
            # Store old value of state
            v = V[s]
            an, vn = max_action(s, V, gamma)
            V[s] = vn
            # Update delta
            delta = max(delta, abs(v - V[s]))
        print 'delta =', delta
        #raw_input('Hit enter to continue')
        if theta and delta < theta:
            break
        if sweeps and sweep == sweeps:
            break
    return V
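# A minimal usage sketch (demo_value_iteration is a hypothetical helper,
# not part of the original source): run value iteration, then recover the
# greedy policy implied by V using the same max_action() helper the sweep
# uses internally.
def demo_value_iteration(gamma=0.9):
    V = value_iteration(gamma=gamma, theta=0.01)
    # max_action() returns an (action, value) pair; keep the action
    pi = dict((s, max_action(s, V, gamma)[0]) for s in states())
    return V, pi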
def policy_iteration(gamma=0.9, theta=0.01, sweeps=None, value_list=None):
    """
    Policy iteration

    gamma      -- discount factor
    theta      -- stop when delta < theta
    sweeps     -- stop after N sweeps
    value_list -- passing a list here will populate it with the value
                  functions generated after each policy evaluation step

    Returns a tuple (V*, pi*) where
        V*[s]  = value
        pi*[s] = action
    """
    # Initialize value function to 0
    V = dict((s, 0) for s in states())
    # Initialize policy to (0, 0, 0)
    pi = dict((s, (0, 0, 0)) for s in states())
    # The policy is not yet known to be stable
    policy_stable = False
    while not policy_stable:
        #
        # Policy Evaluation
        #
        print "Policy Evaluation..."
        sweep = 0
        while True:
            sweep += 1
            delta = 0
            # Report progress!
            print '\tSweep', sweep, '...',
            sys.stdout.flush()
            # Loop through every possible state
            for s in states():
                # Store old value of state
                v = V[s]
                # Act according to policy
                sa = afterstate(s, pi[s])
                V[s] = value(sa, reward(s, pi[s]), V, gamma)
                # Update delta
                delta = max(delta, abs(v - V[s]))
            print 'delta =', delta
            #raw_input('Hit enter to continue')
            if theta and delta < theta:
                break
            if sweeps and sweep == sweeps:
                break
        if isinstance(value_list, list):
            value_list.append(copy.deepcopy(V))
        #
        # Policy Improvement
        #
        print "Policy Improvement..."
        policy_stable = True
        # Go through every state
        for s in states():
            b = pi[s]
            an, vn = max_action(s, V, gamma)
            pi[s] = an
            #print "pi[%s] = %s" % (s, pi[s])
            if b != pi[s]:
                policy_stable = False
    # Return the value function and policy
    return V, pi
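# A minimal usage sketch (demo_policy_iteration is a hypothetical helper,
# not part of the original source): run policy iteration and collect the
# intermediate value functions via the value_list hook documented above.
def demo_policy_iteration(gamma=0.9):
    history = []
    V, pi = policy_iteration(gamma=gamma, theta=0.01, value_list=history)
    print 'evaluation phases recorded:', len(history)
    # Inspect the chosen action for an arbitrary state
    s0 = next(iter(states()))
    print 'pi[%s] = %s' % (s0, pi[s0])
    return V, pi, history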