import random

import darts
import throw

# Convergence thresholds; the original module defines these elsewhere, so the
# values here are assumptions.
EPSILON_VI = 0.001
EPSILON = 0.001

# Global policy table filled in by infiniteValueIteration (assumed to be a
# module-level dict in the original).
PI = {}


def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    converging = False
    num_iterations = 0
    Q = {}

    # iterate until all state values (V[s]) converge
    while not converging:
        num_iterations += 1
        for s in states:
            Q[s] = {}
            for a in range(len(actions)):
                # find the value of each action, given state s
                Q[s][a] = darts.R(s, actions[a])
                for s_prime in states:
                    Q[s][a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

                # track the action that maximizes Q and the maximum value of Q
                if a == 0 or Q[s][a] >= V[1][s]:
                    pi_star[s] = a
                    V[1][s] = Q[s][a]

        # values of V for iteration k become the values of V for iteration k-1
        converging = True
        for s in states:
            # any one component that has not converged keeps the loop going
            if EPSILON_VI < abs(V[0][s] - V[1][s]):
                converging = False
            V[0][s] = V[1][s]

    return T_matrix, pi_star, Q, V
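
# Q_learning below calls ex_strategy_one / ex_strategy_two, which are defined
# elsewhere in the assignment. The versions here are a minimal epsilon-greedy
# sketch so this file runs standalone; the epsilon values and bodies are
# assumptions, not the original strategies.
def ex_strategy_one(Q, randAction, maxAction, epsilon=0.5):
    # explore half the time, exploit otherwise
    if random.random() < epsilon:
        return randAction
    return maxAction


def ex_strategy_two(Q, randAction, maxAction, epsilon=0.1):
    # mostly exploit, exploring with a small fixed probability
    if random.random() < epsilon:
        return randAction
    return maxAction
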
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()

    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)

    for i in range(numRounds):
        s = throw.START_SCORE
        while s > 0:
            # candidate actions: a uniformly random one and the current greedy one
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            #a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(Q, randAction, maxAction)
            action = actions[a]

            # simulate the throw; overshooting leaves the score unchanged
            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s

            # value of the best action available from the successor state
            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]

            # standard Q-learning update
            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a])
                                         + gamma * maxQ - Q[s][a])
            s = s_prime

    # return the learned action-value table
    return Q
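
# infiniteValueIteration below relies on a transition model T(a, s, s_prime)
# defined elsewhere in the assignment. The stand-in here is a hypothetical
# deterministic model (every throw hits the location aimed at), mirroring the
# bust rule used in Q_learning; the real model accounts for throw accuracy.
def T(a, s, s_prime):
    # probability of moving from score s to s_prime when aiming at location a
    hit = s - throw.location_to_score(a)
    if hit < 0:
        # overshooting leaves the score unchanged
        return 1.0 if s_prime == s else 0.0
    return 1.0 if s_prime == hit else 0.0
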
def infiniteValueIteration(gamma):
    # takes a discount factor gamma; iterates until successive value estimates
    # differ by less than the convergence cutoff EPSILON, filling in the
    # global policy table PI
    V = {}
    Q = {}
    V_prime = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:
        # store values from the previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, PI, and V
        for s in states:
            for a in actions:
                # given current state and action, sum the product of T and V
                # over all successor states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given the current state, store the action that maximizes V in PI
            # and the corresponding value in V
            PI[s] = actions[0]  # bug fix from piazza post 283
            V[s] = Q[s][PI[s]]
            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True
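
# Example invocation (the gamma value is an arbitrary choice for illustration):
#
#   infiniteValueIteration(0.5)
#   first_throw = PI[throw.START_SCORE]  # action the converged policy picks first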

# The test method below belongs to a unittest.TestCase subclass elsewhere in
# the assignment; the import and class wrapper here are added so it runs
# standalone (the class name is an assumption).
import unittest


class TestDarts(unittest.TestCase):
    def test_R(self):
        # no reward for a throw from a nonzero score
        self.assertEqual(darts.R(10, 0), 0)
        # positive reward once the game is won (score 0)
        self.assertTrue(darts.R(0, 0) > 0)
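
# Standard unittest entry point so the check above can be run directly with
#   python <this file>
if __name__ == "__main__":
    unittest.main()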