Example 1
import darts  # assignment module providing the reward function R(s, action)


def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    converging = False
    num_iterations = 0
    Q = {}

    # iterate until all state values V[s] converge
    while not converging:
        num_iterations += 1
        for s in states:
            Q[s] = {}
            for a in range(len(actions)):

                # expected value of taking action a in state s: immediate reward
                # plus the discounted expected value of the resulting state
                Q[s][a] = darts.R(s, actions[a])
                for s_prime in states:
                    Q[s][a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

                # once Q[s][a] is fully summed, keep the maximizing action and
                # the maximum value of Q
                if a == 0 or Q[s][a] >= V[1][s]:
                    pi_star[s] = a
                    V[1][s] = Q[s][a]

        # the current values (V[1]) become the previous values (V[0]) for the next sweep
        converging = True
        for s in states:
            # flag any component that has not yet converged
            # (EPSILON_VI is the module-level convergence tolerance)
            if EPSILON_VI < abs(V[0][s] - V[1][s]):
                converging = False

            V[0][s] = V[1][s]

    return T_matrix, pi_star, Q, V
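
For reference, with T_matrix[s][s_prime][a] read as the probability of moving from state s to s_prime under action a, each sweep of the loop above performs the standard Bellman optimality backup

    V_{k+1}(s) = \max_a \big[ R(s, a) + \gamma \sum_{s'} T(s' \mid s, a)\, V_k(s') \big]

with pi_star[s] recording the maximizing action.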
Example 2
import random

import darts   # assignment module: R, get_states, get_actions
import throw   # assignment module: START_SCORE, location_to_score


def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()

    # initialize Q[s][a] to 0 for every state/action pair
    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)

    for i in range(numRounds):

        # each round starts a new game from the full score
        s = throw.START_SCORE

        while s > 0:
            # candidate actions: a random one (explore) and the current greedy one (exploit)
            randAction = random.randrange(len(actions))
            maxAction = Q[s].index(max(Q[s]))

            #a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(Q, randAction, maxAction)
            action = actions[a]

            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s

            # value of the best action available from the successor state
            maxQ = max(Q[s_prime])

            # temporal-difference update toward the one-step lookahead target
            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])

            s = s_prime
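
The exploration strategies ex_strategy_one and ex_strategy_two are defined elsewhere in the assignment and are not shown here. Purely as an illustration of what such a strategy might do, a minimal epsilon-greedy sketch is given below; the function name and the 0.1 exploration rate are assumptions, not the assignment's actual implementation.

import random

EPSILON_GREEDY = 0.1  # hypothetical exploration rate, chosen only for illustration

def ex_strategy_sketch(Q, randAction, maxAction):
    # explore with small probability, otherwise exploit the greedy action;
    # Q is accepted to match the call signature above but is not used here
    if random.random() < EPSILON_GREEDY:
        return randAction
    return maxAction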
Example 3
import darts  # assignment module providing R, get_states, get_actions


def infiniteValueIteration(gamma):
    # takes a discount factor gamma and iterates until the state values converge
    # (the cutoff EPSILON and the transition function T are assumed to be defined
    # at module level); fills in V, Q, and PI

    V = {}
    Q = {}
    V_prime = {}
    PI = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:

        # store values from previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, pi, and V
        for s in states:
            for a in actions:

                # given current state and action, sum product of T and V over all states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given current state, store the action that maximizes V in pi and the corresponding value in V
            PI[s] = actions[0]

            # bug fix from piazza post 283
            V[s] = Q[s][PI[s]]

            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True
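
Both value-iteration examples lean on the course's darts module and on module-level names such as T and EPSILON. As a self-contained reference, a minimal sketch of the same convergence loop on a made-up two-state MDP is shown below; every number in it is illustrative only.

GAMMA = 0.9
EPSILON = 1e-6
states = [0, 1]
actions = ['stay', 'move']
# R[s][a]: immediate reward; T[s][a][s_prime]: transition probability
R = {0: {'stay': 0.0, 'move': 1.0},
     1: {'stay': 2.0, 'move': 0.0}}
T = {0: {'stay': {0: 1.0, 1: 0.0}, 'move': {0: 0.2, 1: 0.8}},
     1: {'stay': {0: 0.0, 1: 1.0}, 'move': {0: 0.9, 1: 0.1}}}

V = {s: 0.0 for s in states}
while True:
    V_prev = dict(V)
    for s in states:
        # Bellman optimality backup for state s
        V[s] = max(R[s][a] + GAMMA * sum(T[s][a][sp] * V_prev[sp] for sp in states)
                   for a in actions)
    if max(abs(V[s] - V_prev[s]) for s in states) < EPSILON:
        break
print(V)  # converged state values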
Example 4
    def test_R(self):
        self.assertEqual(darts.R(10, 0), 0)
        self.assertTrue(darts.R(0, 0) > 0)
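
The two assertions above presumably live inside a unittest.TestCase subclass; a hypothetical harness for running them (the class name DartsTest is invented for illustration) could look like this.

import unittest

import darts  # assignment module providing R(score, action)

class DartsTest(unittest.TestCase):
    def test_R(self):
        # R is zero while the score is still 10, and positive once it reaches 0
        self.assertEqual(darts.R(10, 0), 0)
        self.assertTrue(darts.R(0, 0) > 0)

if __name__ == '__main__':
    unittest.main()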