def T(a, s, s_prime):
    total_prob = 0.0
    for w in range(-2, 3):
        wedgefactor = 0.0
        if abs(w) == 0:
            wedgefactor = 0.4
        if abs(w) == 1:
            wedgefactor = 0.2
        if abs(w) == 2:
            wedgefactor = 0.1
        wedge = (a.wedge + w) % throw.NUM_WEDGES
        for r in range(-2, 3):
            ringfactor = 0.0
            if abs(r) == 0:
                ringfactor = 0.4
            if abs(r) == 1:
                ringfactor = 0.2
            if abs(r) == 2:
                ringfactor = 0.1
            ring = abs(a.ring + r)
            if throw.location_to_score(throw.location(ring, wedge)) == (s - s_prime):
                total_prob += ringfactor * wedgefactor
    return total_prob
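# A minimal sanity check one can run against any T variant in this file: each
# throw is spread over a 5x5 grid of (ring, wedge) outcomes, so from a score too
# high for a single throw to overshoot, T should sum to 1.0 over the reachable
# next states. Assumes the same throw module import as the rest of this file,
# that throw.init_board() has been called, and that no location scores more than
# 3 * throw.NUM_WEDGES (the bound one variant below also uses).
def check_T(T, a, s=None):
    if s is None:
        s = throw.START_SCORE
    total = sum(T(a, s, s_prime)
                for s_prime in range(s - 3 * throw.NUM_WEDGES, s + 1))
    assert abs(total - 1.0) < 1e-9, "outcome probabilities sum to %f" % total

# e.g. check_T(T, throw.location(throw.CENTER, throw.wedges[0]))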
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    p_transition = 0.0
    probabilities = [0.4, 0.2, 0.2, 0.1, 0.1]
    # trick to allow wrap around
    wedge_list = throw.wedges * 3
    # calculate all 5 wedges you could end up in when aiming for a.wedge
    wedge_index = len(throw.wedges) + throw.wedges.index(a.wedge)
    candidate_wedges = [wedge_list[wedge_index],
                        wedge_list[wedge_index + 1], wedge_list[wedge_index - 1],
                        wedge_list[wedge_index + 2], wedge_list[wedge_index - 2]]
    # calculate all 5 regions/rings (some may be the same) you could end up in
    # when aiming for a.ring, ordered to match the probability array
    if a.ring == throw.CENTER:
        candidate_rings = [a.ring, throw.INNER_RING, throw.INNER_RING,
                           throw.FIRST_PATCH, throw.FIRST_PATCH]
    elif a.ring == throw.INNER_RING:
        candidate_rings = [a.ring, throw.FIRST_PATCH, throw.CENTER,
                           throw.MIDDLE_RING, throw.INNER_RING]
    else:
        candidate_rings = [a.ring, a.ring + 1, a.ring - 1, a.ring + 2, a.ring - 2]
    # for each (ring, wedge) pair, calculate the point value and check
    # whether it gets you from s to s_prime
    for w in range(len(candidate_wedges)):
        for r in range(len(candidate_rings)):
            # instantiation of the location class
            real_location = throw.location(candidate_rings[r], candidate_wedges[w])
            if s - throw.location_to_score(real_location) == s_prime:
                p_transition += probabilities[r] * probabilities[w]
    return p_transition
def T(a, s, s_prime):
    # CENTER, INNER_RING, FIRST_PATCH, MIDDLE_RING, SECOND_PATCH, OUTER_RING, MISS = range(7)
    delta = s - s_prime
    p = 0.0
    probs = [.1, .2, .4, .2, .1]
    throw.init_board()
    if delta > 3 * throw.NUM_WEDGES or delta < 0:
        return 0
    for ri in range(5):
        for wi in range(5):
            wedge_num = throw.wedges[(throw.angles[a.wedge] - 2 + wi) % throw.NUM_WEDGES]
            ring_num = a.ring - 2 + ri
            if ring_num > 6:
                ring_num = 6
            if ring_num < 0:
                ring_num = -ring_num
            points = throw.location_to_score(throw.location(ring_num, wedge_num))
            if points == delta:
                p += probs[ri] * probs[wi]
    return p
def R(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    r = s - throw.location_to_score(a)
    if r == 0:
        return 1
    return 0
def play(method):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            pass  # busted: throw exceeds remaining score, so it is ignored
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()
    currentRound = 0
    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)
    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            # randint's upper bound is inclusive, so subtract 1 to stay in range
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            # a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(numRounds, currentRound, Q, len(actions), s)
            action = actions[a]
            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s
            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]
            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
        currentRound += 1
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()
    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)
    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            # randint's upper bound is inclusive, so subtract 1 to stay in range
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            # a = ex_strategy_one(Q, randAction, maxAction)
            a = ex_strategy_two(Q, randAction, maxAction)
            action = actions[a]
            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s
            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]
            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    # iterate over the possible places on the board we could hit and
    # add up the ones that give the right score reduction
    if s_prime > s:
        return 0.0
    if s == 0 and s_prime == 0:
        return 1.0
    score_diff = s - s_prime
    prob = 0.0
    wedge = throw.angles[a.wedge]
    ring = a.ring
    for wdel in range(-2, 3):
        for rdel in range(-2, 3):
            wedge_p = throw.wedges[(wdel + wedge) % throw.NUM_WEDGES]
            ring_p = abs(ring + rdel)
            dscore = throw.location_to_score(throw.location(ring_p, wedge_p))
            if dscore == score_diff:
                prob += 0.4 / (2 ** abs(wdel)) * 0.4 / (2 ** abs(rdel))
    return prob
def R_simple(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    points = throw.location_to_score(a)
    if points <= s:
        return points
    return 0
def get_target(s_):
    global s_old, a_old, num_iterations
    Q_learning(s_old, s_, a_old)
    to_explore = 0
    if darts.strategy == 1:
        to_explore = ex_strategy_one()
    else:
        num_iterations += 1
        to_explore = ex_strategy_two()
    if to_explore:
        a_old = choice(actions)
    else:
        choices = [(value, a) for (a, value) in Q[s_].iteritems()]
        # If first time at state, shoot for 24 (the max) if score >= max;
        # else pick a random action that does not exceed the score.
        if max(choices)[0] == 0:
            if s_ < 24:
                a_old = choice(actions)
                while throw.location_to_score(a_old) > s_:
                    a_old = choice(actions)
            else:
                a_old = actions[-15]
            s_old = s_
            return a_old
        # else pick the action with max Q
        a_old = max(choices)[1]
    s_old = s_
    return a_old
def Q_learning(gamma, numRounds, alpha):
    states = darts.get_states()
    actions = darts.get_actions()
    Q = {}
    for s in states:
        Q[s] = [0] * len(actions)
    totaliter = 0
    for i in range(numRounds):
        s = throw.START_SCORE
        numiterations = 0
        while s > 0:
            randAction = random.randint(0, len(actions) - 1)
            maxAction = Q[s].index(max(Q[s]))
            a = ex_strategy_one(numRounds, i, randAction, maxAction)
            # a = ex_strategy_two(numRounds, i, Q, len(actions), s)
            action = actions[a]
            s_prime = s - throw.location_to_score(action)
            if s_prime < 0:
                s_prime = s
            maxQ = 0.0
            for a_prime in range(len(actions)):
                if Q[s_prime][a_prime] > maxQ:
                    maxQ = Q[s_prime][a_prime]
            Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
            s = s_prime
            numiterations += 1
        totaliter += numiterations
    print "Average number of throws: " + str(float(totaliter) / numRounds)
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    if (a, s, s_prime) in T_CACHE:
        return T_CACHE[(a, s, s_prime)]

    def prob(i):
        if i == 0:
            return .4
        if abs(i) == 1:
            return .2
        if abs(i) == 2:
            return .1

    # Useful local variables
    diff = s - s_prime
    wedge_index = throw.wedges.index(a.wedge)
    ret = 0.
    # Set ring
    for r in [-2, -1, 0, 1, 2]:
        ring = abs(a.ring + r)
        if ring > throw.MISS:
            ring = throw.MISS
        # Set wedge
        for w in [-2, -1, 0, 1, 2]:
            wedge = throw.wedges[(wedge_index + w) % len(throw.wedges)]
            # Get score; accumulate over every outcome that yields this
            # transition rather than returning on the first match
            score = throw.location_to_score(throw.location(ring, wedge))
            if score == diff:
                ret += prob(r) * prob(w)
    T_CACHE[(a, s, s_prime)] = ret
    return ret
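# The cached variant above relies on a module-level cache dictionary; a minimal
# definition (assumed, since it is not shown in this section) is:
T_CACHE = {}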
def R(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    points = throw.location_to_score(a)
    if points > s:
        return BAD_THROW_PENALTY
    else:
        return points
def R(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    # utility function
    points = throw.location_to_score(a)
    if points <= s:
        return points
    else:
        return 0
def R(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    if s == 0:
        return 0.
    points = throw.location_to_score(a)
    if points > s:
        return -1
    return points - 1.
def R(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    if s == 0:
        return 10.0
    penalty = 0
    if throw.location_to_score(a) > s:
        penalty = -1
    return penalty  # -((throw.START_SCORE+1-s))+penalty
def test_T(self):
    def act(r, w):
        return throw.location(r, w)
    self.assertAlmostEqual(mdp.T(act(throw.CENTER, 1), 100, 110), 0.0)
    self.assertAlmostEqual(mdp.T(act(throw.CENTER, 1), 100, 80),
                           mdp.T(act(throw.CENTER, 1), 90, 70))
    bullseye = throw.location_to_score(throw.location(throw.CENTER, 1))
    self.assertAlmostEqual(mdp.T(act(throw.FIRST_PATCH, 1), 100, 100 - bullseye), 0.1)
    self.assertAlmostEqual(mdp.T(act(throw.INNER_RING, 1), 100, 95), 0.5)
def T(a, s, s_prime):
    global T_cached
    if (a, s, s_prime) in T_cached:
        return T_cached[(a, s, s_prime)]
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    target = s - s_prime
    p = 0.0
    # find all wedge/ring combos that would lead to s -> s' transition
    for i in range(-2, 3):
        current_wedge = get_adj_wedge(a.wedge, i)
        # iterate through all possible rings
        for j in range(-2, 3):
            ring = a.ring + j
            # off dart board
            if ring >= throw.MISS:
                continue
            # allow for ring "wrap around", e.g. the ring inside and outside
            # the center ring is the inner ring
            if ring < 0:
                ring = abs(ring)
            new_location = throw.location(ring, current_wedge)
            # hitting target would go from s -> s'!
            if target == throw.location_to_score(new_location):
                # calculate probability of hitting target
                if i == 0:
                    w_p = 0.4
                elif abs(i) == 1:
                    w_p = 0.2
                elif abs(i) == 2:
                    w_p = 0.1
                else:
                    assert False, "Impossible wedge"
                if j == 0:
                    r_p = 0.4
                elif abs(j) == 1:
                    r_p = 0.2
                elif abs(j) == 2:
                    r_p = 0.1
                else:
                    assert False, "Impossible ring"
                p += w_p * r_p
    T_cached[(a, s, s_prime)] = p
    return p
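# The variant above uses a module-level cache and a get_adj_wedge helper that
# are not shown in this section. Minimal sketches, assuming get_adj_wedge steps
# i positions around the board from the aimed wedge, the same way the other
# variants here use throw.wedges.index:
T_cached = {}

def get_adj_wedge(wedge, i):
    return throw.wedges[(throw.wedges.index(wedge) + i) % throw.NUM_WEDGES]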
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    # figure out which locations would give you that many points,
    # then add up the probabilities of landing there
    prob = 0
    points = s - s_prime
    if points < 0:
        return 0
    # Loop through to define transition function
    for i in range(-2, 3):
        # Mod by number of wedges to wrap around if needed
        wedge_curr = throw.wedges[(throw.wedges.index(a.wedge) + i) % throw.NUM_WEDGES]
        prob_wedge = 0.4 / pow(2, abs(i))
        for j in range(-2, 3):
            # rings reflect around the center: two rings inward from the inner
            # ring is the inner ring again, so probability mass for the central
            # rings accumulates over the reflected offsets
            ring_curr = abs(a.ring + j)
            prob_ring = 0.4 / pow(2, abs(j))
            if throw.location_to_score(throw.location(ring_curr, wedge_curr)) == points:
                prob += prob_wedge * prob_ring
    return prob
def play(method, d=None):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if d:
            if d[score] is None:
                d[score] = throw.location_to_score(target)
            else:
                assert d[score] == throw.location_to_score(target)
        # print "Target: wedge", target.wedge, ", ring", target.ring
        # print "Result: wedge", result.wedge, ", ring", result.ring
        # print "Raw Score:", raw_score
        # print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        # else:
        #     print
        #     print "TOO HIGH!"
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def EPoints(a, s):
    # expected points scored when aiming at a with score s remaining;
    # probs is indexed by the signed offset itself, so probs[-2] and probs[2]
    # both give the 0.1 tail probability
    probs = [0.4, 0.2, 0.1, 0.1, 0.2]
    total = 0.
    for r_off in [-2, -1, 0, 1, 2]:
        for w_off in [-2, -1, 0, 1, 2]:
            r2 = min(throw.MISS, abs(a.ring + r_off))
            w2 = throw.wedges[(throw.wedges.index(a.wedge) + w_off) % len(throw.wedges)]
            score = throw.location_to_score(throw.location(r2, w2))
            if score > s:
                score = 0.
            total += probs[r_off] * probs[w_off] * score
    return total
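# A hedged sketch of how EPoints could drive target selection: a one-step
# greedy policy that aims wherever the expected scored points are highest.
# This helper is an illustration, not part of the original code.
def greedy_target(s):
    actions = darts.get_actions()
    return max(actions, key=lambda a: EPoints(a, s))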
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    probabilities = [0 for i in range(throw.START_SCORE + 1)]
    for i in range(-2, 3):
        index = (throw.wedges.index(a.wedge) + i) % throw.NUM_WEDGES
        new_wedge = throw.wedges[index]
        prob_wedge = .4 / pow(2, abs(i))
        for j in range(-2, 3):
            prob_ring = .4 / pow(2, abs(j))
            # rings reflect around the center; clamp anything past the board to MISS
            new_ring = abs(a.ring + j)
            if new_ring > throw.MISS:
                new_ring = throw.MISS
            loc = throw.location(new_ring, new_wedge)
            score = int(throw.location_to_score(loc))
            new_score = s - score
            if new_score < 0:
                # overshooting leaves the score unchanged
                new_score = s
            probabilities[new_score] = probabilities[new_score] + prob_wedge * prob_ring
    return probabilities[s_prime]
def modelfree(alpha, gamma, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        # print str(g) + "/" + str(num_games)
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            # a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s

            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max_Q - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations) / float(num_games)
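# The exploration strategies called above are not shown in this section. A
# minimal epsilon-greedy sketch matching the ex_strategy_one(num_iterations,
# actions, pi_star, s) signature used above; the 1/sqrt(t) decay schedule is an
# assumption, not the original one, and math/random are assumed imported.
def ex_strategy_one(num_iterations, actions, pi_star, s):
    epsilon = 1.0 / math.sqrt(num_iterations)  # explore less as experience grows
    if random.random() < epsilon:
        return random.randint(0, len(actions) - 1)  # explore: random action index
    return pi_star[s]                               # exploit: follow current policy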
def play(method):
    global actions
    actions = get_actions()
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        prior = score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        modelfree.q_learning(prior, score, get_index(actions, target))
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    possible_rings = []
    ring_prob = []
    if a.ring == throw.CENTER:
        possible_rings = [throw.CENTER, throw.INNER_RING, throw.FIRST_PATCH]
        ring_prob = [PROBRING, 2 * PROBR1, 2 * PROBR2]
    elif a.ring == throw.INNER_RING:
        # offsets 0 and -2 both land in the inner ring, so its mass is PROBRING + PROBR2
        possible_rings = [throw.CENTER, throw.INNER_RING, throw.FIRST_PATCH, throw.MIDDLE_RING]
        ring_prob = [PROBR1, PROBRING + PROBR2, PROBR1, PROBR2]
    elif a.ring == throw.FIRST_PATCH:
        possible_rings = [throw.CENTER, throw.INNER_RING, throw.FIRST_PATCH,
                          throw.MIDDLE_RING, throw.SECOND_PATCH]
        ring_prob = [PROBR2, PROBR1, PROBRING, PROBR1, PROBR2]
    elif a.ring == throw.MIDDLE_RING:
        possible_rings = [throw.INNER_RING, throw.FIRST_PATCH, throw.MIDDLE_RING,
                          throw.SECOND_PATCH, throw.OUTER_RING]
        ring_prob = [PROBR2, PROBR1, PROBRING, PROBR1, PROBR2]
    elif a.ring == throw.SECOND_PATCH:
        possible_rings = [throw.FIRST_PATCH, throw.MIDDLE_RING, throw.SECOND_PATCH,
                          throw.OUTER_RING, throw.MISS]
        ring_prob = [PROBR2, PROBR1, PROBRING, PROBR1, PROBR2]
    elif a.ring == throw.OUTER_RING:
        possible_rings = [throw.MIDDLE_RING, throw.SECOND_PATCH, throw.OUTER_RING, throw.MISS]
        ring_prob = [PROBR2, PROBR1, PROBRING, PROBR1 + PROBR2]
    elif a.ring == throw.MISS:
        possible_rings = [throw.SECOND_PATCH, throw.OUTER_RING, throw.MISS]
        ring_prob = [PROBR2, PROBR1, PROBRING + PROBR1 + PROBR2]
    w_index = throw.wedges.index(a.wedge)
    possible_wedges = [a.wedge,
                       throw.wedges[(w_index + 1) % throw.NUM_WEDGES],
                       throw.wedges[(w_index - 1) % throw.NUM_WEDGES],
                       throw.wedges[(w_index + 2) % throw.NUM_WEDGES],
                       throw.wedges[(w_index - 2) % throw.NUM_WEDGES]]
    wedge_prob = [PROBWEDGE, PROBW1, PROBW1, PROBW2, PROBW2]
    final_prob = 0
    for i in range(len(possible_rings)):
        for j in range(len(possible_wedges)):
            myloc = throw.location(possible_rings[i], possible_wedges[j])
            if (s - throw.location_to_score(myloc)) == s_prime:
                final_prob = final_prob + ring_prob[i] * wedge_prob[j]
    return final_prob
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    if s_prime > s:
        return 0.
    # probs is indexed by the signed offset: probs[-1] == probs[1] == 0.2, etc.
    probs = [0.4, 0.2, 0.1, 0.1, 0.2]
    total = 0.
    for r_off in [-2, -1, 0, 1, 2]:
        for w_off in [-2, -1, 0, 1, 2]:
            r2 = min(throw.MISS, abs(a.ring + r_off))
            w2 = throw.wedges[(throw.wedges.index(a.wedge) + w_off) % len(throw.wedges)]
            score = throw.location_to_score(throw.location(r2, w2))
            if score > s:
                score = 0.
            if score == s - s_prime:
                total += probs[r_off] * probs[w_off]
    return total
def play(method):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        # if raw_score > score:
        #     update Q[s][a]
        # else:
        #     modelfree.Q_learning(score, target, raw_score)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    probability = 0.0
    # offsets -2 -1 0 1 2
    for w in range(-2, 3):
        # hit the wedge (0)
        if abs(w) == 0:
            p_wedge = 0.4
        # hit region outside the wedge (-1 or 1)
        elif abs(w) == 1:
            p_wedge = 0.2
        # hit region outside of that (-2 or 2)
        else:
            p_wedge = 0.1
        # get the wedge and do % to loop around in case of going around the circle
        wedge = (a.wedge + w) % throw.NUM_WEDGES
        # same thing, but now for the ring
        for r in range(-2, 3):
            # hit the ring
            if abs(r) == 0:
                p_ring = 0.4
            # hit region outside the ring
            elif abs(r) == 1:
                p_ring = 0.2
            # hit region outside of that
            else:
                p_ring = 0.1
            # rings reflect around the center rather than wrapping, hence abs
            ring = abs(a.ring + r)
            score = throw.location_to_score(throw.location(ring, wedge))
            if score == s - s_prime:
                probability += p_wedge * p_ring
    return probability
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    aRing = a.ring
    target = s - s_prime
    probs = [0.4, 0.2, 0.1]
    probability = 0
    for i in range(-2, 3):
        w = (throw.wedges.index(a.wedge) + i) % len(throw.wedges)
        wedge = throw.wedges[w]
        for j in range(-2, 3):
            ring = min(abs(aRing + j), 6)
            loc = throw.location(ring, wedge)
            score = throw.location_to_score(loc)
            if target == score:
                probability += probs[abs(i)] * probs[abs(j)]
    return probability
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}
        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            # to_explore = ex_strategy_one(s, num_iterations)

            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)

            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
            else:
                # exploit
                a = pi_star[s]
                action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])
                # Update strategy (stored in pi) based on newly updated reward
                # function and transition probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)

    print "Average turns = ", float(num_iterations) / float(num_games)
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize V
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        for a in range(len(actions)):
            num_actions[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])
                # Update strategy (stored in pi) based on newly updated reward
                # function and transition probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(
                    gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
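# A sketch of the modelbased_value_iteration helper called above, matching the
# (gamma, T_matrix, pi_star, actions, states, V) signature and the
# T_matrix[s][s_prime][a] layout of the variant directly above. This is a
# reconstruction of standard value iteration, not the original helper; the
# convergence threshold is an assumption.
def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    Q = {}
    converged = False
    while not converged:
        for s in states:
            Q[s] = {}
            for a in range(len(actions)):
                # expected immediate reward plus discounted value of the next state
                Q[s][a] = sum(T_matrix[s][s_prime][a] *
                              (darts.R(s, actions[a]) + gamma * V[0][s_prime])
                              for s_prime in states)
            V[1][s] = max(Q[s].values())
            pi_star[s] = max(Q[s], key=Q[s].get)
        converged = max(abs(V[1][s] - V[0][s]) for s in states) < 1e-6
        for s in states:
            V[0][s] = V[1][s]
    return T_matrix, pi_star, Q, V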
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:
            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            if strategy_idx == 1:
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1
            # print "a", a, "action", action

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # decaying learning rate; 1.0 (not 1) so that Python 2 does not
            # truncate the quotient to zero with integer division
            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

        # print "Game", g, "took", throws, "throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))

    avg = float(num_iterations) / float(num_games)
    return avg
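# Sketches of the bestAction and newQ helpers called above, which are not shown
# in this section; these assume a standard Q-learning update, and the reward
# lookup via darts.get_actions() is an assumption, not the original code.
def bestAction(Q, s):
    # index of the highest-valued action in state s
    return max(Q[s], key=Q[s].get)

def newQ(Q, s, a, s_prime, gamma, lr):
    reward = darts.R(s, darts.get_actions()[a])
    return Q[s][a] + lr * (reward + gamma * max(Q[s_prime].values()) - Q[s][a])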