def newThrow(id, player, totPlay):
    if id == -1:
        # print(throw(0, 1, ran(), ran(), ran(), ran(), ran(), ran()))
        return throw(0, 1, ran(), ran(), ran(), ran(), ran(), ran())
    else:
        # print(throw(id+1, ((player+1)%totPlay)+1, ran(), ran(), ran(), ran(), ran(), ran()))
        return throw(id+1, (player+1) % totPlay, ran(), ran(), ran(), ran(), ran(), ran())
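
# ran() and the throw(...) constructor are used above but not defined in this
# snippet. A plausible stand-in for ran() is a uniform random draw in [0, 1);
# this is an assumption, not the original helper:
import random

def ran():
    return random.random()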
def play(method):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        # A throw that would take the score below zero is a bust and is ignored.
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
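
# A minimal driver sketch for play(); the name run_games and the default game
# count are assumptions, not part of the original code.
def run_games(method, num_games=100):
    total_turns = 0
    for _ in range(num_games):
        total_turns += play(method)
    return float(total_turns) / num_games  # average turns per game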
def modelfree(alpha, gamma, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        # print str(g) + "/" + str(num_games)

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            # a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s

            # Standard Q-learning update, followed by a greedy policy update.
            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max_Q - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations)/float(num_games)
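
# ex_strategy_one is called above but not defined in this snippet. A minimal
# sketch of a decaying epsilon-greedy strategy matching the call signature
# ex_strategy_one(num_iterations, actions, pi_star, s); the decay schedule is
# an assumption, not the original implementation:
import random

def ex_strategy_one(num_iterations, actions, pi_star, s):
    epsilon = 1.0 / (1 + num_iterations)  # explore less as experience grows
    if random.random() < epsilon:
        return random.randint(0, len(actions) - 1)  # explore: random action
    return pi_star[s]                               # exploit: current policy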
def play(method):
    global actions
    actions = get_actions()

    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        prior = score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        modelfree.q_learning(prior, score, get_index(actions, target))
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def play(method, d=None):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        # Optionally record the target chosen at each score and assert that the
        # policy is deterministic: a given score must always map to the same target score.
        if d:
            if d[score] is None:
                d[score] = throw.location_to_score(target)
            else:
                assert d[score] == throw.location_to_score(target)
        # print "Target: wedge", target.wedge, ", ring", target.ring
        # print "Result: wedge", result.wedge, ", ring", result.ring
        # print "Raw Score:", raw_score
        # print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        # else:
        #     print
        #     print "TOO HIGH!"
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
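
# A minimal usage sketch for the optional d argument above: pre-seed a dict
# with None for every reachable score, then check determinism of the learned
# policy across repeated games (the range bound and game count are assumptions).
d = dict((s, None) for s in range(1, throw.START_SCORE + 1))
for _ in range(100):
    play("mdp", d)  # raises AssertionError if a score ever maps to a new target score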
def play(method):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        # if raw_score > score:
        #     update Q[s][a]
        # else:
        #     modelfree.Q_learning(score, target, raw_score)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def play(method):
    score = throw.START_SCORE
    turns = 0
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break
        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    print "WOOHOO! It only took", turns, " turns"
    # end_game(turns)
    return turns
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}
        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        while s > 0:
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            # to_explore = ex_strategy_one(s, num_iterations)

            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)

            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
                # exploit
                a = pi_star[s]
                action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state
            # on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE
            # throws, using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward
                # function and transition probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)

    print "Average turns = ", float(num_iterations)/float(num_games)
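
# ex_strategy_two is called above but not defined in this snippet. A minimal
# sketch matching the (index, action) return shape of the call site; the
# epsilon schedule is an assumption, not the original implementation:
import random

def ex_strategy_two(s, num_iterations, Q, actions):
    epsilon = 1.0 / (1 + num_iterations)
    if random.random() < epsilon:
        a = random.randint(0, len(actions) - 1)   # explore: random action
    else:
        a = max(Q[s], key=lambda k: Q[s][k])      # exploit: greedy in Q
    return a, actions[a]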
def modelbased(gamma, epoch_size, num_games):
    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize V (current and next value estimates)
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        for a in range(len(actions)):
            num_actions[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state
            # on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE
            # throws, using infinite-horizon value iteration.
            if num_iterations % epoch_size == 0:
                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward
                # function and transition probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(
                    gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
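
# modelbased_value_iteration is called above but not defined in this snippet.
# A minimal sketch of infinite-horizon value iteration over the learned model,
# matching the (gamma, T_matrix, pi_star, actions, states, V) signature; the
# convergence threshold and the use of darts.R as the reward are assumptions:
def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    Q = {}
    converged = False
    while not converged:
        for s in states:
            Q[s] = {}
            for a in range(len(actions)):
                # Expected reward plus discounted value of successor states.
                Q[s][a] = darts.R(s, actions[a]) + gamma * sum(
                    T_matrix[s][s_prime][a] * V[0][s_prime] for s_prime in states)
            # Greedy policy and value update from the new Q-values.
            pi_star[s] = max(Q[s], key=lambda k: Q[s][k])
            V[1][s] = Q[s][pi_star[s]]
        # Stop once the value function has (approximately) stabilized.
        converged = all(abs(V[1][s] - V[0][s]) < 1e-4 for s in states)
        for s in states:
            V[0][s] = V[1][s]
    return T_matrix, pi_star, Q, V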
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned
    # a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games
    for g in range(1, num_games + 1):
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:
            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
            if strategy_idx == 1:
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state
            # on this action.
            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Per-(s, a) learning rate; 1.0 forces float division (in Python 2,
            # 1 / n is integer division and would be 0 after the first visit).
            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

        # print "Game", g, "took", throws, "throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))

    avg = float(num_iterations) / float(num_games)
    return avg
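
# newQ and bestAction are called above but not defined in this snippet. Minimal
# sketches consistent with the call sites; using darts.R as the reward and a
# module-level actions list are assumptions based on the other snippets here.
actions = darts.get_actions()

def bestAction(Q, s):
    # Index of the highest-valued action in state s (Q[s] maps index -> value).
    return max(Q[s], key=lambda k: Q[s][k])

def newQ(Q, s, a, s_prime, gamma, lr):
    # One-step Q-learning update: move Q(s, a) toward the TD target.
    target = darts.R(s, actions[a]) + gamma * Q[s_prime][bestAction(Q, s_prime)]
    return Q[s][a] + lr * (target - Q[s][a])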
# Fragment of a Q-learning loop body (enclosing function elided).
# to_explore = ex_strategy_two(num_iterations)
# if to_explore:
#     # explore
#     a = random.randint(0, len(actions)-1)
#     action = actions[a]
# else:
#     # exploit
#     a = pi_star[s]
#     action = actions[a]

# Get result of throw from dart thrower; update score if necessary
loc = throw.throw(action)
s_prime = int(s - throw.location_to_score(loc))
if s_prime < 0:
    s_prime = s

a_prime = q_values[s_prime].index(max(q_values[s_prime]))
action_prime = actions[a_prime]

# Update q value for the action we just performed
q_values[s][a] = q_values[s][a] + learning_rate * (
    darts.R(s, actions[a]) + gamma * q_values[s_prime][a_prime] - q_values[s][a])

# Next state becomes current state
s = s_prime
return
def Q_learning(gamma, alpha, num_games):
    # set these to values that make sense!
    # alpha = .5
    # gamma = .3
    Q = {}
    states = darts.get_states()
    actions = darts.get_actions()
    num_iterations = 0
    num_total_iterations = 1

    # Initialize all the Q values to zero
    for s in states:
        Q[s] = {}
        for a in actions:
            Q[s][a] = 0

    for g in range(1, num_games + 1):
        # print "GAME {}".format(g)

        # run a single game
        s = throw.START_SCORE
        gamethrows = 0
        while s > 0:
            num_total_iterations += 1
            gamethrows += 1

            # The following statements implement alternative exploration-exploitation
            # strategies. Comment out the strategies you wish not to use.
            # to_explore = ex_strategy_one(num_iterations)
            to_explore = ex_strategy_two(num_total_iterations)
            # to_explore = ex_strategy_three(g, num_games)

            action = 0
            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
                # exploit
                num_iterations += 1
                action = lookup_max_a(Q, s, actions)

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            # should reward be based on action or on loc?
            reward = darts.R(s, action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # now we update the Q score table
            oldQ = Q[s][action]
            nextQaction = lookup_max_a(Q, s_prime, actions)
            newQ = oldQ + alpha * (reward + gamma * Q[s_prime][nextQaction] - oldQ)
            Q[s][action] = newQ

            s = s_prime
        # print gamethrows

    # num_iterations counts only exploit throws, so this is a rough estimate.
    print "Average turns = ", float(num_iterations)/float(num_games/2)
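
# lookup_max_a is called above but not defined in this snippet. A minimal
# sketch consistent with its call sites (Q[s] is keyed by actions themselves,
# and the function returns the argmax action, not its index):
def lookup_max_a(Q, s, actions):
    best = actions[0]
    for a in actions:
        if Q[s][a] > Q[s][best]:
            best = a
    return best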