def main():
    # Policy learning: sweep over behaviour-policy exploration rates (emu).
    global numEpisodes
    global returnSum
    global emu
    global epi
    global alpha
    global num_states
    global num_actions
    settings = [(0.001, 0.05, 0.01), (0.001, 0.1, 0.01), (0.001, 0.2, 0.01),
                (0.001, 0.3, 0.01), (0.001, 0.4, 0.01), (0.001, 0.6, 0.01)]
    for i in range(6):
        print "i:", i
        alpha, emu, epi = settings[i]
        print alpha, emu, epi
        for _ in range(numEpisodes):
            if _ % 10000 == 0:
                print "Episode:", _
                print "Average return:", returnSum / (_ + 1)
            s = bj.init()
            rand = np.random.random()
            if rand < emu:
                rand_policy(s)   # behave randomly with probability emu
            else:
                exp_sarsa(s)     # otherwise run an Expected Sarsa episode
        bj.printPolicy(sarsa_policy)
        print "Average return:", returnSum / numEpisodes

        # Deterministic policy evaluation.
        returnSum = 0.0
        numEpisodes = int(math.pow(10, 7))
        for _ in range(numEpisodes):
            if _ % 10000 == 0:
                print "Episode:", _
                print "Average return:", returnSum / (_ + 1)
            s = bj.init()
            deter_policy(s)
        bj.printPolicy(sarsa_policy)
        print "Average return:", returnSum / numEpisodes
        print "alpha, emu, epi, episodes:", alpha, emu, epi, numEpisodes
        info[i] = returnSum / numEpisodes, alpha, emu, epi, numEpisodes
    print info
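# The helpers rand_policy, exp_sarsa, deter_policy, and sarsa_policy are
# defined elsewhere. A minimal sketch of one of them, assuming a global
# action-value table Q and that bj.sample(s, a) returns (reward, next_state)
# with -1 as the terminal state (all assumptions, not part of the original):
def deter_policy(s):
    global returnSum
    G = 0
    while s != -1:
        r, s = bj.sample(s, np.argmax(Q[s]))  # always take the greedy action
        G += r
    returnSum += G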
def qLearning(self):
    # Initialise Q(s, a) with tiny random values to break ties.
    for i in range(1, 181):
        self.q[i][0] = random.random() * 0.00001
        self.q[i][1] = random.random() * 0.00001
    iterations = 0
    returnSum = 0.0
    while iterations < self.MAXITERATIONS:
        s = blackjack.init()
        reward, state = blackjack.sample(s, 1)
        if state == -1:
            returnSum = returnSum + reward
        while state != -1:
            A = self.eGreedy(self.q, state)
            reward, statePrime = self.giveSample(state, A)
            returnSum = returnSum + reward
            if reward == 0 and statePrime != -1:
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
            else:
                newStateMaxQSA = 0
            if self.ALPHA == "Dynamic":
                ALPHA = self.getDynamicAlpha(state, A)
            else:
                ALPHA = self.ALPHA
            # Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
            bracketValue = reward + (self.GAMMA * newStateMaxQSA) - self.q[state][A]
            self.q[state][A] = self.q[state][A] + ALPHA * bracketValue
            state = statePrime
        iterations = iterations + 1
        # Note: despite the flag's name, this reports every 10000 episodes.
        if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
            print("Average Return During Learning Phase at " + str(iterations) + " is " + str(returnSum / iterations))
    print("The Policy learned From the Exploration Phase is : ")
    blackjack.printPolicy(self.printPolicy2)
    return returnSum / self.MAXITERATIONS
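# self.getDynamicAlpha is called above but never shown. A minimal sketch of
# one common choice, a 1/N(s, a) visit-count step size; the counter self.n
# is an assumption and not part of the original class.
def getDynamicAlpha(self, state, action):
    self.n[state][action] = self.n[state][action] + 1
    return 1.0 / self.n[state][action]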
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        S = blackjack.init()
        G = 0
        A = 0
        R, S = blackjack.sample(S, A)
        G += R  # accounts for a natural (instant win or draw)
        # Iterate over each step of the episode.
        while S:
            if np.random.random() > eps:
                # Greedy action with respect to Q1 + Q2.
                A = 0 if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1] else 1
            else:
                A = np.random.randint(0, 2)
            R, nS = blackjack.sample(S, A)
            # Double Q-learning: update Q1 or Q2 with probability 0.5 each.
            prob = np.random.randint(0, 2)
            if not nS:
                # Terminal transition: no bootstrap term.
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (R - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (R - Q2[S][A])
            else:
                # Evaluate the argmax of one table with the other table.
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (R + Q2[nS][np.argmax(Q1[nS])] - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (R + Q1[nS][np.argmax(Q2[nS])] - Q2[S][A])
            S = nS
            G += R
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            blackjack.printPolicy(policy)
            print("Average return so far: ", returnSum / episodeNum)
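# The tables Q1 and Q2 and the policy callback passed to blackjack.printPolicy
# live outside learn(). A minimal sketch of that setup, assuming the same
# 181-state, two-action space as the other snippets here:
import numpy as np

numStates = 181
Q1 = np.zeros((numStates, 2))
Q2 = np.zeros((numStates, 2))

def policy(s):
    # Act greedily with respect to the sum of both estimates.
    return int(np.argmax(Q1[s] + Q2[s]))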
        # (Fragment: per-step Expected Sarsa update inside the episode loop.)
        greedychance = 1 - epsilon
        # Best-valued action at the next state.
        highest = argmax(states[nextstate])
        # Expected Sarsa target:
        # (greedy prob * best next action value) + (explore prob * mean of both action values).
        target = (greedychance * states[nextstate][highest]) \
            + (epsilon * (0.5 * states[nextstate][0] + 0.5 * states[nextstate][1]))
        states[currentState][action] = states[currentState][action] \
            + alpha * (reward + target - states[currentState][action])
        currentState = nextstate
        #print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
    if episodeNum % 10000 == 0 and episodeNum != 0 and episodeNum != learningEpisodes:
        if episodeNum < learningEpisodes:
            # Print the running average while learning.
            print "Average return ", episodeNum, ": ", returnSum / episodeNum
        else:
            # Print the average over only the evaluation (non-learning) episodes.
            print "Average return ", episodeNum, ": ", returnSum / (episodeNum - learningEpisodes)
    if episodeNum == learningEpisodes:
        # Switch from learning to evaluation: use the target policy's epsilon,
        # freeze the values, and restart the return average.
        epsilon = epsilonPi
        alpha = 0
        returnSum = 0.0
        blackjack.printPolicy(returnPolicy)

print "Average return: ", returnSum / evaluationEpisodes
# Print the learned policy.
blackjack.printPolicy(returnPolicy)
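# returnPolicy is referenced above but not defined in this excerpt.
# A minimal sketch, assuming it is simply the greedy policy over the
# states table:
def returnPolicy(state):
    return argmax(states[state])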
def Qlearning(ex):
    # Run one episode of Q-learning; returns the episode's total reward.
    # Initialise the state (deal the first cards).
    s = blackjack.init()
    segma_r = 0
    while s != -1:  # -1 is the terminal state
        a = argmax([ex[s, 0], ex[s, 1]])        # choose argmax_a Q(s, a)
        if random.uniform(0, 1) < epsilon / 2:  # epsilon-greedy: flip with prob eps/2
            a = abs(a - 1)
        # Q(s,a) <- Q(s,a) + alpha * (r + max_a' Q(s',a') - Q(s,a))
        r, sp = blackjack.sample(s, a)          # get the reward and s'
        ex[s, a] += alpha * (r - ex[s, a] + ex[sp, argmax([ex[sp, 0], ex[sp, 1]])])
        s = sp
        segma_r += r
    # Return the episode's total reward.
    return segma_r

for episodeNum in range(numEpisodes):
    epsilon = 1.0 / (episodeNum + 1)  # a stepwise-decaying epsilon worked best
    returnSum += Qlearning(ex)        # one learning episode per call
print("\nAverage return: ", returnSum / numEpisodes)
blackjack.printPolicy(policy)
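# How ex is created is not shown. One way to make the ex[sp, ...] lookup
# safe when sp is the terminal state -1 is an extra all-zero row, so that
# index -1 lands on it; the sizing below is an assumption:
import numpy as np

num_states = 181
ex = np.zeros((num_states + 1, 2))  # ex[-1] is a harmless all-zero terminal row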
        # Update values.
        G += reward
        state = newState
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum / episodeNum
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes

print "Running the deterministic policy"
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    blackjack.init()  # reset the episode; state 0 is the initial state
    state = 0
    while state != -1:
        action = argmax(Q[state])
        reward, newState = blackjack.sample(state, action)
        # Update values.
        G += reward
        state = newState
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum / episodeNum
    returnSum = returnSum + G
print "Average return: ", returnSum / numEpisodes
blackjack.printPolicy(policyPrint)
returnSum = 0.0
for episodeNum in xrange(numEpisodesLearn):
    G = 0.0
    s = blackjack.init()
    while s != -1:
        # Epsilon-greedy behaviour policy.
        a = np.argmax(Q[s]) if np.random.random() > eps_mu \
            else np.random.randint(numActions)
        (r, sp) = blackjack.sample(s, a)
        # Expected Sarsa: bootstrap on the target policy's expected value at sp.
        v_pi = eps_pi * np.average(Q[sp]) + (1 - eps_pi) * np.max(Q[sp])
        Q[s, a] += alpha * (r + gamma * v_pi - Q[s, a])
        G = r + gamma * G  # equals the plain sum of rewards when gamma == 1
        s = sp
    returnSum += G
    ep = episodeNum + 1
    if ep % 10000 == 0:
        print "Episode: ", ep, "Average return: ", returnSum / ep
print "Average return while learning: ", returnSum / numEpisodesLearn

greedy = lambda s: np.argmax(Q[s])
blackjack.printPolicy(greedy)

returnSum = 0.0
for episodeNum in xrange(numEpisodesEval):
    G = 0.0
    s = blackjack.init()
    while s != -1:
        (r, s) = blackjack.sample(s, greedy(s))
        G = r + gamma * G
    returnSum += G
print "Average return on deterministic policy: ", returnSum / numEpisodesEval
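# The v_pi line above is the expected action value of an epsilon-greedy
# target policy: with probability eps_pi it picks uniformly (the np.average
# term), otherwise it acts greedily (the np.max term). A sketch of the setup
# this snippet assumes; every size and constant below is an assumption:
import numpy as np

numStates, numActions = 181, 2
Q = np.zeros((numStates, numActions))
alpha, gamma = 0.001, 1.0
eps_mu, eps_pi = 0.1, 0.0   # behaviour / target exploration rates
numEpisodesLearn = numEpisodesEval = 1000000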
    # (Fragment: one learning episode inside the training loop.)
    blackjack.init()
    state = 0
    return1 = 0
    while state != -1:
        action = policy(state)
        reward, statep = blackjack.sample(state, action)
        # Expected Sarsa update: bootstrap on the expected value of the next state.
        Q[state][action] = Q[state][action] + alpha * (reward + expectedValue(statep) - Q[state][action])
        state = statep
        return1 += reward
    returnSum += return1
    if (episodeNum % 10000) == 0 and episodeNum != 0:
        print "Count =", episodeNum, "Average return: ", returnSum / episodeNum

blackjack.printPolicy(learnedPolicy)
print "Average return: ", float(returnSum) / float(numEpisodes)

returnSumLearned = 0
# Now follow the learned policy and calculate its average return.
for episodeNum in range(numEpisodes):
    blackjack.init()
    state = 0
    return1 = 0
    while state != -1:
        action = learnedPolicy(state)
        reward, statep = blackjack.sample(state, action)
        state = statep
        return1 += reward
    returnSumLearned += return1
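# expectedValue is not shown in this excerpt. A minimal sketch, assuming the
# target policy is epsilon-greedy over two actions and that the terminal
# state -1 has value zero (both assumptions):
def expectedValue(state):
    if state == -1:
        return 0.0
    best = max(Q[state][0], Q[state][1])
    mean = 0.5 * (Q[state][0] + Q[state][1])
    return (1 - epsilon) * best + epsilon * mean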
        # Expected Sarsa update: the target is the expectation of Q(s1, .)
        # under the target policy's action probabilities.
        Q[s, a] = Q[s, a] + alpha * (r + actionProb(epi, 0, s1) * Q[s1, 0]
                                     + actionProb(epi, 1, s1) * Q[s1, 1] - Q[s, a])
        s = s1
        G += r
    returnSum = returnSum + G
    if episodeNum % 10000 == 0:
        print "Episode: ", episodeNum
        print "Average return: ", returnSum / (episodeNum + 1)

# Function for the learned policy.
def learnedPolicy(s):
    return np.argmax(Q[s])

# Printing out the learned policy.
bj.printPolicy(learnedPolicy)

# Following the learned policy deterministically.
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    s = bj.init()
    while s != -1:
        r, s = bj.sample(s, learnedPolicy(s))
        G += r
    returnSum = returnSum + G
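# actionProb gives the target policy's action probabilities and is not
# defined in this excerpt. A minimal sketch for an epsilon-greedy target
# policy over two actions (tie-breaking via np.argmax is an assumption):
def actionProb(epi, a, s):
    greedy = np.argmax(Q[s])
    if a == greedy:
        return 1 - epi + epi / 2.0
    return epi / 2.0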
def policy(state):
    # Act greedily with respect to the sum of both tables.
    Q = Q1 + Q2
    if Q[state, 0] > Q[state, 1]:
        return 0
    else:
        return 1

# Part 2
# Double Q-learning to test the random policy (alpha=0.001, eps=1) for 1000000 episodes.
#learn(0.001, 1, 1000000)
#blackjack.printPolicy(policy)
#value_evaluate = evaluate(1000000)
#print(value_evaluate)

# Double Q-learning to test eps=0.01 and alpha=0.001 for 1000000 episodes.
#learn(0.001, 0.01, 1000000)
#blackjack.printPolicy(policy)
#value_evaluate = evaluate(1000000)
#blackjack.printPolicyToFile(policy)
#print(value_evaluate)

# Part 3
learn(0.001, 0.995, 1000000)
blackjack.printPolicy(policy)
value_evaluate = evaluate(1000000)
#blackjack.printPolicyToFile(policy)
print(value_evaluate)
            #print R
            if Sprime == -1:
                # Terminal transition: no bootstrap term.
                Q[S][A] = Q[S][A] + alpha * (R - Q[S][A])
            else:
                Q[S][A] = Q[S][A] + alpha * (R + gamma * max(Q[Sprime][0], Q[Sprime][1]) - Q[S][A])
            S = Sprime
        if episodeNum == dropEpsilonEpisode:
            #print "=============================END EXPLORING PHASE============================="
            epsilon = 0
        if episodeNum == dropAlpha:
            #print "=============================END LEARNING PHASE============================="
            alpha = 0
        #if episodeNum % 10000 == 0:
        #    print "Current Avg: " + str(returnSum / (episodeNum+1)) + " Ep: " + str(episodeNum)
        returnSum = returnSum + G
    blackjack.printPolicy(showPolicy)
    print ""
    print "Avg Return:" + str(returnSum / numEpisodes)
    printSettings()
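# showPolicy and printSettings are referenced above but not shown.
# Hypothetical sketches, assuming the hyperparameters are module-level:
def showPolicy(s):
    # Greedy action for blackjack.printPolicy.
    return 0 if Q[s][0] >= Q[s][1] else 1

def printSettings():
    print "alpha:", alpha, "epsilon:", epsilon
    print "dropEpsilonEpisode:", dropEpsilonEpisode, "dropAlpha:", dropAlpha
    print "numEpisodes:", numEpisodes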
alpha = 0.001
for episodeNum in range(numEpisodes):
    random.seed(episodeNum)
    # Cumulative reward.
    G = 0
    # Init the game of blackjack and get the initial state.
    s = blackjack.init()
    # Consider each step of the episode.
    while s != -1:  # -1 is terminal
        # Take an epsilon-greedy action at each step of the episode.
        a = getEpsilonGreedyAction(Q, s, epsilon)
        r, sp = blackjack.sample(s, a)
        # Update the action-value function with the Q-learning off-policy update.
        Q[s, a] = Q[s, a] + alpha * (r + max(Q[sp, :]) - Q[s, a])
        G += r
        s = sp
    if not (episodeNum % 10000):
        print("Episode: ", episodeNum, "Return: ", G)
    returnSum = returnSum + G
print("Average return: ", returnSum / numEpisodes)
blackjack.printPolicy(getLearnedPolicy)

# Run the learned policy.
policySum = 0.0
for policyEpisodeNum in range(numEpisodes):
    policySum += runLearnedPolicy()
print("Average learned policy return: ", policySum / numEpisodes)
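# Hypothetical sketches of the helpers this script calls; the names, the
# two-action interface, and terminal state -1 are assumptions based on the
# surrounding snippets:
import numpy as np

def getEpsilonGreedyAction(Q, s, epsilon):
    # With probability epsilon act uniformly at random, otherwise greedily.
    if np.random.random() < epsilon:
        return np.random.randint(Q.shape[1])
    return np.argmax(Q[s, :])

def getLearnedPolicy(s):
    # Greedy policy, used for blackjack.printPolicy.
    return np.argmax(Q[s, :])

def runLearnedPolicy():
    # One episode under the learned greedy policy; returns its total reward.
    G = 0.0
    s = blackjack.init()
    while s != -1:
        r, s = blackjack.sample(s, getLearnedPolicy(s))
        G += r
    return G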