def main():  # policy learning
    """Experiment driver: sweep behaviour-policy exploration rates.

    For six settings of (alpha, emu, epi) -- only emu, the behaviour-policy
    epsilon, varies -- learn a policy (Expected Sarsa helpers), then
    re-evaluate the learned deterministic policy, recording results in
    the module-level `info` table.

    NOTE(review): relies on module-level names not visible here
    (bj, np, math, rand_policy, exp_sarsa, deter_policy, sarsa_policy,
    info, returnSum, numEpisodes) -- confirm they are defined elsewhere.
    """
    global numEpisodes
    global returnSum
    global emu
    global epi
    global alpha
    global num_states
    global num_actions
    for i in range(6):
        print "i:", i
        # Experiment grid: fixed step size and target epsilon, varying emu.
        if i == 0: alpha, emu, epi = 0.001, 0.05, 0.01
        if i == 1: alpha, emu, epi = 0.001, 0.1, 0.01
        if i == 2: alpha, emu, epi = 0.001, 0.2, 0.01
        if i == 3: alpha, emu, epi = 0.001, 0.3, 0.01
        if i == 4: alpha, emu, epi = 0.001, 0.4, 0.01
        if i == 5: alpha, emu, epi = 0.001, 0.6, 0.01
        print alpha, emu, epi
        # Learning phase: with probability emu behave randomly, otherwise
        # follow/update the Expected Sarsa policy.
        for _ in range(numEpisodes):
            if _ % 10000 == 0:
                print "Episode:", _
                print "Average return:", returnSum / (_ + 1)
            s = bj.init()
            rand = np.random.random()
            if rand < emu:
                # print "rand:",rand,"policy: rand"
                rand_policy(s)
            else:
                exp_sarsa(s)
                # print "rand:",rand,"policy: ExpectedSarsa"
        bj.printPolicy(sarsa_policy)
        print "Average return:", returnSum / numEpisodes
        # Deterministic-policy evaluation phase.
        # NOTE(review): numEpisodes is overwritten to 10^7 here, so every
        # iteration after i == 0 also LEARNS for 10^7 episodes -- confirm
        # this is intended.
        returnSum = 0.0
        numEpisodes = int(math.pow(10, 7))
        for _ in range(numEpisodes):
            if _ % 10000 == 0:
                print "Episode:", _
                print "Average return:", returnSum / (_ + 1)
            s = bj.init()
            deter_policy(s)
        bj.printPolicy(sarsa_policy)
        print "Average return:", returnSum / numEpisodes
        print "alpha, emu, epi, episodes:", alpha, emu, epi, numEpisodes
        # Record (avg return, hyperparameters) for this setting.
        info[i] = returnSum / numEpisodes, alpha, emu, epi, numEpisodes
    print info
def showOneGame():
    """Play a single game of blackjack following a fixed move script.

    Takes the scripted actions [0, 1, 0] one per turn until the terminal
    state (-1) is reached, printing a trace line for every transition.
    """
    scripted_actions = [0, 1, 0]
    turn = 0
    state = blackjack.init()
    while state != -1:  # -1 is terminal
        action = scripted_actions[turn]
        reward, next_state = blackjack.sample(state, action)
        print("turn %d: s %d a %d -> r %d sp %d "%(turn,state,action,reward,next_state),end="")
        print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        turn += 1
        state = next_state
    return None
def runLearnedPolicy():
    """Run one blackjack episode under the learned policy.

    Follows getLearnedPolicy at every state until the terminal state
    (-1) and returns the total (undiscounted) reward collected.
    """
    episode_return = 0
    state = blackjack.init()
    # Step until the game ends, always acting per the learned policy.
    while state != -1:  # -1 is terminal
        reward, next_state = blackjack.sample(state, getLearnedPolicy(state))
        episode_return += reward
        state = next_state
    return episode_return
def showOneGame():
    """Play one game of blackjack with uniformly random actions.

    Prints a trace line per turn and returns the episode's total
    (undiscounted) reward.
    """
    total_reward = 0
    turn = 0
    state = blackjack.init()
    while state != -1:  # -1 is terminal
        action = randint(0, 1)  # uniform over {hit, stand}
        reward, next_state = blackjack.sample(state, action)
        print("turn %d: s %d a %d -> r %d sp %d "%(turn,state,action,reward,next_state),end="")
        print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        turn += 1
        state = next_state
        total_reward += reward
    return total_reward
def showOneGame():
    """Play one game of blackjack and return its total reward.

    The first turn always uses action 0; every subsequent turn's action
    index is redrawn uniformly from {0, 1}. Trace printing is kept but
    commented out, as in the original.
    """
    action_table = [0, 1]
    turn = 0
    total_reward = 0
    state = blackjack.init()
    while state != -1:  # -1 is terminal
        action = action_table[turn]
        reward, next_state = blackjack.sample(state, action)
        #print("turn %d: s %d a %d -> r %d sp %d "%(turn,state,action,reward,next_state),end="")
        #print("\t Player Sum: %d Dealer Card: %d Usable Ace: %d"%(blackjack.playerSum,blackjack.dealerCard, blackjack.usableAce))
        state = next_state
        turn = random.randint(0, 1)  # pick next turn's table index at random
        total_reward += reward
    return total_reward
def Qlearning(ex):
    """Run one Q-learning episode of blackjack, updating `ex` in place.

    ex: action-value table indexed as ex[state, action] with two actions.
    Returns the episode's total (undiscounted) reward.

    Uses module-level `epsilon`, `alpha`, `argmax`, `random`, and the
    `blackjack` environment -- all defined elsewhere in the file.

    NOTE(review): when sp is the terminal state (-1), ex[sp, ...] reads
    the LAST row of the table via negative indexing rather than a zero
    terminal value -- confirm that row is reserved and kept at 0.
    """
    # Initial the state (deal the first card)
    s = blackjack.init()
    segma_r = 0  # running sum of rewards for this episode
    while s != -1:  # -1 is terminal
        a = argmax([ex[s,0], ex[s,1]])  # greedy action: argmax_a Q(s,a)
        if random.uniform(0,1) < epsilon/2:  # e-greedy exploration
            a = abs(a-1)  # flip 0 <-> 1 to take the other action
        r, sp = blackjack.sample(s,a)  # get the reward and s'
        # Q(s,a) <- Q(s,a) + alpha * (r + max_a' Q(sp,a') - Q(s,a))
        ex[s,a] += alpha * (r - ex[s,a] + ex[sp,argmax([ex[sp,0],ex[sp,1]])])
        s = sp; segma_r += r
    # Return the episode's accumulated reward
    return segma_r
def qLearning(self):
    """Q-learning training loop over self.MAXITERATIONS episodes.

    Re-initialises self.q with tiny random values, then plays episodes
    acting epsilon-greedily (self.eGreedy) and applying the standard
    Q-learning update with either a fixed or dynamic step size.
    Prints the learned policy and returns the average return observed
    during learning.
    """
    # Seed Q with tiny random values so early argmax ties break arbitrarily.
    for i in range(1, 181):
        randomValue1 = random.random()
        randomValue2 = random.random()
        randomValue1 = randomValue1 * 0.00001
        randomValue2 = randomValue2 * 0.00001
        self.q[i][0] = randomValue1
        self.q[i][1] = randomValue2
    iterations = 0
    returnSum = 0
    while iterations < self.MAXITERATIONS:
        s = blackjack.init()
        # First action is forced to 1; NOTE(review): confirm this forced
        # first move is intended rather than an eGreedy choice.
        reward, state = blackjack.sample(s, 1)
        if state == -1:
            # Episode ended on the forced first action; count its reward.
            returnSum = returnSum + reward
        while state != -1:
            A = self.eGreedy(self.q, state)  # epsilon-greedy action
            reward, statePrime = self.giveSample(state, A)
            returnSum = returnSum + reward
            if reward == 0 and statePrime != -1:
                # Bootstrap from the best action value at the next state.
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
            else:
                # Terminal (or rewarded) transition: no bootstrap term.
                newStateMaxQSA = 0
            if self.ALPHA == "Dynamic":
                #print("YES")
                ALPHA = self.getDynamicAlpha(state, A)  # per-(s,a) step size
            else:
                ALPHA = self.ALPHA  # fixed step size
            # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
            bracketValue = reward + (self.GAMMA * newStateMaxQSA) - self.q[state][A]
            self.q[state][A] = self.q[state][A] + ALPHA * (bracketValue)
            state = statePrime
        iterations = iterations + 1
        if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
            print("Average Return During Learning Phase at "+str(iterations)+" is "+str(returnSum/iterations))
    print("The Policy learned From the Exploration Phase is : ")
    blackjack.printPolicy(self.printPolicy2)
    return returnSum/self.MAXITERATIONS
def bjrandomPolicy(numEpisodes=10000):
    """Evaluate the equiprobable-random policy on blackjack.

    Plays `numEpisodes` complete games, choosing action 0 or 1 with
    equal probability at every step, printing each episode's return and
    the overall average (undiscounted, gamma = 1).

    numEpisodes: number of games to play (default 10000).
    Returns None; results are reported via print.
    """
    returnSum = 0.0
    for episodeNum in range(numEpisodes):
        G = 0
        s = blackjack.init()
        while s != -1:  # -1 is terminal
            # randint returns 0 or 1 with equal probability.
            a = random.randint(0, 1)
            r, sp = blackjack.sample(s, a)
            # Bug fix: accumulate the reward instead of overwriting G, so
            # intermediate (non-terminal) rewards are not dropped.
            G += r
            s = sp
        print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
    print("Average return: ", returnSum/numEpisodes)
    return None
def onlyExploitQ(self):
    """Evaluate the learned Q-table with a purely greedy policy.

    Plays self.MAXITERATIONS games always taking the max-Q action
    (no exploration, no learning updates) and returns the average
    per-episode return.
    """
    iterations = 0
    returnSum = 0
    while iterations < self.MAXITERATIONS:
        s = blackjack.init()
        # First action is forced to 1, mirroring the learning phase;
        # NOTE(review): confirm this is intended during evaluation too.
        reward, state = blackjack.sample(s, 1)
        if state == -1:
            # Episode ended on the forced first action; count its reward.
            returnSum = returnSum + reward
        while state != -1:
            A = self.maxAction(self.q, state)  # greedy action only
            reward, statePrime = self.giveSample(state, A)
            returnSum = returnSum + reward
            state = statePrime
        iterations = iterations + 1
        if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
            print("Average Return During Exploitation Phase at "+str(iterations)+" is "+str(returnSum/iterations))
    return returnSum/self.MAXITERATIONS
from pylab import *
from numpy import *


def random_policy(list_of_actions):
    """Return an action drawn uniformly at random from list_of_actions.

    Uses `choice`, pulled in by the star import (numpy/pylab).
    """
    next_action = choice(list_of_actions)
    #print next_action
    return next_action


# Monte-Carlo evaluation of the equiprobable-random policy over 10000 games.
numEpisodes = 10000
returnSum = 0.0
actions = [0, 1]
for episodeNum in range(numEpisodes):
    s = blackjack.init(); G = 0
    while (s != -1):  # -1 is terminal
        a = random_policy(actions)
        result = blackjack.sample(s, a)
        #print blackjack.sample (0, 1)
        G = G + result[0]  # accumulate reward
        s = result[1]      # advance to next state
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
# NOTE(review): neither printPolicy nor Q is defined in this snippet;
# presumably provided elsewhere (e.g. the blackjack module) -- confirm.
printPolicy(Q)
if (state == -1): return 0 elif (testnumber <= epsilonpi): return (0.5 * Q[state][0] + 0.5 * Q[state][1]) else: return Q[state][np.argmax(Q[state])] """ Experiments: First learn policy and calculate average return """ for episodeNum in range(numEpisodes): blackjack.init() state = 0 return1 = 0 while (state != -1): action = policy(state) reward, statep = blackjack.sample(state, action) Q[state][action] = Q[state][action] + alpha * ( reward + expectedValue(statep) - Q[state][action]) state = statep return1 += reward returnSum += return1 if (((episodeNum % 10000) == 0) and (episodeNum != 0)): print "Count =", episodeNum, "Average return: ", returnSum / ( episodeNum) blackjack.printPolicy(learnedPolicy)
import blackjack
from pylab import *
from random import *

# Monte-Carlo evaluation of a uniformly random blackjack policy
# over 2000 episodes; prints per-episode returns and the average.
numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0  # return (undiscounted reward sum) for this episode
    black = blackjack.init()
    action = [0, 1]
    while black != -1:  # -1 is terminal
        num = randint(0, 1)  # pick an action index with equal probability
        n, black = blackjack.sample(black, action[num])
        G += n
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
numEpisodes = 1000000 e = 1 alpha = 0.001 returnSum = 0.0 Q = [[0,0]]*183 #for i in Q: # i[0],i[1]= uniform(0,1),uniform(0,1) for episodeNum in range(numEpisodes): G = 0 # my code starts here a=0 S =blackjack.init() t = Q[S] if (e > randint(0, 2)): R,S_ =blackjack.sample(S,randint(0,2)) else: if t[0]>t[1]: R,S_ = blackjack.sample(S,0) a=0 else: R,S_ = blackjack.sample(S,1) a=1 while (S_!=(-1)): Q[S][a] = Q[S][a] + alpha*(R + Q[S_][0]+Q[S_][1]-Q[S][a]) S=S_ t = Q[S] if (e > randint(0, 2)):
import blackjack
from pylab import *

# Monte-Carlo evaluation of a random blackjack policy over 2000 episodes.
numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # First transition: random action from the initial state.
    # NOTE(review): randint here comes from the pylab star import
    # (numpy), where randint(0, 2) draws from {0, 1} -- confirm.
    R, S = blackjack.sample(blackjack.init(), (randint(0, 2)))
    if (S == (-1)):
        # NOTE(review): this overwrites the actual terminal reward with 1
        # when the game ends immediately -- looks like a bug; confirm.
        R = 1
    while (S != (-1)):
        R, S = blackjack.sample(S, (randint(0, 2)))
    # NOTE(review): G keeps only the LAST reward, not the sum; with a
    # single terminal reward this equals the return, but verify.
    G = R
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
epsilonMu = 0.2   # behaviour-policy exploration rate
epsilonPi = 0.0   # target-policy exploration rate
alpha = 0.0005    # learning step size
discount = 1      # gamma (undiscounted)
epsilon = epsilonMu


def returnPolicy(state):
    """Greedy action at `state` under the current value table `states`."""
    return n.argmax(states[state])


returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # Start a new game of blackjack
    currentState = blackjack.init()
    # Continue this game until the terminal state is reached
    while (currentState != -1):
        # Get a random number in [0, 1); explore if below epsilon.
        rnumber = n.random.rand()
        if rnumber < epsilon:
            action = n.random.randint(2)
        else:
            # If not exploring, pick the highest-valued action at state S
            action = returnPolicy(currentState)
        # Sample the environment: (reward, next state)
        next = blackjack.sample(currentState, action)
        reward = next[0]
        nextstate = next[1]
        # NOTE(review): snippet appears truncated here -- currentState is
        # never advanced, so this loop cannot terminate as shown; the
        # value update and state transition presumably follow in the
        # full file.
import blackjack
from pylab import *

# Average return of the equiprobable random policy over 2000 episodes.
numEpisodes = 2000
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0  # return (undiscounted reward sum) for this episode
    currentstate = blackjack.init()
    while (currentstate != -1):  # -1 is terminal
        action = randint(2)  # randomly pick the action (0 or 1)
        next = blackjack.sample(currentstate, action)
        G = G + next[0]         # accumulate reward
        currentstate = next[1]  # advance to next state
    print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
#Function to find the probability of a given action given the policy def actionProb(e,a,s): if np.argmax(Q[s]) == a: return 1 - e + e/num_actions else: return e/num_actions #Learning the policy through the Expected Sarsa algorithm Q = 0.00001*np.random.rand(num_states,num_actions) for episodeNum in range(numEpisodes): G = 0 s = bj.init() while s != -1: a = np.random.choice(2, p=[actionProb(emu,0,s),actionProb(emu,1,s)]) r, s1 = bj.sample(s,a) Q[s,a] = Q[s,a] + alpha*(r + actionProb(epi,0,s1)*Q[s1,0] + actionProb(epi,1,s1)*Q[s1,1] - Q[s,a]) s = s1 G+=r returnSum = returnSum + G if episodeNum%10000 == 0: print "Episode: ", episodeNum print "Average return: ", returnSum/(episodeNum+1) #Function for the learned policy def learnedPolicy(s):
# NOTE(review): evaluationEpisodes and learningEpisodes are defined
# elsewhere in the file -- confirm before running this fragment.
numEpisodes = evaluationEpisodes + learningEpisodes
# Tiny random init of the 181-state x 2-action value table so argmax
# ties break arbitrarily.
states = 0.00001*n.random.rand(181,2)
epsilonMu = 0.2   # behaviour-policy exploration rate
epsilonPi = 0.0   # target-policy exploration rate
alpha = 0.0005    # learning step size
discount = 1      # gamma (undiscounted)
epsilon = epsilonMu


def returnPolicy(state):
    """Greedy action at `state` under the current value table `states`."""
    return n.argmax(states[state])


returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    # Start a new game of blackjack
    currentState = blackjack.init()
    # Continue this game until the terminal state is reached
    while (currentState != -1):
        # Get a random number in [0, 1); explore if below epsilon.
        rnumber = n.random.rand()
        if rnumber < epsilon:
            action = n.random.randint(2)
        else:
            # If not exploring, pick the highest-valued action at state S
            action = returnPolicy(currentState)
        # Sample the environment: (reward, next state)
        next = blackjack.sample(currentState, action)
        reward = next[0]
        nextstate = next[1]
        # NOTE(review): snippet appears truncated here -- currentState is
        # never advanced, so this loop cannot terminate as shown; the
        # value update and state transition presumably follow in the
        # full file.
if (state == -1): return 0 elif (testnumber <= epsilonpi): return (0.5*Q[state][0] + 0.5*Q[state][1]) else: return Q[state][np.argmax(Q[state])] """ Experiments: First learn policy and calculate average return """ for episodeNum in range(numEpisodes): blackjack.init() state=0 return1=0 while (state != -1): action = policy(state) reward,statep=blackjack.sample(state,action) Q[state][action] = Q[state][action] + alpha*(reward + expectedValue(statep) - Q[state][action]) state = statep return1+=reward returnSum+=return1 if (((episodeNum % 10000) == 0) and (episodeNum != 0)): print "Count =",episodeNum,"Average return: ", returnSum/(episodeNum)