Example #1
def main():
#policy learning	
	global numEpisodes 
	global returnSum 
	global emu
	global epi
	global alpha
	global num_states
	global num_actions

	# (alpha, emu, epi) settings swept in this experiment
	params=[(0.001,0.05,0.01),(0.001,0.1,0.01),(0.001,0.2,0.01),
		(0.001,0.3,0.01),(0.001,0.4,0.01),(0.001,0.6,0.01)]
	for i in range(6):
		print "i:",i
		alpha,emu,epi=params[i]
		print alpha,emu,epi
		for _ in range(numEpisodes):
			if _%10000==0:
				print "Episode:",_
				print "Average return:",returnSum/(_+1)
			s=bj.init()
			rand=np.random.random()
			if rand<emu:
		#		print "rand:",rand,"policy: rand"
				rand_policy(s)
			else:
				exp_sarsa(s)
		#		print "rand:",rand,"policy: ExpectedSarsa"
		
		bj.printPolicy(sarsa_policy)
		print "Average return:",returnSum/numEpisodes
		
		#deterministic policy
		returnSum=0.0
		numEpisodes=int(math.pow(10,7))
		for _ in range(numEpisodes):
			if _%10000==0:
				print "Episode:",_
				print "Average return:",returnSum/(_+1)
			s=bj.init()
			deter_policy(s)
		bj.printPolicy(sarsa_policy)
		print "Average return:",returnSum/numEpisodes
		print "alpha, emu, epi, episodes:",alpha,emu,epi,numEpisodes
		info[i]=returnSum/numEpisodes,alpha,emu,epi,numEpisodes

	print info	
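Example #1 calls helper routines (rand_policy, exp_sarsa, deter_policy, sarsa_policy) that are defined elsewhere in that project. A minimal sketch of what they might look like, assuming a global action-value table Q[state][action], the globals alpha/epi/returnSum set in main(), and the course blackjack module where bj.sample(s, a) returns (reward, next_state) and -1 marks the terminal state; the expected_value helper is introduced here only for clarity:

import numpy as np
import blackjack as bj

Q = np.zeros((181, 2))   # hypothetical action-value table: 181 states x {stand, hit}
returnSum = 0.0          # accumulated by the episode helpers, reset in main()

def sarsa_policy(s):
    # greedy action with respect to the learned values
    return int(np.argmax(Q[s]))

def expected_value(s):
    # expected value of state s under the epsilon-greedy (epi) target policy
    if s == -1:
        return 0.0
    return (1.0 - epi) * np.max(Q[s]) + epi * np.mean(Q[s])

def exp_sarsa(s):
    # one episode of Expected Sarsa with an epsilon-greedy behaviour policy
    global returnSum
    while s != -1:
        a = sarsa_policy(s) if np.random.random() > epi else np.random.randint(2)
        r, sp = bj.sample(s, a)
        Q[s][a] += alpha * (r + expected_value(sp) - Q[s][a])
        returnSum += r
        s = sp

def rand_policy(s):
    # one episode choosing actions uniformly at random, same update rule
    global returnSum
    while s != -1:
        a = np.random.randint(2)
        r, sp = bj.sample(s, a)
        Q[s][a] += alpha * (r + expected_value(sp) - Q[s][a])
        returnSum += r
        s = sp

def deter_policy(s):
    # one greedy episode; only the return is accumulated, no learning
    global returnSum
    while s != -1:
        r, s = bj.sample(s, sarsa_policy(s))
        returnSum += r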
Example #2
        def qLearning(self):
          for i in range(1,181):
            randomValue1 = random.random()
            randomValue2 = random.random()
            
            randomValue1 = randomValue1 * 0.00001
            randomValue2 = randomValue2 * 0.00001
            self.q[i][0] = randomValue1
            self.q[i][1] = randomValue2
          
          iterations = 0
          returnSum = 0
          while iterations < self.MAXITERATIONS:      
            s = blackjack.init()
            reward, state = blackjack.sample(s,1)
            if state == -1:
              returnSum = returnSum+reward
            while state != -1:
              A = self.eGreedy(self.q,state)
              reward, statePrime = self.giveSample(state, A)
              returnSum = returnSum + reward
              if reward == 0 and statePrime != -1:
                theMaxAction = self.maxAction(self.q, statePrime)
                newStateMaxQSA = self.q[statePrime][theMaxAction]
              else:
                newStateMaxQSA = 0
              
              if self.ALPHA == "Dynamic":
                #print("YES")
                ALPHA = self.getDynamicAlpha(state,A)
              else:
                ALPHA = self.ALPHA

              bracketValue = reward+(self.GAMMA*newStateMaxQSA)-self.q[state][A]
              self.q[state][A] = self.q[state][A]+ALPHA*(bracketValue)  
              state = statePrime
            
            iterations = iterations + 1
            if self.printEveryOneThousandEpisodes and iterations % 10000 == 0:
                print("Average Return During Learning Phase at "+str(iterations)+" is "+str(returnSum/iterations))

          
          print("The Policy learned From the Exploration Phase is : ")
          blackjack.printPolicy(self.printPolicy2)
          return returnSum/self.MAXITERATIONS
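The qLearning method above relies on eGreedy, maxAction and giveSample, which are other methods of the same class and are not shown. A rough sketch of what they could look like, assuming the class keeps its exploration rate in self.EPSILON (the class name and that attribute are illustrative, not the original code):

import random
import blackjack

class QLearningAgent(object):
    def maxAction(self, q, state):
        # index of the larger action value; ties go to action 0
        return 0 if q[state][0] >= q[state][1] else 1

    def eGreedy(self, q, state):
        # explore with probability self.EPSILON, otherwise act greedily
        if random.random() < self.EPSILON:
            return random.randint(0, 1)
        return self.maxAction(q, state)

    def giveSample(self, state, action):
        # thin wrapper around the environment sampler
        return blackjack.sample(state, action)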
Example #3
def learn(alpha, eps, numTrainingEpisodes):
    returnSum = 0.0
    for episodeNum in range(numTrainingEpisodes):
        S = blackjack.init()
        G = 0
        A = 0
        R, S = blackjack.sample(S, A)
        G += R  # ACCOUNTS FOR THE NATURAL (INSTANT WIN OR DRAW)

        # iterate for each step of the episode
        while S:
            if np.random.random() > eps:
                if Q1[S][0] + Q2[S][0] >= Q1[S][1] + Q2[S][1]:
                    A = 0
                    R, nS = blackjack.sample(S, A)
                elif Q1[S][0] + Q2[S][0] < Q1[S][1] + Q2[S][1]:
                    A = 1
                    R, nS = blackjack.sample(S, A)
            else:
                A = np.random.randint(0, 2)
                R, nS = blackjack.sample(S, A)

            # 0.5 probability of doing Q1 or Q2
            prob = np.random.randint(0, 2)
            if not nS:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (R - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (R - Q2[S][A])
            else:
                if prob == 1:
                    Q1[S][A] = Q1[S][A] + alpha * (
                        R + Q2[nS][np.argmax(Q1, 1)[nS]] - Q1[S][A])
                else:
                    Q2[S][A] = Q2[S][A] + alpha * (
                        R + Q1[nS][np.argmax(Q2, 1)[nS]] - Q2[S][A])
            S = nS
            G += R
        #print("Episode: ", episodeNum, "Return: ", G)
        returnSum = returnSum + G
        if episodeNum % 10000 == 0 and episodeNum != 0:
            blackjack.printPolicy(policy)
            print("Average return so far: ", returnSum / episodeNum)
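blackjack.printPolicy expects a function that maps a state to an action. For double Q-learning the usual choice is the greedy policy over the combined tables; a minimal sketch (Q1 and Q2 are the module-level tables used by learn above):

import numpy as np

def policy(state):
    # greedy with respect to Q1 + Q2
    return int(np.argmax(Q1[state] + Q2[state]))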
Example #4
        greedychance = 1-epsilon
        
        #Get best value at the next state
        highest = argmax(states[nextstate])
        
        #Expected sarsa calculation (greedy * best_next_state_action) + (explore * (0.5*next_state_action1 + 0.5*next_state_action2))
        target = (greedychance * states[nextstate][highest]) + (epsilon * (0.5*states[nextstate][0] + 0.5*states[nextstate][1]))
        states[currentState][action] = states[currentState][action] + alpha * (reward + target - states[currentState][action]) 
            
        currentState = nextstate
            
    #print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
    if(episodeNum%10000 == 0 and episodeNum != 0 and episodeNum != learningEpisodes):
        if episodeNum < learningEpisodes:
           #Print the running average while learning 
           print "Average return ",episodeNum,": ", returnSum/episodeNum
        else:
           #Print the average of just the episodes used in evaluation mode (non-learning)
           print "Average return ",episodeNum,": ", returnSum/(episodeNum-learningEpisodes)

    if episodeNum == learningEpisodes: 
        epsilon = epsilonPi
        alpha = 0
        returnSum = 0.0
        blackjack.printPolicy(returnPolicy)
    
print "Average return: ", returnSum/evaluationEpisodes
#Print the policy
blackjack.printPolicy(returnPolicy)
Example #5
  # Initialize the state (deal the first card)
  s=blackjack.init()
  segma_r = 0
  #for episodeNum in range(numEpisodes):
  while s!=-1:                                      # -1 is terminal
    
    a = argmax([ex[s,0], ex[s,1]])                  # Choose argmax_a Q(s,a)
    if random.uniform(0,1) < epsilon/2:             # e-greedy exploration
      a = abs(a-1)                                  # flip to the other action

    r,sp=blackjack.sample(s,a)                      # Get the reward and s'
    # Q(s,a) <- Q(s,a) + alpha * (r + max_a' Q(s',a') - Q(s,a))
    if sp == -1:                                    # terminal state: no bootstrap term
      ex[s,a] += alpha * (r - ex[s,a])
    else:
      ex[s,a] += alpha * (r - ex[s,a] + ex[sp,argmax([ex[sp,0],ex[sp,1]])])
    s=sp; segma_r += r                              # accumulate reward, move to next state
  return segma_r  
    #epsilon = 1/(episodeNum+1)
    #returnSum += segma_r
  #return returnSum,s                                   

for episodeNum in range(numEpisodes):
  epsilon = 1/(episodeNum+1)                        # Stepwise alternated epsilon is best
  returnSum += Qlearning(ex)                        # Update for each episode


#def testfunction(policy):
#Qlearning(ex,epsilon,alpha,numEpisodes,returnSum) 

print("\nAverage return: ", returnSum/numEpisodes)
blackjack.printPolicy(policy)
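This fragment assumes module-level setup that is not shown: the action-value table ex, the hyperparameters, and the policy callback passed to printPolicy. A sketch of that setup with illustrative values (the state count of 181 is an assumption about the blackjack module):

import random
import numpy as np
from numpy import argmax
import blackjack

numEpisodes = 1000000
alpha = 0.001
epsilon = 1.0                     # overwritten each episode by 1/(episodeNum+1)
returnSum = 0.0
ex = np.zeros((181, 2))           # Q(s, a): assumed 181 states, 2 actions

def policy(s):
    # greedy policy over the learned values, for blackjack.printPolicy
    return int(argmax([ex[s, 0], ex[s, 1]]))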
        
Example #6
        #update values
        G+= reward
        state = newState
         
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum/(episodeNum)
    returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes

print "Running the deterministic policy"
returnSum = 0.0
for episodeNum in range(numEpisodes):
    G = 0
    blackjack.init()    # deal a new hand before each evaluation episode
    state = 0
    while state != -1:
        action = argmax(Q[state])
        
        result = blackjack.sample(state,action)
        reward = result[0]
        newState = result[1]
        
        #update values
        G+= reward
        state = newState
    if episodeNum % 10000 == 0 and episodeNum != 0:
        print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum/(episodeNum)
    returnSum = returnSum + G
print "Average return: ", returnSum/(numEpisodes)
blackjack.printPolicy(policyPrint) 
Example #7
returnSum = 0.0
for episodeNum in xrange(numEpisodesLearn):
    G = 0.0
    s = blackjack.init()
    while (s != -1):
        a = np.argmax(Q[s]) if np.random.random() > eps_mu \
            else np.random.randint(numActions)
        (r, sp) = blackjack.sample(s, a)
        # expected value of s' under the eps_pi-greedy target policy (zero at terminal)
        v_pi = 0.0 if sp == -1 else \
            eps_pi * np.average(Q[sp]) + (1 - eps_pi) * np.max(Q[sp])
        Q[s, a] += alpha * (r + gamma * v_pi - Q[s, a])
        G = r + gamma * G
        s = sp
    returnSum += G
    ep = episodeNum + 1 
    if (ep % 10000 == 0):
        print "Episode: ", ep, "Average return: ", returnSum / ep
print "Average return while learning: ", returnSum / numEpisodesLearn

greedy = lambda s: np.argmax(Q[s])
blackjack.printPolicy(greedy)

returnSum = 0.0
for episodeNum in xrange(numEpisodesEval):
    G = 0.0
    s = blackjack.init()
    while (s != -1):
        (r, s) = blackjack.sample(s, greedy(s))
        G = r + gamma * G
    returnSum += G
print "Average return on deterministic policy: ", returnSum / numEpisodesEval
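For completeness, a sketch of the setup this example assumes; the names match the code above, the values are illustrative only:

import numpy as np
import blackjack

numStates = 181                   # assumed size of the blackjack state space
numActions = 2
Q = np.zeros((numStates, numActions))

alpha = 0.001                     # step size
gamma = 1.0                       # undiscounted episodic task
eps_mu = 0.1                      # exploration rate of the behaviour policy
eps_pi = 0.05                     # exploration rate assumed for the target policy
numEpisodesLearn = 1000000
numEpisodesEval = 1000000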
Example #8
	state=0
	return1=0
	while (state != -1):
		action = policy(state)
		reward,statep=blackjack.sample(state,action) 
		Q[state][action] = Q[state][action] + alpha*(reward + expectedValue(statep) - Q[state][action])
		state = statep
		return1+=reward
	returnSum+=return1
	if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
		print "Count =",episodeNum,"Average return: ", returnSum/(episodeNum)
	



blackjack.printPolicy(learnedPolicy)
print "Average return: ", float(returnSum)/float(numEpisodes)
returnSumLearned=0

"""
Now use learned policy and calculate average return
"""
for episodeNum in range(numEpisodes):
	blackjack.init()
	state=0
	return1=0
	while (state != -1):
		action = learnedPolicy(state)
		reward,statep=blackjack.sample(state,action) 
		state = statep+0
		return1+=reward
	returnSumLearned+=return1
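The update above bootstraps from expectedValue(statep), which is not shown. A plausible sketch: the expected action value of the next state under an epsilon-greedy policy, returning zero for the terminal state (epsilon and Q are the module-level names used above; the body is an assumption):

import numpy as np

def expectedValue(state):
    if state == -1:                        # terminal state has value 0
        return 0.0
    numActions = len(Q[state])
    greedy = int(np.argmax(Q[state]))
    expected = 0.0
    for a in range(numActions):
        prob = epsilon / numActions        # exploration share
        if a == greedy:
            prob += 1.0 - epsilon          # plus the greedy share
        expected += prob * Q[state][a]
    return expected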
Example #9
        Q[s,a] = Q[s,a] + alpha*(r + actionProb(epi,0,s1)*Q[s1,0] + actionProb(epi,1,s1)*Q[s1,1] - Q[s,a])
        s = s1
        G+=r

    returnSum = returnSum + G

    if episodeNum%10000 == 0:
        print "Episode: ", episodeNum
        print "Average return: ", returnSum/(episodeNum+1)

#Function for the learned policy
def learnedPolicy(s):
    return np.argmax(Q[s])

#Printing out the learned policy
bj.printPolicy(learnedPolicy)


#Following the learned policy deterministically
returnSum = 0.0

for episodeNum in range(numEpisodes):
    G = 0

    s = bj.init()
    while s != -1:
        r, s = bj.sample(s,learnedPolicy(s))
        G+=r

    returnSum = returnSum + G
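The expected-Sarsa update at the top of this example weights Q[s1, a] by actionProb(epi, a, s1), which is not shown. A sketch of a matching helper: the epsilon-greedy probability of each action, returning 0 at the terminal state so the bootstrap term vanishes (the terminal handling is an assumption):

import numpy as np

def actionProb(epi, a, s1):
    if s1 == -1:                      # terminal: kill the bootstrap term
        return 0.0
    greedy = int(np.argmax(Q[s1]))
    if a == greedy:
        return 1.0 - epi + epi / 2.0  # greedy action keeps most of the probability mass
    return epi / 2.0                  # the other action gets half the exploration mass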
Example #10
def policy(state):
    Q = Q1 + Q2
    if Q[state, 0] > Q[state, 1]:
        return 0
    else:
        return 1


#part 2
#Double Q-learning with a fully random behaviour policy (alpha=0.001, eps=1) for 1000000 episodes.

#learn(0.001, 1, 1000000)
#blackjack.printPolicy(policy)
#value_evaluate=evaluate(1000000)
#print(value_evaluate)

#Double Q-learning with eps=0.01 and alpha=0.001 for 1000000 episodes.
#learn(0.001, 0.01, 1000000)
#blackjack.printPolicy(policy)
#value_evaluate=evaluate(1000000)
#blackjack.printPolicyToFile(policy)
#print(value_evaluate)

#part 3

learn(0.001, 0.995, 1000000)
blackjack.printPolicy(policy)
value_evaluate = evaluate(1000000)
#blackjack.printPolicyToFile(policy)
print(value_evaluate)
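evaluate is not shown above. A sketch consistent with how its result is printed: play the learned policy greedily for the given number of episodes and return the average return.

import blackjack

def evaluate(numEvalEpisodes):
    total = 0.0
    for _ in range(numEvalEpisodes):
        s = blackjack.init()
        while s != -1:
            r, s = blackjack.sample(s, policy(s))
            total += r
    return total / numEvalEpisodes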
Example #11
        #print R

        if Sprime == -1:
            Q[S][A] = Q[S][A] + alpha * (R - Q[S][A])
        else:
            Q[S][A] = Q[S][A] + alpha * (R + gamma *
                                         (max(Q[Sprime][0], Q[Sprime][1])) -
                                         Q[S][A])

        S = Sprime

    if episodeNum == dropEpsilonEpisode:
        #print "=============================END EXPLORING PHASE============================="
        epsilon = 0

    if episodeNum == dropAlpha:
        #print "=============================END LEARNING PHASE============================="
        alpha = 0

    #if episodeNum % 10000 == 0:
    #print "Current Avg: " + str(returnSum / (episodeNum+1)) + " Ep: " + str(episodeNum)

    returnSum = returnSum + G

blackjack.printPolicy(showPolicy)

print ""
print "Avg Return:" + str(returnSum / numEpisodes)

printSettings()
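showPolicy and printSettings are defined elsewhere in that script; rough sketches (illustrative only), using the same hyperparameter names that appear above:

def showPolicy(s):
    # greedy action under the learned Q values
    return 0 if Q[s][0] >= Q[s][1] else 1

def printSettings():
    print("alpha=" + str(alpha) + " gamma=" + str(gamma) + " epsilon=" + str(epsilon))
    print("dropEpsilonEpisode=" + str(dropEpsilonEpisode) + " dropAlpha=" + str(dropAlpha))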
Example #12
alpha = 0.001
for episodeNum in range(numEpisodes):
	random.seed(episodeNum)
	# Cumulative reward
	G = 0
	# Init the game of blackjack and get the initial state
	s = blackjack.init()
	#Consider each step of the episode
	while s!=-1: #-1 is terminal
		# Take epsilon greedy action at each step of episode
		a = getEpsilonGreedyAction(Q, s, epsilon)
		r,sp = blackjack.sample(s,a)
		# Update action value function with Q-learning off-policy update
		Q[s, a] = Q[s, a] + alpha * (r + max(Q[sp, :]) - Q[s, a])
		G += r
		s=sp
	
	if not(episodeNum % 10000) :
		print("Episode: ", episodeNum, "Return: ", G)
	returnSum = returnSum + G
	
print("Average return: ", returnSum/numEpisodes)
blackjack.printPolicy(getLearnedPolicy)

# Run learned policy
policySum = 0.0
for policyEpisodeNum in range(numEpisodes):
	policySum += runLearnedPolicy()

print("Average learned policy return: ", policySum/numEpisodes)
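This example assumes three helpers that are not shown: getEpsilonGreedyAction, getLearnedPolicy (passed to printPolicy) and runLearnedPolicy. A sketch of plausible implementations (Q is the module-level table used above):

import numpy as np
import blackjack

def getEpsilonGreedyAction(Q, s, epsilon):
    # explore with probability epsilon, otherwise act greedily
    if np.random.random() < epsilon:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[s, :]))

def getLearnedPolicy(s):
    return int(np.argmax(Q[s, :]))

def runLearnedPolicy():
    # play one episode greedily and return its total (undiscounted) reward
    G = 0
    s = blackjack.init()
    while s != -1:
        r, s = blackjack.sample(s, getLearnedPolicy(s))
        G += r
    return G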
Example #13
#main function
#policy learning
for _ in range(numEpisodes):
	if _%10000==0:
		print "Episode:",_
		print "Average return:",returnSum/(_+1)
	s=bj.init()
	rand=np.random.random()
	if rand<emu:
#		print "rand:",rand,"policy: rand"
		rand_policy(s)
	else:
		exp_sarsa(s)
#		print "rand:",rand,"policy: ExpectedSarsa"

bj.printPolicy(sarsa_policy)
print "Average return:",returnSum/numEpisodes

#deterministic policy
returnSum=0.0
numEpisodes=int(math.pow(10,7))
for _ in range(numEpisodes):
	if _%10000==0:
		print "Episode:",_
		print "Average return:",returnSum/(_+1)
	s=bj.init()
	deter_policy(s)
bj.printPolicy(sarsa_policy)
print "Average return:",returnSum/numEpisodes
print "alpha, emu, epi, episodes:",alpha,emu,epi,numEpisodes
Example #14
        #Expected sarsa calculation (greedy * best_next_state_action) + (explore * (0.5*next_state_action1 + 0.5*next_state_action2))
        target = (greedychance * states[nextstate][highest]) + (
            epsilon *
            (0.5 * states[nextstate][0] + 0.5 * states[nextstate][1]))
        states[currentState][action] = states[currentState][action] + alpha * (
            reward + target - states[currentState][action])

        currentState = nextstate

    #print "Episode: ", episodeNum, "Return: ", G
    returnSum = returnSum + G
    if (episodeNum % 10000 == 0 and episodeNum != 0
            and episodeNum != learningEpisodes):
        if episodeNum < learningEpisodes:
            #Print the running average while learning
            print "Average return ", episodeNum, ": ", returnSum / episodeNum
        else:
            #Print the average of just the episodes used in evaluation mode (non-learning)
            print "Average return ", episodeNum, ": ", returnSum / (
                episodeNum - learningEpisodes)

    if episodeNum == learningEpisodes:
        epsilon = epsilonPi
        alpha = 0
        returnSum = 0.0
        blackjack.printPolicy(returnPolicy)

print "Average return: ", returnSum / evaluationEpisodes
#Print the policy
blackjack.printPolicy(returnPolicy)
Example #15
    blackjack.init()
    state = 0
    return1 = 0
    while (state != -1):
        action = policy(state)
        reward, statep = blackjack.sample(state, action)
        Q[state][action] = Q[state][action] + alpha * (
            reward + expectedValue(statep) - Q[state][action])
        state = statep
        return1 += reward
    returnSum += return1
    if (((episodeNum % 10000) == 0) and (episodeNum != 0)):
        print "Count =", episodeNum, "Average return: ", returnSum / (
            episodeNum)

blackjack.printPolicy(learnedPolicy)
print "Average return: ", float(returnSum) / float(numEpisodes)
returnSumLearned = 0
"""
Now use learned policy and calculate average return
"""
for episodeNum in range(numEpisodes):
    blackjack.init()
    state = 0
    return1 = 0
    while (state != -1):
        action = learnedPolicy(state)
        reward, statep = blackjack.sample(state, action)
        state = statep + 0
        return1 += reward
    returnSumLearned += return1