forked from rkj777/BlackjackPolicyLearner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
expectedSarsa.py
85 lines (71 loc) · 2.3 KB
/
expectedSarsa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import blackjack
import numpy
from pylab import *
# ---- Hyperparameters and module-level learning state ----
# Number of episodes for both the learning and the evaluation phase below.
numEpisodes = 1000000
# Action-value table: 182 rows (game states plus one terminal slot), 2 actions.
# Tiny random init breaks argmax ties. Assumes blackjack.sample uses integer
# states with -1 signalling terminal (so Q[-1] is the terminal row) -- the
# environment module is not visible here; TODO confirm against blackjack.py.
Q = 0.00001*rand(182,2)
#adding values for the terminal state
# Terminal state has value 0 for both actions, so backups through it bootstrap on 0.
Q[-1] = [0,0]
#values to use
# Exploration rate of the behaviour policy (0 => behave greedily w.r.t. Q).
epsilonMu = 0
# Exploration rate of the target policy used inside the expected-value backup.
epsilonPi = 0.9
# Step size (learning rate) for the Expected Sarsa update.
alpha = 0.4
# Running sum of episode returns, used for the average-return printouts.
returnSum = 0.0
blackjack.init()
#Returns the probability of taking `action` in `nextState` under the
#epsilon-greedy target policy with exploration rate epsilonPi.
def policy(action,epsilonPi,nextState):
    """Probability that the epsilon-greedy policy over Q[nextState] picks `action`.

    With two actions, the greedy action is chosen with probability
    (1 - epsilonPi) + epsilonPi/2 and the other with the remainder.
    """
    bestAction = numpy.argmax(Q[nextState])
    pBest = (1-epsilonPi) + (epsilonPi)/2
    return pBest if action == bestAction else 1- pBest
#Returns the expected action value of `nextState` under the epsilon-greedy
#target policy: sum over both actions of pi(a|nextState) * Q[nextState, a].
def policySum(nextState,epsilonPi):
    """Expected value of Q[nextState] under the epsilon-greedy target policy.

    This is the Expected Sarsa bootstrap term. Renamed the accumulator from
    `sum` to avoid shadowing the builtin of the same name.
    """
    expectedQ = 0
    for a in range(2):
        expectedQ += policy(a,epsilonPi,nextState) * Q[nextState,a]
    return expectedQ
#Callback handed to blackjack.printPolicy: maps a state index to the greedy
#action under the learned action-value table.
def policyPrint(state):
    """Return the greedy (argmax) action for `state` under Q."""
    return numpy.argmax(Q[state])
for episodeNum in range(numEpisodes):
#blackjack.init()
G = 0
state = 0
while state != -1:
#take action according the the beahaviour policy
if rand() <= epsilonMu:
action = randint(2)
else:
action = argmax(Q[state])
#Do that action
result = blackjack.sample(state,action)
reward = result[0]
newState = result[1]
#Expected Sarsa
Q[state, action] = Q[state, action] + alpha *(reward + policySum(newState,epsilonPi) - Q[state, action])
#update values
G+= reward
state = newState
if episodeNum % 10000 == 0 and episodeNum != 0:
print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum/(episodeNum)
returnSum = returnSum + G
print "Average return: ", returnSum/numEpisodes
print "Running the deterministic policy"
returnSum = 0.0
for episodeNum in range(numEpisodes):
G = 0
state = 0
while state != -1:
action = argmax(Q[state])
result = blackjack.sample(state,action)
reward = result[0]
newState = result[1]
#update values
G+= reward
state = newState
if episodeNum % 10000 == 0 and episodeNum != 0:
print "Episode: ", episodeNum, "Return: ", G, "Average return: ", returnSum/(episodeNum)
returnSum = returnSum + G
print "Average return: ", returnSum/(numEpisodes)
blackjack.printPolicy(policyPrint)