# q-learn-new-disc.py
# Q-learning agent for SwingyMonkey with a discretized state space.
from __future__ import division

import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt

from SwingyMonkey import SwingyMonkey

class Learner:
    '''Tabular Q-learner for SwingyMonkey with an epsilon-greedy policy.'''

    def __init__(self, epsilon_factor=10.0):
        self.last_state = None
        self.last_action = None
        self.last_reward = None
        # Q-values and visit counts, keyed by hashed state.
        # State indexing in order: tree top, tree dist, monkey top, monkey vel.
        self.Q = {}
        self.a = {}
        # Discount factor used in the Q-learning update.
        self.discount = 1.0
        # Controls how quickly the exploration rate epsilon decays.
        self.epsilon_factor = epsilon_factor

    def reset(self):
        '''Clear the per-epoch memory; called between games.'''
        self.last_state = None
        self.last_action = None
        self.last_reward = None

    def state_tupler(self, state):
        '''
        Input: the full state dict the game provides.
        Return: a condensed (binned) representation of the state:
            bin_state = [tree top bin, tree dist bin, monkey top bin, monkey vel bin]
        '''
        bin_state = [0, 0, 0, 0]
        # Tree top ranges over [200, 400]; 25-pixel bins.
        bin_state[0] = (state['tree']['top'] - 200) // 25
        # Tree distance: one bin for negative distances (tree already
        # reached), four 75-pixel bins, and one far bin.
        tree_dist = state['tree']['dist']
        if tree_dist < 0:
            bin_state[1] = 0
        elif tree_dist < 300:
            bin_state[1] = (tree_dist // 75) + 1
        else:
            bin_state[1] = 5
        # Monkey top: one low bin, nine 25-pixel bins, one high bin.
        monkey_top = state['monkey']['top']
        if monkey_top < 125:
            bin_state[2] = 0
        elif monkey_top < 350:
            bin_state[2] = ((monkey_top - 125) // 25) + 1
        else:
            bin_state[2] = 10
        # Monkey velocity: three coarse bins (negative, small positive,
        # large positive).
        monkey_vel = state['monkey']['vel']
        if monkey_vel < 0:
            bin_state[3] = 0
        elif monkey_vel < 5:
            bin_state[3] = 1
        else:
            bin_state[3] = 2
        return bin_state
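
    # For intuition, a hypothetical example (numbers chosen for
    # illustration): a state with tree top 300, tree dist 160, monkey top
    # 230, and monkey vel -10 bins to
    # [(300-200)//25, 160//75 + 1, (230-125)//25 + 1, 0] = [4, 3, 5, 0].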

    def state_hash(self, state):
        '''Pack the four bins into a single int key for the Q and a dicts.'''
        a, b, c, d = self.state_tupler(state)
        return int((a * 1000000) + (b * 10000) + (c * 100) + d)
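
    # Continuing the hypothetical example above, bins [4, 3, 5, 0] hash to
    # 4*1000000 + 3*10000 + 5*100 + 0 = 4030500. Two decimal digits per bin
    # are enough because every bin index here is at most 10.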

    def update_Q(self, new_state):
        '''Apply one tabular Q-learning backup for the last (s, a, r, s') step.'''
        s = self.state_hash(self.last_state)
        s_prime = self.state_hash(new_state)
        # Lazily initialize visit counts and Q-values for unseen states.
        if s not in self.a:
            self.a[s] = np.array([1.0, 1.0])
        # Learning rate decays with the visit count of (s, a).
        alpha = 1.0 / self.a[s][self.last_action]
        if s not in self.Q:
            self.Q[s] = np.array([0.0, 0.0])
        old_Q = self.Q[s][self.last_action]
        if s_prime not in self.Q:
            self.Q[s_prime] = np.array([0.0, 0.0])
        max_Q = max(self.Q[s_prime])
        self.Q[s][self.last_action] = (old_Q +
            alpha * (self.last_reward + self.discount * max_Q - old_Q))
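
    # The assignment above is the standard tabular Q-learning update,
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
    # with alpha = 1 / n(s, a), where n(s, a) is the visit count kept in
    # self.a, and gamma = self.discount.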

    def update_a(self):
        '''Increment the visit count for the last (state, action) pair.'''
        s = self.state_hash(self.last_state)
        self.a[s][self.last_action] += 1.0

    def optimal_action(self, state):
        '''Return the greedy action (0 = swing, 1 = jump) for this state.'''
        s = self.state_hash(state)
        return int(np.argmax(self.Q[s]))

    def action_callback(self, state):
        '''Learn from the last transition, then choose the next action.
        Return 0 to swing and 1 to jump.'''
        # Observed ranges (from watching the game):
        #   tree dist: 400 down through negative values
        #   tree top: 400 to 200; tree bot: 200 to 0 (gap is always 200 pixels)
        #   monkey: 56 pixels tall, top between 450 and -50
        #   monkey vel: between -50 and 40
        # Exploration rate decays with the epoch; ii is the module-level
        # epoch counter set by the training loop below.
        epsilon = 1.0 / ((ii + 1.0) * self.epsilon_factor)
        # First turn of a game -> no transition to learn from yet.
        if self.last_action is None:
            # Pick randomly on the first epoch or in a never-seen state.
            if ii == 0 or self.state_hash(state) not in self.Q:
                # Cast to int: a numpy bool used as an array index can
                # trigger boolean-mask indexing in modern numpy.
                new_action = int(npr.rand() < 0.5)
            # Otherwise use the model we already have.
            else:
                new_action = self.optimal_action(state)
        # Update Q and the visit counts, then act greedily.
        else:
            self.update_Q(state)
            self.update_a()
            new_action = self.optimal_action(state)
        # Explore (flip to the non-greedy action) with probability epsilon.
        if npr.rand() < epsilon:
            new_action = int(not new_action)
        self.last_action = new_action
        self.last_state = state
        return new_action
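
    # For example, with the default epsilon_factor of 10.0, epsilon is 0.1
    # in epoch 0, 0.05 in epoch 1, and 0.01 in epoch 9, so exploration
    # fades quickly as training progresses.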

    def reward_callback(self, reward):
        '''This gets called so you can see what reward you get.'''
        self.last_reward = reward


iters = 150
learner = Learner()
scores = []
for ii in range(iters):
    # Make a new monkey object.
    swing = SwingyMonkey(sound=False,             # Don't play sounds.
                         text="Epoch %d" % (ii),  # Display the epoch on screen.
                         tick_length=1,           # Make game ticks super fast.
                         action_callback=learner.action_callback,
                         reward_callback=learner.reward_callback)
    # Loop until the monkey hits something.
    while swing.game_loop():
        pass
    scores.append(swing.get_score())
    # Reset the per-epoch state of the learner (the Q-table persists).
    learner.reset()

domain = np.arange(1, iters + 1)
plt.plot(domain, scores)
plt.title("Score per epoch (discount = " + str(learner.discount) + ")")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.savefig("scores.png")
plt.show()
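
# Summarize converged performance: average the final 50 epochs, after
# epsilon has largely decayed.
print("Mean score over final 50 epochs: %.2f" % np.mean(scores[-50:]))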