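"""Deep Q-network trainer for a 2048-style tile game.

Trains an online Q-network against a periodically synchronized Q-hat target
network (the Double-DQN setup), using an experience-replay buffer that is
persisted to disk between runs.
"""
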
import os
import gc
import time
import itertools
import random
import copy
import shutil
import traceback
from collections import deque
import dill
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from utils import Utils, RingBuf
from gameboard import GameBoard, GameStates, GameActions
from gamemodel import GameModel
class GameTrainer:
    def __init__(self, board_size, save_model=True, model_dir=None, debug=True, learning_rate=0.01, max_experience_history=600000):
        self.save_model = save_model
        self.model_dir = model_dir
        self.experience_history_path = os.path.join(self.model_dir, "exp_history.p")
        self.max_experience_history = max_experience_history
        self.debug = debug
        self.learning_rate = learning_rate
        # Online Q-network (trained every step) and Q-hat target network (weights copied over periodically)
        self.q_network = GameModel(board_size, model_name='q_network', model_dir=self.model_dir, learning_rate=learning_rate)
        print(self.q_network.model.summary())
        self.q_hat = GameModel(board_size, model_name='q_hat', model_dir=self.model_dir, learning_rate=learning_rate)
    def calc_reward(self, oldboard, newboard):
        # Rewards are normalized so that the max tile value can be changed without having to retrain
        if newboard.game_state == GameStates.LOSE: return -1
        if newboard.game_state == GameStates.WIN: return 1
        # Subtract the expected score from random new-tile placement: a 2 or a 4 is equally likely, so the mean is 3
        tile_placement_mean_score = np.mean(oldboard.bonus_mask * 3)
        reward = np.clip(2 * (newboard.score - oldboard.score - tile_placement_mean_score) / oldboard.max_tile, -1, 1)
        return reward
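    # Worked example of the shaped reward above, assuming max_tile == 2048 and a
    # mean tile-placement bonus of 3 as noted in the comment: merging two 2-tiles
    # raises the score by 4, so reward = 2 * (4 - 3) / 2048 ≈ 0.001. Larger merges
    # scale proportionally until the reward saturates at the [-1, 1] clip.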
    def exec_action(self, gameboard, gameaction):
        # Apply the action to the board and return the prior board state along with the reward earned
        oldboard = copy.deepcopy(gameboard)
        gameboard.make_move(gameaction)
        return (oldboard, self.calc_reward(oldboard, gameboard))
    def preprocess_state(self, gameboard):
        # Scale tile values into [0, 1] on a log2 scale; empty cells (0) are clipped to 1 so they map to 0
        return np.log2(np.clip(gameboard.board, 1, gameboard.max_tile)) / np.log2(gameboard.max_tile)
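    # For example, with max_tile == 2048 (log2 = 11): an empty cell maps to 0.0,
    # a 2-tile to 1/11 ≈ 0.09, a 64-tile to 6/11 ≈ 0.55, and a 2048-tile to 1.0.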
    def get_action_probabilities(self, gameboard):
        # Flatten the Q-network output into a 1-D array of per-action scores
        return np.ravel(self.q_network(self.preprocess_state(gameboard)))
    def select_action(self, gameboard, epsilon):
        if len(gameboard.action_set) <= 0: return None
        # Epsilon-greedy: return a random action with probability epsilon, otherwise the Q-network's recommendation
        if np.random.rand() < epsilon: return random.choice(list(gameboard.action_set))
        # Return the highest-scoring action, as predicted by the Q-network, that is actually legal on the board
        action_probs = self.get_action_probabilities(gameboard)
        best_actions = [GameActions(i) for i in np.argsort(action_probs)[::-1]]
        return next(a for a in best_actions if a in gameboard.action_set)
    def calculate_y_target(self, newboards, actions_oh, rewards, gamestates, gamma):
        # Choose the best action for each future board state with the online Q-network, then evaluate
        # that action with the Q-hat target network (the Double-DQN approach, which reduces value
        # overestimation and speeds up training)
        newboards_arr = np.array(newboards)
        best_actions = np.argmax(self.q_network(newboards_arr), axis=1)
        q_hat_output = self.q_hat(newboards_arr, Utils.one_hot(best_actions, len(GameActions)))
        # The one-hot mask zeroes out all but the selected action, so summing extracts its Q-value
        # (np.max would wrongly return 0 whenever the selected Q-value is negative)
        q_hat_pred = np.sum(q_hat_output, axis=1)
        # Future rewards are discounted by gamma and only counted for non-terminal states
        q_values = q_hat_pred * np.array([0 if s != GameStates.IN_PROGRESS.value else gamma for s in gamestates])
        total_reward = np.array(rewards) + q_values
        return actions_oh * total_reward.reshape((-1, 1))
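    # In equation form, the Double-DQN target computed above is:
    #     y_i = r_i                                                 if s'_i is terminal
    #     y_i = r_i + gamma * Q_hat(s'_i, argmax_a Q(s'_i, a))      otherwise
    # and y_target is that scalar scattered into the one-hot slot of the action taken.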
    def save_experience_history(self, D):
        if os.path.exists(self.experience_history_path): os.remove(self.experience_history_path)
        saved = False
        f_hist = None
        while not saved:
            try:
                f_hist = open(self.experience_history_path, "wb")
                dill.dump(D, f_hist)
                saved = True
                print("Saved gameplay experience to " + self.experience_history_path)
            except Exception as e:
                traceback.print_exc()
                print(e)
                print("WARNING: failed to save experience replay history. Will try again in 5 seconds...")
                time.sleep(5)
            finally:
                if f_hist is not None: f_hist.close()
        if os.path.getsize(self.experience_history_path) > 0: shutil.copy2(self.experience_history_path, self.experience_history_path.replace('.p', '_BACKUP.p'))
    def restore_experience_history(self):
        if not (os.path.exists(self.experience_history_path) and os.path.getsize(self.experience_history_path) > 0):
            return RingBuf(self.max_experience_history)
        with open(self.experience_history_path, "rb") as f_hist:
            D = dill.load(f_hist)
        if isinstance(D, RingBuf) and len(D) > 0:
            print("Restored gameplay experience from " + self.experience_history_path)
            return D
        return RingBuf(self.max_experience_history)
    def train_model(self, episodes=10, max_tile=2048, max_game_history=500, max_epsilon=1.0, min_epsilon=0.1, mini_batch_size=32, gamma=0.99, update_qhat_weights_steps=10000):
        # Training variables
        D = self.restore_experience_history()  # experience replay queue
        gamehistory = deque(maxlen=max_game_history)  # history of completed games
        epsilon = max_epsilon  # probability of selecting a random action, annealed from max_epsilon to min_epsilon over time
        update_frequency = 4  # number of actions selected between gradient descent updates of the Q-network
        globalstep = 0
        q_hat_group = 1
        f_log = None
        if self.debug:
            f_log = open(os.path.join(self.model_dir, "training_log.csv"), "w")
            f_log.write("Q_hat_group,Avg_Pred,Avg_Target,Avg_Diff,Won_Lost_Games,My_Loss,Actual_Loss\n")
        # Loop over the requested number of games (episodes)
        loss = 0
        for episode in range(episodes):
            # Start a new game
            gameboard = GameBoard(self.q_network.board_size, max_tile=max_tile)
            # Play the game to completion
            stepcount = 0
            while gameboard.game_state == GameStates.IN_PROGRESS:
                stepcount += 1
                globalstep += 1
                # Select an action to perform: a random one with probability epsilon, otherwise the
                # highest-scoring action from the Q-network
                action = self.select_action(gameboard, epsilon)
                oldboard, reward = self.exec_action(gameboard, action)
                # Append the (preprocessed) original board, selected action, reward, new board and game state
                # to the experience replay history; the RingBuf caps its size at max_experience_history by
                # overwriting the oldest entries
                experience = (self.preprocess_state(oldboard), action.value, reward, self.preprocess_state(gameboard), gameboard.game_state.value)
                D.append(experience)
                # Perform a gradient descent step on the Q-network every update_frequency actions,
                # but only once the replay buffer has filled to max_experience_history
                if len(D) >= max(mini_batch_size, self.max_experience_history) and globalstep % update_frequency == 0:
                    # Randomly sample from the experience history and unpack into separate arrays
                    batch = [D[i] for i in np.random.randint(0, len(D), mini_batch_size)]
                    oldboards, actions, rewards, newboards, gamestates = [list(k) for k in zip(*batch)]
                    # One-hot encode the actions for each of these boards, as this forms the basis of the
                    # loss calculation (the network output is masked to the action actually taken)
                    actions_one_hot = Utils.one_hot(actions, len(GameActions))
                    # Compute the target network output using the Q-hat network, actions and rewards for each
                    # sampled history item
                    y_target = self.calculate_y_target(newboards, actions_one_hot, rewards, gamestates, gamma)
                    # Perform a single gradient descent update step on the Q-network
                    X = [np.array(oldboards).reshape((-1, self.q_network.board_size, self.q_network.board_size, 1)), actions_one_hot]
                    write_log_entry = (globalstep % (update_frequency * 10) == 0) and f_log is not None
                    if write_log_entry:
                        y_pred = self.q_network(oldboards, actions_one_hot)
                        avg_pred = np.mean(np.abs(np.sum(y_pred, axis=1)))
                        avg_target = np.mean(np.abs(np.sum(y_target, axis=1)))
                        diff = np.sum(y_target - y_pred, axis=1)
                        avg_diff = np.mean(np.abs(diff))
                        my_loss = np.mean(np.square(diff))
                        won_lost_games = np.sum(np.array(gamestates) < GameStates.IN_PROGRESS.value)
                    curloss = self.q_network.model.train_on_batch(x=X, y=y_target)
                    loss += curloss
                    if write_log_entry:
                        f_log.write("{:d},{:.7f},{:.7f},{:.7f},{:d},{:.7f},{:.7f}\n".format(q_hat_group, avg_pred, avg_target, avg_diff, won_lost_games, my_loss, curloss))
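                    # The masked loss being minimized here is effectively
                    #     L = mean_i (y_i - Q(s_i, a_i))^2
                    # since both y_target and the masked network output are zero outside the
                    # one-hot slot of the action taken.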
                # Every update_qhat_weights_steps steps, copy the network weights from the Q-network to the
                # Q-hat target network (keeping the target fixed between copies is required for convergence)
                if globalstep % update_qhat_weights_steps == 0:
                    def test_weight_update(testboard):
                        # Debug helper: compare the two networks' outputs on the same board
                        board_processed = self.preprocess_state(testboard)
                        q_res = np.ravel(self.q_network(board_processed))
                        q_hat_res = np.ravel(self.q_hat(board_processed))
                        print("\nQ-network result on testboard: {0}".format(q_res))
                        print("Q-hat result on testboard: {0}".format(q_hat_res))
                        print("Difference: {0}\n".format(q_res - q_hat_res))
                    test_weight_update(oldboard)
                    self.q_network.copy_weights_to(self.q_hat)
                    print("Weights copied to Q-hat network")
                    self.q_network.save_to_file()
                    test_weight_update(oldboard)
                    # Decay the learning rate as training progresses
                    lr_new = self.learning_rate / (1 + episode / (episodes / 2))
                    self.q_network.compile(learning_rate=lr_new)
                    print("Q-network learning rate updated to {:.6f}".format(lr_new))
                    q_hat_group += 1
            # Anneal epsilon linearly from max_epsilon down to min_epsilon over the course of training
            epsilon = max(min_epsilon, min_epsilon + ((max_epsilon - min_epsilon) * (episodes - episode) / episodes))
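            # With the defaults (max_epsilon=1.0, min_epsilon=0.1), this schedule gives
            # epsilon = 1.0 at episode 0, ~0.55 halfway through, and ~0.1 on the final episode.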
            # Append metrics for each completed game to the game history list
            # (the deque's maxlen automatically discards the oldest entries)
            gameresult = (gameboard.game_state.value, gameboard.score, stepcount, gameboard.largest_tile_placed)
            gamehistory.append(gameresult)
            print('result: {0}, score: {1}, steps: {2}, max tile: {3}'.format(GameStates(gameresult[0]).name, gameresult[1], gameresult[2], gameresult[3]))
            # Print a progress summary every 20 episodes
            if (episode + 1) % 20 == 0:
                games_to_retrieve = min(len(gamehistory), 100)
                last_x_games = list(itertools.islice(gamehistory, len(gamehistory) - games_to_retrieve, None))
                last_x_results = list(zip(*last_x_games))[0]
                games_won = np.sum([1 if r == GameStates.WIN.value else 0 for r in last_x_results])
                print("\nEpisode {0}/{1}".format(episode + 1, episodes))
                print("History queue {0}/{1}".format(len(D), self.max_experience_history))
                print("Game win % (for last {:d} games): {:.1f}%".format(games_to_retrieve, 100. * (games_won / games_to_retrieve)))
                print("Epsilon = {:.3f}".format(epsilon))
                print("Training loss: {:.5f}\n".format(loss))
                loss = 0
            # Save the experience history to disk periodically
            if self.save_model and (episode + 1) % 2000 == 0:
                self.save_experience_history(D)
                # Output garbage-collection diagnostics
                print("GC.isenabled() = {0}".format(gc.isenabled()))
                print("Garbage:", gc.garbage)
                print("Counts:", gc.get_count())
                print("globals() = ", sorted(list(globals().keys())))
        # Perform one final save of the model weights and experience history for the next run
        if self.save_model:
            self.q_network.save_to_file()
            self.save_experience_history(D)
        if f_log: f_log.close()
        return gamehistory
    @staticmethod
    def display_training_history(gamehistory):
        results, scores, stepcounts, max_tiles = list(zip(*gamehistory))
        # Cumulative win percentage after each game
        gamenumbers = np.arange(1, len(results) + 1)
        resultpercentages = np.cumsum([1 if r == GameStates.WIN.value else 0 for r in results]) / gamenumbers
        print("Final win %: {:.2f}".format(resultpercentages[-1] * 100.))
        fig, ax_arr = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=(5, 10))
        ax_arr[0].plot(gamenumbers, resultpercentages)
        ax_arr[0].set_ylim(bottom=0)
        ax_arr[0].set_title('Win % (cumulative) = {:.2f}%'.format(resultpercentages[-1] * 100.))
        ax_arr[1].plot(gamenumbers, scores)
        ax_arr[1].set_title('Score')
        ax_arr[2].plot(gamenumbers, stepcounts)
        ax_arr[2].set_title('Actions per Game')
        ax_arr[3].plot(gamenumbers, max_tiles)
        ax_arr[3].set_title('Max Tile Placed')
        ax_arr[3].set_xlabel("Game #")
        ax_arr[3].xaxis.set_major_locator(MaxNLocator(integer=True))
        fig.canvas.manager.set_window_title("2048 Game Training Results")
        try:
            # Maximizing the window this way only works with the TkAgg backend
            plt.get_current_fig_manager().window.state('zoomed')
        except AttributeError:
            pass
        plt.show()
        return gamehistory
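

# A minimal usage sketch (not part of the original module): the constructor and
# train_model signatures are taken from the class above, but the board size,
# model directory and episode count here are illustrative assumptions.
if __name__ == "__main__":
    trainer = GameTrainer(board_size=4, save_model=True, model_dir="./model/", debug=True)
    history = trainer.train_model(episodes=10000, max_tile=2048)
    GameTrainer.display_training_history(history)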