/
train.py
158 lines (134 loc) · 5.31 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import deque
import random
import json
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import tensorflow as tf
import numpy as np
import skimage
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.exposure import rescale_intensity
from bot_model import tetai_model as model
from play import Play
# ---------------------------------------------------------------------------
# Run identity. Neither value is read by the code visible in this file;
# presumably they are labels for logs/experiments -- confirm before removing.
# ---------------------------------------------------------------------------
GAME = "Tetris"
CONFIG = "nothreshold"

# ---------------------------------------------------------------------------
# Q-learning hyperparameters.
# ---------------------------------------------------------------------------
ACTIONS = 4                   # number of discrete action indices (0 .. ACTIONS - 1)
GAMMA = 0.95                  # discount applied to the next state's best Q-value
OBSERVATION = 10 * 1000       # timesteps collected before any gradient update
EXPLORE = 3 * 1000 * 1000     # timesteps over which epsilon is annealed
FINAL_EPSILON = 1e-4          # floor of the exploration rate
INITIAL_EPSILON = 0.1         # exploration rate at the start of training
REPLAY_MEMORY = 50 * 1000     # maximum number of transitions kept for replay
BATCH = 32                    # transitions sampled per gradient update
FRAME_PER_ACTION = 1          # a new action is chosen every this-many frames
LEARNING_RATE = 0.0001        # step size handed to the Adam optimizer
class Train:
    """Deep Q-learning driver for the Tetris bot.

    Owns the Q-network (built by ``bot_model.tetai_model``) and an Adam
    optimizer. Calling the instance starts an endless play loop that feeds
    4-frame stacks of preprocessed 80x80 screenshots to the network, stores
    transitions in a bounded replay memory and, once the observation phase
    is over, fits the network on random minibatches.
    """

    def __init__(self):
        """Build the Q-network and its optimizer."""
        self.model = model()
        # `learning_rate` is the supported keyword; the legacy `lr` alias is
        # deprecated and rejected by newer Keras optimizers.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    @staticmethod
    def _preprocess_image(frame):
        """Convert a colour game frame to an 80x80 grayscale array in [0, 1].

        Args:
            frame: RGB image as returned by ``Play.frame_step``.

        Returns:
            2-D float array of shape (80, 80).
        """
        frame = rgb2gray(frame)
        frame = resize(frame, (80, 80))
        # Stretch contrast to the full 0..255 range, then normalise to 0..1.
        frame = rescale_intensity(frame, out_range=(0, 255))
        frame = frame / 255.0
        return frame

    def loss(self, y, y_hat):
        """Return the mean squared error between targets `y` and predictions `y_hat`.

        The Keras function averages over the last axis only, so the result
        has one value per sample rather than a single scalar.
        """
        return tf.keras.losses.mean_squared_error(y, y_hat)

    @tf.function
    def _train_step(self, inputs, targets):
        """Run the network forward on `inputs` and return the loss against `targets`.

        Intended to be called inside a ``tf.GradientTape`` context so the
        caller can differentiate the returned loss.
        """
        y_hat = self.model(inputs)
        loss = self.loss(targets, y_hat)
        return loss

    def _select_action(self, state, epsilon):
        """Choose an action index epsilon-greedily for the stacked-frame `state`."""
        if random.random() <= epsilon:
            print("----------Random Action----------")
            return random.randrange(ACTIONS)
        q_values = self.model(state, training=False)
        return int(np.argmax(q_values))

    def _replay_update(self, memory):
        """Fit the network on one random minibatch drawn from `memory`.

        Args:
            memory: sequence of ``(state, action, reward, next_state, done)``
                transitions; must hold at least ``BATCH`` entries.

        Returns:
            Tuple ``(q_next, mean_loss)``: the network's Q-values for the
            sampled next states and the batch-averaged loss as a float.
        """
        batch = random.sample(memory, BATCH)
        state_t, action_t, reward_t, state_t1, terminal_t = zip(*batch)
        state_t = np.concatenate(state_t)
        state_t1 = np.concatenate(state_t1)
        action_t = np.asarray(action_t, dtype=np.int64)
        reward_t = np.asarray(reward_t, dtype=np.float32)
        # Cast to bool before negating so integer flags are not bit-inverted.
        not_terminal = ~np.asarray(terminal_t, dtype=bool)

        # Start from the network's own predictions so that only the action
        # actually taken contributes an error; zero-initialised targets would
        # drag the Q-values of every other action toward 0.
        targets = np.array(self.model(state_t, training=False))
        q_next = np.asarray(self.model(state_t1, training=False))
        targets[np.arange(BATCH), action_t] = (
            reward_t + GAMMA * q_next.max(axis=1) * not_terminal
        )

        with tf.GradientTape() as tape:
            per_sample_loss = self._train_step(state_t, targets)
        gradients = tape.gradient(per_sample_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables)
        )
        print("APPLIED GRADIENT")
        return q_next, float(tf.reduce_mean(per_sample_loss))

    def __call__(self, mode="Train"):
        """Play the game forever, training the network unless `mode` is 'Run'.

        Args:
            mode: ``'Run'`` loads weights from ``model.h5`` and plays with the
                final epsilon without training; any other value trains from
                the current weights.

        The loop has no exit condition; weights are written to ``model.h5``
        every 1000 timesteps.
        """
        game_state = Play()
        # A bounded deque evicts the oldest transition automatically.
        memory = deque(maxlen=REPLAY_MEMORY)

        do_nothing = 3
        frame, _, _ = game_state.frame_step(do_nothing)
        frame = self._preprocess_image(frame)
        stack_frame = np.stack((frame, frame, frame, frame), axis=2)
        # Add the leading batch axis the network expects: 1 x 80 x 80 x 4.
        stack_frame = stack_frame.reshape(
            1, stack_frame.shape[0], stack_frame.shape[1], stack_frame.shape[2]
        )

        if mode == 'Run':
            # An unreachable observe horizon disables training and annealing.
            OBSERVE = 999999999
            epsilon = FINAL_EPSILON
            print("Now we load weight")
            self.model.load_weights("model.h5")
            print("Weight load successfully")
        else:
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON

        t = 0
        while True:
            loss = 0
            Q_sa = 0
            # Frames on which no decision is made fall back to the no-op
            # action instead of an out-of-range index.
            action_index = do_nothing

            if t % FRAME_PER_ACTION == 0:
                # Record the action really taken, greedy or random, so the
                # replay memory credits the right Q-value.
                action_index = self._select_action(stack_frame, epsilon)

            # Anneal exploration linearly once the observation phase is over.
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            frame1_colored, reward, done = game_state.frame_step(action_index)
            if done:
                game_state.restart()

            frame1 = self._preprocess_image(frame1_colored)
            frame1 = frame1.reshape(1, frame1.shape[0], frame1.shape[1], 1)
            # Newest frame goes first; the oldest of the previous four drops out.
            stack_frame1 = np.append(frame1, stack_frame[:, :, :, :3], axis=3)

            memory.append((stack_frame, action_index, reward, stack_frame1, done))

            if t > OBSERVE and len(memory) >= BATCH:
                Q_sa, loss = self._replay_update(memory)

            stack_frame = stack_frame1
            t = t + 1

            # Save progress every 1000 iterations.
            # NOTE(review): the original carried an unexplained "BUG" marker
            # here and a comment claiming 10000; the interval is kept at 1000.
            if t % 1000 == 0:
                print("Now we save model")
                self.model.save_weights("model.h5", overwrite=True)

            if t <= OBSERVE:
                state = "observe"
            elif t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"

            print('Done = ', done)
            print("TIMESTEP", t, "/ STATE", state,
                  "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", reward,
                  "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss)

        # Unreachable while the loop above has no exit; kept for parity with
        # the original script.
        print("Episode finished!")
        print("************************")
if __name__ == "__main__":
    # Script entry point: build the trainer and start its endless loop in the
    # default training mode.
    trainer = Train()
    trainer(mode="Train")