# Shared imports assumed by the Chainer 1.x-era classes in this file;
# DN_out and convlstm_link are project-local modules used further below,
# and `args` in DN_class comes from the surrounding script's argparse setup.
import copy
import math

import numpy as np
from chainer import cuda, FunctionSet, Variable, optimizers, serializers
import chainer.functions as F

import convlstm_link  # project-local ConvLSTM link
import DN_out         # project-local dueling-output link


# ---------------------------------------------------------------------------
# Variant: low-dimensional QNet (single hidden layer over a flat observation).
class QNet:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**4       # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6
    hist_size = 1                     # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(reward[i]) + self.gamma * max_q_dash[i]
            else:
                tmp_ = np.sign(reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # scale inputs into [0.0, 1.0]
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
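# What the TD-error clipping block in QNet.forward computes, value-wise, in
# plain NumPy (the Variable arithmetic above additionally shapes the gradient;
# this sketch only mirrors the forward value). Names are illustrative:
#
#   td = target - q                                       # (batch, n_actions)
#   td_clip = np.where(np.abs(td) <= 1, td, np.sign(td))  # clip error into [-1, 1]
#   loss = np.mean(td_clip ** 2)                          # == np.mean(np.minimum(td**2, 1))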
# ---------------------------------------------------------------------------
# Variant: "DN", a dueling network trained with Double-DQN targets; supports
# resuming a saved model and replay buffer. Breakout action defaults.
class DN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 100         # original: 10**4 (paper: 5x10^4)
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 256), dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions,
                                  nobias=True)
        ).to_gpu()

        if args.resumemodel:
            # load a saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "loaded model from " + args.resumemodel

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load the saved replay buffer, stored in two halves
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading the first half of the D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'], npz_tmp1['D1'], npz_tmp1['D2'],
                      npz_tmp2['D3'], npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded all stored D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals (Double DQN: select the next action with the
        # online network, evaluate it with the target network)
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)              # Q'(s',*)
        tmp = list(tmp.data.get())
        res1 = []
        for i in xrange(num_of_batch):
            res1.append(tmp[i][tmp2[i]])              # Q'(s', argmax_a Q(s',a))

        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # state-value stream
        h5 = F.relu(self.model.l5(h3))  # advantage stream
        h6 = self.model.l6(h4)          # V(s)
        h7 = self.model.l7(h5)          # A(s,a)
        Q = self.model.q_value(h6, h7)  # combined Q-value
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # state-value stream
        h5 = F.relu(self.model_target.l5(h3))  # advantage stream
        h6 = self.model_target.l6(h4)          # V(s)
        h7 = self.model_target.l7(h5)          # A(s,a)
        Q = self.model_target.q_value(h6, h7)  # combined Q-value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
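# The target construction in DN_class.forward above is Double DQN: the online
# network picks the next action, the target network scores it. The same
# computation in plain NumPy, with illustrative names (q_online / q_target
# stand for the two networks' (batch, n_actions) outputs at s'):
#
#   a_star = np.argmax(q_online, axis=1)               # argmax_a Q(s', a)
#   q_eval = q_target[np.arange(len(a_star)), a_star]  # Q'(s', a*)
#   done = episode_end.ravel().astype(np.float32)      # 1.0 where the episode ended
#   target_vals = np.sign(Reward).ravel() + gamma * q_eval * (1.0 - done)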
# ---------------------------------------------------------------------------
# Variant: QNet with unclipped rewards and a shorter warm-up period.
class QNet:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**3       # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6
    hist_size = 1                     # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            # note: rewards enter the target unclipped in this variant
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # scale inputs into [0.0, 1.0]
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
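# Usage sketch (not from the original source): a minimal driver loop showing
# how the QNet pieces above fit together. `env`, `dim`, the state shapes, and
# the epsilon schedule are assumed placeholders; the original agent script that
# owns them is not shown here.
#
#   q_net = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=dim)
#   epsilon = 1.0
#   for time in xrange(10**6):
#       action, q = q_net.e_greedy(state.astype(np.float32), epsilon)
#       state_dash, reward, episode_end = env.step(action)  # hypothetical env API
#       q_net.stock_experience(time, state, action, reward, state_dash, episode_end)
#       q_net.experience_replay(time)   # no-op until initial_exploration steps pass
#       if time % q_net.target_model_update_freq == 0:
#           q_net.target_model_update()
#       epsilon = max(0.1, 1.0 - 0.9 * time / 10**6)  # linear annealing, assumed
#       state = state_dash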
# ---------------------------------------------------------------------------
# Variant: NIPS-2013-style DQN (two conv layers, no target network), Pong defaults.
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**4       # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "CUDA init"
        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals: this variant bootstraps from the online
        # network itself (it keeps no separate target network)
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
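# The replay buffers above store uint8 frame stacks shaped (4, 84, 84). A
# hedged preprocessing sketch in pure NumPy (nearest-neighbour shrink from the
# raw 210x160 ALE screen; the original project's scaler is not shown here and
# may differ):
#
#   def preprocess(rgb_frame, history):                  # rgb_frame: (210, 160, 3) uint8
#       gray = rgb_frame.mean(axis=2).astype(np.uint8)   # luminance, assumed
#       rows = np.linspace(0, gray.shape[0] - 1, 84).astype(int)
#       cols = np.linspace(0, gray.shape[1] - 1, 84).astype(int)
#       small = gray[np.ix_(rows, cols)]                 # nearest-neighbour 84x84
#       history = np.roll(history, -1, axis=0)           # drop the oldest of 4 frames
#       history[-1] = small
#       return history                                   # (4, 84, 84) uint8 state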
# ---------------------------------------------------------------------------
# Variant: ResNet-152-style Q-network trained with Adam. This class defines no
# replay buffer of its own. The conv links are built in a loop but keep the
# flat conv{stage}_{block}_{1,2,3} naming, so the parameter set is identical
# to spelling every link out longhand.
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                          10, 11, 12, 13, 14, 15, 16, 17]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default: full ALE action set

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        w = math.sqrt(2)  # MSRA scaling

        # Bottleneck stages: stage -> (in_ch, mid_ch, out_ch, n_blocks, stride).
        # Block counts (3, 8, 36, 3) follow the ResNet-152 layout.
        stages = {2: (64, 64, 256, 3, 1),
                  3: (256, 128, 512, 8, 2),
                  4: (512, 256, 1024, 36, 2),
                  5: (1024, 512, 2048, 3, 2)}
        links = {'conv1': F.Convolution2D(3, 64, 7, wscale=w, stride=2, pad=3)}
        for stage in sorted(stages):
            in_ch, mid_ch, out_ch, n_blocks, stride = stages[stage]
            # projection shortcut for the first block of each stage
            links['conv%d_1_ex' % stage] = \
                F.Convolution2D(in_ch, out_ch, 1, wscale=w, stride=stride)
            for block in xrange(1, n_blocks + 1):
                b_in = in_ch if block == 1 else out_ch
                b_stride = stride if block == 1 else 1
                links['conv%d_%d_1' % (stage, block)] = \
                    F.Convolution2D(b_in, mid_ch, 1, wscale=w, stride=b_stride)
                links['conv%d_%d_2' % (stage, block)] = \
                    F.Convolution2D(mid_ch, mid_ch, 3, wscale=w, stride=1, pad=1)
                links['conv%d_%d_3' % (stage, block)] = \
                    F.Convolution2D(mid_ch, out_ch, 1, wscale=w, stride=1)
        links['q_value'] = F.Linear(2048, self.num_of_actions,
                                    initialW=np.zeros((self.num_of_actions, 2048),
                                                      dtype=np.float32))
        self.model = FunctionSet(**links).to_gpu()
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((num_of_batch, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def _bottleneck(self, model, h, stage, block):
        # one bottleneck: 1x1 reduce -> 3x3 -> 1x1 expand, plus the shortcut
        # (projection conv{stage}_1_ex for the first block, identity otherwise)
        if block == 1:
            h_rem = getattr(model, 'conv%d_1_ex' % stage)(h)
        else:
            h_rem = h
        h = F.relu(getattr(model, 'conv%d_%d_1' % (stage, block))(h))
        h = F.relu(getattr(model, 'conv%d_%d_2' % (stage, block))(h))
        h = getattr(model, 'conv%d_%d_3' % (stage, block))(h)
        return F.relu(h + h_rem)

    def _trunk(self, model, state):
        h = F.relu(model.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)
        for stage, n_blocks in [(2, 3), (3, 8), (4, 36), (5, 3)]:
            for block in xrange(1, n_blocks + 1):
                h = self._bottleneck(model, h, stage, block)
        h = F.average_pooling_2d(h, 7)
        return model.q_value(h)

    def Q_func(self, state):
        return self._trunk(self.model, state)

    def Q_func_target(self, state):
        return self._trunk(self.model_target, state)

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action)

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
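# Spatial bookkeeping for the trunk above, assuming 224x224 RGB input frames
# (the Linear(2048, ...) head and average_pooling_2d(h, 7) only line up for
# that input size):
#
#   input          3 x 224 x 224
#   conv1 /2      64 x 112 x 112
#   maxpool 3 /2  64 x  56 x  56
#   conv2_x      256 x  56 x  56
#   conv3_x /2   512 x  28 x  28
#   conv4_x /2  1024 x  14 x  14
#   conv5_x /2  2048 x   7 x   7
#   avgpool 7   2048 x   1 x   1   -> q_value: num_of_actions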
# ---------------------------------------------------------------------------
# Variant: ConvLSTM front end in place of the convolution stack, Pong defaults.
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 100         # original: 10**4 (paper: 5x10^4)
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=convlstm_link.CONVLSTM(7056, 7056),
            l4=F.Linear(7056, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def reset_state(self):
        self.model.l1.reset_state()

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    '''
    # earlier, non-recurrent versions of the Q functions, kept for reference:
    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h4 = F.relu(self.model.l4(h1))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h4 = F.relu(self.model_target.l4(h1))
        Q = self.model_target.q_value(h4)
        return Q
    '''

    def Q_func(self, state):
        self.model.l1.reset_state()
        for i in range(4):  # unroll the ConvLSTM over the frame stack
            h1 = F.relu(self.model.l1(state / 254.0))
        h4 = F.relu(self.model.l4(h1))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        self.model_target.l1.reset_state()
        for i in range(4):  # unroll the ConvLSTM over the frame stack
            h1 = F.relu(self.model_target.l1(state / 254.0))
        h4 = F.relu(self.model_target.l4(h1))
        Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
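# Note on the recurrence above: Q_func/Q_func_target reset the ConvLSTM and
# then unroll it four times on the same input, so every call is a
# self-contained burn-in and no recurrent state survives between environment
# steps. A hedged sketch of the alternative design (state persisting across a
# whole episode; reset would have to move out of Q_func for this to work):
#
#   dqn.model.l1.reset_state()                       # once, at episode start
#   for step in xrange(max_steps):
#       action, Q = dqn.e_greedy(state, epsilon)     # recurrent state carries over
#       state, reward, episode_end = env.step(action)  # hypothetical env API
#       if episode_end:
#           break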
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 100         # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "Model Building"
        # Convolutional feature extractor, initialized from pretrained weights below.
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
        )

        # Trainable head on top of the pretrained CNN features.
        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()

        d = 'elite/'
        self.CNN_model.l1.W.data = np.load(d + 'l1_W.npy')
        self.CNN_model.l1.b.data = np.load(d + 'l1_b.npy')
        self.CNN_model.l2.W.data = np.load(d + 'l2_W.npy')
        self.CNN_model.l2.b.data = np.load(d + 'l2_b.npy')
        self.CNN_model.l3.W.data = np.load(d + 'l3_W.npy')
        self.CNN_model.l3.b.data = np.load(d + 'l3_b.npy')

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag, ale_reward]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, lstm_reward,
                        state_dash, episode_end_flag, ale_reward):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        # Returns the CNN feature map (as a CPU array) for the external LSTM predictor.
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # fixed: was self.model.l4, which leaked the online net into the target
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        # Intrinsic reward from the LSTM prediction error; self.lstm_loss is
        # expected to be set by the caller before this is used.
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
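# A hedged usage sketch, not part of the original source: one plausible
# per-step control loop around the class above. `act_in_env` is a hypothetical
# environment hook returning (s', lstm_reward, ale_reward, episode_end), and
# the epsilon schedule is illustrative.
dqn = DQN_class()
epsilon = 1.0
state = np.zeros((4, 84, 84), dtype=np.uint8)
for time in xrange(10**6):
    state_gpu = cuda.to_gpu(np.asarray([state], dtype=np.float32))
    action, q = dqn.e_greedy(state_gpu, epsilon)
    state_dash, lstm_reward, ale_reward, episode_end = act_in_env(action)  # hypothetical
    dqn.stockExperience(time, state, action, lstm_reward,
                        state_dash, episode_end, ale_reward)
    dqn.experienceReplay(time)
    if time % dqn.target_model_update_freq == 0:
        dqn.target_model_update()
    state = state_dash
    epsilon = max(0.1, epsilon - 1e-6)  # linear decay (assumed schedule)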
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**4       # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals. Note: this variant bootstraps from the
        # online network itself; it keeps no separate target network.
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                        # Discount factor
    initial_exploration = 50000         # Initial exploration. original: 5x10^4
    replay_size = 32                    # Replay (batch) size
    target_model_update_freq = 10 ** 4  # Target update frequency. original: 10^4
    data_size = 5 * (10 ** 5)           # Data size of history. original: 10^6
    field_num = 7
    field_size = 17

    def __init__(self, control_size=10, field_num=7, field_size=17):
        self.num_of_actions = control_size
        self.field_size = field_size
        # self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        self.field_num = field_num
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(self.field_num * 4, 16, ksize=5, stride=1, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 24, ksize=4, stride=1, nobias=False, wscale=np.sqrt(2)),
            l3=F.Linear(2400, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32))
        ).to_gpu()
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        # self.optimizer.setup(self.model.collect_parameters())  # older Chainer API
        self.optimizer.setup(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            # Actions are already stored as indices here, so no
            # action_to_index() translation is needed.
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        # td = Variable(target) - Q  # TD error (CPU variant)
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        # zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32))  # CPU variant
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.field_num * 4, self.field_size, self.field_size), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)  # unchanged (translated from: 修正なし)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        Q = self.model_target.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        # This variant returns the raw action index, not a controller code.
        return index_action, Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def save_model(self, model_name, opt_name):
        serializers.save_hdf5(model_name, self.model)
        serializers.save_hdf5(opt_name, self.optimizer)

    def read_model(self, model_name, opt_name):
        serializers.load_hdf5(model_name, self.model)
        serializers.load_hdf5(opt_name, self.optimizer)
        self.model_target = copy.deepcopy(self.model)
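# A brief usage sketch (assumed, not from the source) for the save_model /
# read_model pair above; the file names are illustrative. read_model also
# re-syncs the target network via deepcopy.
dqn = DQN_class(control_size=10)
# ... training ...
dqn.save_model('dqn_model.h5', 'dqn_opt.h5')  # HDF5 via chainer.serializers
dqn.read_model('dqn_model.h5', 'dqn_opt.h5')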
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 5*10**4     # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**6                 # Data size of history. original: 10^6
    num_of_actions = 2                # Action dimension
    num_of_states = 12                # State dimension

    def __init__(self):
        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        # Earlier architectures, kept for reference:
        # self.model = FunctionSet(
        #     l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
        #     l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
        #     l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
        #     l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
        #     q_value=F.Linear(512, self.num_of_actions,
        #                      initialW=np.zeros((self.num_of_actions, 512),
        #                                        dtype=np.float32))
        # ).to_gpu()

        # self.critic = FunctionSet(
        #     l1=F.Linear(self.num_of_actions+self.num_of_states, 512),
        #     l2=F.Linear(512, 256),
        #     l3=F.Linear(256, 128),
        #     q_value=F.Linear(128, 1, initialW=np.zeros((1, 128), dtype=np.float32))
        # ).to_gpu()
        #
        # self.actor = FunctionSet(
        #     l1=F.Linear(self.num_of_states, 512),
        #     l2=F.Linear(512, 256),
        #     l3=F.Linear(256, 128),
        #     a_value=F.Linear(128, self.num_of_actions, initialW=np.zeros((1, 128), dtype=np.float32))
        # ).to_gpu()

        self.critic = FunctionSet(
            l1=F.Linear(self.num_of_actions+self.num_of_states, 1024),
            l2=F.Linear(1024, 512),
            l3=F.Linear(512, 256),
            l4=F.Linear(256, 128),
            q_value=F.Linear(128, 1, initialW=np.zeros((1, 128), dtype=np.float32))
        ).to_gpu()

        self.actor = FunctionSet(
            l1=F.Linear(self.num_of_states, 1024),
            l2=F.Linear(1024, 512),
            l3=F.Linear(512, 256),
            l4=F.Linear(256, 128),
            a_value=F.Linear(128, self.num_of_actions, initialW=np.zeros((1, 128), dtype=np.float32))
        ).to_gpu()

        # Scaled-initialization variants, kept for reference:
        # self.critic = FunctionSet(
        #     l1=F.Linear(self.num_of_actions+self.num_of_states, 1024, wscale=0.01*math.sqrt(self.num_of_actions+self.num_of_states)),
        #     l2=F.Linear(1024, 512, wscale=0.01*math.sqrt(1024)),
        #     l3=F.Linear(512, 256, wscale=0.01*math.sqrt(512)),
        #     l4=F.Linear(256, 128, wscale=0.01*math.sqrt(256)),
        #     q_value=F.Linear(128, 1, wscale=0.01*math.sqrt(128))
        # ).to_gpu()
        #
        # self.actor = FunctionSet(
        #     l1=F.Linear(self.num_of_states, 1024, wscale=0.01*math.sqrt(self.num_of_states)),
        #     l2=F.Linear(1024, 512, wscale=0.01*math.sqrt(1024)),
        #     l3=F.Linear(512, 256, wscale=0.01*math.sqrt(512)),
        #     l4=F.Linear(256, 128, wscale=0.01*math.sqrt(256)),
        #     a_value=F.Linear(128, self.num_of_actions, wscale=0.01*math.sqrt(128))
        # ).to_gpu()

        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

        print "Initializing Optimizer"
        # self.optim_critic = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        # self.optim_actor = optimizers.RMSpropGraves(lr=0.0001, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optim_critic = optimizers.Adam(alpha=0.00001)
        self.optim_actor = optimizers.Adam(alpha=0.00001)
        self.optim_critic.setup(self.critic)
        self.optim_actor.setup(self.actor)
        # self.optim_critic.add_hook(chainer.optimizer.WeightDecay(0.00001))
        # self.optim_critic.add_hook(chainer.optimizer.GradientClipping(10))
        # self.optim_actor.add_hook(chainer.optimizer.WeightDecay(0.00001))
        # self.optim_actor.add_hook(chainer.optimizer.GradientClipping(10))

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_actions), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, self.num_of_states), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

        # Hyper-parameter logging, kept for reference:
        # with open('dqn_dump.json', 'a') as f:
        #     json.dump(datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), f)
        #     f.write('\n')
        #     json.dump({"alpha": 0.00001, "beta1": 0.7, "beta2": 0.999, "weight_decay": 0.00001}, f)
        #     f.write('\n')

        # self.x_PID = Hover_PID_Controller(12.1, 1.25)
        # self.y_PID = Hover_PID_Controller(12.1, 1.25)

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        # The critic takes the state-action concatenation as its input.
        s = Variable(cuda.to_gpu(np.concatenate([state, action], 1)))
        s_dash = Variable(cuda.to_gpu(state_dash))

        Q = self.Q_func(s)  # Get Q-value

        # Generate targets through the target networks
        action_dash_tmp = self.A_func_target(s_dash)
        action_dash = np.asanyarray(action_dash_tmp.data.get(), dtype=np.float32)
        tmp_dash = Variable(cuda.to_gpu(np.concatenate([state_dash, action_dash], 1)))
        Q_dash_tmp = self.Q_func_target(tmp_dash)
        Q_dash = np.asanyarray(Q_dash_tmp.data.get(), dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = Reward[i] + self.gamma * Q_dash[i]
            else:
                tmp_ = Reward[i]
            target[i] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, 1), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def updateActor(self, state):
        num_of_batch = state.shape[0]
        A_max = 1.0
        A_min = -1.0

        A = self.A_func(Variable(cuda.to_gpu(state)))
        tmp = Variable(cuda.to_gpu(np.concatenate([state, A.data.get()], 1)))
        Q = self.Q_func(tmp)

        # Backprop towards the actor net: ascend Q by backpropagating -1
        # through the critic and taking the gradient w.r.t. the action slice.
        # self.critic.zerograds()
        # self.actor.zerograds()
        Q.grad = cuda.to_gpu(np.ones((num_of_batch, 1), dtype=np.float32) * (-1.0))
        # Q.grad = Q.data*(-1.0)
        Q.backward()
        A.grad = tmp.grad[:, -self.num_of_actions:]
        print("sample_A.grad (raw): " + str(A.grad[0]))

        # Gradient inverting: scale down gradients that would push an action
        # past its [A_min, A_max] bounds.
        for i in xrange(num_of_batch):
            for j in xrange(self.num_of_actions):
                if A.grad[i][j] < 0:
                    A.grad[i][j] *= (A_max - A.data[i][j]) / (A_max - A_min)
                elif A.grad[i][j] > 0:
                    A.grad[i][j] *= (A.data[i][j] - A_min) / (A_max - A_min)

        A.backward()
        self.optim_actor.update()
        print("sample_A.grad (inverted): " + str(A.grad[0]))

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            # Reward-greedy sampling, kept for reference:
            # reward_list = list(self.D[2])
            # replay_index = [i[0] for i in sorted(enumerate(reward_list), key=itemgetter(1), reverse=True)[:32]]
            # replay_index = np.asarray(replay_index).reshape(32, 1)

            s_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_actions), dtype=np.float32)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.num_of_states), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = np.asarray(self.D[1][replay_index[i]], dtype=np.float32)
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.asarray(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            # forward() moves the batches to the GPU itself:
            # s_replay = cuda.to_gpu(s_replay)
            # s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based critic update
            self.optim_critic.zero_grads()
            loss, q = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optim_critic.update()

            # Update the actor
            self.optim_critic.zero_grads()
            self.optim_actor.zero_grads()
            self.updateActor(s_replay)

            self.soft_target_model_update()

            print "AVG_Q %f" % (np.average(q.data.get()))
            print("loss " + str(loss.data))
            # with open('dqn_dump.json', 'a') as f:
            #     json.dump({"time": time, "avg_Q": float(np.average(q.data.get())), "loss": float(loss.data)}, f)
            #     f.write('\n')

    def Q_func(self, state):
        # 3-layer variant, kept for reference (likewise for the target and actor nets below):
        # h1 = F.relu(self.critic.l1(state))
        # h2 = F.relu(self.critic.l2(h1))
        # h3 = F.relu(self.critic.l3(h2))
        # Q = self.critic.q_value(h3)
        h1 = F.relu(self.critic.l1(state))
        h2 = F.relu(self.critic.l2(h1))
        h3 = F.relu(self.critic.l3(h2))
        h4 = F.relu(self.critic.l4(h3))
        Q = self.critic.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.critic_target.l1(state))
        h2 = F.relu(self.critic_target.l2(h1))
        h3 = F.relu(self.critic_target.l3(h2))
        h4 = F.relu(self.critic_target.l4(h3))  # fixed: was self.critic.l4, mixing online and target nets
        Q = self.critic_target.q_value(h4)
        return Q

    def A_func(self, state):
        h1 = F.relu(self.actor.l1(state))
        h2 = F.relu(self.actor.l2(h1))
        h3 = F.relu(self.actor.l3(h2))
        h4 = F.relu(self.actor.l4(h3))
        A = self.actor.a_value(h4)
        return A

    def A_func_target(self, state):
        h1 = F.relu(self.actor_target.l1(state))
        h2 = F.relu(self.actor_target.l2(h1))
        h3 = F.relu(self.actor_target.l3(h2))
        h4 = F.relu(self.actor_target.l4(h3))  # fixed: was self.actor.l4, mixing online and target nets
        A = self.actor_target.a_value(h4)
        return A

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        A = self.A_func(s)
        A = A.data

        if np.random.rand() < epsilon:
            action = np.random.uniform(-1., 1., (1, self.num_of_actions)).astype(np.float32)
            # Hand-crafted exploration, kept for reference:
            # action = np.zeros((1, self.num_of_actions), dtype=np.float32)
            # if state[0, 0] > 0:
            #     action[0, 0] = np.random.uniform(0.0, 0.5)
            # elif state[0, 0] < 0:
            #     action[0, 0] = np.random.uniform(-0.5, 0.0)
            # if state[0, 1] < 0:
            #     action[0, 1] = np.random.uniform(0.0, 0.5)
            # elif state[0, 1] > 0:
            #     action[0, 1] = np.random.uniform(-0.5, 0.0)
            # action[0, 0] = -self.x_PID.getCorrection(state[0][0], 0.0)
            # action[0, 1] = self.y_PID.getCorrection(state[0][1], 0.0)
            print "RANDOM"
        else:
            action = A.get()
            print "GREEDY"
        return action

    def hard_target_model_update(self):
        self.critic_target = copy.deepcopy(self.critic)
        self.actor_target = copy.deepcopy(self.actor)

    def soft_target_model_update(self, tau=0.001):
        # Polyak averaging. Note that only the W matrices are blended; the
        # biases keep the values they had when the targets were deep-copied.
        self.critic_target.l1.W.data = tau*self.critic.l1.W.data + (1-tau)*self.critic_target.l1.W.data
        self.critic_target.l2.W.data = tau*self.critic.l2.W.data + (1-tau)*self.critic_target.l2.W.data
        self.critic_target.l3.W.data = tau*self.critic.l3.W.data + (1-tau)*self.critic_target.l3.W.data
        self.critic_target.l4.W.data = tau*self.critic.l4.W.data + (1-tau)*self.critic_target.l4.W.data
        self.critic_target.q_value.W.data = tau*self.critic.q_value.W.data + (1-tau)*self.critic_target.q_value.W.data

        self.actor_target.l1.W.data = tau*self.actor.l1.W.data + (1-tau)*self.actor_target.l1.W.data
        self.actor_target.l2.W.data = tau*self.actor.l2.W.data + (1-tau)*self.actor_target.l2.W.data
        self.actor_target.l3.W.data = tau*self.actor.l3.W.data + (1-tau)*self.actor_target.l3.W.data
        self.actor_target.l4.W.data = tau*self.actor.l4.W.data + (1-tau)*self.actor_target.l4.W.data
        self.actor_target.a_value.W.data = tau*self.actor.a_value.W.data + (1-tau)*self.actor_target.a_value.W.data
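# A hedged equivalent of soft_target_model_update above: the same Polyak
# averaging written once over named parameters instead of layer by layer.
# This sketch assumes Link/Chain-style models exposing namedparams() (the
# FunctionSet code above predates that API). Unlike the original, which blends
# only the W matrices, this variant also tracks the biases.
def soft_update(target, source, tau=0.001):
    src = dict(source.namedparams())
    for name, param in target.namedparams():
        param.data = tau * src[name].data + (1 - tau) * param.data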
class QNet:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**3       # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6
    hist_size = 1                     # original: 4

    def __init__(self, use_gpu, enable_controller, dim, epsilon, epsilon_delta, min_eps):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim
        self.epsilon = epsilon
        self.epsilon_delta = epsilon_delta
        self.min_eps = min_eps
        self.time = 0

        app_logger.info("Initializing Q-Network...")

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            app_logger.info(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            app_logger.info("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def start(self, feature):
        self.state = np.zeros((self.hist_size, self.dim), dtype=np.uint8)
        self.state[0] = feature

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action by e-greedy action selection
        action, q_now = self.e_greedy(state_, self.epsilon)
        return_action = action
        return return_action

    def update_model(self, replayed_experience):
        if replayed_experience[0]:
            self.optimizer.zero_grads()
            loss, _ = self.forward(replayed_experience[1], replayed_experience[2],
                                   replayed_experience[3], replayed_experience[4],
                                   replayed_experience[5])
            loss.backward()
            self.optimizer.update()

        # Target model update
        if replayed_experience[0] and np.mod(self.time, self.target_model_update_freq) == 0:
            app_logger.info("Model Updated")
            self.target_model_update()

        self.time += 1
        app_logger.info("step: {}".format(self.time))

    def step(self, features):
        if self.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2],
                                        self.state[3], features], dtype=np.uint8)
        elif self.hist_size == 2:
            self.state = np.asanyarray([self.state[1], features], dtype=np.uint8)
        elif self.hist_size == 1:
            self.state = np.asanyarray([features], dtype=np.uint8)
        else:
            app_logger.error("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.initial_exploration < self.time:
            self.epsilon -= self.epsilon_delta
            if self.epsilon < self.min_eps:
                self.epsilon = self.min_eps
            eps = self.epsilon
        else:  # Initial Exploration Phase
            app_logger.info("Initial Exploration : {}/{} steps".format(self.time, self.initial_exploration))
            eps = 1.0

        # Generate an Action by e-greedy action selection
        action, q_now = self.e_greedy(state_, eps)

        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        return action, eps, q_max
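# A hedged sketch (not in the original source) of how QNet's start/step/
# update_model entry points fit together. `observe` is a hypothetical
# environment hook, and `replayed_experience` is the
# (success_flag, s, a, r, s_dash, episode_end) tuple that forward() expects,
# produced by a replay buffer owned by the caller.
qnet = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=256,
            epsilon=1.0, epsilon_delta=1e-6, min_eps=0.1)
action = qnet.start(features)          # first observation of an episode
while True:
    features = observe(action)         # hypothetical environment step
    action, eps, q_max = qnet.step(features)
    qnet.update_model(replayed_experience)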
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 100         # Initial exploration. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 30000                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512),
                                               dtype=np.float32)))  # .to_gpu()

        # Pretrained weights; this variant runs on the CPU and never updates them.
        self.model.l1.W = np.load('elite/l1_W.npy')
        self.model.l1.b = np.load('elite/l1_b.npy')
        self.model.l2.W = np.load('elite/l2_W.npy')
        self.model.l2.b = np.load('elite/l2_b.npy')
        self.model.l3.W = np.load('elite/l3_W.npy')
        self.model.l3.b = np.load('elite/l3_b.npy')
        self.model.l4.W = np.load('elite/l4_W.npy')
        self.model.l4.b = np.load('elite/l4_b.npy')
        self.model.q_value.W = np.load('elite/q_value_W.npy')
        self.model.q_value.b = np.load('elite/q_value_b.npy')

        self.model_target = copy.deepcopy(self.model)

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(target) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(np.zeros((self.replay_size, self.num_of_actions),
                                     dtype=np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            # Loss is computed for monitoring only: this class defines no
            # optimizer, so no gradient step is taken.
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # fixed: was self.model.l4, leaking the online net into the target
        Q = self.model_target.q_value(h4)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            # index_action = np.argmax(Q.get())  # GPU variant
            index_action = np.argmax(Q)
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
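# Hedged note and sketch, not from the source: since this variant loads the
# pretrained 'elite/' weights and defines no optimizer, experienceReplay only
# measures the loss; the class is effectively evaluation-only. Scoring a
# single (zero-filled, illustrative) frame stack on the CPU might look like:
dqn = DQN_class()
frames = np.zeros((1, 4, 84, 84), dtype=np.float32)
action, q = dqn.e_greedy(frames, epsilon=0.0)  # epsilon=0.0 forces GREEDY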
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**3       # Initial exploration
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency
    data_size = 10**5                 # Data size of history

    # actions: 0 => do nothing, 1 => buy, 2 => sell
    def __init__(self, input_vector_length, enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        # inputs --> 5 * 14 (with 10 temporality) + 5 (of last one hour) + 5 (of last 24 hours)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals. Note: the online network also evaluates s'
        # here; self.model_target only exists once target_model_update() is called.
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        # TODO: might want to normalize inputs, but for now that is done outside this class
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
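# A hedged sketch of driving the trading variant above; the feature-vector
# length and the features themselves are placeholders, and inputs are assumed
# to be normalized by the caller, as the TODO in Q_func notes.
dqn = DQN_class(input_vector_length=80)          # length is illustrative
features = np.zeros((1, 80), dtype=np.float32)   # normalized market features
state = cuda.to_gpu(features)                    # the model lives on the GPU
action, q = dqn.e_greedy(state, epsilon=0.05)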