class DN_class:
    # Assumes module-level imports: numpy as np, copy, chainer (FunctionSet, Variable,
    # cuda, optimizers, serializers) and chainer.functions as F, plus the project-specific
    # DN_out layer and the launcher's `args` namespace.

    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 100          # was 10**4; Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 256), dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)
        ).to_gpu()

        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'], npz_tmp1['D1'], npz_tmp1['D2'],
                      npz_tmp2['D3'], npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded all stored D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)              # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s', argmax_a Q(s',a))  (Double DQN target)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now Q_func is implemented'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # left stream, feeds the state-value V(s)
        h5 = F.relu(self.model.l5(h3))  # right stream, feeds the advantage A(s,a)
        h6 = self.model.l6(h4)          # state value
        h7 = self.model.l7(h5)          # advantage values
        Q = self.model.q_value(h6, h7)  # Q value
        return Q

    def Q_func_target(self, state):
        print 'now Q_func_target is implemented'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # left stream, feeds the state-value V(s)
        h5 = F.relu(self.model_target.l5(h3))  # right stream, feeds the advantage A(s,a)
        h6 = self.model_target.l6(h4)          # state value
        h7 = self.model_target.l7(h5)          # advantage values
        Q = self.model_target.q_value(h6, h7)  # Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
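# DN_out is a project-specific output layer whose source is not included here; judging by
# its inputs (a 1-unit state-value stream and a num_of_actions advantage stream), it
# presumably performs the dueling aggregation of Wang et al. (2016),
# Q(s,a) = V(s) + A(s,a) - mean_a A(s,a). A minimal NumPy sketch of that aggregation,
# under that assumption (function and variable names here are illustrative only):
import numpy as np

def dueling_q(v, a):
    """Combine state values v (batch, 1) and advantages a (batch, n_actions) into
    Q-values, subtracting the mean advantage for identifiability."""
    return v + a - a.mean(axis=1, keepdims=True)

# Example: batch of 2 states, 4 actions.
v = np.array([[1.0], [0.5]], dtype=np.float32)
a = np.array([[0.2, -0.2, 0.0, 0.4],
              [0.1,  0.3, -0.4, 0.0]], dtype=np.float32)
q = dueling_q(v, a)          # shape (2, 4); each row's mean equals the row of v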
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 4                      # original: 4
    save_model_freq = 10**4            # How often to save the model

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")
        print("Input Dim of Q-Network : ", self.dim * self.hist_size)

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            l5=F.Linear(hidden_dim, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size  # time acts as a circular index, giving a FIFO queue

        if episode_end_flag is True:
            # When the episode ends, state_dash is left as all zeros
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(h4))
        q = self.model.q_value(h5)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(h4))
        q = self.model_target.q_value(h5)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def save_model(self, folder, time):
        try:
            model_path = "./%s/%dmodel" % (folder, time)
            serializers.save_npz(model_path, self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model is saved!!(Model_Path=%s)" % (model_path)
        print "----------------------------------------------"

    def load_model(self, folder, model_num):
        try:
            model_path = "./%s/%dmodel" % (folder, model_num)
            serializers.load_npz(model_path, self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model load is done!!(Model_Path=%s)" % (model_path)
        print "----------------------------------------------"
        self.target_model_update()
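# A minimal sketch of the control loop this class is written to be driven by. The real
# launcher script is not part of this file, so the toy environment below (random
# transitions, the epsilon schedule, and the names run_toy_loop / steps) is purely an
# illustrative assumption about how the public methods fit together.
import numpy as np

def run_toy_loop(dim=8, steps=2000):
    q_net = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=dim)  # CPU mode
    epsilon = 1.0
    state = np.zeros((q_net.hist_size, dim), dtype=np.float32)
    for time in xrange(steps):
        # choose an action from the current stacked observation
        action, _ = q_net.e_greedy(state[np.newaxis].astype(np.float32), epsilon)
        # toy transition: random next observation, random reward, episode never ends
        state_dash = np.random.randint(0, 256, (q_net.hist_size, dim)).astype(np.float32)
        reward = np.random.choice([-1, 0, 1])
        q_net.stock_experience(time, state, action, reward, state_dash, False)
        q_net.experience_replay(time)                 # trains once initial_exploration is passed
        if time % q_net.target_model_update_freq == 0:
            q_net.target_model_update()
        epsilon = max(0.1, epsilon - 1.0 / 10**4)     # example linear annealing
        state = state_dash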
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99
    #initial_exploration = 10**2
    initial_exploration = 10
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    #data_size = 10**6
    data_size = 10**5

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
        # Action table (linear, angular):
        #   [ 0, 0], [ 0, 1], [ 0,-1],
        #   [ 1, 0], [ 1, 1], [ 1,-1],
        #   [-1, 0], [-1, 1], [-1,-1]
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller

        print "Initializing DQN..."
        print "CUDA init"
        #cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(INPUT_SIZE, 5000),  # input map[100, 100] + v[2] + w[1] + wp[2]
            #l1=F.Linear(INPUT_SIZE, 100),  # input map[100, 100] + v[2] + w[1] + wp[2]
            l2=F.Linear(5000, 1000),
            l3=F.Linear(1000, 500),
            l4=F.Linear(500, 100),
            l5=F.Linear(100, self.num_of_actions,
            #l2=F.Linear(100, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 100), dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        # Important: RMSpropGraves, as in the DQN paper
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
        #self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros(self.data_size, dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.int8),
        #          np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)         # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            #action_index = self.action_to_index(action[i])
            #target[i, action_index] = tmp_
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        #print "td-error"
        #print "np.max(td.data) : ",
        #print np.max(td.data.get())
        # Purpose unclear: where |td| <= 1 this leaves td == td_clip
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)
        #print "td_clip.data :",
        #print td_clip.data

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
        #zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    # Store experience data
    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    # Mini-batch learning
    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            #s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
            s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]
                if i == 0:
                    print "s", s_replay[0]
                    print "a", a_replay[0]
                    print "s\'", s_dash_replay[0]
                    print "r", r_replay[0]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()          # backpropagation
            self.optimizer.update()  # learning step: update the network

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        #h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.l5(h4)
        #Q = self.model.l2(h1)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        #h1 = F.relu(self.model_target.l1(state))
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.l5(h4)  # output layer of the target network
        #Q = self.model.l2(h1)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            #index_action = np.random.randint(0, self.num_of_actions)
            action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            #index_action = np.argmax(Q.get())
            action = np.argmax(Q.get())
            print "GREEDY"
        #return self.index_to_action(index_action), Q
        return action, Q

    def action_to_vec(self, action, vec):
        # Action table (linear, angular):
        #   [ 0, 0], [ 0, 1], [ 0,-1],
        #   [ 1, 0], [ 1, 1], [ 1,-1],
        #   [-1, 0], [-1, 1], [-1,-1]
        #vec = Twist()
        if action == 3 or action == 4 or action == 5:
            #vec.linear.x += 0.1
            vec.linear.x = 0.3
        elif action == 6 or action == 7 or action == 8:
            #vec.linear.x -= 0.1
            vec.linear.x = -0.3
        else:
            vec.linear.x = 0.0

        if action == 1 or action == 4 or action == 7:
            #vec.angular.z += 0.1
            vec.angular.z = 0.3
        elif action == 2 or action == 5 or action == 8:
            #vec.angular.z -= 0.1
            vec.angular.z = -0.3
        else:
            vec.angular.z = 0.0

        if vec.linear.x > 1:
            vec.linear.x = 1
        elif vec.linear.x < -1:
            vec.linear.x = -1
        if vec.angular.z > 1:
            vec.angular.z = 1
        elif vec.angular.z < -1:
            vec.angular.z = -1
        return vec
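# For reference, the discretization implemented by action_to_vec above is equivalent to
# indexing the action by a // 3 (forward / stop / backward) and a % 3 (straight / left /
# right). A quick standalone check of the resulting (linear.x, angular.z) table, with no
# ROS message object needed (variable names here are illustrative):
for a in range(9):
    lin = [0.0, 0.3, -0.3][a // 3]
    ang = [0.0, 0.3, -0.3][a % 3]
    print a, lin, ang
# 0:(0.0, 0.0)   1:(0.0, 0.3)   2:(0.0, -0.3)
# 3:(0.3, 0.0)   4:(0.3, 0.3)   5:(0.3, -0.3)
# 6:(-0.3, 0.0)  7:(-0.3, 0.3)  8:(-0.3, -0.3)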
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 1                      # original: 4
    time_M = 11                        # number of stacked time steps (current + 10 past)

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim1 = 64
        #hidden_dim1 = 32
        hidden_dim2 = 128
        hidden_dim3 = 10
        hidden_cont = 100
        self.model = FunctionSet(
            l4=linearL4_link.LinearL4_link(self.dim * self.hist_size * self.time_M, hidden_cont,
                                           wscale=np.sqrt(2)),
            l5=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont, wscale=np.sqrt(2)),
            l6=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont, wscale=np.sqrt(2)),
            l7=attention.Attention(hidden_cont, hidden_dim3 * hidden_cont, hidden_dim3),
            l8=retrieval.Retrieval(hidden_dim3, hidden_dim3 * hidden_cont, hidden_cont),
            l9=F.Bilinear(hidden_cont, hidden_cont, hidden_dim2),
            q_value=F.Linear(hidden_dim2, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim2),
                                               dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s (now & 10-step history), a, r, s_dash, end_episode_flag]
        # modified for MQN
        self.d = [
            np.zeros((self.data_size, self.hist_size * self.time_M, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            # modify s_replay for MQN
            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size * self.time_M, self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            # modify s_dash_replay into a time_M-step stack for the MQN model
            #print 'now state 1'
            s_dash_tmp = s_dash_replay.reshape(len(s_dash_replay), -1).astype(dtype=np.float32)
            #print 'now state 2'
            s_dash_M = np.ndarray(shape=(self.replay_size, self.hist_size * self.time_M, self.dim),
                                  dtype=np.float32)
            #print 'now state 3'
            s_dash_M[:, 0] = s_dash_tmp
            #print 'now state 4'
            for i in range(self.time_M - 1):
                s_dash_M[:, i + 1] = s_replay[:, i]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_M)
            else:
                s_dash_replay = s_dash_M  # keep the stacked shape on the CPU path as well

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = self.model.l5(state / 255.0)
        h6 = self.model.l6(state / 255.0)
        h7 = F.softmax(self.model.l7(h4, h5))
        h8 = self.model.l8(h7, h6)
        h9 = F.relu(self.model.l9(h4, h8))
        q = self.model.q_value(h9)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = self.model_target.l5(state / 255.0)
        h6 = self.model_target.l6(state / 255.0)
        h7 = F.softmax(self.model_target.l7(h4, h5))
        h8 = self.model_target.l8(h7, h6)
        h9 = F.relu(self.model_target.l9(h4, h8))
        q = self.model_target.q_value(h9)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
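# Shape sanity check for the MQN next-state stacking in experience_replay above: slot 0
# receives the new observation, slots 1..time_M-1 receive the current stack shifted back
# by one step (the oldest frame is dropped). A small self-contained NumPy sketch with
# illustrative sizes (hist_size == 1, as in this class):
import numpy as np

replay_size, time_M, dim = 2, 11, 3
s_replay = np.arange(replay_size * time_M * dim, dtype=np.float32).reshape(replay_size, time_M, dim)
s_dash = np.full((replay_size, 1, dim), -1.0, dtype=np.float32)   # the new observation

s_dash_M = np.empty_like(s_replay)
s_dash_M[:, 0] = s_dash.reshape(replay_size, -1)
for i in range(time_M - 1):
    s_dash_M[:, i + 1] = s_replay[:, i]

assert s_dash_M.shape == (replay_size, time_M, dim)
assert (s_dash_M[:, 0] == -1.0).all()
assert (s_dash_M[:, 1:] == s_replay[:, :-1]).all()   # shifted copy of the old stack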
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 1                      # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        #hidden_dim = 256
        hidden_dim128 = 128
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)),
            l5=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)),
            l6=F.Linear(hidden_dim128, 1, wscale=np.sqrt(2),
                        initialW=np.zeros((1, hidden_dim128), dtype=np.float32)),  # V(s)
            l7=F.Linear(hidden_dim128, self.num_of_actions, wscale=np.sqrt(2),
                        initialW=np.zeros((self.num_of_actions, hidden_dim128),
                                          dtype=np.float32)),  # A(s,a)
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(state / 255.0))
        #h6 = F.relu(self.model.l6(h4))
        #h7 = relu_l7.relu(self.model.l7(h5))
        h6 = self.model.l6(h4)
        h7 = self.model.l7(h5)
        q = self.model.q_value(h6, h7)
        return q

    def q_func_target(self, state):
        #h4 = F.relu(self.model_target.l4(state / 255.0))
        #q = self.model_target.q_value(h4)
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(state / 255.0))
        #h6 = F.relu(self.model_target.l6(h4))
        #h7 = relu_l7.relu(self.model_target.l7(h5))
        h6 = self.model_target.l6(h4)
        h7 = self.model_target.l7(h5)
        q = self.model_target.q_value(h6, h7)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
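# Numeric check of the TD-error clipping expression used in forward() above (the same
# trick appears in the other classes in this collection): element-wise, the value of
# td_clip equals np.clip(td, -1, 1). The "+ 1000.0" term only prevents a division by
# zero for entries with |td| <= 1, which the (abs(td) > 1) mask discards anyway. The
# sample values below are illustrative.
import numpy as np

td = np.array([[-3.2, -0.4, 0.0, 0.7, 2.5]], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                                   # avoid zero division
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)  # clip to [-1, 1]
assert np.allclose(td_clip, np.clip(td, -1, 1))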
class ConvQAgent(Agent):
    # Assumes module-level imports: numpy as np, cupy as cp, copy, random, pickle,
    # chainer (FunctionSet, Variable, cuda, optimizers) and chainer.functions as F,
    # plus the project-specific Agent base class and ConvHistory replay buffer.

    def __init__(self, frames_per_action=4):
        super(ConvQAgent, self).__init__()
        cuda.init()
        self.epsilon = 1.0
        self.gamma = 0.99
        self.iterations = 0
        self.model = FunctionSet(
            l1=F.Convolution2D(frames_per_action, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(64 * 7 * 7, 512),
            l5=F.Linear(512, 2)
        ).to_gpu()
        self.optimizer = optimizers.RMSprop(lr=1e-5)
        self.optimizer.setup(self.model)
        self.update_target()

        self.num_frames = 0
        self.frames_per_action = frames_per_action
        self.prev_reward = 0.0
        self.history = ConvHistory((frames_per_action, 84, 84))

    def update_target(self):
        self.target_model = copy.deepcopy(self.model)
        self.target_model = self.target_model.to_gpu()

    def act(self, state):
        self.update_state_vector(state)
        if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
            return None

        if random.random() < 0.001:
            print 'Epsilon: {}'.format(self.epsilon)
        if self.epsilon > 0.05:
            self.epsilon -= (0.95 / 300000)
        if random.random() < self.epsilon:
            return random.random() > 0.375

        q = self.get_q(Variable(cuda.to_gpu(self.curr_state[np.newaxis, :, :, :])))
        if random.random() < 0.01:
            if q.data[0, 1] > q.data[0, 0]:
                print 'On: {}'.format(q.data)
            else:
                print 'Off: {}'.format(q.data)
        return q.data[0, 1] > q.data[0, 0]

    def update_state_vector(self, state):
        if self.num_frames < self.frames_per_action:
            if self.num_frames == 0:
                self.curr_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.curr_state[self.num_frames, :, :] = state
        else:
            if self.num_frames == self.frames_per_action:
                self.prev_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.prev_state[1:, :, :] = self.prev_state[:-1, :, :]
            self.prev_state[0, :, :] = self.curr_state[-1, :, :]
            self.curr_state[1:, :, :] = self.curr_state[:-1, :, :]
            self.curr_state[0, :, :] = state
        self.num_frames += 1

    def accept_reward(self, state, action, reward, new_state, is_terminal):
        self.prev_reward += reward
        if not (is_terminal or self.num_frames % self.frames_per_action == 0):
            return
        if self.num_frames == self.frames_per_action:
            self.prev_reward = 0.0
            self.prev_action = action
            return

        self.history.add((self.prev_state, self.prev_action, self.prev_reward, self.curr_state, is_terminal))
        self.prev_reward = 0.0
        self.prev_action = action

        self.iterations += 1
        if self.iterations % 10000 == 0:
            print '*** UPDATING TARGET NETWORK ***'
            self.update_target()

        state, action, reward, new_state, is_terminal = self.history.get(num=32)
        state = cuda.to_gpu(state)
        action = cuda.to_gpu(action)
        new_state = cuda.to_gpu(new_state)
        reward = cuda.to_gpu(reward)

        loss, q = self.forward(state, action, reward, new_state, is_terminal)
        self.optimizer.zero_grads()
        loss.backward()
        self.optimizer.update()

    def forward(self, state, action, reward, new_state, is_terminal):
        q = self.get_q(Variable(state))
        q_target = self.get_target_q(Variable(new_state))
        max_target_q = cp.max(q_target.data, axis=1)

        target = cp.copy(q.data)
        for i in xrange(target.shape[0]):
            curr_action = int(action[i, 0])
            if is_terminal[i]:
                target[i, curr_action] = reward[i]
            else:
                target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]

        loss = F.mean_squared_error(Variable(target), q)
        return loss, 0.0  #cp.mean(q.data[:, action[i]])

    def get_q(self, state):
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = self.model.l4(h3)
        return self.model.l5(h4)

    def get_target_q(self, state):
        h1 = F.relu(self.target_model.l1(state))
        h2 = F.relu(self.target_model.l2(h1))
        h3 = F.relu(self.target_model.l3(h2))
        h4 = self.target_model.l4(h3)
        return self.target_model.l5(h4)

    def save(self, file_name):
        with open(file_name, 'wb') as out_file:
            pickle.dump((self.model, self.optimizer), out_file)

    def load(self, file_name):
        self.epsilon = 0.0
        with open(file_name, 'rb') as in_file:
            model, optimizer = pickle.load(in_file)
            self.model.copy_parameters_from(model.parameters)
            self.optimizer = optimizer

    def start_new_game(self):
        self.num_frames = 0
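# The epsilon schedule in act() above decays linearly by 0.95 / 300000 per action
# decision (one decision every frames_per_action frames) from 1.0 down to the 0.05
# floor. A quick standalone check of how long that exploration phase lasts (variable
# names here are illustrative):
decay_per_decision = 0.95 / 300000
decisions_to_floor = int(round((1.0 - 0.05) / decay_per_decision))
print decisions_to_floor       # 300000 action decisions before epsilon reaches 0.05
print decisions_to_floor * 4   # ~1.2M frames at the default frames_per_action=4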