def generate_expert(self):
    # Roll out one full episode with the scripted teacher and record, for each
    # step, the state plus one-hot spatial-action targets at three resolutions
    # (64x64, 32x32, 16x16) for supervised pre-training.
    state, reward, done, info = self.env.reset()
    state_buffer, a16_buffer, a32_buffer, a64_buffer = [], [], [], []
    while not done:
        a0, a1, a2 = teacher.action(state, info)
        action_64 = np.zeros((scr_pixels * scr_pixels,), dtype=np.float32)
        action_32 = np.zeros((scr_pixels * scr_pixels // 4,), dtype=np.float32)
        action_16 = np.zeros((scr_pixels * scr_pixels // 16,), dtype=np.float32)
        # Downsample the (a1, a2) screen coordinate when indexing the coarser
        # grids; integer division keeps the index inside each one-hot vector.
        action_64[a1 * scr_pixels + a2] = 1
        action_32[(a1 // 2) * (scr_pixels // 2) + a2 // 2] = 1
        action_16[(a1 // 4) * (scr_pixels // 4) + a2 // 4] = 1
        state_buffer.append([state])
        a16_buffer.append([action_16])
        a32_buffer.append([action_32])
        a64_buffer.append([action_64])
        state, reward, done, info = self.env.step(
            1 if a0 == 0 else int(2 + a1 * scr_pixels + a2))
    # Stack the per-step lists into single arrays before returning.
    state_buffer = np.vstack(state_buffer)
    a16_buffer = np.vstack(a16_buffer)
    a32_buffer = np.vstack(a32_buffer)
    a64_buffer = np.vstack(a64_buffer)
    return state_buffer, a16_buffer, a32_buffer, a64_buffer
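# --- Illustrative sketch (not part of the class above) ----------------------
# A minimal, self-contained version of the multi-resolution one-hot encoding
# built in generate_expert, assuming a scr_pixels x scr_pixels screen (64 here)
# pooled down by factors of 2 and 4. `encode_spatial_action` is a hypothetical
# helper introduced only for illustration.
import numpy as np

def encode_spatial_action(row, col, side, factor):
    """One-hot a (row, col) screen click on a grid downsampled by `factor`."""
    small = side // factor  # side length of the coarse grid
    onehot = np.zeros((small * small,), dtype=np.float32)
    onehot[(row // factor) * small + col // factor] = 1.0
    return onehot

# Quick check with the bottom-right pixel of a 64x64 screen: every index stays
# inside its vector (4096, 1024, and 256 elements respectively).
_a64 = encode_spatial_action(63, 63, 64, 1)
_a32 = encode_spatial_action(63, 63, 64, 2)
_a16 = encode_spatial_action(63, 63, 64, 4)
assert _a64.argmax() == 63 * 64 + 63
assert _a32.argmax() == 31 * 32 + 31
assert _a16.argmax() == 15 * 16 + 15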
def pre_train(self):
    global GLOBAL_RUNNING_R, GLOBAL_EP
    total_step = 1
    buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
    buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
    while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
        # timestep[0] contains rewards, observations, etc.; see pysc2 for details.
        state, _, _, info = self.env.reset()
        ep_r = 0
        lei_ji = 0  # cumulative raw episode reward ("lei ji" = accumulated)
        while True:
            # The agent proposes an action, but the environment is stepped with
            # the teacher's action; both are stored so the expert-imitation loss
            # can be computed alongside the actor-critic loss.
            a0, a1, a2 = self.AC.choose_action([state], [info])
            a0_exp, a1_exp, a2_exp = teacher.action(state, info)
            action = 1 if a0 == 0 else int(2 + a1_exp * scr_pixels + a2_exp)
            buffer_s.append([state])
            buffer_avail.append([info])
            buffer_a0.append(a0)
            buffer_a1.append(a1)
            buffer_a2.append(a2)
            buffer_a0_exp.append(a0_exp)
            buffer_a1_exp.append(a1_exp)
            buffer_a2_exp.append(a2_exp)

            state, reward, done, info = self.env.step(action)
            lei_ji += reward
            if lei_ji >= 20:  # end the episode once enough raw reward is collected
                done = True
            if reward > 0:  # scale positive rewards by the shaped return so far
                reward = reward * (1 + ep_r * weight)
            buffer_r.append(reward)
            ep_r += reward

            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # Bootstrap from the critic unless the episode has ended.
                if done:
                    v_s_ = 0
                else:
                    v_s_ = sess.run(self.AC.value, {self.AC.s: [state]})[0, 0]
                buffer_v_target = []
                for r in buffer_r[::-1]:  # sweep rewards in reverse
                    v_s_ = r + GAMMA * v_s_  # compute the n-step value target
                    buffer_v_target.append(v_s_)
                buffer_v_target.reverse()

                # Stack the per-step lists into single arrays for the update.
                (buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_v_target,
                 buffer_avail, buffer_a0_exp, buffer_a1_exp, buffer_a2_exp) = (
                    np.vstack(buffer_s), np.vstack(buffer_a0),
                    np.vstack(buffer_a1), np.vstack(buffer_a2),
                    np.vstack(buffer_v_target), np.vstack(buffer_avail),
                    np.vstack(buffer_a0_exp), np.vstack(buffer_a1_exp),
                    np.vstack(buffer_a2_exp))
                feed_dict = {
                    self.AC.s: buffer_s,
                    self.AC.a0: buffer_a0,
                    self.AC.a1: buffer_a1,
                    self.AC.a2: buffer_a2,
                    self.AC.a0_exp: buffer_a0_exp,
                    self.AC.a1_exp: buffer_a1_exp,
                    self.AC.a2_exp: buffer_a2_exp,
                    self.AC.v_target: buffer_v_target,
                    self.AC.available: buffer_avail,
                }
                test = self.AC.update_global_high(feed_dict)  # push local gradients to the global net
                entropy, aloss, td, exp_loss, prob_a = sess.run(
                    [self.AC.entropy, self.AC.a_loss, self.AC.td,
                     self.AC.exp_loss, self.AC.log_prob_a],
                    feed_dict=feed_dict)

                buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
                buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
                self.AC.pull_global()  # sync local parameters from the global net

            total_step += 1
            if done:
                if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                    GLOBAL_RUNNING_R.append(ep_r)
                else:
                    GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
                print(
                    self.name,
                    "episode:", GLOBAL_EP,
                    "| reward: %.1f" % lei_ji,
                    "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                )
                GLOBAL_EP += 1
                print("entropy", entropy[0][0], "td", td[0],
                      "prob_a:", prob_a, "prob_exp:", exp_loss, "aloss", aloss)
                # Curriculum: switch to a harder map after a strong episode and
                # to an easier one after a weak episode.
                if ep_r > score_high[self.hard] or ep_r < score_low[self.hard]:
                    self.env.close()
                    self.hard = self.hard + 1 if ep_r > score_high[self.hard] else self.hard - 1
                    self.env = wrap(game[self.hard])
                break
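# --- Illustrative sketch (not part of the class above) ----------------------
# The update block above converts the reward buffer into bootstrapped n-step
# value targets by sweeping rewards in reverse. A stand-alone version of that
# computation; `discounted_value_targets` is a hypothetical helper, and the
# gamma argument plays the role of GAMMA above.
def discounted_value_targets(rewards, bootstrap_value, gamma):
    """Return r_t + gamma * v_{t+1}, computed backwards from bootstrap_value."""
    v = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return targets

# With gamma = 0.9, rewards [1, 0, 2] and a terminal state (bootstrap 0), the
# targets are [1 + 0.9 * (0 + 0.9 * 2), 0 + 0.9 * 2, 2.0].
assert discounted_value_targets([1, 0, 2], 0.0, 0.9) == [
    1 + 0.9 * (0 + 0.9 * 2), 0 + 0.9 * 2, 2.0]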
def pre_train(self):
    # Teacher-only variant of pre_train: every environment step is taken with
    # the scripted teacher's action, and the network is trained to match those
    # actions and the resulting value targets. If both definitions are kept in
    # the same class, this later one overrides the one above.
    global GLOBAL_RUNNING_R, GLOBAL_EP
    total_step = 1
    buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
    while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
        # timestep[0] contains rewards, observations, etc.; see pysc2 for details.
        state, _, _, info = self.env.reset()
        ep_r = 0
        while True:
            a0, a1, a2 = teacher.action(state, info)
            action = 1 if a0 == 0 else int(2 + a1 * scr_pixels + a2)
            buffer_s.append([state])
            buffer_avail.append([info])
            buffer_a0.append(a0)
            buffer_a1.append(a1)
            buffer_a2.append(a2)

            state, reward, done, info = self.env.step(action)
            buffer_r.append(reward)
            ep_r += reward

            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # Bootstrap from the critic unless the episode has ended.
                if done:
                    v_s_ = 0
                else:
                    v_s_ = sess.run(self.AC.value, {self.AC.s: [state]})[0, 0]
                buffer_v_target = []
                for r in buffer_r[::-1]:  # sweep rewards in reverse
                    v_s_ = r + GAMMA * v_s_  # compute the n-step value target
                    buffer_v_target.append(v_s_)
                buffer_v_target.reverse()

                # Stack the per-step lists into single arrays for the update.
                (buffer_s, buffer_a0, buffer_a1, buffer_a2,
                 buffer_v_target, buffer_avail) = (
                    np.vstack(buffer_s), np.vstack(buffer_a0),
                    np.vstack(buffer_a1), np.vstack(buffer_a2),
                    np.vstack(buffer_v_target), np.vstack(buffer_avail))
                feed_dict = {
                    self.AC.s: buffer_s,
                    self.AC.a0: buffer_a0,
                    self.AC.a1: buffer_a1,
                    self.AC.a2: buffer_a2,
                    self.AC.v_target: buffer_v_target,
                    self.AC.available: buffer_avail,
                }
                test = self.AC.update_global_high(feed_dict)  # push local gradients to the global net
                closs, aloss = sess.run([self.AC.c_loss, self.AC.a_loss],
                                        feed_dict=feed_dict)
                print("c_loss:", closs, "a_loss:", aloss)

                buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
                self.AC.pull_global()  # sync local parameters from the global net

            total_step += 1
            if done:
                if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                    GLOBAL_RUNNING_R.append(ep_r)
                else:
                    GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
                print(
                    self.name,
                    "episode:", GLOBAL_EP,
                    "| reward: %.1f" % ep_r,
                    "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                )
                GLOBAL_EP += 1
                break
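# --- Illustrative sketch (not part of the class above) ----------------------
# Both pre_train variants flatten the (a0, a1, a2) triple into the single
# discrete action id the wrapped environment expects: id 1 when a0 == 0, and
# ids 2 .. 2 + scr_pixels**2 - 1 enumerating screen coordinates row-major. A
# hypothetical encode/decode pair for that convention, assuming a 64-pixel
# screen; neither function exists in the original code.
def encode_action(a0, a1, a2, side=64):
    return 1 if a0 == 0 else int(2 + a1 * side + a2)

def decode_action(action_id, side=64):
    if action_id == 1:
        return 0, None, None  # the a0 == 0 branch carries no coordinate
    flat = action_id - 2
    return 1, flat // side, flat % side  # spatial branch: (row, col)

assert encode_action(0, 10, 20) == 1
assert decode_action(encode_action(1, 10, 20)) == (1, 10, 20)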