def update_master_policy(self, rbs, disc, lr, cter):
    samples = random.sample(rbs, batch_size)
    minimaps = []
    screens = []
    infos = []
    next_minimaps = []
    next_screens = []
    next_infos = []
    actions = []
    rewards = []
    for i, [obs, _, action, _, next_obs] in enumerate(samples):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        next_minimap = np.array(next_obs.observation['minimap'], dtype=np.float32)
        next_minimap = np.expand_dims(U.preprocess_minimap(next_minimap), axis=0)
        # The next state must come from next_obs, not obs.
        next_screen = np.array(next_obs.observation['screen'], dtype=np.float32)
        next_screen = np.expand_dims(U.preprocess_screen(next_screen), axis=0)
        next_info = np.zeros([1, self.isize], dtype=np.float32)
        next_info[0, next_obs.observation['available_actions']] = 1
        reward = next_obs.reward
        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)
        next_minimaps.append(next_minimap)
        next_screens.append(next_screen)
        next_infos.append(next_info)
        cur_action = np.zeros(num_subpolicies)
        cur_action[action] = 1
        actions.append(cur_action)
        rewards.append(reward)
    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)
    next_minimaps = np.concatenate(next_minimaps, axis=0)
    next_screens = np.concatenate(next_screens, axis=0)
    next_infos = np.concatenate(next_infos, axis=0)

    y_batch = []
    Qvalue_batch = self.sess_master.run(
        self.subpolicy_Q,
        feed_dict={self.minimap: next_minimaps,
                   self.screen: next_screens,
                   self.info: next_infos})
    for i in range(0, batch_size):
        terminal = samples[i][3]
        if terminal:
            y_batch.append(rewards[i])
        else:
            y_batch.append(rewards[i] + disc * np.max(Qvalue_batch[i]))

    self.sess_master.run(
        self.master_train_op,
        feed_dict={self.minimap: minimaps,
                   self.screen: screens,
                   self.info: infos,
                   self.y_input: y_batch,
                   self.action_input: actions,
                   self.learning_rate: lr})
def calc_pixel_change(self, obs2, obs1):
    screen1 = np.array(obs1.observation['screen'], dtype=np.float32)
    screen1 = np.expand_dims(U.preprocess_screen(screen1), axis=0)
    screen2 = np.array(obs2.observation['screen'], dtype=np.float32)
    screen2 = np.expand_dims(U.preprocess_screen(screen2), axis=0)
    screen1_avg = np.mean(screen1, axis=0)
    screen2_avg = np.mean(screen2, axis=0)
    # (60,60,3), any channel here? but with the np.mean on the next line,
    # it doesn't matter either way.
    d = np.absolute(screen2_avg[2:-2, 2:-2, :] - screen1_avg[2:-2, 2:-2, :])
    m = np.mean(d, 2)
    pc = self._subsample(m, 3)
    return pc
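# Note: calc_pixel_change above calls a self._subsample helper that is not
# shown in this section. The sketch below is an assumption of what such a
# helper typically looks like in UNREAL-style pixel-change code (block-wise
# average pooling); the name and signature are taken from the call site, while
# the body is illustrative rather than the original implementation.
def _subsample(self, a, rate):
    # Average-pool the 2-D array `a` over non-overlapping rate x rate blocks,
    # e.g. a 60x60 difference map with rate=3 becomes a 20x20 map of means.
    h, w = a.shape
    return a.reshape(h // rate, rate, w // rate, rate).mean(axis=(1, 3))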
def getTrainFeedDict(self, obs, action, attributed_act_id):
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = U.preprocess_screen(screen)
    info = np.zeros([len(U.useful_actions)], dtype=np.float32)
    info[U.compressActions(obs.observation['available_actions'])] = 1
    valid_spatial_action = 0
    valid_action = np.zeros([len(U.useful_actions)], dtype=np.float32)
    custom_inputs = np.array(obs.observation.custom_inputs, dtype=np.float32)
    act_id = action.function
    net_act_id = attributed_act_id
    act_args = action.arguments
    player_relative = obs.observation.feature_screen.player_relative

    valid_actions = obs.observation["available_actions"]
    valid_actions = U.compressActions(valid_actions)
    valid_action[valid_actions] = 1

    args = actions.FUNCTIONS[act_id].args
    for arg, act_arg in zip(args, act_args):
        if arg.name in ('screen', 'minimap', 'screen2') and (
                not self.flags.force_focus_fire or (act_id != 12 and act_id != 2)):
            valid_spatial_action = 1

    return {
        self.screen: screen,  # yes
        self.info: info,  # yes
        self.custom_inputs: custom_inputs,  # yes
        self.valid_spatial_action: valid_spatial_action,  # yes
        self.valid_action: valid_action,  # yes
    }
def step_low(self, ind_thread, obs, dir_high, act_id):
    # obs is the timestep passed in by the environment.
    # The next four lines preprocess the minimap and screen features and store
    # them in the minimap and screen variables (exact semantics of these four
    # lines not studied in detail yet).
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)  # self.isize is the number of action functions
    info[0, obs.observation['available_actions']] = 1  # info stores the available actions
    # minerals, army count, worker count
    info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
    info_plus[0] = (obs.observation.player.minerals,
                    obs.observation['player'][5],
                    obs.observation['player'][6],
                    obs.observation['player'][4])
    # info now has size isize + info_plus_size
    info = np.concatenate((info, info_plus), axis=1)
    dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
    dir_high_usedToFeedLowNet[0][0] = dir_high
    act_ID = np.ones([1, 1], dtype=np.float32)
    act_ID[0][0] = act_id
    feed = {
        self.minimap: minimap,
        self.screen: screen,
        self.info: info,
        self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
        self.act_id: act_ID
    }
    # Output type: Tensor("actor_low/Softmax:0", shape=(?, 4096), dtype=float32, device=/device:GPU:0)
    # e.g. [array([[0.00019935, 0.00025348, 0.00024519, ..., 0.00016189, 0.00016014, 0.00016842]], dtype=float32)]
    spatial_action_low = self.sess.run([self.spatial_action_low], feed_dict=feed)

    # Choose the position at which to apply the action.
    # spatial_action_low = spatial_action_low.ravel()  # ravel() is a numpy function that flattens the array
    target = np.argmax(spatial_action_low)
    # Get the position to apply the action at.
    # Question: what if the action draws a selection box? target gives only one
    # point, so where does the other corner come from?
    target = [int(target // self.ssize), int(target % self.ssize)]
    # if False:  # Question: what does `if False` mean here? Couldn't find an answer online.
    #     print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration: with probability epsilon[1] (0.2), randomly
    # perturb the position the action is applied at.
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))
    return target[0], target[1]
def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    action_indices = [
        0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331, 332,
        333, 334, 451, 452, 453, 456
    ]
    valid_actions = list(
        set(obs.observation['available_actions']) & set(action_indices))
    # print("valid_actions", valid_actions)
    valid_actions_indices = [action_indices.index(i) for i in valid_actions]
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, valid_actions_indices] = 1
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)
    # print("non_spatial_action", non_spatial_action.shape, len(non_spatial_action.ravel()))
    # print("spatial_action", spatial_action.ravel().shape, len(spatial_action.ravel()))

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    # valid_actions = obs.observation['available_actions']
    # act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    # print("valid", non_spatial_action[valid_actions_indices])
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions_indices])]
    # print("SELECTED", act_id)
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])  # TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
def step(self, obs):
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    # self.logger.info(minimap)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()  # .ravel flattens the input into a 1D array
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    # index of the best valid non-spatial action
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    # int(idx // 32), int(idx % 32): converts the flat argmax index back into a (y, x) coordinate
    target = [int(target // self.ssize), int(target % self.ssize)]
    if False:  # ???
        self.logger.info(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration.
    if self.training and np.random.rand() < self.epsilon[0]:  # choose action
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        # seems like it's not choosing from ALL pixels?
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))  # max(0, min(31, target[0]+dy))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:  # find args of the indexed action
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])  # TODO: Be careful (???)
    return actions.FunctionCall(act_id, act_args)
def step(self, obs):
    """
    Choose an action: given an observation, return a spatial and a non-spatial
    action using RL.
    obs = observation spec in lib/features.py : 218
    """
    # for v in tf.get_default_graph().as_graph_def().node:
    #     print(v.name)

    # obs.observation.feature_screen is (17, 64, 64)
    screen = np.array(obs.observation.feature_screen, dtype=np.float32)
    screen = np.expand_dims(preprocess_screen(screen), axis=0)  # returns (bs=1, channel=42, h=64, w=64)

    # get available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1

    # run session to obtain the spatial and non-spatial action outputs
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action],
        feed_dict={self.screen: screen, self.info: info})

    # select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()  # flatten
    spatial_action = spatial_action.ravel()  # flatten
    valid_actions = obs.observation['available_actions']  # available action indices
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)  # position to move
    target = [int(target // self.ssize), int(target % self.ssize)]

    # epsilon-greedy action selection (in this network, we explore only if a
    # random fraction is above epsilon)
    if np.random.random() > self.epsilon:
        # randomly select a non-spatial action
        act_id = np.random.choice(valid_actions)
        # randomly perturb the spatial action
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):  # in fact, only screen
            act_args.append([target[1], target[0]])  # (y, x) to (x, y)
        else:
            act_args.append([0])  # [0] means not queued
    # self.steps += 1
    # self.reward += obs.reward
    # print("return action with id: {} and args {}".format(act_id, act_args))
    return actions.FunctionCall(act_id, act_args)
def step(self, obs):
    screen = np.array(obs.observation.feature_screen, dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
    structured[0, obs.observation.available_actions] = 1
    feed_dict = {
        self.screen_ph: screen,
        self.minimap_ph: minimap,
        self.structured_ph: structured
    }
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed_dict)
    non_spatial_action, spatial_action = non_spatial_action.ravel(), spatial_action.ravel()
    available_actions = obs.observation.available_actions
    action_id = available_actions[np.argmax(non_spatial_action[available_actions])]
    spatial_target = np.argmax(spatial_action)
    spatial_target = [
        int(spatial_target // self.screen_dimensions),
        int(spatial_target % self.screen_dimensions)
    ]

    # epsilon-greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        action_id = np.random.choice(available_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        delta_y, delta_x = np.random.randint(-4, 5), np.random.randint(-4, 5)
        spatial_target[0] = int(
            max(0, min(self.screen_dimensions - 1, spatial_target[0] + delta_y)))
        spatial_target[1] = int(
            max(0, min(self.screen_dimensions - 1, spatial_target[1] + delta_x)))

    action_args = []
    for arg in actions.FUNCTIONS[action_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            action_args.append([spatial_target[1], spatial_target[0]])
        else:
            action_args.append([0])
    return actions.FunctionCall(action_id, action_args)
def get_cur_Q_action(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    feed_master = {self.minimap: minimap, self.screen: screen, self.info: info}
    subpolicy_selected = np.argmax(
        self.sess_master.run(self.subpolicy_Q, feed_dict=feed_master), axis=1)[0]
    return subpolicy_selected
def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    subpolicy_index = self.get_cur_Q_action(obs)
    # print("Subpolicy chosen is: " + str(subpolicy_index))
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    cur_spatial_action, cur_non_spatial_action, _ = self.subpolicies[subpolicy_index]
    non_spatial_action, spatial_action = self.sess_master.run(
        [cur_non_spatial_action, cur_spatial_action], feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]
    if False:
        print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])  # TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
def getPredictFeedDict(self, obs, hState, cState):
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    info = np.zeros([1, len(U.useful_actions)], dtype=np.float32)
    info[0, U.compressActions(obs.observation['available_actions'])] = 1
    custom_inputs = np.expand_dims(
        np.array(obs.observation.custom_inputs, dtype=np.float32), axis=0)
    hState = np.expand_dims(np.array(hState), axis=0)
    cState = np.expand_dims(np.array(cState), axis=0)
    return {
        self.screen: screen,
        self.info: info,
        self.custom_inputs: custom_inputs,
        self.hStateInput: hState,
        self.cStateInput: cState
    }
def step(self, obs):
    super(A3CAgent, self).step(obs)
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1  # mask for available actions
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]
    if False:
        print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])  # use (y, x) -> (height, width)
        else:
            act_args.append([0])  # not queued  TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]
    if False:
        print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])  # TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
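# The step() methods in this section all assume an outer loop that feeds them
# timesteps and collects [last_obs, action, obs] triples for the update()
# functions below. A minimal, hypothetical sketch of such a loop is given here
# for orientation only: `agent`, `env`, and `max_frames` are assumed names, and
# the real run loops behind these agents also handle multiple threads,
# checkpointing, and the update() calls themselves.
def run_episode(agent, env, max_frames=0):
    replay_buffer = []  # filled with [last_obs, action, obs] triples
    total_frames = 0
    timesteps = env.reset()
    while True:
        total_frames += 1
        last_obs = timesteps[0]
        action = agent.step(last_obs)  # FunctionCall chosen by the policy
        timesteps = env.step([action])
        replay_buffer.append([last_obs, action, timesteps[0]])
        if timesteps[0].last() or (max_frames and total_frames >= max_frames):
            return replay_buffer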
def step_high(self, obs):
    # obs is the timestep passed in by the environment.
    # The next four lines preprocess the minimap and screen features and store
    # them in the minimap and screen variables.
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)  # self.isize is the number of action functions
    info[0, obs.observation['available_actions']] = 1  # info stores the available actions
    # minerals, army count, worker count
    info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
    info_plus[0] = (obs.observation.player.minerals,
                    obs.observation['player'][5],
                    obs.observation['player'][6],
                    obs.observation['player'][4])
    # info now has size isize + info_plus_size
    info = np.concatenate((info, info_plus), axis=1)
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    # Select the macro-action id.
    dir_high = self.sess.run([self.dir_high], feed_dict=feed)
    # DHN TODO: dir_high could first be filtered (e.g. check whether the
    # hard-coded micro-actions of a macro-action appear in
    # obs.observation['available_actions']).
    # valid_dir_high = obs.observation['available_actions']
    dir_high_id = np.argmax(dir_high)  # id of the macro-action to execute (0-based)
    # if False:  # Question: what does `if False` mean here? Couldn't find an answer online.
    #     print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration: with probability epsilon[0] (0.05), pick a
    # random macro-action (this overwrites the dir_high_id chosen above).
    if self.training and np.random.rand() < self.epsilon[0]:
        dir_high_id = random.randint(0, num_macro_action - 1)
    return dir_high_id
def update(self, rbs, disc, lr, cter):
    # Compute R, which is the value of the last observation
    obs = rbs[-1][-1]
    # Print out the score on a test run through a full episode; don't update
    # the network on a test run.
    if self.test_run and obs.last():
        self.test_scores.append(obs.observation['score_cumulative'][0])
        # print("TEST SCORE: " + str(self.test_scores[-1]))
        return
    else:
        train_score = obs.observation['score_cumulative'][0]
    logger.info('Total game steps: %s', self.count_steps)
    self.count_steps += len(rbs)

    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        action_indices = [
            0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
            332, 333, 334, 451, 452, 453, 456
        ]
        valid_actions = list(
            set(obs.observation['available_actions']) & set(action_indices))
        valid_actions_indices = [action_indices.index(i) for i in valid_actions]
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, valid_actions_indices] = 1
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), self.isize], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), self.isize], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        action_indices = [
            0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
            332, 333, 334, 451, 452, 453, 456
        ]
        valid_actions = list(
            set(obs.observation['available_actions']) & set(action_indices))
        valid_actions_indices = [action_indices.index(i) for i in valid_actions]
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, valid_actions_indices] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        reward = obs.reward
        act_id = action.function
        act_args = action.arguments

        value_target[i] = reward + disc * value_target[i - 1]

        # valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions_indices] = 1
        non_spatial_action_selected[i, action_indices.index(act_id)] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.info: infos,
        self.value_target: value_target,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.valid_non_spatial_action: valid_non_spatial_action,
        self.non_spatial_action_selected: non_spatial_action_selected,
        self.learning_rate: lr,
        self.train_score: train_score
    }
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
def update(self, replay_buffer, gamma, learning_rate, step):
    obs = replay_buffer[-1][-1]
    if obs.last():
        reward = 0
    else:
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1
        feed_dict = {
            self.screen_ph: screen,
            self.minimap_ph: minimap,
            self.structured_ph: structured
        }
        reward = self.sess.run(self.value, feed_dict=feed_dict)

    # compute targets and masks
    screens, minimaps, structureds = [], [], []
    target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
    target_value[-1] = reward
    valid_non_spatial_action = np.zeros(
        [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
    sample_non_spatial_action = np.zeros(
        [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
    valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
    sample_spatial_action = np.zeros(
        [len(replay_buffer), self.screen_dimensions**2], dtype=np.float32)

    replay_buffer.reverse()
    for i, [obs, action, next_obs] in enumerate(replay_buffer):
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1
        screens.append(screen)
        minimaps.append(minimap)
        structureds.append(structured)

        reward = obs.reward
        action_id = action.function
        action_args = action.arguments

        target_value[i] = reward + gamma * target_value[i - 1]

        available_actions = obs.observation.available_actions
        valid_non_spatial_action[i, available_actions] = 1
        sample_non_spatial_action[i, action_id] = 1

        args = actions.FUNCTIONS[action_id].args
        for arg, action_arg in zip(args, action_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                spatial_action = action_arg[1] * self.screen_dimensions + action_arg[0]
                valid_spatial_action[i] = 1
                sample_spatial_action[i, spatial_action] = 1

    screens = np.concatenate(screens, axis=0)
    minimaps = np.concatenate(minimaps, axis=0)
    structureds = np.concatenate(structureds, axis=0)

    feed_dict = {
        self.screen_ph: screens,
        self.minimap_ph: minimaps,
        self.structured_ph: structureds,
        self.target_value_ph: target_value,
        self.valid_non_spatial_action_ph: valid_non_spatial_action,
        self.sample_non_spatial_action_ph: sample_non_spatial_action,
        self.valid_spatial_action_ph: valid_spatial_action,
        self.sample_spatial_action_ph: sample_spatial_action,
        self.learning_rate_ph: learning_rate
    }
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed_dict)
    self.summary_writer.add_summary(summary, step)
def learn(self):
    """When the replay buffer reaches a certain size, learn from a sampled minibatch."""
    print("\nstart learning...")
    # replace target net parameters
    if self.learn_step_counter % self.replace_target_iter == 0:
        self.sess.run(self.replace_target_op)
        print('\nreplaced target net parameters...')

    # sample mini-batch
    sample_indices = np.random.choice(len(self.memory), size=self.batch_size)
    batch_memory = deque(list(np.array(self.memory)[sample_indices]))
    print("selecting minibatch of size: {}...".format(len(batch_memory)))

    # extract s = [], a = [], s' = [], r = []
    screens = []
    screens_next = []
    infos = []
    infos_next = []
    rewards = []
    # actions
    valid_spatial_action = np.zeros([self.batch_size], dtype=np.float32)
    spatial_action_selected = np.zeros([self.batch_size, self.ssize ** 2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([self.batch_size, len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([self.batch_size, len(actions.FUNCTIONS)], dtype=np.float32)

    for i, [obs, a, r, obs_] in enumerate(batch_memory):
        # s: current state from obs
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(preprocess_screen(screen), axis=0)  # returns (bs=1, channel=42, h=64, w=64)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        # s_: next state from obs_
        screen_next = np.array(obs_.observation.feature_screen, dtype=np.float32)
        screen_next = np.expand_dims(preprocess_screen(screen_next), axis=0)  # returns (bs=1, channel=42, h=64, w=64)
        info_next = np.zeros([1, self.isize], dtype=np.float32)
        info_next[0, obs_.observation['available_actions']] = 1

        # append to s list, s_ list
        screens.append(screen)
        infos.append(info)
        screens_next.append(screen_next)
        infos_next.append(info_next)

        # get reward r
        rewards.append(r)

        # get action 'a'
        act_id = a.function
        act_args = a.arguments
        valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1
        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    screens = np.concatenate(screens, axis=0)  # (32, size of s)
    infos = np.concatenate(infos, axis=0)
    screens_next = np.concatenate(screens_next, axis=0)
    infos_next = np.concatenate(infos_next, axis=0)
    rewards = np.transpose(np.array(rewards))  # (32, r)

    # get q_next = Q(s', a'; theta) to calculate y
    q_next = self.sess.run(self.q_next, feed_dict={self.screen: screens_next, self.info: infos_next})
    # q_next = self.sess.run(self.q_eval, feed_dict={self.screen: screens_next, self.info: infos_next})
    q_target = rewards + self.gamma * q_next

    # train
    feed = {self.screen: screens,
            self.info: infos,
            self.q_target: q_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected}
    # _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    # self.summary_writer.add_summary(summary, self.learn_step_counter)
    _ = self.sess.run(self.train_op, feed_dict=feed)

    self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
    self.learn_step_counter += 1
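# learn() above samples [obs, a, r, obs_] tuples from self.memory, but the
# storage side is not shown in this section. A minimal sketch of a matching
# helper is given below under the assumption that self.memory is a
# collections.deque with a fixed maximum length; the method name
# store_transition and the self.memory_size attribute are assumptions, not the
# original API.
from collections import deque

def store_transition(self, obs, action, reward, next_obs):
    if not hasattr(self, 'memory'):
        # Fixed-size buffer; the deque's maxlen drops the oldest transition once full.
        self.memory = deque(maxlen=self.memory_size)
    self.memory.append([obs, action, reward, next_obs])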
def update(self, rbs, disc, lr, cter):
    # Compute R, which is the value of the last observation
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        reward = obs.reward
        act_id = action.function
        act_args = action.arguments

        value_target[i] = reward + disc * value_target[i - 1]

        valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
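# In update() above, rbs is reversed before the loop, so index 0 is the most
# recent step and value_target[i - 1] at i == 0 reads value_target[-1], i.e.
# the bootstrap value R. Each entry therefore becomes the usual discounted
# return: reward plus disc times the return of the following step. A tiny
# standalone check of that recursion (the numbers are made up for illustration):
import numpy as np

rewards = [1.0, 0.0, 2.0]  # rewards in reverse time order (latest step first)
disc = 0.9
R = 5.0                    # bootstrap value of the last observation
value_target = np.zeros(len(rewards), dtype=np.float32)
value_target[-1] = R
for i, r in enumerate(rewards):
    value_target[i] = r + disc * value_target[i - 1]
# value_target[0] == 1.0 + 0.9 * 5.0 == 5.5 (target for the most recent step)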
def step(self, obs):
    # action selection happens here
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    feed = {self.minimap: minimap, self.screen: screen, self.info: info}

    # Select an action and a spatial target.
    valid_actions = np.zeros(self.isize, dtype=np.int32)
    valid_actions[obs.observation['available_actions']] = 1
    function_id_policy, spatial_policy = self.sess.run(
        [self.non_spatial_policy, self.spatial_policy], feed_dict=feed)
    # self.logger.info(f"spatial_policy unraveled: {spatial_policy}.")
    # self.logger.info(f":{spatial_policy.shape}.")
    function_id_policy = function_id_policy.ravel()  # .ravel flattens the input into a 1D array
    spatial_policy = spatial_policy.ravel()
    # self.logger.info(f"spatial_policy .raveled: {spatial_policy}")  # this will help with target below
    # self.logger.info(f":{spatial_policy.shape}.")
    function_id_policy *= valid_actions
    function_ids = np.arange(len(function_id_policy))
    function_id_policy /= np.sum(function_id_policy)
    # act_id = valid_actions[np.argmax(non_spatial_policy[valid_actions])]
    act_id = np.random.choice(function_ids, p=np.squeeze(function_id_policy))
    target = np.argmax(spatial_policy)  # a flat index into the (ssize**2,) spatial policy
    target = [int(target // self.ssize), int(target % self.ssize)]  # recover (y, x) from the flat index
    if False:
        self.logger.info(f"if false: {actions.FUNCTIONS[act_id].name, target}")

    # Epsilon greedy exploration. Keeping this to see if it works: if it
    # triggers, shift the target up to 4 px left/right and up/down.
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))  # keep the target within the valid pixel range
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    args = []  # args: a list of the types of args passed to function_type
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            # x_policy = self.sess.run(self.argument_policy[str(arg) + "x"], feed_dict=feed)
            # y_policy = self.sess.run(self.argument_policy[str(arg) + "y"], feed_dict=feed)
            # x_policy = np.squeeze(x_policy)
            # x_ids = np.arange(len(x_policy))
            # x = np.random.choice(x_ids, p=x_policy)
            # y_policy = np.squeeze(y_policy)
            # y_ids = np.arange(len(y_policy))
            # y = np.random.choice(y_ids, p=y_policy)
            # args.append([x, y])
            args.append([target[1], target[0]])
            # self.logger.info(f"target coords: {[target[1], target[0]]}")
        else:
            arg_policy = self.sess.run(self.argument_policy[str(arg)], feed_dict=feed)
            arg_policy = np.squeeze(arg_policy)
            arg_ids = np.arange(len(arg_policy))
            arg_index = np.random.choice(arg_ids, p=arg_policy)
            args.append([arg_index])
            # self.logger.info(f"arg: index: {arg_index}")
            # args.append([0])
    # sizes: the max+1 of each of the dimensions this argument takes.
    return actions.FunctionCall(act_id, args)  # args should be ints in [0, arg.size)
def step(self, obs):
    super(RandomAgent, self).step(obs)
    self.randomOrgreedy = False
    feature_screen = np.expand_dims(
        preprocess_screen(obs.observation.feature_screen), axis=0)
    feature_map = np.expand_dims(
        preprocess_minimap(obs.observation.feature_minimap), axis=0)
    info = np.zeros([1, self.action_size], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    feed_dict = {
        self.minimap: feature_map,
        self.screen: feature_screen,
        self.info: info
    }
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed_dict)
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()  # output shape 4096
    target = np.argmax(spatial_action)
    target = [int(target // self.minimap_size), int(target % self.minimap_size)]
    valid_actions = obs.observation.available_actions
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    # print("available actions = " + str(obs.observation.available_actions))
    # function_id = numpy.random.choice(obs.observation.available_actions)
    # function_id = 1
    # print("function_id = " + str(function_id))
    # print("observation_spec " + str(self.obs_spec))
    # print("action_spec" + str((self.action_spec.functions)))
    # args = [[numpy.random.randint(0, size) for size in arg.sizes]
    #         for arg in self.action_spec.functions[function_id].args]
    # print("function args = " + str(self.action_spec.functions[function_id].args))
    # for id in obs.observation.available_actions:
    #     for arg in self.action_spec.functions[id].args:
    #         ctr = 0
    #         for size in arg.sizes:
    #             ctr += 1
    #             if (ctr > 2):
    #                 print("function_id = " + str(id))

    if np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
        self.randomOrgreedy = True
    if np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.screen_size - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.screen_size - 1, target[1] + dx)))

    act_args = []
    for arg in self.action_spec.functions[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])  # TODO: Be careful

    if (act_id != self.temp_act_id):
        self.temp_act_id = act_id
        if (self.randomOrgreedy):
            print("RANDOM")
        print("action " + str(actions.FUNCTIONS[act_id].name))
        print("target" + str(target))
        # print("args = " + str(args))
        # print("\n\n\n")
    return actions.FunctionCall(act_id, act_args)
def update(self, rbs, disc, lr, cter):
    # Compute R, which is the value of the last observation
    obs = rbs[-1][-1]
    if obs.last():
        # obs[3]['score_cumulative'][0] or obs.reward
        R = obs[3]['score_cumulative'][0]
        # enums from https://github.com/Blizzard/s2client-api/blob/master/include/sc2api/sc2_typeenums.h
        _TERRAN_BARRACKS = 21
        _TERRAN_MARINE = 48
        _UNIT_TYPE = features.SCREEN_FEATURES.unit_type.index
        unit_type = obs.observation['feature_screen'][_UNIT_TYPE]
        barracks_y, barracks_x = (unit_type == _TERRAN_BARRACKS).nonzero()
        if barracks_x.any():
            print('Barracks detected')
            R += 1
        print('Episode reward: {}'.format(R))
    else:
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        # reward = obs.reward
        reward = 0.25 * (next_obs.observation['score_cumulative'][0] -
                         obs.observation['score_cumulative'][0])
        act_id = action.function
        act_args = action.arguments

        value_target[i] = reward + disc * value_target[i - 1]

        valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train in batches
    for (minimaps, screens, infos, value_target, valid_spatial_action,
         spatial_action_selected, valid_non_spatial_action,
         non_spatial_action_selected) in zip(*[
            self.batch(mask, BATCH_SIZE) for mask in [
                minimaps, screens, infos, value_target, valid_spatial_action,
                spatial_action_selected, valid_non_spatial_action,
                non_spatial_action_selected]]):
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr
        }
        # print('Committing {} replay samples'.format(len(minimaps)))
        _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
def step(self, obs, use_unit_selector):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1

    if self.init_counter == 0:
        self.init_counter += 1
        return actions.FunctionCall(7, [[1]])
    elif self.init_counter == 1:
        self.init_counter += 1
        return actions.FunctionCall(5, [[sc_ui.ActionMultiPanel.SingleSelect], [0]])
    elif self.init_counter == 2:
        self.init_counter += 1
        return actions.FunctionCall(4, [[1], [0]])
    elif self.init_counter == 3:
        self.init_counter += 1
        return actions.FunctionCall(7, [[1]])
    elif self.init_counter == 4:
        self.init_counter += 1
        return actions.FunctionCall(5, [[sc_ui.ActionMultiPanel.SingleSelect], [1]])
    elif self.init_counter == 5:
        self.init_counter += 1
        return actions.FunctionCall(4, [[1], [1]])
    elif use_unit_selector:
        unitSel = self.get_unit_sel_res(obs)
        if self.training and np.random.rand() < self.epsilon[0]:
            unitSel = np.random.randint(0, 4)
        if unitSel == num_units + 1:
            return actions.FunctionCall(7, [[1]])
        elif unitSel == num_units:
            feed = {self.minimap: minimap, self.screen: screen, self.info: info}
            non_spatial_action, spatial_action = self.sess_master.run(
                [self.non_spatial_action, self.spatial_action], feed_dict=feed)

            # Select an action and a spatial target
            non_spatial_action = non_spatial_action.ravel()
            spatial_action = spatial_action.ravel()
            valid_actions = obs.observation['available_actions']
            act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
            target = np.argmax(spatial_action)
            target = [int(target // self.ssize), int(target % self.ssize)]

            # Epsilon greedy exploration
            if self.training and np.random.rand() < self.epsilon[0]:
                act_id = np.random.choice(valid_actions)
            if self.training and np.random.rand() < self.epsilon[1]:
                dy = np.random.randint(-4, 5)
                target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
                dx = np.random.randint(-4, 5)
                target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

            # Set act_id and act_args
            act_args = []
            for arg in actions.FUNCTIONS[act_id].args:
                if arg.name in ('screen', 'minimap', 'screen2'):
                    act_args.append([target[1], target[0]])
                else:
                    act_args.append([0])  # TODO: Be careful
            return actions.FunctionCall(act_id, act_args)
        else:
            return actions.FunctionCall(4, [[0], [unitSel]])
    else:
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess_master.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
def update_low(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter, macro_type, coord_type):
    # rbs (replay buffers) is the collection of [last_timesteps[0], actions[0], timesteps[0]]
    # triples (one per step the agent took in the episode); see run_loop line 25.
    # Compute R, which is the value of the last observation.
    obs = rbs[-1][-1]  # the last element of rbs is the current timestep, so obs can be treated as the timestep
    if obs.last():
        R = 0
    else:
        # similar to lines 105-111
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])
        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)
        # print('info')
        # print(info)
        # print(info_plus)
        dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
        dir_high_usedToFeedLowNet[0][0] = dhs[0]
        act_id = np.ones([1, 1], dtype=np.float32)
        # act_ID[0][0] = rbs[-1][1].function
        # The action stored in rbs cannot be used here because it may be no_op
        # (a fallback used when the chosen action was not valid, to keep the game
        # from crashing); the act_id fed here should be the one computed by step_low.
        act_id[0][0] = GL.get_value(ind_thread, "act_id_micro")
        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.info: info,
            self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
            self.act_id: act_id,
        }
        R = self.sess.run(self.value_low, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    dir_highs = []
    act_ids = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps taken in the episode
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)  # whether each step needs a coordinate argument
    # First dimension: whether each step needs a coordinate argument;
    # second dimension: which coordinate was selected.
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)

    rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
    micro_isdone = GL.get_value(ind_thread, "micro_isdone")
    micro_isdone.reverse()
    sum_low_reward = GL.get_value(ind_thread, "sum_low_reward")
    for i, [obs, action, next_obs] in enumerate(rbs):  # loop once per step taken in the episode
        # similar to lines 105-111
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])
        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)
        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
        dir_high_usedToFeedLowNet[0][0] = dhs[i]
        act_ID = np.ones([1, 1], dtype=np.float32)
        # act_ID[0][0] = act_id
        # The action stored in rbs cannot be used here because it may be no_op
        # (a fallback used when the chosen action was not valid); the act_id fed
        # here should be the one computed by step_low.
        act_ID[0][0] = GL.get_value(ind_thread, "act_id_micro")
        # dir_highs.append(dir_high_usedToFeedLowNet)
        # act_ids.append(act_ID)

        coord = [0, 0]
        # coord[0], coord[1] = [32, 32]
        coord[0], coord[1] = self.step_low(ind_thread, obs, dir_high_usedToFeedLowNet, act_ID)
        reward = low_reward(next_obs, obs, coord, micro_isdone[i], macro_type, coord_type)
        sum_low_reward += reward
        GL.add_value_list(ind_thread, "low_reward_of_episode", reward)

        act_id = action.function  # id of the action the agent selected at this step
        act_args = action.arguments
        # See the figure in Morvan's Q-Learning tutorial that explains gamma
        # (the one with the three pairs of glasses): this gives the value V_S of
        # each state in the episode. Unlike Morvan's code, value_target is not
        # reversed back afterwards, apparently because the other arrays (minimap,
        # screen, info, etc.) are also stored back-to-front; see lines 181-182.
        value_target[i] = reward + disc * value_target[i - 1]

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    GL.set_value(ind_thread, "sum_low_reward", sum_low_reward)
    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Since the low-level net is updated every step, the feed below actually
    # contains only a single frame of data.
    # Train
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.info: infos,
        # self.dir_high_usedToFeedLowNet: dir_highs,
        self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
        # self.act_id: act_ids,
        self.act_id: act_ID,
        self.value_target_low: value_target,
        self.valid_spatial_action_low: valid_spatial_action,
        self.spatial_action_selected_low: spatial_action_selected,
        self.learning_rate_a_low: lr_a,
        self.learning_rate_c_low: lr_c
    }
    _, __, summary = self.sess.run(
        [self.update_a_low, self.update_c_low, self.summary_op_low],
        feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
def update(self, rbs, disc, lr, cter):
    # Compute R, which is the value of the last observation
    spatial_action = None
    non_spatial_action = None
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation.available_actions] = 1
        # First get probabilities for each action, then greedily pick the
        # largest to calculate the q value (one-hot vector, softmax).
        # Low confidence here: with a full episode, R for the last observation
        # should probably just be 0.
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        spatial_action, non_spatial_action = self.sess.run(
            [self.spatial_action, self.non_spatial_action], feed_dict=feed)

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        # info[0, obs.observation['available_actions']] = 1
        info[0, obs.observation.available_actions] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        act_id = action.function
        act_args = action.arguments

        # valid_actions = obs.observation["available_actions"]
        valid_actions = obs.observation.available_actions
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    value_target = np.zeros([len(rbs)], dtype=np.float32)
    if spatial_action is not None:
        q_spatial = np.max(spatial_action * valid_spatial_action[0], axis=1)
        q_non_spatial = np.max(non_spatial_action * valid_non_spatial_action[0], axis=1)
        q_value = self.ispatial * q_spatial + q_non_spatial
        R = q_value[0]
        value_target[-1] = R
    for i, [obs, action, next_obs] in enumerate(rbs):
        reward = obs.reward
        value_target[i] = reward + disc * value_target[i - 1]

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
def update_high(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter):
    # rbs (replay buffers) is the collection of [last_timesteps[0], actions[0], timesteps[0]]
    # triples (one per step seen during this update); see run_loop line 25.
    # dhs (dir_high_buffers) is the list of macro-action indices; e.g. with 5
    # macro-actions, dhs looks like [5, 4, 1, 2, 3, 4, 2, 1, ...].
    # First dimension: one row per step; second dimension: which macro-action was selected.
    dir_high_selected = np.zeros([len(rbs), num_macro_action], dtype=np.float32)
    for i in range(len(rbs)):
        dir_high_selected[i, dhs[i][0] - 1] = 1

    # Compute R, which is the value of the last observation.
    obs = rbs[-1][-1]  # the last element of rbs is the current timestep, so obs can be treated as the timestep
    if obs.last():
        R = 0
    else:
        # similar to lines 105-111
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])
        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)
        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        R = self.sess.run(self.value_high, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps taken in the episode
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)  # whether each step needs a coordinate argument
    # First dimension: whether each step needs a coordinate argument;
    # second dimension: which coordinate was selected.
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)

    rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
    micro_isdone = GL.get_value(ind_thread, "micro_isdone")
    micro_isdone.reverse()
    sum_high_reward = GL.get_value(ind_thread, "sum_high_reward")
    for i, [obs, action, next_obs] in enumerate(rbs):  # loop once per step taken in the episode
        # similar to lines 105-111
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])
        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)
        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        # reward = obs.reward
        reward = high_reward(ind_thread, next_obs, obs, action, micro_isdone[i])  # high-level reward designed by Xiangsen
        sum_high_reward += reward
        GL.add_value_list(ind_thread, "high_reward_of_episode", reward)

        act_id = action.function  # id of the action the agent selected at this step
        act_args = action.arguments
        # See the figure in Morvan's Q-Learning tutorial that explains gamma
        # (the one with the three pairs of glasses): this gives the value V_S of
        # each state in the episode. Unlike Morvan's code, value_target is not
        # reversed back afterwards, apparently because the other arrays (minimap,
        # screen, info, etc.) are also stored back-to-front; see lines 181-182.
        value_target[i] = reward + disc * value_target[i - 1]

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    GL.set_value(ind_thread, "sum_high_reward", sum_high_reward)
    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.info: infos,
        self.value_target_high: value_target,
        self.dir_high_selected: dir_high_selected,
        self.learning_rate_a_high: lr_a,
        self.learning_rate_c_high: lr_c
    }
    _, __, summary = self.sess.run(
        [self.update_a_high, self.update_c_high, self.summary_op_high],
        feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
    GL.set_value(ind_thread, "micro_isdone", [])
def step(self, obs):
    screen = np.array(obs.observation.feature_screen, dtype=np.float32)
    screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
    # np.expand_dims adds a new axis, e.g. np.expand_dims(np.array([1, 2]), axis=0)
    # turns shape (2,) into shape (1, 2).
    minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
    minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
    structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
    structured[0, obs.observation.available_actions] = 1

    feed_dict = {
        self.screen: screen,
        self.minimap: minimap,
        self.structured: structured
    }
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed_dict)
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    # np.ravel returns a contiguous flattened array, e.g.
    # np.ravel([[1, 2, 3], [4, 5, 6]]) -> [1 2 3 4 5 6].

    available_actions = obs.observation.available_actions
    action_id = 0
    spatial_target = []
    if self.mode == 'original_ac3':
        non_spatial_action = np.array(non_spatial_action[available_actions])
        non_spatial_action /= non_spatial_action.sum()
        x = np.random.choice(non_spatial_action, p=non_spatial_action)
        action_id = available_actions[np.where(non_spatial_action == x)[0][0]]
        spatial_target = random.choice(list(enumerate(spatial_action)))[0]
        # x = np.random.choice(spatial_action, p=spatial_action)
        # if len(np.where(spatial_action == x)[0]) > 1:
        #     random = np.random.choice(len(np.where(spatial_action == x)[0]))
        #     spatial_target = np.where(spatial_action == x)[0][random]
        # else:
        #     spatial_target = np.where(spatial_action == x)[0][0]
        spatial_target = [int(spatial_target // self.resolution),
                          int(spatial_target % self.resolution)]
    else:
        action_id = available_actions[np.argmax(non_spatial_action[available_actions])]
        spatial_target = np.argmax(spatial_action)
        spatial_target = [int(spatial_target // self.resolution),
                          int(spatial_target % self.resolution)]

    # epsilon-greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        action_id = np.random.choice(available_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        delta_y, delta_x = np.random.randint(-4, 5), np.random.randint(-4, 5)
        spatial_target[0] = int(max(0, min(self.resolution - 1, spatial_target[0] + delta_y)))
        spatial_target[1] = int(max(0, min(self.resolution - 1, spatial_target[1] + delta_x)))

    action_args = []
    for arg in actions.FUNCTIONS[action_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            action_args.append([spatial_target[1], spatial_target[0]])
        else:
            action_args.append([0])
    return actions.FunctionCall(action_id, action_args)
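# Sketch (illustration only): step() converts the choice over the flattened spatial head
# into grid coordinates with (row, col) = (index // resolution, index % resolution) and
# then hands pysc2 the argument in [x, y] order, i.e. [col, row]. The size used below is
# an arbitrary example value, not the agent's actual resolution.
def _flat_index_to_screen_arg(index, size=64):
    """Convert a flattened spatial index into the [x, y] list a pysc2 FunctionCall expects."""
    row, col = int(index // size), int(index % size)
    return [col, row]

# e.g. _flat_index_to_screen_arg(64 * 10 + 3, size=64) == [3, 10]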
def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1
    #print('info = ', info)

    feed = {self.minimap: minimap, self.screen: screen, self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    #print('valid_actions = ', valid_actions)
    #print('self.less_actions = ', self.less_actions)

    # Find the index of each element of valid_actions within self.less_actions.
    valid_actions_idx = []
    for i in range(len(valid_actions)):
        for j in range(len(self.less_actions)):
            if self.less_actions[j] == valid_actions[i]:
                valid_actions_idx.append(j)
    #valid_actions_idx = np.sort(valid_actions_idx)
    act_id = int(self.less_actions[np.argmax(non_spatial_action[valid_actions_idx])])
    #print('valid_actions_idx = ', valid_actions_idx)
    #print('np.argmax(non_spatial_action[valid_actions_idx]) = ',
    #      np.argmax(non_spatial_action[valid_actions_idx]))
    #print('act_id = ', act_id)

    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]
    #if False:
    #    print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        dy = np.random.randint(-4, 5)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-4, 5)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])
        else:
            act_args.append([0])

    # TODO: Be careful
    if act_id not in valid_actions:
        return actions.FunctionCall(_NOOP, [])
    return actions.FunctionCall(act_id, act_args)
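# Sketch (illustration only): both step() variants explore spatially by jittering the chosen
# target by up to 4 pixels per axis and clipping it back onto the screen grid; this standalone
# helper mirrors that logic. resolution is an arbitrary example value.
def _jitter_spatial_target(target, resolution=64, max_delta=4):
    """Return a copy of [row, col] perturbed by a random offset and clipped to the grid."""
    import numpy as np
    dy = np.random.randint(-max_delta, max_delta + 1)
    dx = np.random.randint(-max_delta, max_delta + 1)
    return [int(max(0, min(resolution - 1, target[0] + dy))),
            int(max(0, min(resolution - 1, target[1] + dx)))]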
def update(self, replay_buffer, learning_rate, step):
    obs = replay_buffer[-1][-1]
    if obs.last():
        reward = 0
    else:
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1
        feed_dict = {
            self.screen: screen,
            self.minimap: minimap,
            self.structured: structured
        }
        reward = self.sess.run(self.value, feed_dict=feed_dict)[0]  # bootstrap from the value head

    # Compute targets and masks
    screens, minimaps, structureds = [], [], []
    target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
    target_value[-1] = reward
    valid_non_spatial_action = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
    valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(replay_buffer), self.resolution ** 2], dtype=np.float32)

    record_score = replay_buffer[-1][0].observation['score_cumulative'][0]
    summary = tf.Summary()
    summary.value.add(tag='episode_score', simple_value=record_score)
    print('train!! step %d: score = %f' % (step, record_score))
    self.summary_writer.add_summary(summary, step)

    replay_buffer.reverse()  # list.reverse() reverses in place and returns None.
    for i, [obs, action, next_obs] in enumerate(replay_buffer):
        # enumerate yields (index, element) pairs over the reversed buffer.
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1

        screens.append(screen)
        minimaps.append(minimap)
        structureds.append(structured)

        reward = obs.reward
        action_id = action.function
        action_args = action.arguments
        target_value[i] = reward + self.discount * target_value[i - 1]

        available_actions = obs.observation.available_actions
        valid_non_spatial_action[i, available_actions] = 1
        non_spatial_action_selected[i, action_id] = 1

        args = actions.FUNCTIONS[action_id].args
        for arg, action_arg in zip(args, action_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                spatial_action = action_arg[1] * self.resolution + action_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, spatial_action] = 1

    screens = np.concatenate(screens, axis=0)
    minimaps = np.concatenate(minimaps, axis=0)
    structureds = np.concatenate(structureds, axis=0)

    feed_dict = {
        self.screen: screens,
        self.minimap: minimaps,
        self.structured: structureds,
        self.target_value: target_value,
        self.valid_non_spatial_action: valid_non_spatial_action,
        self.non_spatial_action_selected: non_spatial_action_selected,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.learning_rate: learning_rate
    }
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed_dict)
    self.summary_writer.add_summary(summary, step)
def update(self, rbs, replay_buffer, disc, lr, cter):
    # Compute R, which is value of the last observation
    buffer_size = len(replay_buffer)
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.info: info
        }
        R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, pixel_change, next_obs] in enumerate(rbs):
        # Pixel change was added to the replay entries; it is fed directly into the feed dict below.
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        reward = obs.reward
        act_id = action.function
        act_args = action.arguments
        value_target[i] = reward + disc * value_target[i - 1]

        valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.pixel_change: pixel_change,
        self.info: infos,
        self.value_target: value_target,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.valid_non_spatial_action: valid_non_spatial_action,
        self.non_spatial_action_selected: non_spatial_action_selected,
        self.learning_rate: lr
    }
    # _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    # self.summary_writer.add_summary(summary, cter)

    ######################################################################################
    # Update the pc network
    start_pos = np.random.randint(0, buffer_size - self.sequence_size - 1)
    # Take care of terminals
    if replay_buffer[start_pos][-1].last():
        start_pos += 1  # Assuming that there are no successive terminal frames.
    pc_experience_frames = []
    for i in range(self.sequence_size + 1):
        frame = replay_buffer[start_pos + i]
        pc_experience_frames.append(frame)
        if frame[-1].last():
            break

    # Reverse sequence to calculate from the last
    pc_experience_frames.reverse()
    batch_pc_si = []
    batch_pc_a = []
    batch_pc_R = []
    batch_pc_va = []

    pc_R = np.zeros([20, 20], dtype=np.float32)
    if not pc_experience_frames[1][-1].last():  # each frame is [obs, action, pixel_change, next_obs]
        # pc_R = self.run_pc_q_max(self.sess, pc_experience_frames[0].state)
        # Assumption: bootstrap from the newest frame's observation (cf. run_pc_q_max in UNREAL).
        pc_obs = pc_experience_frames[0][0]
        minimap = np.array(pc_obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(pc_obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, pc_obs.observation['available_actions']] = 1
        s_feed = {
            self.pc_minimap: minimap,
            self.pc_screen: screen,
            self.pc_info: info
        }
        pc_R = self.sess.run(self.pc_q_max, s_feed)

    pc_valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    for i, [obs, action, pixel_change, next_obs] in enumerate(pc_experience_frames[1:]):
        pc_R = pixel_change + self.gamma_pc * pc_R
        a = np.zeros([self.action_size])
        a[action.function] = 1.0  # one-hot of the chosen function id (assumes action is a pysc2 FunctionCall)
        valid_actions = np.zeros((len(actions.FUNCTIONS)), dtype=np.float32)
        valid_actions_inds = obs.observation["available_actions"]
        valid_actions[valid_actions_inds] = 1

        # Assumption: feed the current frame's observation; adapt to whatever self.pc_input expects.
        batch_pc_si.append(obs)
        batch_pc_a.append(a)
        batch_pc_R.append(pc_R)
        batch_pc_va.append(valid_actions)

    batch_pc_si.reverse()
    batch_pc_a.reverse()
    batch_pc_R.reverse()
    batch_pc_va.reverse()

    pc_feed_dict = {
        self.pc_input: batch_pc_si,
        self.pc_a: batch_pc_a,
        self.pc_r: batch_pc_R,
        self.pc_valid_non_spatial_action: batch_pc_va
    }
    feed.update(pc_feed_dict)
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
def supervised_train(training_episode):
    # Initialization
    EPISODES, episode, max_average = 20000, 0, 50.0  # specific for pong
    while episode < training_episode:
        if episode < EPISODES:
            episode += 1

        replay = trajectory.Trajectory(
            '/media/kimbring2/Steam/StarCraftII/Replays/', 'Terran', 'Terran', 2500)
        replay.get_random_trajectory()

        replay_index = 0
        home_replay_done = False
        home_replay_feature_screen_list, home_replay_feature_player_list, home_replay_available_actions_list = [], [], []
        home_replay_fn_id_list, home_replay_arg_ids_list = [], []
        home_replay_memory_state_list, home_replay_carry_state_list = [], []
        memory_state = np.zeros([1, 256], dtype=np.float32)
        carry_state = np.zeros([1, 256], dtype=np.float32)

        while not home_replay_done:
            home_replay_state = replay.home_trajectory[replay_index][0]
            home_replay_actions = replay.home_trajectory[replay_index][1]
            home_replay_done = replay.home_trajectory[replay_index][2]

            home_replay_feature_screen = home_replay_state['feature_screen']
            home_replay_feature_screen = preprocess_screen(home_replay_feature_screen)
            home_replay_feature_screen = np.transpose(home_replay_feature_screen, (1, 2, 0))

            home_replay_feature_player = home_replay_state['player']
            home_replay_feature_player = preprocess_player(home_replay_feature_player)

            home_replay_available_actions = home_replay_state['available_actions']
            home_replay_available_actions = preprocess_available_actions(home_replay_available_actions)

            home_replay_feature_screen_array = np.array([home_replay_feature_screen])
            home_replay_feature_player_array = np.array([home_replay_feature_player])
            home_replay_available_actions_array = np.array([home_replay_available_actions])

            home_replay_feature_screen_list.append(home_replay_feature_screen_array)
            home_replay_feature_player_list.append(home_replay_feature_player_array)
            home_replay_available_actions_list.append(home_replay_available_actions_array)
            home_replay_memory_state_list.append(memory_state)
            home_replay_carry_state_list.append(carry_state)

            home_replay_prediction = home_agent.act(
                home_replay_feature_screen_array, home_replay_feature_player_array,
                home_replay_available_actions_array, memory_state, carry_state)
            home_replay_next_memory_state = home_replay_prediction[3]
            home_replay_next_carry_state = home_replay_prediction[4]

            home_replay_action = random.choice(home_replay_actions)
            home_replay_fn_id = int(home_replay_action.function)
            home_replay_args_ids = dict()
            for arg_type in actions.TYPES:
                home_replay_args_ids[arg_type] = -1

            arg_index = 0
            for arg_type in FUNCTIONS._func_list[home_replay_fn_id].args:
                home_replay_args_ids[arg_type] = home_replay_action.arguments[arg_index]
                arg_index += 1

            home_replay_fn_id_list.append(home_replay_fn_id)
            home_replay_arg_id_list = []
            for arg_type in home_replay_args_ids.keys():
                arg_id = home_replay_args_ids[arg_type]
                if type(arg_id) == list:
                    if len(arg_id) == 2:
                        arg_id = arg_id[0] * feature_screen_size + arg_id[1]
                    else:
                        arg_id = int(arg_id[0])
                home_replay_arg_id_list.append(arg_id)

            home_replay_arg_ids_list.append(np.array([home_replay_arg_id_list]))

            if home_replay_done == StepType.LAST:
                home_replay_done = True
            else:
                home_replay_done = False

            if home_replay_done:
                break

            replay_index += 1
            #print("replay_index: ", replay_index)
            if replay_index >= len(replay.home_trajectory) - 1:
                break

            memory_state = home_replay_next_memory_state
            carry_state = home_replay_next_carry_state

            if len(home_replay_feature_screen_list) == 16:
                if arguments.training == True:
                    home_agent.supervised_replay(
                        home_replay_feature_screen_list, home_replay_feature_player_list,
                        home_replay_available_actions_list, home_replay_fn_id_list,
                        home_replay_arg_ids_list, home_replay_memory_state_list,
                        home_replay_carry_state_list)

                home_replay_feature_screen_list, home_replay_feature_player_list, home_replay_available_actions_list = [], [], []
                home_replay_fn_id_list, home_replay_arg_ids_list = [], []
                home_replay_memory_state_list, home_replay_carry_state_list = [], []
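# Sketch (illustration only): supervised_train() flattens a two-element spatial argument into a
# single target id with first * feature_screen_size + second; decoding a predicted id back into a
# coordinate pair is the inverse via divmod. feature_screen_size below is an assumed example
# value, not necessarily the one used in training.
def _encode_spatial_arg(arg, feature_screen_size=32):
    return arg[0] * feature_screen_size + arg[1]

def _decode_spatial_arg(arg_id, feature_screen_size=32):
    first, second = divmod(arg_id, feature_screen_size)
    return [int(first), int(second)]

# _decode_spatial_arg(_encode_spatial_arg([7, 5])) == [7, 5]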
def update(self, rbs, disc, lr, cter):
    # Compute R, which is value of the last observation
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.info: info
        }
        R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        minimaps.append(minimap)
        screens.append(screen)
        infos.append(info)

        self.reward = obs.reward
        act_id = action.function
        act_args = action.arguments
        value_target[i] = self.reward + disc * value_target[i - 1]

        valid_actions = obs.observation["available_actions"]
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                ind = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.info: infos,
        self.value_target: value_target,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.valid_non_spatial_action: valid_non_spatial_action,
        self.non_spatial_action_selected: non_spatial_action_selected,
        self.learning_rate: lr,
        self.score: self.reward  # will this work?
    }
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
def reinforcement_train(training_episode):
    score_list = []
    EPISODES, episode, max_average = 20000, 0, 5.0
    while episode < training_episode:
        # Reset episode
        home_score, home_done, SAVING = 0, False, ''
        opponent_score, opponent_done = 0, False
        state = env.reset()

        home_feature_screen_list, home_feature_player_list, home_feature_units_list, home_available_actions_list = [], [], [], []
        home_fn_id_list, home_arg_ids_list, home_rewards, home_dones = [], [], [], []
        home_memory_state_list, home_carry_state_list = [], []
        memory_state = np.zeros([1, 256], dtype=np.float32)
        carry_state = np.zeros([1, 256], dtype=np.float32)

        while not home_done:
            home_state = state[0]
            #opponent_state = state[1]

            home_feature_screen = home_state[3]['feature_screen']
            home_feature_screen = preprocess_screen(home_feature_screen)
            home_feature_screen = np.transpose(home_feature_screen, (1, 2, 0))

            home_feature_player = home_state[3]['player']
            home_feature_player = preprocess_player(home_feature_player)

            home_available_actions = home_state[3]['available_actions']
            home_available_actions = preprocess_available_actions(home_available_actions)

            home_feature_units = home_state[3]['feature_units']
            home_feature_units = preprocess_feature_units(home_feature_units, feature_screen_size)
            #print("home_feature_units.shape: ", home_feature_units.shape)

            home_feature_screen_array = np.array([home_feature_screen])
            home_feature_player_array = np.array([home_feature_player])
            home_feature_units_array = np.array([home_feature_units])
            home_available_actions_array = np.array([home_available_actions])

            home_feature_screen_list.append(home_feature_screen_array)
            home_feature_player_list.append(home_feature_player_array)
            home_feature_units_list.append(home_feature_units_array)
            home_available_actions_list.append([home_available_actions])
            home_memory_state_list.append(memory_state)
            home_carry_state_list.append(carry_state)

            home_prediction = home_agent.act(home_feature_screen_array, home_feature_player_array,
                                             home_feature_units_array, home_available_actions_array,
                                             memory_state, carry_state)
            home_fn_pi = home_prediction[0]
            home_arg_pis = home_prediction[1]
            home_next_memory_state = home_prediction[3]
            home_next_carry_state = home_prediction[4]

            home_fn_samples, home_arg_samples = sample_actions(home_available_actions, home_fn_pi, home_arg_pis)
            home_fn_id, home_arg_ids = mask_unused_argument_samples(home_fn_samples, home_arg_samples)

            home_fn_id_list.append(home_fn_id[0])
            home_arg_id_list = []
            for arg_type in home_arg_ids.keys():
                arg_id = home_arg_ids[arg_type]
                home_arg_id_list.append(arg_id)

            home_arg_ids_list.append(np.array([home_arg_id_list]))
            home_actions_list = actions_to_pysc2(home_fn_id, home_arg_ids, (32, 32))
            actions_list = [home_actions_list, actions.FUNCTIONS.no_op()]

            next_state = env.step(actions_list)
            home_next_state = next_state[0]
            home_done = home_next_state[0]
            if home_done == StepType.LAST:
                home_done = True
            else:
                home_done = False

            state = next_state
            memory_state = home_next_memory_state
            carry_state = home_next_carry_state

            home_reward = float(home_next_state[1])
            home_rewards.append(home_reward)
            home_dones.append(home_done)
            home_score += home_reward

            if len(home_feature_screen_list) == 16:
                if arguments.training == True:
                    home_agent.reinforcement_replay(
                        home_feature_screen_list, home_feature_player_list, home_feature_units_list,
                        home_available_actions_list, home_fn_id_list, home_arg_ids_list,
                        home_rewards, home_dones, home_memory_state_list, home_carry_state_list)

                home_feature_screen_list, home_feature_player_list, home_feature_units_list, home_available_actions_list = [], [], [], []
                home_fn_id_list, home_arg_ids_list, home_rewards, home_dones = [], [], [], []
                home_memory_state_list, home_carry_state_list = [], []

        score_list.append(home_score)
        average = sum(score_list) / len(score_list)
        PlotModel(home_score, episode)
        print("episode: {}/{}, score: {}, average: {:.2f} {}".format(
            episode, EPISODES, home_score, average, SAVING))

        if episode < EPISODES:
            episode += 1

    env.close()