def update(self, rbs):
    ## backprop after episode
    ## if the episode did not end, use the network to estimate the value of
    ## the last state; otherwise the terminal value is 0
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        feed = {self.screen: screen}
        R = self.sess.run(self.value, feed_dict=feed)[0]

    ## prepare inputs, selected actions, and value targets
    screens = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                       dtype=np.float32)

    ## walk the episode backwards so each target reuses the next one;
    ## at i == 0, value_target[i - 1] wraps to value_target[-1] == R
    rbs.reverse()
    for i, [obs, action, _] in enumerate(rbs):
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        screens.append(screen)

        reward = obs.reward
        act_id = action.function
        act_args = action.arguments
        value_target[i] = reward + self.discount * value_target[i - 1]

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            ## the original `arg.name in ('screen')` was a substring test
            ## against the string 'screen'; an equality check is intended
            if arg.name == 'screen':
                idx = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, idx] = 1
    screens = np.concatenate(screens, axis=0)

    ## backprop
    feed = {
        self.screen: screens,
        self.value_target: value_target,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.learning_rate: self.lr
    }
    _, summary = self.sess.run([self.train_op, self.summary_op],
                               feed_dict=feed)
    self.summary_writer.add_summary(summary, self.cur_episode)
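## A minimal sketch of the bootstrapped return computed in update() above,
## using made-up rewards and a made-up bootstrap value R (both hypothetical,
## not taken from the agent). At i == 0 the index i - 1 wraps around to -1,
## so the last slot, pre-filled with R, seeds the recursion exactly as in
## update().
def _return_sketch():
    import numpy as np
    rewards = [0.0, 1.0, 0.0, 1.0]  ## oldest -> newest (hypothetical)
    discount = 0.99
    R = 0.5                         ## bootstrap value (hypothetical)
    value_target = np.zeros(len(rewards), dtype=np.float32)
    value_target[-1] = R
    for i, r in enumerate(reversed(rewards)):
        value_target[i] = r + discount * value_target[i - 1]
    ## value_target[0] == 1.0 + 0.99 * 0.5 == 1.495, and the entries run
    ## newest -> oldest, matching the reversed rbs in update()
    return value_target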
def step(self, obs):
    ## feed the observation to the network
    minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
    minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
    structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)
    feed = {
        self.minimap: minimap,
        self.screen: screen,
        self.structure: structure
    }
    non_spatial_action, spatial_action = self.sess.run(
        [self.non_spatial_action, self.spatial_action], feed_dict=feed)

    ## choose spatial and non-spatial action; the agent is restricted to
    ## no_op (0), select_army (7) and Attack_screen (12)
    non_spatial_action = non_spatial_action.ravel()
    valid_actions = [
        i for i in [0, 7, 12] if i in obs.observation['available_actions']
    ]
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]

    spatial_action = spatial_action.ravel()
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]  ## (y, x)

    ## epsilon-greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
        act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
        ## jitter the spatial target (the old name `range` shadowed the builtin)
        rand_range = int(self.random_range)
        dy = np.random.randint(-rand_range, rand_range)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-rand_range, rand_range)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    ## build the function call
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
        if arg.name in ('screen', 'minimap', 'screen2'):
            act_args.append([target[1], target[0]])  ## spatial arg, (x, y)
        else:
            act_args.append([0])  ## non-spatial arg
    return actions.FunctionCall(act_id, act_args)
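## How step() and update() are typically wired together; a sketch assuming
## a pysc2 SC2Env and that each replay-buffer entry is (obs, action, next_obs),
## which matches update() unpacking [obs, action, _] and bootstrapping from
## rbs[-1][-1]. `run_episode` itself is a hypothetical helper, not part of
## the agent.
def run_episode(agent, env):
    rbs = []
    timestep = env.reset()[0]
    while True:
        action = agent.step(timestep)
        next_timestep = env.step([action])[0]
        rbs.append((timestep, action, next_timestep))
        if next_timestep.last():
            break
        timestep = next_timestep
    agent.update(rbs)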
def step(self, obs):
    ## feed the observation to the network
    screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
    screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
    feed = {self.screen: screen}
    spatial_action = self.sess.run([self.spatial_action], feed_dict=feed)[0]

    ## choose the spatial action
    spatial_action = spatial_action.ravel()
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]  ## (y, x)

    ## epsilon-greedy exploration: jitter the spatial target
    if self.training and np.random.rand() < self.epsilon[1]:
        rand_range = int(self.random_range)  ## `range` shadowed the builtin
        dy = np.random.randint(-rand_range, rand_range)
        target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
        dx = np.random.randint(-rand_range, rand_range)
        target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

    ## scripted macro logic around the learned spatial policy
    if 490 in obs.observation.available_actions:
        ## Train_SCV_quick is available, i.e. a Command Center is selected
        return actions.FunctionCall(490, [[0]])
    if obs.observation['player'][3] < obs.observation['player'][4] \
            and obs.observation['player'][1] > 50:
        ## food_used < food_cap and minerals > 50: select_point on the
        ## Command Center at the hard-coded screen position (25, 25)
        return actions.FunctionCall(2, [[0], [25, 25]])
    if not self.walked and len(obs.observation['multi_select']) > 0 \
            and 264 in obs.observation.available_actions:
        ## Harvest_Gather_screen: send the selected workers to the target;
        ## note `target` is (y, x) here, unlike the (x, y) order used for
        ## spatial arguments elsewhere in this file
        self.walked = True
        return actions.FunctionCall(264, [[0], target])
    if 0 in obs.observation.available_actions \
            and obs.observation['player'][-4] == 0:
        ## no idle workers (player[-4] is idle_worker_count): no_op
        return actions.FunctionCall(0, [])
    if 6 in obs.observation.available_actions \
            and obs.observation['player'][-4] > 0:
        ## idle workers exist: select_idle_worker
        self.walked = False
        return actions.FunctionCall(6, [[1]])
    return actions.FunctionCall(0, [])
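## The magic numbers above can be resolved from pysc2's own function table,
## which is less fragile than hard-coded IDs; a sketch using the real
## actions.FUNCTIONS registry.
from pysc2.lib import actions

_NO_OP = actions.FUNCTIONS.no_op.id                            ## 0
_SELECT_POINT = actions.FUNCTIONS.select_point.id              ## 2
_SELECT_IDLE_WORKER = actions.FUNCTIONS.select_idle_worker.id  ## 6
_HARVEST_GATHER = actions.FUNCTIONS.Harvest_Gather_screen.id   ## 264
_TRAIN_SCV = actions.FUNCTIONS.Train_SCV_quick.id              ## 490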
def update(self, rbs):
    ## backprop after episode
    ## if the episode did not end, use the network to estimate the value of
    ## the last state; otherwise the terminal value is 0
    obs = rbs[-1][-1]
    if obs.last():
        R = 0
    else:
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)
        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.structure: structure
        }
        R = self.sess.run(self.value, feed_dict=feed)[0]

    ## prepare inputs, selected actions, and value targets
    minimaps = []
    screens = []
    structures = []
    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                       dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), self.action_size],
                                        dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), self.action_size],
                                           dtype=np.float32)

    ## walk the episode backwards so each target reuses the next one;
    ## at i == 0, value_target[i - 1] wraps to value_target[-1] == R
    rbs.reverse()
    for i, [obs, action, _] in enumerate(rbs):
        minimap = np.array(obs.observation['feature_minimap'], dtype=np.float32)
        minimap = np.expand_dims(PP.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(PP.preprocess_screen(screen), axis=0)
        structure = np.expand_dims(PP.preprocess_structure(obs), axis=0)
        minimaps.append(minimap)
        screens.append(screen)
        structures.append(structure)

        ## shape the reward: scale by (cumulative score + 1), then by 10
        reward = obs.reward * (obs.observation['score_cumulative'][0] + 1) * 10
        act_id = action.function
        act_args = action.arguments
        value_target[i] = reward + self.discount * value_target[i - 1]

        valid_actions = obs.observation['available_actions']
        valid_non_spatial_action[i, valid_actions] = 1
        non_spatial_action_selected[i, act_id] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2'):
                idx = act_arg[1] * self.ssize + act_arg[0]
                valid_spatial_action[i] = 1
                spatial_action_selected[i, idx] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    structures = np.concatenate(structures, axis=0)

    ## backprop
    feed = {
        self.minimap: minimaps,
        self.screen: screens,
        self.structure: structures,
        self.value_target: value_target,
        self.valid_spatial_action: valid_spatial_action,
        self.spatial_action_selected: spatial_action_selected,
        self.valid_non_spatial_action: valid_non_spatial_action,
        self.non_spatial_action_selected: non_spatial_action_selected,
        self.learning_rate: self.lr
    }
    _, summary = self.sess.run([self.train_op, self.summary_op],
                               feed_dict=feed)
    self.summary_writer.add_summary(summary, self.cur_episode)
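## The graph ops fed above (value_target, the action masks, train_op) are
## built elsewhere in the class; the sketch below shows one common way such
## masks combine into an A2C-style loss, assuming TF1-style tensors with the
## shapes prepared in update(). The names and structure here are illustrative
## assumptions, not the author's actual graph.
import tensorflow as tf

def build_loss_sketch(spatial_action, non_spatial_action, value,
                      spatial_action_selected, valid_spatial_action,
                      non_spatial_action_selected, valid_non_spatial_action,
                      value_target):
    ## log-prob of the chosen spatial argument, zeroed for steps whose
    ## function took no spatial argument (valid_spatial_action == 0)
    spatial_prob = tf.reduce_sum(spatial_action * spatial_action_selected,
                                 axis=1)
    spatial_logp = valid_spatial_action * tf.log(
        tf.clip_by_value(spatial_prob, 1e-10, 1.0))
    ## log-prob of the chosen function id, renormalized over the actions
    ## that were actually available at that step
    fn_prob = tf.reduce_sum(non_spatial_action * non_spatial_action_selected,
                            axis=1)
    valid_mass = tf.reduce_sum(non_spatial_action * valid_non_spatial_action,
                               axis=1)
    fn_logp = tf.log(tf.clip_by_value(
        fn_prob / tf.clip_by_value(valid_mass, 1e-10, 1.0), 1e-10, 1.0))
    ## advantage actor-critic: policy gradient plus value regression
    advantage = tf.stop_gradient(value_target - value)
    policy_loss = -tf.reduce_mean((spatial_logp + fn_logp) * advantage)
    value_loss = tf.reduce_mean(tf.square(value_target - value))
    return policy_loss + value_loss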