def get_action(self, obs_dict, step_cnt):
    """Get actions for all fighters.

    :param obs_dict: joint observation dict
    :param step_cnt: current step count
    :return: (detector_action, fighter_action)
    """
    detector_action = []
    fighter_action = []
    for y in range(self.fighter_num):
        tmp_course = obs_dict['fighter'][y]['course']  # (1,)
        tmp_pos = obs_dict['fighter'][y]['pos']  # (2,)
        tmp_r_visible_pos = obs_dict['fighter'][y]['r_visible_pos']  # (10, 2)
        tmp_l_missile = obs_dict['fighter'][y]['l_missile']  # rule use
        tmp_s_missile = obs_dict['fighter'][y]['s_missile']  # rule use
        tmp_j_visible_fp = obs_dict['fighter'][y]['j_visible_fp']  # rule use
        tmp_j_visible_dir = obs_dict['fighter'][y]['j_visible_dir']  # (10, 1)
        tmp_g_visible_pos = obs_dict['fighter'][y]['g_visible_pos']  # (10, 2), rule use
        tmp_striking_id = obs_dict['fighter'][y]['striking_id']  # (10, 1)

        # Build the model observation (normalize every field to [0, 1]).
        course = tmp_course / 359.
        pos = tmp_pos / self.size_x
        r_visible_pos = tmp_r_visible_pos.reshape(1, -1)[0] / self.size_x  # (20,)
        j_visible_dir = tmp_j_visible_dir.reshape(1, -1)[0] / 359.  # (10,)
        striking_id = tmp_striking_id.reshape(1, -1)[0] / 1.
        obs = np.concatenate(
            (course, pos, r_visible_pos, j_visible_dir, striking_id), axis=0)
        logger.debug('obs: {}'.format(obs))

        true_action = np.array([0, 1, 0, 0], dtype=np.int32)
        if obs_dict['fighter'][y]['alive']:
            # Rule policy.
            true_action = fighter_rule(tmp_course, tmp_pos, tmp_l_missile,
                                       tmp_s_missile, tmp_r_visible_pos,
                                       tmp_j_visible_dir, tmp_j_visible_fp,
                                       tmp_g_visible_pos, step_cnt)
            logger.debug('true action rule out: {}'.format(true_action))
            # Model policy: only takes over when no enemy is visible to radar
            # or jammer (all visibility entries are negative).
            if not any([any(r_visible_pos >= 0), any(j_visible_dir >= 0)]):
                tmp_action = self.maddpg.select_action(y, obs)
                logger.debug('tmp action: {}'.format(tmp_action))
                # Append action: convert the model action into a course deflection.
                tmp_action_i = np.argmax(tmp_action)
                logger.info('tmp action i: {}'.format(tmp_action_i))
                true_action[0] = action2direction(true_action[0],
                                                  tmp_action_i, ACTION_NUM)
        logger.info('true action: {}'.format(true_action))
        fighter_action.append(true_action)
    fighter_action = np.array(fighter_action)
    return detector_action, fighter_action
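
# `action2direction` is called above but not defined in this snippet. Below is
# a minimal sketch of one plausible implementation, assuming the discrete model
# action indexes one of ACTION_NUM evenly spaced course deflections; it only
# illustrates the call's shape and is NOT the project's actual helper.
def action2direction(current_course, action_index, action_num):
    """Map a discrete action index to a new absolute course in degrees."""
    step = 360 // action_num  # angular width of one action bin (assumed)
    return (int(current_course) + action_index * step) % 360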
def get_action(self, obs_dict, step_cnt):
    """Get actions for all fighters.

    :param obs_dict: joint observation dict
    :param step_cnt: current step count
    :return: (detector_action, fighter_action)
    """
    detector_action = []
    fighter_action = []
    for y in range(self.fighter_num):
        tmp_course = obs_dict['fighter'][y]['course']  # (1,)
        tmp_pos = obs_dict['fighter'][y]['pos']  # (2,)
        tmp_r_visible_pos = obs_dict['fighter'][y]['r_visible_pos']  # (10, 2)
        tmp_j_visible_fp = obs_dict['fighter'][y]['j_visible_fp']  # (10, 1), rule use
        tmp_l_missile = obs_dict['fighter'][y]['l_missile']  # rule use
        tmp_s_missile = obs_dict['fighter'][y]['s_missile']  # rule use
        tmp_j_visible_dir = obs_dict['fighter'][y]['j_visible_dir']  # (10, 1), rule use
        tmp_g_visible_pos = obs_dict['fighter'][y]['g_visible_pos']  # (10, 2)

        # Build the model observation (normalize every field to [0, 1]).
        course = tmp_course / 359.
        pos = tmp_pos / self.size_x
        r_visible_pos = tmp_r_visible_pos.reshape(1, -1)[0] / self.size_x  # (20,)
        j_visible_fp = tmp_j_visible_fp.reshape(1, -1)[0] / 359.  # (10,)
        g_visible_pos = tmp_g_visible_pos.reshape(1, -1)[0] / self.size_x  # (20,)
        obs = np.concatenate(
            (course, pos, r_visible_pos, j_visible_fp, g_visible_pos), axis=0)
        logger.debug('obs: {}'.format(obs))

        true_action = np.array([0, 1, 0, 0], dtype=np.int32)
        if obs_dict['fighter'][y]['alive']:
            # Model policy.
            tmp_action_i = self.maddpg.select_action(y, obs)
            logger.debug('tmp action i: {}'.format(tmp_action_i))
            # Rule policy.
            true_action = fighter_rule(tmp_course, tmp_pos, tmp_l_missile,
                                       tmp_s_missile, tmp_r_visible_pos,
                                       tmp_j_visible_dir, tmp_j_visible_fp,
                                       tmp_g_visible_pos, step_cnt)
            logger.debug('true action rule out: {}'.format(true_action))
            # Append action (TODO): the model output fills action slot 2,
            # either as a discrete argmax or as a rescaled continuous value.
            true_action[2] = (np.argmax(tmp_action_i) if IS_DISPERSED
                              else range_transfer(tmp_action_i, 11))
            if true_action[2] == 11:
                logger.info('agent {}: right'.format(y + 1))
                logger.wait()
        logger.info('true action: {}'.format(true_action))
        fighter_action.append(true_action)
    fighter_action = np.array(fighter_action)
    return detector_action, fighter_action
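
# `range_transfer` is likewise called but not defined here. A plausible sketch,
# assuming the continuous MADDPG output lies in [-1, 1] (tanh head) and must be
# mapped to an integer in [0, n]; this is hypothetical, not the project's helper.
import numpy as np

def range_transfer(action, n):
    """Linearly map a continuous action in [-1, 1] to an integer in [0, n]."""
    scaled = (float(np.squeeze(action)) + 1.0) / 2.0 * n  # rescale to [0, n]
    return int(np.clip(round(scaled), 0, n))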
# temp
true_action = np.array([0, 1, 0, 0], dtype=np.int32)
if not red_obs_dict['fighter'][y]['alive']:
    # If the agent is dead, emit the default dead-agent action.
    action_list.append(
        np.array([-1 for _ in range(ACTION_NUM)], dtype=np.float32))
else:
    # Model policy.
    tmp_action = maddpg.select_action(y, obs)
    logger.debug('tmp action: {}'.format(tmp_action))
    # Rule policy.
    true_action = fighter_rule(tmp_course, tmp_pos, tmp_l_missile,
                               tmp_s_missile, tmp_r_visible_pos,
                               tmp_j_visible_dir, tmp_j_visible_fp,
                               tmp_g_visible_pos, step_cnt)
    logger.debug('true action rule out: {}'.format(true_action))
    # Append action.
    true_action[2] = (np.argmax(tmp_action) if IS_DISPERSED
                      else range_transfer(tmp_action, ACTION_NUM))
    logger.debug('true action: {}'.format(true_action))
    action_list.append(tmp_action)
red_fighter_action.append(true_action)
# env step
logger.info('agent0 true action: {}'.format(red_fighter_action[0]))
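
# Quick illustration of the IS_DISPERSED branch above, using the hypothetical
# range_transfer sketch from earlier; the numbers are made-up examples.
import numpy as np

logits = np.array([0.1, 0.7, 0.2])     # dispersed (discrete) head output
print(np.argmax(logits))               # -> 1

continuous = np.array([0.35])          # tanh-bounded continuous output
print(range_transfer(continuous, 11))  # -> 7 with the sketch above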