def _get_available_addrs(self: any, occur_time: str) -> list:
    """
    Collect the addresses that were allocated and not yet freed at occur_time.

    Allocation ("DevMalloc: Succ,") and free ("DevFree: mem") records are
    grepped from the collected application log. Only records logged strictly
    before occur_time are taken into account.

    :param occur_time: time at which the aicore error occurred
    :return: list of (addr, size) tuples, addr as logged and size as int
    """
    _, malloc_log = utils.execute_command([
        'grep', 'DevMalloc: Succ,', '-nr',
        self.collection.collect_applog_path
    ])
    malloc_pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?size\s*=\s*([" \
                     r"\d]+).+?ptr\s*=\s*([\da-zA-Z]+)"
    malloc_records = re.findall(malloc_pattern, malloc_log, re.M)

    _, free_log = utils.execute_command([
        'grep', 'DevFree: mem', '-nr',
        self.collection.collect_applog_path
    ])
    free_pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?mem\s*=\s*([\da-zA-Z]+)"
    free_records = re.findall(free_pattern, free_log, re.M)

    error_time = utils.strplogtime(occur_time)

    # Every allocation that happened before the error is a candidate.
    avl_addr = [
        (addr, int(size))
        for stamp, size, addr in malloc_records
        if utils.strplogtime(stamp) < error_time
    ]

    # Each free that happened before the error cancels one matching candidate.
    for stamp, addr in free_records:
        if utils.strplogtime(stamp) < error_time:
            avl_addr = self._remove_first_found_addr(addr, avl_addr)

    utils.print_info_log("get available addr: {}".format(avl_addr))
    return avl_addr
def _get_input_output_addrs_cmd_process(self: any, info: any, err_i_folder: str) -> list:
    """
    Extract the [IMAS] input/output memaddr records logged for info.node_name.

    All "memaddr" lines of the application log are written to a temporary
    file in err_i_folder, that file is grepped for info.node_name, and the
    temporary file is removed again before the result is parsed.

    :param info: object providing node_name
    :param err_i_folder: folder in which the temporary file is created
    :return: re.findall result, a list of
        (time, "input[n]" or "output[n]", memaddr) tuples
    """
    _, memaddr_lines = utils.execute_command(
        ['grep', "memaddr", '-nr', self.collection.collect_applog_path])

    scratch_file = os.path.join(err_i_folder, 'tmp.txt')
    utils.write_file(scratch_file, memaddr_lines)
    _, node_lines = utils.execute_command(
        ['grep', info.node_name, '-nr', scratch_file])
    utils.rm_path(scratch_file, self.output_path)

    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?\[IMAS\].+(input\[\d+\]|" \
              r"output\[\d+\]) *memaddr\[(\S+)\]"
    return re.findall(pattern, node_lines, re.M)
def _get_hisi_log(self: any, info: any, err_i_folder: str) -> None:
    """
    Copy the hisi (bbox) ts log that mentions the failing task to err_i_folder.

    The bbox directory is grepped for the
    "device_id=..., stream_id=..., task_id=..." key of the failing task.
    Among the matching ts.txt files, the one chosen by
    self._get_max_hisi_file_path is copied to <err_i_folder>/ts.log.
    Only warnings are logged when nothing is found; no exception is raised
    by this method itself.

    :param info: object providing dev_id, stream_id and task_id
    :param err_i_folder: destination folder for ts.log
    """
    bbox_path = self.collection.collect_bbox_path
    hisi_log_devid_path = os.path.join(
        bbox_path, Constant.DIR_BBOX, "device-" + info.dev_id)
    if not os.path.exists(hisi_log_devid_path):
        utils.print_warn_log(
            'There is no hisi log for device_id(%s), the path=%s.'
            % (info.dev_id, hisi_log_devid_path))
        return

    key_word = "device_id=%s, stream_id=%s, task_id=%s" % (
        info.dev_id, info.stream_id, info.task_id)
    _, data = utils.execute_command(['grep', key_word, '-nr', bbox_path])

    # The path and the file name are literal text, not regex syntax: escape
    # them so that characters such as '.', '+' or '(' in the collection path
    # cannot change (or break) the pattern.
    regexp = r"(%s.+?(\d+)-(\d+).+%s)" % (
        re.escape(bbox_path), re.escape('ts.txt'))
    ret = re.findall(regexp, data, re.M)
    if not ret:
        utils.print_warn_log(
            "Failed to get hisi log for device_id(%s) stream_id(%s) "
            "task_id(%s), you may reboot and try again."
            % (info.dev_id, info.stream_id, info.task_id))
        return

    # find the last time(max time)
    max_hisi_file_path = self._get_max_hisi_file_path(ret)
    utils.copy_file(max_hisi_file_path, os.path.join(err_i_folder, "ts.log"))
def _get_tiling_info(self, kernel_name) -> list:
    """
    Read block_dim and tiling_data from the [AIC_INFO] records of a kernel.

    The application log is grepped for "[AIC_INFO] dev_func:<kernel_name>"
    with 7 lines of context on each side, and the first block_dim and
    tiling_data records in that output are used.

    :param kernel_name: kernel name to look up
    :return: (block_dim, tiling_data) tuple. block_dim is an int and
        tiling_data is the UTF-8 encoded record text; each one is "" when
        its record is missing (a warning is logged) or empty.
    """
    aic_info_cmd = [
        'grep', '-r', '-C', '7',
        r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
        self.collection.collect_applog_path
    ]
    _, aic_info = utils.execute_command(aic_info_cmd)

    # Defaults keep the return statement valid when a record is missing;
    # previously that case ended in UnboundLocalError.
    block_dim = ""
    tiling_data = ""

    aic_info_blockdim_regexp = r"\[AIC_INFO\]\sblock_dim:(\d+)"
    aic_info_blockdim_ret = re.findall(aic_info_blockdim_regexp, aic_info, re.M)
    if not aic_info_blockdim_ret:
        utils.print_warn_log(f"Failed to get {aic_info_blockdim_regexp}")
    else:
        # With a single group, findall yields plain strings: convert the
        # whole match, not just its first character. (\d+) is never empty.
        block_dim = int(aic_info_blockdim_ret[0])

    # Capture up to the end of the line. A trailing lazy "(.*?)" would always
    # match the empty string and the tiling data would never be returned.
    aic_info_tiling_data_regex = r"\[AIC_INFO\]\stiling_data:(.*)$"
    aic_info_tiling_data_ret = re.findall(aic_info_tiling_data_regex, aic_info, re.M)
    if not aic_info_tiling_data_ret:
        utils.print_warn_log(f"Failed to get {aic_info_tiling_data_regex}")
    elif not aic_info_tiling_data_ret[0]:
        utils.print_info_log(f"get {aic_info_tiling_data_regex} is null")
    else:
        tiling_data = bytes(aic_info_tiling_data_ret[0], encoding="utf-8")

    return (block_dim, tiling_data)
def _get_decompile_status(o_file: str, decompile_file: str) -> int:
    """
    Run the tool at Constant.OBJ_DUMP_FILE in disassemble mode on o_file.

    :param o_file: object file passed to the tool
    :param decompile_file: passed to utils.execute_command as file_out
        (presumably the file that receives the output - confirm in utils)
    :return: status returned by utils.execute_command
    """
    target_cpu = "dav-m100"
    status, _ = utils.execute_command(
        [Constant.OBJ_DUMP_FILE, '-d', '-mcpu=' + target_cpu, '-line-numbers', o_file],
        file_out=decompile_file)
    return status
def _get_air_error_execute_command(self):
    """
    Grep the application log for the "there is an aicore error" record.

    :return: output of the grep command
    :raises utils.AicErrException: MS_AICERR_INVALID_SLOG_DATA_ERROR when
        the command status is non-zero
    """
    grep_cmd = [
        'grep',
        'PrintCoreErrorInfo:.*?there is an aicore error',
        '-inrE',
        self.collect_applog_path,
    ]
    status, data = utils.execute_command(grep_cmd)
    if status == 0:
        return data

    utils.print_error_log("Failed to execute command: %s.Maybe rts break when report Core log to host."
                          % " ".join(grep_cmd))
    raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
def _get_node_and_kernel_name_execute_command(self: any) -> any:
    """
    Grep the application log for the "aicore kernel execute failed" record.

    :return: output of the grep command
    :raises utils.AicErrException: MS_AICERR_INVALID_SLOG_DATA_ERROR when
        the command status is non-zero
    """
    grep_cmd = [
        'grep',
        'PrintErrorInfo:.*?aicore kernel execute failed',
        '-inrE',
        self.collect_applog_path,
    ]
    status, data = utils.execute_command(grep_cmd)
    if status == 0:
        return data

    utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
    raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
def _get_addr_overflow_mini(self: any) -> list:
    """
    Report the addresses logged by devmm_page_fault_d2h_query_flag in the slog.

    :return: list of "<time> <va> is out of range" strings, one per match
    """
    _, grep_output = utils.execute_command([
        'grep', 'devmm_page_fault_d2h_query_flag', '-nr',
        self.collection.collect_slog_path
    ])
    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?va=([\da-zA-Z]+)"
    return [
        "%s %s is out of range" % (time_str, value)
        for time_str, value in re.findall(pattern, grep_output, re.M)
    ]
def _get_all_error_log(self: any) -> None:
    """
    Save every "[ERROR]" line of the collected slog to <output_path>/error.log.

    :raises utils.AicErrException: MS_AICERR_EXECUTE_COMMAND_ERROR when the
        grep command status is non-zero. grep also exits non-zero when no
        line matches.
    """
    error_log_file = os.path.join(self.output_path, "error.log")
    utils.print_info_log('Start to analyze error slog.')
    cmd = ['grep', r'\[ERROR\]', '-nr', self.collection.collect_slog_path]
    status, data = utils.execute_command(cmd)
    if status != 0:
        # Format the command output as it is: " ".join() over a text value
        # would put a space between every single character of the message.
        utils.print_error_log("Failed to execute command: %s. %s"
                              % (" ".join(cmd), data))
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    utils.write_file(error_log_file, data)
    utils.print_info_log('The error slog is saved in %s.' % error_log_file)
def _get_op_param(self, kernel_name) -> list:
    """
    Rebuild the parameter list of an operator from its prebuild log record.

    The "BuildSingleOp Prebuilding op: kernelName[<kernel_name>]" record and
    the 3 lines after it are grepped from the application log. Log prefixes
    and newlines are stripped, and the "op inputs: (...), outputs: (...),
    attrs: (...)." payload is parsed with ast.literal_eval.

    :param kernel_name: kernel name to look up
    :return: list containing, in this order:
        - one dict per input, extended with "param_type", "run_shape",
          "value" (loaded from <dump_path>/<kernel_name>.input.<i>.npy)
          and "range";
        - one dict per output, extended with "param_type", "run_shape"
          and "range";
        - the "value" entry of every attr that is a dict.
    :raises utils.AicErrException: MS_AICERR_EXECUTE_COMMAND_ERROR when the
        record cannot be found in the log
    """
    # Raw f-string: "\[" must reach grep as a literal backslash + bracket,
    # and is an invalid escape sequence in a non-raw Python string.
    get_param_cmd = [
        'grep',
        rf'BuildSingleOp Prebuilding op: kernelName\[{kernel_name}\]',
        '-hr', '-A', '3', self.collection.collect_applog_path
    ]
    _, get_param_data = utils.execute_command(get_param_cmd)

    # Drop the "[INFO]...[fusion_op.cc:N]..." prefixes, then the newlines,
    # so a record that spans several log lines becomes one string.
    purified_data = re.sub(r"\[INFO\].*?\[fusion_op.cc:\d+?\].*?\s", "",
                           get_param_data)
    purified_data = re.sub(r"[\n]", "", purified_data)

    get_param_regexp = r"op inputs:\s*\((.*?)\),\s*outputs:\s*\((.*?)\),\s*attrs:\s*\((.*?)\)\."
    get_param_ret = re.findall(get_param_regexp, purified_data, re.M)
    if not get_param_ret:
        utils.print_error_log(
            f"Fail to get op params of kernel [{kernel_name}] in host log ."
        )
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    input_str, output_str, attr_str = get_param_ret[0]

    result_list = []

    input_list = ast.literal_eval("[" + input_str + "]")
    for index, input_item in enumerate(input_list):
        shape = input_item.get("shape")
        input_item["param_type"] = "input"
        input_item["run_shape"] = shape
        data_file = os.path.join(
            self.collection.collect_dump_path,
            ".".join([kernel_name, "input", str(index), "npy"]))
        input_item["value"] = np.load(data_file)
        # Static shape: lower and upper bound of every dim are the dim itself.
        input_item["range"] = [(dim, dim) for dim in shape]
    result_list.extend(input_list)

    output_list = ast.literal_eval("[" + output_str + "]")
    for output_item in output_list:
        shape = output_item.get("shape")
        output_item["param_type"] = "output"
        output_item["run_shape"] = shape
        output_item["range"] = [(dim, dim) for dim in shape]
    result_list.extend(output_list)

    attr_list_ori = ast.literal_eval("[" + attr_str + "]")
    result_list.extend(
        attr_item.get("value")
        for attr_item in attr_list_ori
        if isinstance(attr_item, dict))
    return result_list
def _get_addr_overflow_cloud(self: any) -> list:
    """
    Report the "previous alloced start_va" records found in the slog.

    :return: list of "<time> <va> is out of range [<start_va>, <end_va>]"
        strings, one per match
    """
    _, grep_output = utils.execute_command([
        'grep', 'previous alloced start_va', '-nr',
        self.collection.collect_slog_path
    ])
    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?va=([\da-zA-Z]+)\s+previous " \
              r"alloced start_va=([\da-zA-Z]+), end_va=([\da-zA-Z]+),"
    return [
        "%s %s is out of range [%s, %s]" % (time_str, value, start, end)
        for time_str, value, start, end in re.findall(pattern, grep_output, re.M)
    ]
def _get_alloc_addr(self: any) -> list:
    """
    List every successful device allocation found in the application log.

    Example of a parsed record:
    DevMalloc: Succ, size=512, type=2, ptr=0x108040014000

    :return: list of (ptr, size) tuples, ptr as logged and size as int
    """
    _, grep_output = utils.execute_command([
        'grep', 'DevMalloc: Succ,', '-nr',
        self.collection.collect_applog_path
    ])
    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?size\s*=\s*([" \
              r"\d]+).+?ptr\s*=\s*([\da-zA-Z]+)"
    # The timestamp group is matched but not needed here.
    return [
        (addr, int(size))
        for _, size, addr in re.findall(pattern, grep_output, re.M)
    ]
def _get_op_and_args_addr(self: any, pc_start: str) -> tuple:
    """
    Find the operator address and the args addresses logged for pc_start.

    :param pc_start: pc start value as a hex string; only its low 48 bits
        are valid
    :return: (op_addr, args_addr_late, multi_args_addr), the last two being
        whatever self._get_args_addr_late returns for the matched records
    """
    # Only the low 48 bits of pc_start are valid.
    low_bits = utils.get_01_from_hexstr(pc_start, 47, 0)
    op_addr = hex(int(low_bits, 2))

    # NOTE(review): upper() also upper-cases the "0x" prefix produced by
    # hex(); confirm the log really prints funcAddr as "0X...".
    match_pattern = "ToCommandBody: funcAddr=%s" % (str(op_addr).upper())
    _, grep_output = utils.execute_command([
        'grep', match_pattern, '-nr',
        self.collection.collect_applog_path
    ])

    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?funcAddr=([\da-zA-Z]+).+?args=([\da-zA-Z]+)"
    records = re.findall(pattern, grep_output, re.M)
    args_addr_late, multi_args_addr = self._get_args_addr_late(op_addr, records)
    return op_addr, args_addr_late, multi_args_addr
def _get_module_str(self, kernel_name) -> str:
    """
    Return the module[...] value logged next to kernel[<kernel_name>].

    :param kernel_name: kernel name to look up in the application log
    :return: text between "module[" and "]" of the first matching record
    :raises utils.AicErrException: MS_AICERR_EXECUTE_COMMAND_ERROR when no
        record matches
    """
    _, grep_output = utils.execute_command([
        'grep', rf'kernel\[{kernel_name}\].*module\[', '-hr',
        self.collection.collect_applog_path
    ])
    pattern = rf"kernel\[{kernel_name}\].*?module\[(.*?)\]"
    modules = re.findall(pattern, grep_output, re.M)
    if modules:
        return modules[0]

    utils.print_error_log(
        f"Fail to get op module of kernel [{kernel_name}] in host log ."
    )
    raise utils.AicErrException(Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
def _get_imas_log(self: any) -> None:
    """
    Save every "IMAS" line of the application log to <output_path>/imas.log.

    A command status of 1 is treated as "nothing found": a warning is
    logged and no file is written.

    :raises utils.AicErrException: MS_AICERR_EXECUTE_COMMAND_ERROR when the
        command status is neither 0 nor 1
    """
    imas_log_file = os.path.join(self.output_path, "imas.log")
    cmd = ['grep', 'IMAS', '-nr', self.collection.collect_applog_path]
    utils.print_info_log('Start to analyze IMAS log.')
    status, data = utils.execute_command(cmd)
    if status == 1:
        utils.print_warn_log("There is no IMAS log in %s" % self.output_path)
        return
    if status != 0:
        # Format the command output as it is: " ".join() over a text value
        # would put a space between every single character of the message.
        utils.print_error_log("Failed to execute command: %s. %s"
                              % (" ".join(cmd), data))
        raise utils.AicErrException(
            Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
    utils.write_file(imas_log_file, data)
    utils.print_info_log('The IMAS log is saved in %s.' % imas_log_file)
def _get_addr_overflow_diff_incorrect_device(self: any) -> list:
    """
    Report addresses that were visited on a device other than their owner.

    Parses the devmm_svm_get_vaflgs_by_pid "addr is mapped" records of the
    slog.

    :return: list of report strings, one per match
    """
    _, grep_output = utils.execute_command([
        'grep', 'devmm_svm_get_vaflgs_by_pid', '-nr',
        self.collection.collect_slog_path
    ])
    pattern = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?addr is mapped.+va=" \
              r"([\da-zA-Z]+).devid=(\d+),bitmap=([\da-zA-Z]+)"

    reports = []
    for time_str, value, devid, bitmap in re.findall(pattern, grep_output, re.M):
        # Bits [31:26] of bitmap tell on which device the address was allocated.
        owner_bits = utils.get_01_from_hexstr(bitmap, 31, 26)
        allocated_dev_id = str(int(owner_bits, 2))
        reports.append(
            "%s %s, allocated for device %s, is visited on wrong "
            "device whose id is %s" % (
                time_str, value, allocated_dev_id, devid))
    return reports
def _get_node_name_by_kernel_name(self: any, kernel_name: any) -> str:
    """
    Get the node name logged in the [AIC_INFO] records of a kernel.

    The application log is grepped for "[AIC_INFO] dev_func:<kernel_name>"
    with 7 lines of context on each side; the first
    "[AIC_INFO] node_name:<name>," record in that output is used.

    :param kernel_name: kernel name to look up
    :return: node name, or '' (after a warning) when no record is found
    """
    # Raw string: "\[" must reach grep as a literal backslash + bracket, and
    # is an invalid escape sequence in a non-raw Python string.
    aic_info_cmd = [
        'grep', '-r', '-C', '7',
        r"\[AIC_INFO\] dev_func:{}".format(kernel_name),
        self.collect_applog_path
    ]
    _, aic_info = utils.execute_command(aic_info_cmd)

    aic_info_dev_func_regex = r"\[AIC_INFO\]\snode_name:(.*?),"
    aic_info_dev_func_ret = re.findall(aic_info_dev_func_regex, aic_info)
    if not aic_info_dev_func_ret:
        utils.print_warn_log("Failed to get node name by kernel name.")
        return ''
    return aic_info_dev_func_ret[0]
def _get_actual_addr(self: any) -> dict:
    """
    Map each logged address to the actual address from "[ZCPY] Copy Blobs".

    For every "[ZCPY] Copy Blobs ... addr: <old> ... data: <new>" record in
    the slog, the <old> -> <new> pair is recorded. When the same <old>
    address appears several times, the record with the latest timestamp
    wins (the first one on equal timestamps).

    :return: dict mapping old address to new address, both as logged
    """
    # The brackets must be escaped: unescaped, grep reads "[ZCPY]" as a
    # bracket expression (one of Z, C, P, Y) and the literal "[ZCPY] Copy
    # Blobs" text is never matched.
    cmd = [
        'grep', r'\[ZCPY\] Copy Blobs', '-nr',
        self.collection.collect_slog_path
    ]
    _, data = utils.execute_command(cmd)
    regexp = (r'(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?Copy Blobs.+?addr:\s*(['
              r'\da-zA-Z]+).+?data:'
              r'\s*([\da-zA-Z]+)')
    ret = re.findall(regexp, data, re.M)

    # old_addr -> (new_addr, time of the record that supplied it)
    latest = {}
    for time_str, old_addr, new_addr in ret:
        time_obj = utils.strplogtime(time_str)
        previous = latest.get(old_addr)
        # Keep the latest record only.
        if previous is None or time_obj > previous[1]:
            latest[old_addr] = (new_addr, time_obj)

    return {old_addr: new_addr for old_addr, (new_addr, _) in latest.items()}
def get_op_info(self: any) -> tuple:
    """
    Collect the aicore errors of the slog that can be matched to a host op.

    The slog is grepped for "<exception_print>TIME.*4060006" records (with
    120 lines of trailing context) and parsed with
    Constant.EXCEPTION_PATTERN. Each parsed error is looked up through
    self._get_node_and_kernel_name; errors for which both the node name and
    the kernel name come back empty are skipped.

    :return: (self.ai_core_error_list, self.node_name_list,
        self.kernel_name_list), extended in parallel by this call
    :raises utils.AicErrException: MS_AICERR_INVALID_SLOG_DATA_ERROR when
        grep fails, nothing is parsed, a parsed error is incomplete, or no
        error could be matched
    """
    grep_cmd = [
        'grep', '<exception_print>TIME.*4060006', '-nr', '-A', '120',
        self.collect_slog_path
    ]
    status, data = utils.execute_command(grep_cmd)
    if status != 0:
        utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
        raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

    parsed_errors = re.findall(Constant.EXCEPTION_PATTERN, data, re.M | re.S)
    if not parsed_errors:
        utils.print_info_log("No AIC_ERROR found.")
        raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

    for device_aic_err in parsed_errors:
        if len(device_aic_err) != Constant.AIC_ERROR_TUPLE_LEN:
            utils.print_info_log("The AIC_ERROR is not complete.")
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

        # Leading fields of a parsed error: time, device, stream, task.
        log_time = device_aic_err[0]
        dev_id = device_aic_err[1]
        stream_id = device_aic_err[2]
        task_id = device_aic_err[3]
        node_name, kernel_name = self._get_node_and_kernel_name(
            dev_id, task_id, stream_id, utils.strplogtime(log_time))

        # Keep the error as soon as at least one of the two names is known.
        if node_name != '' or kernel_name != '':
            self.ai_core_error_list.append(device_aic_err)
            self.node_name_list.append(node_name)
            self.kernel_name_list.append(kernel_name)

    if len(self.ai_core_error_list) == 0:
        utils.print_error_log(
            "The AIC_ERROR of device does not match the host.")
        raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
    return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
def _get_necessary_addrs(self: any, kernal_name: str) -> list:
    """
    Collect the addresses a kernel needs from its [AIC_INFO] log records.

    The application log is grepped for "[AIC_INFO] dev_func:<kernal_name>"
    with 7 lines of context on each side, and the input, output and
    workspace_bytes records are parsed out of that output.

    :param kernal_name: kernel name of the operator that hit the aicore error
    :return: dict with the keys
        - "input_addr": list of dicts, one per input record;
        - "output_addr": list of dicts, one per output record;
        - "workspace": text of the workspace_bytes record, "0" when that
          record is missing or empty.
        Each input/output dict has the string entries "index", "shape",
        "format", "dtype" and "addr".
        None is returned (after a warning) when the node, input or output
        records cannot be found.
    """
    aic_info_cmd = [
        'grep', '-r', '-C', '7',
        r"\[AIC_INFO\] dev_func:{}".format(kernal_name),
        self.collection.collect_applog_path
    ]
    _, aic_info = utils.execute_command(aic_info_cmd)
    utils.print_info_log(
        "===============================\n{}\n=================================="
        .format(aic_info))

    def first_value(pattern):
        """Return the first capture of pattern in aic_info, None if absent."""
        found = re.findall(pattern, aic_info, re.M)
        if not found:
            utils.print_warn_log(f"Failed to get {pattern}")
            return None
        if not found[0]:
            utils.print_info_log(f"get {pattern} is null")
        # With a single group, findall yields plain strings: return the
        # whole match, not just its first character.
        return found[0]

    # The node record is only required to be present; its fields are unused.
    aic_info_all_regexp = r"\[AIC_INFO\]\snode_name:(.*?),\snode_type:(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
    if not re.findall(aic_info_all_regexp, aic_info, re.M):
        utils.print_warn_log(
            r"Failed to get [AIC_INFO]\snode_name(.*?),\snode_tye(.*?),\sstream_id:(\d+),\stask_id:(\d+)"
        )
        return

    param_keys = ("index", "shape", "format", "dtype", "addr")

    aic_info_input_regexp = r"\[AIC_INFO\]\sinput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
    aic_info_input_ret = re.findall(aic_info_input_regexp, aic_info, re.M)
    if not aic_info_input_ret:
        utils.print_warn_log(
            r"Failed to get [AIC_INFO]\sinput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
        )
        return
    input_params = [dict(zip(param_keys, record)) for record in aic_info_input_ret]

    aic_info_output_regexp = r"\[AIC_INFO\]\soutput:(.*?);shape:(.*?);format:(.*?);dtype:(.*?);addr:(.*?)$"
    aic_info_output_ret = re.findall(aic_info_output_regexp, aic_info, re.M)
    if not aic_info_output_ret:
        utils.print_warn_log(
            r"Failed to get [AIC_INFO]\soutput:(.*?);shape(.*?);format:(.*?);dtype(.*?);addr:(.*?)"
        )
        return
    output_params = [dict(zip(param_keys, record)) for record in aic_info_output_ret]

    # block_dim and tiling_data are not part of the result; they are looked
    # up only so that a missing or empty record shows up in the log.
    first_value(r"\[AIC_INFO\]\sblock_dim:(\d+)")

    # Capture up to the end of the line: a trailing lazy "(.*?)" always
    # matches the empty string. Fall back to "0" when the record is missing
    # (previously an UnboundLocalError) or empty.
    workspace = first_value(r"\[AIC_INFO\]\sworkspace_bytes:(.*)$") or "0"

    first_value(r"\[AIC_INFO\]\stiling_data:(.*)$")

    return {
        "input_addr": input_params,
        "output_addr": output_params,
        "workspace": workspace,
    }