def collect_local_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy one kind of log directory from a local report into the collection.

    :param report_path: the local report path
    :param key: the key in slog_conf_path; Constant.DIR_SLOG and
                'MNTN_PATH' are handled, any other key copies nothing
    :param collect_path: the collect path
    :return: <collect_path>/<basename of report_path>
    :raises utils.AicErrException: key is Constant.DIR_SLOG and
            <report_path>/<key> is not an existing directory
    """
    target_dir = os.path.join(collect_path, os.path.basename(report_path))
    utils.check_path_valid(target_dir, isdir=True, output=True)

    if key == Constant.DIR_SLOG:
        slog_dir = os.path.join(report_path, key)
        if not (os.path.exists(slog_dir) and os.path.isdir(slog_dir)):
            # A missing slog directory is fatal for this key.
            utils.print_error_log('There is no %s in %s.' % (key, report_path))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        copy_file_to_dest(slog_dir, key, target_dir, report_path)
        return target_dir

    if key == 'MNTN_PATH':
        # The same target directory is validated once more for this key;
        # the extra call is kept so the call sequence stays unchanged.
        utils.check_path_valid(target_dir, isdir=True, output=True)
        bbox_dir = os.path.join(report_path, Constant.DIR_BBOX)
        if os.path.exists(bbox_dir) and os.path.isdir(bbox_dir):
            copy_file_to_dest(bbox_dir, Constant.DIR_BBOX,
                              target_dir, report_path)
        else:
            # A missing bbox directory only produces a warning.
            utils.print_warn_log('There is no hisi_logs in %s.' % report_path)

    return target_dir
def copy_kernel_meta(self, report_path: str, collect_compile_path: str,
                     kernel_name: str) -> bool:
    """
    Copy the kernel_meta files of one kernel into the collection.

    Walks <self.report_path>/extra-info/ops and copies every file whose name
    starts with kernel_name into <collect_compile_path>/kernel_meta.

    :param report_path: the local compile path; only used in the warning
                        message, the search root is self.report_path
    :param collect_compile_path: the collect compile path
    :param kernel_name: the kernel name
    :return: True when at least one matching file was handed to
             utils.copy_file, otherwise False
    """
    found = False
    ops_dir = os.path.join(self.report_path, "extra-info", "ops")
    if os.path.exists(ops_dir):
        for current_dir, _, file_names in os.walk(ops_dir):
            for file_name in file_names:
                if not file_name.startswith(kernel_name):
                    continue
                # The destination directory is only validated once a
                # matching file has actually been found.
                kernel_meta_dir = os.path.join(collect_compile_path,
                                               "kernel_meta")
                utils.check_path_valid(kernel_meta_dir, isdir=True,
                                       output=True)
                utils.copy_file(os.path.join(current_dir, file_name),
                                os.path.join(kernel_meta_dir, file_name))
                found = True

    if not found:
        utils.print_warn_log('There is no kernel_meta file for "%s" in %s.'
                             % (kernel_name, report_path))
    return found
def collect_plog_file(collect_path: str) -> None:
    """
    Copy the plog files of the current user into the collection.

    The source directory is ~/<Constant.DIR_ASCEND>/<Constant.DIR_LOG>; the
    destination is a directory with the same base name under collect_path.

    :param collect_path: the collect path
    """
    applog_dir = os.path.join(os.path.expanduser("~"),
                              Constant.DIR_ASCEND, Constant.DIR_LOG)
    target_dir = os.path.join(collect_path, os.path.basename(applog_dir))
    utils.check_path_valid(target_dir, isdir=True, output=True)
    copy_file_to_dest(applog_dir, Constant.DIR_PLOG, target_dir, applog_dir)
def collect(self: any) -> None:
    """
    Run the whole collection flow and record the result paths on self.

    Order: argument check, slog, plog, host log parsing (fills
    ai_core_error_list / node_name_list / kernel_name_list), compile files,
    dump files, bbox files. Everything is collected below
    <self.output_path>/collection.
    """
    self.check_argument_valid()
    collection_dir = os.path.join(self.output_path, 'collection')
    utils.check_path_valid(collection_dir, isdir=True, output=True)
    utils.print_info_log('******************Collection******************')

    # slog
    utils.print_info_log('Start to collect slog file.')
    self.collect_slog_path = self.collect_slog_file(self.report_path,
                                                    collection_dir)
    utils.print_info_log(
        'The slog file is saved in %s.' % self.collect_slog_path)

    # plog
    utils.print_info_log('Start to collect plog file.')
    # NOTE(review): self is passed explicitly on top of the bound call;
    # confirm this matches the collect_plog_file signature of this class.
    self.collect_plog_file(self, collection_dir)
    self.collect_applog_path = collection_dir
    utils.print_info_log(
        'The plog file is saved in %s.' % self.collect_applog_path)

    # Only the host plog is parsed: device logs cannot be obtained in some
    # scenarios, so the slog-based device parser is not used here.
    utils.print_info_log('Start to parse ai core error only by plog file.')
    log_parser = HostLogParser(self.collect_applog_path)
    op_info = log_parser.get_op_info()
    self.ai_core_error_list, self.node_name_list, self.kernel_name_list = \
        op_info
    utils.print_info_log(
        'The ai core error occurs in %s.' % self.node_name_list)

    # compile
    utils.print_info_log('Start to collect compile file.')
    self.collect_compile_path = self.collect_compile_file(
        collection_dir, self.kernel_name_list)

    # dump
    utils.print_info_log('Start to collect dump file.')
    self.collect_dump_path = self.collect_dump_file(
        collection_dir, self.node_name_list)

    # bbox
    utils.print_info_log('Start to collect bbox file.')
    self.collect_bbox_path = self.collect_bbox_file(self.report_path,
                                                    collection_dir)
    utils.print_info_log(
        'The bbox file is saved in %s.' % self.collect_bbox_path)
def collect_dump_file(self: any, collect_path: str, op_name_list: list) -> str:
    """
    Copy the dump files of the given operators into <collect_path>/dump.

    The dump files are looked up in self.compile_path.

    :param collect_path: the collect path
    :param op_name_list: the op name list
    :return: the dump directory inside collect_path (returned whether or not
             any file was copied)
    """
    # dump files are in compile_path
    utils.check_path_valid(self.compile_path, isdir=True)
    collect_dump_path = os.path.join(collect_path, 'dump')
    utils.check_path_valid(collect_dump_path, isdir=True, output=True)

    # Remember whether ANY operator produced a dump file. Keeping only the
    # status of the last operator would suppress the message below whenever
    # the final operator in the list has no dump file.
    copied_any = False
    for op_name in op_name_list:
        if utils.copy_dump_file(self.compile_path, collect_dump_path,
                                op_name):
            copied_any = True

    if copied_any:
        utils.print_info_log(
            'The dump file is saved in %s.' % collect_dump_path)
    return collect_dump_path
def _parse_dump_file(self: any, dump_file: str) -> any:
    """
    Parse the dump file path by big dump data format.

    The file starts with a length field of Constant.UINT64_SIZE bytes that
    holds the size of the header content following it.

    :param dump_file: the dump file
    :return: whatever self._get_dump_data builds from the opened file
    :raises utils.AicErrException: the file is not larger than the length
            field, the stored header size exceeds the remaining file size,
            or reading the file fails with IOError
    """
    utils.check_path_valid(dump_file)
    try:
        total_size = os.path.getsize(dump_file)
        # The file must hold more than just the length field.
        if total_size <= Constant.UINT64_SIZE:
            utils.print_error_log(
                'The size of %s is at least greater then %d, but the file'
                ' size is %d. Please check the dump file.'
                % (dump_file, Constant.UINT64_SIZE, total_size))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)

        with open(dump_file, 'rb') as stream:
            raw_length = stream.read(Constant.UINT64_SIZE)
            header_length = struct.unpack(Constant.UINT64_FMT, raw_length)[0]
            # The header content has to fit into what is left of the file.
            if header_length > total_size - Constant.UINT64_SIZE:
                utils.print_error_log(
                    'The header content size(%d) of %s must be less then '
                    'or equal to %d(file size) - %d(header length).'
                    ' Please check the dump file.'
                    % (header_length, dump_file, total_size,
                       Constant.UINT64_SIZE))
                raise utils.AicErrException(
                    Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            # The stream is positioned right after the length field.
            return self._get_dump_data(stream, header_length, total_size)
    except IOError as io_error:
        utils.print_error_log('Failed to read the dump file %s. %s'
                              % (dump_file, str(io_error)))
        raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
def check_argument_valid(self: any) -> None:
    """
    Validate the three paths this object works with.

    report_path and compile_path are checked as directories; output_path is
    checked as a directory with output=True.
    """
    validate = utils.check_path_valid
    validate(self.report_path, isdir=True)
    validate(self.compile_path, isdir=True)
    validate(self.output_path, isdir=True, output=True)
def collect_remote_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Copy one kind of log directory from an unpacked report into the collection.

    Source directory per key, relative to report_path:
      Constant.DIR_SLOG -> log/device/firmware
      'MNTN_PATH'       -> log/device/system
      Constant.DIR_PLOG -> log/host/cann
    A missing source directory is only logged; nothing is raised here.

    :param report_path: the local report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: <collect_path>/<basename of report_path>
    """
    target_dir = os.path.join(collect_path, os.path.basename(report_path))
    utils.check_path_valid(target_dir, isdir=True, output=True)

    # Select, per key: where to look, the key handed to copy_file_to_dest,
    # and how a missing directory is reported.
    if key == Constant.DIR_SLOG:
        sub_dirs = ("log", "device", "firmware")
        dest_key = key
        report_missing = utils.print_error_log
        missing_msg = 'There is no %s in %s.' % (key, report_path)
    elif key == 'MNTN_PATH':
        sub_dirs = ("log", "device", "system")
        dest_key = Constant.DIR_BBOX
        report_missing = utils.print_warn_log
        missing_msg = 'There is no hisi_logs in %s.' % report_path
    elif key == Constant.DIR_PLOG:
        sub_dirs = ("log", "host", "cann")
        dest_key = Constant.DIR_PLOG
        report_missing = utils.print_warn_log
        missing_msg = 'There is no plog in %s.' % report_path
    else:
        # Unknown keys copy nothing.
        return target_dir

    source_dir = os.path.join(report_path, *sub_dirs)
    if os.path.exists(source_dir) and os.path.isdir(source_dir):
        copy_file_to_dest(source_dir, dest_key, target_dir, report_path)
    else:
        report_missing(missing_msg)
    return target_dir
def collect_compile_file(self: any, collect_path: str, kernel_name_list: list) -> str:
    """
    Copy kernel_meta and proto files into <collect_path>/compile.

    :param collect_path: the collect path
    :param kernel_name_list: the kernel name list
    :return: the compile directory inside collect_path (returned whether or
             not any file was copied)
    """
    utils.check_path_valid(self.report_path, isdir=True)
    collect_compile_path = os.path.join(collect_path, 'compile')
    utils.check_path_valid(collect_compile_path, isdir=True, output=True)

    # Remember whether ANY kernel had kernel_meta files. Keeping only the
    # status of the last kernel would hide earlier successful copies from
    # the message below.
    kernel_meta_copied = False
    for kernel_name in kernel_name_list:
        if self.copy_kernel_meta(self.report_path, collect_compile_path,
                                 kernel_name):
            kernel_meta_copied = True

    proto_copied = self.copy_proto_file(self.report_path,
                                        collect_compile_path)
    if kernel_meta_copied or proto_copied:
        utils.print_info_log(
            'The compile file is saved in %s.' % collect_compile_path)
    return collect_compile_path
def parse(self: any) -> None:
    """
    parse by collection info

    First makes sure a cce-objdump tool is available: the copy bundled in
    <cwd>/tools is preferred (and its directory is prepended to PATH),
    otherwise PATH and the default install location are searched. Then, for
    every entry of self.collection.ai_core_error_list, a folder
    aicerror_<index>_<time> is created under self.output_path, the error is
    analysed (hisi log, graph info, decompile, addresses, dump) and an info
    file is written. A summary file is written at the end.

    :raises utils.AicErrException: MS_AICERR_EXECUTE_COMMAND_ERROR when no
            cce-objdump can be found
    """
    utils.print_info_log('******************Analysis******************')
    aicore_error_data_list = self._aicore_error_data()
    utils.print_info_log('Start to analyze each ai core error.')
    summary_info_list = []

    # decompile tool lookup
    is_aarch64 = "aarch64" in platform.machine()
    tools_dir = os.path.join(os.getcwd(), "tools")
    obj_dump_file = os.path.join(
        tools_dir, "cce-objdump_aarch64" if is_aarch64 else "cce-objdump")
    if os.path.exists(obj_dump_file):
        # os.chmod instead of a shell command: no shell is spawned and a
        # working directory containing spaces or shell metacharacters
        # cannot break or alter the command. Kept best-effort.
        try:
            os.chmod(obj_dump_file, 0o755)
        except OSError as chmod_error:
            utils.print_warn_log('Failed to chmod %s. %s'
                                 % (obj_dump_file, str(chmod_error)))
        os.environ["PATH"] = tools_dir + ":" + os.environ["PATH"]
    else:
        cce_dump = shutil.which("cce-objdump")
        if not cce_dump:
            # guess where is cce-objdump
            parent_path = "aarch64-linux" if is_aarch64 else "x86_64-linux"
            # Absolute path: without the leading "/" the guess would be
            # resolved relative to the current working directory.
            cce_dump_guess = os.path.join("/usr/local/Ascend/latest",
                                          parent_path,
                                          "ccec_compiler/bin/cce-objdump")
            if os.path.exists(cce_dump_guess):
                cce_dump = cce_dump_guess
        if not cce_dump:
            utils.print_error_log(
                'Cannot find cce-objdump! please add cce-objdump path in env PATH.'
            )
            raise utils.AicErrException(
                Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)

    for i, current_pc in enumerate(self.collection.ai_core_error_list):
        # parser aic error by slog
        info = AicErrorInfo()
        info.err_time, info.dev_id, info.stream_id, info.task_id, \
            info.core_id, info.aic_error, info.start_pc, info.extra_info, \
            info.current_pc = current_pc

        utils.print_info_log(
            "******************No.%d %s******************"
            % (i, info.err_time))
        info.err_time_obj = utils.strplogtime(info.err_time)
        err_i_folder_name = "aicerror_%d_%s" % (
            i, time.strftime("%Y%m%d%H%M%S", info.err_time_obj.timetuple()))
        err_i_folder = os.path.join(self.output_path, err_i_folder_name)
        utils.check_path_valid(err_i_folder, isdir=True, output=True)
        # node/kernel names are index-aligned with ai_core_error_list
        info.node_name = self.collection.node_name_list[i]
        info.kernel_name = self.collection.kernel_name_list[i]

        # get hisi log
        self._get_hisi_log(info, err_i_folder)
        # get op info in build proto file
        self._get_op_by_graph(aicore_error_data_list[Constant.GRAPH_FILE],
                              info)

        kernel_meta_path = os.path.join(
            self.collection.collect_compile_path, 'kernel_meta')
        if os.path.exists(kernel_meta_path):
            # decompile to locate the faulting instruction
            result = self._decompile([info.kernel_name, kernel_meta_path],
                                     err_i_folder, info)
            if result is False:
                utils.print_warn_log(
                    "decompile kernel_meta file %s failed."
                    % os.path.join(
                        kernel_meta_path, info.kernel_name + ".o"))
        else:
            utils.print_warn_log("kernel_meta path %s not exist"
                                 % kernel_meta_path)

        try:
            # input output address
            info.aval_addrs = self._get_available_addrs(info.err_time)
            info.necessary_addr = self._get_necessary_addrs(
                info.kernel_name)
            self._check_addr(info.aval_addrs, info.necessary_addr)
        except Exception as e:
            # The address check is best-effort: log and carry on. Exception
            # (not BaseException) so that KeyboardInterrupt and SystemExit
            # still stop the program.
            import logging
            logging.exception(e)
            print("Check addr error failed")

        info.input_output_addrs = self._get_input_output_addrs(
            info, err_i_folder,
            aicore_error_data_list[Constant.ALLOC_ADDR],
            aicore_error_data_list[Constant.ACTUAL_ADDR])

        # collect address out-of-bounds information
        info.addr_overflow = aicore_error_data_list[Constant.ADDR_OVERFLOW]
        # operator code address and args address
        info.op_addr, info.args_addr, info.multi_args_addr = \
            self._get_op_and_args_addr(info.start_pc)

        # parse dump
        if self.collection.collect_dump_path:
            parser = DumpDataParser(self.collection.collect_dump_path,
                                    info.node_name, info.kernel_name)
            info.dump_info = parser.parse()

        # write info file
        self._write_errorinfo_file(err_i_folder, info, i)

        summary_info_list.append(
            "%s   %s   device_id=%s   core_id=%s   task_id=%s   node=%s   "
            "kernel=%s" % (err_i_folder_name, info.aic_error, info.dev_id,
                           info.core_id, info.task_id, info.node_name,
                           info.kernel_name))
    utils.print_info_log('Finish to analyze each ai core error.')
    # write summary info
    self._write_summary_file(summary_info_list)
def main() -> int:
    """
    Command line entry: collect the report data, then analyse it.

    Either --report_path or --tar_file has to be given; a tar file is
    extracted first and the report directory is selected from the result.

    :return: Constant.MS_AICERR_NONE_ERROR on success,
             Constant.MS_AICERR_INVALID_PARAM_ERROR for missing arguments,
             otherwise the error_info of the raised utils.AicErrException
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-p", "--report_path", dest="report_path", default="",
        help="<Optional> the tar dir from npucollector", required=False)
    arg_parser.add_argument(
        "-f", "--tar_file", dest="tar_file", default="",
        help="<Optional> the tar.gz path from npucollector", required=False)
    arg_parser.add_argument(
        "-out", "--output", dest="output_path", default="",
        help="<Optional> the output path")

    # Called without any argument: show the usage instead of parsing.
    if len(sys.argv) <= 1:
        arg_parser.print_usage()
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    args = arg_parser.parse_args(sys.argv[1:])
    if not (args.report_path or args.tar_file):
        utils.print_error_log("report_path and tar_file must have one ")
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    try:
        collect_time = time.localtime()
        cur_time_str = time.strftime("%Y%m%d%H%M%S", collect_time)

        # Results go into a time-stamped folder below the output path.
        utils.check_path_valid(os.path.realpath(args.output_path),
                               isdir=True, output=True)
        output_path = os.path.join(os.path.realpath(args.output_path),
                                   "info_" + cur_time_str)
        utils.check_path_valid(output_path, isdir=True, output=True)

        if args.tar_file:
            print("Start to unzip tar.gz, ")
            extract_path = "extract_" + cur_time_str
            extract_tar(args.tar_file, extract_path)
            args.report_path = get_select_dir(extract_path)

        # collect info
        collection = RemoteCollection(args.report_path, output_path)
        collection.collect()

        # clear local script.sh
        local_script = os.path.join(output_path, 'collection',
                                    Constant.SCRIPT)
        utils.rm_path(local_script, output_path, isdir=True)

        # parse ai core error
        AicoreErrorParser(collection, output_path, collect_time).parse()

        SingleOpCase(collection, output_path, collect_time).run()
    except utils.AicErrException as error:
        return error.error_info
    return Constant.MS_AICERR_NONE_ERROR
def check_arguments_valid(self: any) -> None:
    """
    Function Description:
        Validate self.input_path as a directory via utils.check_path_valid.
    """
    input_dir = self.input_path
    utils.check_path_valid(input_dir, isdir=True)