Exemple #1
0
def collect_local_file(report_path: str, key: str, collect_path: str) -> str:
    """
    Collect one category of files from a local report directory.

    The files are copied below ``<collect_path>/<basename of report_path>``,
    which is validated with utils.check_path_valid before anything is copied.

    :param report_path: the local report path
    :param key: the key in slog_conf_path; only Constant.DIR_SLOG and
        'MNTN_PATH' trigger a copy
    :param collect_path: the collect path
    :return: the target directory below collect_path
    :raises utils.AicErrException: when key is Constant.DIR_SLOG and
        report_path contains no such directory
    """
    collect_target_path = os.path.join(collect_path,
                                       os.path.basename(report_path))
    utils.check_path_valid(collect_target_path, isdir=True, output=True)
    if key == Constant.DIR_SLOG:
        slog_report_path = os.path.join(report_path, key)
        # os.path.isdir() is already False for a path that does not exist
        if not os.path.isdir(slog_report_path):
            utils.print_error_log('There is no %s in %s.' % (key, report_path))
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        copy_file_to_dest(slog_report_path, key, collect_target_path,
                          report_path)
    elif key == 'MNTN_PATH':
        # the target path has already been built and validated above, so it
        # is not recomputed here
        hisi_report_path = os.path.join(report_path, Constant.DIR_BBOX)
        if os.path.isdir(hisi_report_path):
            copy_file_to_dest(hisi_report_path, Constant.DIR_BBOX,
                              collect_target_path, report_path)
        else:
            # missing hisi logs are only reported, they do not abort
            utils.print_warn_log('There is no hisi_logs in %s.' % report_path)
    return collect_target_path
Exemple #2
0
    def _save_tensor_to_file(self: any, tensor_list: list, tensor_type: str, dump_file: str) -> str:
        result_info = ''
        if len(tensor_list) == 0:
            utils.print_warn_log(
                'There is no %s in "%s".' % (tensor_type, dump_file))
            return result_info
        dump_file_path, _ = os.path.split(dump_file)
        for (index, tensor) in enumerate(tensor_list):
            try:
                array = np.frombuffer(tensor.data,
                                      dtype=self._get_dtype_by_data_type(
                                          tensor.data_type))
                npy_file_name = ".".join([self.kernel_name, tensor_type, str(index), "npy"])
                np.save(os.path.join(dump_file_path, npy_file_name), array)
                if (np.isinf(array).any() or np.isnan(array).any()) and tensor_type == "input":
                    result_info += '%s[%d] NaN/INF\n' % (tensor_type, index)
                    utils.print_error_log('%s[%d] NaN/INF\n' % (tensor_type, index))
                    raise utils.AicErrException(
                        Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            except (ValueError, IOError, OSError, MemoryError) as error:
                utils.print_error_log('Failed to parse the data of %s:%d of "%s". %s' % (
                    tensor_type, index, dump_file, error))
                raise utils.AicErrException(
                    Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
            finally:
                pass

        return result_info
Exemple #3
0
 def _get_dump_data(self: any, dump_file: any, header_length: int, file_size: int) -> any:
     """
     Read the header and the tensor payloads of an opened dump file.

     :param dump_file: the opened dump file, positioned at the header content
     :param header_length: the number of bytes of the serialized header
     :param file_size: the total size of the dump file
     :return: the parsed DumpData with the data of every input, output and
         buffer filled in
     :raises utils.AicErrException: when the header cannot be parsed
     """
     header_bytes = dump_file.read(header_length)
     dump_data = DD.DumpData()
     try:
         dump_data.ParseFromString(header_bytes)
     except DecodeError as de_error:
         utils.print_error_log('Failed to parse the serialized header content of %s. '
                               'Please check the dump file. %s '
                               % (dump_file, str(de_error)))
         raise utils.AicErrException(
             Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
     self._check_dump_data_vaild(dump_data, dump_file, header_length, file_size)
     # the payloads are read sequentially: inputs, then outputs, then buffers
     for tensor_group in (dump_data.input, dump_data.output, dump_data.buffer):
         for tensor in tensor_group:
             tensor.data = dump_file.read(tensor.size)
     return dump_data
Exemple #4
0
    def get_op_info(self: any) -> tuple:
        """
        Collect the ai core errors together with their node and kernel names.

        Errors for which both the node name and the kernel name are empty are
        skipped.

        :return: (ai_core_error_list, node_name_list, kernel_name_list)
        :raises utils.AicErrException: when no ai core error is left
        """
        raw_log = self._get_air_error_execute_command()
        for aic_err in self._get_aicerror_args(raw_log):
            log_time, dev_id = aic_err[0], aic_err[1]
            err_time = utils.strplogtime(log_time)
            stream_id, task_id, node_name, kernel_name = \
                self._get_node_and_kernel_name(dev_id, err_time)
            if node_name == '' and kernel_name == '':
                continue

            # adapt to the device_aic_err layout used by the original
            # development
            device_aic_err = [
                aic_err[0],  # err time
                aic_err[1],  # dev id
                stream_id,  # stream id
                task_id,  # task id
                aic_err[2],  # core id
                aic_err[3],  # aic error code
                aic_err[4],  # start pc
                self._get_extra_info(aic_err),  # extra_info
                aic_err[5],  # current pc
            ]

            self.ai_core_error_list.append(device_aic_err)
            self.node_name_list.append(node_name)
            self.kernel_name_list.append(kernel_name)

        if len(self.ai_core_error_list) == 0:
            utils.print_error_log(
                "The AIC_ERROR of device does not match the host.")
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
Exemple #5
0
    def _get_op_impl_type(self, params: list, module_name: str) -> str:
        has_dynamic_shape = False
        for para_item in params:
            if not isinstance(para_item,
                              dict) or para_item.get("param_type") != "input":
                continue
            for i in para_item.get("shape"):
                if i < 0:
                    has_dynamic_shape = True
                    break
            if has_dynamic_shape:
                break

        if ".dynamic." in module_name:
            if has_dynamic_shape:
                return "dynamic"
            else:
                return "static"
        else:
            if has_dynamic_shape:
                utils.print_error_log(
                    "There is dynamic shape in param, but call static impl")
                raise utils.AicErrException(
                    Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
            else:
                return "pre-static"
Exemple #6
0
    def _get_node_and_kernel_name(self: any, dev_id: any, err_time: any) -> tuple:
        data = self._get_node_and_kernel_name_execute_command()
        regexp = r"(\d+-\d+-\d+-\d+:\d+:\d+\.\d+\.\d+).+?device_id=\d+\s*,\s*stream_id=" \
                 r"(\d+)\s*.+?\s*task_id=(\d+)\s*,.*?fault kernel_name=" \
                 r"[-\d_]{0,}(\S+?),\s*func_name=(\S+),"
        ret = re.findall(regexp, data, re.M | re.S)
        if len(ret) == 0:
            utils.print_error_log(
                "There is no node name and kernel name for dev id(%s) in plog."
                % dev_id)
            raise utils.AicErrException(
                Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

        if len(ret) > 1:
            max_i = self._get_the_latest_aicerr_form_ret(ret, err_time)
            result = ret[max_i][1:]
        result = ret[0][1:]

        kernel_name_list = result[3].split('_')
        if "" in kernel_name_list:
            kernel_name_list.remove("")
        kernel_name_list = kernel_name_list[:-1]
        kernel_name = '_'.join(kernel_name_list)

        node_name = self._get_node_name_by_kernel_name(kernel_name)
        result_list = list(result)
        result_list[2] =  node_name
        result_list[3] = kernel_name
        return result_list
Exemple #7
0
 def _get_node_and_kernel_name_execute_command(self: any) -> any:
     """
     Grep the collected host log for failed ai core kernel executions.

     :return: the output of the grep command
     :raises utils.AicErrException: when grep exits with a non-zero status
     """
     grep_cmd = [
         'grep',
         'PrintErrorInfo:.*?aicore kernel execute failed',
         '-inrE',
         self.collect_applog_path,
     ]
     status, data = utils.execute_command(grep_cmd)
     if status == 0:
         return data
     utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
     raise utils.AicErrException(
         Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
Exemple #8
0
 def _get_air_error_execute_command(self):
     """
     Grep the collected host log for reported ai core errors.

     :return: the output of the grep command
     :raises utils.AicErrException: when grep exits with a non-zero status
     """
     grep_cmd = [
         'grep',
         'PrintCoreErrorInfo:.*?there is an aicore error',
         '-inrE',
         self.collect_applog_path,
     ]
     status, data = utils.execute_command(grep_cmd)
     if status == 0:
         return data
     utils.print_error_log("Failed to execute command: %s.Maybe rts break when report Core log to host." %
                           " ".join(grep_cmd))
     raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
Exemple #9
0
 def _get_all_error_log(self: any) -> None:
     """
     Save every [ERROR] line of the collected slog into error.log.

     The file is written to the output path.

     :raises utils.AicErrException: when grep exits with a non-zero status
     """
     error_log_file = os.path.join(self.output_path, "error.log")
     utils.print_info_log('Start to analyze error slog.')
     cmd = ['grep', r'\[ERROR\]', '-nr', self.collection.collect_slog_path]
     status, data = utils.execute_command(cmd)
     if status != 0:
         # the command output is reported as it is; joining it with spaces
         # would separate every single character of the text
         utils.print_error_log("Failed to execute command: %s. %s" %
                               (" ".join(cmd), data))
         raise utils.AicErrException(
             Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
     utils.write_file(error_log_file, data)
     utils.print_info_log('The error slog is saved in %s.' % error_log_file)
Exemple #10
0
    def _get_op_param(self, kernel_name) -> list:
        """
        Rebuild the parameter list of an operator from the host log.

        The inputs, outputs and attrs logged when the kernel was prebuilt are
        parsed with ast.literal_eval. Every input is extended with the value
        loaded from ``<kernel_name>.input.<index>.npy`` in the dump path.

        :param kernel_name: the kernel name to look up in the host log
        :return: the input dicts, then the output dicts, then the values of
            the attrs
        :raises utils.AicErrException: when the parameters are not found
        """
        get_param_cmd = [
            'grep',
            # raw f-string: "\[" is not a valid escape in a normal string
            rf'BuildSingleOp Prebuilding op: kernelName\[{kernel_name}\]',
            '-hr', '-A', '3', self.collection.collect_applog_path
        ]
        _, get_param_data = utils.execute_command(get_param_cmd)
        # drop the log prefix of every line, then join the lines so that the
        # parameters can be matched as a single piece of text
        purified_data = re.sub(r"\[INFO\].*?\[fusion_op.cc:\d+?\].*?\s", "",
                               get_param_data)
        purified_data = re.sub(r"[\n]", "", purified_data)
        get_param_regexp = r"op inputs:\s*\((.*?)\),\s*outputs:\s*\((.*?)\),\s*attrs:\s*\((.*?)\)\."
        get_param_ret = re.findall(get_param_regexp, purified_data, re.M)
        if len(get_param_ret) == 0:
            utils.print_error_log(
                f"Fail to get op params of kernel [{kernel_name}] in host log ."
            )
            raise utils.AicErrException(
                Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
        input_str, output_str, attr_str = get_param_ret[0]

        result_list = []
        input_list = ast.literal_eval("[" + input_str + "]")
        for (index, input_item) in enumerate(input_list):
            input_item["param_type"] = "input"
            input_item["run_shape"] = input_item.get("shape")
            data_file = os.path.join(
                self.collection.collect_dump_path,
                ".".join([kernel_name, "input",
                          str(index), "npy"]))

            input_item["value"] = np.load(data_file)
            # every dimension gets the fixed range (dim, dim)
            input_item["range"] = [(dim, dim)
                                   for dim in input_item.get("shape")]
        result_list.extend(input_list)

        output_list = ast.literal_eval("[" + output_str + "]")
        for output_item in output_list:
            output_item["param_type"] = "output"
            output_item["run_shape"] = output_item.get("shape")
            output_item["range"] = [(dim, dim)
                                    for dim in output_item.get("shape")]
        result_list.extend(output_list)

        # only the value of a dict attr is kept, other attrs are ignored
        attr_list_ori = ast.literal_eval("[" + attr_str + "]")
        result_list.extend(attr_item.get("value")
                           for attr_item in attr_list_ori
                           if isinstance(attr_item, dict))
        return result_list
Exemple #11
0
 def _get_module_str(self, kernel_name) -> str:
     """
     Look up the module of a kernel in the collected host log.

     :param kernel_name: the kernel name to look up
     :return: the text inside ``module[...]`` of the first matching record
     :raises utils.AicErrException: when no record is found
     """
     grep_cmd = [
         'grep',
         rf'kernel\[{kernel_name}\].*module\[',
         '-hr',
         self.collection.collect_applog_path,
     ]
     _, grep_output = utils.execute_command(grep_cmd)
     module_pattern = rf"kernel\[{kernel_name}\].*?module\[(.*?)\]"
     modules = re.findall(module_pattern, grep_output, re.M)
     if modules:
         return modules[0]
     utils.print_error_log(
         f"Fail to get op module of kernel [{kernel_name}] in host log ."
     )
     raise utils.AicErrException(
         Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
Exemple #12
0
 def _get_imas_log(self: any) -> None:
     """
     Save every IMAS line of the collected host log into imas.log.

     The file is written to the output path. A grep status of 1 only
     produces a warning, any other non-zero status is an error.

     :raises utils.AicErrException: when grep exits with a status other
         than 0 or 1
     """
     imas_log_file = os.path.join(self.output_path, "imas.log")
     cmd = ['grep', 'IMAS', '-nr', self.collection.collect_applog_path]
     utils.print_info_log('Start to analyze IMAS log.')
     status, data = utils.execute_command(cmd)
     if status == 1:
         # grep exits with status 1 when no line matched
         utils.print_warn_log("There is no IMAS log in %s" %
                              self.output_path)
         return
     if status != 0:
         # the command output is reported as it is; joining it with spaces
         # would separate every single character of the text
         utils.print_error_log("Failed to execute command: %s. %s" %
                               (" ".join(cmd), data))
         raise utils.AicErrException(
             Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
     utils.write_file(imas_log_file, data)
     utils.print_info_log('The IMAS log is saved in %s.' % imas_log_file)
Exemple #13
0
    def _get_cce_tbe_code_number(self: any, decompile_file: str,
                                 loc_json_file: str, err_pc: str,
                                 info: any) -> bool:
        # txt code to cce number
        if os.path.exists(decompile_file) is False:
            utils.print_error_log("The decompile file does not exist.")
            return False

        if err_pc != "":
            cce_code_num = self._read_decompile_file(decompile_file, err_pc,
                                                     info)
            # cce to tbe code number
            if os.path.exists(loc_json_file) is False:
                utils.print_warn_log("file %s not exist" % loc_json_file)
                return False
            self._read_loc_json_file(loc_json_file, cce_code_num, info)
        return True
Exemple #14
0
    def _decompile(self: any, kernel_info: list, dir_path: str,
                   info: any) -> bool:
        """
        Copy the kernel files of one error and decompile the kernel .o file.

        :param kernel_info: [kernel_name, kernel_meta_path]
        :param dir_path: the folder the kernel files are copied to and the
            decompiled text is written to
        :param info: the error info, updated by the analysis steps
        :return: False when the .o file is missing or cannot be decompiled,
            otherwise True
        """
        kernel_name = kernel_info[0]
        kernel_meta_path = kernel_info[1]
        diff_str, err_pc = self._get_info_for_decompile(info)

        # keep a copy of the .cce file; a missing file is only a warning
        cce_file = os.path.join(kernel_meta_path, kernel_name + ".cce")
        if not os.path.exists(cce_file):
            utils.print_warn_log(".cce file %s not exist" % cce_file)
        else:
            utils.copy_file(cce_file,
                            os.path.join(dir_path, kernel_name + ".cce"))

        # the .o file is required for decompiling
        o_file = os.path.join(kernel_meta_path, kernel_name + ".o")
        if not os.path.exists(o_file):
            utils.print_warn_log(".o file %s not exist" % o_file)
            return False

        utils.copy_file(o_file, os.path.join(dir_path, kernel_name + ".o"))

        # copy the kernel's own .json file; previously the .o binary was
        # copied under the .json name
        json_file = os.path.join(kernel_meta_path, kernel_name + ".json")
        if os.path.exists(json_file):
            utils.copy_file(json_file,
                            os.path.join(dir_path, kernel_name + ".json"))
        else:
            utils.print_warn_log(".json file %s not exist" % json_file)

        decompile_file_name = kernel_name + ".o.txt"
        decompile_file = os.path.join(dir_path, decompile_file_name)

        status = self._get_decompile_status(o_file, decompile_file)
        if status != 0:
            utils.print_error_log(
                "Failed to decompile %s, you can fix problem according to the "
                "message above, or copy %s and %s to another host and execute : "
                "%s -d -mcpu=%s %s > %s" %
                (o_file, Constant.OBJ_DUMP_FILE, o_file,
                 Constant.OBJ_DUMP_FILE, "dav-m100", kernel_name + ".o",
                 decompile_file_name))
            return False

        loc_json_file = os.path.join(kernel_meta_path,
                                     kernel_name + "_loc.json")
        # the result is not checked: a failed code lookup does not fail the
        # decompile step
        self._get_cce_tbe_code_number(decompile_file, loc_json_file, err_pc,
                                      info)
        self._get_occur_before_mark(decompile_file, diff_str, info)

        return True
Exemple #15
0
 def _get_op_by_graph(graph_file: str, info: any) -> None:
     if graph_file == '':
         return
     try:
         with open(graph_file, 'r') as graph:
             text = graph.read()
             regexp = r'(op\s+\{\s+name:\s+"%s".+?%s.+?\})\s+' \
                      r'op\s+\{' % (info.node_name, info.kernel_name)
             ret = re.findall(regexp, text, re.M | re.S)
             if len(ret) == 0:
                 utils.print_warn_log(
                     'Failed to get op for node(%s) kernel(%s).' %
                     (info.node_name, info.kernel_name))
                 return
             info.operator = ret[0]
     except IOError as io_error:
         utils.print_error_log('Failed to open file %s. %s' %
                               (graph_file, io_error))
         raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
     finally:
         pass
Exemple #16
0
 def _get_the_latest_aicerr_form_ret(ret: list, err_time: any) -> int:
     """
     Choose the log record that belongs to an ai core error.

     The latest record that is not later than err_time is preferred. When
     every record is later than err_time, the latest record is used.

     :param ret: the matched records; the first field of every record is
         its log time, any further fields are ignored
     :param err_time: the time of the ai core error
     :return: the index of the chosen record in ret
     :raises utils.AicErrException: when ret is empty
     """
     # only the first field is read, so records may carry any number of
     # further fields; every time is parsed just once
     time_objs = [utils.strplogtime(record[0]) for record in ret]

     max_i = -1
     max_time_obj = None
     for i, time_obj in enumerate(time_objs):
         # Always taking the latest record would select the same record for
         # every error, so the record must not be later than the AICERROR
         # time. This assumes that the host and device clocks are
         # synchronized; otherwise remove the condition before "and".
         if err_time >= time_obj and (
                 max_time_obj is None or time_obj > max_time_obj):
             max_time_obj = time_obj
             max_i = i
     if max_i == -1:
         # no record qualifies: fall back to the latest record overall
         for i, time_obj in enumerate(time_objs):
             if max_time_obj is None or time_obj > max_time_obj:
                 max_time_obj = time_obj
                 max_i = i
     if max_i == -1:
         utils.print_error_log("Failed to get node and kernel name.")
         raise utils.AicErrException(Constant.MS_AICERR_FIND_DATA_ERROR)
     return max_i
Exemple #17
0
    def _check_addr(self, avaliable_addrs, used_addrs):
        """
        Check the address range of every input and output parameter.

        Every parameter dict gets a "size" entry (shape size multiplied by
        the size of its dtype). It also gets ``"invalid": True`` when its
        start or end address fails self._check_addr_in_range.

        :param avaliable_addrs: the available addresses, passed on to
            self._check_addr_in_range
        :param used_addrs: provides the parameter dicts under "input_addr"
            and "output_addr"; each dict has "addr", "shape" and "dtype"
        """
        params_by_kind = (
            ("input", used_addrs.get("input_addr")),
            ("output", used_addrs.get("output_addr")),
        )
        # inputs and outputs are checked the same way; only the prefix of
        # the messages differs
        for kind, params in params_by_kind:
            for param in params:
                start_addr = int(param.get("addr"))
                shape_size = self._cal_shape_size(param.get("shape"))
                size_of_dtype = Constant.SIZE_OF_DTYPE.get(param.get("dtype"))
                byte_size = int(shape_size) * int(size_of_dtype)
                end_addr = start_addr + byte_size
                start_ok = self._check_addr_in_range(start_addr,
                                                     avaliable_addrs)
                utils.print_info_log(
                    f"shape_size is {shape_size}, size_of_dtype is {size_of_dtype}"
                )
                param["size"] = byte_size
                if not start_ok:
                    utils.print_error_log(
                        "%s_addr not avaliable, %s_start_addr:%#x" %
                        (kind, kind, start_addr))
                    param["invalid"] = True
                if not self._check_addr_in_range(end_addr, avaliable_addrs):
                    utils.print_error_log(
                        "%s_addr not avaliable, %s_end_addr:%#x" %
                        (kind, kind, end_addr))
                    param["invalid"] = True
Exemple #18
0
def collect_remote_file(report_path: str, key: str, collect_path: str) -> str:
    """
    collect remote file:
    the source directory below report_path depends on the key:
    Constant.DIR_SLOG -> log/device/firmware, 'MNTN_PATH' ->
    log/device/system, Constant.DIR_PLOG -> log/host/cann.
    A missing source directory is only logged.
    :param report_path: the report path
    :param key: the key in slog_conf_path
    :param collect_path: the collect path
    :return: the target directory below collect_path
    """
    collect_target_path = os.path.join(collect_path,
                                       os.path.basename(report_path))
    utils.check_path_valid(collect_target_path, isdir=True, output=True)

    # select source directory, destination key and "missing" report per key
    if key == Constant.DIR_SLOG:
        sub_dirs = ("log", "device", "firmware")
        dest_key = key
        report_missing = utils.print_error_log
        missing_message = 'There is no %s in %s.' % (key, report_path)
    elif key == 'MNTN_PATH':
        sub_dirs = ("log", "device", "system")
        dest_key = Constant.DIR_BBOX
        report_missing = utils.print_warn_log
        missing_message = 'There is no hisi_logs in %s.' % report_path
    elif key == Constant.DIR_PLOG:
        sub_dirs = ("log", "host", "cann")
        dest_key = Constant.DIR_PLOG
        report_missing = utils.print_warn_log
        missing_message = 'There is no plog in %s.' % report_path
    else:
        # any other key has nothing to copy
        return collect_target_path

    source_dir = os.path.join(report_path, *sub_dirs)
    # os.path.isdir() is already False for a path that does not exist
    if os.path.isdir(source_dir):
        copy_file_to_dest(source_dir, dest_key, collect_target_path,
                          report_path)
    else:
        report_missing(missing_message)
    return collect_target_path
Exemple #19
0
 def get_op_info(self: any) -> tuple:
     """
     Collect the ai core errors of the slog with their node and kernel names.

     Errors for which both the node name and the kernel name are empty are
     skipped.

     :return: (ai_core_error_list, node_name_list, kernel_name_list)
     :raises utils.AicErrException: when grep fails, when no complete error
         record is found, or when no error is left
     """
     grep_cmd = ['grep', '<exception_print>TIME.*4060006', '-nr', '-A',
                 '120', self.collect_slog_path]
     status, data = utils.execute_command(grep_cmd)
     if status != 0:
         utils.print_error_log("Failed to execute command: %s." % " ".join(grep_cmd))
         raise utils.AicErrException(
             Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

     error_records = re.findall(Constant.EXCEPTION_PATTERN, data, re.M | re.S)
     if not error_records:
         utils.print_info_log("No AIC_ERROR found.")
         raise utils.AicErrException(
             Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)

     for device_aic_err in error_records:
         if len(device_aic_err) != Constant.AIC_ERROR_TUPLE_LEN:
             utils.print_info_log("The AIC_ERROR is not complete.")
             raise utils.AicErrException(
                 Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
         # the first four fields: log time, device id, stream id, task id
         log_time, dev_id, stream_id, task_id = (
             device_aic_err[0], device_aic_err[1],
             device_aic_err[2], device_aic_err[3])
         err_time = utils.strplogtime(log_time)
         node_name, kernel_name = self._get_node_and_kernel_name(
             dev_id, task_id, stream_id, err_time)
         if node_name == '' and kernel_name == '':
             continue
         self.ai_core_error_list.append(device_aic_err)
         self.node_name_list.append(node_name)
         self.kernel_name_list.append(kernel_name)

     if len(self.ai_core_error_list) == 0:
         utils.print_error_log(
             "The AIC_ERROR of device does not match the host.")
         raise utils.AicErrException(
             Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
     return self.ai_core_error_list, self.node_name_list, self.kernel_name_list
Exemple #20
0
 def _check_dump_data_vaild(dump_data: any, dump_file: str, header_length: int, file_size: int) -> None:
     """
     Check that the sizes recorded in the header add up to the file size.

     :param dump_data: the parsed header with input, output and buffer
     :param dump_file: the dump file, only used in the error message
     :param header_length: the size of the header content
     :param file_size: the size of the dump file
     :raises utils.AicErrException: when the sizes do not add up
     """
     input_data_size = sum(item.size for item in dump_data.input)
     output_data_size = sum(item.size for item in dump_data.output)
     buffer_data_size = sum(item.size for item in dump_data.buffer)
     # check 8 + content size + sum(input.data) + sum(output.data)
     # + sum(buffer.data) equal to file size
     expected_size = header_length + Constant.UINT64_SIZE + input_data_size \
         + output_data_size + buffer_data_size
     if expected_size == file_size:
         return
     utils.print_error_log(
         'The file size(%d) of %s is not equal to %d(header '
         'length) + %d(the size of header content) + %d(the sum'
         ' of input data) + %d(the sum of output data) + %d(the'
         ' sum of buffer data). Please check the dump file.'
         % (file_size, dump_file, Constant.UINT64_SIZE,
            header_length, input_data_size, output_data_size,
            buffer_data_size))
     raise utils.AicErrException(
         Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
Exemple #21
0
 def _parse_dump_file(self: any, dump_file: str) -> any:
     """
     Parse the dump file path by big dump data format
     :param: dump_file the dump file
     :return: DumpData
     :exception when read or parse file error
     """
     utils.check_path_valid(dump_file)
     try:
         file_size = os.path.getsize(dump_file)
         # the file must be larger than the header length field itself
         if file_size <= Constant.UINT64_SIZE:
             utils.print_error_log('The size of %s is at least greater then %d, but the file'
                                   ' size is %d. Please check the dump file.'
                                   % (dump_file, Constant.UINT64_SIZE, file_size))
             raise utils.AicErrException(
                 Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
         with open(dump_file, 'rb') as dump_data_file:
             # the file starts with the length of the header content
             length_field = dump_data_file.read(Constant.UINT64_SIZE)
             header_length = struct.unpack(Constant.UINT64_FMT, length_field)[0]
             # the header content must fit into the rest of the file
             max_header_length = file_size - Constant.UINT64_SIZE
             if header_length > max_header_length:
                 utils.print_error_log(
                     'The header content size(%d) of %s must be less then '
                     'or equal to %d(file size) - %d(header length).'
                     ' Please check the dump file.'
                     % (header_length, dump_file, file_size,
                        Constant.UINT64_SIZE))
                 raise utils.AicErrException(
                     Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
             # read header content and the tensor data
             return self._get_dump_data(dump_data_file, header_length, file_size)
     except IOError as io_error:
         utils.print_error_log('Failed to read the dump file %s. %s'
                               % (dump_file, str(io_error)))
         raise utils.AicErrException(Constant.MS_AICERR_OPEN_FILE_ERROR)
Exemple #22
0
    def parse(self: any) -> None:
        """
        parse by collection info

        For every collected ai core error a folder
        ``aicerror_<index>_<time>`` is created in the output path, the error
        is analyzed (hisi log, graph op, decompile, addresses, dump) and an
        info file is written. At the end a summary file is written.

        :raises utils.AicErrException: when cce-objdump cannot be found
        """
        utils.print_info_log('******************Analysis******************')
        aicore_error_data_list = self._aicore_error_data()
        utils.print_info_log('Start to analyze each ai core error.')
        summary_info_list = []

        # decompile
        self._ensure_cce_objdump_on_path()

        for i, current_pc in enumerate(self.collection.ai_core_error_list):
            # parser aic error by slog
            info = AicErrorInfo()
            (info.err_time, info.dev_id, info.stream_id, info.task_id,
             info.core_id, info.aic_error, info.start_pc, info.extra_info,
             info.current_pc) = current_pc

            utils.print_info_log(
                "******************No.%d %s******************" %
                (i, info.err_time))
            info.err_time_obj = utils.strplogtime(info.err_time)
            err_i_folder_name = "aicerror_%d_%s" % (
                i, time.strftime("%Y%m%d%H%M%S",
                                 info.err_time_obj.timetuple()))
            err_i_folder = os.path.join(self.output_path, err_i_folder_name)
            utils.check_path_valid(err_i_folder, isdir=True, output=True)
            info.node_name = self.collection.node_name_list[i]
            info.kernel_name = self.collection.kernel_name_list[i]
            # get hisi log
            self._get_hisi_log(info, err_i_folder)
            # get op info in build proto file
            self._get_op_by_graph(aicore_error_data_list[Constant.GRAPH_FILE],
                                  info)
            kernel_meta_path = os.path.join(
                self.collection.collect_compile_path, 'kernel_meta')
            if os.path.exists(kernel_meta_path):
                # decompile to find the failing instruction
                result = self._decompile([info.kernel_name, kernel_meta_path],
                                         err_i_folder, info)
                if result is False:
                    utils.print_warn_log(
                        "decompile kernel_meta file %s failed." % os.path.join(
                            kernel_meta_path, info.kernel_name + ".o"))
            else:
                utils.print_warn_log("kernel_meta path %s not exist" %
                                     kernel_meta_path)
            try:
                # input output address
                info.aval_addrs = self._get_available_addrs(info.err_time)
                info.necessary_addr = self._get_necessary_addrs(
                    info.kernel_name)
                self._check_addr(info.aval_addrs, info.necessary_addr)
            except Exception as e:
                # the address check is best effort and must not stop the
                # analysis; Exception (not BaseException) keeps Ctrl-C and
                # SystemExit working
                import logging
                logging.exception(e)
                print("Check addr error failed")

            info.input_output_addrs = self._get_input_output_addrs(
                info, err_i_folder,
                aicore_error_data_list[Constant.ALLOC_ADDR],
                aicore_error_data_list[Constant.ACTUAL_ADDR])

            # collect the address out-of-bounds information
            info.addr_overflow = aicore_error_data_list[Constant.ADDR_OVERFLOW]
            # operator code address and args address
            info.op_addr, info.args_addr, info.multi_args_addr = \
                self._get_op_and_args_addr(info.start_pc)

            # parse dump
            if self.collection.collect_dump_path:
                parser = DumpDataParser(self.collection.collect_dump_path,
                                        info.node_name, info.kernel_name)
                info.dump_info = parser.parse()

            # write info file
            self._write_errorinfo_file(err_i_folder, info, i)

            summary_info_list.append(
                "%s   %s   device_id=%s   core_id=%s   task_id=%s   node=%s   "
                "kernel=%s" %
                (err_i_folder_name, info.aic_error, info.dev_id, info.core_id,
                 info.task_id, info.node_name, info.kernel_name))
        utils.print_info_log('Finish to analyze each ai core error.')
        # write summary info
        self._write_summary_file(summary_info_list)

    def _ensure_cce_objdump_on_path(self: any) -> None:
        """
        Make sure that a cce-objdump binary can be found.

        The binary bundled in ``<cwd>/tools`` is preferred and its folder is
        put in front of PATH. Otherwise PATH is searched, and finally the
        default Ascend installation folder is tried.

        :raises utils.AicErrException: when no cce-objdump is found
        """
        is_aarch64 = "aarch64" in platform.machine()
        tools_dir = os.path.join(os.getcwd(), "tools")
        bundled_name = "cce-objdump_aarch64" if is_aarch64 else "cce-objdump"
        bundled_file = os.path.join(tools_dir, bundled_name)
        if os.path.exists(bundled_file):
            # os.chmod avoids building a shell command from a path; a
            # failure is reported but, as before, does not stop the analysis
            try:
                os.chmod(bundled_file, 0o755)
            except OSError as os_error:
                utils.print_warn_log("Failed to chmod %s. %s" %
                                     (bundled_file, os_error))
            os.environ["PATH"] = tools_dir + ":" + os.environ["PATH"]
            return

        cce_dump = shutil.which("cce-objdump")
        if not cce_dump:
            # guess where is cce-objdump; the installation folder is an
            # absolute path, so it must start with "/"
            parent_path = "aarch64-linux" if is_aarch64 else "x86_64-linux"
            cce_dump_guess = os.path.join("/usr/local/Ascend/latest",
                                          parent_path,
                                          "ccec_compiler/bin/cce-objdump")
            if os.path.exists(cce_dump_guess):
                cce_dump = cce_dump_guess
                # presumably the tool is later started by name, as the
                # bundled branch relies on PATH too - TODO confirm
                os.environ["PATH"] = os.path.dirname(cce_dump_guess) + \
                    ":" + os.environ.get("PATH", "")

        if not cce_dump:
            utils.print_error_log(
                'Cannot find  cce-objdump! please add cce-objdump path in env PATH.'
            )
            raise utils.AicErrException(
                Constant.MS_AICERR_EXECUTE_COMMAND_ERROR)
0
 def _get_dtype_by_data_type(self: any, data_type: any) -> any:
     """
     Look up the dtype registered for a dump data type.

     :param data_type: a key of self.DATA_TYPE_TO_DTYPE_MAP
     :return: the Constant.DTYPE entry of the mapping for data_type
     :raises utils.AicErrException: when data_type is not in the mapping
     """
     if data_type in self.DATA_TYPE_TO_DTYPE_MAP:
         type_entry = self.DATA_TYPE_TO_DTYPE_MAP.get(data_type)
         return type_entry.get(Constant.DTYPE)
     utils.print_error_log("The output data type(%s) does not support." % data_type)
     raise utils.AicErrException(
         Constant.MS_AICERR_INVALID_DUMP_DATA_ERROR)
Exemple #24
0
def main() -> int:
    """
    main function: parse the command line, collect the report data and
    analyze the ai core errors found in it.

    Either --report_path or --tar_file must be given; when --tar_file is
    given it is extracted first and the directory returned by
    get_select_dir replaces report_path. Results are written to an
    "info_<timestamp>" directory below the resolved --output path.

    :return: Constant.MS_AICERR_NONE_ERROR when every step finished,
             Constant.MS_AICERR_INVALID_PARAM_ERROR when the command line
             is empty or names neither a report path nor a tar file,
             otherwise the error_info of the utils.AicErrException raised
             by one of the steps
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-p", "--report_path",
        dest="report_path", default="", required=False,
        help="<Optional> the tar dir from npucollector")
    arg_parser.add_argument(
        "-f", "--tar_file",
        dest="tar_file", default="", required=False,
        help="<Optional> the tar.gz path from npucollector")
    arg_parser.add_argument(
        "-out", "--output",
        dest="output_path", default="",
        help="<Optional> the output path")

    # nothing was passed on the command line: print the usage and give up
    if len(sys.argv) < 2:
        arg_parser.print_usage()
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    args = arg_parser.parse_args(sys.argv[1:])
    if not (args.report_path or args.tar_file):
        utils.print_error_log("report_path and tar_file must have one ")
        return Constant.MS_AICERR_INVALID_PARAM_ERROR

    try:
        # one timestamp names both the output and the extraction directory
        collect_time = time.localtime()
        cur_time_str = time.strftime("%Y%m%d%H%M%S", collect_time)

        output_root = os.path.realpath(args.output_path)
        utils.check_path_valid(output_root, isdir=True, output=True)
        output_path = os.path.join(output_root, "info_" + cur_time_str)
        utils.check_path_valid(output_path, isdir=True, output=True)

        if args.tar_file:
            print("Start to unzip tar.gz, ")
            # the extraction directory is relative to the working directory
            extract_path = "extract_" + cur_time_str
            extract_tar(args.tar_file, extract_path)
            args.report_path = get_select_dir(extract_path)

        # collect info
        collection = RemoteCollection(args.report_path, output_path)
        collection.collect()

        # clear the local script (Constant.SCRIPT) under <output>/collection
        local_script = os.path.join(output_path, 'collection', Constant.SCRIPT)
        utils.rm_path(local_script, output_path, isdir=True)

        # parse ai core error
        error_parser = AicoreErrorParser(collection, output_path, collect_time)
        error_parser.parse()

        # run the single op case step on the same collection
        single_op_case = SingleOpCase(collection, output_path, collect_time)
        single_op_case.run()
    except utils.AicErrException as error:
        # tool-specific failures are reported through their error code
        return error.error_info
    return Constant.MS_AICERR_NONE_ERROR