Example 1
    def __init__(self, **kwargs):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        output_path = kwargs.pop("output_path", "./data")
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
            os.chmod(self._output_path,
                     stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        else:
            logger.warning(
                "The target dir already exists. "
                "There may be some old profiling data, and they will be rewrote in the end."
            )

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

        if self._device_target and self._device_target == "GPU":
            from mindspore._c_expression import GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)
            if context.get_auto_parallel_context('device_num') > 1:
                self._dev_id = get_rank()
            os.environ['DEVICE_ID'] = str(self._dev_id)

            if kwargs:
                logger.warning("Params not be supported yet on GPU.")
        elif self._device_target and self._device_target == "Ascend":
            optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
            if not isinstance(optypes_not_deal, str):
                raise TypeError("The parameter optypes_not_deal must be str.")
            job_id = kwargs.pop("ascend_job_id", "")
            if kwargs:
                logger.warning("There are invalid params which don't work.")

            os.environ['DEVICE_ID'] = self._dev_id
            os.environ['AICPU_PROFILING_MODE'] = 'true'

            # use the context interface to enable profiling, for new MindSpore versions (after 2020.5.21)
            context.set_context(enable_profiling=True,
                                profiling_options="training_trace:task_trace")

            self._container_path = os.path.join(
                self._base_profiling_container_path, self._dev_id)
            data_path = os.path.join(self._container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)

            self._filt_optype_names = optypes_not_deal.split(
                ",") if optypes_not_deal else []
            self._profiling_job_id = job_id
            # add job id env through user input later
            self._job_id_env = 0
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d",
                        self._start_time)
Example 2
    def _analyse_step_trace(self,
                            source_path=None,
                            framework_parser=None,
                            is_training_mode_flag=True):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
            is_training_mode_flag (bool): Whether in training mode or not.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{self._dev_id}_detail_time.csv')
        point_info_file_path = os.path.join(self._output_path,
                                            'step_trace_point_info.json')
        step_trace_intermediate_file_path = validate_and_normalize_path(
            step_trace_intermediate_file_path)
        point_info_file_path = validate_and_normalize_path(
            point_info_file_path)

        if self._device_target and self._device_target == 'GPU':
            input_file_path = os.path.join(
                self._output_path, f'step_trace_profiling_{self._dev_id}.txt')
            parser = GpuStepTraceParser(
                input_dir=input_file_path,
                output_file_path=step_trace_intermediate_file_path,
                is_training_mode=is_training_mode_flag)
            parser.parse_and_save()
            point_info = parser.record_point_info(input_file_path,
                                                  point_info_file_path)
        else:
            # whether to skip the first step
            skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
            point_info = framework_parser.point_info
            # recognize inference or training mode
            is_training_mode_flag = framework_parser.check_op_name("Gradients")
            # parse the step trace files and save the result to disk
            source_path = validate_and_normalize_path(source_path)
            parser = AscendStepTraceParser(
                input_dir=source_path,
                output_file_path=step_trace_intermediate_file_path,
                job_id=self._job_id_env,
                skip_first_step=skip_first_step_flag,
                is_training_mode=is_training_mode_flag)
            parser.update_tag_op_type_map(point_info)
            parser.parse_and_save()
            point_info = parser.record_point_info(point_info,
                                                  point_info_file_path)
        # print parser result
        parser.show()
        logger.info("Finish saving the intermediate result: %s",
                    step_trace_intermediate_file_path)
        logger.info("The point info is: %s", point_info)

        return point_info
Example 3
    def _get_output_path(self, kwargs):
        """Get output path of profiling data."""
        current_time = int(time.time())

        # To avoid different processes getting different timestamps in multi-card
        # training, reuse an existing timestamp if the difference is less than 6 seconds.
        def _select_timestamp(dir_name, re_pattern, input_time):
            """select the timestamp from current_time and exist timestamp."""
            timestamp_diff_threshold = 6
            exist_timestamp_list = []
            select_time = input_time
            if not os.path.exists(dir_name):
                os.makedirs(dir_name, exist_ok=True)
            for file_name in os.listdir(dir_name):
                match_res = re_pattern.match(file_name)
                if match_res:
                    exist_timestamp_list.append(int(match_res.group(1)))
            if exist_timestamp_list:
                time_diff_list = [
                    input_time - timestamp
                    for timestamp in exist_timestamp_list
                ]
                min_time_diff = min(time_diff_list)
                if min_time_diff <= timestamp_diff_threshold:
                    select_time = exist_timestamp_list[time_diff_list.index(
                        min_time_diff)]

            return select_time

        if "output_path" not in kwargs:
            selected_timestamp = _select_timestamp(os.getcwd(),
                                                   re.compile(r'data-(\d+)'),
                                                   current_time)
            output_path = f"data-{selected_timestamp}"
            self._output_path = validate_and_normalize_path(output_path)
        else:
            output_path = kwargs.pop("output_path")
            self._output_path = validate_and_normalize_path(output_path)
            selected_timestamp = _select_timestamp(
                self._output_path, re.compile(r'profiler-(\d+)'), current_time)

        self._output_path = os.path.join(self._output_path,
                                         f"profiler-{selected_timestamp}")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
            os.chmod(self._output_path,
                     stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        else:
            logger.warning(
                "The target dir already exists. "
                "There may be some old profiling data, and they will be rewrote in the end."
            )
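A self-contained sketch of the selection rule above, with hypothetical timestamps: two processes starting within the 6-second threshold resolve to the same profiler-<timestamp> directory.

# Standalone illustration of _select_timestamp's rule (hypothetical values).
timestamp_diff_threshold = 6
exist_timestamp_list = [1700000000]  # directory created by the first process
current_time = 1700000004            # this process starts 4 seconds later

time_diff_list = [current_time - t for t in exist_timestamp_list]
min_time_diff = min(time_diff_list)
select_time = (exist_timestamp_list[time_diff_list.index(min_time_diff)]
               if min_time_diff <= timestamp_diff_threshold else current_time)
print(select_time)  # 1700000000, so both processes share profiler-1700000000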
Example 4
    def get_min_cycle_counter(self):
        """
        Get minimum cycle counter.

        Returns:
            float, the minimum value of the cycle counter.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id))

        file_path = validate_and_normalize_path(file_path)

        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    min_cycle_counter = f_obj.read()
                    min_cycle_counter = float(min_cycle_counter) \
                        if min_cycle_counter != 'inf' else 0
            except (IOError, OSError) as err:
                logger.error(
                    'Error occurred when reading the minimum cycle counter: %s', err)
                raise ProfilerIOException
        else:
            min_cycle_counter = 0
            logger.info("No min cycle counter recorded.")

        return min_cycle_counter
Example 5
    def _get_and_validate_path(self, file_name):
        """Generate op or activity file path from file name, and validate this path."""
        file_path = os.path.join(self._profiling_dir,
                                 file_name.format(self._device_id))
        file_path = validate_and_normalize_path(file_path)

        return file_path
Example 6
    def _get_pipeline_path(self, source_dir):
        """
        Get the minddata pipeline file path.

        Args:
            source_dir (str): The minddata pipeline source dir.

        Returns:
            str, the minddata pipeline file path.
        """
        pipeline_path = os.path.join(
            source_dir, self._raw_pipeline_file_name.format(self._device_id))

        try:
            pipeline_path = validate_and_normalize_path(pipeline_path)
        except RuntimeError:
            logger.warning('Minddata pipeline file is invalid.')
            raise ProfilerPathErrorException(
                'Minddata pipeline file is invalid.')
        if not os.path.isfile(pipeline_path):
            logger.warning('The minddata pipeline file <%s> not found.',
                           pipeline_path)
            raise ProfilerFileNotFoundException(pipeline_path)

        return pipeline_path
Example 7
    def write_timeline_to_json_by_limitation(self, size_limit):
        """Write timeline to json by limitation."""
        display_filename = self._display_filename.format(self._device_id)
        display_file_path = os.path.join(
            self._profiling_dir,
            display_filename
        )
        display_file_path = validate_and_normalize_path(display_file_path)

        length = len(self._timeline_meta)
        try:
            with open(display_file_path, 'w') as json_file:
                json_file.write('[')
                for index, item in enumerate(self._timeline_meta):
                    json.dump(item, json_file)
                    file_size = os.path.getsize(display_file_path)
                    if file_size > size_limit:
                        break
                    if index == length - 1:
                        break
                    json_file.write(',')
                json_file.write(']')
                os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing the timeline display file: %s', err)
            raise ProfilerIOException
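The loop streams one JSON item at a time and re-checks the on-disk size after each dump, so the display file never grows far past the limit. A minimal standalone sketch of the same pattern with hypothetical names; flush() is added here so getsize() sees the buffered bytes.

import json
import os

def write_json_list_capped(items, path, size_limit):
    """Sketch: write items as a JSON array, stopping once the file exceeds size_limit bytes."""
    with open(path, 'w') as f:
        f.write('[')
        for index, item in enumerate(items):
            json.dump(item, f)
            f.flush()  # make getsize() reflect what was just written
            if os.path.getsize(path) > size_limit or index == len(items) - 1:
                break
            f.write(',')
        f.write(']')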
Example 8
    def _parse_aicpu_time(self):
        """Parse the parsed AICPU operator time file."""
        aicpu_file = os.path.join(
            self._profiling_dir,
            self._file_name_aicpu_time.format(self._device_id))
        aicpu_file = validate_and_normalize_path(aicpu_file)
        if not os.path.isfile(aicpu_file):
            return

        save_file_name = 'aicpu_intermediate_' + self._device_id + '.csv'
        save_file_path = os.path.join(self._profiling_dir, save_file_name)
        with open(aicpu_file, 'r') as src_file:
            row = src_file.readline()
            if not row.startswith('serial_number'):
                return
            with open(save_file_path, 'w') as save_file:
                csv_writer = csv.writer(save_file)
                csv_writer.writerow(self._header_aicpu)

                while True:
                    row = src_file.readline()
                    if not row:
                        break
                    infos = row.split()
                    if infos[0] == 'AI':
                        continue
                    csv_writer.writerow(infos)
Example 9
    def _get_step_end_tag_id(self, source_files):
        """
        Get the step end tag id. This id was 255 before 2020.12.16 and is 65535 now.
        The file is an old version if there is no 65535 tag id; otherwise it is a new version.
        """

        step_num = 0
        source_file = validate_and_normalize_path(source_files[0])
        try:
            with open(source_file, 'rb') as handler:
                content = handler.read()
                for pos in range(0, len(content), self._event_size):
                    next_event = self._get_trace_struct(content[pos:pos + self._event_size])
                    # 1 means bp_start.
                    if next_event.tag_id == 1:
                        step_num += 1
                    # Step end tag id is 65535 in the new version.
                    if next_event.tag_id == 65535:
                        self._step_end_tag_id = next_event.tag_id
                    # We only search the first step to find if there is 65535 tag id.
                    if step_num == 2:
                        break
        except (IOError, OSError) as err:
            log.warning('Failed to read %s while getting the step end tag id: %s', source_file, err)
            raise ProfilerIOException
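For context, a hedged sketch of what the _get_trace_struct helper might do. The record layout assumed below (a 16-bit tag id followed by a 64-bit counter inside each 20-byte record) is an illustration only, not the documented format.

import struct
from collections import namedtuple

# Hypothetical decoder for one 20-byte trace record; the field layout is an
# assumption for illustration, not the real binary format.
TraceEvent = namedtuple('TraceEvent', ['tag_id', 'sys_count'])

def _get_trace_struct(raw_bytes):
    tag_id, sys_count = struct.unpack('<HQ', raw_bytes[:10])  # 2-byte tag + 8-byte counter
    return TraceEvent(tag_id, sys_count)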
Example 10
    def _write_timeline_data_into_file(self, timeline_data):
        """
        Write the timeline information into the file, including
            operator name, stream id, start time and duration.

        Args:
            timeline_data (list): The metadata to be written into the file.
                [
                    ['op_name_1', 'stream_id_1', 'start_time_1', 'duration_1'],
                    ['op_name_2', 'stream_id_2', 'start_time_2', 'duration_2'],
                    [...]
                ]
        """
        # sort by start time
        timeline_data.sort(key=lambda x: float(x[2]))
        filename = 'output_timeline_data_{}.txt'.format(self._device_id)
        file_path = os.path.join(self._output_path, filename)
        file_path = validate_and_normalize_path(file_path)

        # write to file
        try:
            with open(file_path, 'w') as f_obj:
                f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
                for timeline in timeline_data:
                    timeline = [str(item) for item in timeline]
                    f_obj.write(','.join(timeline) + '\n')
            os.chmod(file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing intermediate timeline file: %s', err)
            raise ProfilerIOException
Example 11
 def _parse_point_files(self):
     """Parse the framework point files."""
     for path in self._framework_path['point']:
         path = validate_and_normalize_path(path)
         with open(path, 'r') as file:
             for point_info in file:
                 infos = point_info.strip('\n').split(' ')
                 self._point_info[int(infos[0])] = infos[1]
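The split implies each line of a point file holds a numeric point id and a full op name separated by a single space; a hypothetical line and the mapping it produces:

# Hypothetical point-file line; format inferred from the split above.
line = "1 Default/network/Conv2D-op1\n"
infos = line.strip('\n').split(' ')
point_info = {int(infos[0]): infos[1]}  # {1: 'Default/network/Conv2D-op1'}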
Example 12
    def __init__(self,
                 subgraph='all',
                 is_detail=True,
                 is_show_op_path=False,
                 output_path='./data',
                 optypes_to_deal='',
                 optypes_not_deal='Variable',
                 job_id=""):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
        else:
            logger.warning(
                "The target dir already exists. "
                "There may be some old profiling data, and they will be rewrote in the end."
            )

        if self._device_target and self._device_target == "GPU":
            from mindspore._c_expression import GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)
        elif self._device_target and (self._device_target == "Ascend"
                                      or self._device_target != "Davinci"):
            self._container_path = os.path.join(
                self._base_profiling_container_path, self._dev_id)
            data_path = os.path.join(self._container_path, "data")
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)

            os.environ['PROFILING_MODE'] = 'true'
            os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
            os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
            os.environ['DEVICE_ID'] = self._dev_id
            os.environ['AICPU_PROFILING_MODE'] = 'true'
            os.environ['PROFILING_DIR'] = str(self._container_path)

            # use the context interface to enable profiling, for new MindSpore versions (after 2020.5.21)
            context.set_context(enable_profiling=True,
                                profiling_options="training_trace:task_trace")

            self._subgraph = check_subgraph(subgraph)
            self._valid_optype_name = optypes_to_deal.split(
                ",") if optypes_to_deal else []
            self._filt_optype_names = optypes_not_deal.split(
                ",") if optypes_not_deal else []
            self._detail = check_bool(is_detail, 'is_detail')
            self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
            self._profiling_job_id = job_id
            # add job id env through user input later
            self._job_id_env = 0
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d",
                        self._start_time)
Example 13
 def __init__(self, hwts_output_file, output_filename, op_task_info,
              output_path, device_id):
     hwts_output_file = validate_and_normalize_path(hwts_output_file)
     self._hwts_output_file = hwts_output_file
     self._output_filename = output_filename
     self._op_task_info = op_task_info
     self._output_path = output_path
     self._device_id = device_id
     self._min_cycle_counter = float("inf")
Example 14
    def _get_and_validate_path(self, file_name):
        """Generate op or activity file path from file name, and validate this path."""
        file_path = os.path.join(self._profiling_dir,
                                 file_name.format(self._device_id))
        file_path = validate_and_normalize_path(file_path)
        if not os.path.exists(file_path):
            logger.error(f"Failed to find parsed timeline file {file_path}.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        return file_path
Example 15
    def parse_minddata_aicpu_data(minddata_aicpu_source_path):
        """
        Parse the minddata get_next info, which contains queue size and execution time.

        Args:
            minddata_aicpu_source_path (str): the source file path.

        Returns:
            list[Union[str, float]], the converted data.
        """
        result = list()
        try:
            minddata_aicpu_source_path = validate_and_normalize_path(
                minddata_aicpu_source_path)
            with open(minddata_aicpu_source_path) as source_data_file:
                source_data = source_data_file.read()
                step_data = source_data.split("\x00")
                for one_step in step_data:
                    if one_step:
                        node_info = one_step.split(", ")
                        node_name, node_start, node_end, queue_size = "", 0, 0, 0
                        if node_info:
                            node_name = node_info[0].replace("Node:", "")

                        if len(node_info) > 3 and "queue" in node_info[1]:
                            queue_size = node_info[1].replace(
                                "queue size:", "")
                            queue_size = int(queue_size) if queue_size.isdigit(
                            ) else queue_size
                            node_start = node_info[2].replace("Run start:", "")
                            node_start = int(node_start) if node_start.isdigit(
                            ) else node_start
                            node_end = node_info[3].replace("Run end:", "")
                            node_end = int(
                                node_end) if node_end.isdigit() else node_end
                        elif len(node_info) > 3 and "Run" in node_info[1]:
                            queue_size = node_info[3].replace(
                                "queue size:", "")
                            queue_size = int(queue_size) if queue_size.isdigit(
                            ) else queue_size
                            node_start = node_info[1].replace("Run start:", "")
                            node_start = int(node_start) if node_start.isdigit(
                            ) else node_start
                            node_end = node_info[2].replace("Run end:", "")
                            node_end = int(
                                node_end) if node_end.isdigit() else node_end

                        one_step_list = [
                            node_name, node_start, node_end, queue_size
                        ]
                        result.append(one_step_list)
        except OSError:
            logger.error("Open get_next profiling file error.")

        return result
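The record layout can be read off the replace() calls: records are separated by '\x00', and each record carries a node name, queue size, and run start/end. A hypothetical record (invented values) and its field extraction:

# Hypothetical record, matching the "queue size first" branch above.
one_step = "Node:GetNext, queue size:2, Run start:1000, Run end:1500"
node_info = one_step.split(", ")
node_name = node_info[0].replace("Node:", "")               # 'GetNext'
queue_size = int(node_info[1].replace("queue size:", ""))   # 2
node_start = int(node_info[2].replace("Run start:", ""))    # 1000
node_end = int(node_info[3].replace("Run end:", ""))        # 1500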
Example 16
 def _load_point_info(self):
     """Load point info."""
     file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json')
     file_path = validate_and_normalize_path(file_path)
     if os.path.isfile(file_path):
         with open(file_path, 'r', encoding='utf-8') as file:
             try:
                 self._point_info = json.load(file)
             except (json.JSONDecodeError, TypeError) as err:
                 logger.warning(err)
                  raise ProfilerRawFileException('Failed to parse the point info file.')
Example 17
    def _write_memory_files(self, filename, content):
        """Write the summary and top breakdowns of memory usage."""
        file_path = os.path.join(self._profiling_dir, filename)
        file_path = validate_and_normalize_path(file_path)

        try:
            with open(file_path, 'w') as json_file:
                json.dump(content, json_file)
                os.chmod(file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Failed to write the memory file.\n%s', err)
            raise ProfilerIOException
Example 18
    def _get_file_path(self):
        """Get the proto file path."""
        file_path = os.path.join(self._profiling_dir,
                                 self._proto_file_path.format(self._device_id))
        file_path = validate_and_normalize_path(file_path)

        if not os.path.exists(file_path):
            msg = 'The memory file does not exist!'
            logger.error(msg)
            raise ProfilerFileNotFoundException(msg=msg)

        return file_path
Example 19
    def _aicore_detail_data_load(self):
        """Load data according to the parsed AICORE operator file."""
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_info.format(self._device_id))
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework.format(self._device_id))
        op_detail_file_path = validate_and_normalize_path(op_detail_file_path)
        framework_file_path = validate_and_normalize_path(framework_file_path)
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.',
                           op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.',
                           framework_file_path)
            return

        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                framework_infos[info[3]] = [
                    info[3], info[4], info[5], info[6],
                    json.loads(info[7]) if info[7] else None
                ]

        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                framework_info = framework_infos.get(info[0])
                self._aicore_detail_data.append([
                    framework_info[1], framework_info[2],
                    float(info[1]), framework_info[3], framework_info[0],
                    framework_info[4]
                ])
        del framework_infos
Example 20
    def _parse(self, source_file):
        """Parse source step trace files."""
        log.info("Start to parse step trace file.")
        fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
        reduce_start = 4
        start_time, end_time = 0, 1
        STEP_TRACE_POINT_COUNT = 3

        source_file = validate_and_normalize_path(source_file)
        try:
            with open(source_file, 'r') as f:
                lines = f.readlines()
                if len(lines) < STEP_TRACE_POINT_COUNT:
                    raise ProfilerRawFileException(
                        f"Failed to parse {source_file} file. The FP_POINT/BP_POINT/ITER_END_POINT "
                        f"do not recognized correctly. Try to set the environment variable'PROFILING_FP_START' "
                        f"and 'PROFILING_BP_END' to solve this problem. For example, "
                        f"'export PROFILING_FP_START=Default/xxx/Conv2d-op1' ")
                step_trace_info_all = [line.strip().split()[1:] for line in lines]
                num_of_step = len(step_trace_info_all[0])
                for step_trace_point in step_trace_info_all:
                    if len(step_trace_point) != num_of_step:
                        raise ProfilerRawFileException(
                            f"Failed to parse {source_file} file. Due to the profiled "
                            f"step_num of FP/BP/ITER_END Point are not equal")
                iter_start_info = [step_trace_info_all[fp_start][0]] + \
                    step_trace_info_all[iter_end][:num_of_step]
                step_trace_info_all.insert(iter_start, iter_start_info)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s: %s', source_file, err)
            raise ProfilerIOException

        for step_num in range(num_of_step):
            step_trace = {
                'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
                'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
                'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
                'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
                'reduce': {}
            }
            num_of_step_point = len(step_trace_info_all)
            if num_of_step_point > reduce_start:
                reduce_info = {}
                reduce_time_info = []
                for reduce_idx in range(reduce_start, num_of_step_point):
                    cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                    reduce_time_info += cur_reduce_time.split(',')
                reduce_info['ops'] = reduce_time_info
                step_trace['reduce'] = reduce_info
            self._record_trace_event(step_trace)
        self._record_average_info()
        log.info("Finish to parse step trace file.")
Example 21
    def _get_file_path(self):
        """Get the proto file path."""
        file_path = os.path.join(self._profiling_dir,
                                 self._proto_file_path.format(self._device_id))
        file_path = validate_and_normalize_path(file_path)

        if not os.path.exists(file_path):
            logger.warning(
                'The memory file does not exist! Please ignore the warning '
                'if you are running heterogeneous training.')
            raise ProfilerFileNotFoundException(msg=file_path)

        return file_path
Example 22
 def _aicore_trace_data_load(self):
     """Load data according to the parsed AICORE operator types file."""
     file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
     if not file_path:
         logger.error("Failed to find parsed trace time file.")
         raise ProfilerFileNotFoundException('parsed step trace time file')
     file_path = validate_and_normalize_path(file_path)
     with open(file_path, 'r') as handle:
         csv_reader = csv.reader(handle)
         self.__column__ = next(csv_reader)
         self._aicore_trace_data = list(csv_reader)
     self._size = len(self._aicore_trace_data) - 1
     self._load_point_info()
Example 23
    def _parse(self, source_file):
        """Parse source step trace files."""
        log.info("Start to parse step trace file.")
        fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
        reduce_start = 4
        start_time, end_time = 0, 1

        source_file = validate_and_normalize_path(source_file)
        try:
            with open(source_file, 'r') as f:
                lines = f.readlines()
                step_trace_info_all = [
                    line.strip().split()[1:] for line in lines
                ]
                num_of_step = len(step_trace_info_all[0])
                iter_start_info = [step_trace_info_all[fp_start][0]] + \
                    step_trace_info_all[iter_end][:num_of_step]
                step_trace_info_all.insert(iter_start, iter_start_info)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s: %s', source_file, err)
            raise ProfilerIOException

        for step_num in range(num_of_step):
            step_trace = {
                'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
                'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
                'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
                'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
                'reduce': {}
            }
            num_of_step_point = len(step_trace_info_all)
            if num_of_step_point > reduce_start:
                reduce_info = {}
                reduce_time_info = []
                for reduce_idx in range(reduce_start, num_of_step_point):
                    cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                    reduce_time_info += cur_reduce_time.split(',')
                reduce_info['ops'] = reduce_time_info
                step_trace['reduce'] = reduce_info
            self._record_trace_event(step_trace)
        self._record_average_info()
        log.info("Finish to parse step trace file.")
Example 24
    def _analyse_step_trace(self, source_path, framework_parser):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{self._dev_id}_detail_time.csv')
        point_info_file_path = os.path.join(self._output_path,
                                            'step_trace_point_info.json')
        step_trace_intermediate_file_path = validate_and_normalize_path(
            step_trace_intermediate_file_path)
        point_info_file_path = validate_and_normalize_path(
            point_info_file_path)
        # whether to skip the first step
        skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
        point_info = framework_parser.point_info
        # parse the step trace files and save the result to disk
        source_path = validate_and_normalize_path(source_path)
        parser = StepTraceParser(
            input_dir=source_path,
            output_file_path=step_trace_intermediate_file_path,
            job_id=self._job_id_env,
            skip_first_step=skip_first_step_flag)
        parser.update_tag_op_type_map(point_info)
        parser.parse_and_save()
        point_info = parser.record_point_info(point_info, point_info_file_path)
        # print parser result
        parser.show()
        logger.info("Finish saving the intermediate result: %s",
                    step_trace_intermediate_file_path)
        logger.info("The point info is: %s", point_info)
Example 25
 def _parse(self, source_files):
     """Parse source step trace files."""
     log.info("Start to parse step trace file.")
     event_info = {}
     for source_file in source_files:
         source_file = validate_and_normalize_path(source_file)
         with open(source_file, 'rb') as handler:
             content = handler.read()
             for step_trace in self._get_next_step_trace(content, event_info):
                 if self._skip_first_step:
                     self._skip_first_step = False
                     continue
                 self._record_trace_event(step_trace)
     self._record_average_info()
     log.info("Finish to parse step trace file.")
Example 26
    def _aicore_data_load(self):
        """Load data according to the parsed AICORE operator types file."""
        op_type_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_type_time.format(self._device_id)
        )
        op_type_file_path = validate_and_normalize_path(op_type_file_path)
        if not os.path.isfile(op_type_file_path):
            logger.warning('The file <%s> does not exist.', op_type_file_path)
            return

        with open(op_type_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                self._aicore_data.append([info[0], float(info[1]), int(info[2]), float(info[3])])
Example 27
    def write_timeline_summary(self):
        """Write timeline summary to json."""
        timeline_summary_file_path = os.path.join(
            self._profiling_dir,
            self._timeline_summary_filename.format(self._device_id)
        )

        timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)

        try:
            with open(timeline_summary_file_path, 'w') as json_file:
                json.dump(self._timeline_summary, json_file)
            os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing the timeline summary file: %s', err)
            raise ProfilerIOException
Example 28
    def _parse_task_files(self):
        """Parse the framework task files."""
        for path in self._framework_path['task']:
            path = validate_and_normalize_path(path)
            with open(path, 'r') as file:
                for task_info in file:
                    infos = task_info.strip('\n').split(' ')
                    infos = infos[1:] if len(infos) == 5 else infos
                    # key is the op name; the value is [task id, stream id, block_dim]
                    self._task_cache[infos[0]] = [infos[2], infos[3], infos[1]]

                    # if the task id is less than the task id threshold, the
                    # stream id and task id correspond to an operator
                    task_id = infos[2]
                    if int(task_id) < self._task_id_threshold:
                        task_id = '_'.join([infos[3], task_id])
                    self._task_id_full_op_name_dict[task_id] = infos[0]
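The indexing implies a four-field task-file line of the form op name, block_dim, task id, stream id (with an extra leading field dropped when five fields are present); a hypothetical line:

# Hypothetical task-file line; field order inferred from the indexing above:
# "<op name> <block_dim> <task id> <stream id>"
task_info = "Default/network/Conv2D-op1 32 48 7\n"
infos = task_info.strip('\n').split(' ')
task_cache = {infos[0]: [infos[2], infos[3], infos[1]]}
# {'Default/network/Conv2D-op1': ['48', '7', '32']}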
Example 29
    def __init__(self,
                 subgraph='all',
                 is_detail=True,
                 is_show_op_path=False,
                 output_path='./data',
                 optypes_to_deal='',
                 optypes_not_deal='Variable',
                 job_id=""):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        self._container_path = os.path.join(
            self._base_profiling_container_path, self._dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
        os.environ['DEVICE_ID'] = self._dev_id
        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)

        # use the context interface to enable profiling, for new MindSpore versions (after 2020.5.21)
        context.set_context(enable_profiling=True,
                            profiling_options="training_trace:task_trace")

        self._subgraph = check_subgraph(subgraph)
        self._valid_optype_name = optypes_to_deal.split(
            ",") if optypes_to_deal else []
        self._filt_optype_names = optypes_not_deal.split(
            ",") if optypes_not_deal else []
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        # add job id env through user input later
        self._job_id_env = 0
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
Example 30
    def _get_save_path(self, output_path):
        """
        Get the save path.

        Args:
            output_path (str): The output dir.

        Returns:
            str, the save path.
        """
        try:
            output_dir = validate_and_normalize_path(output_path)
        except ValidationError:
            logger.warning('Output path is invalid.')
            raise ProfilerPathErrorException('Output path is invalid.')
        if not os.path.isdir(output_dir):
            logger.warning('The output dir <%s> not found.', output_dir)
            raise ProfilerDirNotFoundException(output_dir)
        return os.path.join(
            output_dir,
            self._parsed_pipeline_file_name.format(self._device_id))