def get_timeline_summary(self):
        """
        Get timeline summary information for UI display.

        Returns:
            json, the content of timeline summary information.
        """
        summary_filename = self._timeline_summary_filename.format(self._device_id)
        file_path = os.path.join(self._profiling_dir, summary_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline summary path.'
        )

        timeline_summary = {}
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when reading timeline summary file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline summary file. Please check the output path.')

        return timeline_summary
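A minimal usage sketch for the method above, assuming it lives on the 'timeline' analyser obtained from AnalyserFactory as in the other examples below; profiling_dir and device_id are placeholder values.

# Hedged usage sketch; profiling_dir and device_id are placeholders.
analyser = AnalyserFactory.instance().get_analyser(
    'timeline', profiling_dir, device_id)
summary = analyser.get_timeline_summary()   # returns {} if no summary file exists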
Example #2
    def analyze(self, options=None):
        """
        Get the proposal from proposer.

        Args:
            options (dict): options for proposer analysis.
                - step_trace: optional parameters for step trace. The dictionary key is iter_interval,
                  which is used to get the analyser options for the iteration interval time.

        Returns:
            dict, the proposal from the proposer instance. The dictionary key is a language
            internationalization label, and the value is used to format the corresponding language
            internationalization string.

        Examples:
            >>> proposer_type = 'step_trace'
            >>> proposer = ProposerFactory.instance().get_proposer(proposer_type, self.profiling_dir, self.device_id)
            >>> result = proposer.analyze(options)
        """
        logger.info("The StepTraceProposer is running")

        options = get_options(options)
        logger.debug("The StepTraceProposer 'options' is %s", str(options))
        step_trace_condition = options.get("step_trace", {})
        # Get the proposals of iteration interval.
        self._iter_interval_analyze(step_trace_condition)
        return self.__proposal_dict
Example #3
def get_timeline_detail():
    """
    Get timeline detail.

    Returns:
        Response, the detail information of timeline.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    if not os.path.exists(profiler_dir):
        raise ProfilerDirNotFoundException(msg=summary_dir)
    device_id = request.args.get("device_id", default='0')
    _ = to_int(device_id, 'device_id')
    device_type = request.args.get("device_type", default='ascend')
    if device_type not in ['gpu', 'ascend']:
        logger.info(
            "Invalid device_type, device_type should be gpu or ascend.")
        raise ParamValueError("Invalid device_type.")

    analyser = AnalyserFactory.instance().get_analyser('timeline',
                                                       profiler_dir, device_id)
    timeline = analyser.get_display_timeline(device_type)

    return jsonify(timeline)
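For illustration, a client-side call to this endpoint could look like the sketch below. The host in the URL is the placeholder from the docstring, and the use of the requests package is an assumption, not part of this module.

import requests

# Hypothetical request; 'dir', 'device_id' and 'device_type' mirror the
# query parameters read by get_timeline_detail().
resp = requests.get(
    "http://xxxx/v1/mindinsight/profile/timeline",
    params={"dir": "./summary", "device_id": "0", "device_type": "ascend"})
timeline = resp.json()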
Example #4
    def get_display_timeline(self, device_type):
        """
        Get timeline data for UI display.

        Args:
            device_type (str): The device type, should be 'ascend' or 'gpu'.

        Returns:
            json, the content of timeline data.
        """
        if device_type == "ascend":
            display_filename = self._ascend_display_filename.format(
                self._device_id)
        elif device_type == "gpu":
            display_filename = self._gpu_display_filename.format(
                self._device_id)
        else:
            logger.info(
                'device type should be ascend or gpu. Please check the device type.'
            )
            raise ParamValueError("Invalid device_type.")
        file_path = os.path.join(self._profiling_dir, display_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
    def get_display_timeline(self):
        """
        Get timeline data for UI display.

        Returns:
            json, the content of timeline data.
        """
        # Search timeline json file under profiling dir.
        display_filename = self._display_filename.format(self._device_id)
        # Check if there is a timeline json file for display
        file_path = os.path.join(self._profiling_dir, display_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.'
        )

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError) as err:
                logger.error('Error occurred when reading timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
Example #6
    def _analyse_timeline(self):
        """
        Analyse and parse timeline info.
        """
        timeline_analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self._output_path, self._dev_id
        )
        min_cycle_counter = timeline_analyser.get_min_cycle_counter()

        # Get framework info
        aicoredetail_analyser = AnalyserFactory.instance().get_analyser(
            'aicore_detail', self._output_path, self._dev_id
        )
        framework_info = aicoredetail_analyser.query()

        # Get all reduce info
        step_trace_analyser = AnalyserFactory.instance().get_analyser(
            'step_trace', self._output_path, self._dev_id
        )
        all_reduce_info = step_trace_analyser.query_for_all_reduce(min_cycle_counter)

        # Get timeline info
        logger.info('Start writing timeline info...')
        logger.info('Note: this step could take a few minutes if you are training '
                    'with a complex network or for more than 10 steps.')
        # Add AllReduce and framework info into timeline
        timeline_analyser.init_timeline(all_reduce_info, framework_info)
        timeline_analyser.write_timeline()
        timeline_analyser.write_timeline_summary()
    def init_timeline(self, all_reduce_info, framework_info):
        """
        Init timeline metadata, adding all collected info.

        Args:
            all_reduce_info (list[list]): The metadata of AllReduce operator.
            framework_info (dict): The framework metadata.
        """
        logger.info('Initiating timeline...')
        timeline_list = self._load_timeline_data()
        self._timeline_summary['op_exe_times'] = len(timeline_list)

        # Add AllReduce info to timeline temp list and sort by start time.
        if all_reduce_info:
            logger.debug('AllReduce info found. Start adding info into timeline...')
            timeline_list.extend(all_reduce_info)
            timeline_list.sort(key=lambda x: float(x[2]))

        # Init a dict for counting the num of streams.
        stream_count_dict = {}
        for timeline in timeline_list:
            self._parse_timeline_data(timeline)
            # Updating the collection of streams.
            if len(timeline) == 4:
                self._update_num_of_streams(timeline, stream_count_dict)

        # Get framework metadata.
        framework_obj_list = framework_info.get('object')
        # The length of list is the number of operators.
        self._timeline_summary['num_of_ops'] = len(framework_obj_list)
        self._add_framework_info(framework_obj_list)
        logger.info('Finished adding info into timeline...')

        # Update timeline summary info
        self._timeline_summary['num_of_streams'] = len(stream_count_dict.keys())
    def get_min_cycle_counter(self):
        """
        Get minimum cycle counter.

        Returns:
            float, the minimum value of the cycle counter.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id)
        )

        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid min cycle counter file path.'
        )

        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    min_cycle_counter = f_obj.read()
                    min_cycle_counter = float(min_cycle_counter) \
                        if min_cycle_counter != 'inf' else 0
            except (IOError, OSError) as err:
                logger.error('Error occurred when reading minimum cycle counter: %s', err)
                raise ProfilerIOException
        else:
            min_cycle_counter = 0
            logger.info("No min cycle counter recorded.")

        return min_cycle_counter
Example #9
    def _get_reduce_time_in_order(self, row_info_dict):
        """
        Get reduce time in order.

        Args:
            row_info_dict (dict): Step trace information.

        Returns:
            dict, sorted reduce information. The reduce info is format like:
            {stream_id: List[Tuple(start_point, end_point, duration, field_name)]}
        """
        reduce_info = {}
        reduce_fields = [
            field_name for field_name in self.__column__
            if field_name.startswith('stream_')
            and not field_name.endswith('point')
        ]
        for reduce_field in reduce_fields:
            reduce_start = row_info_dict.get(reduce_field + '_start_point', 0)
            reduce_end = row_info_dict.get(reduce_field + '_end_point', 0)
            reduce_duration = row_info_dict.get(reduce_field, 0)
            if not (reduce_start and reduce_end and reduce_duration):
                log.info("Reduce event missing value.")
                continue
            cur_stream_id = reduce_field.split('_', 2)[1]
            cur_stream = reduce_info.get(cur_stream_id)
            if not cur_stream:
                cur_stream = []
                reduce_info[cur_stream_id] = cur_stream
            cur_stream.append(
                (reduce_start, reduce_end, reduce_duration, reduce_field))
        for _, reduce_events in reduce_info.items():
            reduce_events.sort(key=lambda elem: elem[1])
        return reduce_info
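A small worked example of the grouping above; the field names are hypothetical, only their stream_<id>_... shape matters.

# Hypothetical step trace row for _get_reduce_time_in_order().
row_info_dict = {
    'stream_35_0': 120, 'stream_35_0_start_point': 1000, 'stream_35_0_end_point': 1120,
    'stream_36_0': 80, 'stream_36_0_start_point': 1010, 'stream_36_0_end_point': 1090,
}
# Expected result, grouped by stream id and sorted by end point:
# {'35': [(1000, 1120, 120, 'stream_35_0')],
#  '36': [(1010, 1090, 80, 'stream_36_0')]}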
Example #10
    def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
                 optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
        # get device_id and device_target
        device_target = ""
        try:
            import mindspore.context as context
            dev_id = str(context.get_context("device_id"))
            device_target = context.get_context("device_target")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            logger.error("Profiling: fail to get context, %s", err.message)

        if not dev_id:
            dev_id = os.getenv('DEVICE_ID')
        if not dev_id:
            dev_id = "0"
            logger.error("Fail to get DEVICE_ID, use 0 instead.")

        if device_target and device_target != "Davinci" \
            and device_target != "Ascend":
            msg = ("Profiling: unsupport backend: %s" \
                  % device_target)
            raise RuntimeError(msg)

        self._dev_id = dev_id
        self._container_path = os.path.join(self._base_profiling_container_path, dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        self._output_path = validate_and_normalize_path(output_path,
                                                        'Profiler output path (' + output_path + ')')
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
        try:
            import mindspore.context as context
            context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            logger.error("Profiling: fail to set context, %s", err.message)

        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)
        self._subgraph = check_subgraph(subgraph)
        self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
    def get_proposal(self, options=None):
        """
        Get compose proposals.

        Args:
            options (dict): options for composed proposal.
                - compose_proposal_result: execution results of the proposers that have already run.
                - step_trace: optional parameters for step trace. The dictionary key is iter_interval,
                  which is used to get the analyser options for the iteration interval time.

        Returns:
            dict, the proposals from multiple different proposers.

        Examples:
            >>> type_list = ['common', 'step_trace']
            >>> condition = {"filter_condition": {'mode': "proc", "proc_name": "iteration_interval"}}
            >>> options = {'step_trace': {"iter_interval": condition}}
            >>> cp = ComposeProposal(self.profiling_dir, self.device_id, type_list)
            >>> result_proposal = cp.get_proposal(options=options)
        """
        logger.info("The ComposeProposal is running")
        options = get_options(options)
        logger.debug("The 'options' is %s", str(options))
        # The flag whether to write category label.
        type_label_flag = options.get("type_label_flag", True)
        compose_proposal_result = OrderedDict()
        logger.debug("The 'compose_proposer_type_list' is %s",
                     str(self.compose_proposer_type_list))
        for proposer_type in self.compose_proposer_type_list:
            proposer = ProposerFactory.instance().get_proposer(
                proposer_type, self.profiling_path, self.device_id)

            if proposer is None:
                continue

            # Write the result of proposals to option for other proposer to get.
            options["compose_proposal_result"] = compose_proposal_result

            result = proposer.analyze(options)
            # Insert category label.
            if result and type_label_flag:
                proposer_type_label = proposer_type + "-type_label"
                # Get the name of the category label, the default is the same as the proposer type.
                type_label_name = options.get(proposer_type_label,
                                              proposer_type)
                # Add postfix to category label name
                type_proposal_label = type_label_name + self.type_label_postfix
                compose_proposal_result[type_proposal_label] = None
                # Merge results to the proposals dictionary.
                compose_proposal_result.update(result)
            elif result and not type_label_flag:
                # Merge results to the proposals dictionary.
                compose_proposal_result.update(result)

        logger.debug("The 'compose_proposal_result' is %s",
                     str(compose_proposal_result))
        return compose_proposal_result
Example #12
 def _save(self):
     log.info("Start to save step trace file.")
     if not self._header:
         return
     with open(self._output_path, 'w') as file_handle:
         csv_writer = csv.writer(file_handle)
         csv_writer.writerow(self._header)
         for row_data in self._result:
             csv_writer.writerow(row_data)
     os.chmod(self._output_path, stat.S_IREAD)
Example #13
 def parse_and_save(self):
     """Parse step trace files and save the result."""
     try:
         source_files = self._get_step_trace_files()
         self._parse(source_files)
         self._save()
     except IOError as err:
         log.exception(err)
         raise ProfilerIOException()
     else:
         log.info("Finish to save intermediate result for step trace file.")
    def execute(self):
        """Execute the parser, get result data, and write it to the output file."""

        if not os.path.exists(self._source_file_name):
            logger.info("Did not find the aicpu profiling source file")
            return

        with open(self._source_file_name, 'rb') as ai_cpu_data:
            ai_cpu_str = str(ai_cpu_data.read().replace(b'\n\x00', b' ___ ')
                             .replace(b'\x00', b' ___ '))[2:-1]
            ai_cpu_lines = ai_cpu_str.split(" ___ ")

        node_list = list()
        ai_cpu_total_time_summary = 0
        # node serial number
        serial_number = 1
        for i in range(len(ai_cpu_lines)-1):
            node_line = ai_cpu_lines[i]
            thread_line = ai_cpu_lines[i+1]
            if "Node" in node_line and "Thread" in thread_line:
                # get the node data from node_line
                node_name = node_line.split(',')[0].split(':')[-1]
                run_v2_start = node_line.split(',')[1].split(':')[-1]
                compute_start = node_line.split(',')[2].split(':')[-1]
                mercy_start = node_line.split(',')[3].split(':')[-1]
                mercy_end = node_line.split(',')[4].split(':')[-1]
                run_v2_end = node_line.split(',')[5].split(':')[-1]
                # get total_time and dispatch_time from thread line
                total_time = thread_line.split(',')[-1].split('=')[-1].split()[0]
                dispatch_time = thread_line.split(',')[-2].split('=')[-1].split()[0]

                node_data = [serial_number, node_name, total_time, dispatch_time, run_v2_start, compute_start,
                             mercy_start, mercy_end, run_v2_end]

                node_list.append(node_data)
                # calculate the total time
                ai_cpu_total_time_summary += int(total_time)
                # increase node serial number
                serial_number += 1
            elif "Node" in node_line and "Thread" not in thread_line:
                node_name = node_line.split(',')[0].split(':')[-1]
                logger.warning("The node:%s cannot find thread data", node_name)

        node_list.append(["AI CPU Total Time(us):", ai_cpu_total_time_summary])

        if node_list:
            fwrite_format(self._output_filename, data_source=_dst_file_title, is_print=True,
                          is_start=True)
            fwrite_format(self._output_filename,
                          data_source=tabulate(node_list, _dst_file_column_title,
                                               tablefmt='simple'),
                          is_start=True, is_print=True)
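The string splitting above assumes raw lines roughly shaped like the hypothetical samples below; the field names are made up, only the comma/colon/equals layout matters for the split logic in execute().

# Illustrative (hypothetical) raw lines and the corresponding splits.
node_line = ("Node:GetNext-op1,RunStart:100,ComputeStart:110,"
             "MemcpyStart:120,MemcpyEnd:130,RunEnd:140")
thread_line = "Thread:123,dispatch_time=2 us,total_time=40 us"

node_name = node_line.split(',')[0].split(':')[-1]                     # 'GetNext-op1'
dispatch_time = thread_line.split(',')[-2].split('=')[-1].split()[0]   # '2'
total_time = thread_line.split(',')[-1].split('=')[-1].split()[0]      # '40'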
Example #15
    def analyze(self, options=None):
        """
        Get the proposal from proposer.

        Args:
            options (dict): Options for the proposer analysis. Default: None.

        Returns:
            dict, the proposal from proposer instance.
        """
        logger.info('The CommonProposer is running')
        options = get_options(options)
        return self.__proposal_dict
Example #16
 def _parse(self, source_files):
     """Parse source step trace files."""
     log.info("Start to parse step trace file.")
     event_info = {}
     for source_file in source_files:
         with open(source_file, 'rb') as handler:
             content = handler.read()
             for step_trace in self._get_next_step_trace(
                     content, event_info):
                 if self._skip_first_step:
                     self._skip_first_step = False
                     continue
                 self._record_trace_event(step_trace)
     self._record_average_info()
     log.info("Finish to parse step trace file.")
Example #17
    def _get_profiling_job_id(self):
        """Get profiling job id, which was generated by ada service.

        Returns:
            str, the profiling job id.
        """

        if self._profiling_job_id:
            return self._profiling_job_id

        job_id = ""
        cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'"
        r = os.popen(cmd)
        profiling_job_dirs = r.readlines()
        r.close()
        for item in profiling_job_dirs:
            path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip())
            log_file = get_file_names(path, "host_start.log")
            if not log_file:
                logger.error("Profiling: job path %s, host_start.log not exist.", path)
                continue

            log_file = os.path.join(path, log_file[0])
            item_dict = self._parse_host_start_log(log_file)

            if not item_dict:
                logger.error("Profiling: job path %s, fail to get job start info.", path)
                continue
            if self._start_time > int(item_dict["start_time"]):
                logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
                            path, item_dict["start_time"], self._start_time)
                break

            if self._dev_id != item_dict["device_id"]:
                logger.info("Profiling: job path %s, dev id %s, training device id %s.",
                            path, item_dict["device_id"], self._dev_id)
                continue

            job_id = item.strip()
            break

        if not job_id:
            msg = ("Fail to get profiling job, please check whether job dir was generated")
            raise RuntimeError(msg)

        return job_id
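The shell pipeline above (`ls -t ... | grep JOB`) can also be sketched in pure Python; this is an illustrative alternative, not what the method actually runs.

import os

# Hedged equivalent of `ls -t PROFILING_LOG_BASE_PATH | grep JOB`:
# entries containing 'JOB', newest first.
job_dirs = [d for d in os.listdir(PROFILING_LOG_BASE_PATH) if 'JOB' in d]
job_dirs.sort(
    key=lambda d: os.path.getmtime(os.path.join(PROFILING_LOG_BASE_PATH, d)),
    reverse=True)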
Example #18
 def _record_average_info(self):
     """Calculate average info."""
     result_size = len(self._result)
     # calculate average data for each column in result data
     average_data = [0] * len(self._header)
     if result_size >= 2:
         for row_info in self._result[1:]:
             average_data = [
                 Decimal(i) + Decimal(j)
                 for i, j in zip(row_info, average_data)
             ]
         average_data = [
             round((item / (result_size - 1))) for item in average_data
         ]
         # change step num info in average_data to None
         step_num_index = self._header.index('step_num')
         average_data[step_num_index] = '-'
     self._result.append(average_data)
     log.info("Finish add average info for step trace.")
    def query(self, condition=None):
        """
        Query data according to the condition.

        Args:
            condition (dict): The search condition, only contains `filter_condition` parameter.
                Default: None.

        Returns:
            dict, the result after filtered, sorted and grouped.
        """
        if condition is None:
            condition = {}
        filter_condition = condition.get('filter_condition', {})
        log.info("Receive query request. %s", filter_condition)
        self._validate_filter_condition(filter_condition)
        self._result = {'size': self._size}
        self._filter(filter_condition)

        return self._result
Example #20
    def get_display_timeline(self, device_type, scope_name_num):
        """
        Get timeline data for UI display.

        Args:
            device_type (str): The device type, should be 'ascend' or 'gpu'.
            scope_name_num (int): The number of scope name layers to keep in the timeline.

        Returns:
            json, the content of timeline data.
        """
        if device_type == "ascend":
            display_filename = self._ascend_display_filename.format(
                self._device_id)
        elif device_type == "gpu":
            display_filename = self._gpu_display_filename.format(
                self._device_id)
        else:
            logger.info(
                'device type should be ascend or gpu. Please check the device type.'
            )
            raise ParamValueError("Invalid device_type.")
        file_path = os.path.join(self._profiling_dir, display_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
                    for idx, time_item in enumerate(timeline):
                        if time_item["tid"] == 100001 and \
                                time_item["ph"] != "M" and \
                                int(time_item["scope_level"]) >= int(scope_name_num):
                            timeline[idx] = None
                    timeline = list(filter(lambda x: x, timeline))
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading timeline display file: %s', err)
                raise ProfilerIOException()
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
    def _sort_reduce_by_time(self, row_info_dict):
        """
        Sort reduce info by time.

        Args:
            row_info_dict (dict): Step trace information.

        Returns:
            list, including the all reduce info sorted by start time only.
            [
                [reduce_field, stream_id, reduce_start, reduce_duration, reduce_pid],
                [...],
                [...]
            ]
        """
        factor = 1e5  # convert time unit from 10ns to 1ms
        reduce_pid = 10000
        reduce_info = []
        reduce_fields = [
            field_name for field_name in self.__column__
            if field_name.startswith('stream_')
            and not field_name.endswith('point')
        ]
        for reduce_field in reduce_fields:
            reduce_start = row_info_dict.get(reduce_field + '_start_point')
            reduce_start = reduce_start / factor \
                if reduce_start else 0
            reduce_duration = row_info_dict.get(reduce_field)
            reduce_duration = reduce_duration / factor if reduce_duration else 0
            if not (reduce_start and reduce_duration):
                log.info("Reduce event missing value.")
                continue
            cur_stream_id = reduce_field.split('_', 2)[1]
            reduce_meta = [
                reduce_field,
                int(cur_stream_id), reduce_start, reduce_duration, reduce_pid
            ]
            reduce_info.append(reduce_meta)

        return reduce_info
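The 1e5 factor above reflects counter values recorded in 10 ns units; dividing by 1e5 converts them to milliseconds, for example:

factor = 1e5                                  # 10 ns units -> ms
reduce_start_raw = 250000                     # 250000 * 10 ns = 2.5 ms
reduce_start_ms = reduce_start_raw / factor   # 2.5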
Example #22
    def _analyse_step_trace(self, source_path, framework_parser):
        """
        Analyse step trace data and save the result.

        Args:
            source_path (str): The directory that contains the step trace original data.
            framework_parser (FrameworkParser): The framework parse instance.
        """
        logger.info("Begin to parse step trace.")
        # construct output path
        step_trace_intermediate_file_path = os.path.join(
            self._output_path,
            f'step_trace_raw_{self._dev_id}_detail_time.csv'
        )
        point_info_file_path = os.path.join(
            self._output_path,
            'step_trace_point_info.json'
        )
        # whether keep the first step
        skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
        point_info = framework_parser.point_info
        # parser the step trace files and save the result to disk
        parser = StepTraceParser(input_dir=source_path,
                                 output_file_path=step_trace_intermediate_file_path,
                                 job_id=self._job_id_env,
                                 skip_first_step=skip_first_step_flag)
        parser.parse_and_save()
        point_info = parser.record_point_info(point_info, point_info_file_path)
        # print parser result
        parser.show()
        logger.info("Finish saving the intermediate result: %s", step_trace_intermediate_file_path)
        logger.info("The point info is: %s", point_info)
Example #23
    def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
                 optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)
        self._output_path = validate_and_normalize_path(output_path,
                                                        'Profiler output path (' + output_path + ')')
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
        os.environ['DEVICE_ID'] = self._dev_id
        # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
        try:
            import mindspore.context as context
            context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError:
            logger.error("Profiling: fail to set context enable_profiling")

        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)
        self._subgraph = check_subgraph(subgraph)
        self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        # add job id env through user input later
        self._job_id_env = 0
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
Example #24
    def add_framework_info(self, framework_info):
        """
        Add framework info into timeline metadata.

        Args:
            framework_info (dict): The framework metadata.
        """
        logger.info('Adding framework info...')
        framework_obj_list = framework_info.get('object')
        self._timeline_summary['num_of_ops'] = len(framework_obj_list)
        for framework_obj in framework_obj_list:
            op_name = framework_obj[0]
            op_type = framework_obj[1]
            op_full_name = framework_obj[4]
            op_info = framework_obj[5]
            for timeline_obj in self._timeline_meta:
                if op_full_name == timeline_obj.get('name'):
                    timeline_obj['name'] = op_name
                    timeline_obj['args'] = {
                        'type': op_type,
                        'fullname': op_full_name
                    }
                    timeline_obj['args'].update(op_info)
Example #25
    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training.

        Examples:
            >>> from mindinsight.profiler import Profiler
            >>> context.set_context(mode=context.GRAPH_MODE, device_target=“Ascend”,
            >>>                     device_id=int(os.environ["DEVICE_ID"]))
            >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
            >>> model = Model(train_network)
            >>> dataset = get_dataset()
            >>> model.train(2, dataset)
            >>> profiler.analyse()
        """

        try:
            from mindspore.communication.management import release
            release()
        except ImportError:
            logger.error("Profiling: fail to import release from mindspore.")

        logger.info("begin profiler analyse")

        job_id = self._get_profiling_job_id()
        if not job_id:
            msg = ("Fail to get profiling job, please check whether job dir was generated under path %s" \
                   % PROFILING_LOG_BASE_PATH)
            raise RuntimeError(msg)

        logger.info("Profiling: job id is %s ", job_id)

        source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        result = hwtslog_parser.execute()
        if not result:
            logger.error("Profiling: fail to parse hwts log file.")
            return

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return

        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
        optime_parser = OPComputeTimeParser(hwts_output_filename, opcompute_output_filename, op_task_dict)
        optime_parser.execute()

        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
        try:
            aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
            aicpu_data_parser.execute()
        except FileNotFoundError as err:
            logger.exception(err)

        # analyse op compute time info
        try:
            self._analyser_op_info()
        except MindInsightException as err:
            logger.error(err.message)
Example #26
    def analyse(self):
        """
        Collect and analyse performance data, called after training or during training.

        Examples:
            >>> from mindinsight.profiler import Profiler
            >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
            >>>                     device_id=int(os.environ["DEVICE_ID"]))
            >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
            >>> model = Model(train_network)
            >>> dataset = get_dataset()
            >>> model.train(2, dataset)
            >>> profiler.analyse()
        """

        try:
            from mindspore.communication.management import release
            release()
        except ImportError:
            logger.error("Profiling: fail to import release from mindspore.")

        job_id = self._get_profiling_job_id()
        logger.info("Profiling: job id is %s ", job_id)

        source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        result = hwtslog_parser.execute()
        if not result:
            logger.error("Profiling: fail to parse hwts log file.")
            return

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return

        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
        optime_parser = OPComputeTimeParser(
            hwts_output_filename, opcompute_output_filename,
            op_task_dict, self._output_path, self._dev_id
        )
        optime_parser.execute()

        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        aicpu_data_parser.execute()

        # Parsing minddata AICPU profiling
        MinddataParser.execute(source_path, self._output_path, self._dev_id)

        # parse minddata pipeline operator and queue
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
            pipeline_parser.parse()
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse op compute time info
        try:
            self._analyser_op_info()
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse step trace info
        try:
            self._analyse_step_trace(source_path, framework_parser)
        except MindInsightException as err:
            logger.warning(err.message)

        # analyse timeline info
        try:
            self._analyse_timeline()
        except (ProfilerIOException, ProfilerFileNotFoundException, ValidationError) as err:
            logger.warning('Fail to write timeline data: %s', err)
    def query(self, condition=None):
        """
        Query data according to the condition.

        Args:
            condition (dict): The search condition, only contains `filter_condition` parameter.
                Default: None.

        Returns:
            dict, the result after filtered, sorted and grouped.
        """
        if condition is None:
            condition = {}
        filter_condition = condition.get('filter_condition', {})
        log.info("Receive query request. %s", filter_condition)
        self._filter(filter_condition)
        result = dict()
        result["sampling_interval"] = self._data.get("sampling_interval")
        result["step_info"] = self._steps_info
        result["step_total_num"] = self._step_total_num
        result["cpu_processor_num"] = self._data.get("cpu_processor_num")
        # device average CPU utilization
        result["device_info"] = dict()
        for key in self._data.get("device_info").keys():
            arr = self._data.get("device_info")[key]
            avg_value = round(sum(arr) / len(arr)) if arr else 0
            result["device_info"][key] = {
                "metrics": arr,
                "avg_value": avg_value
            }

        # process average CPU utilization
        result["process_info"] = dict()
        for key in self._data.get("process_info").keys():
            arr = self._data.get("process_info")[key]
            avg_value = round(sum(arr) / len(arr)) if arr else 0
            result["process_info"][key] = {
                "metrics": arr,
                "avg_value": avg_value
            }

        # op average CPU utilization
        minddata_pipeline_op_info = self._get_minddata_pipeline_info()
        result["op_info"] = dict()
        result["op_info"]["op_list"] = []
        result["op_info"]["total_op_avg_value"] = dict()
        result["op_info"]["total_op_avg_value"]["user_utilization"] = 0
        result["op_info"]["total_op_avg_value"]["sys_utilization"] = 0

        for item in self._data.get("op_info"):
            # Filtering out non minddata pipeline operator
            if str(item.get("op_id")) == "-1":
                continue
            op_info_dict = dict()
            op_info_dict["metrics"] = dict()
            for key in item.get("metrics").keys():
                arr = item.get("metrics")[key]
                avg_value = round(sum(arr) / len(arr)) if arr else 0
                op_info_dict["metrics"][key] = {
                    "metrics": arr,
                    "avg_value": avg_value
                }
                result["op_info"]["total_op_avg_value"][key] += avg_value
            op_info_dict["op_id"] = item.get("op_id")
            op_info = [
                i for i in minddata_pipeline_op_info
                if i.get("op_id") == item.get("op_id")
            ]
            # op_info is like [{"num_workers":int,"op_id":int}]
            op_info_dict["num_workers"] = op_info[0].get("num_workers")
            result["op_info"]["op_list"].append(op_info_dict)
        return result
Example #28
 def write_timeline(self):
     """Load data according to the parsed profiling files."""
     # Write timeline to file.
     logger.info('Writing timeline file...')
     self.write_timeline_to_json_by_limitation()
     logger.info('Finished file writing!')
Example #29
    def execute(self):
        """
        Execute the parser, get result data, and write it to the output file.

        Returns:
            bool, whether succeed to analyse hwts log.
        """

        content_format = ['QIIIIIIIIIIII', 'QIIQIIIIIIII', 'IIIIQIIIIIIII']
        log_type = [
            'Start of task', 'End of task', 'Start of block', 'End of block',
            'Block PMU'
        ]

        result_data = ""

        with open(self._source_flie_name, 'rb') as hwts_data:
            while True:
                line = hwts_data.read(64)
                if line:
                    if not line.strip():
                        continue
                else:
                    break
                byte_first_four = struct.unpack('BBHHH', line[0:8])
                byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)
                ms_type = byte_first[-3:]
                is_warn_res0_ov = byte_first[4]
                cnt = int(byte_first[0:4], 2)
                core_id = byte_first_four[1]
                blk_id, task_id = byte_first_four[3], byte_first_four[4]
                if ms_type in ['000', '001', '010']:  # log type 0,1,2
                    result = struct.unpack(content_format[0], line[8:])
                    syscnt = result[0]
                    stream_id = result[1]
                elif ms_type == '011':  # log type 3
                    result = struct.unpack(content_format[1], line[8:])
                    syscnt = result[0]
                    stream_id = result[1]
                elif ms_type == '100':  # log type 4
                    result = struct.unpack(content_format[2], line[8:])
                    stream_id = result[2]
                    if is_warn_res0_ov == '0':
                        syscnt = result[4]
                    else:
                        syscnt = None
                else:
                    logger.info("Profiling: invalid hwts log record type %s",
                                ms_type)
                    continue

                if int(task_id) < 25000:
                    task_id = str(stream_id) + "_" + str(task_id)
                result_data += ("%-14s %-4s %-8s %-9s %-8s %-15s %s\n" %
                                (log_type[int(ms_type, 2)], cnt, core_id,
                                 blk_id, task_id, syscnt, stream_id))

        fwrite_format(self._output_filename,
                      data_source=self._dst_file_title,
                      is_start=True)
        fwrite_format(self._output_filename,
                      data_source=self._dst_file_column_title)
        fwrite_format(self._output_filename, data_source=result_data)

        return True
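For illustration, decoding the 8-byte header of one 64-byte HWTS record follows the same unpacking as above; the sample bytes are made up and the shown values assume a little-endian host.

import struct

header = bytes([0x21, 0x05, 0x00, 0x00, 0x2a, 0x00, 0x90, 0x01])
byte_first_four = struct.unpack('BBHHH', header)                 # (33, 5, 0, 42, 400)
byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)  # '00100001'
ms_type = byte_first[-3:]      # '001' -> 'End of task'
cnt = int(byte_first[0:4], 2)  # 2
core_id = byte_first_four[1]   # 5
blk_id, task_id = byte_first_four[3], byte_first_four[4]         # 42, 400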