Example #1
    def get_flops(self):
        """Get flops for each device."""
        flops_info_list = []
        max_flops = 0

        for host_map_ip, device_id, rank_id in self._host_device_rank_relation:
            host_dir = os.path.join(self._cluster_dir, host_map_ip, 'profiler')
            validate_and_normalize_path(
                host_dir,
                raise_key='Invalid host directory {}.'.format(host_map_ip))
            file_path = self._get_flops_file_for_each_device(
                host_dir, device_id)

            # Forward compatible: if the flops file does not exist, return empty data.
            if not os.path.exists(file_path):
                flops_info_list = []
                break

            file_content = self._get_file_content(file_path)
            max_flops = max(max_flops, file_content.get('FLOPs'))

            flops_dict = {
                'host_ip': host_map_ip,
                'device_id': device_id,
                'rank_id': rank_id,
            }
            flops_dict.update(file_content)
            flops_info_list.append(flops_dict)

        # Normalize the FLOPs by dividing by the maximum FLOPs across all devices.
        # Guard against division by zero when every device reports zero FLOPs.
        for flops_info in flops_info_list:
            flops_info['FLOPs_norm'] = flops_info['FLOPs'] / max_flops if max_flops else 0

        return flops_info_list
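Every example in this collection routes file-system paths through validate_and_normalize_path before any read or write. The helper itself is not among these excerpts; below is a minimal sketch of the contract its call sites appear to assume. It is a hypothetical stand-in, not the MindInsight implementation: the ValidationError name and the traversal check are assumptions.

    import os

    class ValidationError(Exception):
        """Hypothetical stand-in for the error type the callers catch."""

    def validate_and_normalize_path(path, raise_key=''):
        """Sketch only: reject empty or traversal-prone paths, return a normalized path."""
        if not path or '..' in str(path).split(os.sep):
            raise ValidationError(raise_key or 'Invalid path.')
        # The real helper presumably also restricts paths to a known base
        # directory; that policy is not visible in these excerpts.
        return os.path.realpath(path)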
Example #2
    def get_peak_memory(self):
        """Get peak memory for each device."""
        peak_mem_list = []

        for host_map_ip, device_id, rank_id in self._host_device_rank_relation:
            host_dir = os.path.join(self._cluster_dir, host_map_ip, 'profiler')
            validate_and_normalize_path(
                host_dir,
                raise_key='Invalid host directory {}.'.format(host_map_ip))
            file_path = self._get_memory_file_for_each_device(
                host_dir, device_id)
            file_content = self._get_file_content(file_path)
            capacity = file_content.get('capacity')
            peak_mem = file_content.get('peak_mem')

            mem_dict = {
                'host_ip': host_map_ip,
                'device_id': device_id,
                'rank_id': rank_id,
                'capacity': capacity,
                'peak_mem': peak_mem
            }
            peak_mem_list.append(mem_dict)

        return peak_mem_list
Example #3
    def _load_point_info(self):
        """Load point info."""
        file_path = os.path.join(self._profiling_dir,
                                 'step_trace_point_info.json')
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid step_trace_point_info file path.")

        # If step_trace_point_info_{self._device_id}.json exists, load that file instead.
        file_path_new = os.path.join(
            self._profiling_dir,
            f'step_trace_point_info_{self._device_id}.json')
        file_path_new = validate_and_normalize_path(
            file_path_new,
            raise_key="Invalid step_trace_point_info file path.")
        if os.path.isfile(file_path_new):
            file_path = file_path_new

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                try:
                    self._point_info = json.load(file)
                except (json.JSONDecodeError, TypeError) as err:
                    log.exception(err)
                    raise ProfilerRawFileException(
                        'Fail to parse point info file.')
Example #4
    def _get_flops_file_for_each_device(self, path, device_id):
        """Get memory file for each device."""
        filename = self._summary_filename.format(device_id)
        file_path = os.path.join(path, filename)
        validate_and_normalize_path(file_path,
                                    raise_key='Invalid flops file path.')

        return file_path
Example #5
    def _load(self):
        """Load data according to the parsed AICORE operator file."""
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_time.format(self._device_id))
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework_info.format(self._device_id))
        flops_file_path = os.path.join(
            self._profiling_dir, self._file_name_flops.format(self._device_id))
        op_detail_file_path = validate_and_normalize_path(
            op_detail_file_path, raise_key='Invalid aicore_detail file path.')
        framework_file_path = validate_and_normalize_path(
            framework_file_path, raise_key='Invalid framework file path.')
        flops_file_path = validate_and_normalize_path(
            flops_file_path, raise_key='Invalid flops file path.')
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.',
                           op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.',
                           framework_file_path)
            return

        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for info in csv_reader:
                framework_infos[info[3]] = self._convert_framework_field_type(
                    info)

        flops_infos = dict()
        if os.path.isfile(flops_file_path):
            with open(flops_file_path, 'r') as f_obj:
                # Skip the first line, which is the header.
                next(f_obj)
                for line in f_obj:
                    flops_line = line.strip().split(',')
                    # flops_line[0] is full_op_name.
                    flops_infos[flops_line[0]] = flops_line[1:]
        else:
            logger.warning('The file <%s> does not exist.', flops_file_path)

        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for info in csv_reader:
                detail_info = self._get_op_detail_info(info, framework_infos,
                                                       flops_infos)
                self._data.append(detail_info)

        del framework_infos
        del flops_infos
Example #6
    def get_timeline_summary(self):
        """
        Get timeline summary information for UI display.

        Returns:
            json, the content of timeline summary information.
        """
        summary_filename = self._timeline_summary_filename.format(self._device_id)
        file_path = os.path.join(self._profiling_dir, summary_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline summary path.'
        )

        timeline_summary = {}
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when reading timeline summary file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline summary file. Please check the output path.')

        return timeline_summary
Example #7
    def _get_minddata_queue_step_time_info(self):
        """Get the sampling time information at the steps of the host queue."""
        minddata_queue_step_time_info = []
        minddata_analyser = MinddataAnalyser(self._profiling_dir,
                                             self._device_id)
        file_path = minddata_analyser.get_device_queue_file_path()
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid device_queue file path")
        if not os.path.exists(file_path):
            log.error('Did not find the device queue file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the device queue file.')

        with open(file_path) as data_file:
            for line in data_file.readlines():
                op_info = line.split()
                # op_info is a list like: ['1', '64', '8', '2', '85406783'].
                # The first element is '0' or '1':
                # '0' means a time record, '1' means a queue record.
                # For a queue record: '64' is the queue capacity, '8' the step
                # number, '2' the queue size, and '85406783' the sampling time.
                if op_info and op_info[0] == "1":
                    minddata_queue_step_time_info.append(
                        [op_info[2], op_info[4]])
        return minddata_queue_step_time_info
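For the sample line in the comment above, op_info[2] ('8') is the step number and op_info[4] ('85406783') is the sampling time, so that line contributes ['8', '85406783'] to the returned list.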
Example #8
def get_cluster_link_info():
    """
    Get cluster link info.

    Returns:
        Response, the cluster link info.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>> POST http://xxx/v1/mindinsight/profile/search-cluster-link
    """
    train_id = get_train_id(request)
    cluster_profiler_dir = os.path.join(settings.SUMMARY_BASE_DIR, train_id)
    try:
        cluster_profiler_dir = validate_and_normalize_path(
            cluster_profiler_dir, 'cluster_profiler')
    except ValidationError:
        raise ParamValueError('Invalid cluster_profiler dir')

    condition = request.stream.read()
    try:
        condition = json.loads(condition) if condition else {}
    except (json.JSONDecodeError, ValueError):
        raise ParamValueError("Json data parse failed.")

    device_id = condition.get("device_id", "0")
    to_int(device_id, 'device_id')

    analyser = AnalyserFactory.instance().get_analyser('cluster_hccl',
                                                       cluster_profiler_dir,
                                                       device_id)
    link_info = analyser.get_cluster_link_info(condition)
    return jsonify(link_info)
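A client-side sketch of calling this route. The host, port, and train_id value are hypothetical, and the snippet assumes get_train_id reads a 'train_id' query parameter, which this excerpt does not show:

    import requests

    # Hypothetical invocation of the search-cluster-link route shown above.
    resp = requests.post(
        'http://127.0.0.1:8080/v1/mindinsight/profile/search-cluster-link',
        params={'train_id': './cluster_job'},  # consumed by get_train_id(request)
        json={'device_id': '0'})               # parsed into `condition`
    print(resp.json())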
Example #9
    def _write_timeline_data_into_file(self, timeline_data):
        """
        Write the timeline information into the file, including
            operator name, stream id, start time and duration.

        Args:
            timeline_data (list): The metadata to be written into the file.
                [
                    ['op_name_1', 'stream_id_1', 'start_time_1', 'duration_1'],
                    ['op_name_2', 'stream_id_2', 'start_time_2', 'duration_2'],
                    [...]
                ]
        """
        # Sort by start time.
        timeline_data.sort(key=lambda x: float(x[2]))
        filename = 'output_timeline_data_{}.txt'.format(self._device_id)
        file_path = os.path.join(self._output_path, filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid file path of timeline data.')

        # write to file
        try:
            with open(file_path, 'w') as f_obj:
                f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
                for timeline in timeline_data:
                    timeline = [str(item) for item in timeline]
                    f_obj.write(','.join(timeline) + '\n')
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when writing intermediate timeline file: %s',
                err)
            raise ProfilerIOException
Example #10
    def _get_file_path(self, device_type, file_type):
        """
        Get memory usage summary file.

        Args:
            device_type (str): Device type, e.g., GPU, Ascend.
            file_type (str): memory usage file type, e.g., summary, details.

        Returns:
            str, file path of memory usage file corresponding to its file_type.
        """
        filename = ""
        if device_type == "ascend":
            if file_type == FileType.SUMMARY.value:
                filename = self._summary_filename.format(self._device_id)
            elif file_type == FileType.DETAILS.value:
                filename = self._details_filename.format(self._device_id)
        else:
            logger.error('Memory Usage only supports Ascend for now. Please check the device type.')
            raise ParamValueError("Invalid device type.")

        file_path = os.path.join(self._profiling_dir, filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid memory usage file path.'
        )

        return file_path
Example #11
def get_cluster_flops():
    """
    Get cluster FLOPs.

    Returns:
        str, the cluster FLOPs.

    Raises:
        ParamValueError: If the cluster profiler dir is invalid.

    Examples:
        >>> GET http://xxx/v1/mindinsight/profile/cluster-flops
    """
    train_id = get_train_id(request)
    if not train_id:
        raise ParamValueError('No train id.')
    cluster_profiler_dir = os.path.join(settings.SUMMARY_BASE_DIR, train_id)
    cluster_profiler_dir = validate_and_normalize_path(cluster_profiler_dir,
                                                       'cluster_profiler')
    check_train_job_and_profiler_dir(cluster_profiler_dir)

    analyser = AnalyserFactory.instance().get_analyser('cluster_flops',
                                                       cluster_profiler_dir)
    flops = analyser.get_flops()
    return jsonify(flops)
Example #12
    def _get_pipeline_path(self, source_dir):
        """
        Get the minddata pipeline file path.

        Args:
            source_dir (str): The minddata pipeline source dir.

        Returns:
            str, the minddata pipeline file path.
        """
        pipeline_path = os.path.join(
            source_dir, self._raw_pipeline_file_name.format(self._device_id))

        try:
            pipeline_path = validate_and_normalize_path(
                pipeline_path, 'profiler')
        except ValidationError:
            logger.warning('Minddata pipeline file is invalid.')
            raise ProfilerPathErrorException(
                'Minddata pipeline file is invalid.')
        if not os.path.isfile(pipeline_path):
            logger.warning('The minddata pipeline file <%s> not found.',
                           pipeline_path)
            raise ProfilerFileNotFoundException(pipeline_path)

        return pipeline_path
Example #13
    def get_display_timeline(self):
        """
        Get timeline data for UI display.

        Returns:
            json, the content of timeline data.
        """
        # Search timeline json file under profiling dir.
        timeline_filename = self._timeline_filename.format(self._device_id)
        display_filename = self._display_filename.format(self._device_id)
        file_list = [
            filename for filename in os.listdir(self._profiling_dir)
            if timeline_filename in filename or display_filename in filename
        ]

        # Check if there is a timeline json file for display
        file_path = os.path.join(self._profiling_dir, display_filename)
        if display_filename not in file_list:
            file_path = os.path.join(self._profiling_dir, timeline_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
Example #14
    def get_display_timeline(self, device_type):
        """
        Get timeline data for UI display.

        Returns:
            json, the content of timeline data.
        """
        if device_type == "ascend":
            display_filename = self._ascend_display_filename.format(
                self._device_id)
        elif device_type == "gpu":
            display_filename = self._gpu_display_filename.format(
                self._device_id)
        else:
            logger.error(
                'Device type should be ascend or gpu. Please check the device type.')
            raise ParamValueError("Invalid device_type.")
        file_path = os.path.join(self._profiling_dir, display_filename)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline json path.')

        timeline = []
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error(
                    'Error occurred when reading timeline display file: %s', err)
                raise ProfilerIOException
        else:
            logger.info('No timeline file. Please check the output path.')

        return timeline
Example #15
    def load_timeline_data(self):
        """Load timeline data from file."""
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id))
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.')
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        stream_count_dict = {}
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        self._parse_timeline_data(line_list)
                        self._update_num_of_streams(line_list,
                                                    stream_count_dict)
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when reading timeline intermediate file: %s', err)
            raise ProfilerIOException

        # Update timeline summary info.
        self._timeline_summary['num_of_streams'] = len(stream_count_dict)
Example #16
    def _get_communication_info(self, host_ip, device_id, step_num):
        """Get step trace info."""
        file_name = 'hccl_raw_{}.csv'.format(device_id)
        communication_file_path = \
            os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
        communication_file_path = validate_and_normalize_path(
            communication_file_path,
            raise_key="Invalid communication file path.")
        if not os.path.exists(communication_file_path):
            log.error('Did not find the file: %s', communication_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the file:{}'.format(communication_file_path))
        communication_info = list()
        step_num = str(step_num)
        with open(communication_file_path, 'r') as src_file:
            csv_reader = csv.reader(src_file)
            # When the step_num value is 0, it means the average value.
            # The last line of hccl_raw_{}.csv records the average value,
            # and the first element of that line is '-'.
            step_num = '-' if step_num == '0' else step_num
            for row in csv_reader:
                if row[0] == step_num:
                    communication_info = row
                    break
        # Convert strings to floats and the JSON fields to dictionaries.
        if communication_info:
            communication_info[1] = float(communication_info[1])
            communication_info[2] = float(communication_info[2])
            communication_info[3] = json.loads(communication_info[3])
            communication_info[4] = json.loads(communication_info[4])

        return communication_info
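For illustration only (the column semantics beyond what the code shows are an assumption): a hypothetical matching row such as ['1', '0.25', '0.75', '{"cost": 1}', '{"link": 2}'] would be returned with fields 1 and 2 converted to floats and fields 3 and 4 parsed into dictionaries.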
Example #17
    def _get_total_step_num(self):
        """Get the num of train step."""
        total_step_num = 0
        # Take the data of one of the machines to get the total number of steps.
        host_ip_dir = self._host_ips_dir[0]
        target_dir_path = os.path.join(self._cluster_profiler_dir,
                                       'cluster_profiler', host_ip_dir,
                                       'profiler')
        target_dir_path = validate_and_normalize_path(
            target_dir_path, raise_key="Invalid profiler dir path.")
        if not os.path.exists(target_dir_path):
            log.error('Did not find cluster_profiler dir : %s',
                      target_dir_path)
            raise ProfilerDirNotFoundException(
                msg='Did not find cluster_profiler dir:{}'.format(
                    target_dir_path))

        entries = os.scandir(target_dir_path)
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('hccl_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The first row is col_name, the last row is the average.
                if len(lines) > 2:
                    total_step_num = len(lines) - 2
                break
        return total_step_num
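For example, an hccl_raw file consisting of a header row, ten step rows, and the trailing average row has twelve lines, so total_step_num is 12 - 2 = 10.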
Example #18
    def _get_total_step_num(self):
        """Get the num of train step."""
        total_step_num = 0
        # take the data of one of the machines to get the total number of steps.
        host_ip_dir = self._host_ips_dir[0]
        target_dir_path = os.path.join(self._cluster_profiler_dir,
                                       'cluster_profiler', host_ip_dir,
                                       'profiler')
        target_dir_path = validate_and_normalize_path(
            target_dir_path, raise_key="Invalid profiler dir path.")
        if not os.path.exists(target_dir_path):
            log.error('Did not find cluster_profiler dir : %s',
                      target_dir_path)
            raise ProfilerDirNotFoundException(
                msg='Did not find cluster_profiler dir:{}'.format(
                    target_dir_path))

        entries = os.scandir(target_dir_path)
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('step_trace_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The penultimate line holds the record of the last step;
                # column 0 is the step number.
                if len(lines) > 1:
                    total_step_num = lines[-2].split(',')[0]
                break
        return total_step_num
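Note that, unlike the hccl_raw variant in Example #17, this version returns the step number exactly as it appears in the file, i.e. as a string rather than an int.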
Example #19
    def _get_step_trace_info(self, host_ip, device_id, step_num):
        """Get step trace info."""
        file_name = 'step_trace_raw_{}_detail_time.csv'.format(device_id)
        step_trace_file_path = \
            os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
        step_trace_file_path = validate_and_normalize_path(
            step_trace_file_path, raise_key="Invalid step trace file path.")
        if not os.path.exists(step_trace_file_path):
            log.error('Did not find the file: %s', step_trace_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the file:{}'.format(step_trace_file_path))
        step_trace_info = list()
        step_num = str(step_num)
        with open(step_trace_file_path, 'r') as src_file:
            lines = src_file.readlines()
            # When the step_num value is 0, it means the average value.
            # The last line of step_trace_raw_{}_detail_time.csv records the average value.
            if step_num == '0':
                step_trace_info = lines[-1].strip('\n').split(',')
            else:
                for line in lines:
                    line = line.strip('\n').split(',')
                    if line[0] == step_num:
                        step_trace_info = line
        # step_trace_info[6]: iteration_interval time
        # step_trace_info[7]: fp_and_bp time
        # step_trace_info[8]: tail time
        # Divided by 1e5, the unit becomes milliseconds.
        iteration_interval = float(step_trace_info[6]) / 1e5
        fp_and_bp = float(step_trace_info[7]) / 1e5
        tail = float(step_trace_info[8]) / 1e5
        step_trace_info = [iteration_interval, fp_and_bp, tail]
        return step_trace_info
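As a worked example of the unit conversion: a raw field value of 250000 becomes 250000 / 1e5 = 2.5 ms, which is consistent with the raw counters ticking every 10 ns, although the excerpt itself does not state the counter frequency.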
Example #20
    def get_min_cycle_counter_from_file(self):
        """
        Get minimum cycle counter.

        Returns:
            float, the minimum value of the cycle counter.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id))

        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid min cycle counter file path.')

        if os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    min_cycle_counter = f_obj.read()
                    min_cycle_counter = float(min_cycle_counter) \
                        if not min_cycle_counter == 'inf' else 0
            except (IOError, OSError) as err:
                logger.error(
                    'Error occurred when reading minimum cycle counter: %s', err)
                raise ProfilerIOException
        else:
            min_cycle_counter = 0
            logger.info("No min cycle counter recorded.")

        return min_cycle_counter
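The 'inf' sentinel handled here ties these excerpts together: Example #28 initializes _min_cycle_counter to float('inf'), and the writer in Example #26 serializes it with str(), so a counter that was never updated is stored as the literal text 'inf' and read back here as 0.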
Example #21
    def write_timeline_to_json_by_limitation(self):
        """Write timeline to json by limitation."""
        display_filename = self._display_filename.format(self._device_id)
        display_file_path = os.path.join(
            self._profiling_dir,
            display_filename
        )
        display_file_path = validate_and_normalize_path(
            display_file_path, raise_key='Invalid timeline display json path.'
        )

        length = len(self._timeline_meta)
        try:
            with open(display_file_path, 'w') as json_file:
                json_file.write('[')
                for index, item in enumerate(self._timeline_meta):
                    json.dump(item, json_file)
                    file_size = os.path.getsize(display_file_path)
                    if file_size > SIZE_LIMIT:
                        break
                    if index == length - 1:
                        break
                    json_file.write(',')
                json_file.write(']')
        except (IOError, OSError) as err:
            logger.error('Error occurred when writing timeline display file: %s', err)
            raise ProfilerIOException
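Note that the truncated output is still valid JSON: the size check runs after each item is dumped but before the separating comma is written, so the closing ']' always follows a complete element.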
Example #22
    def get_timeline_summary(self):
        """
        Get timeline summary information for UI display.

        Returns:
            json, the content of timeline summary information.
        """
        file_path = None
        summary_file_name = 'timeline_summary_{}.json'.format(self._device_id)
        if summary_file_name in os.listdir(self._profiling_dir):
            file_path = os.path.join(self._profiling_dir, summary_file_name)
            # Validate only when the file is present; otherwise file_path stays
            # None and an empty summary is returned.
            file_path = validate_and_normalize_path(
                file_path, raise_key='Invalid timeline summary path.')

        timeline_summary = {}
        if file_path and os.path.exists(file_path):
            try:
                with open(file_path, 'r') as f_obj:
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError) as err:
                logger.error(
                    'Error occurred when reading timeline summary file: %s', err)
                raise ProfilerIOException

        return timeline_summary
Example #23
    def _load_timeline_data(self):
        """Load timeline data from file."""
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        )
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.'
        )
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        timeline_list = []
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        timeline_list.append(line_list)
        except (IOError, OSError) as err:
            logger.error('Error occurred when reading timeline intermediate file: %s', err)
            raise ProfilerIOException

        return timeline_list
Example #24
def get_profile_device_list():
    """
    Get profile device list.

    Returns:
        list, the available device list.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/devices
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id,
                                    profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs,
                                                       "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir")

    check_train_job_and_profiler_dir(profiler_dir_abs)

    device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs)
    return jsonify(device_list)
Example #25
    def _get_minddata_pipeline_info(self):
        """Get the op_id and number of worker threads of each minddata pipeline operator."""
        file_name = self._minddata_pipeline_display_filename.format(
            self._device_id)
        file_path = os.path.join(self._profiling_dir, file_name)
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid minddata_pipeline_info file path.")
        if not os.path.exists(file_path):
            log.error('Did not find the minddata_pipeline file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the minddata_pipeline file:{}'.format(
                    file_path))

        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                minddata_pipeline_info = json.load(file)
            except json.JSONDecodeError as err:
                log.exception(err)
                raise ProfilerRawFileException(
                    "Fail to parse minddata pipeline file")

        minddata_pipeline_op_info = []
        for item in minddata_pipeline_info.get("op_info", []):
            op_info_dict = dict()
            op_info_dict["op_id"] = item.get("op_id")
            op_info_dict["num_workers"] = item.get("num_workers")
            minddata_pipeline_op_info.append(op_info_dict)
        return minddata_pipeline_op_info
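For example, a pipeline file whose op_info list contains {"op_id": 0, "num_workers": 4, ...} (the other keys are hypothetical) contributes {'op_id': 0, 'num_workers': 4} to the returned list.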
Example #26
    def write_min_cycle_counter_to_file(self):
        """Write the minimum cycle counter into a txt file."""
        min_cycle_counter = self._min_cycle_counter
        file_name = 'min_cycle_counter_' + self._device_id + '.txt'
        file_path = os.path.join(self._output_path, file_name)
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid min cycle counter file path.')
        with open(file_path, 'w') as file:
            file.write(str(min_cycle_counter))
Example #27
    def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
                 optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
        # Get device_id and device_target; initialize dev_id so it is bound
        # even when the mindspore import fails.
        dev_id = ""
        device_target = ""
        try:
            import mindspore.context as context
            dev_id = str(context.get_context("device_id"))
            device_target = context.get_context("device_target")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            # ValueError has no `message` attribute in Python 3.
            logger.error("Profiling: fail to get context, %s", err)

        if not dev_id:
            dev_id = os.getenv('DEVICE_ID')
        if not dev_id:
            dev_id = "0"
            logger.error("Fail to get DEVICE_ID, use 0 instead.")

        if device_target and device_target not in ("Davinci", "Ascend"):
            msg = "Profiling: unsupported backend: %s" % device_target
            raise RuntimeError(msg)

        self._dev_id = dev_id
        self._container_path = os.path.join(self._base_profiling_container_path, dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        self._output_path = validate_and_normalize_path(output_path,
                                                        'Profiler output path (' + output_path + ')')
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        # Use the context interface to enable profiling (for MindSpore versions after 2020.5.21).
        try:
            import mindspore.context as context
            context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
        except ImportError:
            logger.error("Profiling: fail to import context from mindspore.")
        except ValueError as err:
            # ValueError has no `message` attribute in Python 3.
            logger.error("Profiling: fail to set context, %s", err)

        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)
        self._subgraph = check_subgraph(subgraph)
        self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._detail = check_bool(is_detail, 'is_detail')
        self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
        self._profiling_job_id = job_id
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
Example #28
    def __init__(self, hwts_output_file, output_filename, op_task_info,
                 output_path, device_id):
        hwts_output_file = validate_and_normalize_path(
            hwts_output_file, raise_key='Invalid hwts output file path.')
        self._hwts_output_file = hwts_output_file
        self._output_filename = output_filename
        self._op_task_info = op_task_info
        self._output_path = output_path
        self._device_id = device_id
        self._min_cycle_counter = float("inf")
Example #29
    def _load(self):
        """Load data according to the parsed AICORE operator file."""
        op_detail_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_aicore_detail_time.format(self._device_id)
        )
        framework_file_path = os.path.join(
            self._profiling_dir,
            self._file_name_framework_info.format(self._device_id)
        )
        op_detail_file_path = validate_and_normalize_path(
            op_detail_file_path, raise_key='Invalid aicore_detail file path.'
        )

        framework_file_path = validate_and_normalize_path(
            framework_file_path, raise_key='Invalid framework file path.'
        )
        if not os.path.isfile(op_detail_file_path):
            logger.warning('The file <%s> does not exist.', op_detail_file_path)
            return
        if not os.path.isfile(framework_file_path):
            logger.warning('The file <%s> does not exist.', framework_file_path)
            return

        framework_infos = dict()
        with open(framework_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                framework_infos[info[3]] = self._convert_framework_field_type(
                    info
                )

        with open(op_detail_file_path, 'r') as file:
            csv_reader = csv.reader(file)
            _ = next(csv_reader)
            for info in csv_reader:
                detail_info = self._get_op_detail_info(info, framework_infos)
                self._data.append(detail_info)

        del framework_infos
Example #30
def get_profile_summary_proposal():
    """
    Get summary profiling proposal.

    Returns:
        str, the summary profiling proposal.

    Raises:
        ParamValueError: If the parameters contain some errors.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/summary/propose
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    device_id = get_device_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")
    to_int(device_id, 'device_id')

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id,
                                    profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs,
                                                       "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir")

    check_train_job_and_profiler_dir(profiler_dir_abs)

    step_trace_condition = {
        "filter_condition": {
            "mode": "proc",
            "proc_name": "iteration_interval",
            "step_id": 0
        }
    }
    options = {'step_trace': {"iter_interval": step_trace_condition}}

    proposal_type_list = [
        'step_trace', 'minddata', 'minddata_pipeline', 'common'
    ]
    proposal_obj = ComposeProposal(profiler_dir_abs, device_id,
                                   proposal_type_list)
    proposal_info = proposal_obj.get_proposal(options)
    # Use json.dumps so the keys keep their order in the response.
    return CustomResponse(json.dumps(proposal_info),
                          mimetype='application/json')