def _get_minddata_pipeline_info(self):
        """Get the number of worker threads per minddata pipeline operator.

        Returns:
            list, one dict per pipeline operator with keys ``op_id``
            and ``num_workers``.

        Raises:
            ProfilerFileNotFoundException: If the pipeline file is missing.
            ProfilerRawFileException: If the pipeline file is not valid JSON.
        """
        file_name = self._minddata_pipeline_display_filename.format(
            self._device_id)
        file_path = os.path.join(self._profiling_dir, file_name)
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid minddata_pipeline_info file path.")
        if not os.path.exists(file_path):
            log.error('Did not find the minddata_pipeline file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the minddata_pipeline file:{}'.format(
                    file_path))

        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                minddata_pipeline_info = json.load(file)
            except json.JSONDecodeError as err:
                log.exception(err)
                # Chain the original error so the root cause stays visible
                # in the traceback.
                raise ProfilerRawFileException(
                    "Fail to parse minddata pipeline file") from err

        # Keep only the fields needed downstream: operator id and its
        # worker (thread) count.
        return [
            {"op_id": item.get("op_id"),
             "num_workers": item.get("num_workers")}
            for item in minddata_pipeline_info.get("op_info")
        ]
Example #2 (score: 0)
    def _get_op_task_id_map(self):
        """
        Read hwts data file, get the task time info.

        Returns:
           list: all hwts task time info.
        """

        op_map_result = []
        hwts_list = []

        if not os.path.exists(self._hwts_output_file):
            logger.error('The hwts output file does not exist.')
            raise ProfilerFileNotFoundException('hwts output file')

        with open(self._hwts_output_file, 'r') as data_file:
            lines = data_file.readlines()
            for line in lines:
                if line.startswith("Start of task") or line.startswith(
                        "End of task"):
                    line_split = line.split()
                    container = HWTSContainer(line_split)
                    hwts_list.append(container)

        # hwts op map by taskId
        for hwts in hwts_list:
            if hwts.task_id in self._op_task_info.keys():
                hwts.op_name = self._op_task_info[hwts.task_id]
                op_map_result.append(hwts)

        return op_map_result
    def _get_minddata_queue_step_time_info(self):
        """Get the sampling time information at the steps of the host queue"""
        analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
        queue_file = analyser.get_device_queue_file_path()
        queue_file = validate_and_normalize_path(
            queue_file, raise_key="Invalid device_queue file path")
        if not os.path.exists(queue_file):
            log.error('Did not find the device queue file: %s', queue_file)
            raise ProfilerFileNotFoundException(
                msg='Did not find the device queue file.')

        step_time_records = []
        with open(queue_file) as data_file:
            for record in data_file.readlines():
                fields = record.split()
                # A record looks like: ['1', '64', '8', '2', '85406783'].
                # The first field is '0' (time info recorded) or '1'
                # (queue info recorded). For queue records: '64' is the
                # queue capacity, '8' the step number, '2' the queue size
                # and '85406783' the sampling time.
                if fields and fields[0] == "1":
                    # Keep only (step_num, sampling_time) pairs.
                    step_time_records.append([fields[2], fields[4]])
        return step_time_records
Example #4 (score: 0)
    def load_timeline_data(self):
        """Load timeline data from file.

        Parses every record of the intermediate timeline file and updates
        the stream count in the timeline summary.

        Raises:
            ProfilerFileNotFoundException: If the parsed timeline file is
                missing.
            ProfilerIOException: If the timeline file cannot be read.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id))
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.')
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        stream_count_dict = {}
        try:
            with open(file_path, 'r') as f_obj:
                for line in f_obj:
                    # Skip the header line, which starts with 'op_name'.
                    if not line.startswith('op_name'):
                        line_list = line.strip('\n').split(',')
                        self._parse_timeline_data(line_list)
                        self._update_num_of_streams(line_list,
                                                    stream_count_dict)
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when read timeline intermediate file: %s', err)
            # Instantiate and chain the exception so the I/O cause is kept.
            raise ProfilerIOException() from err

        # Update timeline summary info
        self._timeline_summary['num_of_streams'] = len(stream_count_dict)
    def _load_timeline_data(self):
        """Load timeline data from file.

        Returns:
            list, the comma-split field list of each record of the
            intermediate timeline file (header line excluded).

        Raises:
            ProfilerFileNotFoundException: If the parsed timeline file is
                missing.
            ProfilerIOException: If the timeline file cannot be read.
        """
        file_path = os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        )
        file_path = validate_and_normalize_path(
            file_path, raise_key='Invalid timeline txt file path.'
        )
        if not os.path.exists(file_path):
            logger.error("Failed to find parsed timeline file.")
            raise ProfilerFileNotFoundException('parsed timeline file')

        try:
            with open(file_path, 'r') as f_obj:
                # Skip the header line (starts with 'op_name') and split
                # every remaining record on commas.
                timeline_list = [
                    line.strip('\n').split(',')
                    for line in f_obj
                    if not line.startswith('op_name')
                ]
        except (IOError, OSError) as err:
            logger.error('Error occurred when read timeline intermediate file: %s', err)
            # Instantiate and chain the exception so the I/O cause is kept.
            raise ProfilerIOException() from err

        return timeline_list
Example #6 (score: 0)
    def _get_pipeline_path(self, source_dir):
        """
        Get the minddata pipeline file path.

        Args:
            source_dir (str): The minddata pipeline source dir.

        Returns:
            str, the minddata pipeline file path.

        Raises:
            ProfilerPathErrorException: If the pipeline path is invalid.
            ProfilerFileNotFoundException: If the pipeline file is missing.
        """
        pipeline_path = os.path.join(
            source_dir, self._raw_pipeline_file_name.format(self._device_id))

        try:
            pipeline_path = validate_and_normalize_path(
                pipeline_path, 'profiler')
        except ValidationError as err:
            logger.warning('Minddata pipeline file is invalid.')
            # Chain the validation error so the root cause stays visible.
            raise ProfilerPathErrorException(
                'Minddata pipeline file is invalid.') from err
        if not os.path.isfile(pipeline_path):
            logger.warning('The minddata pipeline file <%s> not found.',
                           pipeline_path)
            raise ProfilerFileNotFoundException(pipeline_path)

        return pipeline_path
Example #7 (score: 0)
    def _get_host_device_rank_relation(self):
        """Get host_ip device_id rank_id relation."""
        rank_table_file_path = self._get_rank_table_file_path()
        if not os.path.exists(rank_table_file_path):
            log.error('Did not find rank table file under %s',
                      self._cluster_profiler_dir)
            raise ProfilerFileNotFoundException(
                msg='Did not find rank table file')
        with open(rank_table_file_path, 'r', encoding='utf-8') as file:
            try:
                relation_info = json.load(file)
            except json.JSONDecodeError as err:
                log.exception(err)
        host_device_rank_relation = list()
        servers_info = relation_info.get("server_list")
        for server_info in servers_info:
            server_id = server_info.get("server_id")
            devices_info = server_info.get("device")
            for device_info in devices_info:
                device_id = device_info.get("device_id")
                rank_id = device_info.get("rank_id")
                host_device_rank_relation.append(
                    [server_id, device_id, rank_id])

        host_ips_mapping_info = self._get_host_ips_mapping_info()
        for item in host_device_rank_relation:
            # host_ip_index:0,host_mapping_id_index:1
            target_info = [i for i in host_ips_mapping_info if item[0] == i[0]]
            # target_info is like:[[host_ip, host_mapping_ip]]
            item[0] = target_info[0][1]

        return host_device_rank_relation
Example #8 (score: 0)
    def _get_communication_info(self, host_ip, device_id, step_num):
        """Get communication info of one step for the given device.

        Args:
            host_ip (str): The host ip.
            device_id (str): The device id.
            step_num (int or str): The step number; 0 means the average value.

        Returns:
            list, the matching row of hccl_raw_{device_id}.csv with the two
            cost fields converted to float and the two detail fields parsed
            from JSON, or an empty list when the step is not found.
        """
        file_name = 'hccl_raw_{}.csv'.format(device_id)
        communication_file_path = \
            os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
        communication_file_path = validate_and_normalize_path(
            communication_file_path,
            raise_key="Invalid  communication file path.")
        if not os.path.exists(communication_file_path):
            log.error('Did not find the file: %s', communication_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the file:{}'.format(communication_file_path))
        communication_info = []
        # when the step_num value is 0, it means the average value.
        # The last line of hccl_raw_{}.csv records the average value and
        # its first element is '-'.
        step_num = '-' if str(step_num) == '0' else str(step_num)
        with open(communication_file_path, 'r') as src_file:
            csv_reader = csv.reader(src_file)
            for row in csv_reader:
                if row[0] == step_num:
                    communication_info = row
                    break
        # Convert string to floating point and dictionary
        if communication_info:
            communication_info[1] = float(communication_info[1])
            communication_info[2] = float(communication_info[2])
            communication_info[3] = json.loads(communication_info[3])
            communication_info[4] = json.loads(communication_info[4])

        return communication_info
Example #9 (score: 0)
 def _get_step_trace_info(self, host_ip, device_id, step_num):
     """Get step trace info.

     Args:
         host_ip (str): The host ip.
         device_id (str): The device id.
         step_num (int or str): The step number; 0 means the average value.

     Returns:
         list, [iteration_interval, fp_and_bp, tail] in milliseconds.
     """
     file_name = 'step_trace_raw_{}_detail_time.csv'.format(device_id)
     step_trace_file_path = \
         os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
     step_trace_file_path = validate_and_normalize_path(
         step_trace_file_path, raise_key="Invalid step trace file path.")
     if not os.path.exists(step_trace_file_path):
         log.error('Did not find the file: %s', step_trace_file_path)
         raise ProfilerFileNotFoundException(
             msg='Did not find the file:{}'.format(step_trace_file_path))
     step_trace_info = list()
     step_num = str(step_num)
     with open(step_trace_file_path, 'r') as src_file:
         lines = src_file.readlines()
         # when the step_num value is 0, it means the average value.
         # The last line of the step_trace_raw_{}_detail_time.csv records the average value.
         if step_num == '0':
             step_trace_info = lines[-1].strip('\n').split(',')
         else:
             for line in lines:
                 line = line.strip('\n').split(',')
                 if line[0] == step_num:
                     step_trace_info = line
                     # Stop at the first match instead of scanning the
                     # whole file (presumably step numbers are unique —
                     # TODO confirm).
                     break
     # NOTE(review): if step_num matches no row, step_trace_info stays empty
     # and the indexing below raises IndexError — confirm callers always
     # pass a valid step number.
     # step_trace_info[6]: iteration_interval time
     # step_trace_info[7]: fp_and_bp time
     # step_trace_info[8]: tail time
     # divided by 1e5, the unit becomes a millisecond
     iteration_interval = float(step_trace_info[6]) / 1e5
     fp_and_bp = float(step_trace_info[7]) / 1e5
     tail = float(step_trace_info[8]) / 1e5
     return [iteration_interval, fp_and_bp, tail]
Example #10 (score: 0)
 def _load(self):
     """Load data according to the parsed AICORE operator types file.

     Raises:
         ProfilerFileNotFoundException: If the parsed step trace time file
             is missing.
     """
     file_path = query_latest_trace_time_file(self._profiling_dir, self._device_id)
     if not file_path:
         log.error("Failed to find parsed trace time file.")
         raise ProfilerFileNotFoundException('parsed step trace time file')
     # The csv module requires newline='' so quoted fields containing
     # newlines are parsed correctly.
     with open(file_path, 'r', newline='') as handle:
         csv_reader = csv.reader(handle)
         # First row is the column header; the rest is the data.
         self.__column__ = next(csv_reader)
         self._data = list(csv_reader)
     # One data row is excluded from the size (presumably the average/summary
     # row — TODO confirm against the file writer).
     self._size = len(self._data) - 1
     self._display_col_names = self._col_names[:]
     self._load_point_info()
Example #11 (score: 0)
    def _search_file(self, profiling_id, device_id):
        """
        Search all framework files in raw profiling path.

        Args:
            profiling_id (str): The profiling ID.
            device_id (str): The device ID.

        Raises:
            ProfilerFileNotFoundException: If the framework files are not found.
        """
        self._search_file_from_job_path(device_id)
        self._search_file_from_data_path(profiling_id, device_id)

        if self._backend_type is None:
            raise ProfilerFileNotFoundException('Framework')
        self._framework_path['graph'].sort()
        self._framework_path['task'].sort()
    def _load(self):
        """Load cpu_utilization info.

        Raises:
            ProfilerFileNotFoundException: If the cpu utilization file is
                missing.
            ProfilerRawFileException: If the file is not valid JSON.
        """
        file_name = self._cpu_utilization_display_filename.format(
            self._device_id)
        file_path = os.path.join(self._profiling_dir, file_name)
        file_path = validate_and_normalize_path(
            file_path, raise_key="Invalid cpu_utilization_info file path.")
        if not os.path.exists(file_path):
            log.error('Did not find the cpu utilization file: %s', file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the cpu utilization file.')

        with open(file_path, 'r', encoding='utf-8') as src_file:
            try:
                self._data = json.load(src_file)
            except json.JSONDecodeError as err:
                log.exception(err)
                # Chain the original error so the parse failure is traceable.
                raise ProfilerRawFileException(
                    "Fail to parse cpu_utilization info file") from err
Example #13 (score: 0)
    def _search_file(self, profiling_id, device_id):
        """
        Search all framework files in raw profiling path.

        Args:
            profiling_id (str): The profiling ID.
            device_id (str): The device ID.

        Raises:
            ProfilerFileNotFoundException: If the framework files are not found.
        """
        # first search in the JOB dir, and if not, search in the sub directory
        # in the JOB
        self._search_file_from_job_path(device_id, search_in_sub_path=False)
        if self._backend_type is None:
            self._search_file_from_job_path(device_id, search_in_sub_path=True)
        self._search_file_from_data_path(profiling_id, device_id)

        if self._backend_type is None:
            raise ProfilerFileNotFoundException('Framework')
        self._framework_path['graph'].sort()
        self._framework_path['task'].sort()
    def _get_minddata_queue_step_time_info(self):
        """Get the sampling time information at the steps of the host queue"""
        analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
        queue_file_path = analyser.get_device_queue_file_path()
        queue_file_path = validate_and_normalize_path(
            queue_file_path, raise_key="Invalid device_queue file path")
        if not os.path.exists(queue_file_path):
            log.error('Did not find the device queue file: %s',
                      queue_file_path)
            raise ProfilerFileNotFoundException(
                msg='Did not find the device queue file:{}'.format(
                    queue_file_path))

        step_time_records = []
        with open(queue_file_path) as queue_file:
            for record in queue_file.readlines():
                fields = record.split()
                # fields[0]=="1":queue info, fields[1]:Connector capacity,
                # fields[2]:step_num, fields[3]:Connector size,
                # fields[4]:sampling time
                if fields and fields[0] == "1":
                    step_time_records.append([fields[2], fields[4]])
        return step_time_records
Example #15 (score: 0)
    def _get_file_content(self, device_type, file_type):
        """
        Get file content for different types of memory usage files.

        Args:
            device_type (str): Device type, e.g., GPU, Ascend.
            file_type (str): memory usage file type, e.g., summary, details.

        Returns:
            dict, file content corresponding to file_type.
        """
        file_path = self._get_file_path(device_type, file_type)
        if not os.path.exists(file_path):
            logger.error('Invalid file path. Please check the output path: %s', file_path)
            raise ProfilerFileNotFoundException(msg='Invalid memory file path.')

        try:
            with open(file_path, 'r') as f_obj:
                file_content = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when read memory file: %s', err)
            raise ProfilerIOException()

        return file_content