def _get_pipeline_path(self, source_dir):
    """
    Get the minddata pipeline file path.

    Args:
        source_dir (str): The minddata pipeline source dir.

    Returns:
        str, the minddata pipeline file path.
    """
    pipeline_path = os.path.join(
        source_dir,
        self._raw_pipeline_file_name.format(self._device_id))

    try:
        pipeline_path = validate_and_normalize_path(pipeline_path, 'profiler')
    except ValidationError:
        logger.warning('Minddata pipeline file is invalid.')
        raise ProfilerPathErrorException('Minddata pipeline file is invalid.')

    if not os.path.isfile(pipeline_path):
        logger.warning('The minddata pipeline file <%s> not found.', pipeline_path)
        raise ProfilerFileNotFoundException(pipeline_path)

    return pipeline_path

def get_proposer(self, proposer_type, *args):
    """
    Get the specified proposer according to the proposer type.

    Args:
        proposer_type (str): The proposer type.
        args (list): The parameters required for the specific proposer class.

    Returns:
        Proposer, the specified proposer instance.

    Examples:
        >>> proposer_type = 'step_trace'
        >>> proposer = ProposerFactory.instance().get_proposer(
        ...     proposer_type, self.profiling_dir, self.device_id)
    """
    logger.debug("The 'proposer_type' is %s, the 'args' is %s",
                 proposer_type, str(args))
    proposer_instance = None
    sub_name = proposer_type.split('_')
    proposer_class_name = ''.join([name.capitalize() for name in sub_name])
    proposer_class_name += 'Proposer'

    if hasattr(proposer_module, proposer_class_name):
        proposer_instance = getattr(proposer_module, proposer_class_name)(*args)
    else:
        logger.warning("The proposer class %s does not exist.",
                       proposer_class_name)
    return proposer_instance

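# A minimal sketch (not part of the module) of the naming convention used by
# get_proposer above: a snake_case proposer type maps to a CamelCase class
# name with a 'Proposer' suffix.
def _demo_proposer_class_name():
    proposer_type = 'step_trace'
    class_name = ''.join(
        part.capitalize() for part in proposer_type.split('_')) + 'Proposer'
    return class_name  # 'StepTraceProposer'
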
def _load(self):
    """Load data according to the parsed AICORE operator file."""
    op_detail_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicore_detail_time.format(self._device_id))
    framework_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_framework_info.format(self._device_id))
    if not os.path.isfile(op_detail_file_path):
        logger.warning('The file <%s> does not exist.', op_detail_file_path)
        return
    if not os.path.isfile(framework_file_path):
        logger.warning('The file <%s> does not exist.', framework_file_path)
        return

    framework_infos = dict()
    with open(framework_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)
        for info in csv_reader:
            framework_infos[info[3]] = self._convert_framework_field_type(info)

    with open(op_detail_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)
        for info in csv_reader:
            detail_info = self._get_op_detail_info(info, framework_infos)
            self._data.append(detail_info)

    del framework_infos

def _search_file(input_dir):
    """Search step trace file under specific input directory."""
    # validate input_dir
    if not os.path.isdir(input_dir):
        raise ProfilerPathErrorException(
            '{} does not exist or is not a dir'.format(input_dir))
    # get step trace files
    files = os.listdir(input_dir)
    step_trace_files = list(
        filter(
            lambda file: file.startswith('training_trace') and not file.endswith('.done'),
            files))
    # validate result
    if len(step_trace_files) > 1:
        # the format of file name is like
        # `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
        # use the $number as the sorted key
        try:
            step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
        except ValueError as err:
            log.warning("Unable to parse file names: %s. %s", step_trace_files, err)
            step_trace_files = []

    file_paths = [os.path.join(input_dir, file) for file in step_trace_files]
    log.info("Found %d step trace files.", len(file_paths))
    return file_paths

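# A minimal sketch (hypothetical file names) of the slice-number sort key used
# in _search_file above: the integer after the final underscore orders the slices.
def _demo_slice_sort():
    files = [
        'training_trace.46.dev.profiler_default_tag.1.slice_2',
        'training_trace.46.dev.profiler_default_tag.1.slice_0',
        'training_trace.46.dev.profiler_default_tag.1.slice_1',
    ]
    files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
    return files  # ordered slice_0, slice_1, slice_2
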
def _record_trace_event(self, step_trace):
    """Record trace event."""
    self._step_num += 1
    start_time = step_trace.get('start')
    end_time = step_trace.get('end')
    fp_time = step_trace.get('fp')
    bp_time = step_trace.get('bp')
    if not (start_time and end_time and fp_time and bp_time):
        log.warning("The step %d is missing basic time.", self._step_num)
        return
    if start_time == '-':
        start_time = fp_time
    row_data = {
        'step_num': self._step_num,
        'start_point': start_time,
        'end_point': end_time,
        'total': end_time - start_time,
        'fp_point': fp_time,
        'bp_point': bp_time,
        'iteration_interval': fp_time - start_time,
        'fp_and_bp': bp_time - fp_time,
        'tail': end_time - bp_time
    }
    # update reduce info
    self._update_reduce_info(step_trace, row_data)
    # save the row data
    if not self._header:
        self._header = list(row_data.keys())
    row_data_list = [row_data.get(header_name, 0) for header_name in self._header]
    self._result.append(row_data_list)

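# Worked example (made-up timestamps) of the per-step fields computed in
# _record_trace_event above: each duration is a difference of two time points.
def _demo_step_fields():
    start, fp, bp, end = 100.0, 102.0, 108.0, 110.0
    return {
        'total': end - start,              # 10.0
        'iteration_interval': fp - start,  # 2.0
        'fp_and_bp': bp - fp,              # 6.0
        'tail': end - bp,                  # 2.0
    }
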
def get_flops_summary(self):
    """
    Get flops summary information for UI display.

    Returns:
        json, the content of flops summary information.
    """
    summary_filename = self._flops_summary_filename.format(self._device_id)
    file_path = os.path.join(self._profiling_dir, summary_filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid flops summary path.')

    flops_summary = {}
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                flops_summary = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when reading flops summary file: %s', err)
            raise ProfilerIOException()
    else:
        logger.warning('No flops summary file. Please check the output path.')

    return flops_summary

def get_analyser_result(self, analyser_type, condition=None):
    """
    Get the query result of the specified analyser.

    Args:
        analyser_type (str): The analyser type.
        condition (dict): The query condition passed to the analyser. Default: None.

    Returns:
        dict, the analyser query result; an empty dict on failure.
    """
    logger.debug("The Proposer 'analyser_type' is %s, 'options' is %s",
                 str(analyser_type), str(condition))
    analyser_result = {}
    try:
        analyser = AnalyserFactory.instance().get_analyser(
            analyser_type, self.profiling_path, self.device_id)
        analyser_result = analyser.query(condition)
        logger.debug("The 'analyser_result' is %s, the 'condition' is %s.",
                     str(analyser_result), str(condition))
    except MindInsightException as e:
        logger.warning(e)
    return analyser_result

def execute(self):
    """Execute the parser, get result data, and write it to the output file."""
    if not os.path.exists(self._source_file_name):
        logger.info("Did not find the aicpu profiling source file.")
        return

    with open(self._source_file_name, 'rb') as ai_cpu_data:
        ai_cpu_str = str(ai_cpu_data.read().replace(b'\n\x00', b' ___ ')
                         .replace(b'\x00', b' ___ '))[2:-1]
        ai_cpu_lines = ai_cpu_str.split(" ___ ")

    node_list = list()
    ai_cpu_total_time_summary = 0
    # node serial number
    serial_number = 1
    for i in range(len(ai_cpu_lines) - 1):
        node_line = ai_cpu_lines[i]
        thread_line = ai_cpu_lines[i + 1]
        if "Node" in node_line and "Thread" in thread_line:
            # get the node data from node_line
            node_name = node_line.split(',')[0].split(':')[-1]
            run_v2_start = node_line.split(',')[1].split(':')[-1]
            compute_start = node_line.split(',')[2].split(':')[-1]
            mercy_start = node_line.split(',')[3].split(':')[-1]
            mercy_end = node_line.split(',')[4].split(':')[-1]
            run_v2_end = node_line.split(',')[5].split(':')[-1]
            # get total_time and dispatch_time from thread line
            total_time = thread_line.split(',')[-1].split('=')[-1].split()[0]
            dispatch_time = thread_line.split(',')[-2].split('=')[-1].split()[0]

            node_data = [serial_number, node_name, total_time, dispatch_time,
                         run_v2_start, compute_start, mercy_start, mercy_end,
                         run_v2_end]
            node_list.append(node_data)
            # calculate the total time
            ai_cpu_total_time_summary += int(total_time)
            # increase node serial number
            serial_number += 1
        elif "Node" in node_line and "Thread" not in thread_line:
            node_name = node_line.split(',')[0].split(':')[-1]
            logger.warning("Cannot find thread data for node %s.", node_name)

    node_list.append(["AI CPU Total Time(us):", ai_cpu_total_time_summary])

    if node_list:
        fwrite_format(self._output_filename, data_source=_dst_file_title,
                      is_print=True, is_start=True)
        fwrite_format(self._output_filename,
                      data_source=tabulate(node_list, _dst_file_column_title,
                                           tablefmt='simple'),
                      is_start=True, is_print=True)

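# A minimal sketch of how a "Node" line is parsed by execute above. The log
# line below is fabricated and its field labels are assumptions inferred from
# the split logic, not from AICPU documentation.
def _demo_parse_node_line():
    node_line = ('Node:GetNext-op1,run_v2_start:100,compute_start:101,'
                 'mercy_start:102,mercy_end:103,run_v2_end:104')
    # each comma-separated field is 'label:value'; keep the value part
    fields = [part.split(':')[-1] for part in node_line.split(',')]
    node_name, run_v2_start, compute_start = fields[0], fields[1], fields[2]
    return node_name, run_v2_start, compute_start  # ('GetNext-op1', '100', '101')
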
def _construct_time_point(self, name, start, duration):
    """Construct time point."""
    point = {}
    if start >= 0 and duration >= 0:
        point = {
            self._attr_ui_name: name,
            self._attr_ui_start: round(start, 4),
            self._attr_ui_duration: round(duration, 4)
        }
    else:
        log.warning("Invalid point info: name: %s, start: %s, duration: %s",
                    name, start, duration)
    return point

def _load(self):
    """Load data according to the parsed AICORE operator types file."""
    op_type_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicore_type_time.format(self._device_id))
    if not os.path.isfile(op_type_file_path):
        logger.warning('The file <%s> does not exist.', op_type_file_path)
        return

    with open(op_type_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)
        for info in csv_reader:
            self._data.append(self._convert_field_type(info))

def _load(self):
    """Load data according to the parsed minddata pipeline file."""
    pipeline_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_pipeline.format(self._device_id))
    if not os.path.isfile(pipeline_file_path):
        logger.warning('The file <%s> does not exist.', pipeline_file_path)
        return

    with open(pipeline_file_path, 'r') as file:
        csv.field_size_limit(sys.maxsize)
        csv_reader = csv.reader(file)
        _ = next(csv_reader)
        for info in csv_reader:
            self._data.append(self._convert_field_type(info))

def _update_reduce_info(step_trace, row_data):
    """Extract reduce info."""
    reduce_time = step_trace.get('reduce', {})
    for stream_id, time_points in reduce_time.items():
        time_point_num = len(time_points)
        if time_point_num % 2:
            log.warning("Stream %d has an odd number (%d) of reduce time points; "
                        "skip this stream.", stream_id, time_point_num)
            continue
        # consecutive time points are paired as (start, end)
        for index, point_id in enumerate(range(0, time_point_num, 2)):
            field_name = f'stream_{stream_id}_parallel_{index}'
            row_data[field_name + '_start_point'] = time_points[point_id]
            row_data[field_name + '_end_point'] = time_points[point_id + 1]
            row_data[field_name] = time_points[point_id + 1] - time_points[point_id]

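# A self-contained sketch (made-up values) of the pairing in _update_reduce_info
# above: consecutive time points of one stream form (start, end) pairs.
def _demo_reduce_pairing():
    row_data = {}
    reduce_time = {5: [1.0, 2.5, 4.0, 6.0]}  # stream id -> time points
    for stream_id, time_points in reduce_time.items():
        for index, point_id in enumerate(range(0, len(time_points), 2)):
            field_name = f'stream_{stream_id}_parallel_{index}'
            row_data[field_name + '_start_point'] = time_points[point_id]
            row_data[field_name + '_end_point'] = time_points[point_id + 1]
            row_data[field_name] = time_points[point_id + 1] - time_points[point_id]
    return row_data  # e.g. row_data['stream_5_parallel_1'] == 2.0
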
def _load(self):
    """Load data according to the parsed AICPU operator file."""
    aicpu_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicpu_time.format(self._device_id))
    aicpu_file_path = validate_and_normalize_path(
        aicpu_file_path, raise_key='Invalid aicpu file path.')
    if not os.path.isfile(aicpu_file_path):
        logger.warning('The file <%s> does not exist.', aicpu_file_path)
        return

    with open(aicpu_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for info in csv_reader:
            aicpu_info = self._convert_field_type(info)
            self._data.append(aicpu_info)

def _load(self):
    """Load data according to the parsed AICPU operator file."""
    aicpu_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicpu_time.format(self._device_id))
    aicpu_file_path = validate_and_normalize_path(
        aicpu_file_path, raise_key='Invalid aicpu file path.')
    if not os.path.isfile(aicpu_file_path):
        logger.warning('The file <%s> does not exist.', aicpu_file_path)
        return

    type_detail_cache = dict()
    with open(aicpu_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for item in csv_reader:
            op_type = item[1]
            info = type_detail_cache.get(op_type)
            if info:
                info.append(item)
            else:
                type_detail_cache[op_type] = [item]

    type_temp_detail_cache = dict()
    total_avg_time = 0
    result = []
    for key, value in type_detail_cache.items():
        exec_frequency = len(value)
        total_time_index = 2
        exec_avg_time = sum([float(i[total_time_index]) for i in value]) / exec_frequency
        exec_avg_time = round(exec_avg_time, 6)
        total_avg_time += exec_avg_time
        type_temp_detail_cache[key] = [key, exec_avg_time, exec_frequency]

    for key, value in type_temp_detail_cache.items():
        execution_time_index = 1
        percent = round((value[execution_time_index] / total_avg_time) * 100, 2)
        value.append(percent)
        result.append(value)

    self._data = result

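# Worked example (fabricated rows) of the per-type aggregation in _load above:
# two runs of one op type with total times 4.0 and 6.0 average to 5.0 with
# frequency 2; percent is the type's share of the summed averages.
def _demo_type_aggregation():
    rows = [['op1', 'Cast', '4.0'], ['op2', 'Cast', '6.0']]
    exec_frequency = len(rows)
    exec_avg_time = round(sum(float(r[2]) for r in rows) / exec_frequency, 6)
    return ['Cast', exec_avg_time, exec_frequency]  # ['Cast', 5.0, 2]
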
def _get_save_path(self, output_path):
    """
    Get the save path.

    Args:
        output_path (str): The output dir.

    Returns:
        str, the save path.
    """
    try:
        output_dir = validate_and_normalize_path(output_path, 'profiler')
    except ValidationError:
        logger.warning('Output path is invalid.')
        raise ProfilerPathErrorException('Output path is invalid.')
    if not os.path.isdir(output_dir):
        logger.warning('The output dir <%s> not found.', output_dir)
        raise ProfilerDirNotFoundException(output_dir)
    return os.path.join(
        output_dir, self._parsed_pipeline_file_name.format(self._device_id))

def parse(self):
    """
    Parse the minddata pipeline files.

    Raises:
        ProfilerRawFileException: If it fails to parse the raw file of
            minddata pipeline or the file is empty.
    """
    with open(self._pipeline_path, 'r') as file:
        try:
            pipeline_info = json.load(file)
        except (json.JSONDecodeError, TypeError) as err:
            logger.exception(err)
            raise ProfilerRawFileException(
                'Failed to parse the minddata pipeline file.')
        if not pipeline_info:
            logger.warning('The minddata pipeline file is empty.')
            raise ProfilerRawFileException(
                'The minddata pipeline file is empty.')

    self._parse_and_save(pipeline_info)

def validate_and_set_job_id_env(job_id_env):
    """
    Validate the job id and set it in environment.

    Args:
        job_id_env (str): The id to be set in the environment variable `JOB_ID`.

    Returns:
        int, the valid job id env.
    """
    if job_id_env is None:
        return job_id_env
    # get job_id_env in int type
    valid_id = to_int(job_id_env, 'job_id_env')
    # check the range of valid_id
    if valid_id and 255 < valid_id < sys.maxsize:
        os.environ['JOB_ID'] = job_id_env
    else:
        log.warning(
            "Invalid job_id_env %s. The value should be an int between 255 and %s. "
            "Use the default job id env instead.", job_id_env, sys.maxsize)
    return valid_id

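# Usage sketch (illustrative value) for validate_and_set_job_id_env above: an
# id in the open interval (255, sys.maxsize) is exported as JOB_ID; anything
# else only logs a warning. Note the demo mutates the process environment.
def _demo_job_id_env():
    valid_id = validate_and_set_job_id_env('256')
    assert os.environ.get('JOB_ID') == '256'
    return valid_id  # 256
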
def record_point_info(self, point_info, output_path):
    """
    Record point info into json.

    Args:
        point_info (dict): The point info about tag id and relative op name.
        output_path (str): The output path for saving point info.

    Returns:
        dict, parsed point info.
    """
    points = {
        'fp_start': point_info.get(self._fp_tag, ''),
        'bp_end': point_info.get(self._bp_tag, '')
    }
    try:
        with open(output_path, 'w') as json_file:
            json.dump(points, json_file)
        os.chmod(output_path, stat.S_IREAD)
    except (IOError, OSError) as err:
        log.warning('Failed to save point info. %s', err)
        raise ProfilerIOException()
    return points

def _iter_interval_analyze(self, step_trace_condition):
    """Get the proposals of iteration interval."""
    iter_interval_dict = OrderedDict()
    default_iter_interval_lst = [0]
    iter_interval_condition = step_trace_condition.get("iter_interval", {})
    analyser_result = self.get_analyser_result(
        self.__proposer_type, condition=iter_interval_condition)
    iter_interval_length_lst = analyser_result.get("info", {}).get(
        "iteration_interval", default_iter_interval_lst)
    logger.debug("The 'iter_interval_length_lst' is %s",
                 str(iter_interval_length_lst))

    # Check the iter_interval_length_lst.
    if not isinstance(iter_interval_length_lst, list) or not iter_interval_length_lst:
        logger.warning(
            "The 'iter_interval_length_lst' is %s, it is null or not a list",
            str(iter_interval_length_lst))
    else:
        if iter_interval_length_lst[0] > self.__step_trace_iter_interval_threshold:
            iter_interval_dict[self.__iter_interval_label] = [
                str(self.__step_trace_iter_interval_threshold)]

    self.__proposal_dict.update(iter_interval_dict)

def _load(self):
    """Load data according to the parsed AICORE operator file."""
    op_detail_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicore_detail_time.format(self._device_id))
    framework_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_framework_info.format(self._device_id))
    flops_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_flops.format(self._device_id))
    op_detail_file_path = validate_and_normalize_path(
        op_detail_file_path, raise_key='Invalid aicore_detail file path.')
    framework_file_path = validate_and_normalize_path(
        framework_file_path, raise_key='Invalid framework file path.')
    flops_file_path = validate_and_normalize_path(
        flops_file_path, raise_key='Invalid flops file path.')

    if not os.path.isfile(op_detail_file_path):
        logger.warning('The file <%s> does not exist.', op_detail_file_path)
        return
    if not os.path.isfile(framework_file_path):
        logger.warning('The file <%s> does not exist.', framework_file_path)
        return

    framework_infos = dict()
    with open(framework_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for info in csv_reader:
            framework_infos[info[3]] = self._convert_framework_field_type(info)

    flops_infos = dict()
    if os.path.isfile(flops_file_path):
        with open(flops_file_path, 'r') as f_obj:
            # skip the first line which is header info.
            next(f_obj)
            for line in f_obj:
                flops_line = line.strip().split(',')
                # flops_line[0] is full_op_name.
                flops_infos[flops_line[0]] = flops_line[1:]
    else:
        logger.warning('The file <%s> does not exist.', flops_file_path)

    with open(op_detail_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for info in csv_reader:
            detail_info = self._get_op_detail_info(info, framework_infos, flops_infos)
            self._data.append(detail_info)

    del framework_infos
    del flops_infos

def analyse(self):
    """
    Collect and analyse performance data, called after training or during training.

    Examples:
        >>> from mindinsight.profiler import Profiler
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        ...                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler(subgraph='all', is_detail=True,
        ...                     is_show_op_path=False, output_path='./data')
        >>> model = Model(train_network)
        >>> dataset = get_dataset()
        >>> model.train(2, dataset)
        >>> profiler.analyse()
    """
    try:
        from mindspore.communication.management import release
        release()
    except ImportError:
        logger.error("Profiling: fail to import release from mindspore.")

    job_id = self._get_profiling_job_id()
    logger.info("Profiling: job id is %s", job_id)

    source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
    # parse hwts.log.data.45.dev file, and get task profiling data
    hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
    hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
    hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
    result = hwtslog_parser.execute()
    if not result:
        logger.error("Profiling: fail to parse hwts log file.")
        return

    # parse Framework file, and get the relation of op and tasks
    framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
    framework_parser.parse()
    op_task_dict = framework_parser.to_task_id_full_op_name_dict()
    if not op_task_dict:
        logger.error("Profiling: fail to parse framework files.")
        return

    # get op compute time from hwts data and framework data, write output_op_compute_time.txt
    opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
    opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
    optime_parser = OPComputeTimeParser(
        hwts_output_filename, opcompute_output_filename,
        op_task_dict, self._output_path, self._dev_id)
    optime_parser.execute()

    # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
    output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
    output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
    aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
    aicpu_data_parser.execute()

    # Parsing minddata AICPU profiling
    MinddataParser.execute(source_path, self._output_path, self._dev_id)

    # parse minddata pipeline operator and queue
    try:
        pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
        pipeline_parser.parse()
    except MindInsightException as err:
        logger.warning(err.message)

    # analyse op compute time info
    try:
        self._analyser_op_info()
    except MindInsightException as err:
        logger.warning(err.message)

    # analyse step trace info
    try:
        self._analyse_step_trace(source_path, framework_parser)
    except MindInsightException as err:
        logger.warning(err.message)

    # analyse timeline info
    try:
        self._analyse_timeline()
    except (ProfilerIOException, ProfilerFileNotFoundException, ValidationError) as err:
        logger.warning('Fail to write timeline data: %s', err)