def __init__(self, **kwargs):
    # get device_id and device_target
    self._get_devid_and_devtarget()
    output_path = kwargs.pop("output_path", "./data")
    self._output_path = validate_and_normalize_path(output_path)
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)
        os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    else:
        logger.warning(
            "The target dir already exists. "
            "There may be some old profiling data, and it may be overwritten in the end.")

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

    if self._device_target and self._device_target == "GPU":
        from mindspore._c_expression import GPUProfiler
        self._gpu_profiler = GPUProfiler.get_instance()
        self._gpu_profiler.init(self._output_path)
        self._gpu_profiler.step_profiling_enable(True)
        if context.get_auto_parallel_context('device_num') > 1:
            self._dev_id = get_rank()
        os.environ['DEVICE_ID'] = str(self._dev_id)

        if kwargs:
            logger.warning("Parameters are not yet supported on GPU.")
    elif self._device_target and self._device_target == "Ascend":
        optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
        if not isinstance(optypes_not_deal, str):
            raise TypeError("The parameter optypes_not_deal must be str.")
        job_id = kwargs.pop("ascend_job_id", "")
        if kwargs:
            logger.warning("There are invalid parameters which take no effect.")

        os.environ['DEVICE_ID'] = self._dev_id
        os.environ['AICPU_PROFILING_MODE'] = 'true'

        # use the context interface to enable profiling (for MindSpore versions after 2020.5.21)
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")

        self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
        data_path = os.path.join(self._container_path, "data")
        data_path = validate_and_normalize_path(data_path)
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)

        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._profiling_job_id = job_id
        # add job id env through user input later
        self._job_id_env = 0

    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
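# A minimal usage sketch for the constructor above. It assumes this __init__
# belongs to the user-facing Profiler class and that an analyse() method exists
# to post-process the collected data; train_net(...) stands in for whatever
# training entry point the caller has. Kept as a comment because it depends on
# caller-side code that is not part of this module.
#
#     profiler = Profiler(output_path='./data')   # raw data lands in ./data/profiler
#     train_net(...)                              # run the job being profiled
#     profiler.analyse()                          # parse the collected raw files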
def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True):
    """
    Analyse step trace data and save the result.

    Args:
        source_path (str): The directory that contains the step trace original data.
        framework_parser (FrameworkParser): The framework parse instance.
        is_training_mode_flag (bool): Whether in training mode or not.
    """
    logger.info("Begin to parse step trace.")
    # construct output path
    step_trace_intermediate_file_path = os.path.join(
        self._output_path,
        f'step_trace_raw_{self._dev_id}_detail_time.csv')
    point_info_file_path = os.path.join(
        self._output_path, 'step_trace_point_info.json')
    step_trace_intermediate_file_path = validate_and_normalize_path(
        step_trace_intermediate_file_path)
    point_info_file_path = validate_and_normalize_path(point_info_file_path)

    if self._device_target and self._device_target == 'GPU':
        input_file_path = os.path.join(
            self._output_path, f'step_trace_profiling_{self._dev_id}.txt')
        parser = GpuStepTraceParser(
            input_dir=input_file_path,
            output_file_path=step_trace_intermediate_file_path,
            is_training_mode=is_training_mode_flag)
        parser.parse_and_save()
        point_info = parser.record_point_info(input_file_path, point_info_file_path)
    else:
        # whether to keep the first step
        skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
        point_info = framework_parser.point_info
        # recognize inference or training mode
        is_training_mode_flag = framework_parser.check_op_name("Gradients")
        # parse the step trace files and save the result to disk
        source_path = validate_and_normalize_path(source_path)
        parser = AscendStepTraceParser(
            input_dir=source_path,
            output_file_path=step_trace_intermediate_file_path,
            job_id=self._job_id_env,
            skip_first_step=skip_first_step_flag,
            is_training_mode=is_training_mode_flag)
        parser.update_tag_op_type_map(point_info)
        parser.parse_and_save()
        point_info = parser.record_point_info(point_info, point_info_file_path)
    # print parser result
    parser.show()
    logger.info("Finished saving the intermediate result: %s", step_trace_intermediate_file_path)
    logger.info("The point info is: %s", point_info)

    return point_info
def _get_output_path(self, kwargs):
    """Get output path of profiling data."""
    current_time = int(time.time())
    # To avoid getting different timestamps from different processes in multi-card
    # training, reuse an existing timestamp if its difference is less than 6 seconds.

    def _select_timestamp(dir_name, re_pattern, input_time):
        """Select the timestamp from current_time and the existing timestamps."""
        timestamp_diff_threshold = 6
        exist_timestamp_list = []
        select_time = input_time
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        for file_name in os.listdir(dir_name):
            match_res = re_pattern.match(file_name)
            if match_res:
                exist_timestamp_list.append(int(match_res.group(1)))
        if exist_timestamp_list:
            time_diff_list = [input_time - timestamp for timestamp in exist_timestamp_list]
            min_time_diff = min(time_diff_list)
            if min_time_diff <= timestamp_diff_threshold:
                select_time = exist_timestamp_list[time_diff_list.index(min_time_diff)]
        return select_time

    if "output_path" not in kwargs:
        selected_timestamp = _select_timestamp(
            os.getcwd(), re.compile(r'data-(\d+)'), current_time)
        output_path = f"data-{selected_timestamp}"
        self._output_path = validate_and_normalize_path(output_path)
    else:
        output_path = kwargs.pop("output_path")
        self._output_path = validate_and_normalize_path(output_path)
        selected_timestamp = _select_timestamp(
            self._output_path, re.compile(r'profiler-(\d+)'), current_time)
        self._output_path = os.path.join(self._output_path, f"profiler-{selected_timestamp}")

    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)
        os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    else:
        logger.warning(
            "The target dir already exists. "
            "There may be some old profiling data, and it may be overwritten in the end.")
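# Standalone sketch of the timestamp-selection rule used in _get_output_path:
# reuse an existing directory timestamp when it is within 6 seconds of the
# current one, so multiple processes in multi-card training agree on a single
# output directory. Illustrative only; not part of the profiler itself.
import os
import re

def select_timestamp(dir_name, re_pattern, input_time, threshold=6):
    """Return an existing timestamp within `threshold` seconds of `input_time`, else `input_time`."""
    existing = []
    if os.path.exists(dir_name):
        for file_name in os.listdir(dir_name):
            match = re_pattern.match(file_name)
            if match:
                existing.append(int(match.group(1)))
    diffs = [input_time - ts for ts in existing]
    if diffs and min(diffs) <= threshold:
        return existing[diffs.index(min(diffs))]
    return input_time

# Example: if ./profiler-1600000000 exists and the current time is 1600000003,
# select_timestamp('.', re.compile(r'profiler-(\d+)'), 1600000003) -> 1600000000.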
def get_min_cycle_counter(self):
    """
    Get minimum cycle counter.

    Returns:
        float, the minimum value of the cycle counter.
    """
    file_path = os.path.join(
        self._profiling_dir,
        self._min_cycle_counter_file_path.format(self._device_id))
    file_path = validate_and_normalize_path(file_path)

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                min_cycle_counter = f_obj.read()
                min_cycle_counter = float(min_cycle_counter) \
                    if not min_cycle_counter == 'inf' else 0
        except (IOError, OSError) as err:
            logger.error('Error occurred when reading the minimum cycle counter: %s', err)
            raise ProfilerIOException
    else:
        min_cycle_counter = 0
        logger.info("No min cycle counter recorded.")

    return min_cycle_counter
def _get_and_validate_path(self, file_name):
    """Generate op or activity file path from file name, and validate this path."""
    file_path = os.path.join(self._profiling_dir, file_name.format(self._device_id))
    file_path = validate_and_normalize_path(file_path)

    return file_path
def _get_pipeline_path(self, source_dir):
    """
    Get the minddata pipeline file path.

    Args:
        source_dir (str): The minddata pipeline source dir.

    Returns:
        str, the minddata pipeline file path.
    """
    pipeline_path = os.path.join(
        source_dir, self._raw_pipeline_file_name.format(self._device_id))
    try:
        pipeline_path = validate_and_normalize_path(pipeline_path)
    except RuntimeError:
        logger.warning('Minddata pipeline file is invalid.')
        raise ProfilerPathErrorException('Minddata pipeline file is invalid.')
    if not os.path.isfile(pipeline_path):
        logger.warning('The minddata pipeline file <%s> not found.', pipeline_path)
        raise ProfilerFileNotFoundException(pipeline_path)

    return pipeline_path
def write_timeline_to_json_by_limitation(self, size_limit):
    """Write timeline to json by limitation."""
    display_filename = self._display_filename.format(self._device_id)
    display_file_path = os.path.join(self._profiling_dir, display_filename)
    display_file_path = validate_and_normalize_path(display_file_path)

    length = len(self._timeline_meta)
    try:
        with open(display_file_path, 'w') as json_file:
            json_file.write('[')
            for index, item in enumerate(self._timeline_meta):
                json.dump(item, json_file)
                file_size = os.path.getsize(display_file_path)
                if file_size > size_limit:
                    break
                if index == length - 1:
                    break
                json_file.write(',')
            json_file.write(']')
        os.chmod(display_file_path, stat.S_IREAD | stat.S_IWRITE)
    except (IOError, OSError) as err:
        logger.error('Error occurred when writing the timeline display file: %s', err)
        raise ProfilerIOException
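# Minimal standalone sketch of the size-capped JSON writing pattern used in
# write_timeline_to_json_by_limitation: items are appended one by one and the
# loop stops once the on-disk size exceeds the limit, always leaving a valid
# (possibly truncated) JSON array. The flush() call is added here so that
# os.path.getsize() reflects what has been written so far.
import json
import os

def write_json_with_limit(items, path, size_limit):
    """Write `items` as a JSON array to `path`, stopping once `size_limit` bytes are exceeded."""
    with open(path, 'w') as json_file:
        json_file.write('[')
        for index, item in enumerate(items):
            json.dump(item, json_file)
            json_file.flush()
            if os.path.getsize(path) > size_limit or index == len(items) - 1:
                break
            json_file.write(',')
        json_file.write(']')

# Example: write_json_with_limit([{'op': i} for i in range(1000)], 'timeline.json', 4096)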
def _parse_aicpu_time(self):
    """Parse the parsed AICPU operator time file."""
    aicpu_file = os.path.join(
        self._profiling_dir, self._file_name_aicpu_time.format(self._device_id))
    aicpu_file = validate_and_normalize_path(aicpu_file)
    if not os.path.isfile(aicpu_file):
        return
    save_file_name = 'aicpu_intermediate_' + self._device_id + '.csv'
    save_file_path = os.path.join(self._profiling_dir, save_file_name)
    with open(aicpu_file, 'r') as src_file:
        row = src_file.readline()
        if not row.startswith('serial_number'):
            return
        with open(save_file_path, 'w') as save_file:
            csv_writer = csv.writer(save_file)
            csv_writer.writerow(self._header_aicpu)
            while True:
                row = src_file.readline()
                if not row:
                    break
                infos = row.split()
                if infos[0] == 'AI':
                    continue
                csv_writer.writerow(infos)
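# Standalone sketch of the text-to-CSV conversion in _parse_aicpu_time:
# whitespace-separated rows are re-emitted as CSV under a fixed header, and
# the trailing summary lines that start with 'AI' are skipped. The header
# names below are placeholders; the real ones live in self._header_aicpu.
import csv

def rows_to_csv(rows, header, out_path):
    """Convert whitespace-separated `rows` to a CSV file at `out_path`."""
    with open(out_path, 'w', newline='') as save_file:
        writer = csv.writer(save_file)
        writer.writerow(header)
        for row in rows:
            infos = row.split()
            if infos and infos[0] != 'AI':
                writer.writerow(infos)

# Example:
#     rows_to_csv(['1 op_a 10', 'AI CPU total 10', '2 op_b 20'],
#                 ['serial_number', 'op_name', 'time'], 'aicpu_intermediate.csv')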
def _get_step_end_tag_id(self, source_files):
    """
    Get the step end tag id. This id was 255 before 2020.12.16 and is 65535 now.

    The file is an old version if there is no 65535 tag id; otherwise it is a new version.
    """
    step_num = 0
    source_file = validate_and_normalize_path(source_files[0])
    try:
        with open(source_file, 'rb') as handler:
            content = handler.read()
            for pos in range(0, len(content), 20):
                next_event = self._get_trace_struct(content[pos:pos + self._event_size])
                # 1 means bp_start.
                if next_event.tag_id == 1:
                    step_num += 1
                # Step end tag id is 65535 in the new version.
                if next_event.tag_id == 65535:
                    self._step_end_tag_id = next_event.tag_id
                # We only search the first step to find whether there is a 65535 tag id.
                if step_num == 2:
                    break
    except (IOError, OSError) as err:
        log.warning('Failed to read %s while getting the end tag id: %s', source_file, err)
        raise ProfilerIOException
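# Illustrative sketch of scanning fixed-size binary trace records for the step
# end tag id, as _get_step_end_tag_id does above. It assumes 20-byte records
# with a little-endian uint16 tag id at a hypothetical offset; the real layout
# is defined by self._get_trace_struct, which is not shown here.
import struct

RECORD_SIZE = 20
TAG_OFFSET = 0  # hypothetical offset of the tag id inside a record

def find_step_end_tag(content):
    """Return 65535 if the new-format step end tag is present, else the old default 255."""
    for pos in range(0, len(content) - RECORD_SIZE + 1, RECORD_SIZE):
        (tag_id,) = struct.unpack_from('<H', content, pos + TAG_OFFSET)
        if tag_id == 65535:
            return 65535  # new-format file
    return 255  # old-format default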
def _write_timeline_data_into_file(self, timeline_data):
    """
    Write the timeline information into the file, including
    operator name, stream id, start time and duration.

    Args:
        timeline_data (list): The metadata to be written into the file.
            [
                ['op_name_1', 'stream_id_1', 'start_time_1', 'duration_1'],
                ['op_name_2', 'stream_id_2', 'start_time_2', 'duration_2'],
                [...]
            ]
    """
    # sort by start time
    timeline_data.sort(key=lambda x: float(x[2]))
    filename = 'output_timeline_data_{}.txt'.format(self._device_id)
    file_path = os.path.join(self._output_path, filename)
    file_path = validate_and_normalize_path(file_path)

    # write to file
    try:
        with open(file_path, 'w') as f_obj:
            f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
            for timeline in timeline_data:
                timeline = [str(item) for item in timeline]
                f_obj.write(','.join(timeline) + '\n')
        os.chmod(file_path, stat.S_IREAD | stat.S_IWRITE)
    except (IOError, OSError) as err:
        logger.error('Error occurred when writing the intermediate timeline file: %s', err)
        raise ProfilerIOException
def _parse_point_files(self):
    """Parse the framework point files."""
    for path in self._framework_path['point']:
        path = validate_and_normalize_path(path)
        with open(path, 'r') as file:
            for point_info in file:
                infos = point_info.strip('\n').split(' ')
                self._point_info[int(infos[0])] = infos[1]
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False,
             output_path='./data', optypes_to_deal='', optypes_not_deal='Variable',
             job_id=""):
    # get device_id and device_target
    self._get_devid_and_devtarget()
    self._output_path = validate_and_normalize_path(output_path)
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)
    else:
        logger.warning(
            "The target dir already exists. "
            "There may be some old profiling data, and it may be overwritten in the end.")

    if self._device_target and self._device_target == "GPU":
        from mindspore._c_expression import GPUProfiler
        self._gpu_profiler = GPUProfiler.get_instance()
        self._gpu_profiler.init(self._output_path)
        self._gpu_profiler.step_profiling_enable(True)
    # "Davinci" is the legacy name of the Ascend target.
    elif self._device_target and self._device_target in ("Ascend", "Davinci"):
        self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
        data_path = os.path.join(self._container_path, "data")
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
        os.environ['DEVICE_ID'] = self._dev_id
        os.environ['AICPU_PROFILING_MODE'] = 'true'
        os.environ['PROFILING_DIR'] = str(self._container_path)
        # use the context interface to enable profiling (for MindSpore versions after 2020.5.21)
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")

    self._subgraph = check_subgraph(subgraph)
    self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
    self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
    self._detail = check_bool(is_detail, 'is_detail')
    self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
    self._profiling_job_id = job_id
    # add job id env through user input later
    self._job_id_env = 0
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
def __init__(self, hwts_output_file, output_filename, op_task_info,
             output_path, device_id):
    hwts_output_file = validate_and_normalize_path(hwts_output_file)
    self._hwts_output_file = hwts_output_file
    self._output_filename = output_filename
    self._op_task_info = op_task_info
    self._output_path = output_path
    self._device_id = device_id
    self._min_cycle_counter = float("inf")
def _get_and_validate_path(self, file_name):
    """Generate op or activity file path from file name, and validate this path."""
    file_path = os.path.join(self._profiling_dir, file_name.format(self._device_id))
    file_path = validate_and_normalize_path(file_path)
    if not os.path.exists(file_path):
        logger.error("Failed to find the parsed timeline file %s.", file_path)
        raise ProfilerFileNotFoundException('parsed timeline file')

    return file_path
def parse_minddata_aicpu_data(minddata_aicpu_source_path):
    """
    Parse minddata get_next info which contains queue size and execute time.

    Args:
        minddata_aicpu_source_path (str): the source file path.

    Returns:
        list[Union[str, float]], the converted data.
    """
    result = list()
    try:
        minddata_aicpu_source_path = validate_and_normalize_path(minddata_aicpu_source_path)
        with open(minddata_aicpu_source_path) as source_data_file:
            source_data = source_data_file.read()
            step_data = source_data.split("\x00")
            for one_step in step_data:
                if one_step:
                    node_info = one_step.split(", ")
                    node_name, node_start, node_end, queue_size = "", 0, 0, 0
                    if node_info:
                        node_name = node_info[0].replace("Node:", "")
                    if len(node_info) > 3 and "queue" in node_info[1]:
                        queue_size = node_info[1].replace("queue size:", "")
                        queue_size = int(queue_size) if queue_size.isdigit() else queue_size
                        node_start = node_info[2].replace("Run start:", "")
                        node_start = int(node_start) if node_start.isdigit() else node_start
                        node_end = node_info[3].replace("Run end:", "")
                        node_end = int(node_end) if node_end.isdigit() else node_end
                    elif len(node_info) > 3 and "Run" in node_info[1]:
                        queue_size = node_info[3].replace("queue size:", "")
                        queue_size = int(queue_size) if queue_size.isdigit() else queue_size
                        node_start = node_info[1].replace("Run start:", "")
                        node_start = int(node_start) if node_start.isdigit() else node_start
                        node_end = node_info[2].replace("Run end:", "")
                        node_end = int(node_end) if node_end.isdigit() else node_end
                    one_step_list = [node_name, node_start, node_end, queue_size]
                    result.append(one_step_list)
    except OSError:
        logger.error("Failed to open the get_next profiling file.")

    return result
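# A small illustrative parser for a single get_next record of the form
# "Node:xxx, Run start:123, Run end:456, queue size:7". Unlike the function
# above, which checks both field orders explicitly, this sketch collects the
# fields into a dict so the order no longer matters.
def parse_one_step(one_step):
    """Parse one get_next record into [node_name, start, end, queue_size]."""
    fields = {}
    for part in one_step.split(', '):
        key, _, value = part.partition(':')
        fields[key] = value
    return [
        fields.get('Node', ''),
        int(fields.get('Run start', 0)),
        int(fields.get('Run end', 0)),
        int(fields.get('queue size', 0)),
    ]

# Example: parse_one_step('Node:GetNext, Run start:100, Run end:130, queue size:2')
# -> ['GetNext', 100, 130, 2]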
def _load_point_info(self):
    """Load point info."""
    file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json')
    file_path = validate_and_normalize_path(file_path)
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                self._point_info = json.load(file)
            except (json.JSONDecodeError, TypeError) as err:
                logger.warning(err)
                raise ProfilerRawFileException('Failed to parse the point info file.')
def _write_memory_files(self, filename, content):
    """Write the summary and top breakdowns of memory usage."""
    file_path = os.path.join(self._profiling_dir, filename)
    file_path = validate_and_normalize_path(file_path)

    try:
        with open(file_path, 'w') as json_file:
            json.dump(content, json_file)
        os.chmod(file_path, stat.S_IREAD | stat.S_IWRITE)
    except (IOError, OSError) as err:
        logger.error('Failed to write the memory file.\n%s', err)
        raise ProfilerIOException
def _get_file_path(self):
    """Get the proto file path."""
    file_path = os.path.join(
        self._profiling_dir, self._proto_file_path.format(self._device_id))
    file_path = validate_and_normalize_path(file_path)

    if not os.path.exists(file_path):
        msg = 'The memory file does not exist!'
        logger.error(msg)
        raise ProfilerFileNotFoundException(msg=msg)

    return file_path
def _aicore_detail_data_load(self):
    """Load data according to the parsed AICORE operator file."""
    op_detail_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicore_detail_info.format(self._device_id))
    framework_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_framework.format(self._device_id))
    op_detail_file_path = validate_and_normalize_path(op_detail_file_path)
    framework_file_path = validate_and_normalize_path(framework_file_path)
    if not os.path.isfile(op_detail_file_path):
        logger.warning('The file <%s> does not exist.', op_detail_file_path)
        return
    if not os.path.isfile(framework_file_path):
        logger.warning('The file <%s> does not exist.', framework_file_path)
        return

    framework_infos = dict()
    with open(framework_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)  # skip the header row
        for info in csv_reader:
            framework_infos[info[3]] = [
                info[3], info[4], info[5], info[6],
                json.loads(info[7]) if info[7] else None]

    with open(op_detail_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)  # skip the header row
        for info in csv_reader:
            framework_info = framework_infos.get(info[0])
            self._aicore_detail_data.append([
                framework_info[1], framework_info[2], float(info[1]),
                framework_info[3], framework_info[0], framework_info[4]])

    del framework_infos
def _parse(self, source_file):
    """Parse source step trace files."""
    log.info("Start to parse step trace file.")
    fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
    reduce_start = 4
    start_time, end_time = 0, 1
    STEP_TRACE_POINT_COUNT = 3

    source_file = validate_and_normalize_path(source_file)
    try:
        with open(source_file, 'r') as f:
            lines = f.readlines()
            if len(lines) < STEP_TRACE_POINT_COUNT:
                raise ProfilerRawFileException(
                    f"Failed to parse the {source_file} file. The FP_POINT/BP_POINT/ITER_END_POINT "
                    f"are not recognized correctly. Try setting the environment variables "
                    f"'PROFILING_FP_START' and 'PROFILING_BP_END' to solve this problem, e.g. "
                    f"'export PROFILING_FP_START=Default/xxx/Conv2d-op1'.")
            step_trace_info_all = [line.strip().split()[1:] for line in lines]
            num_of_step = len(step_trace_info_all[0])
            for step_trace_point in step_trace_info_all:
                if len(step_trace_point) != num_of_step:
                    raise ProfilerRawFileException(
                        f"Failed to parse the {source_file} file because the profiled "
                        f"step numbers of the FP/BP/ITER_END points are not equal.")
            iter_start_info = [step_trace_info_all[fp_start][0]] + \
                step_trace_info_all[iter_end][:num_of_step]
            step_trace_info_all.insert(iter_start, iter_start_info)
    except (IOError, OSError) as err:
        log.warning('Failed to read %s: %s', source_file, err)
        raise ProfilerIOException

    for step_num in range(num_of_step):
        step_trace = {
            'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
            'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
            'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
            'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
            'reduce': {}
        }
        num_of_step_point = len(step_trace_info_all)
        if num_of_step_point > reduce_start:
            reduce_info = {}
            reduce_time_info = []
            for reduce_idx in range(reduce_start, num_of_step_point):
                cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                reduce_time_info += cur_reduce_time.split(',')
            reduce_info['ops'] = reduce_time_info
            step_trace['reduce'] = reduce_info
        self._record_trace_event(step_trace)
    self._record_average_info()
    log.info("Finished parsing the step trace file.")
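# Illustrative post-processing of one step_trace dict as built by _parse above:
# derive the interval, forward-plus-backward, and tail durations from the four
# recorded timestamps. The key names follow the dict constructed in _parse;
# the derived metric names follow the conventional step-trace decomposition.
def step_durations(step_trace):
    """Split one step into iteration interval, fp+bp time and tail time."""
    return {
        'iteration_interval': step_trace['fp'] - step_trace['start'],
        'fp_and_bp': step_trace['bp'] - step_trace['fp'],
        'tail': step_trace['end'] - step_trace['bp'],
        'total': step_trace['end'] - step_trace['start'],
    }

# Example: step_durations({'start': 0, 'fp': 10, 'bp': 90, 'end': 100, 'reduce': {}})
# -> {'iteration_interval': 10, 'fp_and_bp': 80, 'tail': 10, 'total': 100}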
def _get_file_path(self):
    """Get the proto file path."""
    file_path = os.path.join(
        self._profiling_dir, self._proto_file_path.format(self._device_id))
    file_path = validate_and_normalize_path(file_path)

    if not os.path.exists(file_path):
        logger.warning(
            'The memory file does not exist! Please ignore the warning '
            'if you are running heterogeneous training.')
        raise ProfilerFileNotFoundException(msg=file_path)

    return file_path
def _aicore_trace_data_load(self):
    """Load data according to the parsed AICORE operator types file."""
    file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
    if not file_path:
        logger.error("Failed to find parsed trace time file.")
        raise ProfilerFileNotFoundException('parsed step trace time file')
    file_path = validate_and_normalize_path(file_path)
    with open(file_path, 'r') as handle:
        csv_reader = csv.reader(handle)
        self.__column__ = next(csv_reader)
        self._aicore_trace_data = list(csv_reader)
    self._size = len(self._aicore_trace_data) - 1
    self._load_point_info()
def _parse(self, source_file):
    """Parse source step trace files."""
    log.info("Start to parse step trace file.")
    fp_start, bp_end, iter_end, iter_start = 0, 1, 2, 3
    reduce_start = 4
    start_time, end_time = 0, 1

    source_file = validate_and_normalize_path(source_file)
    try:
        with open(source_file, 'r') as f:
            lines = f.readlines()
            step_trace_info_all = [line.strip().split()[1:] for line in lines]
            num_of_step = len(step_trace_info_all[0])
            iter_start_info = [step_trace_info_all[fp_start][0]] + \
                step_trace_info_all[iter_end][:num_of_step]
            step_trace_info_all.insert(iter_start, iter_start_info)
    except (IOError, OSError) as err:
        log.warning('Failed to read %s: %s', source_file, err)
        raise ProfilerIOException

    for step_num in range(num_of_step):
        step_trace = {
            'start': int(step_trace_info_all[iter_start][step_num].split(',')[start_time]),
            'fp': int(step_trace_info_all[fp_start][step_num].split(',')[start_time]),
            'bp': int(step_trace_info_all[bp_end][step_num].split(',')[end_time]),
            'end': int(step_trace_info_all[iter_end][step_num].split(',')[end_time]),
            'reduce': {}
        }
        num_of_step_point = len(step_trace_info_all)
        if num_of_step_point > reduce_start:
            reduce_info = {}
            reduce_time_info = []
            for reduce_idx in range(reduce_start, num_of_step_point):
                cur_reduce_time = step_trace_info_all[reduce_idx][step_num]
                reduce_time_info += cur_reduce_time.split(',')
            reduce_info['ops'] = reduce_time_info
            step_trace['reduce'] = reduce_info
        self._record_trace_event(step_trace)
    self._record_average_info()
    log.info("Finished parsing the step trace file.")
def _analyse_step_trace(self, source_path, framework_parser):
    """
    Analyse step trace data and save the result.

    Args:
        source_path (str): The directory that contains the step trace original data.
        framework_parser (FrameworkParser): The framework parse instance.
    """
    logger.info("Begin to parse step trace.")
    # construct output path
    step_trace_intermediate_file_path = os.path.join(
        self._output_path,
        f'step_trace_raw_{self._dev_id}_detail_time.csv')
    point_info_file_path = os.path.join(
        self._output_path, 'step_trace_point_info.json')
    step_trace_intermediate_file_path = validate_and_normalize_path(
        step_trace_intermediate_file_path)
    point_info_file_path = validate_and_normalize_path(point_info_file_path)
    # whether to keep the first step
    skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
    point_info = framework_parser.point_info
    # parse the step trace files and save the result to disk
    source_path = validate_and_normalize_path(source_path)
    parser = StepTraceParser(
        input_dir=source_path,
        output_file_path=step_trace_intermediate_file_path,
        job_id=self._job_id_env,
        skip_first_step=skip_first_step_flag)
    parser.update_tag_op_type_map(point_info)
    parser.parse_and_save()
    point_info = parser.record_point_info(point_info, point_info_file_path)
    # print parser result
    parser.show()
    logger.info("Finished saving the intermediate result: %s", step_trace_intermediate_file_path)
    logger.info("The point info is: %s", point_info)
def _parse(self, source_files):
    """Parse source step trace files."""
    log.info("Start to parse step trace file.")
    event_info = {}

    for source_file in source_files:
        source_file = validate_and_normalize_path(source_file)
        with open(source_file, 'rb') as handler:
            content = handler.read()
            for step_trace in self._get_next_step_trace(content, event_info):
                if self._skip_first_step:
                    self._skip_first_step = False
                    continue
                self._record_trace_event(step_trace)
    self._record_average_info()
    log.info("Finished parsing the step trace file.")
def _aicore_data_load(self):
    """Load data according to the parsed AICORE operator types file."""
    op_type_file_path = os.path.join(
        self._profiling_dir,
        self._file_name_aicore_type_time.format(self._device_id))
    op_type_file_path = validate_and_normalize_path(op_type_file_path)
    if not os.path.isfile(op_type_file_path):
        logger.warning('The file <%s> does not exist.', op_type_file_path)
        return

    with open(op_type_file_path, 'r') as file:
        csv_reader = csv.reader(file)
        _ = next(csv_reader)  # skip the header row
        for info in csv_reader:
            self._aicore_data.append(
                [info[0], float(info[1]), int(info[2]), float(info[3])])
def write_timeline_summary(self):
    """Write timeline summary to json."""
    timeline_summary_file_path = os.path.join(
        self._profiling_dir,
        self._timeline_summary_filename.format(self._device_id))
    timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)

    try:
        with open(timeline_summary_file_path, 'w') as json_file:
            json.dump(self._timeline_summary, json_file)
        os.chmod(timeline_summary_file_path, stat.S_IREAD | stat.S_IWRITE)
    except (IOError, OSError) as err:
        logger.error('Error occurred when writing the timeline summary file: %s', err)
        raise ProfilerIOException
def _parse_task_files(self):
    """Parse the framework task files."""
    for path in self._framework_path['task']:
        path = validate_and_normalize_path(path)
        with open(path, 'r') as file:
            for task_info in file:
                infos = task_info.strip('\n').split(' ')
                infos = infos[1:] if len(infos) == 5 else infos
                # key is op name; values are task id, stream id, block_dim
                self._task_cache[infos[0]] = [infos[2], infos[3], infos[1]]

                # If the task id is less than the task id threshold, the
                # stream id and task id together identify an operator.
                task_id = infos[2]
                if int(task_id) < self._task_id_threshold:
                    task_id = '_'.join([infos[3], task_id])
                self._task_id_full_op_name_dict[task_id] = infos[0]
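# Standalone sketch of the task-id normalization used in _parse_task_files:
# below the threshold a task id is only unique within its stream, so it is
# prefixed with the stream id; at or above the threshold it is treated as
# globally unique. The threshold value here is hypothetical; the real one is
# self._task_id_threshold on the parser class.
TASK_ID_THRESHOLD = 25000  # hypothetical value for illustration

def full_task_id(stream_id, task_id, threshold=TASK_ID_THRESHOLD):
    """Return the key used to map a task back to its operator name."""
    if int(task_id) < threshold:
        return '_'.join([str(stream_id), str(task_id)])
    return str(task_id)

# Examples: full_task_id(5, 12) -> '5_12'; full_task_id(5, 30000) -> '30000'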
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False,
             output_path='./data', optypes_to_deal='', optypes_not_deal='Variable',
             job_id=""):
    # get device_id and device_target
    self._get_devid_and_devtarget()
    self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
    data_path = os.path.join(self._container_path, "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path, exist_ok=True)
    self._output_path = validate_and_normalize_path(output_path)
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
    os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
    os.environ['DEVICE_ID'] = self._dev_id
    os.environ['AICPU_PROFILING_MODE'] = 'true'
    os.environ['PROFILING_DIR'] = str(self._container_path)
    # use the context interface to enable profiling (for MindSpore versions after 2020.5.21)
    context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")

    self._subgraph = check_subgraph(subgraph)
    self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
    self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
    self._detail = check_bool(is_detail, 'is_detail')
    self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
    self._profiling_job_id = job_id
    # add job id env through user input later
    self._job_id_env = 0
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
def _get_save_path(self, output_path):
    """
    Get the save path.

    Args:
        output_path (str): The output dir.

    Returns:
        str, the save path.
    """
    try:
        output_dir = validate_and_normalize_path(output_path)
    except ValidationError:
        logger.warning('Output path is invalid.')
        raise ProfilerPathErrorException('Output path is invalid.')
    if not os.path.isdir(output_dir):
        logger.warning('The output dir <%s> not found.', output_dir)
        raise ProfilerDirNotFoundException(output_dir)

    return os.path.join(
        output_dir, self._parsed_pipeline_file_name.format(self._device_id))