def _get_total_step_num(self):
    """
    Get the total number of train steps.

    The profiler directory of the first host in `self._host_ips_dir` is
    scanned and the first regular (non-symlink) file whose name starts with
    `step_trace_raw` is used.

    Returns:
        The first comma-separated field of the penultimate line of that file
        (a str, exactly as read), or the int 0 when no such file is found or
        the file has fewer than two lines.

    Raises:
        ProfilerDirNotFoundException: If the profiler directory does not exist.
    """
    total_step_num = 0
    # Take the data of one of the machines to get the total number of steps.
    host_ip_dir = self._host_ips_dir[0]
    target_dir_path = os.path.join(
        self._cluster_profiler_dir, 'cluster_profiler', host_ip_dir, 'profiler')
    target_dir_path = validate_and_normalize_path(
        target_dir_path, raise_key="Invalid profiler dir path.")
    if not os.path.exists(target_dir_path):
        log.error('Did not find cluster_profiler dir : %s', target_dir_path)
        raise ProfilerDirNotFoundException(
            msg='Did not find cluster_profiler dir:{}'.format(
                target_dir_path))

    # The loop leaves early, so the scandir iterator would otherwise stay open
    # until garbage collection; the context manager releases it promptly.
    with os.scandir(target_dir_path) as entries:
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('step_trace_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The penultimate line represents the information of the last step.
                # The step num index is 0.
                if len(lines) > 1:
                    total_step_num = lines[-2].split(',')[0]
                break
    return total_step_num
def _get_step_trace_info(self, host_ip, device_id, step_num):
    """
    Read the step trace timings of one step for a given host and device.

    Args:
        host_ip: Host directory name under `cluster_profiler`.
        device_id: Device id used to build the csv file name.
        step_num: Step to look up; the value 0 selects the last line of the
            file, which the original note describes as the average.

    Returns:
        list, `[iteration_interval, fp_and_bp, tail]` taken from columns
        6, 7 and 8 of the selected row, each divided by 1e5.

    Raises:
        ProfilerFileNotFoundException: If the step trace file does not exist.
    """
    csv_name = 'step_trace_raw_{}_detail_time.csv'.format(device_id)
    csv_path = validate_and_normalize_path(
        os.path.join(self._cluster_profiler_dir, 'cluster_profiler',
                     host_ip, 'profiler', csv_name),
        raise_key="Invalid step trace file path.")
    if not os.path.exists(csv_path):
        log.error('Did not find the file: %s', csv_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the file:{}'.format(csv_path))

    wanted_step = str(step_num)
    selected_row = list()
    with open(csv_path, 'r') as src_file:
        raw_rows = src_file.readlines()

    if wanted_step == '0':
        # Step 0 means the average value, recorded on the last line of the file.
        selected_row = raw_rows[-1].strip('\n').split(',')
    else:
        # No early exit: when several rows share the step number the last one wins.
        for raw_row in raw_rows:
            fields = raw_row.strip('\n').split(',')
            if fields[0] == wanted_step:
                selected_row = fields

    # Columns 6, 7, 8: iteration_interval, fp_and_bp and tail time.
    # Per the original note, dividing by 1e5 turns the unit into milliseconds.
    return [float(selected_row[column]) / 1e5 for column in (6, 7, 8)]
def get_memory_usage_breakdowns(self, device_type, graph_id, node_id):
    """
    Get memory usage breakdowns for each node.

    Args:
        device_type (str): Device type, e.g., GPU, Ascend.
        graph_id (int): Graph id.
        node_id (int): Node id.

    Returns:
        json, the content of memory usage breakdowns.

    Raises:
        ParamValueError: If `graph_id` is not a key of the details content, or
            `node_id` is not a valid non-negative index into the graph's
            breakdowns.
    """
    memory_details = self._get_file_content(device_type, FileType.DETAILS.value)
    if graph_id not in memory_details:
        logger.error('Invalid graph id: %s', graph_id)
        raise ParamValueError('Invalid graph id.')

    graph = memory_details[graph_id]
    # A negative node_id must be rejected explicitly: Python would otherwise
    # accept it and index from the end, returning another node's breakdowns.
    if not ('breakdowns' in graph and 0 <= node_id < len(graph['breakdowns'])):
        logger.error('Invalid node id: %s', node_id)
        raise ParamValueError('Invalid node id.')
    memory_breakdowns = graph.get('breakdowns')[node_id]

    return {'breakdowns': memory_breakdowns}
def _write_timeline_data_into_file(self, timeline_data):
    """
    Dump timeline rows to the intermediate timeline text file.

    The given list is sorted in place by start time (column 2) and then
    written as comma-separated lines below a column-title line.

    Args:
        timeline_data (list): The metadata to be written into the file.
            [
                ['op_name_1', 'stream_id_1', 'start_time_1', 'durarion_1'],
                ['op_name_2', 'stream_id_2', 'start_time_2', 'durarion_2'],
                [...]
            ]

    Raises:
        ProfilerIOException: If writing the file fails.
    """
    def _start_time(row):
        return float(row[2])

    # In-place sort: the caller's list is reordered as well.
    timeline_data.sort(key=_start_time)

    target_path = validate_and_normalize_path(
        os.path.join(
            self._output_path,
            'output_timeline_data_{}.txt'.format(self._device_id)),
        raise_key='Invalid file path of timeline data.')

    try:
        with open(target_path, 'w') as f_obj:
            f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
            for row in timeline_data:
                f_obj.write(','.join(map(str, row)) + '\n')
    except (IOError, OSError) as err:
        logger.error(
            'Error occurred when writing intermediate timeline file: %s', err)
        raise ProfilerIOException
def _get_minddata_pipeline_info(self):
    """
    Collect `op_id` and `num_workers` of every minddata pipeline operator.

    Returns:
        list[dict], one dict with the keys `op_id` and `num_workers` per entry
        of the `op_info` field in the pipeline json file.

    Raises:
        ProfilerFileNotFoundException: If the pipeline file does not exist.
        ProfilerRawFileException: If the pipeline file is not valid json.
    """
    pipeline_name = self._minddata_pipeline_display_filename.format(
        self._device_id)
    pipeline_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir, pipeline_name),
        raise_key="Invalid minddata_pipeline_info file path.")
    if not os.path.exists(pipeline_path):
        log.error('Did not find the minddata_pipeline file: %s', pipeline_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the minddata_pipeline file:{}'.format(
                pipeline_path))

    with open(pipeline_path, 'r', encoding='utf-8') as file:
        try:
            pipeline_content = json.load(file)
        except json.JSONDecodeError as err:
            log.exception(err)
            raise ProfilerRawFileException(
                "Fail to parse minddata pipeline file")

    return [
        {"op_id": op.get("op_id"), "num_workers": op.get("num_workers")}
        for op in pipeline_content.get("op_info")
    ]
def write_timeline_to_json_by_limitation(self):
    """
    Write `self._timeline_meta` as a json array, stopping at a size limit.

    Items are dumped one by one; after each item the size of the file on disk
    is compared with SIZE_LIMIT and writing stops once it is exceeded. The
    closing bracket is always written.

    Raises:
        ProfilerIOException: If writing the file fails.
    """
    target_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._display_filename.format(self._device_id)),
        raise_key='Invalid timeline display json path.'
    )

    last_index = len(self._timeline_meta) - 1
    try:
        with open(target_path, 'w') as json_file:
            json_file.write('[')
            for position, meta in enumerate(self._timeline_meta):
                json.dump(meta, json_file)
                over_limit = os.path.getsize(target_path) > SIZE_LIMIT
                if over_limit or position == last_index:
                    break
                # Only separate items when another one will follow.
                json_file.write(',')
            json_file.write(']')
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline display file: %s', err)
        raise ProfilerIOException
def get_timeline_summary(self):
    """
    Load the timeline summary used for UI display.

    Returns:
        json, the parsed content of the timeline summary file, or an empty
        dict when the file does not exist.

    Raises:
        ProfilerIOException: If the file cannot be read or parsed.
    """
    summary_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._timeline_summary_filename.format(self._device_id)),
        raise_key='Invalid timeline summary path.'
    )

    if not os.path.exists(summary_path):
        logger.info('No timeline summary file. Please check the output path.')
        return {}

    try:
        with open(summary_path, 'r') as f_obj:
            return json.load(f_obj)
    except (IOError, OSError, json.JSONDecodeError) as err:
        logger.error('Error occurred when read timeline summary file: %s', err)
        raise ProfilerIOException
def _get_host_device_rank_relation(self):
    """
    Get the relation between host, device_id and rank_id.

    The rank table json is read and flattened into `[server_id, device_id,
    rank_id]` triples; the server id of each triple is then replaced by the
    second element of the matching entry from
    `self._get_host_ips_mapping_info()`.

    Returns:
        list[list], `[host_mapping_id, device_id, rank_id]` for every device
        listed in the rank table.

    Raises:
        ProfilerFileNotFoundException: If the rank table file does not exist.
        json.JSONDecodeError: If the rank table file is not valid json.
    """
    rank_table_file_path = self._get_rank_table_file_path()
    if not os.path.exists(rank_table_file_path):
        log.error('Did not find rank table file under %s',
                  self._cluster_profiler_dir)
        raise ProfilerFileNotFoundException(
            msg='Did not find rank table file')

    with open(rank_table_file_path, 'r', encoding='utf-8') as file:
        try:
            relation_info = json.load(file)
        except json.JSONDecodeError as err:
            log.exception(err)
            # Swallowing the error would leave `relation_info` unbound and
            # fail one line later with an unrelated UnboundLocalError.
            raise

    host_device_rank_relation = [
        [
            server_info.get("server_id"),
            device_info.get("device_id"),
            device_info.get("rank_id"),
        ]
        for server_info in relation_info.get("server_list")
        for device_info in server_info.get("device")
    ]

    host_ips_mapping_info = self._get_host_ips_mapping_info()
    for item in host_device_rank_relation:
        # host_ip_index:0,host_mapping_id_index:1
        target_info = [i for i in host_ips_mapping_info if item[0] == i[0]]
        # target_info is like:[[host_ip, host_mapping_ip]]
        item[0] = target_info[0][1]

    return host_device_rank_relation
def _get_communication_info(self, host_ip, device_id, step_num):
    """
    Get the communication info of one step from the `hccl_raw_{device_id}.csv` file.

    Args:
        host_ip: Host directory name under `cluster_profiler`.
        device_id: Device id used to build the csv file name.
        step_num: Step to look up. The value 0 selects the row whose first
            element is '-', which the original note describes as the average.

    Returns:
        list, the first matching csv row with elements 1 and 2 converted to
        float and elements 3 and 4 parsed from json, or an empty list when no
        row matches.

    Raises:
        ProfilerFileNotFoundException: If the communication file does not exist.
    """
    file_name = 'hccl_raw_{}.csv'.format(device_id)
    communication_file_path = \
        os.path.join(self._cluster_profiler_dir, 'cluster_profiler', host_ip, 'profiler', file_name)
    communication_file_path = validate_and_normalize_path(
        communication_file_path, raise_key="Invalid communication file path.")
    if not os.path.exists(communication_file_path):
        log.error('Did not find the file: %s', communication_file_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the file:{}'.format(communication_file_path))

    communication_info = list()
    step_num = str(step_num)
    # When the step_num value is 0, it means the average value, which is
    # stored in the row whose first element is '-'.
    step_num = '-' if step_num == '0' else step_num
    with open(communication_file_path, 'r') as src_file:
        csv_reader = csv.reader(src_file)
        for row in csv_reader:
            # csv.reader yields an empty list for a blank line; indexing it
            # would raise IndexError, so such rows are skipped.
            if row and row[0] == step_num:
                communication_info = row
                break

    # Convert string to floating point and dictionary
    if communication_info:
        communication_info[1] = float(communication_info[1])
        communication_info[2] = float(communication_info[2])
        communication_info[3] = json.loads(communication_info[3])
        communication_info[4] = json.loads(communication_info[4])

    return communication_info
def _load_timeline_data(self):
    """
    Load the rows of the intermediate timeline text file.

    Returns:
        list[list[str]], every line that does not start with `op_name`, with
        the trailing newline removed and split on commas.

    Raises:
        ProfilerFileNotFoundException: If the timeline file does not exist.
        ProfilerIOException: If reading the file fails.
    """
    timeline_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        ),
        raise_key='Invalid timeline txt file path.'
    )
    if not os.path.exists(timeline_path):
        logger.error("Failed to find parsed timeline file.")
        raise ProfilerFileNotFoundException('parsed timeline file')

    try:
        with open(timeline_path, 'r') as f_obj:
            # Lines starting with 'op_name' are the column-title line.
            rows = [
                text.strip('\n').split(',')
                for text in f_obj
                if not text.startswith('op_name')
            ]
    except (IOError, OSError) as err:
        logger.error('Error occurred when read timeline intermediate file: %s', err)
        raise ProfilerIOException

    return rows
def _get_op_task_id_map(self):
    """
    Read the hwts output file and attach operator names to its task records.

    Every line starting with "Start of task" or "End of task" is split on
    whitespace and wrapped in an HWTSContainer. Containers whose `task_id` is
    a key of `self._op_task_info` get their `op_name` set from it and are
    returned; the others are dropped.

    Returns:
        list, the HWTSContainer objects with a known task id, in file order.

    Raises:
        ProfilerFileNotFoundException: If the hwts output file does not exist.
    """
    if not os.path.exists(self._hwts_output_file):
        logger.error('The hwts output file does not exist.')
        raise ProfilerFileNotFoundException('hwts output file')

    with open(self._hwts_output_file, 'r') as data_file:
        raw_lines = data_file.readlines()

    task_markers = ("Start of task", "End of task")
    containers = [
        HWTSContainer(text.split())
        for text in raw_lines
        if text.startswith(task_markers)
    ]

    # hwts op map by taskId
    matched = []
    for container in containers:
        if container.task_id in self._op_task_info.keys():
            container.op_name = self._op_task_info[container.task_id]
            matched.append(container)

    return matched
def get_memory_usage_breakdowns():
    """
    Get memory breakdowns of each node.

    Reads `dir`, `device_id`, `device_type`, `graph_id` and `node_id` from the
    request arguments and delegates to the `memory_usage` analyser.

    Returns:
        Response, the memory breakdowns for each node.

    Raises:
        ParamValueError: If `device_type` is not 'ascend'.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/memory-breakdowns
    """
    args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    device_id = args.get("device_id", default='0')
    # Called for validation only: the result is discarded and device_id is
    # passed on exactly as received.
    to_int(device_id, 'device_id')
    device_type = args.get("device_type", default='ascend')
    graph_id = args.get("graph_id", default='0')
    node_id = to_int(args.get("node_id", default='0'), 'node_id')

    if device_type != 'ascend':
        logger.error(
            "Invalid device_type, Memory Usage only supports Ascend for now.")
        raise ParamValueError("Invalid device_type.")

    analyser = AnalyserFactory.instance().get_analyser(
        'memory_usage', profiler_dir_abs, device_id)
    return analyser.get_memory_usage_breakdowns(device_type, graph_id, node_id)
def validate_and_normalize_profiler_path(summary_dir, summary_base_dir):
    """
    Validate and normalize profiler path.

    The summary directory is url-unquoted in strict mode, joined onto the base
    directory together with a trailing `profiler` component, and then passed
    through `validate_and_normalize_path`.

    Args:
        summary_dir (str): The relative path of summary directory.
        summary_base_dir (str): The summary base directory.

    Returns:
        str, normalized path of profiler directory.

    Raises:
        ProfilerParamValueErrorException: If `summary_dir` is empty, cannot be
            unquoted, or the resulting path fails validation.
    """
    if not summary_dir:
        raise ProfilerParamValueErrorException('The file dir does not exist.')

    try:
        decoded_dir = unquote(summary_dir, errors='strict')
    except UnicodeDecodeError:
        raise ProfilerParamValueErrorException('Unquote error with strict mode')

    joined_dir = os.path.join(summary_base_dir, decoded_dir, 'profiler')
    try:
        return validate_and_normalize_path(joined_dir, 'profiler')
    except ValidationError:
        log.error('profiler dir <%s> is invalid', joined_dir)
        raise ProfilerParamValueErrorException('Profiler dir is invalid.')
def _get_proc_details(self, proc_name, step_id=None, time_type='realtime'):
    """
    Get step trace info for selected step and save the result.

    The values are stored in `self._result['info']` under `proc_name`; nothing
    is returned.

    Args:
        proc_name (str): The selected field name.
        step_id (int): The selected step_id. If not given, it means all steps is required.
            If the value is 0, it means average info for all steps except the first is
            required. Default: None.
        time_type (str): The value type. `systime` keeps the original value.
            `realtime` transforms the value in millisecond. Default: `realtime`.

    Raises:
        ProfilerParamValueErrorException: If `proc_name` is None.
    """
    if proc_name is None:
        log.error('`proc_name` is required for query.')
        raise ProfilerParamValueErrorException(
            '`proc_name` is required for query.')

    # Without a step id every row but the last is used; otherwise the row at
    # index `step_id - 1` is used, so step_id 0 selects the last row.
    selected_rows = (
        self._data[:-1] if step_id is None else [self._data[step_id - 1]]
    )

    values = []
    for row in selected_rows:
        values.append(
            get_field_value(row, proc_name, self.__column__, time_type))
    self._result['info'] = {proc_name: values}
def get_timeline_summary(self):
    """
    Get timeline summary information for UI display.

    Returns:
        json, the content of timeline summary information, or an empty dict
        when `timeline_summary_{device_id}.json` is not present in the
        profiling directory.

    Raises:
        ProfilerIOException: If reading the summary file fails.
    """
    timeline_summary = {}
    summary_file_name = 'timeline_summary_{}.json'.format(self._device_id)
    if summary_file_name not in os.listdir(self._profiling_dir):
        # Nothing to load. Returning here avoids handing a None path to the
        # validator and to os.path.exists, which is what happened before.
        return timeline_summary

    file_path = os.path.join(self._profiling_dir, summary_file_name)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid timeline summary path.')

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                timeline_summary = json.load(f_obj)
        except (IOError, OSError) as err:
            logger.error(
                'Error occurred when read timeline summary file: %s', err)
            raise ProfilerIOException

    return timeline_summary
def get_display_timeline(self, device_type):
    """
    Load the timeline data shown in the UI for the given device type.

    Args:
        device_type (str): Either "ascend" or "gpu"; selects which display
            file name template is used.

    Returns:
        json, the parsed content of the timeline file, or an empty list when
        the file does not exist.

    Raises:
        ParamValueError: If `device_type` is neither "ascend" nor "gpu".
        ProfilerIOException: If the file cannot be read or parsed.
    """
    if device_type not in ("ascend", "gpu"):
        logger.info(
            'device type should be ascend or gpu. Please check the device type.'
        )
        raise ParamValueError("Invalid device_type.")

    if device_type == "ascend":
        name_template = self._ascend_display_filename
    else:
        name_template = self._gpu_display_filename

    timeline_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir,
                     name_template.format(self._device_id)),
        raise_key='Invalid timeline json path.')

    if not os.path.exists(timeline_path):
        logger.info('No timeline file. Please check the output path.')
        return []

    try:
        with open(timeline_path, 'r') as f_obj:
            return json.load(f_obj)
    except (IOError, OSError, json.JSONDecodeError) as err:
        logger.error(
            'Error occurred when read timeline display file: %s', err)
        raise ProfilerIOException
def get_min_cycle_counter_from_file(self):
    """
    Read the minimum cycle counter from its file.

    Returns:
        float, the value stored in the file; 0 when the file does not exist
        or its content is exactly the string 'inf'.

    Raises:
        ProfilerIOException: If reading the file fails.
    """
    counter_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._min_cycle_counter_file_path.format(self._device_id)),
        raise_key='Invalid min cycle counter file path.')

    if not os.path.exists(counter_path):
        logger.info("No min cycle counter recorded.")
        return 0

    try:
        with open(counter_path, 'r') as f_obj:
            raw_value = f_obj.read()
            if raw_value == 'inf':
                return 0
            return float(raw_value)
    except (IOError, OSError) as err:
        logger.error(
            'Error occurred when read minimum cycle counter: %s', err)
        raise ProfilerIOException
def get_display_timeline(self):
    """
    Load the timeline data shown in the UI.

    The display file is used when a file with exactly that name exists in the
    profiling directory; otherwise the full timeline file name is used.

    Returns:
        json, the parsed content of the chosen file, or an empty list when it
        does not exist.

    Raises:
        ProfilerIOException: If reading the file fails.
    """
    timeline_filename = self._timeline_filename.format(self._device_id)
    display_filename = self._display_filename.format(self._device_id)

    # Prefer the display json whenever the directory holds one.
    if display_filename in os.listdir(self._profiling_dir):
        chosen_name = display_filename
    else:
        chosen_name = timeline_filename

    timeline_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir, chosen_name),
        raise_key='Invalid timeline json path.')

    if not os.path.exists(timeline_path):
        logger.info('No timeline file. Please check the output path.')
        return []

    try:
        with open(timeline_path, 'r') as f_obj:
            return json.load(f_obj)
    except (IOError, OSError) as err:
        logger.error(
            'Error occurred when read timeline display file: %s', err)
        raise ProfilerIOException
def _get_total_step_num(self):
    """
    Get the total number of train steps.

    The profiler directory of the first host in `self._host_ips_dir` is
    scanned and the first regular (non-symlink) file whose name starts with
    `hccl_raw` is used.

    Returns:
        int, the number of lines of that file minus two (the column-name row
        and the average row), or 0 when no such file is found or the file has
        at most two lines.

    Raises:
        ProfilerDirNotFoundException: If the profiler directory does not exist.
    """
    total_step_num = 0
    # Take the data of one of the machines to get the total number of steps.
    host_ip_dir = self._host_ips_dir[0]
    target_dir_path = os.path.join(
        self._cluster_profiler_dir, 'cluster_profiler', host_ip_dir, 'profiler')
    target_dir_path = validate_and_normalize_path(
        target_dir_path, raise_key="Invalid profiler dir path.")
    if not os.path.exists(target_dir_path):
        log.error('Did not find cluster_profiler dir : %s', target_dir_path)
        raise ProfilerDirNotFoundException(
            msg='Did not find cluster_profiler dir:{}'.format(
                target_dir_path))

    # The loop leaves early, so the scandir iterator would otherwise stay open
    # until garbage collection; the context manager releases it promptly.
    with os.scandir(target_dir_path) as entries:
        for entry in entries:
            if entry.is_symlink():
                continue
            if entry.is_file() and entry.name.startswith('hccl_raw'):
                file_path = os.path.join(target_dir_path, entry.name)
                with open(file_path, 'r') as src_file:
                    lines = src_file.readlines()
                # The first row is col_name, the last row is the average.
                if len(lines) > 2:
                    total_step_num = len(lines) - 2
                break
    return total_step_num
def load_timeline_data(self):
    """
    Parse the intermediate timeline text file line by line.

    Every line that does not start with `op_name` is split on commas and
    handed to `self._parse_timeline_data` and `self._update_num_of_streams`.
    Afterwards the number of entries collected in the stream dict is stored
    in `self._timeline_summary['num_of_streams']`.

    Raises:
        ProfilerFileNotFoundException: If the timeline file does not exist.
        ProfilerIOException: If reading the file fails.
    """
    timeline_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)),
        raise_key='Invalid timeline txt file path.')
    if not os.path.exists(timeline_path):
        logger.error("Failed to find parsed timeline file.")
        raise ProfilerFileNotFoundException('parsed timeline file')

    streams_seen = {}
    try:
        with open(timeline_path, 'r') as f_obj:
            for text in f_obj:
                if text.startswith('op_name'):
                    # Column-title line, nothing to parse.
                    continue
                fields = text.strip('\n').split(',')
                self._parse_timeline_data(fields)
                self._update_num_of_streams(fields, streams_seen)
    except (IOError, OSError) as err:
        logger.error(
            'Error occurred when read timeline intermediate file: %s', err)
        raise ProfilerIOException

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] = len(streams_seen)
def _get_minddata_queue_step_time_info(self):
    """
    Collect step number and sampling time from the device queue file.

    Each line is split on whitespace. Per the original notes, a record looks
    like ['1', '64', '8', '2', '85406783'], where a leading '0' marks time
    information and a leading '1' marks queue information (queue capacity,
    step_num, queue size, sampling time). Only the '1' records are used.

    Returns:
        list[list[str]], `[step_num, sampling_time]` (fields 2 and 4) of every
        queue record.

    Raises:
        ProfilerFileNotFoundException: If the device queue file does not exist.
    """
    analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
    queue_path = validate_and_normalize_path(
        analyser.get_device_queue_file_path(),
        raise_key="Invalid device_queue file path")
    if not os.path.exists(queue_path):
        log.error('Did not find the device queue file: %s', queue_path)
        raise ProfilerFileNotFoundException(
            msg='Did not find the device queue file.')

    with open(queue_path) as data_file:
        records = [text.split() for text in data_file.readlines()]

    # Blank lines split into an empty list and are skipped by the truth test.
    return [
        [record[2], record[4]]
        for record in records
        if record and record[0] == "1"
    ]
def _get_file_path(self, device_type, file_type):
    """
    Get memory usage summary file.

    Args:
        device_type (str): Device type, e.g., GPU, Ascend.
        file_type (str): memory usage file type, e.g., summary, details.

    Returns:
        str, file path of memory usage file corresponding to its file_type.
        For an unrecognised `file_type` the file name stays empty, so the
        joined path is built from the profiling directory alone.

    Raises:
        ParamValueError: If `device_type` is not "ascend".
    """
    filename = ""
    if device_type == "ascend":
        # Compare by value: `is` only matches when the caller passes the very
        # same string object, which equal strings are not guaranteed to be.
        if file_type == FileType.SUMMARY.value:
            filename = self._summary_filename.format(self._device_id)
        elif file_type == FileType.DETAILS.value:
            filename = self._details_filename.format(self._device_id)
    else:
        logger.error('Memory Usage only supports Ascend for now. Please check the device type.')
        raise ParamValueError("Invalid device type.")

    file_path = os.path.join(self._profiling_dir, filename)
    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid memory usage file path.'
    )

    return file_path
def validate_group_condition(search_condition):
    """
    Check the `group_condition` entry of a search condition.

    `limit`, when present, must be a non-bool int in [1, 100]; `offset`, when
    present, must be a non-bool int in [0, 1000000].

    Args:
        search_condition (dict): The search condition.

    Raises:
        ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
    """
    group_condition = search_condition.get("group_condition")
    if not isinstance(group_condition, dict):
        raise ProfilerGroupConditionException("The group condition must be dict.")

    def _is_plain_int(candidate):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(candidate, int) and not isinstance(candidate, bool)

    if "limit" in group_condition:
        limit = group_condition["limit"]
        if not _is_plain_int(limit):
            log.error("The limit must be int.")
            raise ProfilerGroupConditionException("The limit must be int.")
        if not 1 <= limit <= 100:
            raise ProfilerGroupConditionException("The limit must in [1, 100].")

    if "offset" in group_condition:
        offset = group_condition["offset"]
        if not _is_plain_int(offset):
            log.error("The offset must be int.")
            raise ProfilerGroupConditionException("The offset must be int.")
        if offset < 0:
            raise ProfilerGroupConditionException("The offset must ge 0.")
        if offset > 1000000:
            raise ProfilerGroupConditionException("The offset must le 1000000.")
def validate_minddata_pipeline_condition(condition):
    """
    Verify the minddata pipeline search condition is valid or not.

    Args:
        condition (dict): The minddata pipeline search condition.

    Raises:
        ProfilerParamTypeErrorException: If the type of the search condition is invalid.
        ProfilerDeviceIdException: If the device_id param in the search condition is invalid.
        ProfilerGroupConditionException: If the group_condition param in the search condition is invalid.
        ProfilerSortConditionException: If the sort_condition param in the search condition is invalid.
        ProfilerFilterConditionException: If the filter_condition param in the search condition is invalid.
    """
    if not isinstance(condition, dict):
        log.error("Invalid condition type, it should be dict.")
        raise ProfilerParamTypeErrorException(
            "Invalid condition type, it should be dict."
        )

    if "device_id" in condition:
        device_id = condition.get("device_id")
        if not isinstance(device_id, str):
            raise ProfilerDeviceIdException(
                "Invalid device_id type, it should be str."
            )

    if "group_condition" in condition:
        validate_group_condition(condition)

    if "sort_condition" in condition:
        validate_sort_condition(condition, MINDDATA_PIPELINE_COL)

    if "filter_condition" in condition:
        filter_condition = condition.get('filter_condition')
        if not isinstance(filter_condition, dict):
            raise ProfilerFilterConditionException(
                "The filter condition must be dict."
            )
        for key, value in filter_condition.items():
            if key == 'op_id':
                validate_op_filter_condition(
                    value, value_type=int, value_type_msg='int'
                )
            elif key == 'op_type':
                validate_op_filter_condition(value)
            elif key == 'is_display_op_detail':
                # The flag's value must be checked; the key is the string
                # 'is_display_op_detail' here and can never be a bool.
                if not isinstance(value, bool):
                    raise ProfilerFilterConditionException(
                        "The condition must be bool."
                    )
            else:
                raise ProfilerFilterConditionException(
                    "The key {} of filter_condition is not support.".format(key)
                )
def validate_condition(search_condition):
    """
    Check every supported part of an operator search condition.

    `op_type` is mandatory and selects the column set that the sort condition
    is validated against.

    Args:
        search_condition (dict): The search condition.

    Raises:
        ProfilerParamTypeErrorException: If the type of the param in search_condition is invalid.
        ProfilerDeviceIdException: If the device_id param in search_condition is invalid.
        ProfilerOpTypeException: If the op_type param in search_condition is invalid.
        ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
        ProfilerSortConditionException: If the sort_condition param in search_condition is invalid.
        ProfilerFilterConditionException: If the filter_condition param in search_condition is invalid.
    """
    if not isinstance(search_condition, dict):
        log.error("Invalid search_condition type, it should be dict.")
        raise ProfilerParamTypeErrorException(
            "Invalid search_condition type, it should be dict.")

    if "device_id" in search_condition:
        if not isinstance(search_condition.get("device_id"), str):
            raise ProfilerDeviceIdException(
                "Invalid device_id type, it should be str.")

    op_type_error = (
        "The op_type must in ['aicpu_type','aicpu_detail', 'aicore_type', 'aicore_detail', "
        "'gpu_op_type', 'gpu_op_info', 'gpu_cuda_activity']")
    if "op_type" not in search_condition:
        raise ProfilerOpTypeException(op_type_error)

    # Ordered (op_type, columns) pairs; compared with == so that any value,
    # hashable or not, is handled the same way as by an if/elif chain.
    scope_table = (
        ("aicpu_type", AICPU_TYPE_COL),
        ("aicpu_detail", AICPU_DETAIL_COL),
        ("aicore_type", AICORE_TYPE_COL),
        ("aicore_detail", AICORE_DETAIL_COL),
        ("gpu_op_type", GPU_TYPE_COL),
        ("gpu_op_info", GPU_DETAIL_COL),
        ("gpu_cuda_activity", GPU_ACTIVITY_COL),
    )
    op_type = search_condition.get("op_type")
    for known_type, columns in scope_table:
        if op_type == known_type:
            search_scope = columns
            break
    else:
        raise ProfilerOpTypeException(op_type_error)

    if "group_condition" in search_condition:
        validate_group_condition(search_condition)
    if "sort_condition" in search_condition:
        validate_sort_condition(search_condition, search_scope)
    if "filter_condition" in search_condition:
        validate_filter_condition(search_condition)
def _validate_str_param(proc_name, accept_param, error_name=''):
    """
    Accept `proc_name` when it is None or a str contained in `accept_param`.

    Args:
        proc_name: The value to check.
        accept_param: Container of acceptable values.
        error_name (str): Name used in the log and error message. Default: ''.

    Raises:
        ProfilerParamValueErrorException: If `proc_name` is not acceptable.
    """
    if proc_name is None:
        return
    if isinstance(proc_name, str) and proc_name in accept_param:
        return
    log.error("Invalid param %s in request. Acceptable value is %s.",
              error_name, accept_param)
    raise ProfilerParamValueErrorException(f"Invalid {error_name}.")
def _validate_step_id(self, step_id):
    """
    Accept `step_id` when it is None or an int within [0, self._size].

    Args:
        step_id: The step id to check.

    Raises:
        StepNumNotSupportedException: If `step_id` is not acceptable.
    """
    if step_id is None:
        return
    if isinstance(step_id, int) and 0 <= step_id <= self._size:
        return
    log.error("Invalid step_id in request. step_id should be in [0, %d].",
              self._size)
    raise StepNumNotSupportedException([0, self._size])
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
             optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
    """
    Set up profiling directories, environment variables and options.

    Args:
        subgraph: Checked with `check_subgraph` and stored. Default: 'all'.
        is_detail: Checked with `check_bool` and stored. Default: True.
        is_show_op_path: Checked with `check_bool` and stored. Default: False.
        output_path (str): Base output directory; a `profiler` sub-directory
            is created below it. Default: './data'.
        optypes_to_deal (str): Comma-separated operator types. Default: ''.
        optypes_not_deal (str): Comma-separated operator types.
            Default: 'Variable'.
        job_id (str): Stored as the profiling job id. Default: "".

    Raises:
        RuntimeError: If the device target reported by mindspore is neither
            "Davinci" nor "Ascend".
    """
    # get device_id and device_target
    # Both start empty so that the fallbacks below still work when mindspore
    # cannot be imported or the context lookup fails.
    dev_id = ""
    device_target = ""
    try:
        import mindspore.context as context
        dev_id = str(context.get_context("device_id"))
        device_target = context.get_context("device_target")
    except ImportError:
        logger.error("Profiling: fail to import context from mindspore.")
    except ValueError as err:
        # Python 3 exceptions have no `.message`; log the exception itself.
        logger.error("Profiling: fail to get context, %s", err)

    if not dev_id:
        dev_id = os.getenv('DEVICE_ID')
    if not dev_id:
        dev_id = "0"
        logger.error("Fail to get DEVICE_ID, use 0 instead.")

    if device_target and device_target != "Davinci" \
            and device_target != "Ascend":
        msg = ("Profiling: unsupport backend: %s" \
               % device_target)
        raise RuntimeError(msg)

    self._dev_id = dev_id
    self._container_path = os.path.join(self._base_profiling_container_path, dev_id)
    data_path = os.path.join(self._container_path, "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    self._output_path = validate_and_normalize_path(output_path,
                                                    'Profiler output path (' + output_path + ')')
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path)

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
    # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
    try:
        import mindspore.context as context
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
    except ImportError:
        logger.error("Profiling: fail to import context from mindspore.")
    except ValueError as err:
        logger.error("Profiling: fail to set context, %s", err)

    os.environ['AICPU_PROFILING_MODE'] = 'true'
    os.environ['PROFILING_DIR'] = str(self._container_path)
    self._subgraph = check_subgraph(subgraph)
    self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
    self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
    self._detail = check_bool(is_detail, 'is_detail')
    self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
    self._profiling_job_id = job_id
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: profiling start time: %d", self._start_time)
def _get_file_content(file_path):
    """
    Parse the json file at `file_path`.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        The parsed json content.

    Raises:
        ProfilerIOException: If the file cannot be read or parsed.
    """
    try:
        with open(file_path, 'r') as f_obj:
            return json.load(f_obj)
    except (IOError, OSError, json.JSONDecodeError) as err:
        log.error('Error occurred when read flops file: %s', err)
        raise ProfilerIOException()
def _load(self):
    """
    Load the parsed step trace time csv file into memory.

    The first csv row is stored in `self.__column__` and the remaining rows
    in `self._data`; `self._size` is the number of those rows minus one.
    `self._display_col_names` receives a copy of `self._col_names`, and
    `self._load_point_info()` is called last.

    Raises:
        ProfilerFileNotFoundException: If no trace time file is found.
    """
    trace_file = query_latest_trace_time_file(self._profiling_dir, self._device_id)
    if not trace_file:
        log.error("Failed to find parsed trace time file.")
        raise ProfilerFileNotFoundException('parsed step trace time file')

    with open(trace_file, 'r') as handle:
        reader = csv.reader(handle)
        # Header first, then everything that is left in the reader.
        self.__column__ = next(reader)
        self._data = list(reader)

    self._size = len(self._data) - 1
    self._display_col_names = list(self._col_names)
    self._load_point_info()