def get_profile_summary_proposal():
    """
    Get the summary profiling proposal for a train job.

    Returns:
        str, the summary profiling proposal serialized as JSON.

    Raises:
        ParamValueError: If the parameters contain some errors.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/summary/propose
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    device_id = get_device_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')

    profiler_dir_abs = os.path.join(
        settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(
            profiler_dir_abs, "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir")

    # Only the iteration-interval part of the step trace feeds the proposal.
    iter_interval_condition = {
        "filter_condition": {
            "mode": "proc",
            "proc_name": "iteration_interval",
            "step_id": 0
        }
    }
    proposal_options = {'step_trace': {"iter_interval": iter_interval_condition}}
    proposal_types = ['step_trace', 'minddata', 'minddata_pipeline', 'common']
    composer = ComposeProposal(profiler_dir_abs, device_id, proposal_types)
    proposal_info = composer.get_proposal(proposal_options)
    # Use json.dumps for orderly return
    return CustomResponse(json.dumps(proposal_info), mimetype='application/json')
def get_profile_op_info():
    """
    Get operation profiling info.

    Returns:
        str, the operation profiling information.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/ops/search
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    # An empty request body is treated as an empty search condition.
    raw_body = request.stream.read()
    try:
        search_condition = json.loads(raw_body if raw_body else "{}")
    except (json.JSONDecodeError, ValueError):
        raise ParamValueError("Json data parse failed.")
    validate_condition(search_condition)

    device_id = search_condition.get("device_id", "0")
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')
    profiler_dir_abs = os.path.join(
        settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(
            profiler_dir_abs, "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir")

    # The op_type from the condition selects which analyser handles the query.
    op_type = search_condition.get("op_type")
    analyser = AnalyserFactory.instance().get_analyser(
        op_type, profiler_dir_abs, device_id)
    return jsonify(analyser.query(search_condition))
def get_minddata_pipeline_op_queue_info():
    """
    Get minddata pipeline operator info and queue info.

    Returns:
        str, the operation information and queue information.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/minddata-pipeline/op-queue
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(
        settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(
            profiler_dir_abs, "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir.")
    check_train_job_and_profiler_dir(profiler_dir_abs)

    condition = request.stream.read()
    try:
        condition = json.loads(condition) if condition else {}
    # BUG FIX: was a broad `except Exception`, which masked unrelated errors
    # (e.g. programming mistakes) as a JSON parse failure. Catch only the
    # JSON decoding errors, matching the sibling handlers in this file.
    except (json.JSONDecodeError, ValueError):
        raise ParamValueError("Json data parse failed.")

    validate_minddata_pipeline_condition(condition)
    device_id = condition.get("device_id", "0")
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')
    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_pipeline', profiler_dir_abs, device_id)
    op_info = analyser.query(condition)
    return jsonify(op_info)
def get_training_trace_graph():
    """
    Get training trace info of one step.

    Returns:
        Response, the training trace info of one step.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/graph
    """
    summary_dir = request.args.get("dir")
    profiler_dir_abs = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)
    graph_type = to_int(request.args.get("type", default='0'), 'graph_type')
    device_id = request.args.get("device_id", default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')

    graph_info = {}
    try:
        analyser = AnalyserFactory.instance().get_analyser(
            'step_trace', profiler_dir_abs, device_id)
    except ProfilerFileNotFoundException:
        # No step trace files: respond with an empty result instead of failing.
        return jsonify(graph_info)

    step_filter = {'filter_condition': {'mode': 'step', 'step_id': graph_type}}
    graph_info = analyser.query(step_filter)
    graph_info['summary'] = analyser.summary
    graph_info['point_info'] = analyser.point_info(graph_type)
    graph_info['is_heterogeneous'] = False

    # In heterogeneous training scene, do not display step trace data.
    cpu_op_type_file_name = f"cpu_op_type_info_{device_id}.csv"
    if cpu_op_type_file_name in os.listdir(profiler_dir_abs):
        graph_info = {'is_heterogeneous': True}
    return jsonify(graph_info)
def get_minddata_cpu_utilization_info():
    """
    Get minddata cpu utilization info.

    Returns:
        str, the minddata cpu utilization info.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>>POST http://xxx/v1/mindinsight/profile/minddata-cpu-utilization-summary
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(
        settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(
            profiler_dir_abs, "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir.")
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # An empty request body is treated as an empty condition.
    raw_body = request.stream.read()
    try:
        condition = json.loads(raw_body) if raw_body else {}
    except (json.JSONDecodeError, ValueError):
        raise ParamValueError("Json data parse failed.")

    device_id = condition.get("device_id", "0")
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')
    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_cpu_utilization', profiler_dir_abs, device_id)
    cpu_utilization = analyser.query(condition)
    return jsonify(cpu_utilization)
def get_minddata_pipeline_queue_info():
    """
    Get the special minddata pipeline queue info.

    Returns:
        str, the queue information.

    Raises:
        ParamValueError: If the search condition contains some errors.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/minddata-pipeline/queue
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(
        settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(
            profiler_dir_abs, "profiler")
    except ValidationError:
        raise ParamValueError("Invalid profiler dir.")
    check_train_job_and_profiler_dir(profiler_dir_abs)

    device_id = request.args.get('device_id', default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')
    # op_id is parsed as int by Flask; None means missing or non-numeric.
    op_id = request.args.get('op_id', type=int)
    if op_id is None:
        raise ParamValueError(
            "Invalid operator id or operator id does not exist.")

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_pipeline', profiler_dir_abs, device_id)
    return jsonify(analyser.get_op_and_parent_op_info(op_id))
def _get_train_tensors(self, train_id, tags, step, dims, detail):
    """
    Get tensor data for given train_id, tags, step, dims and detail.

    Args:
        train_id (str): Specify list of train job ID.
        tags (list): Specify list of tag.
        step (int): Specify step of tensor, required when detail is 'data'.
        dims (str): Specify dims of tensor, required when detail is 'data'.
        detail (str): Which data to query; one of 'stats', 'histogram'
            and 'data'. None behaves like 'stats'.

    Returns:
        list[dict], one dictionary per tag containing the `train_id`,
            `tag` and `values`.

    Raises:
        TensorNotExistError: If no tensor exists for the given train_id
            and tag combination.
        ParamValueError: If detail is not one of the supported values.
    """
    tensors_response = []
    for tag in tags:
        try:
            tensors = self._data_manager.list_tensors(train_id, tag)
        except ParamValueError as err:
            raise TensorNotExistError(err.message)

        if tensors and not isinstance(tensors[0].value, TensorContainer):
            raise TensorNotExistError(
                "there is no tensor data in this tag: {}".format(tag))

        if detail is None or detail == 'stats':
            tag_values = self._get_tensors_summary(detail, tensors)
        elif detail == 'data':
            Validation.check_param_empty(step=step, dims=dims)
            # Limit to query max two dimensions for tensor in table view.
            parsed_dims = TensorUtils.parse_shape(
                dims, limit=MAX_DIMENSIONS_FOR_TENSOR)
            tag_values = self._get_tensors_data(
                to_int(step, "step"), parsed_dims, tensors)
        elif detail == 'histogram':
            tag_values = self._get_tensors_histogram(tensors)
        else:
            raise ParamValueError(
                'Can not support this value: {} of detail.'.format(detail))

        tensors_response.append(
            {"train_id": train_id, "tag": tag, "values": tag_values})
    return tensors_response
def get_flops_summary():
    """
    Get flops summary info.

    Returns:
        Response, the flops summary info.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/flops-summary
    """
    train_id = request.args.get("train_id")
    profiler_dir_abs = validate_and_normalize_profiler_path(
        train_id, settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)
    device_id = request.args.get("device_id", default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    to_int(device_id, 'device_id')

    flops_analyser = AnalyserFactory.instance().get_analyser(
        'flops', profiler_dir_abs, device_id)
    return flops_analyser.get_flops_summary()
def check_offset(cls, offset, default_value=0):
    """
    Check offset parameter, it must be greater or equal 0.

    Args:
        offset (Union[str, int]): Value can be string number or int.
        default_value (int): Default value for checked offset. Default: 0.

    Returns:
        int, offset.

    Raises:
        ParamValueError: If offset is negative or not an integer.
    """
    if offset is None:
        return default_value
    checked_offset = to_int(offset, 'offset')
    if checked_offset < 0:
        raise ParamValueError("'offset' should be greater than or equal to 0.")
    return checked_offset
def _get_info_dict_from_row_data(self, row_info, time_type):
    """
    Get step info in dict format.

    Args:
        row_info (list[str]): Step info, the values correspond to
            `__column__` positionally.
        time_type (str): The value type. `systime` keeps the original
            value; `realtime` converts the value to milliseconds.

    Returns:
        dict, step trace information keyed by the names in `__column__`.
    """
    info_dict = {}
    for column_name, raw_value in zip(self.__column__, row_info):
        # The step number itself is not part of the reported info.
        if column_name == 'step_num':
            continue
        int_value = to_int(raw_value, column_name)
        if time_type == 'realtime':
            info_dict[column_name] = to_millisecond(int_value)
        else:
            info_dict[column_name] = int_value
    return info_dict
def get_field_value(row_info, field_name, header, time_type='realtime'):
    """
    Extract basic info through row_info.

    Args:
        row_info (list): The list of data info in one row.
        field_name (str): The name in header.
        header (list[str]): The list of field names.
        time_type (str): The type of value, `realtime` or `systime`.
            Default: `realtime`.

    Returns:
        int, the field value; converted to milliseconds when
            time_type is `realtime`.
    """
    raw_value = row_info[header.index(field_name)]
    int_value = to_int(raw_value, field_name)
    return to_millisecond(int_value) if time_type == 'realtime' else int_value
def get_timeline_detail():
    """
    Get timeline detail.

    Returns:
        Response, the detail information of timeline.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    device_id = request.args.get("device_id", default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    _ = to_int(device_id, 'device_id')

    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir, device_id)
    return jsonify(timeline_analyser.get_display_timeline())
def get_timeline_summary():
    """
    Get timeline summary info.

    Returns:
        Response, the timeline summary info.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline-summary
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    device_id = request.args.get("device_id", default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    _ = to_int(device_id, 'device_id')

    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir, device_id)
    return timeline_analyser.get_timeline_summary()
def check_limit(cls, limit, min_value=1, max_value=1000, default_value=100):
    """
    Check limit parameter, it should between min_value and max_value.

    Args:
        limit (Union[str, int]): Value can be string number or int.
        min_value (int): Limit should greater or equal this value. Default: 1.
        max_value (int): Limit should less or equal this value. Default: 1000.
        default_value (int): Default value for limit. Default: 100.

    Returns:
        int, limit.

    Raises:
        ParamValueError: If limit is outside [min_value, max_value]
            or not an integer.
    """
    if limit is None:
        return default_value
    checked_limit = to_int(limit, 'limit')
    if not min_value <= checked_limit <= max_value:
        raise ParamValueError("'limit' should in [{}, {}].".format(min_value, max_value))
    return checked_limit
def validate_and_set_job_id_env(job_id_env):
    """
    Validate the job id and set it in environment.

    Args:
        job_id_env (str): The id that to be set in environment parameter `JOB_ID`.

    Returns:
        int, the valid job id env, or None when job_id_env is None.
    """
    if job_id_env is None:
        return job_id_env
    # get job_id_env in int type; raises if it is not a valid integer string
    valid_id = to_int(job_id_env, 'job_id_env')
    # check the range of valid_id before exporting it to the environment
    if valid_id and 255 < valid_id < sys.maxsize:
        os.environ['JOB_ID'] = job_id_env
    else:
        # BUG FIX: the two adjacent string literals used to concatenate to
        # "Usedefault job id env"; a trailing space restores the message.
        log.warning(
            "Invalid job_id_env %s. The value should be int and between 255 and %s. Use "
            "default job id env instead.", job_id_env, sys.maxsize)
    return valid_id
def get_timeline_summary():
    """
    Get timeline summary info.

    Returns:
        Response, the timeline summary info.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline-summary
    """
    summary_dir = request.args.get("dir")
    profiler_dir = validate_and_normalize_profiler_path(
        summary_dir, settings.SUMMARY_BASE_DIR)
    # Reject requests for train jobs that have no profiler directory on disk.
    if not os.path.exists(profiler_dir):
        raise ProfilerDirNotFoundException(msg='The profiler dir is not found!')
    device_id = request.args.get("device_id", default='0')
    # Validate that device_id parses as an integer; the string form is kept.
    _ = to_int(device_id, 'device_id')

    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir, device_id)
    return timeline_analyser.get_timeline_summary()