def get_memory_usage_graphics():
    """
    Get graphic representation of memory usage.

    Reads ``dir``, ``device_id`` (default '0') and ``device_type``
    (default 'ascend') from the query string.

    Returns:
        Response, the graphic representation of memory usage, passed back
        exactly as the memory usage analyser produced it.

    Raises:
        ParamValueError: If device_type is anything other than 'ascend'.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/memory-graphics
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    device_type = query_args.get("device_type", default='ascend')
    if device_type != 'ascend':
        logger.info(
            "Invalid device_type, Memory Usage only supports Ascend for now.")
        raise ParamValueError("Invalid device_type.")

    memory_analyser = AnalyserFactory.instance().get_analyser(
        'memory_usage', profiler_dir_abs, device_id)
    return memory_analyser.get_memory_usage_graphics(device_type)
def get_memory_usage_breakdowns():
    """
    Get memory breakdowns of each node.

    Reads ``dir``, ``device_id``, ``device_type``, ``graph_id`` and ``node_id``
    from the query string; the id arguments default to '0' and device_type
    defaults to 'ascend'.

    Returns:
        Response, the memory breakdowns for each node, passed back exactly as
        the memory usage analyser produced them.

    Raises:
        ParamValueError: If device_type is anything other than 'ascend'.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/memory-breakdowns
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # device_id is only validated; node_id is actually converted to int.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')
    node_id = to_int(query_args.get("node_id", default='0'), 'node_id')

    graph_id = query_args.get("graph_id", default='0')
    device_type = query_args.get("device_type", default='ascend')
    if device_type != 'ascend':
        logger.error(
            "Invalid device_type, Memory Usage only supports Ascend for now.")
        raise ParamValueError("Invalid device_type.")

    memory_analyser = AnalyserFactory.instance().get_analyser(
        'memory_usage', profiler_dir_abs, device_id)
    return memory_analyser.get_memory_usage_breakdowns(device_type, graph_id, node_id)
def get_timeline_detail():
    """
    Get timeline detail.

    Reads ``dir``, ``device_id`` (default '0'), ``device_type``
    (default 'ascend') and ``scope_name_num`` (default '0') from the
    query string.

    Returns:
        Response, the detail information of timeline.

    Raises:
        ParamValueError: If device_type is neither 'gpu' nor 'ascend'.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    scope_name_num = query_args.get("scope_name_num", default='0')
    device_type = query_args.get("device_type", default='ascend')
    if device_type not in ('gpu', 'ascend'):
        logger.info(
            "Invalid device_type, device_type should be gpu or ascend.")
        raise ParamValueError("Invalid device_type.")

    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir_abs, device_id)
    return jsonify(timeline_analyser.get_display_timeline(device_type, scope_name_num))
def get_profile_device_list():
    """
    Get profile device list.

    The profiler directory is resolved as
    ``SUMMARY_BASE_DIR/<train_id>/<profiler_dir>`` and validated before the
    device list is read from it.

    Returns:
        list, the available device list.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, or the
            resolved profiler directory fails validation.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/devices
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    # Only the first element of the returned pair is exposed to the client.
    device_list, _ = analyse_device_list_from_profiler_dir(profiler_dir_abs)
    return jsonify(device_list)
def get_timeline_summary():
    """
    Get timeline summary info.

    Reads ``dir``, ``device_id`` (default '0') and ``device_type``
    (default 'ascend') from the query string.

    Returns:
        Response, the timeline summary info, passed back exactly as the
        timeline analyser produced it.

    Raises:
        ParamValueError: If device_type is neither 'gpu' nor 'ascend'.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/timeline-summary
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    device_type = query_args.get("device_type", default='ascend')
    if device_type not in ('gpu', 'ascend'):
        logger.info(
            "Invalid device_type, device_type should be gpu or ascend.")
        raise ParamValueError("Invalid device_type.")

    timeline_analyser = AnalyserFactory.instance().get_analyser(
        'timeline', profiler_dir_abs, device_id)
    return timeline_analyser.get_timeline_summary(device_type)
def get_time_info():
    """
    Get minddata operation info.

    The ``type`` request argument selects which analysis runs: 'get_next' or
    'device_queue'. Any other value yields an empty placeholder structure.

    Returns:
        Response, the minddata operation info.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/minddata_op
    """
    profiler_dir_abs = get_profiler_abs_dir(request)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original value is forwarded.
    device_id = unquote_args(request, "device_id")
    to_int(device_id, 'device_id')
    op_type = unquote_args(request, "type")

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata', profiler_dir_abs, device_id)

    # Each analysis returns a pair; only the second element is the time info.
    if op_type == "get_next":
        _, time_info = analyser.analyse_get_next_info(info_type="time")
        return jsonify(time_info)
    if op_type == "device_queue":
        _, time_info = analyser.analyse_device_queue_info(info_type="time")
        return jsonify(time_info)

    # Unrecognised type: answer with the empty default payload.
    empty_time_info = {
        'size': 0,
        'info': [],
        "summary": {
            "time_summary": {}
        },
        "advise": {}
    }
    return jsonify(empty_time_info)
def get_process_summary():
    """
    Get interval process summary.

    Collects the queue information of both the get_next and the device_queue
    analyses and combines them with MinddataAnalyser.analyse_queue_summary.

    Returns:
        Response, the process summary.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/process_summary
    """
    profiler_dir_abs = get_profiler_abs_dir(request)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original value is forwarded.
    device_id = unquote_args(request, "device_id")
    to_int(device_id, 'device_id')

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata', profiler_dir_abs, device_id)

    # Each analysis returns a pair; only the first element (queue info) is used.
    get_next_queue_info = analyser.analyse_get_next_info(info_type="queue")[0]
    device_queue_info = analyser.analyse_device_queue_info(info_type="queue")[0]

    summary = MinddataAnalyser.analyse_queue_summary(get_next_queue_info, device_queue_info)
    return jsonify(summary)
def get_queue_info():
    """
    Get each type queue info.

    The ``type`` request argument selects which queue is analysed: 'get_next'
    or 'device_queue'. Any other value yields an empty dict.

    Returns:
        Response, the queue info.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/queue_info
    """
    profiler_dir_abs = get_profiler_abs_dir(request)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original value is forwarded.
    device_id = unquote_args(request, "device_id")
    to_int(device_id, 'device_id')
    queue_type = unquote_args(request, "type")

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata', profiler_dir_abs, device_id)

    # Each analysis returns a pair; only the first element is the queue info.
    if queue_type == "get_next":
        queue_info, _ = analyser.analyse_get_next_info(info_type="queue")
        return jsonify(queue_info)
    if queue_type == "device_queue":
        queue_info, _ = analyser.analyse_device_queue_info(info_type="queue")
        return jsonify(queue_info)

    # Unrecognised type: answer with an empty payload.
    return jsonify({})
def get_target_time_info():
    """
    Get all the time information of the specified column.

    The column is named by the ``type`` query argument and is checked with
    validate_ui_proc before the step trace analyser is queried in 'proc' mode.

    Returns:
        Response, all the time information of the specified column, with the
        analyser summary attached under the 'summary' key.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/target-time-info
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    proc_name = query_args.get("type")
    validate_ui_proc(proc_name)

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    step_trace_analyser = AnalyserFactory.instance().get_analyser(
        'step_trace', profiler_dir_abs, device_id)

    proc_filter = {
        'filter_condition': {
            'mode': 'proc',
            'proc_name': proc_name
        }
    }
    target_time_info = step_trace_analyser.query(proc_filter)
    target_time_info['summary'] = step_trace_analyser.summary
    return jsonify(target_time_info)
def get_cluster_flops():
    """
    Get cluster FLOPs.

    The cluster profiler directory is resolved as
    ``SUMMARY_BASE_DIR/<train_id>`` and validated before the analyser is
    created.

    Returns:
        str, the cluster FLOPs.

    Raises:
        ParamValueError: If train_id is missing or the cluster profiler dir
            is invalid.

    Examples:
        >>>GET http://xxx/v1/mindinsight/profile/cluster-flops
    """
    train_id = get_train_id(request)
    if not train_id:
        raise ParamValueError('No train id.')

    cluster_profiler_dir = os.path.join(settings.SUMMARY_BASE_DIR, train_id)
    try:
        cluster_profiler_dir = validate_and_normalize_path(cluster_profiler_dir, 'cluster_profiler')
    except ValidationError as err:
        # Report an invalid directory as a parameter error, as documented
        # above, instead of letting the validator's exception escape.
        raise ParamValueError("Invalid cluster profiler dir.") from err

    check_train_job_and_profiler_dir(cluster_profiler_dir)

    analyser = AnalyserFactory.instance().get_analyser('cluster_flops', cluster_profiler_dir)
    return jsonify(analyser.get_flops())
def get_training_trace_graph():
    """
    Get training trace info of one step.

    The step is selected by the ``type`` query argument (default '0'), which
    is converted to int and used as the step id of the query.

    Returns:
        Response, the training trace info of one step. An empty object is
        returned when the step trace analyser cannot be created because the
        profiler file is not found.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/graph
    """
    # NOTE(review): a second function with this same name is defined later in
    # this file; if both live in one module the later one rebinds the name -
    # confirm which definition is intended to be served.
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    step_id = to_int(query_args.get("type", default='0'), 'graph_type')

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    try:
        step_trace_analyser = AnalyserFactory.instance().get_analyser(
            'step_trace', profiler_dir_abs, device_id)
    except ProfilerFileNotFoundException:
        return jsonify({})

    step_filter = {
        'filter_condition': {
            'mode': 'step',
            'step_id': step_id
        }
    }
    graph_info = step_trace_analyser.query(step_filter)
    graph_info['summary'] = step_trace_analyser.summary
    graph_info['point_info'] = step_trace_analyser.point_info
    return jsonify(graph_info)
def get_profile_summary_proposal():
    """
    Get summary profiling proposal.

    Builds a ComposeProposal over the 'step_trace', 'minddata',
    'minddata_pipeline' and 'common' proposal types for the requested
    profiler directory and device.

    Returns:
        str, the summary profiling proposal.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, or the
            resolved profiler directory fails validation.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/summary/propose
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    device_id = get_device_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")
    # to_int is used for validation only; the original value is forwarded.
    to_int(device_id, 'device_id')

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    step_trace_condition = {
        "filter_condition": {
            "mode": "proc",
            "proc_name": "iteration_interval",
            "step_id": 0
        }
    }
    options = {'step_trace': {"iter_interval": step_trace_condition}}

    proposal_type_list = [
        'step_trace', 'minddata', 'minddata_pipeline', 'common'
    ]
    proposal_obj = ComposeProposal(profiler_dir_abs, device_id, proposal_type_list)
    proposal_info = proposal_obj.get_proposal(options)

    # Use json.dumps for orderly return
    return CustomResponse(json.dumps(proposal_info), mimetype='application/json')
def get_profile_op_info():
    """
    Get operation profiling info.

    The request body carries the search condition as JSON; an empty body is
    treated as ``{}``. The condition's ``op_type`` selects the analyser and
    the whole condition is handed to its query.

    Returns:
        str, the operation profiling information.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, the body is
            not valid JSON, or the resolved profiler directory fails
            validation.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/ops/search
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    raw_condition = request.stream.read()
    try:
        search_condition = json.loads(raw_condition if raw_condition else "{}")
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass, so this covers both.
        raise ParamValueError("Json data parse failed.") from err
    validate_condition(search_condition)

    # to_int is used for validation only; the original value is forwarded.
    device_id = search_condition.get("device_id", "0")
    to_int(device_id, 'device_id')

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    op_type = search_condition.get("op_type")
    analyser = AnalyserFactory.instance().get_analyser(op_type, profiler_dir_abs, device_id)
    op_info = analyser.query(search_condition)
    return jsonify(op_info)
def get_minddata_pipeline_op_queue_info():
    """
    Get minddata pipeline operator info and queue info.

    The request body carries the query condition as JSON; an empty body is
    treated as ``{}``.

    Returns:
        str, the operation information and queue information.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, the resolved
            profiler directory fails validation, or the body cannot be
            decoded as JSON.

    Examples:
        >>> POST http://xxxx/v1/mindinsight/profile/minddata-pipeline/op-queue
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir.") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    raw_condition = request.stream.read()
    try:
        condition = json.loads(raw_condition) if raw_condition else {}
    except Exception as err:
        # Any failure while decoding the body is reported as a parameter error.
        raise ParamValueError("Json data parse failed.") from err
    validate_minddata_pipeline_condition(condition)

    # to_int is used for validation only; the original value is forwarded.
    device_id = condition.get("device_id", "0")
    to_int(device_id, 'device_id')

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_pipeline', profiler_dir_abs, device_id)
    op_info = analyser.query(condition)
    return jsonify(op_info)
def get_training_trace_graph():
    """
    Get training trace info of one step.

    The step is selected by the ``type`` query argument (default '0'), which
    is converted to int and used as the step id of the query. When a
    ``cpu_op_type_info_<device_id>.csv`` entry exists in the profiler
    directory, the step trace data is replaced by a heterogeneous marker.

    Returns:
        Response, the training trace info of one step. An empty object is
        returned when the step trace analyser cannot be created because the
        profiler file is not found.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/training-trace/graph
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("dir"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    step_id = to_int(query_args.get("type", default='0'), 'graph_type')

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    try:
        step_trace_analyser = AnalyserFactory.instance().get_analyser(
            'step_trace', profiler_dir_abs, device_id)
    except ProfilerFileNotFoundException:
        return jsonify({})

    step_filter = {
        'filter_condition': {
            'mode': 'step',
            'step_id': step_id
        }
    }
    graph_info = step_trace_analyser.query(step_filter)
    graph_info['summary'] = step_trace_analyser.summary
    graph_info['point_info'] = step_trace_analyser.point_info(step_id)
    graph_info['is_heterogeneous'] = False

    # In heterogeneous training scene, do not display step trace data.
    if f"cpu_op_type_info_{device_id}.csv" in os.listdir(profiler_dir_abs):
        graph_info = {'is_heterogeneous': True}

    return jsonify(graph_info)
def get_minddata_cpu_utilization_info():
    """
    Get minddata cpu utilization info.

    The request body carries the query condition as a JSON object; an empty
    body is treated as ``{}``.

    Returns:
        str, the minddata cpu utilization info.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, the resolved
            profiler directory fails validation, the body is not valid JSON,
            or the decoded body is not a JSON object.

    Examples:
        >>>POST http://xxx/v1/mindinsight/profile/minddata-cpu-utilization-summary
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir.") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    raw_condition = request.stream.read()
    try:
        condition = json.loads(raw_condition) if raw_condition else {}
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass, so this covers both.
        raise ParamValueError("Json data parse failed.") from err

    # A valid JSON body may still be a list, string or number; those have no
    # .get() and would otherwise fail with an AttributeError below.
    if not isinstance(condition, dict):
        raise ParamValueError("Search condition must be a json object.")

    # to_int is used for validation only; the original value is forwarded.
    device_id = condition.get("device_id", "0")
    to_int(device_id, 'device_id')

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_cpu_utilization', profiler_dir_abs, device_id)
    cpu_utilization = analyser.query(condition)
    return jsonify(cpu_utilization)
def get_minddata_pipeline_queue_info():
    """
    Get the special minddata pipeline queue info.

    Reads ``device_id`` (default '0') and ``op_id`` from the query string and
    returns the analyser's info for that operator and its parent.

    Returns:
        str, the queue information.

    Raises:
        ParamValueError: If profiler_dir or train_id is missing, the resolved
            profiler directory fails validation, or no integer op_id could be
            read from the request.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/minddata-pipeline/queue
    """
    profiler_dir = get_profiler_dir(request)
    train_id = get_train_id(request)
    if not profiler_dir or not train_id:
        raise ParamValueError("No profiler_dir or train_id.")

    profiler_dir_abs = os.path.join(settings.SUMMARY_BASE_DIR, train_id, profiler_dir)
    try:
        profiler_dir_abs = validate_and_normalize_path(profiler_dir_abs, "profiler")
    except ValidationError as err:
        # Keep the validation failure attached as the explicit cause.
        raise ParamValueError("Invalid profiler dir.") from err

    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original string is forwarded.
    device_id = request.args.get('device_id', default='0')
    to_int(device_id, 'device_id')

    # With type=int the lookup yields None when op_id is absent or is not
    # convertible to int.
    op_id = request.args.get('op_id', type=int)
    if op_id is None:
        raise ParamValueError(
            "Invalid operator id or operator id does not exist.")

    analyser = AnalyserFactory.instance().get_analyser(
        'minddata_pipeline', profiler_dir_abs, device_id)
    op_queue_info = analyser.get_op_and_parent_op_info(op_id)
    return jsonify(op_queue_info)
def get_flops_summary():
    """
    Get flops summary info.

    Reads ``train_id`` and ``device_id`` (default '0') from the query string;
    train_id is what gets resolved against SUMMARY_BASE_DIR.

    Returns:
        Response, the flops summary info, passed back exactly as the flops
        analyser produced it.

    Examples:
        >>> GET http://xxxx/v1/mindinsight/profile/flops-summary
    """
    query_args = request.args

    profiler_dir_abs = validate_and_normalize_profiler_path(
        query_args.get("train_id"), settings.SUMMARY_BASE_DIR)
    check_train_job_and_profiler_dir(profiler_dir_abs)

    # to_int is used for validation only; the original string is forwarded.
    device_id = query_args.get("device_id", default='0')
    to_int(device_id, 'device_id')

    flops_analyser = AnalyserFactory.instance().get_analyser(
        'flops', profiler_dir_abs, device_id)
    return flops_analyser.get_flops_summary()