def parse_logs(args):
    """Parse benchmark job log files and insert the results into the DB.

    Walks every index file under ``<args.log_path>/index``; the last line of
    each file is a JSON job record.  For every job this: registers the model,
    inserts a job row, parses the metrics (index 1 = speed, 3 = profiler,
    6 = max batch size) from the log lines, stores them via
    ``insert_results``/``bm.JobResultsLog``, collects per-index tables into
    ``html_results``, and finally builds the summary e-mail.

    NOTE(review): this function is shadowed by a later definition of the same
    name in this file; only the second definition is effective after import.

    :param args: parsed CLI namespace; reads log_path, paddle_version,
        implement_type, device_type, image_branch, image_commit_id,
        code_commit_id, cuda_version, cudnn_version.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    # html_results[index_name] -> {"header": ..., "data": [...]}; consumed by
    # template.EmailTemplate at the end.
    html_results = OrderedDict()
    for k in DICT_INDEX.values():
        html_results[k] = {}
        html_results[k]["header"] = TABLE_HEADER
        html_results[k]["data"] = []
    for job_file in file_list:
        result = 0
        # read-only access is sufficient ('r+' was unnecessary here)
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                # the last line of an index file is the JSON job summary
                job_info = json.loads(file_lines[-1])
            except Exception as exc:
                # skip unparsable files instead of aborting the whole run;
                # include the exception so the failure is diagnosable
                print("file {} parse error: {}".format(job_file, exc))
                continue
            # check model if exist in db
            get_or_insert_model(job_info["model_name"],
                                job_info["mission_name"],
                                job_info["direction_id"])
            # save job: 8-GPU multi-process runs map to a dedicated type
            if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
            else:
                run_machine_type = DICT_RUN_MACHINE_TYPE[str(
                    job_info["gpu_num"])]
            job_id = insert_job(image_id, run_machine_type, job_info,
                                args).job_id
            # parse job results
            cpu_utilization_result = 0
            gpu_utilization_result = 0
            unit = ''
            mem_result = 0
            try:
                if job_info["index"] == 1:  # speed job
                    result = job_info['FINAL_RESULT']
                    unit = job_info['UNIT']
                    for line in file_lines:
                        if 'AVG_CPU_USE' in line:
                            cpu_utilization_result = line.strip().split('=')[1]
                        if 'AVG_GPU_USE' in line:
                            gpu_utilization_result = line.strip().split('=')[1]
                        if "MAX_GPU_MEMORY_USE" in line:
                            value = line.strip().split("=")[1].strip()
                            mem_result = int(value) if value.isdigit() else 0
                elif job_info["index"] == 3:  # profiler job
                    result = json.dumps(job_info['FINAL_RESULT'])
                else:  # max-batch-size job
                    for line in file_lines:
                        if "MAX_BATCH_SIZE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if value.isdigit() else 0
                            break
                # save job results
                pjr = insert_results(job_id, job_info["model_name"],
                                     job_info["index"], result, unit, 1)
                log_file = job_info["log_file"].split("/")[-1]
                log_base = args.paddle_version + "/" + args.implement_type
                train_log_path = LOG_SERVER + os.path.join(
                    log_base, "train_log", log_file)
                log_save_dict = {"train_log_path": train_log_path}
                if job_info["index"] == 1:
                    # extra metric rows: 7 = avg CPU util, 8 = avg GPU util,
                    # 2 = peak GPU memory
                    insert_results(job_id, job_info["model_name"], 7,
                                   cpu_utilization_result, '%')
                    insert_results(job_id, job_info["model_name"], 8,
                                   gpu_utilization_result, '%')
                    pjr2 = insert_results(job_id, job_info["model_name"], 2,
                                          mem_result, 'MiB', 1)
                    bm.JobResultsLog.objects.create(
                        result_id=pjr2.result_id,
                        log_path=json.dumps(log_save_dict)).save()
                    # NOTE(review): nesting reconstructed from a collapsed
                    # source — confirm the profiler block belongs under the
                    # index == 1 branch.
                    if int(job_info["gpu_num"]) == 1:
                        # single-GPU runs also publish profiler artifacts
                        profiler_log = job_info["log_with_profiler"].split(
                            "/")[-1]
                        profiler_path = job_info["profiler_path"].split(
                            "/")[-1]
                        profiler_log_path = LOG_SERVER + os.path.join(
                            log_base, "profiler_log", profiler_log)
                        profiler_path = LOG_SERVER + os.path.join(
                            log_base, "profiler_log", profiler_path)
                        log_save_dict["profiler_log_path"] = profiler_log_path
                        log_save_dict["profiler_path"] = profiler_path
                bm.JobResultsLog.objects.create(
                    result_id=pjr.result_id,
                    log_path=json.dumps(log_save_dict)).save()
            except Exception as pfe:
                # best-effort: one failed job must not stop the batch
                # (was a Python-2 `print pfe` statement)
                print(pfe)
            else:
                print(
                    "models: {}, run_machine_type: {}, index: {}, result: {}".
                    format(job_info["model_name"], run_machine_type,
                           job_info["index"], result))
                if job_info["index"] == 1:
                    # speed
                    check_results(
                        job_info["model_name"], job_info["index"],
                        run_machine_type, result, html_results,
                        -1 if args.device_type.lower() == 'cpu' else 1)
                    check_results(job_info["model_name"], 2, run_machine_type,
                                  mem_result, html_results, -1)  # mem
                elif job_info["index"] == 3:
                    # profiler
                    check_results(job_info["model_name"], job_info["index"],
                                  run_machine_type, json.loads(result),
                                  html_results, -1, "Framework_Total")
                    check_results(job_info["model_name"], job_info["index"],
                                  run_machine_type, json.loads(result),
                                  html_results, -1, "GpuMemcpy_Total")
                elif job_info["index"] == 6:
                    # max BS
                    check_results(job_info["model_name"], job_info["index"],
                                  run_machine_type, result, html_results, 1)
                else:
                    print("--------------> please set a correct index(1|3|6)!")
    # generate email file
    title = "frame_benchmark"
    env = dict(paddle_branch=args.image_branch,
               paddle_commit_id=args.image_commit_id,
               benchmark_commit_id=args.code_commit_id,
               device_type=args.device_type,
               implement_type=args.implement_type,
               docker_images=os.getenv('RUN_IMAGE_NAME'))
    if args.device_type.upper() in ("P40", "V100"):
        # CUDA/cuDNN versions are only meaningful for GPU devices
        env["cuda_version"] = args.cuda_version
        env["cudnn_version"] = args.cudnn_version
    email_t = template.EmailTemplate(title, env, html_results, args.log_path)
    email_t.construct_email_content()
def parse_logs(args):
    """Parse benchmark job log files and insert the results into the DB.

    Second (effective) version of ``parse_logs``: in addition to the DB
    inserts and the HTML e-mail tables, it records failed speed jobs in the
    module-level ``FAIL_LIST``, flags outliers, and pushes alarm content to
    icafe via ``to_icafe`` at the end.

    Per index file under ``<args.log_path>/index`` (last line = JSON job
    record): registers the model, inserts a job row, parses metrics
    (index 1 = speed, 3 = profiler, 6 = max batch size), runs
    ``check_results`` for alarming, then persists results and log paths.

    :param args: parsed CLI namespace; reads log_path, paddle_version,
        implement_type, device_type, image_branch, image_commit_id,
        code_commit_id, cuda_version, cudnn_version.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    html_results = OrderedDict()
    icafe_results = []  # [model, machine_type, status, details] rows for icafe
    for k in DICT_INDEX.values():
        html_results[k] = {}
        # profiler tables use their own header layout
        if k == 'Profiler_info':
            html_results[k]["header"] = TABLE_PROFILE_HEADER
        else:
            html_results[k]["header"] = TABLE_HEADER
        html_results[k]["data"] = []
    for job_file in file_list:
        result = 0
        # read-only access is sufficient ('r+' was unnecessary here)
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                # the last line of an index file is the JSON job summary
                job_info = json.loads(file_lines[-1])
            except Exception as exc:
                # skip unparsable files instead of aborting the whole run;
                # include the exception so the failure is diagnosable
                print("file {} parse error: {}".format(job_file, exc))
                continue
            # check model if exist in db
            get_or_insert_model(job_info["model_name"],
                                job_info["mission_name"],
                                job_info["direction_id"])
            # save job: 8-GPU multi-process runs map to a dedicated type
            if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
            else:
                run_machine_type = DICT_RUN_MACHINE_TYPE[str(
                    job_info["gpu_num"])]
            job_id = insert_job(image_id, run_machine_type, job_info,
                                args).job_id
            # parse job results
            cpu_utilization_result = 0
            gpu_utilization_result = 0
            unit = ''
            outlier = 0
            outlier_mem = 0
            mem_result = 0
            benchmark = 0
            benchmark_mem = 0
            fail_flag = 0  # defensive default; set from the record below
            if job_info["index"] == 1:  # speed job
                result = job_info['FINAL_RESULT']
                unit = job_info['UNIT']
                fail_flag = job_info['JOB_FAIL_FLAG']
                for line in file_lines:
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
                    if "MAX_GPU_MEMORY_USE" in line:
                        value = line.strip().split("=")[1].strip()
                        mem_result = int(value) if value.isdigit() else 0
            elif job_info["index"] == 3:  # profiler job
                result = json.dumps(job_info['FINAL_RESULT'])
            else:  # max-batch-size job
                for line in file_lines:
                    if "MAX_BATCH_SIZE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if value.isdigit() else 0
                        break
            print("models: {}, run_machine_type: {}, index: {}, result: {}".
                  format(job_info["model_name"], run_machine_type,
                         job_info["index"], result))
            # check_results and send alarm email
            if job_info["index"] == 1:
                # speed
                print_machine_type = machine_type_to_print(run_machine_type)
                # record fail jobs
                print('fail_flag:{}'.format(fail_flag))
                if float(result) == 0 or fail_flag == 1:
                    FAIL_LIST.append(
                        [job_info["model_name"], print_machine_type])
                    # a failed job is treated as an outlier for both speed
                    # and memory so it is excluded from trend checks
                    outlier = 1
                    outlier_mem = 1
                    icafe_results.append([
                        job_info["model_name"], print_machine_type, 'fail', []
                    ])
                benchmark = check_results(
                    job_info["model_name"],
                    job_info["index"],
                    run_machine_type,
                    result,
                    html_results,
                    -1 if args.device_type.lower() == 'cpu' else 1,
                    unit=unit,
                    outlier=outlier,
                    icafe_results=icafe_results)
                benchmark_mem = check_results(
                    job_info["model_name"],
                    2,
                    run_machine_type,
                    mem_result,
                    html_results,
                    -1,
                    outlier=outlier_mem,
                    icafe_results=icafe_results)  # mem
            elif job_info["index"] == 3:
                # profiler
                check_results(job_info["model_name"],
                              job_info["index"],
                              run_machine_type,
                              json.loads(result),
                              html_results,
                              -1,
                              "Framework_Total",
                              is_profile=True)
                check_results(job_info["model_name"],
                              job_info["index"],
                              run_machine_type,
                              json.loads(result),
                              html_results,
                              -1,
                              "GpuMemcpy_Total",
                              is_profile=True)
            elif job_info["index"] == 6:
                # max BS
                check_results(job_info["model_name"], job_info["index"],
                              run_machine_type, result, html_results, 1)
            else:
                print("--------------> please set a correct index(1|3|6)!")
            try:
                # save job results
                pjr = insert_results(job_id,
                                     job_info["model_name"],
                                     job_info["index"],
                                     result,
                                     unit,
                                     1,
                                     benchmark=benchmark,
                                     outlier=outlier)
                log_file = job_info["log_file"].split("/")[-1]
                log_base = args.paddle_version + "/" + args.implement_type
                train_log_path = LOG_SERVER + os.path.join(
                    log_base, "train_log", log_file)
                log_save_dict = {"train_log_path": train_log_path}
                if job_info["index"] == 1:
                    # extra metric rows: 7 = avg CPU util, 8 = avg GPU util,
                    # 2 = peak GPU memory
                    insert_results(job_id, job_info["model_name"], 7,
                                   cpu_utilization_result, '%')
                    insert_results(job_id, job_info["model_name"], 8,
                                   gpu_utilization_result, '%')
                    pjr2 = insert_results(job_id,
                                          job_info["model_name"],
                                          2,
                                          mem_result,
                                          'MiB',
                                          1,
                                          benchmark=benchmark_mem,
                                          outlier=outlier_mem)
                    bm.JobResultsLog.objects.create(
                        result_id=pjr2.result_id,
                        log_path=json.dumps(log_save_dict)).save()
                    # NOTE(review): nesting reconstructed from a collapsed
                    # source — confirm the profiler block belongs under the
                    # index == 1 branch.
                    if int(job_info["gpu_num"]) == 1:
                        # single-GPU runs also publish profiler artifacts
                        profiler_log = job_info["log_with_profiler"].split(
                            "/")[-1]
                        profiler_path = job_info["profiler_path"].split(
                            "/")[-1]
                        profiler_log_path = LOG_SERVER + os.path.join(
                            log_base, "profiler_log", profiler_log)
                        profiler_path = LOG_SERVER + os.path.join(
                            log_base, "profiler_log", profiler_path)
                        log_save_dict["profiler_log_path"] = profiler_log_path
                        log_save_dict["profiler_path"] = profiler_path
                bm.JobResultsLog.objects.create(
                    result_id=pjr.result_id,
                    log_path=json.dumps(log_save_dict)).save()
            except Exception as pfe:
                # best-effort: one failed job must not stop the batch
                # (was a Python-2 `print pfe` statement)
                print(pfe)
    # generate email file
    title = "frame_benchmark"
    env = dict(
        paddle_branch=args.image_branch,
        paddle_commit_id=args.image_commit_id,
        benchmark_commit_id=args.code_commit_id,
        device_type=args.device_type,
        implement_type=args.implement_type,
        docker_images=os.getenv('RUN_IMAGE_NAME'),
        paddle_version=args.paddle_version,
        HostName=os.getenv('HostName'), )
    if args.device_type.upper() in ("P40", "V100", "A100", "V100-32G",
                                    "V100-16G"):
        # CUDA/cuDNN versions are only meaningful for GPU devices
        env["cuda_version"] = args.cuda_version
        env["cudnn_version"] = args.cudnn_version
    email_t = template.EmailTemplate(title, env, html_results, args.log_path,
                                     FAIL_LIST)
    email_t.construct_email_content()
    print('icafe_results:{}'.format(icafe_results))
    # build icafe card
    item = to_icafe.get_alarm_content(icafe_results, env, TABLE_HEADER)
    to_icafe.write_icafe(item)