def parse_logs(args):
    """
    Parse benchmark log index files and insert the results into the DB.

    Each file under ``<log_path>/index`` carries a JSON job description on
    its last line; the earlier lines are scanned for ``KEY=VALUE`` result
    markers (``FINAL_RESULT``, ``MAX_GPU_MEMORY_USE``, ``MAX_BATCH_SIZE``,
    ``AVG_CPU_USE``, ``AVG_GPU_USE``).  Out-of-range results are collected
    into ``html_results`` and mailed out at the end.

    :param args: parsed CLI namespace; reads paddle_version,
        code_commit_id, job_type, cuda_version, cudnn_version,
        device_type, implement_type and log_path.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1': 'ONE_GPU',
        '4': 'FOUR_GPU',
        '8': 'MULTI_GPU',
        '8mp': 'MULTI_GPU_MULTI_PROCESS'
    }
    report_index_dict = {'speed': 1, 'mem': 2, 'maxbs': 6}
    html_results = []
    for job_file in file_list:
        cluster_job_id = uuid.uuid1()
        result = ""
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
        try:
            # The job description is the JSON document on the last line.
            job_info = json.loads(file_lines[-1])
        except Exception:
            print("file {} parse error".format(job_file))
            continue
        # save_job
        if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
            run_machine_type = dict_run_machine_type['8mp']
        else:
            run_machine_type = dict_run_machine_type[str(job_info["gpu_num"])]
        report_index = report_index_dict[job_info["index"]]
        pj = bm.Job()
        pj.job_name = "pb_{}_{}".format(args.paddle_version,
                                        job_info["model_name"])
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = "LocalJob"
        pj.model_name = job_info["model_name"]
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.device_type = args.device_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        job_id = pj.job_id
        # Raw logs are served over HTTP from this host.
        # todo config the log_server port
        log_server = socket.gethostname()
        log_server = "http://" + log_server + ":8777/"
        log_file = job_info["log_file"].split("/")[-1]
        profiler_log = job_info["log_with_profiler"].split("/")[-1]
        profiler_path = job_info["profiler_path"].split("/")[-1]
        train_log_path = log_server + os.path.join(
            os.path.basename(args.log_path), "train_log", log_file)
        profiler_log_path = log_server + os.path.join(
            os.path.basename(args.log_path), "profiler_log", profiler_log)
        profiler_path = log_server + os.path.join(
            os.path.basename(args.log_path), "profiler_log", profiler_path)
        cpu_utilization_result = 0
        gpu_utilization_result = 0
        try:
            if report_index == 2:  # memory
                for line in file_lines:
                    if "MAX_GPU_MEMORY_USE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        break
            elif report_index == 1:  # speed (+ CPU/GPU utilization)
                for line in file_lines:
                    if "FINAL_RESULT" in line:
                        result = line.strip().split("=")[1]
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
            else:  # max batch size
                for line in file_lines:
                    if "MAX_BATCH_SIZE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        break
            # save_result
            pjr = bm.JobResults()
            pjr.job_id = job_id
            pjr.model_name = job_info["model_name"]
            pjr.report_index_id = report_index
            pjr.report_result = result
            # NOTE(review): literal 1 looks like a placeholder -- the later
            # revision of this function stores train_log_path here; confirm.
            pjr.train_log_path = 1
            pjr.save()
            # save log path; attach profiler artifacts only if the URL is
            # actually reachable (HTTP 200).
            # NOTE: `commands` is Python-2-only stdlib
            # (subprocess.getoutput on Python 3).
            pjrl = bm.JobResultsLog()
            pjrl.result_id = pjr.result_id
            cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
            if commands.getoutput(cmd) != '200':
                pjrl.log_path = json.dumps({"train_log_path": train_log_path})
            else:
                pjrl.log_path = json.dumps({
                    "train_log_path": train_log_path,
                    "profiler_log_path": profiler_log_path,
                    "profiler_path": profiler_path
                })
            pjrl.save()
            # save cpu & gpu utilization results (speed jobs only)
            if report_index == 1:
                pjr_cpu = bm.JobResults()
                pjr_cpu.job_id = job_id
                pjr_cpu.model_name = job_info["model_name"]
                pjr_cpu.report_index_id = 7
                pjr_cpu.report_result = cpu_utilization_result
                pjr_cpu.save()
                pjr_gpu = bm.JobResults()
                pjr_gpu.job_id = job_id
                pjr_gpu.model_name = job_info["model_name"]
                pjr_gpu.report_index_id = 8
                pjr_gpu.report_result = gpu_utilization_result
                pjr_gpu.save()
        except Exception as pfe:
            # BUG FIX: was the Python-2-only statement `print pfe`; the
            # function form matches the rest of this file.
            print(pfe)
        else:
            print(
                "models: {}, run_machine_type: {}, index: {}, result: {}".
                format(job_info["model_name"], run_machine_type,
                       report_index, result))
            # Normalize empty or '-inf' results (speed jobs can emit -inf).
            if not result or result == '-inf':
                result = 0
            value = check_results(job_info["model_name"], report_index,
                                  run_machine_type, result)
            if value:
                current_html_result = [
                    job_info["model_name"], run_machine_type,
                    job_info["index"], value[0], result, value[1]
                ]
                html_results.append(current_html_result)
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """
    Parse log index files named ``<model>_<index>_<ngpus>`` (e.g.
    ``CycleGAN_mem_1gpus``) and insert the results into the DB.

    Speed results are read from the log tail via a per-model keyword
    (``FPS:`` for CV models, ``Avg:`` otherwise); memory and max-batch-size
    results come from the last token of the last line.  Out-of-range
    results are collected and mailed out at the end.

    :param args: parsed CLI namespace; reads code_commit_id, job_type,
        cuda_version, cudnn_version, gpu_type, implement_type and log_path.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    cv_models = ['DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50',
                 'yolov3']
    # nlp_models = ['bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer']
    # rl_models = ['ddpg_deep_explore']
    multi_process_models = ['mask_rcnn', 'yolov3', 'transformer_base',
                            'transformer_big', 'bert', 'SE-ResNeXt50']
    html_results = []
    # FIX: loop variable renamed from `file`, which shadowed the builtin.
    for index_file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = index_file.split('/')[-1]
        model_name = '_'.join(file_name.split('_')[:-2])
        # CV models report throughput as FPS; everything else as "Avg:".
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = file_name.split('_')[-2]
        if task_index == 'speed':
            report_index = 1
        elif task_index == 'mem':
            report_index = 2
        else:
            report_index = 6
        run_machine_type = dict_run_machine_type[file_name.split('_')[-1]]
        run_mode = "mp" if file_name.split('_')[-1] == "8gpus8p" else "sp"
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.gpu_type = args.gpu_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        #log_server = log_server_cuda9 if args.cuda_version == '9.0' else log_server_cuda10
        log_server = socket.gethostname()
        #todo config the log_server port
        log_server = "http://" + log_server + ":8777/"
        # Train-log file name, e.g. CycleGAN_train_speed_1 (+ "_mp"/"_sp"
        # suffix for models that also run in multi-process mode).
        train_log_name = "{}_{}_{}_{}".format(model_name, "train", task_index,
                                              file_name.split('_')[-1][0])
        if model_name in multi_process_models:
            train_log_name += "_{}".format(run_mode)
        train_log_path = os.path.join(os.path.basename(args.log_path),
                                      "train_log", train_log_name)
        train_log_path = log_server + train_log_path
        job_id = get_job_id(cluster_job_id)
        result = ""
        with open(index_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
        try:
            if report_index == 2:
                # memory: last token of the last line
                value = file_lines[-1].split()[-1]
                result = int(value) if str.isdigit(value) else 0
            elif report_index == 1:
                # speed: scan the tail of the log for the metric keyword
                lines = file_lines[-10:-1]
                for line in lines:
                    if key_word in line:
                        result = line.split(':')[1].split(' ')[1]
            else:
                # max batch size: last token of the last line
                value = file_lines[-1].split()[-1]
                result = int(value) if str.isdigit(value) else 0
            pjr = bm.JobResults()
            pjr.job_id = job_id
            pjr.model_name = model_name
            pjr.report_index_id = report_index
            pjr.report_result = result
            pjr.train_log_path = train_log_path
            pjr.save()
        except Exception as pfe:
            # BUG FIX: was the Python-2-only statement `print pfe`.
            print(pfe)
        else:
            print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                model_name, run_machine_type, task_index, result))
            # Normalize empty or '-inf' results (speed jobs can emit -inf).
            if not result or result == '-inf':
                result = 0
            value = check_results(model_name, report_index, run_machine_type,
                                  result)
            if value:
                current_html_result = [model_name, run_machine_type,
                                       task_index, value[0], result, value[1]]
                html_results.append(current_html_result)
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """
    Parse log index files and insert the results into the DB.

    The JSON job description on the last line of each index file selects
    which result markers are extracted from the preceding log lines, then
    job, result and log-path rows are written via the ``insert_*``
    helpers, and out-of-range results are collected for an alert email.

    :param args: parsed CLI namespace; reads implement_type,
        paddle_version, log_path and whatever insert_job consumes.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    html_results = []
    for job_file in file_list:
        result = 0
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
        try:
            # The job description is the JSON document on the last line.
            job_info = json.loads(file_lines[-1])
        except Exception:
            print("file {} parse error".format(job_file))
            continue
        # check model if exist in db
        get_or_insert_model(job_info["model_name"], job_info["mission_name"],
                            job_info["direction_id"])
        # save job
        if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
            run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
        else:
            run_machine_type = DICT_RUN_MACHINE_TYPE[str(job_info["gpu_num"])]
        job_id = insert_job(image_id, run_machine_type, job_info, args).job_id
        # parse job results
        cpu_utilization_result = 0
        gpu_utilization_result = 0
        unit = ''
        mem_result = 0
        try:
            if job_info["index"] == 1:  # speed (+ utilization, + memory)
                result = job_info['FINAL_RESULT']
                unit = job_info['UNIT']
                for line in file_lines:
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
                    # TODO: for dynamic graph, throughput and memory come
                    # from a single task; once static graph is merged the
                    # same way, delete this check and the
                    # `elif job_info["index"] == 2` branch below.
                    if "MAX_GPU_MEMORY_USE" in line and args.implement_type != 'static_graph':
                        value = line.strip().split("=")[1].strip()
                        mem_result = int(value) if str.isdigit(value) else 0
            elif job_info["index"] == 2:  # memory
                for line in file_lines:
                    if "MAX_GPU_MEMORY_USE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        unit = 'MiB'
                        break
            elif job_info["index"] == 3:  # profiler summary (stored as JSON)
                result = json.dumps(job_info['FINAL_RESULT'])
            else:  # max batch size
                for line in file_lines:
                    if "MAX_BATCH_SIZE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        break
            # save job results
            pjr = insert_results(job_id, job_info["model_name"],
                                 job_info["index"], result, unit, 1)
            log_file = job_info["log_file"].split("/")[-1]
            log_base = args.paddle_version + "/" + args.implement_type
            train_log_path = LOG_SERVER + os.path.join(log_base, "train_log",
                                                       log_file)
            log_save_dict = {"train_log_path": train_log_path}
            if job_info["index"] == 1:
                insert_results(job_id, job_info["model_name"], 7,
                               cpu_utilization_result, '%')
                insert_results(job_id, job_info["model_name"], 8,
                               gpu_utilization_result, '%')
                # Profiler artifacts are only linked for single-GPU runs.
                if int(job_info["gpu_num"]) == 1:
                    profiler_log = job_info["log_with_profiler"].split("/")[-1]
                    profiler_path = job_info["profiler_path"].split("/")[-1]
                    profiler_log_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_log)
                    profiler_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_path)
                    log_save_dict["profiler_log_path"] = profiler_log_path
                    log_save_dict["profiler_path"] = profiler_path
            pjrl = bm.JobResultsLog()
            pjrl.result_id = pjr.result_id
            pjrl.log_path = json.dumps(log_save_dict)
            pjrl.save()
            # TODO: once static graph also merges throughput and memory into
            # one task, this check can be deleted.
            if args.implement_type != 'static_graph':
                pjr = insert_results(job_id, job_info["model_name"], 2,
                                     mem_result, 'MiB', 0)
        except Exception as pfe:
            # BUG FIX: was the Python-2-only statement `print pfe`.
            print(pfe)
        else:
            print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                job_info["model_name"], run_machine_type, job_info["index"],
                result))
            if job_info["index"] != 3:
                check_results(job_info["model_name"], job_info["index"],
                              run_machine_type, result, html_results)
                # TODO: once static graph also merges throughput and memory
                # into one task, this check can be deleted.
                if args.implement_type != 'static_graph':
                    check_results(job_info["model_name"], 2, run_machine_type,
                                  mem_result, html_results)
            else:
                check_results(job_info["model_name"], job_info["index"],
                              run_machine_type, result, html_results,
                              "Framework_Total")
                check_results(job_info["model_name"], job_info["index"],
                              run_machine_type, result, html_results,
                              "GpuMemcpy_Total")
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """
    Parse log index files and insert the results into the DB.

    The JSON job description on the last line of each index file selects
    which result markers are extracted from the preceding log lines; jobs
    and results are persisted through the ``insert_*`` helpers, and
    out-of-range results are collected for an alert email.

    :param args: parsed CLI namespace; reads log_path and whatever
        insert_job consumes.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    html_results = []
    for job_file in file_list:
        result = 0
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
        try:
            # The job description is the JSON document on the last line.
            job_info = json.loads(file_lines[-1])
        except Exception:
            print("file {} parse error".format(job_file))
            continue
        # save job
        if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
            run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
        else:
            run_machine_type = DICT_RUN_MACHINE_TYPE[str(job_info["gpu_num"])]
        job_id = insert_job(image_id, run_machine_type, job_info, args).job_id
        # parse job results
        cpu_utilization_result = 0
        gpu_utilization_result = 0
        try:
            if job_info["index"] == 1:  # speed (+ CPU/GPU utilization)
                result = job_info['FINAL_RESULT']
                for line in file_lines:
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
            elif job_info["index"] == 2:  # memory
                for line in file_lines:
                    if "MAX_GPU_MEMORY_USE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        break
            elif job_info["index"] == 3:  # profiler summary (stored as JSON)
                result = json.dumps(job_info['FINAL_RESULT'])
            else:  # max batch size
                for line in file_lines:
                    if "MAX_BATCH_SIZE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if str.isdigit(value) else 0
                        break
            # save job results
            pjr = insert_results(job_id, job_info["model_name"],
                                 job_info["index"], result, 1)
            log_file = job_info["log_file"].split("/")[-1]
            train_log_path = LOG_SERVER + os.path.join(
                os.path.basename(args.log_path), "train_log", log_file)
            log_save_dict = {"train_log_path": train_log_path}
            if job_info["index"] == 1:
                insert_results(job_id, job_info["model_name"], 7,
                               cpu_utilization_result)
                insert_results(job_id, job_info["model_name"], 8,
                               gpu_utilization_result)
                # Profiler artifacts are only linked for single-GPU runs.
                if int(job_info["gpu_num"]) == 1:
                    profiler_log = job_info["log_with_profiler"].split("/")[-1]
                    profiler_path = job_info["profiler_path"].split("/")[-1]
                    profiler_log_path = LOG_SERVER + os.path.join(
                        os.path.basename(args.log_path), "profiler_log",
                        profiler_log)
                    profiler_path = LOG_SERVER + os.path.join(
                        os.path.basename(args.log_path), "profiler_log",
                        profiler_path)
                    log_save_dict["profiler_log_path"] = profiler_log_path
                    log_save_dict["profiler_path"] = profiler_path
            pjrl = bm.JobResultsLog()
            pjrl.result_id = pjr.result_id
            pjrl.log_path = json.dumps(log_save_dict)
            pjrl.save()
            # cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
            #if commands.getoutput(cmd) != '200':
        except Exception as pfe:
            # BUG FIX: was the Python-2-only statement `print pfe`.
            print(pfe)
        else:
            print(
                "models: {}, run_machine_type: {}, index: {}, result: {}".
                format(job_info["model_name"], run_machine_type,
                       job_info["index"], result))
            if job_info["index"] != 3:
                check_results(job_info, run_machine_type, result, html_results)
            else:
                # Was `elif job_info["index"] == 3`, redundant after `!= 3`.
                check_results(job_info, run_machine_type, result, html_results,
                              "Framework_Total")
                check_results(job_info, run_machine_type, result, html_results,
                              "GpuMemcpy_Total")
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)