def insert_results(job_id, model_name, report_index_id, result, log_path=0):
    """Persist a single job-result row to the DB and return the saved model."""
    record = bm.JobResults()
    record.job_id = job_id
    record.model_name = model_name
    record.report_index_id = report_index_id
    record.report_result = result
    record.train_log_path = log_path
    record.save()
    return record
def insert_results(job_id, model_name, report_index_id, result, unit,
                   log_path=0, benchmark=0, outlier=0):
    """Persist a job-result row (with unit/outlier/benchmark metadata) to the DB.

    Returns the saved JobResults instance.
    """
    row = bm.JobResults()
    # assign in one pass; order mirrors the DB row layout used elsewhere
    for field, value in (("job_id", job_id),
                         ("model_name", model_name),
                         ("report_index_id", report_index_id),
                         ("report_result", result),
                         ("unit", unit),
                         ("outlier", outlier),
                         ("benchmark", benchmark),
                         ("train_log_path", log_path)):
        setattr(row, field, value)
    row.save()
    return row
def insert_result(job_info, job_result, log_dict):
    """Insert one JobResults row per report index of a job.

    :param job_info: job record providing job_id, model_name and a
        comma-separated report_index string.
    :param job_result: {"performance": {"acc1": (step_last, result_avg),
        "acc5": (step_last, result_avg)}} or {"speed": result_avg} etc.
    :param log_dict: raw result log, stored only for index 0 (performance).
    :return: None
    """
    # report indexes 1-5 map directly onto a job_result key
    index_key_map = {
        1: "speed",
        2: "gpu_train_mem_max",
        3: "train_time",
        4: "infer_speed",
        5: "gpu_infer_mem_max",
    }
    job_id = job_info.job_id
    model_name = job_info.model_name
    # highest index first, matching the original insertion order
    report_indexes = sorted(
        (int(piece) for piece in str(job_info.report_index).split(',')),
        reverse=True)
    for report_index in report_indexes:
        mr = bm.JobResults()
        mr.job_id = job_id
        mr.model_name = model_name
        mr.report_index_id = report_index
        if report_index == 0:
            # index 0 (performance): keep only the averaged value per metric
            mr.report_result = {
                metric: pair[1]
                for metric, pair in job_result["performance"].items()
            }
            mr.result_log = log_dict
        elif report_index in index_key_map:
            mr.report_result = job_result[index_key_map[report_index]]
        else:
            logging.error("error!")
        mr.save()
def parse_logs(args):
    """Parse benchmark index files, store jobs/results in the DB and build
    an alarm-email summary for out-of-range results.

    Each file under <args.log_path>/index carries one JSON job record on
    its last line; earlier lines hold KEY=VALUE markers (FINAL_RESULT,
    MAX_GPU_MEMORY_USE, MAX_BATCH_SIZE, AVG_CPU_USE, AVG_GPU_USE).

    :param args: parsed CLI namespace; reads log_path, paddle_version,
        code_commit_id, job_type, cuda_version, cudnn_version,
        device_type, implement_type.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1': 'ONE_GPU',
        '4': 'FOUR_GPU',
        '8': 'MULTI_GPU',
        '8mp': 'MULTI_GPU_MULTI_PROCESS'
    }
    report_index_dict = {'speed': 1, 'mem': 2, 'maxbs': 6}
    html_results = []
    for job_file in file_list:
        cluster_job_id = uuid.uuid1()
        result = ""
        # fix: open read-only; the file is only read, never written back
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                # the JSON job description is the last line of the index file
                job_info = json.loads(file_lines[-1])
            except Exception:
                print("file {} parse error".format(job_file))
                continue
            # save_job
            if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
                run_machine_type = dict_run_machine_type['8mp']
            else:
                run_machine_type = dict_run_machine_type[str(
                    job_info["gpu_num"])]
            report_index = report_index_dict[job_info["index"]]
            pj = bm.Job()
            pj.job_name = "pb_{}_{}".format(args.paddle_version,
                                            job_info["model_name"])
            pj.cluster_job_id = cluster_job_id
            pj.cluster_type_id = "LocalJob"
            pj.model_name = job_info["model_name"]
            pj.report_index = report_index
            pj.code_branch = "master"
            pj.code_commit_id = args.code_commit_id
            pj.job_type = args.job_type
            pj.run_machine_type = run_machine_type
            pj.frame_id = 0
            pj.image_id = image_id
            pj.cuda_version = args.cuda_version
            pj.cudnn_version = args.cudnn_version
            pj.device_type = args.device_type
            pj.model_implement_type = args.implement_type
            pj.log_extracted = "yes"
            pj.save()
            job_id = pj.job_id
            log_server = socket.gethostname()
            # todo config the log_server port
            log_server = "http://" + log_server + ":8777/"
            log_file = job_info["log_file"].split("/")[-1]
            profiler_log = job_info["log_with_profiler"].split("/")[-1]
            profiler_path = job_info["profiler_path"].split("/")[-1]
            train_log_path = log_server + os.path.join(
                os.path.basename(args.log_path), "train_log", log_file)
            profiler_log_path = log_server + os.path.join(
                os.path.basename(args.log_path), "profiler_log", profiler_log)
            profiler_path = log_server + os.path.join(
                os.path.basename(args.log_path), "profiler_log", profiler_path)
            cpu_utilization_result = 0
            gpu_utilization_result = 0
            try:
                if report_index == 2:
                    # memory: first MAX_GPU_MEMORY_USE marker wins
                    for line in file_lines:
                        if "MAX_GPU_MEMORY_USE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break
                elif report_index == 1:
                    # speed: no break — keep scanning so the CPU/GPU
                    # utilization markers are also collected
                    for line in file_lines:
                        if "FINAL_RESULT" in line:
                            result = line.strip().split("=")[1]
                        if 'AVG_CPU_USE' in line:
                            cpu_utilization_result = line.strip().split('=')[1]
                        if 'AVG_GPU_USE' in line:
                            gpu_utilization_result = line.strip().split('=')[1]
                else:
                    # max batch size
                    for line in file_lines:
                        if "MAX_BATCH_SIZE" in line:
                            value = line.strip().split("=")[1].strip()
                            result = int(value) if str.isdigit(value) else 0
                            break
                # save_result
                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = job_info["model_name"]
                pjr.report_index_id = report_index
                pjr.report_result = result
                # NOTE(review): hard-coded to 1 while the actual paths go
                # into JobResultsLog below — confirm this is intentional
                pjr.train_log_path = 1
                pjr.save()
                # save log path
                pjrl = bm.JobResultsLog()
                pjrl.result_id = pjr.result_id
                # probe the profiler log over HTTP; record it only when reachable
                # NOTE(review): `commands` is Python-2-only; on Python 3 this
                # must become subprocess.getoutput
                cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
                if commands.getoutput(cmd) != '200':
                    pjrl.log_path = json.dumps(
                        {"train_log_path": train_log_path})
                else:
                    pjrl.log_path = json.dumps({
                        "train_log_path": train_log_path,
                        "profiler_log_path": profiler_log_path,
                        "profiler_path": profiler_path
                    })
                pjrl.save()
                # save cpu & gpu result
                if report_index == 1:
                    pjr_cpu = bm.JobResults()
                    pjr_cpu.job_id = job_id
                    pjr_cpu.model_name = job_info["model_name"]
                    pjr_cpu.report_index_id = 7
                    pjr_cpu.report_result = cpu_utilization_result
                    pjr_cpu.save()
                    pjr_gpu = bm.JobResults()
                    pjr_gpu.job_id = job_id
                    pjr_gpu.model_name = job_info["model_name"]
                    pjr_gpu.report_index_id = 8
                    pjr_gpu.report_result = gpu_utilization_result
                    pjr_gpu.save()
            except Exception as pfe:
                # fix: print-function form (valid on Python 2 and 3; the
                # original `print pfe` statement is Python-2-only syntax)
                print(pfe)
            else:
                print(
                    "models: {}, run_machine_type: {}, index: {}, result: {}".
                    format(job_info["model_name"], run_machine_type,
                           report_index, result))
                # speed results can come back empty or '-inf'; normalize to 0
                if not result or result == '-inf':
                    result = 0
                value = check_results(job_info["model_name"], report_index,
                                      run_machine_type, result)
                if value:
                    current_html_result = [
                        job_info["model_name"], run_machine_type,
                        job_info["index"], value[0], result, value[1]
                    ]
                    html_results.append(current_html_result)
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """Parse benchmark index files (named like CycleGAN_mem_1gpus) and
    insert one Job plus one JobResults row per file into the DB; collect
    out-of-range results for an alarm email.

    :param args: parsed CLI namespace; reads log_path, code_commit_id,
        job_type, cuda_version, cudnn_version, gpu_type, implement_type.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    # cv models report "FPS:", all others an "Avg:" value
    cv_models = ['DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50',
                 'yolov3']
    # nlp_models = ['bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer']
    # rl_models = ['ddpg_deep_explore']
    multi_process_models = ['mask_rcnn', 'yolov3', 'transformer_base',
                            'transformer_big', 'bert', 'SE-ResNeXt50']
    html_results = []
    # fix: loop variable renamed from `file`, which shadows the builtin
    for index_file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = index_file.split('/')[-1]
        model_name = '_'.join(file_name.split('_')[:-2])
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = file_name.split('_')[-2]
        if task_index == 'speed':
            report_index = 1
        elif task_index == 'mem':
            report_index = 2
        else:
            report_index = 6
        run_machine_type = dict_run_machine_type[file_name.split('_')[-1]]
        run_mode = "mp" if file_name.split('_')[-1] == "8gpus8p" else "sp"
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.gpu_type = args.gpu_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        #log_server = log_server_cuda9 if args.cuda_version == '9.0' else log_server_cuda10
        log_server = socket.gethostname()
        #todo config the log_server port
        log_server = "http://" + log_server + ":8777/"
        train_log_name = "{}_{}_{}_{}".format(model_name, "train", task_index,
                                              file_name.split('_')[-1][0])
        if model_name in multi_process_models:
            train_log_name += "_{}".format(run_mode)
        train_log_path = os.path.join(os.path.basename(args.log_path),
                                      "train_log", train_log_name)
        train_log_path = log_server + train_log_path
        job_id = get_job_id(cluster_job_id)
        result = ""
        # fix: open read-only; the file is only read
        with open(index_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                if report_index == 2:
                    # memory: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                elif report_index == 1:
                    # speed: scan the trailing lines; last match wins
                    lines = file_lines[-10:-1]
                    for line in lines:
                        if key_word in line:
                            result = line.split(':')[1].split(' ')[1]
                else:
                    # max batch size: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = model_name
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = train_log_path
                pjr.save()
            except Exception as pfe:
                # fix: print-function form (valid on Python 2 and 3; the
                # original `print pfe` statement is Python-2-only syntax)
                print(pfe)
            else:
                print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                    model_name, run_machine_type, task_index, result))
                # speed results can come back empty or '-inf'; normalize to 0
                if not result or result == '-inf':
                    result = 0
                value = check_results(model_name, report_index,
                                      run_machine_type, result)
                if value:
                    current_html_result = [model_name, run_machine_type,
                                           task_index, value[0], result,
                                           value[1]]
                    html_results.append(current_html_result)
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """Parse benchmark index files under <args.log_path>/index and insert
    one Job plus one JobResults row per file into the DB.

    :param args: parsed CLI namespace; reads log_path, code_commit_id,
        cuda_version, cudnn_version.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    # cv models report "FPS:", all others an "Avg:" value
    cv_models = [
        'DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50', 'yolov3'
    ]
    # fix: the nlp/rl lists were never read — kept for reference only
    # nlp_models = ['bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer']
    # rl_models = ['ddpg_deep_explore']
    # fix: loop variable renamed from `file`, which shadows the builtin
    for index_file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = index_file.split('/')[-1]
        model_name = '_'.join(file_name.split('_')[:-2])
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = file_name.split('_')[-2]
        if task_index == 'speed':
            report_index = 1
        elif task_index == 'mem':
            report_index = 2
        else:
            report_index = 6
        run_machine_type = dict_run_machine_type[file_name.split('_')[-1]]
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = 2
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.log_extracted = "yes"
        pj.save()
        train_log_name = "{}_{}_{}_{}".format(model_name, "train", task_index,
                                              file_name.split('_')[-1][0])
        train_log_path = os.path.join(os.path.basename(args.log_path),
                                      "train_log", train_log_name)
        job_id = get_job_id(cluster_job_id)
        result = ""
        # fix: open read-only; the file is only read
        with open(index_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                if report_index == 2:
                    # memory: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                elif report_index == 1:
                    # speed: scan the trailing lines; last match wins
                    lines = file_lines[-10:-1]
                    for line in lines:
                        if key_word in line:
                            result = line.split(':')[1].split(' ')[1]
                else:
                    # max batch size: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = model_name
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = train_log_path
                pjr.save()
            except Exception as pfe:
                # fix: print-function form (valid on Python 2 and 3; the
                # original `print pfe` statement is Python-2-only syntax)
                print(pfe)