def insert_job(image_id, run_machine_type, job_info, args):
    """Create and persist one benchmark Job row for a local run.

    :param image_id: id of the docker/runtime image the job ran under
    :param run_machine_type: machine-type label (e.g. ONE_GPU, MULTI_GPU)
    :param job_info: dict parsed from the index log; "model_name" and
        "index" are read here
    :param args: parsed CLI namespace carrying version/commit/type fields
    :return: the saved bm.Job instance
    """
    job = bm.Job()
    # identity of this run
    job.job_name = "pb_{}_{}".format(args.paddle_version,
                                     job_info["model_name"])
    job.cluster_job_id = uuid.uuid1()
    job.cluster_type_id = "LocalJob"
    job.model_name = job_info["model_name"]
    job.report_index = job_info["index"]
    # source / build provenance
    job.code_branch = "master"
    job.code_commit_id = args.code_commit_id
    job.job_type = args.job_type
    job.run_machine_type = run_machine_type
    job.frame_id = 0
    job.image_id = image_id
    # runtime environment
    job.cuda_version = args.cuda_version
    job.cudnn_version = args.cudnn_version
    job.device_type = args.device_type
    job.model_implement_type = args.implement_type
    job.log_extracted = "yes"
    job.save()
    return job
def insert_job(job_instance):
    """Persist a submitted job's configuration as a Job row via django.

    Resolves "latest" image ids through get_frame_latest_version, copies
    the remaining fields straight out of ``job_instance.conf``, and saves
    the row with status "submit".

    :param job_instance: object whose ``conf`` dict holds the job payload
    :return: None
    """
    conf = job_instance.conf
    # "latest" is a symbolic image id; resolve it to a concrete one
    if conf["image_id"] == "latest":
        image_id = get_frame_latest_version(conf["frame_id"]).image_id
    else:
        image_id = conf["image_id"]
    record = bm.Job()
    record.job_name = conf["body"]["jobName"]
    record.cluster_job_id = conf["cluster_job_id"]
    record.cluster_type_id = conf["cluster_type_id"]
    record.cluster_conf = conf["body"]["clusterConf"]
    record.model_name = conf["model"]
    record.report_index = conf["report_index"]
    record.repo_address = conf["repo_address"]
    record.code_branch = conf["code_branch"]
    record.job_type = conf["job_type"]
    record.run_rpc_type = conf["run_rpc_type"]
    # NOTE(review): "run_machine_tpye" looks like a typo'd key, but it is
    # what the producer writes — do not "fix" it here without fixing the
    # producer side as well.
    record.run_machine_type = conf["run_machine_tpye"]
    record.batch_size = conf["batch_size"]
    record.frame_id = conf["frame_id"]
    record.image_id = image_id
    record.cuda_version = conf["cuda_version"]
    record.cudnn_version = conf["cudnn_version"]
    record.run_cmd = conf["run_cmd"]
    record.eval_cmd = conf["eval_cmd"]
    record.infer_cmd = conf["infer_cmd"]
    # scheduling policy object ("ploy" key is presumably "policy" —
    # preserved as-is; verify against the producer)
    record.submit_period = conf["ploy"].submit_period
    record.check_period = conf["ploy"].check_period
    record.statistics_unit = conf["ploy"].statistics_unit
    record.status = "submit"
    record.save()
def _extract_metrics(file_lines, report_index):
    """Scan raw log lines for the metric selected by *report_index*.

    :param file_lines: list of raw lines from the index log file
    :param report_index: 1 = speed, 2 = gpu memory, anything else = max
        batch size
    :return: tuple ``(result, cpu_utilization, gpu_utilization)``;
        ``result`` is "" when no matching line is found, the utilization
        values default to 0 and are only populated for speed runs
    """
    result = ""
    cpu_utilization = 0
    gpu_utilization = 0
    if report_index == 2:
        # first MAX_GPU_MEMORY_USE line wins
        for line in file_lines:
            if "MAX_GPU_MEMORY_USE" in line:
                value = line.strip().split("=")[1].strip()
                result = int(value) if str.isdigit(value) else 0
                break
    elif report_index == 1:
        # speed: keep scanning — the LAST occurrence of each key wins
        for line in file_lines:
            if "FINAL_RESULT" in line:
                result = line.strip().split("=")[1]
            if 'AVG_CPU_USE' in line:
                cpu_utilization = line.strip().split('=')[1]
            if 'AVG_GPU_USE' in line:
                gpu_utilization = line.strip().split('=')[1]
    else:
        # max batch size: first MAX_BATCH_SIZE line wins
        for line in file_lines:
            if "MAX_BATCH_SIZE" in line:
                value = line.strip().split("=")[1].strip()
                result = int(value) if str.isdigit(value) else 0
                break
    return result, cpu_utilization, gpu_utilization


def parse_logs(args):
    """
    Parse JSON index log files, persist each run as Job/JobResults rows,
    and e-mail an HTML summary for any metric outside its expected range.
    :param args: parsed CLI namespace (log_path, paddle_version,
        code_commit_id, job_type, cuda/cudnn versions, device_type,
        implement_type)
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1': 'ONE_GPU',
        '4': 'FOUR_GPU',
        '8': 'MULTI_GPU',
        '8mp': 'MULTI_GPU_MULTI_PROCESS'
    }
    report_index_dict = {'speed': 1, 'mem': 2, 'maxbs': 6}
    # loop-invariant: build the log server base URL once
    # todo: config the log_server port
    log_server = "http://" + socket.gethostname() + ":8777/"
    html_results = []
    for job_file in file_list:
        cluster_job_id = uuid.uuid1()
        result = ""
        with open(job_file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
        # the last line of the file is the JSON job descriptor
        try:
            job_info = json.loads(file_lines[-1])
        except Exception:
            print("file {} parse error".format(job_file))
            continue
        # 8-GPU multi-process runs get their own machine type label
        if str(job_info["gpu_num"]) == "8" and job_info["run_mode"] == "mp":
            run_machine_type = dict_run_machine_type['8mp']
        else:
            run_machine_type = dict_run_machine_type[str(
                job_info["gpu_num"])]
        report_index = report_index_dict[job_info["index"]]
        # persist the job row
        pj = bm.Job()
        pj.job_name = "pb_{}_{}".format(args.paddle_version,
                                        job_info["model_name"])
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = "LocalJob"
        pj.model_name = job_info["model_name"]
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.device_type = args.device_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        job_id = pj.job_id
        # build http paths to the raw/profiler logs served by log_server
        log_file = job_info["log_file"].split("/")[-1]
        profiler_log = job_info["log_with_profiler"].split("/")[-1]
        profiler_file = job_info["profiler_path"].split("/")[-1]
        train_log_path = log_server + os.path.join(
            os.path.basename(args.log_path), "train_log", log_file)
        profiler_log_path = log_server + os.path.join(
            os.path.basename(args.log_path), "profiler_log", profiler_log)
        profiler_path = log_server + os.path.join(
            os.path.basename(args.log_path), "profiler_log", profiler_file)
        try:
            result, cpu_utilization_result, gpu_utilization_result = \
                _extract_metrics(file_lines, report_index)
            # save the result row (train_log_path column holds a flag
            # here — presumably the real paths live in JobResultsLog
            # below; confirm against the schema)
            pjr = bm.JobResults()
            pjr.job_id = job_id
            pjr.model_name = job_info["model_name"]
            pjr.report_index_id = report_index
            pjr.report_result = result
            pjr.train_log_path = 1
            pjr.save()
            # save log paths; only link profiler artifacts when the
            # server actually has them (HTTP 200)
            pjrl = bm.JobResultsLog()
            pjrl.result_id = pjr.result_id
            cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
            if commands.getoutput(cmd) != '200':
                pjrl.log_path = json.dumps(
                    {"train_log_path": train_log_path})
            else:
                pjrl.log_path = json.dumps({
                    "train_log_path": train_log_path,
                    "profiler_log_path": profiler_log_path,
                    "profiler_path": profiler_path
                })
            pjrl.save()
            # speed runs additionally record cpu (index 7) and gpu
            # (index 8) utilization as separate result rows
            if report_index == 1:
                pjr_cpu = bm.JobResults()
                pjr_cpu.job_id = job_id
                pjr_cpu.model_name = job_info["model_name"]
                pjr_cpu.report_index_id = 7
                pjr_cpu.report_result = cpu_utilization_result
                pjr_cpu.save()
                pjr_gpu = bm.JobResults()
                pjr_gpu.job_id = job_id
                pjr_gpu.model_name = job_info["model_name"]
                pjr_gpu.report_index_id = 8
                pjr_gpu.report_result = gpu_utilization_result
                pjr_gpu.save()
        except Exception as pfe:
            # fixed: was the Python-2-only `print pfe` statement
            print(pfe)
        else:
            print(
                "models: {}, run_machine_type: {}, index: {}, result: {}".
                format(job_info["model_name"], run_machine_type,
                       report_index, result))
            # treat empty or '-inf' (can occur for speed) as zero
            if not result or result == '-inf':
                result = 0
            value = check_results(job_info["model_name"], report_index,
                                  run_machine_type, result)
            if value:
                html_results.append([
                    job_info["model_name"], run_machine_type,
                    job_info["index"], value[0], result, value[1]
                ])
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args):
    """
    Parse per-model index files (named like ``CycleGAN_mem_1gpus`` or
    ``ddpg_deep_explore_speed_1gpus``), persist each run as Job and
    JobResults rows, and e-mail an HTML summary for any value outside
    its expected range.
    :param args: parsed CLI namespace (log_path, code_commit_id,
        job_type, cuda/cudnn versions, gpu_type, implement_type)
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    cv_models = [
        'DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50', 'yolov3'
    ]
    # nlp_models = ['bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer']
    # rl_models = ['ddpg_deep_explore']
    multi_process_models = [
        'mask_rcnn', 'yolov3', 'transformer_base', 'transformer_big',
        'bert', 'SE-ResNeXt50'
    ]
    # speed -> 1, mem -> 2, anything else (maxbs) -> 6
    report_index_map = {'speed': 1, 'mem': 2}
    #log_server = log_server_cuda9 if args.cuda_version == '9.0' else log_server_cuda10
    # loop-invariant: build the log server base URL once
    #todo config the log_server port
    log_server = "http://" + socket.gethostname() + ":8777/"
    html_results = []
    for file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = file.split('/')[-1]
        model_name = '_'.join(file_name.split('_')[:-2])
        # CV logs report FPS, all others report an "Avg:" metric
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = file_name.split('_')[-2]
        report_index = report_index_map.get(task_index, 6)
        gpu_tag = file_name.split('_')[-1]
        run_machine_type = dict_run_machine_type[gpu_tag]
        run_mode = "mp" if gpu_tag == "8gpus8p" else "sp"
        # persist the job row
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.gpu_type = args.gpu_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        # train log name ends with the gpu count digit ('1'/'4'/'8'),
        # plus the run mode for multi-process capable models
        train_log_name = "{}_{}_{}_{}".format(model_name, "train",
                                              task_index, gpu_tag[0])
        if model_name in multi_process_models:
            train_log_name += "_{}".format(run_mode)
        train_log_path = os.path.join(
            os.path.basename(args.log_path), "train_log", train_log_name)
        train_log_path = log_server + train_log_path
        job_id = get_job_id(cluster_job_id)
        result = ""
        with open(file, 'r+') as file_obj:
            file_lines = file_obj.readlines()
            try:
                if report_index == 2:
                    # mem: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                elif report_index == 1:
                    # speed: scan the trailing lines for the key word;
                    # the last match wins
                    lines = file_lines[-10:-1]
                    for line in lines:
                        if key_word in line:
                            result = line.split(':')[1].split(' ')[1]
                else:
                    # maxbs: last token of the last line
                    value = file_lines[-1].split()[-1]
                    result = int(value) if str.isdigit(value) else 0
                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = model_name
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = train_log_path
                pjr.save()
            except Exception as pfe:
                # fixed: was the Python-2-only `print pfe` statement
                print(pfe)
            else:
                print(
                    "models: {}, run_machine_type: {}, index: {}, result: {}".
                    format(model_name, run_machine_type, task_index, result))
                # treat empty or '-inf' (can occur for speed) as zero
                if not result or result == '-inf':
                    result = 0
                value = check_results(model_name, report_index,
                                      run_machine_type, result)
                if value:
                    html_results.append([
                        model_name, run_machine_type, task_index,
                        value[0], result, value[1]
                    ])
    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
def parse_logs(args): image_id = get_image_id() file_list = load_folder_files(os.path.join(args.log_path, "index")) dict_run_machine_type = { '1gpus': 'ONE_GPU', '4gpus': 'FOUR_GPU', '8gpus': 'MULTI_GPU', '8gpus8p': 'MULTI_GPU_MULTI_PROCESS' } cv_models = [ 'DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50', 'yolov3' ] nlp_models = [ 'bert', 'paddingrnn_large', 'paddingrnn_small', 'transformer' ] rl_models = ['ddpg_deep_explore'] for file in file_list: # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus cluster_job_id = uuid.uuid1() file_name = file.split('/')[-1] model_name = '_'.join(file_name.split('_')[:-2]) key_word = "FPS:" if model_name in cv_models else 'Avg:' job_name = 'pb_' + model_name task_index = file_name.split('_')[-2] if task_index == 'speed': report_index = 1 elif task_index == 'mem': report_index = 2 else: report_index = 6 run_machine_type = dict_run_machine_type[file_name.split('_')[-1]] pj = bm.Job() pj.job_name = job_name pj.cluster_job_id = cluster_job_id pj.cluster_type_id = 0 pj.model_name = model_name pj.report_index = report_index pj.code_branch = "master" pj.code_commit_id = args.code_commit_id pj.job_type = 2 pj.run_machine_type = run_machine_type pj.frame_id = 0 pj.image_id = image_id pj.cuda_version = args.cuda_version pj.cudnn_version = args.cudnn_version pj.log_extracted = "yes" pj.save() train_log_name = "{}_{}_{}_{}".format(model_name, "train", task_index, file_name.split('_')[-1][0]) train_log_path = os.path.join(os.path.basename(args.log_path), "train_log", train_log_name) job_id = get_job_id(cluster_job_id) result = "" with open(file, 'r+') as file_obj: file_lines = file_obj.readlines() try: if report_index == 2: value = file_lines[-1].split()[-1] result = int(value) if str.isdigit(value) else 0 elif report_index == 1: lines = file_lines[-10:-1] for line in lines: if key_word in line: result = line.split(':')[1].split(' ')[1] else: value = file_lines[-1].split()[-1] result = int(value) if str.isdigit(value) 
else 0 pjr = bm.JobResults() pjr.job_id = job_id pjr.model_name = model_name pjr.report_index_id = report_index pjr.report_result = result pjr.train_log_path = train_log_path pjr.save() except Exception as pfe: print pfe