Example #1
0
def parse_logs(args):
    """
    Parse the job log files under ``<args.log_path>/index``, store every job
    and its results through the DB helpers, then build the report email.

    The last line of each log file is expected to be a JSON object describing
    the job; the keys read here are ``model_name``, ``mission_name``,
    ``direction_id``, ``gpu_num``, ``run_mode``, ``index``, ``log_file`` and,
    depending on ``index``, ``FINAL_RESULT``, ``UNIT``, ``log_with_profiler``
    and ``profiler_path``. A file that is empty or whose last line is not
    valid JSON is reported on stdout and skipped.

    Result indexes used below: 1 = speed, 2 = GPU memory, 3 = profiler,
    6 = max batch size, 7 = average CPU utilization, 8 = average GPU
    utilization.

    :param args: options object; this function reads ``log_path``,
        ``paddle_version``, ``implement_type``, ``device_type``,
        ``image_branch``, ``image_commit_id``, ``code_commit_id`` and, for
        P40/V100 devices, ``cuda_version`` and ``cudnn_version``. The object
        is also forwarded to ``insert_job``.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))

    # One table (header + rows) per index name; check_results receives this
    # mapping for every job.
    html_results = OrderedDict()
    for k in DICT_INDEX.values():
        html_results[k] = {"header": TABLE_HEADER, "data": []}

    for job_file in file_list:
        result = 0
        # The log is only read, so open it read-only ('r+' would fail on
        # read-only files) and release the handle before any DB work starts.
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
        try:
            # IndexError: empty file; ValueError: last line is not JSON.
            job_info = json.loads(file_lines[-1])
        except (IndexError, ValueError):
            print("file {} parse error".format(job_file))
            continue

        # check model if exist in db
        get_or_insert_model(job_info["model_name"],
                            job_info["mission_name"],
                            job_info["direction_id"])

        # save job
        gpu_num = str(job_info["gpu_num"])
        if gpu_num == "8" and job_info["run_mode"] == "mp":
            run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
        else:
            run_machine_type = DICT_RUN_MACHINE_TYPE[gpu_num]
        job_id = insert_job(image_id, run_machine_type, job_info,
                            args).job_id

        # parse job results
        cpu_utilization_result = 0
        gpu_utilization_result = 0
        unit = ''
        mem_result = 0
        try:
            if job_info["index"] == 1:
                result = job_info['FINAL_RESULT']
                unit = job_info['UNIT']
                # Utilization / memory figures are "KEY=value" lines in the
                # log; the last matching line wins.
                for line in file_lines:
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
                    if "MAX_GPU_MEMORY_USE" in line:
                        value = line.strip().split("=")[1].strip()
                        mem_result = int(value) if value.isdigit() else 0

            elif job_info["index"] == 3:
                # Stored as a JSON string; decoded again for check_results.
                result = json.dumps(job_info['FINAL_RESULT'])
            else:
                # Only the first MAX_BATCH_SIZE line is considered.
                for line in file_lines:
                    if "MAX_BATCH_SIZE" in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if value.isdigit() else 0
                        break

            # save job results
            pjr = insert_results(job_id, job_info["model_name"],
                                 job_info["index"], result, unit, 1)
            log_file = job_info["log_file"].split("/")[-1]
            log_base = args.paddle_version + "/" + args.implement_type
            train_log_path = LOG_SERVER + os.path.join(
                log_base, "train_log", log_file)
            log_save_dict = {"train_log_path": train_log_path}
            if job_info["index"] == 1:
                insert_results(job_id, job_info["model_name"], 7,
                               cpu_utilization_result, '%')
                insert_results(job_id, job_info["model_name"], 8,
                               gpu_utilization_result, '%')
                pjr2 = insert_results(job_id, job_info["model_name"], 2,
                                      mem_result, 'MiB', 1)
                # The memory result is linked to the train log only; the
                # profiler paths are added afterwards for the speed result.
                # NOTE(review): if this is a Django manager, create() already
                # persists the row -- confirm whether .save() is needed.
                bm.JobResultsLog.objects.create(
                    result_id=pjr2.result_id,
                    log_path=json.dumps(log_save_dict)).save()
                if int(job_info["gpu_num"]) == 1:
                    profiler_log = job_info["log_with_profiler"].split(
                        "/")[-1]
                    profiler_path = job_info["profiler_path"].split(
                        "/")[-1]
                    profiler_log_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_log)
                    profiler_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_path)
                    log_save_dict["profiler_log_path"] = profiler_log_path
                    log_save_dict["profiler_path"] = profiler_path

            bm.JobResultsLog.objects.create(
                result_id=pjr.result_id,
                log_path=json.dumps(log_save_dict)).save()

        except Exception as pfe:
            # Best effort: a job whose results cannot be parsed or stored is
            # reported and left out of the email tables.
            print(pfe)
        else:
            print(
                "models: {}, run_machine_type: {}, index: {}, result: {}".
                format(job_info["model_name"], run_machine_type,
                       job_info["index"], result))

            if job_info["index"] == 1:  # speed
                check_results(
                    job_info["model_name"], job_info["index"],
                    run_machine_type, result, html_results,
                    -1 if args.device_type.lower() == 'cpu' else 1)
                check_results(job_info["model_name"], 2, run_machine_type,
                              mem_result, html_results, -1)  # mem
            elif job_info["index"] == 3:  # profiler
                check_results(job_info["model_name"],
                              job_info["index"], run_machine_type,
                              json.loads(result), html_results, -1,
                              "Framework_Total")
                check_results(job_info["model_name"],
                              job_info["index"], run_machine_type,
                              json.loads(result), html_results, -1,
                              "GpuMemcpy_Total")
            elif job_info["index"] == 6:  # max BS
                check_results(job_info["model_name"], job_info["index"],
                              run_machine_type, result, html_results, 1)
            else:
                print("--------------> please set a correct index(1|3|6)!")

    # generate email file
    title = "frame_benchmark"
    env = dict(paddle_branch=args.image_branch,
               paddle_commit_id=args.image_commit_id,
               benchmark_commit_id=args.code_commit_id,
               device_type=args.device_type,
               implement_type=args.implement_type,
               docker_images=os.getenv('RUN_IMAGE_NAME'))
    # CUDA / cuDNN versions are only reported for the listed GPU devices.
    if args.device_type.upper() in ("P40", "V100"):
        env["cuda_version"] = args.cuda_version
        env["cudnn_version"] = args.cudnn_version
    email_t = template.EmailTemplate(title, env, html_results, args.log_path)
    email_t.construct_email_content()
Example #2
0
def parse_logs(args):
    """
    Parse the job log files under ``<args.log_path>/index``, check and store
    every job's results, then build the report email and the iCafe card.

    The last line of each log file is expected to be a JSON object describing
    the job; the keys read here are ``model_name``, ``mission_name``,
    ``direction_id``, ``gpu_num``, ``run_mode``, ``index``, ``log_file`` and,
    depending on ``index``, ``FINAL_RESULT``, ``UNIT``, ``JOB_FAIL_FLAG``,
    ``log_with_profiler`` and ``profiler_path``. A file that is empty or whose
    last line is not valid JSON is reported on stdout and skipped.

    Result indexes used below: 1 = speed, 2 = GPU memory, 3 = profiler,
    6 = max batch size, 7 = average CPU utilization, 8 = average GPU
    utilization.

    Side effects visible here: failed speed jobs are appended to the
    module-level ``FAIL_LIST``; results are written through
    ``insert_job`` / ``insert_results`` / ``bm.JobResultsLog``.

    :param args: options object; this function reads ``log_path``,
        ``paddle_version``, ``implement_type``, ``device_type``,
        ``image_branch``, ``image_commit_id``, ``code_commit_id`` and, for the
        listed GPU devices, ``cuda_version`` and ``cudnn_version``. The object
        is also forwarded to ``insert_job``.
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))

    # One table (header + rows) per index name; the profiler table has its
    # own header layout.
    html_results = OrderedDict()
    icafe_results = []
    for k in DICT_INDEX.values():
        if k == 'Profiler_info':
            header = TABLE_PROFILE_HEADER
        else:
            header = TABLE_HEADER
        html_results[k] = {"header": header, "data": []}

    for job_file in file_list:
        result = 0
        # The log is only read, so open it read-only ('r+' would fail on
        # read-only files) and release the handle before any DB work starts.
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
        try:
            # IndexError: empty file; ValueError: last line is not JSON.
            job_info = json.loads(file_lines[-1])
        except (IndexError, ValueError):
            print("file {} parse error".format(job_file))
            continue

        # check model if exist in db
        get_or_insert_model(job_info["model_name"],
                            job_info["mission_name"],
                            job_info["direction_id"])

        # save job
        gpu_num = str(job_info["gpu_num"])
        if gpu_num == "8" and job_info["run_mode"] == "mp":
            run_machine_type = DICT_RUN_MACHINE_TYPE['8mp']
        else:
            run_machine_type = DICT_RUN_MACHINE_TYPE[gpu_num]
        job_id = insert_job(image_id, run_machine_type, job_info,
                            args).job_id

        # parse job results
        cpu_utilization_result = 0
        gpu_utilization_result = 0
        unit = ''
        outlier = 0
        outlier_mem = 0
        mem_result = 0
        benchmark = 0
        benchmark_mem = 0
        if job_info["index"] == 1:
            result = job_info['FINAL_RESULT']
            unit = job_info['UNIT']
            fail_flag = job_info['JOB_FAIL_FLAG']
            # Utilization / memory figures are "KEY=value" lines in the log;
            # the last matching line wins.
            for line in file_lines:
                if 'AVG_CPU_USE' in line:
                    cpu_utilization_result = line.strip().split('=')[1]
                if 'AVG_GPU_USE' in line:
                    gpu_utilization_result = line.strip().split('=')[1]
                if "MAX_GPU_MEMORY_USE" in line:
                    value = line.strip().split("=")[1].strip()
                    mem_result = int(value) if value.isdigit() else 0

        elif job_info["index"] == 3:
            # Stored as a JSON string; decoded again for check_results.
            result = json.dumps(job_info['FINAL_RESULT'])
        else:
            # Only the first MAX_BATCH_SIZE line is considered.
            for line in file_lines:
                if "MAX_BATCH_SIZE" in line:
                    value = line.strip().split("=")[1].strip()
                    result = int(value) if value.isdigit() else 0
                    break

        print("models: {}, run_machine_type: {}, index: {}, result: {}".
              format(job_info["model_name"], run_machine_type,
                     job_info["index"], result))
        # check_results and send alarm email
        if job_info["index"] == 1:  # speed
            print_machine_type = machine_type_to_print(run_machine_type)
            # record fail jobs
            print('fail_flag:{}'.format(fail_flag))
            # A zero result or an explicit fail flag marks both the speed and
            # the memory result of this job as outliers.
            if float(result) == 0 or fail_flag == 1:
                FAIL_LIST.append(
                    [job_info["model_name"], print_machine_type])
                outlier = 1
                outlier_mem = 1
                icafe_results.append([
                    job_info["model_name"], print_machine_type, 'fail', []
                ])
            benchmark = check_results(
                job_info["model_name"],
                job_info["index"],
                run_machine_type,
                result,
                html_results,
                -1 if args.device_type.lower() == 'cpu' else 1,
                unit=unit,
                outlier=outlier,
                icafe_results=icafe_results)
            benchmark_mem = check_results(
                job_info["model_name"],
                2,
                run_machine_type,
                mem_result,
                html_results,
                -1,
                outlier=outlier_mem,
                icafe_results=icafe_results)  # mem
        elif job_info["index"] == 3:  # profiler
            check_results(job_info["model_name"],
                          job_info["index"],
                          run_machine_type,
                          json.loads(result),
                          html_results,
                          -1,
                          "Framework_Total",
                          is_profile=True)
            check_results(job_info["model_name"],
                          job_info["index"],
                          run_machine_type,
                          json.loads(result),
                          html_results,
                          -1,
                          "GpuMemcpy_Total",
                          is_profile=True)
        elif job_info["index"] == 6:  # max BS
            check_results(job_info["model_name"], job_info["index"],
                          run_machine_type, result, html_results, 1)
        else:
            print("--------------> please set a correct index(1|3|6)!")

        try:
            # save job results
            pjr = insert_results(job_id,
                                 job_info["model_name"],
                                 job_info["index"],
                                 result,
                                 unit,
                                 1,
                                 benchmark=benchmark,
                                 outlier=outlier)
            log_file = job_info["log_file"].split("/")[-1]
            log_base = args.paddle_version + "/" + args.implement_type
            train_log_path = LOG_SERVER + os.path.join(
                log_base, "train_log", log_file)
            log_save_dict = {"train_log_path": train_log_path}
            if job_info["index"] == 1:
                insert_results(job_id, job_info["model_name"], 7,
                               cpu_utilization_result, '%')
                insert_results(job_id, job_info["model_name"], 8,
                               gpu_utilization_result, '%')
                pjr2 = insert_results(job_id,
                                      job_info["model_name"],
                                      2,
                                      mem_result,
                                      'MiB',
                                      1,
                                      benchmark=benchmark_mem,
                                      outlier=outlier_mem)
                # The memory result is linked to the train log only; the
                # profiler paths are added afterwards for the speed result.
                # NOTE(review): if this is a Django manager, create() already
                # persists the row -- confirm whether .save() is needed.
                bm.JobResultsLog.objects.create(
                    result_id=pjr2.result_id,
                    log_path=json.dumps(log_save_dict)).save()
                if int(job_info["gpu_num"]) == 1:
                    profiler_log = job_info["log_with_profiler"].split(
                        "/")[-1]
                    profiler_path = job_info["profiler_path"].split(
                        "/")[-1]
                    profiler_log_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_log)
                    profiler_path = LOG_SERVER + os.path.join(
                        log_base, "profiler_log", profiler_path)
                    log_save_dict["profiler_log_path"] = profiler_log_path
                    log_save_dict["profiler_path"] = profiler_path

            bm.JobResultsLog.objects.create(
                result_id=pjr.result_id,
                log_path=json.dumps(log_save_dict)).save()
        except Exception as pfe:
            # Best effort: a storage failure for one job is reported and the
            # remaining log files are still processed.
            print(pfe)

    # generate email file
    title = "frame_benchmark"
    env = dict(
        paddle_branch=args.image_branch,
        paddle_commit_id=args.image_commit_id,
        benchmark_commit_id=args.code_commit_id,
        device_type=args.device_type,
        implement_type=args.implement_type,
        docker_images=os.getenv('RUN_IMAGE_NAME'),
        paddle_version=args.paddle_version,
        HostName=os.getenv('HostName'),
    )
    # CUDA / cuDNN versions are only reported for the listed GPU devices.
    if args.device_type.upper() in ("P40", "V100", "A100", "V100-32G",
                                    "V100-16G"):
        env["cuda_version"] = args.cuda_version
        env["cudnn_version"] = args.cudnn_version
    email_t = template.EmailTemplate(title, env, html_results, args.log_path,
                                     FAIL_LIST)
    email_t.construct_email_content()
    print('icafe_results:{}'.format(icafe_results))
    # build icafe card
    item = to_icafe.get_alarm_content(icafe_results, env, TABLE_HEADER)
    to_icafe.write_icafe(item)