Beispiel #1
0
def insert_job(image_id, run_machine_type, job_info, args):
    """Create a ``bm.Job`` row for one local benchmark run and save it.

    :param image_id: value stored in the job's ``image_id`` field
    :param run_machine_type: value stored in the ``run_machine_type`` field
    :param job_info: mapping that provides ``model_name`` and ``index``
    :param args: object that provides ``paddle_version``,
        ``code_commit_id``, ``job_type``, ``cuda_version``,
        ``cudnn_version``, ``device_type`` and ``implement_type``
    :return: the saved ``bm.Job`` instance
    """
    new_id = uuid.uuid1()
    job = bm.Job()
    # (field name, value) pairs, applied in order below.
    field_values = (
        ("job_name", "pb_{}_{}".format(args.paddle_version,
                                       job_info["model_name"])),
        ("cluster_job_id", new_id),
        ("cluster_type_id", "LocalJob"),
        ("model_name", job_info["model_name"]),
        ("report_index", job_info["index"]),
        ("code_branch", "master"),
        ("code_commit_id", args.code_commit_id),
        ("job_type", args.job_type),
        ("run_machine_type", run_machine_type),
        ("frame_id", 0),
        ("image_id", image_id),
        ("cuda_version", args.cuda_version),
        ("cudnn_version", args.cudnn_version),
        ("device_type", args.device_type),
        ("model_implement_type", args.implement_type),
        ("log_extracted", "yes"),
    )
    for field_name, field_value in field_values:
        setattr(job, field_name, field_value)
    job.save()
    return job
Beispiel #2
0
def insert_job(job_instance):
    """Store the configuration of ``job_instance`` as a new ``bm.Job`` row.

    All values are read from ``job_instance.conf``. When the configured
    ``image_id`` is the string ``"latest"``, the image id is taken from
    ``get_frame_latest_version(conf["frame_id"])`` instead. The row is saved
    with status ``"submit"``; nothing is returned.
    """
    conf = job_instance.conf

    image_id = conf["image_id"]
    if image_id == "latest":
        image_id = get_frame_latest_version(conf["frame_id"]).image_id

    job = bm.Job()

    body = conf["body"]
    job.job_name = body["jobName"]
    job.cluster_conf = body["clusterConf"]

    # (job field, conf key) pairs that are copied over unchanged.
    # NOTE(review): the key "run_machine_tpye" is spelled exactly as the
    # configuration provides it; confirm with the producer before renaming.
    copied_fields = (
        ("cluster_job_id", "cluster_job_id"),
        ("cluster_type_id", "cluster_type_id"),
        ("model_name", "model"),
        ("report_index", "report_index"),
        ("repo_address", "repo_address"),
        ("code_branch", "code_branch"),
        ("job_type", "job_type"),
        ("run_rpc_type", "run_rpc_type"),
        ("run_machine_type", "run_machine_tpye"),
        ("batch_size", "batch_size"),
        ("frame_id", "frame_id"),
        ("cuda_version", "cuda_version"),
        ("cudnn_version", "cudnn_version"),
        ("run_cmd", "run_cmd"),
        ("eval_cmd", "eval_cmd"),
        ("infer_cmd", "infer_cmd"),
    )
    for field_name, conf_key in copied_fields:
        setattr(job, field_name, conf[conf_key])
    job.image_id = image_id

    # Scheduling settings are attributes of the "ploy" object in the conf.
    ploy = conf["ploy"]
    for period_field in ("submit_period", "check_period", "statistics_unit"):
        setattr(job, period_field, getattr(ploy, period_field))

    job.status = "submit"
    job.save()
Beispiel #3
0
def parse_logs(args):
    """
    Parse the index files under ``<args.log_path>/index`` and insert one job
    row plus its result rows into the db.

    The last line of every index file is decoded as JSON job metadata; files
    where that fails are reported and skipped. The remaining lines are
    scanned for the ``KEY=value`` marker that belongs to the report index
    (``FINAL_RESULT`` / ``AVG_CPU_USE`` / ``AVG_GPU_USE`` for speed,
    ``MAX_GPU_MEMORY_USE`` for mem, ``MAX_BATCH_SIZE`` otherwise). Results
    for which ``check_results`` returns a truthy value are collected and
    passed to ``template.construct_email_content``.

    :param args: parsed options; reads ``log_path``, ``paddle_version``,
        ``code_commit_id``, ``job_type``, ``cuda_version``,
        ``cudnn_version``, ``device_type`` and ``implement_type``
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1': 'ONE_GPU',
        '4': 'FOUR_GPU',
        '8': 'MULTI_GPU',
        '8mp': 'MULTI_GPU_MULTI_PROCESS'
    }
    report_index_dict = {'speed': 1, 'mem': 2, 'maxbs': 6}

    # Loop-invariant values: log server base url and log directory name.
    # todo config the log_server port
    log_server = "http://" + socket.gethostname() + ":8777/"
    log_dir_name = os.path.basename(args.log_path)

    html_results = []
    for job_file in file_list:
        cluster_job_id = uuid.uuid1()
        result = ""
        # The file is only read, so plain 'r' is enough; it is closed again
        # before any db write or shell call happens.
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()

        try:
            job_info = json.loads(file_lines[-1])
        except Exception:
            # Covers an empty file as well as a malformed last line.
            print("file {} parse error".format(job_file))
            continue

        # save_job
        gpu_num = str(job_info["gpu_num"])
        if gpu_num == "8" and job_info["run_mode"] == "mp":
            run_machine_type = dict_run_machine_type['8mp']
        else:
            run_machine_type = dict_run_machine_type[gpu_num]
        report_index = report_index_dict[job_info["index"]]

        pj = bm.Job()
        pj.job_name = "pb_{}_{}".format(args.paddle_version,
                                        job_info["model_name"])
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = "LocalJob"
        pj.model_name = job_info["model_name"]
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.device_type = args.device_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()
        job_id = pj.job_id

        # Only the base names of the logged paths are used; the urls point
        # into <log dir>/train_log and <log dir>/profiler_log on the server.
        log_file = job_info["log_file"].split("/")[-1]
        profiler_log = job_info["log_with_profiler"].split("/")[-1]
        profiler_path = job_info["profiler_path"].split("/")[-1]
        train_log_path = log_server + os.path.join(
            log_dir_name, "train_log", log_file)
        profiler_log_path = log_server + os.path.join(
            log_dir_name, "profiler_log", profiler_log)
        profiler_path = log_server + os.path.join(
            log_dir_name, "profiler_log", profiler_path)

        cpu_utilization_result = 0
        gpu_utilization_result = 0
        try:
            if report_index == 1:
                # speed: the last matching line of each marker wins
                for line in file_lines:
                    if "FINAL_RESULT" in line:
                        result = line.strip().split("=")[1]
                    if 'AVG_CPU_USE' in line:
                        cpu_utilization_result = line.strip().split('=')[1]
                    if 'AVG_GPU_USE' in line:
                        gpu_utilization_result = line.strip().split('=')[1]
            else:
                # mem / maxbs: first matching line, non-numeric values -> 0
                if report_index == 2:
                    marker = "MAX_GPU_MEMORY_USE"
                else:
                    marker = "MAX_BATCH_SIZE"
                for line in file_lines:
                    if marker in line:
                        value = line.strip().split("=")[1].strip()
                        result = int(value) if value.isdigit() else 0
                        break

            # save_result
            pjr = bm.JobResults()
            pjr.job_id = job_id
            pjr.model_name = job_info["model_name"]
            pjr.report_index_id = report_index
            pjr.report_result = result
            # NOTE(review): stores the literal 1 rather than the url; the
            # urls are written to JobResultsLog below - confirm intended.
            pjr.train_log_path = 1
            pjr.save()

            # save log path
            pjrl = bm.JobResultsLog()
            pjrl.result_id = pjr.result_id
            # NOTE(review): the command is a shell string built from names
            # found in the index file; fine only while those are trusted.
            cmd = "curl -I -m 10 -o /dev/null -s -w %{http_code} " + profiler_log_path
            if commands.getoutput(cmd) != '200':
                # profiler log is not reachable: record the train log only
                pjrl.log_path = json.dumps(
                    {"train_log_path": train_log_path})
            else:
                pjrl.log_path = json.dumps({
                    "train_log_path": train_log_path,
                    "profiler_log_path": profiler_log_path,
                    "profiler_path": profiler_path
                })
            pjrl.save()

            # save cpu & gpu result
            if report_index == 1:
                pjr_cpu = bm.JobResults()
                pjr_cpu.job_id = job_id
                pjr_cpu.model_name = job_info["model_name"]
                pjr_cpu.report_index_id = 7
                pjr_cpu.report_result = cpu_utilization_result
                pjr_cpu.save()

                pjr_gpu = bm.JobResults()
                pjr_gpu.job_id = job_id
                pjr_gpu.model_name = job_info["model_name"]
                pjr_gpu.report_index_id = 8
                pjr_gpu.report_result = gpu_utilization_result
                pjr_gpu.save()

        except Exception as pfe:
            # Best effort: report the failure and go on with the next file.
            print(pfe)
        else:
            print(
                "models: {}, run_machine_type: {}, index: {}, result: {}".
                format(job_info["model_name"], run_machine_type,
                       report_index, result))

            # If the current value is empty or inf (can happen for speed)
            if not result or result == '-inf':
                result = 0

            checked = check_results(job_info["model_name"], report_index,
                                    run_machine_type, result)

            if checked:
                current_html_result = [
                    job_info["model_name"], run_machine_type,
                    job_info["index"], checked[0], result, checked[1]
                ]
                html_results.append(current_html_result)

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
Beispiel #4
0
def parse_logs(args):
    """
    Parse the index files under ``<args.log_path>/index`` and insert one job
    row and one result row per file into the db.

    Model name, task (``speed`` / ``mem`` / anything else) and gpu tag are
    taken from the file name, e.g. ``CycleGAN_mem_1gpus``. Results for which
    ``check_results`` returns a truthy value are collected and passed to
    ``template.construct_email_content``.

    :param args: parsed options; reads ``log_path``, ``code_commit_id``,
        ``job_type``, ``cuda_version``, ``cudnn_version``, ``gpu_type`` and
        ``implement_type``
    :return: None
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    cv_models = ['DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50',
                 'yolov3']
    multi_process_models = ['mask_rcnn', 'yolov3', 'transformer_base',
                            'transformer_big', 'bert', 'SE-ResNeXt50']

    # Loop-invariant log server base url.
    # todo config the log_server port
    log_server = "http://" + socket.gethostname() + ":8777/"

    html_results = []
    for job_file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = job_file.split('/')[-1]
        name_parts = file_name.split('_')
        model_name = '_'.join(name_parts[:-2])
        # cv models report "FPS:", every other model reports "Avg:"
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = name_parts[-2]
        gpu_tag = name_parts[-1]
        # speed -> 1, mem -> 2, any other task -> 6
        report_index = {'speed': 1, 'mem': 2}.get(task_index, 6)

        run_machine_type = dict_run_machine_type[gpu_tag]
        run_mode = "mp" if gpu_tag == "8gpus8p" else "sp"

        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = args.job_type
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.gpu_type = args.gpu_type
        pj.model_implement_type = args.implement_type
        pj.log_extracted = "yes"
        pj.save()

        # Train log name: <model>_train_<task>_<first char of the gpu tag>,
        # plus the run mode for models listed in multi_process_models.
        train_log_name = "{}_{}_{}_{}".format(model_name, "train",
                                               task_index, gpu_tag[0])
        if model_name in multi_process_models:
            train_log_name += "_{}".format(run_mode)
        train_log_path = log_server + os.path.join(
            os.path.basename(args.log_path), "train_log", train_log_name)

        job_id = get_job_id(cluster_job_id)

        result = ""
        # The file is only read, so plain 'r' is enough; it is closed again
        # before the result row is written.
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()

        try:
            if report_index == 1:
                # NOTE(review): the slice looks at up to nine lines before
                # the final one and skips the final line itself - confirm
                # that this is intended.
                for line in file_lines[-10:-1]:
                    if key_word in line:
                        result = line.split(':')[1].split(' ')[1]
            else:
                # mem / maxbs: last token of the last line, non-numeric -> 0
                value = file_lines[-1].split()[-1]
                result = int(value) if value.isdigit() else 0

            pjr = bm.JobResults()
            pjr.job_id = job_id
            pjr.model_name = model_name
            pjr.report_index_id = report_index
            pjr.report_result = result
            pjr.train_log_path = train_log_path
            pjr.save()
        except Exception as pfe:
            # Best effort: report the failure and go on with the next file.
            print(pfe)
        else:
            print("models: {}, run_machine_type: {}, index: {}, result: {}".format(
                model_name, run_machine_type, task_index, result))

            # If the current value is empty or inf (can happen for speed)
            if not result or result == '-inf':
                result = 0

            checked = check_results(model_name, report_index,
                                    run_machine_type, result)

            if checked:
                current_html_result = [model_name, run_machine_type,
                                       task_index, checked[0], result,
                                       checked[1]]
                html_results.append(current_html_result)

    if html_results:
        template.construct_email_content(html_results, args.log_path, args)
Beispiel #5
0
def parse_logs(args):
    """
    Parse the index files under ``<args.log_path>/index`` and insert one job
    row and one result row per file into the db.

    Model name, task (``speed`` / ``mem`` / anything else) and gpu tag are
    taken from the file name, e.g. ``CycleGAN_mem_1gpus``.

    :param args: parsed options; reads ``log_path``, ``code_commit_id``,
        ``cuda_version`` and ``cudnn_version``
    """
    image_id = get_image_id()
    file_list = load_folder_files(os.path.join(args.log_path, "index"))
    dict_run_machine_type = {
        '1gpus': 'ONE_GPU',
        '4gpus': 'FOUR_GPU',
        '8gpus': 'MULTI_GPU',
        '8gpus8p': 'MULTI_GPU_MULTI_PROCESS'
    }
    cv_models = [
        'DeepLab_V3+', 'CycleGAN', 'mask_rcnn', 'SE-ResNeXt50', 'yolov3'
    ]

    for job_file in file_list:
        # file_name like CycleGAN_mem_1gpus or ddpg_deep_explore_speed_1gpus
        cluster_job_id = uuid.uuid1()
        file_name = job_file.split('/')[-1]
        name_parts = file_name.split('_')
        model_name = '_'.join(name_parts[:-2])
        # cv models report "FPS:", every other model reports "Avg:"
        key_word = "FPS:" if model_name in cv_models else 'Avg:'
        job_name = 'pb_' + model_name
        task_index = name_parts[-2]
        gpu_tag = name_parts[-1]
        # speed -> 1, mem -> 2, any other task -> 6
        report_index = {'speed': 1, 'mem': 2}.get(task_index, 6)

        run_machine_type = dict_run_machine_type[gpu_tag]
        pj = bm.Job()
        pj.job_name = job_name
        pj.cluster_job_id = cluster_job_id
        pj.cluster_type_id = 0
        pj.model_name = model_name
        pj.report_index = report_index
        pj.code_branch = "master"
        pj.code_commit_id = args.code_commit_id
        pj.job_type = 2
        pj.run_machine_type = run_machine_type
        pj.frame_id = 0
        pj.image_id = image_id
        pj.cuda_version = args.cuda_version
        pj.cudnn_version = args.cudnn_version
        pj.log_extracted = "yes"
        pj.save()

        # Train log name: <model>_train_<task>_<first char of the gpu tag>
        train_log_name = "{}_{}_{}_{}".format(model_name, "train", task_index,
                                              gpu_tag[0])
        train_log_path = os.path.join(os.path.basename(args.log_path),
                                      "train_log", train_log_name)

        job_id = get_job_id(cluster_job_id)

        result = ""
        # The file is only read, so plain 'r' is enough.
        with open(job_file, 'r') as file_obj:
            file_lines = file_obj.readlines()
            try:
                if report_index == 1:
                    # NOTE(review): the slice looks at up to nine lines
                    # before the final one and skips the final line itself -
                    # confirm that this is intended.
                    for line in file_lines[-10:-1]:
                        if key_word in line:
                            result = line.split(':')[1].split(' ')[1]
                else:
                    # mem / maxbs: last token of the last line,
                    # non-numeric values -> 0
                    value = file_lines[-1].split()[-1]
                    result = int(value) if value.isdigit() else 0

                pjr = bm.JobResults()
                pjr.job_id = job_id
                pjr.model_name = model_name
                pjr.report_index_id = report_index
                pjr.report_result = result
                pjr.train_log_path = train_log_path
                pjr.save()
            except Exception as pfe:
                print(pfe)