Example #1
def start_task(job_id, component_name, task_id, role, party_id,
               task_config):
    schedule_logger.info(
        'job {} {} {} {} task subprocess is ready, config: {}'.format(
            job_id, component_name, role, party_id, task_config))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        process_cmd = [
            'python3', sys.modules[TaskExecutor.__module__].__file__, '-j',
            job_id, '-n', component_name, '-t', task_id, '-r', role, '-p',
            party_id, '-c', task_config_path
        ]
        task_log_dir = os.path.join(
            job_utils.get_job_log_directory(job_id=job_id), role, party_id,
            component_name)
        schedule_logger.info(
            'job {} {} {} {} task subprocess start, config: {}'.format(
                job_id, component_name, role, party_id, task_config))
        p = job_utils.run_subprocess(config_dir=task_dir,
                                     process_cmd=process_cmd,
                                     log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger.exception(e)
    finally:
        schedule_logger.info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
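Every variant in this listing hands the assembled command to job_utils.run_subprocess. That helper is not part of these excerpts; a minimal sketch of the behaviour the call sites rely on — a non-blocking subprocess.Popen with output redirected under log_dir and the pid recorded under config_dir — might look like the following (names and file layout are assumptions, not the actual fate_flow implementation):

import os
import subprocess

def run_subprocess(process_cmd, config_dir, log_dir=None, **kwargs):
    # Sketch only: the real helper also handles daemonization,
    # environment setup and richer pid bookkeeping.
    os.makedirs(config_dir, exist_ok=True)
    log_dir = log_dir or config_dir
    os.makedirs(log_dir, exist_ok=True)
    std_log = open(os.path.join(log_dir, 'std.log'), 'a')
    # str() every element: call sites mix strings and other types
    p = subprocess.Popen([str(c) for c in process_cmd],
                         stdout=std_log, stderr=subprocess.STDOUT)
    with open(os.path.join(config_dir, 'pid'), 'w') as fw:
        fw.write(str(p.pid))
    return p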
Example #2
def run(job_id, component_name, task_id, task_version, role, party_id,
        task_parameters_path, task_info, **kwargs):
    process_cmd = [
        sys.executable,
        sys.modules[TaskExecutor.__module__].__file__,
        '-j',
        job_id,
        '-n',
        component_name,
        '-t',
        task_id,
        '-v',
        task_version,
        '-r',
        role,
        '-p',
        party_id,
        '-c',
        task_parameters_path,
        '--run_ip',
        RuntimeConfig.JOB_SERVER_HOST,
        '--job_server',
        '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                       RuntimeConfig.HTTP_PORT),
    ]
    task_log_dir = os.path.join(
        job_utils.get_job_log_directory(job_id=job_id), role, party_id,
        component_name)
    task_job_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                role, party_id, component_name)
    schedule_logger(job_id).info(
        'job {} task {} {} on {} {} executor subprocess is ready'.format(
            job_id, task_id, task_version, role, party_id))
    task_dir = os.path.dirname(task_parameters_path)
    p = job_utils.run_subprocess(job_id=job_id,
                                 config_dir=task_dir,
                                 process_cmd=process_cmd,
                                 log_dir=task_log_dir,
                                 job_dir=task_job_dir)
    task_info["run_pid"] = p.pid
    return p
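The -j/-n/-t/-v/-r/-p/-c flags (plus --run_ip and --job_server) form the contract with the TaskExecutor entrypoint spawned here. The receiving side is not shown in these excerpts; a plausible argparse sketch of it (destination names are assumptions) is:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-j', '--job_id', required=True)
parser.add_argument('-n', '--component_name', required=True)
parser.add_argument('-t', '--task_id', required=True)
parser.add_argument('-v', '--task_version', required=True)
parser.add_argument('-r', '--role', required=True)
parser.add_argument('-p', '--party_id', required=True)
parser.add_argument('-c', '--config', required=True)
parser.add_argument('--run_ip', required=False)
parser.add_argument('--job_server', required=False)
args = parser.parse_args()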
Example #3
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('unsupported operation: {}'.format(module))
    detect_utils.check_config(request_config, required_arguments=required_arguments)
    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(), request_config["file"])
    try:
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=request_config.get('local', {}).get("role"),
                                          party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = ["python3",
                 os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
                 "-j", _job_id,
                 "-c", conf_file_path
                 ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101), job_id=_job_id,
                               data={'table_name': request_config['table_name'],
                                     'namespace': request_config['namespace'], 'pid': p.pid if p else ''})
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
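detect_utils.check_config is expected to fail fast when the request body lacks a required key, which is what lets the code after it index request_config directly. A minimal sketch of that contract (the real helper may raise a FATE-specific error type):

def check_config(config, required_arguments):
    # Sketch only: reject a request body missing any required key.
    missing = [name for name in required_arguments if name not in config]
    if missing:
        raise ValueError(
            'required parameters are missing: {}'.format(missing))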
Example #4
    def run_task(job_id, component_name, task_id, role, party_id, task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready, config: {}'.format(
                job_id, component_name, role, party_id, task_config))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning("failed to get backend, defaulting to 0")

            backend = Backend(backend)

            if backend.is_eggroll():
                process_cmd = [
                    'python3', sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                    '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--job_server',
                    '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ])
            else:
                raise ValueError(f"${backend} supported")

            task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start, config: {}'.format(
                    job_id, component_name, role, party_id, task_config))
            p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(job_id, component_name, role, party_id,
                                                                               'success' if task_process_start_status else 'failed'))
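The spark branch above maps each plain key of spark_submit_config to a --key=value flag and expands the nested "conf" dict into repeated --conf pairs. An illustrative run of that mapping (values invented for the example):

spark_submit_config = {
    "deploy-mode": "client",
    "num-executors": 2,
    "conf": {"spark.executor.memory": "2g"},
}
process_cmd = ['spark-submit', '--name=task_1#guest']
for k, v in spark_submit_config.items():
    if k != "conf":
        process_cmd.append(f'--{k}={v}')
for ck, cv in spark_submit_config.get("conf", {}).items():
    process_cmd.extend(['--conf', f'{ck}={cv}'])
print(process_cmd)
# ['spark-submit', '--name=task_1#guest', '--deploy-mode=client',
#  '--num-executors=2', '--conf', 'spark.executor.memory=2g']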
Example #5
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(
        _job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('unsupported operation: {}'.format(module))
    detect_utils.check_config(request_config,
                              required_arguments=required_arguments)
    job_work_mode = request_config['work_mode']
    # TODO: this duplicates job_app/submit_job; the next version of this
    # function will be implemented via job_app/submit_job
    if job_work_mode != RuntimeConfig.WORK_MODE:
        if RuntimeConfig.WORK_MODE == WorkMode.CLUSTER and job_work_mode == WorkMode.STANDALONE:
            # use cluster standalone job server to execute standalone job
            return request_execute_server(
                request=request,
                execute_host='{}:{}'.format(
                    request.remote_addr, CLUSTER_STANDALONE_JOB_SERVER_PORT))
        else:
            raise Exception(
                'a server running in standalone mode cannot run cluster-mode jobs')

    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(
                file_utils.get_project_base_directory(),
                request_config["file"])
    try:
        conf_file_path = new_runtime_conf(
            job_dir=_job_dir,
            method=data_func,
            module=module,
            role=request_config.get('local', {}).get("role"),
            party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = [
            "python3",
            os.path.join(file_utils.get_project_base_directory(),
                         JOB_MODULE_CONF[module]["module_path"]), "-j",
            _job_id, "-c", conf_file_path
        ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101),
                               job_id=_job_id,
                               data={
                                   'table_name': request_config['table_name'],
                                   'namespace': request_config['namespace'],
                                   'pid': p.pid if p else ''
                               })
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
Example #6
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        schedule_logger(job_id).info(
            'try to start job {} task {} {} on {} {} executor subprocess'.
            format(job_id, task_id, task_version, role, party_id))
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name, task_id,
                                    task_version)
            os.makedirs(task_dir, exist_ok=True)
            task_parameters_path = os.path.join(task_dir,
                                                'task_parameters.json')
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            with open(task_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            run_parameters = RunParameters(**run_parameters_dict)

            schedule_logger(job_id=job_id).info(
                f"use computing engine {run_parameters.computing_engine}")

            if run_parameters.computing_engine in {
                    ComputingEngine.EGGROLL, ComputingEngine.STANDALONE
            }:
                process_cmd = [
                    sys.executable,
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ]
            elif run_parameters.computing_engine == ComputingEngine.SPARK:
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = run_parameters.spark_run

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ])
            else:
                raise ValueError(
                    f"${run_parameters.computing_engine} is not supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess is ready'.
                format(job_id, task_id, task_version, role, party_id))
            p = job_utils.run_subprocess(job_id=job_id,
                                         config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_info["party_status"] = TaskStatus.RUNNING
                #task_info["run_pid"] = p.pid
                task_info["start_time"] = current_timestamp()
                task_executor_process_start_status = True
            else:
                task_info["party_status"] = TaskStatus.FAILED
        except Exception as e:
            schedule_logger(job_id).exception(e)
            task_info["party_status"] = TaskStatus.FAILED
        finally:
            try:
                cls.update_task(task_info=task_info)
                cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess start {}'.
                format(
                    job_id, task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
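RunParameters is constructed straight from the dict that was just dumped to task_parameters_path, so within this excerpt it behaves as a keyword-to-attribute container exposing at least computing_engine and spark_run. A minimal stand-in (the real class defines many more fields and defaults):

class RunParameters:
    # Stand-in sketch: map job parameter keys to attributes.
    def __init__(self, **kwargs):
        self.computing_engine = None
        self.spark_run = {}
        for k, v in kwargs.items():
            setattr(self, k, v)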
Example #7
    def start_task(job_id, component_name, task_id, role, party_id,
                   task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready, config: {}'.format(
                job_id, component_name, role, party_id, task_config))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning(
                    "failed to get backend, defaulting to 0")

            backend = Backend(backend)

            if backend.is_eggroll() or backend.is_eggroll2():
                process_cmd = [
                    'python3',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_submit_config = task_config['job_parameters'].get(
                    "spark_submit_config", dict())
                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                queue = spark_submit_config.get("queue", "default")
                driver_memory = spark_submit_config.get("driver-memory", "1g")
                num_executors = spark_submit_config.get("num-executors", 2)
                executor_memory = spark_submit_config.get(
                    "executor-memory", "1g")
                executor_cores = spark_submit_config.get("executor-cores", 1)

                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")
                spark_home = os.environ["SPARK_HOME"]
                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [
                    spark_submit_cmd,
                    f'--name={task_id}#{role}',
                    f'--deploy-mode={deploy_mode}',
                    f'--queue={queue}',
                    f'--driver-memory={driver_memory}',
                    f'--num-executors={num_executors}',
                    f'--executor-memory={executor_memory}',
                    f'--executor-cores={executor_cores}',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            else:
                raise ValueError(f"${backend} supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start, config: {}'.format(
                    job_id, component_name, role, party_id, task_config))
            p = job_utils.run_subprocess(config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(
                    job_id, component_name, role, party_id,
                    'success' if task_process_start_status else 'failed'))
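Collecting the keys this last variant dereferences gives the minimum shape of the task_config it can run against; an illustrative value (all data invented, and backend=1 mapping to Spark is an assumption about the Backend enum):

task_config = {
    'job_parameters': {
        'backend': 1,  # assumed to map to the spark backend in this sketch
        'spark_submit_config': {
            'deploy-mode': 'client',
            'queue': 'default',
            'driver-memory': '1g',
            'num-executors': 2,
            'executor-memory': '1g',
            'executor-cores': 1,
        },
    },
    'job_server': {'ip': '127.0.0.1', 'http_port': 9380},
}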