Example #1
0
def get_component_input_table(dsl_parser, job, component_name):
    """Resolve the input table configuration for one component of a job.

    Reader components expose their table directly through their role
    parameters; every other component is resolved through the
    TaskExecutor task-run-args machinery.
    """
    component = dsl_parser.get_component_info(component_name=component_name)

    # Reader components carry their input table in their role parameters.
    if 'reader' in component_name:
        role_parameters = component.get_role_parameters()
        return role_parameters[job.f_role][0]['ReaderParam']

    input_dsl = component.get_input()
    job_args = TaskExecutor.get_job_args_on_party(
        dsl_parser=dsl_parser,
        job_runtime_conf=job.f_runtime_conf,
        role=job.f_role,
        party_id=job.f_party_id)
    run_parameters = RunParameters(**job_utils.get_job_parameters(
        job.f_job_id, job.f_role, job.f_party_id))
    # The same RunParameters object serves as both the job-level and the
    # task-level parameters for this lookup.
    return TaskExecutor.get_task_run_args(
        job_id=job.f_job_id,
        role=job.f_role,
        party_id=job.f_party_id,
        task_id=None,
        task_version=None,
        job_args=job_args,
        job_parameters=run_parameters,
        task_parameters=run_parameters,
        input_dsl=input_dsl,
        get_input_table=True)
Example #2
0
def get_component_input_table(dsl_parser, job, component_name):
    """Resolve the input table configuration for one component of a job.

    Reader components keep their table definition in the job runtime conf;
    every other component is resolved through TaskExecutor.get_task_run_args.

    :param dsl_parser: parsed job DSL accessor
    :param job: job record (provides f_role, f_party_id, f_runtime_conf, ...)
    :param component_name: name of the component to inspect
    :return: the component's input table configuration, or None when a
        reader component has no entry for this party
    """
    component = dsl_parser.get_component_info(component_name=component_name)
    module_name = get_component_module(component_name, job.f_dsl)
    if 'reader' in module_name.lower():
        # This party's position within its role list selects the right
        # per-party parameter set.
        party_index = str(
            job.f_roles.get(job.f_role, []).index(int(job.f_party_id)))
        # Fix: the original chained .get(party_index) with no default, so a
        # missing party-index key raised AttributeError on None; default to
        # {} so the lookup degrades to returning None instead.
        return job.f_runtime_conf.get("component_parameters", {}).get(
            "role", {}).get(job.f_role, {}).get(party_index, {}).get(
                component_name)
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(
        dsl_parser=dsl_parser,
        job_runtime_conf=job.f_runtime_conf,
        role=job.f_role,
        party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role,
                                          job.f_party_id)
    # The same RunParameters object is used for both the job-level and the
    # task-level parameters, matching TaskExecutor's expectations.
    run_parameters = RunParameters(**config)
    component_input_table = TaskExecutor.get_task_run_args(
        job_id=job.f_job_id,
        role=job.f_role,
        party_id=job.f_party_id,
        task_id=None,
        task_version=None,
        job_args=job_args_on_party,
        job_parameters=run_parameters,
        task_parameters=run_parameters,
        input_dsl=task_input_dsl,
        get_input_table=True)
    return component_input_table
    def calculate_job_resource(cls,
                               job_parameters: RunParameters = None,
                               job_id=None,
                               role=None,
                               party_id=None):
        """Compute the total cores/memory a job reserves on this party.

        When *job_parameters* is not supplied it is loaded from storage via
        job_id/role/party_id. Returns (computing_engine, cores, memory);
        both counts are zero when the engine or role is configured to
        bypass resource accounting.
        """
        if not job_parameters:
            raw = job_utils.get_job_parameters(job_id=job_id,
                                               role=role,
                                               party_id=party_id)
            job_parameters = RunParameters(**raw)

        engine = job_parameters.computing_engine
        # Explicit parentheses spell out the original operator precedence:
        # ignore when the engine is exempt, or when the role is exempt AND
        # the engine supports role-level exemption.
        engine_exempt = engine in IGNORE_RESOURCE_COMPUTING_ENGINE
        role_exempt = (role in IGNORE_RESOURCE_ROLES
                       and engine in SUPPORT_IGNORE_RESOURCE_ENGINES)

        cores = 0
        memory = 0
        if not (engine_exempt or role_exempt):
            adaptation = job_parameters.adaptation_parameters
            parallelism = int(job_parameters.task_parallelism or 0)
            nodes = int(adaptation["task_nodes"] or 0)
            cores = int(adaptation["task_cores_per_node"] or 0) * nodes * parallelism
            memory = int(adaptation["task_memory_per_node"] or 0) * nodes * parallelism

        return engine, cores, memory
Example #4
0
 def calculate_task_resource(cls,
                             task_parameters: RunParameters = None,
                             task_info: dict = None):
     """Compute the cores/memory reserved by a single task.

     Falls back to the stored job parameters (looked up via *task_info*)
     when *task_parameters* is not supplied. Returns the tuple
     (cores_per_task, memory_per_task).
     """
     if not task_parameters:
         # No parameters supplied: reload them for this task's party.
         stored = job_utils.get_job_parameters(
             job_id=task_info["job_id"],
             role=task_info["role"],
             party_id=task_info["party_id"])
         task_parameters = RunParameters(**stored)
     adaptation = task_parameters.adaptation_parameters
     nodes = adaptation["task_nodes"]
     return (adaptation["task_cores_per_node"] * nodes,
             adaptation["task_memory_per_node"] * nodes)
Example #5
0
 def calculate_job_resource(cls,
                            job_parameters: RunParameters = None,
                            job_id=None,
                            role=None,
                            party_id=None):
     """Compute the total cores/memory reserved by a job.

     Loads the stored job parameters when none are passed in. Returns the
     tuple (computing_engine, cores, memory).
     """
     if not job_parameters:
         stored = job_utils.get_job_parameters(job_id=job_id,
                                               role=role,
                                               party_id=party_id)
         job_parameters = RunParameters(**stored)
     adaptation = job_parameters.adaptation_parameters
     parallelism = job_parameters.task_parallelism
     nodes = adaptation["task_nodes"]
     cores = adaptation["task_cores_per_node"] * nodes * parallelism
     memory = adaptation["task_memory_per_node"] * nodes * parallelism
     return job_parameters.computing_engine, cores, memory
Example #6
0
 def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
     """Compute the cores/memory reserved by a single task.

     Linkis-Spark-RabbitMQ backends, and resource-exempt roles on engines
     that support exemption, reserve nothing. Returns the tuple
     (cores_per_task, memory_per_task).
     """
     if not task_parameters:
         # No parameters supplied: reload them for this task's party.
         stored = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                               role=task_info["role"],
                                               party_id=task_info["party_id"])
         task_parameters = RunParameters(**stored)
     # Short-circuiting `or` keeps the original evaluation order: the role
     # check is only reached when the backend check is false.
     if (task_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ
             or (task_info["role"] in IGNORE_RESOURCE_ROLES
                 and task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
         return 0, 0
     adaptation = task_parameters.adaptation_parameters
     nodes = adaptation["task_nodes"]
     return (adaptation["task_cores_per_node"] * nodes,
             adaptation["task_memory_per_node"] * nodes)
Example #7
0
 def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
     """Compute the total cores/memory reserved by a job.

     Linkis-Spark-RabbitMQ backends, and resource-exempt roles on engines
     that support exemption, reserve nothing. Returns the tuple
     (computing_engine, cores, memory).
     """
     if not job_parameters:
         stored = job_utils.get_job_parameters(job_id=job_id,
                                               role=role,
                                               party_id=party_id)
         job_parameters = RunParameters(**stored)
     # Short-circuiting `or` keeps the original evaluation order: the role
     # check is only reached when the backend check is false.
     if (job_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ
             or (role in IGNORE_RESOURCE_ROLES
                 and job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
         return job_parameters.computing_engine, 0, 0
     adaptation = job_parameters.adaptation_parameters
     parallelism = job_parameters.task_parallelism
     nodes = adaptation["task_nodes"]
     cores = adaptation["task_cores_per_node"] * nodes * parallelism
     memory = adaptation["task_memory_per_node"] * nodes * parallelism
     return job_parameters.computing_engine, cores, memory
Example #8
0
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Start a task's executor subprocess on this party, then record the
        task status and party status.

        :param job_id: id of the job the task belongs to
        :param component_name: DSL component the task executes
        :param task_id: id of the task
        :param task_version: version number of the task
        :param role: this party's role in the job
        :param party_id: this party's id
        :param kwargs: may carry src_party_id / src_role (component
            authentication), src_user and user_id
        :return: None; progress is reported via update_task/update_task_status
        """
        job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
        # Verify the requesting party is allowed to run this component.
        PrivilegeAuth.authentication_component(
            job_dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess"
        )
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        is_failed = False
        try:
            task = JobSaver.query_task(task_id=task_id,
                                       task_version=task_version,
                                       role=role,
                                       party_id=party_id)[0]
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            run_parameters_dict["src_user"] = kwargs.get("src_user")
            run_parameters = RunParameters(**run_parameters_dict)

            config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                      component_name, task_id,
                                                      task_version)
            os.makedirs(config_dir, exist_ok=True)

            # Persist the resolved parameters so the executor subprocess can
            # read them from disk.
            run_parameters_path = os.path.join(config_dir,
                                               'task_parameters.json')
            with open(run_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            schedule_logger(job_id).info(
                f"use computing engine {run_parameters.computing_engine}")
            task_info["engine_conf"] = {
                "computing_engine": run_parameters.computing_engine
            }
            # Launch the task through the engine-specific backend.
            backend_engine = build_engine(run_parameters.computing_engine)
            run_info = backend_engine.run(
                task=task,
                run_parameters=run_parameters,
                run_parameters_path=run_parameters_path,
                config_dir=config_dir,
                log_dir=job_utils.get_job_log_directory(
                    job_id, role, party_id, component_name),
                cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                    component_name),
                user_name=kwargs.get("user_id"))
            task_info.update(run_info)
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
            is_failed = True
        finally:
            try:
                cls.update_task(task_info=task_info)
                # NOTE(review): the party status is first reported as RUNNING
                # and only then, on failure, overwritten with FAILED —
                # presumably to satisfy an allowed status-transition order;
                # confirm against the status-machine rules.
                task_info["party_status"] = TaskStatus.RUNNING
                cls.update_task_status(task_info=task_info)
                if is_failed:
                    task_info["party_status"] = TaskStatus.FAILED
                    cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                "task {} {} on {} {} executor subprocess start {}".format(
                    task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example #9
0
    def start_task_worker(cls,
                          worker_name,
                          task: Task,
                          task_parameters: RunParameters = None,
                          executable: list = None,
                          extra_env: dict = None,
                          **kwargs):
        """Spawn a worker subprocess for a task (only TASK_EXECUTOR is supported).

        Builds the command line, writes the run parameters to a config file,
        launches the subprocess and records the worker info.

        :param worker_name: WorkerName member selecting the worker kind
        :param task: task record providing job/task/role/party identifiers
        :param task_parameters: run parameters; loaded from storage when None
        :param executable: interpreter command used instead of sys.executable
        :param extra_env: extra environment variables merged over the base env
        :param kwargs: may carry src_user, recorded into the worker config
        :return: dict with run_pid, worker_id and the launched cmd
        :raises Exception: when worker_name is not a supported worker kind
        """
        worker_id, config_dir, log_dir = cls.get_process_dirs(
            worker_name=worker_name,
            job_id=task.f_job_id,
            role=task.f_role,
            party_id=task.f_party_id,
            task=task)

        # Session ids tie the worker's computing and federation sessions to
        # this task id + version on this party.
        session_id = job_utils.generate_session_id(task.f_task_id,
                                                   task.f_task_version,
                                                   task.f_role,
                                                   task.f_party_id)
        federation_session_id = job_utils.generate_task_version_id(
            task.f_task_id, task.f_task_version)

        info_kwargs = {}
        specific_cmd = []
        if worker_name is WorkerName.TASK_EXECUTOR:
            from fate_flow.worker.task_executor import TaskExecutor
            # The subprocess runs the executor module by its file path.
            module_file_path = sys.modules[TaskExecutor.__module__].__file__
        else:
            raise Exception(f"not support {worker_name} worker")

        if task_parameters is None:
            # No parameters supplied: reload them for this task's party.
            task_parameters = RunParameters(**job_utils.get_job_parameters(
                task.f_job_id, task.f_role, task.f_party_id))

        config = task_parameters.to_dict()
        config["src_user"] = kwargs.get("src_user")
        config_path, result_path = cls.get_config(config_dir=config_dir,
                                                  config=config,
                                                  log_dir=log_dir)

        if executable:
            process_cmd = executable
        else:
            process_cmd = [sys.executable or "python3"]

        # Arguments common to every worker kind; worker-specific flags go
        # into specific_cmd (currently empty for the task executor).
        common_cmd = [
            module_file_path,
            "--job_id",
            task.f_job_id,
            "--component_name",
            task.f_component_name,
            "--task_id",
            task.f_task_id,
            "--task_version",
            task.f_task_version,
            "--role",
            task.f_role,
            "--party_id",
            task.f_party_id,
            "--config",
            config_path,
            '--result',
            result_path,
            "--log_dir",
            log_dir,
            "--parent_log_dir",
            os.path.dirname(log_dir),
            "--worker_id",
            worker_id,
            "--run_ip",
            RuntimeConfig.JOB_SERVER_HOST,
            "--job_server",
            f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
            "--session_id",
            session_id,
            "--federation_session_id",
            federation_session_id,
        ]
        process_cmd.extend(common_cmd)
        process_cmd.extend(specific_cmd)
        env = cls.get_env(task.f_job_id, task.f_provider_info)
        if extra_env:
            env.update(extra_env)
        schedule_logger(task.f_job_id).info(
            f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} {worker_name} worker subprocess is ready"
        )
        p = process_utils.run_subprocess(job_id=task.f_job_id,
                                         config_dir=config_dir,
                                         process_cmd=process_cmd,
                                         added_env=env,
                                         log_dir=log_dir,
                                         cwd_dir=config_dir,
                                         process_name=worker_name.value,
                                         process_id=worker_id)
        # Record the worker so it can be tracked and stopped later.
        cls.save_worker_info(task=task,
                             worker_name=worker_name,
                             worker_id=worker_id,
                             run_ip=RuntimeConfig.JOB_SERVER_HOST,
                             run_pid=p.pid,
                             config=config,
                             cmd=process_cmd,
                             **info_kwargs)
        return {"run_pid": p.pid, "worker_id": worker_id, "cmd": process_cmd}
Example #10
0
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id):
        """
        Start a task executor subprocess for the given task, then update the
        task status and party status.

        :param job_id: id of the job the task belongs to
        :param component_name: DSL component the task executes
        :param task_id: id of the task
        :param task_version: version number of the task
        :param role: this party's role in the job
        :param party_id: this party's id
        :return: None; results are reported via update_task/update_task_status
        """
        schedule_logger(job_id).info(
            'try to start job {} task {} {} on {} {} executor subprocess'.
            format(job_id, task_id, task_version, role, party_id))
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name, task_id,
                                    task_version)
            os.makedirs(task_dir, exist_ok=True)
            task_parameters_path = os.path.join(task_dir,
                                                'task_parameters.json')
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            # Persist the parameters so the executor subprocess can load
            # them from disk.
            with open(task_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            run_parameters = RunParameters(**run_parameters_dict)

            schedule_logger(job_id=job_id).info(
                f"use computing engine {run_parameters.computing_engine}")

            if run_parameters.computing_engine in {
                    ComputingEngine.EGGROLL, ComputingEngine.STANDALONE
            }:
                # EggRoll / standalone: run the executor module directly with
                # the current Python interpreter.
                process_cmd = [
                    sys.executable,
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ]
            elif run_parameters.computing_engine == ComputingEngine.SPARK:
                # Spark: run the executor module through spark-submit.
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # Additional spark-submit options come from the job's
                # spark_run configuration.
                spark_submit_config = run_parameters.spark_run

                # Only client deploy mode is supported here.
                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                # Non-"conf" entries become --key=value options; "conf"
                # entries become repeated --conf key=value pairs.
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ])
            else:
                # NOTE(review): the "$" inside this f-string looks like a
                # shell/JS-template typo and is emitted literally — confirm
                # before changing the message text.
                raise ValueError(
                    f"${run_parameters.computing_engine} is not supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess is ready'.
                format(job_id, task_id, task_version, role, party_id))
            p = job_utils.run_subprocess(job_id=job_id,
                                         config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_info["party_status"] = TaskStatus.RUNNING
                #task_info["run_pid"] = p.pid
                task_info["start_time"] = current_timestamp()
                task_executor_process_start_status = True
            else:
                task_info["party_status"] = TaskStatus.FAILED
        except Exception as e:
            schedule_logger(job_id).exception(e)
            task_info["party_status"] = TaskStatus.FAILED
        finally:
            # Always record whatever status was reached, even on failure.
            try:
                cls.update_task(task_info=task_info)
                cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess start {}'.
                format(
                    job_id, task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))