Example 1
 def gen_updated_parameters(cls, job_id, initiator_role, initiator_party_id,
                            input_job_parameters,
                            input_component_parameters):
     # TODO: validate that parameters which cannot be updated are rejected
     job_configuration = job_utils.get_job_configuration(
         job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     updated_job_parameters = job_configuration.runtime_conf["job_parameters"]
     updated_component_parameters = job_configuration.runtime_conf["component_parameters"]
     if input_job_parameters:
         if input_job_parameters.get("common"):
             common_job_parameters = RunParameters(
                 **input_job_parameters["common"])
             cls.create_common_job_parameters(
                 job_id=job_id,
                 initiator_role=initiator_role,
                 common_job_parameters=common_job_parameters)
             for attr in {"model_id", "model_version"}:
                 setattr(common_job_parameters, attr,
                         updated_job_parameters["common"].get(attr))
             updated_job_parameters[
                 "common"] = common_job_parameters.to_dict()
         # updating parameters under the "role" scope is not supported
     updated_components = set()
     if input_component_parameters:
         cls.merge_update(input_component_parameters,
                          updated_component_parameters)
     return updated_job_parameters, updated_component_parameters, list(updated_components)
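
For illustration, a minimal stand-in for the deep-merge behavior that merge_update presumably applies to the component parameters (this helper is hypothetical, not the FATE implementation): nested dicts are merged key by key and scalar values are overwritten.

def merge_update(src: dict, dest: dict) -> dict:
    # recursively merge src into dest in place
    for key, value in src.items():
        if isinstance(value, dict) and isinstance(dest.get(key), dict):
            merge_update(value, dest[key])
        else:
            dest[key] = value
    return dest

updated = {"reader_0": {"table": {"name": "old", "namespace": "ns"}}}
merge_update({"reader_0": {"table": {"name": "new"}}}, updated)
assert updated["reader_0"]["table"] == {"name": "new", "namespace": "ns"}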
Example 2
    def update_parameter(cls, job_id, role, party_id,
                         updated_parameters: dict):
        job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                            role=role,
                                                            party_id=party_id)
        job_parameters = updated_parameters.get("job_parameters")
        component_parameters = updated_parameters.get("component_parameters")
        if job_parameters:
            job_configuration.runtime_conf["job_parameters"] = job_parameters
            job_parameters = RunParameters(**job_parameters["common"])
            cls.create_job_parameters_on_party(role=role,
                                               party_id=party_id,
                                               job_parameters=job_parameters)
            job_configuration.runtime_conf_on_party["job_parameters"] = job_parameters.to_dict()
        if component_parameters:
            job_configuration.runtime_conf["component_parameters"] = component_parameters
            job_configuration.runtime_conf_on_party["component_parameters"] = component_parameters

        job_info = {
            "job_id": job_id,
            "role": role,
            "party_id": party_id,
            "runtime_conf": job_configuration.runtime_conf,
            "runtime_conf_on_party": job_configuration.runtime_conf_on_party,
        }
        JobSaver.update_job(job_info)
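
For reference, a minimal sketch of the updated_parameters payload this method expects, with the two top-level keys read above (the component names and values are illustrative, not prescribed by the code):

updated_parameters = {
    "job_parameters": {
        "common": {"task_parallelism": 2}
    },
    "component_parameters": {
        "common": {"hetero_lr_0": {"max_iter": 50}}
    },
}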
Example 3
 def update_common_parameters(self, common_parameters: RunParameters):
     if int(self.job_runtime_conf.get("dsl_version", 1)) == 2:
         self.job_runtime_conf["job_parameters"][
             "common"] = common_parameters.to_dict()
     else:
         self.job_runtime_conf["job_parameters"] = common_parameters.to_dict()
     return self.job_runtime_conf
Example 4
 def get_job_parameters_dict(self, job_parameters: RunParameters = None):
     if job_parameters:
         if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
             self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
         else:
             self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
     return self.job_runtime_conf['job_parameters']
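
Examples 3 and 4 both branch on the DSL version: with dsl_version 2 the common job parameters live under job_parameters["common"], while version 1 keeps them flat. A small self-contained sketch of the two layouts (plain dicts stand in for RunParameters.to_dict()):

common = {"task_parallelism": 1, "computing_engine": "EGGROLL"}

conf_v2 = {"dsl_version": 2, "job_parameters": {"common": {}}}
conf_v1 = {"job_parameters": {}}  # dsl_version defaults to 1

for conf in (conf_v1, conf_v2):
    if int(conf.get("dsl_version", 1)) == 2:
        conf["job_parameters"]["common"] = dict(common)
    else:
        conf["job_parameters"] = dict(common)

assert conf_v2["job_parameters"]["common"] == common
assert conf_v1["job_parameters"] == common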
Example 5
 def clean_task(cls, job_id, task_id, task_version, role, party_id,
                content_type: TaskCleanResourceType):
     status = set()
     if content_type == TaskCleanResourceType.METRICS:
         tracker = Tracker(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           task_id=task_id,
                           task_version=task_version)
         status.add(tracker.clean_metrics())
     elif content_type == TaskCleanResourceType.TABLE:
         jobs = JobSaver.query_job(job_id=job_id,
                                   role=role,
                                   party_id=party_id)
         if jobs:
             job = jobs[0]
             job_parameters = RunParameters(
                 **job.f_runtime_conf_on_party["job_parameters"])
             tracker = Tracker(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               task_id=task_id,
                               task_version=task_version,
                               job_parameters=job_parameters)
             status.add(tracker.clean_task(job.f_runtime_conf_on_party))
     # success only when every clean operation ran and returned True
     return status == {True}
Example 6
def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(
        job_id=task.f_job_id, role=task.f_role, party_id=task.f_party_id))
    session_manager_id = generate_session_id(task.f_task_id,
                                             task.f_task_version, task.f_role,
                                             task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(
            f'start run subprocess to stop task sessions {session_manager_id}')
    else:
        schedule_logger(task.f_job_id).info(
            f'task is waiting, pass stop sessions {session_manager_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                            task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        sys.executable or 'python3',
        sys.modules[session_utils.SessionStop.__module__].__file__,
        '--session', session_manager_id, '--computing',
        job_parameters.computing_engine, '--federation',
        job_parameters.federation_engine, '--storage',
        job_parameters.storage_engine, '-c',
        'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=task_dir,
                                     process_cmd=process_cmd)
    # block until the session-stop subprocess exits
    p.wait()
    p.poll()
Example 7
def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    module_name = get_component_module(component_name, job.f_dsl)
    if 'reader' in module_name.lower():
        # reader components take their input table from the per-party component parameters
        party_index = str(job.f_roles.get(job.f_role).index(int(job.f_party_id)))
        return job.f_runtime_conf.get("component_parameters", {}).get(
            "role", {}).get(job.f_role, {}).get(party_index, {}).get(component_name)
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(
        dsl_parser=dsl_parser,
        job_runtime_conf=job.f_runtime_conf,
        role=job.f_role,
        party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role,
                                          job.f_party_id)
    # the on-party job parameters double as the task parameters here
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(
        job_id=job.f_job_id,
        role=job.f_role,
        party_id=job.f_party_id,
        task_id=None,
        task_version=None,
        job_args=job_args_on_party,
        job_parameters=job_parameters,
        task_parameters=task_parameters,
        input_dsl=task_input_dsl,
        get_input_table=True)
    return component_input_table
Example 8
    def calculate_job_resource(cls,
                               job_parameters: RunParameters = None,
                               job_id=None,
                               role=None,
                               party_id=None):
        if not job_parameters:
            job_parameters = job_utils.get_job_parameters(job_id=job_id,
                                                          role=role,
                                                          party_id=party_id)
            job_parameters = RunParameters(**job_parameters)

        cores = 0
        memory = 0

        if not (job_parameters.computing_engine in IGNORE_RESOURCE_COMPUTING_ENGINE
                or (role in IGNORE_RESOURCE_ROLES
                    and job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
            cores = (int(job_parameters.adaptation_parameters["task_cores_per_node"] or 0)
                     * int(job_parameters.adaptation_parameters["task_nodes"] or 0)
                     * int(job_parameters.task_parallelism or 0))
            memory = (int(job_parameters.adaptation_parameters["task_memory_per_node"] or 0)
                      * int(job_parameters.adaptation_parameters["task_nodes"] or 0)
                      * int(job_parameters.task_parallelism or 0))

        return job_parameters.computing_engine, cores, memory
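
The job-level demand works out to cores_per_node * nodes * task_parallelism (and the same for memory). A quick worked check with illustrative numbers:

# Illustrative values only: 4 cores and 2048 MB per node, 2 nodes, parallelism 3.
adaptation = {"task_cores_per_node": 4, "task_nodes": 2, "task_memory_per_node": 2048}
task_parallelism = 3

cores = adaptation["task_cores_per_node"] * adaptation["task_nodes"] * task_parallelism
memory = adaptation["task_memory_per_node"] * adaptation["task_nodes"] * task_parallelism
assert (cores, memory) == (24, 12288)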
Example 9
 def adapt_job_parameters(cls,
                          role,
                          job_parameters: RunParameters,
                          create_initiator_baseline=False):
     ResourceManager.adapt_engine_parameters(
         role=role,
         job_parameters=job_parameters,
         create_initiator_baseline=create_initiator_baseline)
     if create_initiator_baseline:
         if job_parameters.task_parallelism is None:
             job_parameters.task_parallelism = JobDefaultConfig.task_parallelism
         if job_parameters.federated_status_collect_type is None:
             job_parameters.federated_status_collect_type = JobDefaultConfig.federated_status_collect_type
     if create_initiator_baseline and not job_parameters.computing_partitions:
         job_parameters.computing_partitions = (
             job_parameters.adaptation_parameters["task_cores_per_node"]
             * job_parameters.adaptation_parameters["task_nodes"])
Example 10
 def get_job_engines_address(cls, job_parameters: RunParameters):
     engines_info = {}
     engine_list = [(EngineType.COMPUTING, job_parameters.computing_engine),
                    (EngineType.FEDERATION,
                     job_parameters.federation_engine),
                    (EngineType.STORAGE, job_parameters.storage_engine)]
     for engine_type, engine_name in engine_list:
         engine_info = ResourceManager.get_engine_registration_info(
             engine_type=engine_type, engine_name=engine_name)
         job_parameters.engines_address[engine_type] = engine_info.f_engine_config if engine_info else {}
         engines_info[engine_type] = engine_info
     return engines_info
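
The loop leaves job_parameters.engines_address keyed by engine type. A sketch of the resulting shape, assuming the EngineType constants are the plain strings "computing", "federation" and "storage" (the addresses below are illustrative):

engines_address = {
    "computing": {"cores_per_node": 16, "nodes": 1},
    "federation": {"host": "127.0.0.1", "port": 9370},
    "storage": {},  # empty when no registration info was found
}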
Example 11
 def get_common_parameters(self):
     if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
         job_parameters = RunParameters(**self.job_runtime_conf.get(
             "job_parameters", {}).get("common", {}))
         self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
     else:
         # dsl v1: map the legacy processors_per_node setting onto the eggroll session option
         if "processors_per_node" in self.job_runtime_conf['job_parameters']:
             self.job_runtime_conf['job_parameters']["eggroll_run"] = {
                 "eggroll.session.processors.per.node":
                     self.job_runtime_conf['job_parameters']["processors_per_node"]
             }
         job_parameters = RunParameters(**self.job_runtime_conf['job_parameters'])
         self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
     return job_parameters
Example 12
 def calculate_task_resource(cls,
                             task_parameters: RunParameters = None,
                             task_info: dict = None):
     if not task_parameters:
         job_parameters = job_utils.get_job_parameters(
             job_id=task_info["job_id"],
             role=task_info["role"],
             party_id=task_info["party_id"])
         task_parameters = RunParameters(**job_parameters)
     if (task_parameters.computing_engine in IGNORE_RESOURCE_COMPUTING_ENGINE
             or (task_info["role"] in IGNORE_RESOURCE_ROLES
                 and task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
         cores_per_task = 0
         memory_per_task = 0
     else:
         cores_per_task = (task_parameters.adaptation_parameters["task_cores_per_node"]
                           * task_parameters.adaptation_parameters["task_nodes"])
         memory_per_task = (task_parameters.adaptation_parameters["task_memory_per_node"]
                            * task_parameters.adaptation_parameters["task_nodes"])
     return cores_per_task, memory_per_task
Example 13
 def create_new_version_task(cls, job, task, dsl_parser, auto):
     # stop old version task
     FederatedScheduler.stop_task(job=job,
                                  task=task,
                                  stop_status=TaskStatus.CANCELED)
     FederatedScheduler.clean_task(
         job=job, task=task, content_type=TaskCleanResourceType.METRICS)
     # create new version task
     task.f_task_version += 1
     if auto:
         task.f_auto_retries -= 1
     task.f_run_pid = None
     task.f_run_ip = None
     # todo: FederatedScheduler.create_task and JobController.initialize_tasks will create the task twice
     status_code, response = FederatedScheduler.create_task(job=job,
                                                            task=task)
     if status_code != FederatedSchedulingStatusCode.SUCCESS:
         raise Exception(f"create {task.f_task_id} new version failed")
     # create task records in the initiator's DB for every other participant, so the scheduler can track them
     for _role in response:
         for _party_id in response[_role]:
             if _role == job.f_initiator_role and _party_id == job.f_initiator_party_id:
                 continue
             JobController.initialize_tasks(
                 job_id=job.f_job_id,
                 role=_role,
                 party_id=_party_id,
                 run_on_this_party=False,
                 initiator_role=job.f_initiator_role,
                 initiator_party_id=job.f_initiator_party_id,
                 job_parameters=RunParameters(
                     **job.f_runtime_conf_on_party["job_parameters"]),
                 dsl_parser=dsl_parser,
                 components=[task.f_component_name],
                 task_version=task.f_task_version,
                 auto_retries=task.f_auto_retries)
     schedule_logger(job.f_job_id).info(
         f"create task {task.f_task_id} new version {task.f_task_version} successfully"
     )
Example 14
    def start_task_worker(cls,
                          worker_name,
                          task: Task,
                          task_parameters: RunParameters = None,
                          executable: list = None,
                          extra_env: dict = None,
                          **kwargs):
        worker_id, config_dir, log_dir = cls.get_process_dirs(
            worker_name=worker_name,
            job_id=task.f_job_id,
            role=task.f_role,
            party_id=task.f_party_id,
            task=task)

        session_id = job_utils.generate_session_id(task.f_task_id,
                                                   task.f_task_version,
                                                   task.f_role,
                                                   task.f_party_id)
        federation_session_id = job_utils.generate_task_version_id(
            task.f_task_id, task.f_task_version)

        info_kwargs = {}
        specific_cmd = []
        if worker_name is WorkerName.TASK_EXECUTOR:
            from fate_flow.worker.task_executor import TaskExecutor
            module_file_path = sys.modules[TaskExecutor.__module__].__file__
        else:
            raise Exception(f"unsupported worker name: {worker_name}")

        if task_parameters is None:
            task_parameters = RunParameters(**job_utils.get_job_parameters(
                task.f_job_id, task.f_role, task.f_party_id))

        config = task_parameters.to_dict()
        config["src_user"] = kwargs.get("src_user")
        config_path, result_path = cls.get_config(config_dir=config_dir,
                                                  config=config,
                                                  log_dir=log_dir)

        if executable:
            process_cmd = executable
        else:
            process_cmd = [sys.executable or "python3"]

        common_cmd = [
            module_file_path,
            "--job_id",
            task.f_job_id,
            "--component_name",
            task.f_component_name,
            "--task_id",
            task.f_task_id,
            "--task_version",
            task.f_task_version,
            "--role",
            task.f_role,
            "--party_id",
            task.f_party_id,
            "--config",
            config_path,
            '--result',
            result_path,
            "--log_dir",
            log_dir,
            "--parent_log_dir",
            os.path.dirname(log_dir),
            "--worker_id",
            worker_id,
            "--run_ip",
            RuntimeConfig.JOB_SERVER_HOST,
            "--job_server",
            f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
            "--session_id",
            session_id,
            "--federation_session_id",
            federation_session_id,
        ]
        process_cmd.extend(common_cmd)
        process_cmd.extend(specific_cmd)
        env = cls.get_env(task.f_job_id, task.f_provider_info)
        if extra_env:
            env.update(extra_env)
        schedule_logger(task.f_job_id).info(
            f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} {worker_name} worker subprocess is ready"
        )
        p = process_utils.run_subprocess(job_id=task.f_job_id,
                                         config_dir=config_dir,
                                         process_cmd=process_cmd,
                                         added_env=env,
                                         log_dir=log_dir,
                                         cwd_dir=config_dir,
                                         process_name=worker_name.value,
                                         process_id=worker_id)
        cls.save_worker_info(task=task,
                             worker_name=worker_name,
                             worker_id=worker_id,
                             run_ip=RuntimeConfig.JOB_SERVER_HOST,
                             run_pid=p.pid,
                             config=config,
                             cmd=process_cmd,
                             **info_kwargs)
        return {"run_pid": p.pid, "worker_id": worker_id, "cmd": process_cmd}
Example 15
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get(
                                     'src_party_id', None),
                                 dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 role=role,
                                 party_id=party_id)

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=dsl,
            runtime_conf=runtime_conf,
            train_runtime_conf=train_runtime_conf)
        job_parameters = dsl_parser.get_job_parameters(runtime_conf)
        schedule_logger(job_id).info(
            'job parameters:{}'.format(job_parameters))
        dest_user = job_parameters.get(role, {}).get(party_id,
                                                     {}).get('user', '')
        user = {}
        src_party_id = int(job_info['src_party_id']) if job_info.get('src_party_id') else 0
        src_role = job_info.get('src_role', '')
        src_user = job_parameters.get(src_role, {}).get(src_party_id, {}).get('user', '') if src_role else ''
        for _role, party_id_item in job_parameters.items():
            user[_role] = {}
            for _party_id, _parameters in party_id_item.items():
                user[_role][_party_id] = _parameters.get("user", "")
        schedule_logger(job_id).info('job user:{}'.format(user))
        if USE_DATA_AUTHENTICATION:
            job_args = dsl_parser.get_args_input()
            schedule_logger(job_id).info('job args:{}'.format(job_args))
            dataset_dict = cls.get_dataset(False, role, party_id,
                                           runtime_conf.get("role"), job_args)
            dataset_list = []
            if dataset_dict.get(role, {}).get(party_id):
                for k, v in dataset_dict[role][party_id].items():
                    parts = v.split('.')
                    dataset_list.append({"namespace": parts[0], "table_name": parts[1]})
            data_authentication_check(
                src_role=job_info.get('src_role'),
                src_party_id=job_info.get('src_party_id'),
                src_user=src_user,
                dest_user=dest_user,
                dataset_list=dataset_list)
        job_parameters = RunParameters(
            **job_parameters.get(role, {}).get(party_id, {}))

        # save new job into db
        if role == job_info["initiator_role"] and party_id == job_info[
                "initiator_party_id"]:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.READY
        job_info["user_id"] = dest_user
        job_info["src_user"] = src_user
        job_info["user"] = user
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        cls.create_job_parameters_on_party(role=role,
                                           party_id=party_id,
                                           job_parameters=job_parameters)
        # update job parameters on party
        job_info["runtime_conf_on_party"][
            "job_parameters"] = job_parameters.to_dict()
        JobSaver.create_job(job_info=job_info)
        schedule_logger(job_id).info("start initialize tasks")
        initialized_result, provider_group = cls.initialize_tasks(
            job_id=job_id,
            role=role,
            party_id=party_id,
            run_on_this_party=True,
            initiator_role=job_info["initiator_role"],
            initiator_party_id=job_info["initiator_party_id"],
            job_parameters=job_parameters,
            dsl_parser=dsl_parser)
        schedule_logger(job_id).info("initialize tasks success")
        for provider_key, group_info in provider_group.items():
            for cpn in group_info["components"]:
                dsl["components"][cpn]["provider"] = provider_key

        roles = job_info['roles']
        cls.initialize_job_tracker(job_id=job_id,
                                   role=role,
                                   party_id=party_id,
                                   job_parameters=job_parameters,
                                   roles=roles,
                                   is_initiator=is_initiator,
                                   dsl_parser=dsl_parser)

        job_utils.save_job_conf(
            job_id=job_id,
            role=role,
            party_id=party_id,
            dsl=dsl,
            runtime_conf=runtime_conf,
            runtime_conf_on_party=job_info["runtime_conf_on_party"],
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)
        return {"components": initialized_result}
Example 16
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
        PrivilegeAuth.authentication_component(
            job_dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess"
        )
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        is_failed = False
        try:
            task = JobSaver.query_task(task_id=task_id,
                                       task_version=task_version,
                                       role=role,
                                       party_id=party_id)[0]
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            run_parameters_dict["src_user"] = kwargs.get("src_user")
            run_parameters = RunParameters(**run_parameters_dict)

            config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                      component_name, task_id,
                                                      task_version)
            os.makedirs(config_dir, exist_ok=True)

            run_parameters_path = os.path.join(config_dir,
                                               'task_parameters.json')
            with open(run_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            schedule_logger(job_id).info(
                f"use computing engine {run_parameters.computing_engine}")
            task_info["engine_conf"] = {
                "computing_engine": run_parameters.computing_engine
            }
            backend_engine = build_engine(run_parameters.computing_engine)
            run_info = backend_engine.run(
                task=task,
                run_parameters=run_parameters,
                run_parameters_path=run_parameters_path,
                config_dir=config_dir,
                log_dir=job_utils.get_job_log_directory(
                    job_id, role, party_id, component_name),
                cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                    component_name),
                user_name=kwargs.get("user_id"))
            task_info.update(run_info)
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
            is_failed = True
        finally:
            try:
                cls.update_task(task_info=task_info)
                task_info["party_status"] = TaskStatus.RUNNING
                cls.update_task_status(task_info=task_info)
                if is_failed:
                    task_info["party_status"] = TaskStatus.FAILED
                    cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                "task {} {} on {} {} executor subprocess start {}".format(
                    task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example 17
    def adapt_engine_parameters(cls,
                                role,
                                job_parameters: RunParameters,
                                create_initiator_baseline=False):
        computing_engine_info = ResourceManager.get_engine_registration_info(
            engine_type=EngineType.COMPUTING,
            engine_name=job_parameters.computing_engine)
        if not job_parameters.adaptation_parameters or create_initiator_baseline:
            job_parameters.adaptation_parameters = {
                "task_nodes": 0,
                "task_cores_per_node": 0,
                "task_memory_per_node": 0,
                # request_task_cores is based on the initiator and distributed to all
                # parties, using job conf parameters or the initiator fateflow server
                # default settings
                "request_task_cores": int(job_parameters.task_cores)
                if job_parameters.task_cores else JobDefaultConfig.task_cores,
                "if_initiator_baseline": True,
            }
        else:
            # use initiator baseline
            if role == "arbiter":
                job_parameters.adaptation_parameters["request_task_cores"] = 1
            elif "request_task_cores" not in job_parameters.adaptation_parameters:
                # compatibility 1.5.0
                job_parameters.adaptation_parameters[
                    "request_task_cores"] = job_parameters.adaptation_parameters[
                        "task_nodes"] * job_parameters.adaptation_parameters[
                            "task_cores_per_node"]

            job_parameters.adaptation_parameters[
                "if_initiator_baseline"] = False
        adaptation_parameters = job_parameters.adaptation_parameters

        if job_parameters.computing_engine in {
                ComputingEngine.STANDALONE, ComputingEngine.EGGROLL
        }:
            adaptation_parameters["task_nodes"] = computing_engine_info.f_nodes
            if int(job_parameters.eggroll_run.get(
                    "eggroll.session.processors.per.node", 0)) > 0:
                adaptation_parameters["task_cores_per_node"] = int(
                    job_parameters.eggroll_run["eggroll.session.processors.per.node"])
            else:
                adaptation_parameters["task_cores_per_node"] = max(
                    1, int(adaptation_parameters["request_task_cores"] /
                           adaptation_parameters["task_nodes"]))
            if not create_initiator_baseline:
                # write the adapted value back as the actual engine session option
                job_parameters.eggroll_run["eggroll.session.processors.per.node"] = \
                    adaptation_parameters["task_cores_per_node"]
        elif job_parameters.computing_engine in {ComputingEngine.SPARK, ComputingEngine.LINKIS_SPARK}:
            adaptation_parameters["task_nodes"] = int(
                job_parameters.spark_run.get("num-executors",
                                             computing_engine_info.f_nodes))
            if int(job_parameters.spark_run.get("executor-cores", 0)) > 0:
                adaptation_parameters["task_cores_per_node"] = int(
                    job_parameters.spark_run["executor-cores"])
            else:
                adaptation_parameters["task_cores_per_node"] = max(
                    1,
                    int(adaptation_parameters["request_task_cores"] /
                        adaptation_parameters["task_nodes"]))
            if not create_initiator_baseline:
                # write the adapted values back as the actual engine run options
                job_parameters.spark_run["num-executors"] = adaptation_parameters["task_nodes"]
                job_parameters.spark_run["executor-cores"] = adaptation_parameters["task_cores_per_node"]
Example 18
 def set_federated_mode(cls, job_parameters: RunParameters):
     if not job_parameters.federated_mode:
         job_parameters.federated_mode = ENGINES["federated_mode"]
Example 19
    def save_pipelined_model(cls, job_id, role, party_id):
        schedule_logger(job_id).info(
            f"start to save pipeline model on {role} {party_id}")
        job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                            role=role,
                                                            party_id=party_id)
        runtime_conf_on_party = job_configuration.runtime_conf_on_party
        job_parameters = runtime_conf_on_party.get('job_parameters', {})
        if role in job_parameters.get("assistant_role", []):
            return
        model_id = job_parameters['model_id']
        model_version = job_parameters['model_version']
        job_type = job_parameters.get('job_type', '')
        roles = runtime_conf_on_party['role']
        initiator_role = runtime_conf_on_party['initiator']['role']
        initiator_party_id = runtime_conf_on_party['initiator']['party_id']
        if job_type == 'predict':
            return
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job_configuration.dsl,
            runtime_conf=job_configuration.runtime_conf,
            train_runtime_conf=job_configuration.train_runtime_conf)

        components_parameters = {}
        tasks = JobSaver.query_task(job_id=job_id,
                                    role=role,
                                    party_id=party_id,
                                    only_latest=True)
        for task in tasks:
            components_parameters[task.f_component_name] = task.f_component_parameters
        predict_dsl = schedule_utils.fill_inference_dsl(
            dsl_parser,
            origin_inference_dsl=job_configuration.dsl,
            components_parameters=components_parameters)

        pipeline = pipeline_pb2.Pipeline()
        pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
        pipeline.train_dsl = json_dumps(job_configuration.dsl, byte=True)
        pipeline.train_runtime_conf = json_dumps(
            job_configuration.runtime_conf, byte=True)
        pipeline.fate_version = RuntimeConfig.get_env("FATE")
        pipeline.model_id = model_id
        pipeline.model_version = model_version

        pipeline.parent = True
        pipeline.loaded_times = 0
        pipeline.roles = json_dumps(roles, byte=True)
        pipeline.initiator_role = initiator_role
        pipeline.initiator_party_id = initiator_party_id
        pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party,
                                                    byte=True)
        pipeline.parent_info = json_dumps({}, byte=True)

        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          model_id=model_id,
                          model_version=model_version,
                          job_parameters=RunParameters(**job_parameters))
        tracker.save_pipeline_model(pipeline_buffer_object=pipeline)
        if role != 'local':
            tracker.save_machine_learning_model_info()
        schedule_logger(job_id).info(
            f"save pipeline on {role} {party_id} successfully")
Example 20
    def _run_(self):
        # todo: audit all function calls whose errors should be raised to the caller
        args = self.args
        start_time = current_timestamp()
        try:
            LOGGER.info(
                f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task'
            )
            self.report_info.update({
                "job_id": args.job_id,
                "component_name": args.component_name,
                "task_id": args.task_id,
                "task_version": args.task_version,
                "role": args.role,
                "party_id": args.party_id,
                "run_ip": args.run_ip,
                "run_pid": self.run_pid
            })
            operation_client = OperationClient()
            job_configuration = JobConfiguration(
                **operation_client.get_job_conf(
                    args.job_id, args.role, args.party_id, args.component_name,
                    args.task_id, args.task_version))
            task_parameters_conf = args.config
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_configuration.dsl,
                runtime_conf=job_configuration.runtime_conf,
                train_runtime_conf=job_configuration.train_runtime_conf,
                pipeline_dsl=None)

            job_parameters = dsl_parser.get_job_parameters(
                job_configuration.runtime_conf)
            user_name = job_parameters.get(args.role,
                                           {}).get(args.party_id,
                                                   {}).get("user", '')
            LOGGER.info(f"user name:{user_name}")
            src_user = task_parameters_conf.get("src_user")
            task_parameters = RunParameters(**task_parameters_conf)
            job_parameters = task_parameters
            if job_parameters.assistant_role:
                TaskExecutor.monkey_patch()

            job_args_on_party = TaskExecutor.get_job_args_on_party(
                dsl_parser, job_configuration.runtime_conf_on_party, args.role,
                args.party_id)
            component = dsl_parser.get_component_info(
                component_name=args.component_name)
            module_name = component.get_module()
            task_input_dsl = component.get_input()
            task_output_dsl = component.get_output()

            kwargs = {
                'job_id': args.job_id,
                'role': args.role,
                'party_id': args.party_id,
                'component_name': args.component_name,
                'task_id': args.task_id,
                'task_version': args.task_version,
                'model_id': job_parameters.model_id,
                'model_version': job_parameters.model_version,
                'component_module_name': module_name,
                'job_parameters': job_parameters,
            }
            tracker = Tracker(**kwargs)
            tracker_client = TrackerClient(**kwargs)
            checkpoint_manager = CheckpointManager(**kwargs)

            self.report_info["party_status"] = TaskStatus.RUNNING
            self.report_task_info_to_driver()

            previous_components_parameters = tracker_client.get_model_run_parameters()
            LOGGER.info(
                f"previous_components_parameters:\n{json_dumps(previous_components_parameters, indent=4)}"
            )

            component_provider, component_parameters_on_party, user_specified_parameters = ProviderManager.get_component_run_info(
                dsl_parser=dsl_parser,
                component_name=args.component_name,
                role=args.role,
                party_id=args.party_id,
                previous_components_parameters=previous_components_parameters)
            RuntimeConfig.set_component_provider(component_provider)
            LOGGER.info(
                f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}"
            )
            flow_feeded_parameters = {
                "output_data_name": task_output_dsl.get("data")
            }

            # init environment, process is shared globally
            RuntimeConfig.init_config(
                COMPUTING_ENGINE=job_parameters.computing_engine,
                FEDERATION_ENGINE=job_parameters.federation_engine,
                FEDERATED_MODE=job_parameters.federated_mode)

            if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
                session_options = task_parameters.eggroll_run.copy()
                session_options["python.path"] = os.getenv("PYTHONPATH")
                session_options["python.venv"] = os.getenv("VIRTUAL_ENV")
            else:
                session_options = {}

            sess = session.Session(session_id=args.session_id)
            sess.as_global()
            sess.init_computing(computing_session_id=args.session_id,
                                options=session_options)
            component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
            roles = job_configuration.runtime_conf["role"]
            if set(roles) == {"local"}:
                LOGGER.info(f"only local roles, pass init federation")
            else:
                sess.init_federation(
                    federation_session_id=args.federation_session_id,
                    runtime_conf=component_parameters_on_party,
                    service_conf=job_parameters.engines_address.get(
                        EngineType.FEDERATION, {}))
            LOGGER.info(
                f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task'
            )
            LOGGER.info(
                f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}"
            )
            LOGGER.info(f"task input dsl {task_input_dsl}")
            task_run_args, input_table_list = self.get_task_run_args(
                job_id=args.job_id,
                role=args.role,
                party_id=args.party_id,
                task_id=args.task_id,
                task_version=args.task_version,
                job_args=job_args_on_party,
                job_parameters=job_parameters,
                task_parameters=task_parameters,
                input_dsl=task_input_dsl,
            )
            if module_name in {
                    "Upload", "Download", "Reader", "Writer", "Checkpoint"
            }:
                task_run_args["job_parameters"] = job_parameters
            LOGGER.info(f"task input args {task_run_args}")

            need_run = component_parameters_on_party.get("ComponentParam",
                                                         {}).get(
                                                             "need_run", True)
            provider_interface = provider_utils.get_provider_interface(
                provider=component_provider)
            run_object = provider_interface.get(
                module_name,
                ComponentRegistry.get_provider_components(
                    provider_name=component_provider.name,
                    provider_version=component_provider.version)
            ).get_run_obj(self.args.role)
            flow_feeded_parameters.update({"table_info": input_table_list})
            cpn_input = ComponentInput(
                tracker=tracker_client,
                checkpoint_manager=checkpoint_manager,
                task_version_id=job_utils.generate_task_version_id(
                    args.task_id, args.task_version),
                parameters=component_parameters_on_party["ComponentParam"],
                datasets=task_run_args.get("data", None),
                caches=task_run_args.get("cache", None),
                models=dict(
                    model=task_run_args.get("model"),
                    isometric_model=task_run_args.get("isometric_model"),
                ),
                job_parameters=job_parameters,
                roles=dict(
                    role=component_parameters_on_party["role"],
                    local=component_parameters_on_party["local"],
                ),
                flow_feeded_parameters=flow_feeded_parameters,
            )
            profile_log_enabled = False
            try:
                if int(os.getenv("FATE_PROFILE_LOG_ENABLED", "0")) > 0:
                    profile_log_enabled = True
            except Exception as e:
                LOGGER.warning(e)
            if profile_log_enabled:
                # add profile logs
                LOGGER.info("profile logging is enabled")
                profile.profile_start()
                cpn_output = run_object.run(cpn_input)
                sess.wait_remote_all_done()
                profile.profile_ends()
            else:
                LOGGER.info("profile logging is disabled")
                cpn_output = run_object.run(cpn_input)
                sess.wait_remote_all_done()

            output_table_list = []
            LOGGER.info(f"task output data {cpn_output.data}")
            for index, data in enumerate(cpn_output.data):
                data_name = (task_output_dsl['data'][index]
                             if task_output_dsl.get('data') else str(index))
                # todo: the token depends on the engine type; it may belong in job parameters
                persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                    computing_table=data,
                    output_storage_engine=job_parameters.storage_engine,
                    token={"username": user_name})
                if persistent_table_namespace and persistent_table_name:
                    tracker.log_output_data_info(
                        data_name=data_name,
                        table_namespace=persistent_table_namespace,
                        table_name=persistent_table_name)
                    output_table_list.append({
                        "namespace": persistent_table_namespace,
                        "name": persistent_table_name
                    })
            self.log_output_data_table_tracker(args.job_id, input_table_list,
                                               output_table_list)

            # There is only one model output at the current dsl version.
            tracker_client.save_component_output_model(
                model_buffers=cpn_output.model,
                model_alias=(task_output_dsl['model'][0]
                             if task_output_dsl.get('model') else 'default'),
                user_specified_run_parameters=user_specified_parameters)
            if cpn_output.cache is not None:
                for i, cache in enumerate(cpn_output.cache):
                    if cache is None:
                        continue
                    name = (task_output_dsl["cache"][i]
                            if "cache" in task_output_dsl else str(i))
                    if isinstance(cache, DataCache):
                        tracker.tracking_output_cache(cache, cache_name=name)
                    elif isinstance(cache, tuple):
                        tracker.save_output_cache(
                            cache_data=cache[0],
                            cache_meta=cache[1],
                            cache_name=name,
                            output_storage_engine=job_parameters.storage_engine,
                            output_storage_address=job_parameters.engines_address.get(
                                EngineType.STORAGE, {}),
                            token={"username": user_name})
                    else:
                        raise RuntimeError(
                            f"unsupported output cache type {type(cache)} from module run object")
            if need_run:
                self.report_info["party_status"] = TaskStatus.SUCCESS
            else:
                self.report_info["party_status"] = TaskStatus.PASS
        except PassError:
            self.report_info["party_status"] = TaskStatus.PASS
        except Exception as e:
            traceback.print_exc()
            self.report_info["party_status"] = TaskStatus.FAILED
            LOGGER.exception(e)
        finally:
            try:
                self.report_info["end_time"] = current_timestamp()
                self.report_info["elapsed"] = self.report_info["end_time"] - start_time
                self.report_task_info_to_driver()
            except Exception as e:
                self.report_info["party_status"] = TaskStatus.FAILED
                traceback.print_exc()
                LOGGER.exception(e)
        msg = f"finish {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} with {self.report_info['party_status']}"
        LOGGER.info(msg)
        print(msg)
        return self.report_info
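
Pulled together, the report_info payload this worker sends back to the driver ends up with roughly the following shape; the keys come from the assignments above, the values here are illustrative:

report_info = {
    "job_id": "202201010000001",
    "component_name": "hetero_lr_0",
    "task_id": "202201010000001_hetero_lr_0",
    "task_version": 0,
    "role": "guest",
    "party_id": 9999,
    "run_ip": "127.0.0.1",
    "run_pid": 12345,
    "party_status": "success",  # TaskStatus.SUCCESS, PASS, or FAILED
    "end_time": 1640995200000,  # current_timestamp() values, assumed milliseconds
    "elapsed": 42000,
}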