def _run(self):
        """Initialize every component listed in the job config.

        For each component, resolve its parameters for this role/party via
        ProviderManager; when parameters resolve, create the corresponding
        task, otherwise record that this party does not need to run it.

        Returns:
            dict: component_name -> {"need_run": bool}
        """
        result = {}
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=self.args.dsl,
            runtime_conf=self.args.runtime_conf,
            train_runtime_conf=self.args.train_runtime_conf,
            pipeline_dsl=self.args.pipeline_dsl)

        provider = ComponentProvider(**self.args.config["provider"])
        common_task_info = self.args.config["common_task_info"]
        log_msg = f"initialize the components: {self.args.config['components']}"
        LOGGER.info(
            start_log(log_msg,
                      role=self.args.role,
                      party_id=self.args.party_id))
        for component_name in self.args.config["components"]:
            parameters, _user_specified_parameters = ProviderManager.get_component_parameters(
                dsl_parser=dsl_parser,
                component_name=component_name,
                role=self.args.role,
                party_id=self.args.party_id,
                provider=provider)
            if parameters:
                # Build the per-task info from a fresh copy of the common
                # settings so common_task_info is never mutated across
                # iterations.  (The original code built this dict twice per
                # iteration; the first copy was dead and has been removed.)
                task_info = dict(common_task_info)
                task_info["component_name"] = component_name
                task_info["component_module"] = parameters["module"]
                task_info["provider_info"] = provider.to_dict()
                task_info["component_parameters"] = parameters
                TaskController.create_task(
                    role=self.args.role,
                    party_id=self.args.party_id,
                    run_on_this_party=common_task_info["run_on_this_party"],
                    task_info=task_info)
                result[component_name] = {"need_run": True}
            else:
                # No parameters resolved for this role/party: nothing to run.
                result[component_name] = {"need_run": False}
        LOGGER.info(
            successful_log(log_msg,
                           role=self.args.role,
                           party_id=self.args.party_id))
        return result
# Example #2
# 0
    def start_general_worker(cls,
                             worker_name: WorkerName,
                             job_id="",
                             role="",
                             party_id=0,
                             provider: ComponentProvider = None,
                             initialized_config: dict = None,
                             run_in_subprocess=True,
                             **kwargs):
        """Start a general worker (provider registrar, dependence upload, or
        task initializer), either as a subprocess or in-process.

        Builds the worker's config file and command line from the arguments,
        then launches the matching worker module.

        Args:
            worker_name: which worker to launch (a WorkerName enum member).
            job_id: job identifier; empty string means a job-less run.
            role: party role for logging and the command line.
            party_id: party id for logging and the command line.
            provider: required for PROVIDER_REGISTRAR and DEPENDENCE_UPLOAD.
            initialized_config: required for TASK_INITIALIZER; must contain a
                "provider" entry.
            run_in_subprocess: spawn a child process (True) or call the worker
                module directly in this process (False).
            **kwargs: worker-specific extras seen in this code:
                "dependence_type" (DEPENDENCE_UPLOAD command line), and
                "callback"/"callback_param" (asynchronous DEPENDENCE_UPLOAD).

        Returns:
            (returncode, result_dict) on a successful synchronous run; no
            explicit return (None) on the asynchronous DEPENDENCE_UPLOAD path.

        Raises:
            ValueError: required provider / initialized_config argument missing.
            Exception: unsupported worker name, non-zero exit code, run
                timeout, or a non-zero code from the in-process run.
        """
        if RuntimeConfig.DEBUG:
            # NOTE(review): this re-assigns the parameter's default value, so
            # it is a no-op unless the caller passed False — confirm whether
            # DEBUG was meant to force in-process (False) execution instead.
            run_in_subprocess = True
        # Snapshot of all arguments (and run_in_subprocess as adjusted above);
        # consulted later to assemble callback parameters for the
        # asynchronous DEPENDENCE_UPLOAD path.
        participate = locals()
        # Allocate a worker id plus config/log directories for this run.
        worker_id, config_dir, log_dir = cls.get_process_dirs(
            worker_name=worker_name,
            job_id=job_id,
            role=role,
            party_id=party_id)
        if worker_name in [
                WorkerName.PROVIDER_REGISTRAR, WorkerName.DEPENDENCE_UPLOAD
        ]:
            # Both of these workers are driven purely by the provider info.
            if not provider:
                raise ValueError("no provider argument")
            config = {"provider": provider.to_dict()}
            if worker_name == WorkerName.PROVIDER_REGISTRAR:
                from fate_flow.worker.provider_registrar import ProviderRegistrar
                module = ProviderRegistrar
                # Resolve the module's file path so it can be executed with
                # `python <file>` as a subprocess.
                module_file_path = sys.modules[
                    ProviderRegistrar.__module__].__file__
                specific_cmd = []
            elif worker_name == WorkerName.DEPENDENCE_UPLOAD:
                from fate_flow.worker.dependence_upload import DependenceUpload
                module = DependenceUpload
                module_file_path = sys.modules[
                    DependenceUpload.__module__].__file__
                specific_cmd = [
                    '--dependence_type',
                    kwargs.get("dependence_type")
                ]
            # Used below to build the subprocess environment.
            provider_info = provider.to_dict()
        elif worker_name is WorkerName.TASK_INITIALIZER:
            if not initialized_config:
                raise ValueError("no initialized_config argument")
            config = initialized_config
            # Persist the job conf files and get back their paths for the
            # worker's command line.
            job_conf = job_utils.save_using_job_conf(job_id=job_id,
                                                     role=role,
                                                     party_id=party_id,
                                                     config_dir=config_dir)

            from fate_flow.worker.task_initializer import TaskInitializer
            module = TaskInitializer
            module_file_path = sys.modules[TaskInitializer.__module__].__file__
            specific_cmd = [
                '--dsl',
                job_conf["dsl_path"],
                '--runtime_conf',
                job_conf["runtime_conf_path"],
                '--train_runtime_conf',
                job_conf["train_runtime_conf_path"],
                '--pipeline_dsl',
                job_conf["pipeline_dsl_path"],
            ]
            provider_info = initialized_config["provider"]
        else:
            raise Exception(f"not support {worker_name} worker")
        # Write the worker config to disk; result_path is where the worker
        # will drop its JSON result for us to read back.
        config_path, result_path = cls.get_config(config_dir=config_dir,
                                                  config=config,
                                                  log_dir=log_dir)

        # Common command line shared by every worker type.
        process_cmd = [
            sys.executable or "python3",
            module_file_path,
            "--config",
            config_path,
            '--result',
            result_path,
            "--log_dir",
            log_dir,
            "--parent_log_dir",
            os.path.dirname(log_dir),
            "--worker_id",
            worker_id,
            "--run_ip",
            RuntimeConfig.JOB_SERVER_HOST,
            "--job_server",
            f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
        ]

        # Only pass job context flags when a job id is present.
        if job_id:
            process_cmd.extend([
                "--job_id",
                job_id,
                "--role",
                role,
                "--party_id",
                party_id,
            ])

        process_cmd.extend(specific_cmd)
        if run_in_subprocess:
            p = process_utils.run_subprocess(job_id=job_id,
                                             config_dir=config_dir,
                                             process_cmd=process_cmd,
                                             added_env=cls.get_env(
                                                 job_id, provider_info),
                                             log_dir=log_dir,
                                             cwd_dir=config_dir,
                                             process_name=worker_name.value,
                                             process_id=worker_id)
            participate["pid"] = p.pid
            # Route to the per-job scheduler log when full job context exists,
            # otherwise to the global stat log.
            # NOTE(review): msg is identical in both branches; only the
            # logger choice actually differs.
            if job_id and role and party_id:
                logger = schedule_logger(job_id)
                msg = f"{worker_name} worker {worker_id} subprocess {p.pid}"
            else:
                logger = stat_logger
                msg = f"{worker_name} worker {worker_id} subprocess {p.pid}"
            logger.info(ready_log(msg=msg, role=role, party_id=party_id))

            # asynchronous
            if worker_name in [WorkerName.DEPENDENCE_UPLOAD]:
                # Fire-and-forget: optionally invoke the caller's callback
                # with the subset of local values named in callback_param.
                if kwargs.get("callback") and kwargs.get("callback_param"):
                    callback_param = {}
                    # Merge **kwargs into the locals snapshot so callback
                    # params can come from either source.
                    participate.update(participate.get("kwargs", {}))
                    for k, v in participate.items():
                        if k in kwargs.get("callback_param"):
                            callback_param[k] = v
                    kwargs.get("callback")(**callback_param)
            else:
                # Synchronous path: wait for the worker (bounded), then read
                # its JSON result file on success.
                try:
                    p.wait(timeout=120)
                    if p.returncode == 0:
                        logger.info(
                            successful_log(msg=msg,
                                           role=role,
                                           party_id=party_id))
                    else:
                        logger.info(
                            failed_log(msg=msg, role=role, party_id=party_id))
                    if p.returncode == 0:
                        return p.returncode, load_json_conf(result_path)
                    else:
                        # Point the caller at the worker's stdout/stderr and
                        # log files for diagnosis.
                        std_path = process_utils.get_std_path(
                            log_dir=log_dir,
                            process_name=worker_name.value,
                            process_id=worker_id)
                        raise Exception(
                            f"run error, please check logs: {std_path}, {log_dir}/INFO.log"
                        )
                except subprocess.TimeoutExpired as e:
                    err = failed_log(msg=f"{msg} run timeout",
                                     role=role,
                                     party_id=party_id)
                    logger.exception(err)
                    raise Exception(err)
                finally:
                    # Best-effort cleanup: kill the child (harmless if it has
                    # already exited) and reap it; never mask the in-flight
                    # exception with a cleanup failure.
                    try:
                        p.kill()
                        p.poll()
                    except Exception as e:
                        logger.exception(e)
        else:
            # In-process path: translate the command line back into function
            # kwargs and call the worker module directly.
            kwargs = cls.cmd_to_func_kwargs(process_cmd)
            code, message, result = module().run(**kwargs)
            if code == 0:
                return code, result
            else:
                raise Exception(message)