def get_job_parameters_dict(self, job_parameters: RunParameters = None):
    if job_parameters:
        if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
            self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
        else:
            self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
    return self.job_runtime_conf['job_parameters']
def special_role_parameters(cls, role, job_parameters: RunParameters):
    if role == "arbiter":
        job_parameters.task_parallelism = 1
        if job_parameters.adaptation_parameters["task_nodes"] > 0:
            job_parameters.adaptation_parameters["task_nodes"] = 1
        if job_parameters.adaptation_parameters["task_cores_per_node"] > 0:
            job_parameters.adaptation_parameters["task_cores_per_node"] = 1
def update_common_parameters(self, common_parameters: RunParameters):
    if int(self.job_runtime_conf.get("dsl_version", 1)) == 2:
        if "common" not in self.job_runtime_conf["job_parameters"]:
            raise RuntimeError("the configuration format for dsl version 2 must be job_parameters:common")
        self.job_runtime_conf["job_parameters"]["common"] = common_parameters.to_dict()
    else:
        self.job_runtime_conf["job_parameters"] = common_parameters.to_dict()
    return self.job_runtime_conf
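# Illustrative sketch (not part of the original module): the two runtime-conf
# layouts the accessors above switch on. In dsl_version 1 the job parameters
# sit directly under "job_parameters"; in dsl_version 2 they are nested under
# a "common" key. The parameter values here are hypothetical.
runtime_conf_v1_example = {
    "dsl_version": 1,
    "job_parameters": {"work_mode": 1, "task_parallelism": 2},
}
runtime_conf_v2_example = {
    "dsl_version": 2,
    "job_parameters": {"common": {"task_parallelism": 2}},
}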
def adapt_job_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    ResourceManager.adapt_engine_parameters(role=role, job_parameters=job_parameters,
                                            create_initiator_baseline=create_initiator_baseline)
    if create_initiator_baseline:
        if job_parameters.task_parallelism is None:
            job_parameters.task_parallelism = DEFAULT_TASK_PARALLELISM
        if job_parameters.federated_status_collect_type is None:
            job_parameters.federated_status_collect_type = DEFAULT_FEDERATED_STATUS_COLLECT_TYPE
    if create_initiator_baseline and not job_parameters.computing_partitions:
        job_parameters.computing_partitions = job_parameters.adaptation_parameters["task_cores_per_node"] * \
                                              job_parameters.adaptation_parameters["task_nodes"]
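# Illustrative sketch (not part of the original module): with hypothetical
# adaptation parameters, the default computing_partitions chosen above is
# simply the total task cores across all nodes.
adaptation_parameters_example = {"task_cores_per_node": 4, "task_nodes": 2}
computing_partitions_example = adaptation_parameters_example["task_cores_per_node"] * \
                               adaptation_parameters_example["task_nodes"]
assert computing_partitions_example == 8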
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)
    job_parameters = RunParameters(**runtime_conf['job_parameters'])
    job_initiator = runtime_conf['initiator']
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    # save new job into db
    if role == job_initiator['role'] and party_id == job_initiator['party_id']:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    roles = job_info['roles']
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.special_role_parameters(role=role, job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, engines_info=engines_info)
    runtime_conf["job_parameters"] = job_parameters.to_dict()
    JobSaver.create_job(job_info=job_info)
    job_utils.save_job_conf(job_id=job_id,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id, role, party_id, True, job_initiator, job_parameters, dsl_parser)
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_info=job_info, is_initiator=is_initiator, dsl_parser=dsl_parser)
def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    if 'reader' in component_name:
        component_parameters = component.get_role_parameters()
        return component_parameters[job.f_role][0]['ReaderParam']
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser=dsl_parser,
                                                           job_runtime_conf=job.f_runtime_conf,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role, job.f_party_id)
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(job_id=job.f_job_id, role=job.f_role,
                                                           party_id=job.f_party_id,
                                                           task_id=None,
                                                           task_version=None,
                                                           job_args=job_args_on_party,
                                                           job_parameters=job_parameters,
                                                           task_parameters=task_parameters,
                                                           input_dsl=task_input_dsl,
                                                           get_input_table=True)
    return component_input_table
def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(job_id=task.f_job_id,
                                                        role=task.f_role,
                                                        party_id=task.f_party_id))
    computing_session_id = generate_session_id(task.f_task_id, task.f_task_version,
                                               task.f_role, task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(f'start run subprocess to stop task session {computing_session_id}')
    else:
        schedule_logger(task.f_job_id).info(f'task is waiting, pass stop session {computing_session_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id), task.f_role,
                            task.f_party_id, task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        'python3', sys.modules[session_utils.SessionStop.__module__].__file__,
        '-j', computing_session_id,
        '--computing', job_parameters.computing_engine,
        '--federation', job_parameters.federation_engine,
        '--storage', job_parameters.storage_engine,
        '-c', 'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = run_subprocess(job_id=task.f_job_id, config_dir=task_dir, process_cmd=process_cmd, log_dir=None)
def clean_task(cls, job_id, task_id, task_version, role, party_id, content_type):
    status = set()
    if content_type == "metrics":
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          task_id=task_id, task_version=task_version)
        status.add(tracker.clean_metrics())
    elif content_type == "table":
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            job_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                              task_id=task_id, task_version=task_version,
                              job_parameters=job_parameters)
            status.add(tracker.clean_task(job.f_runtime_conf_on_party))
    if len(status) == 1 and True in status:
        return True
    else:
        return False
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
    schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
    job_parameters = RunParameters(**job_parameters)
    # save new job into db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id,
                         engines_info=engines_info)
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id, run_on_this_party=True,
                         initiator_role=job_info["initiator_role"],
                         initiator_party_id=job_info["initiator_party_id"],
                         job_parameters=job_parameters, dsl_parser=dsl_parser)
    job_parameters = job_info['runtime_conf_on_party']['job_parameters']
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_parameters=job_parameters, roles=roles,
                               is_initiator=is_initiator, dsl_parser=dsl_parser)
    JobSaver.create_job(job_info=job_info)
def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
    if not job_parameters:
        dsl, runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                role=role,
                                                                                party_id=party_id)
        job_parameters = RunParameters(**runtime_conf["job_parameters"])
    cores = job_parameters.adaptation_parameters["task_cores_per_node"] * \
            job_parameters.adaptation_parameters["task_nodes"] * \
            job_parameters.task_parallelism
    memory = job_parameters.adaptation_parameters["task_memory_per_node"] * \
             job_parameters.adaptation_parameters["task_nodes"] * \
             job_parameters.task_parallelism
    return job_parameters.computing_engine, cores, memory
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        dsl, runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=task_info["job_id"],
                                                                                role=task_info["role"],
                                                                                party_id=task_info["party_id"])
        task_parameters = RunParameters(**runtime_conf["job_parameters"])
    cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                     task_parameters.adaptation_parameters["task_nodes"]
    memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                      task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task
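# Illustrative sketch (not part of the original module): the resource
# arithmetic in the two calculators above, worked with hypothetical numbers.
# A single task reserves one per-node slice times the node count; a job
# multiplies that slice by task_parallelism.
adaptation_example = {"task_cores_per_node": 4, "task_memory_per_node": 1024, "task_nodes": 2}
task_parallelism_example = 2
cores_per_task = adaptation_example["task_cores_per_node"] * adaptation_example["task_nodes"]     # 8
memory_per_task = adaptation_example["task_memory_per_node"] * adaptation_example["task_nodes"]   # 2048
job_cores = cores_per_task * task_parallelism_example                                             # 16
job_memory = memory_per_task * task_parallelism_example                                           # 4096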
def get_job_engines_address(cls, job_parameters: RunParameters):
    engines_info = {}
    engine_list = [
        (EngineType.COMPUTING, job_parameters.computing_engine),
        (EngineType.FEDERATION, job_parameters.federation_engine),
        (EngineType.STORAGE, job_parameters.storage_engine)
    ]
    for engine_type, engine_name in engine_list:
        engine_info = ResourceManager.get_engine_registration_info(engine_type=engine_type,
                                                                   engine_name=engine_name)
        job_parameters.engines_address[engine_type] = engine_info.f_engine_config
        engines_info[engine_type] = engine_info
    return engines_info
def adapt_engine_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    computing_engine_info = ResourceManager.get_engine_registration_info(engine_type=EngineType.COMPUTING,
                                                                         engine_name=job_parameters.computing_engine)
    if create_initiator_baseline:
        job_parameters.adaptation_parameters = {
            "task_nodes": 0,
            "task_cores_per_node": 0,
            "task_memory_per_node": 0,
        }
        task_cores = 0
    else:
        # use initiator baseline
        if role == "arbiter":
            task_cores = 1
        else:
            task_cores = job_parameters.adaptation_parameters["task_nodes"] * \
                         job_parameters.adaptation_parameters["task_cores_per_node"]
    if job_parameters.computing_engine in {ComputingEngine.STANDALONE, ComputingEngine.EGGROLL}:
        job_parameters.adaptation_parameters["task_nodes"] = computing_engine_info.f_nodes
        job_parameters.adaptation_parameters["task_cores_per_node"] = int(
            job_parameters.eggroll_run.get("eggroll.session.processors.per.node",
                                           cls.adapt_task_cores_per_node(
                                               create_initiator_baseline, task_cores,
                                               job_parameters.adaptation_parameters["task_nodes"])))
        if not create_initiator_baseline:
            job_parameters.eggroll_run["eggroll.session.processors.per.node"] = \
                job_parameters.adaptation_parameters["task_cores_per_node"]
    elif job_parameters.computing_engine == ComputingEngine.SPARK:
        job_parameters.adaptation_parameters["task_nodes"] = int(
            job_parameters.spark_run.get("num-executors", computing_engine_info.f_nodes))
        job_parameters.adaptation_parameters["task_cores_per_node"] = int(
            job_parameters.spark_run.get("executor-cores",
                                         cls.adapt_task_cores_per_node(
                                             create_initiator_baseline, task_cores,
                                             job_parameters.adaptation_parameters["task_nodes"])))
        if not create_initiator_baseline:
            job_parameters.spark_run["num-executors"] = job_parameters.adaptation_parameters["task_nodes"]
            job_parameters.spark_run["executor-cores"] = job_parameters.adaptation_parameters["task_cores_per_node"]
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                      role=task_info["role"],
                                                      party_id=task_info["party_id"])
        task_parameters = RunParameters(**job_parameters)
    cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                     task_parameters.adaptation_parameters["task_nodes"]
    memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                      task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                      role=task_info["role"],
                                                      party_id=task_info["party_id"])
        task_parameters = RunParameters(**job_parameters)
    if task_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
        cores_per_task = 0
        memory_per_task = 0
    elif task_info["role"] in IGNORE_RESOURCE_ROLES and \
            task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
        cores_per_task = 0
        memory_per_task = 0
    else:
        cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                         task_parameters.adaptation_parameters["task_nodes"]
        memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                          task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task
def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
    if not job_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=job_id, role=role, party_id=party_id)
        job_parameters = RunParameters(**job_parameters)
    if job_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
        cores = 0
        memory = 0
    elif role in IGNORE_RESOURCE_ROLES and \
            job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
        cores = 0
        memory = 0
    else:
        cores = job_parameters.adaptation_parameters["task_cores_per_node"] * \
                job_parameters.adaptation_parameters["task_nodes"] * \
                job_parameters.task_parallelism
        memory = job_parameters.adaptation_parameters["task_memory_per_node"] * \
                 job_parameters.adaptation_parameters["task_nodes"] * \
                 job_parameters.task_parallelism
    return job_parameters.computing_engine, cores, memory
def get_common_parameters(self):
    if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
        if "common" not in self.job_runtime_conf["job_parameters"]:
            raise RuntimeError("the configuration format for dsl version 2 must be job_parameters:common")
        job_parameters = RunParameters(**self.job_runtime_conf['job_parameters']['common'])
        self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
    else:
        if "processors_per_node" in self.job_runtime_conf['job_parameters']:
            self.job_runtime_conf['job_parameters']["eggroll_run"] = \
                {"eggroll.session.processors.per.node": self.job_runtime_conf['job_parameters']["processors_per_node"]}
        job_parameters = RunParameters(**self.job_runtime_conf['job_parameters'])
        self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
    return job_parameters
def job_engine_support_parameters(cls, job_parameters: RunParameters):
    computing_engine_info = ResourceManager.get_engine_registration_info(engine_type=EngineType.COMPUTING,
                                                                         engine_name=job_parameters.computing_engine)
    job_parameters.adaptation_parameters = {
        "task_nodes": 0,
        "task_cores_per_node": 0,
        "task_memory_per_node": 0,
    }
    if job_parameters.computing_engine in {ComputingEngine.STANDALONE, ComputingEngine.EGGROLL}:
        job_parameters.adaptation_parameters["task_nodes"] = computing_engine_info.f_nodes
        job_parameters.adaptation_parameters["task_cores_per_node"] = int(
            job_parameters.eggroll_run.get("eggroll.session.processors.per.node", DEFAULT_TASK_CORES_PER_NODE))
        job_parameters.eggroll_run["eggroll.session.processors.per.node"] = \
            job_parameters.adaptation_parameters["task_cores_per_node"]
    elif job_parameters.computing_engine == ComputingEngine.SPARK:
        job_parameters.adaptation_parameters["task_nodes"] = int(
            job_parameters.spark_run.get("num-executors", computing_engine_info.f_nodes))
        job_parameters.spark_run["num-executors"] = job_parameters.adaptation_parameters["task_nodes"]
        job_parameters.adaptation_parameters["task_cores_per_node"] = int(
            job_parameters.spark_run.get("executor-cores", DEFAULT_TASK_CORES_PER_NODE))
        job_parameters.spark_run["executor-cores"] = job_parameters.adaptation_parameters["task_cores_per_node"]
def adapt_engine_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    computing_engine_info = ResourceManager.get_engine_registration_info(engine_type=EngineType.COMPUTING,
                                                                         engine_name=job_parameters.computing_engine)
    if create_initiator_baseline:
        job_parameters.adaptation_parameters = {
            "task_nodes": 0,
            "task_cores_per_node": 0,
            "task_memory_per_node": 0,
            # request_task_cores is based on the initiator and distributed to all parties,
            # using the job conf parameters or the initiator fateflow server default settings
            "request_task_cores": int(job_parameters.task_cores) if job_parameters.task_cores else DEFAULT_TASK_CORES,
            "if_initiator_baseline": True
        }
    else:
        # use initiator baseline
        if role == "arbiter":
            job_parameters.adaptation_parameters["request_task_cores"] = 1
        elif "request_task_cores" not in job_parameters.adaptation_parameters:
            # compatibility with 1.5.0
            job_parameters.adaptation_parameters["request_task_cores"] = \
                job_parameters.adaptation_parameters["task_nodes"] * \
                job_parameters.adaptation_parameters["task_cores_per_node"]
        job_parameters.adaptation_parameters["if_initiator_baseline"] = False
    adaptation_parameters = job_parameters.adaptation_parameters
    if job_parameters.computing_engine in {ComputingEngine.STANDALONE, ComputingEngine.EGGROLL}:
        adaptation_parameters["task_nodes"] = computing_engine_info.f_nodes
        if int(job_parameters.eggroll_run.get("eggroll.session.processors.per.node", 0)) > 0:
            adaptation_parameters["task_cores_per_node"] = int(
                job_parameters.eggroll_run["eggroll.session.processors.per.node"])
        else:
            adaptation_parameters["task_cores_per_node"] = max(
                1, int(adaptation_parameters["request_task_cores"] / adaptation_parameters["task_nodes"]))
        if not create_initiator_baseline:
            # set the adaptation parameters to the actual engine operation parameters
            job_parameters.eggroll_run["eggroll.session.processors.per.node"] = \
                adaptation_parameters["task_cores_per_node"]
    elif job_parameters.computing_engine in {ComputingEngine.SPARK, ComputingEngine.LINKIS_SPARK}:
        adaptation_parameters["task_nodes"] = int(
            job_parameters.spark_run.get("num-executors", computing_engine_info.f_nodes))
        if int(job_parameters.spark_run.get("executor-cores", 0)) > 0:
            adaptation_parameters["task_cores_per_node"] = int(job_parameters.spark_run["executor-cores"])
        else:
            adaptation_parameters["task_cores_per_node"] = max(
                1, int(adaptation_parameters["request_task_cores"] / adaptation_parameters["task_nodes"]))
        if not create_initiator_baseline:
            # set the adaptation parameters to the actual engine operation parameters
            job_parameters.spark_run["num-executors"] = adaptation_parameters["task_nodes"]
            job_parameters.spark_run["executor-cores"] = adaptation_parameters["task_cores_per_node"]
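# Illustrative sketch (not part of the original module): how a participant
# with a different node count than the initiator absorbs the requested cores
# in the adaptation above. With a hypothetical request of 8 cores spread over
# 3 local nodes, integer division floors to 2 cores per node, and the
# max(1, ...) guard keeps at least one core per node.
request_task_cores_example = 8
task_nodes_example = 3
task_cores_per_node_example = max(1, int(request_task_cores_example / task_nodes_example))
assert task_cores_per_node_example == 2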
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"cannot find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf_on_party,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} "
                f"is {task.f_status}, pass rerun")
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_initiator_role, job.f_initiator_party_id,
                                                   RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
        status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
        if status:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
        else:
            schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
    else:
        FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def submit(cls, job_data, job_id=None):
    if not job_id:
        job_id = job_utils.generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_initiator = job_runtime_conf['initiator']
    job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
    cls.backend_compatibility(job_parameters=job_parameters)
    job_utils.check_job_runtime_conf(job_runtime_conf)
    if job_parameters.job_type != 'predict':
        # generate job model info
        job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
        job_parameters.model_version = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                          model_id=job_parameters.model_id, model_version=job_parameters.model_version)
        pipeline_model = tracker.get_output_model('pipeline')
        if not job_dsl:
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = job_utils.save_job_conf(job_id=job_id,
                                        job_dsl=job_dsl,
                                        job_runtime_conf=job_runtime_conf,
                                        train_runtime_conf=train_runtime_conf,
                                        pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_dsl = job_dsl
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    job.f_train_runtime_conf = train_runtime_conf
    job.f_roles = job_runtime_conf['role']
    job.f_work_mode = job_parameters.work_mode
    job.f_initiator_role = job_initiator['role']
    job.f_initiator_party_id = job_initiator['party_id']
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                   runtime_conf=job_runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    cls.adapt_job_parameters(job_parameters=job_parameters)
    # update runtime conf
    job_runtime_conf["job_parameters"] = job_parameters.to_dict()
    job.f_runtime_conf = job_runtime_conf
    status_code, response = FederatedScheduler.create_job(job=job)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception("create job failed: {}".format(response))
    if job_parameters.work_mode == WorkMode.CLUSTER:
        # Save the status information of all participants in the initiator for scheduling
        for role, party_ids in job_runtime_conf["role"].items():
            for party_id in party_ids:
                if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                    continue
                JobController.initialize_tasks(job_id, role, party_id, False,
                                               job_initiator, job_parameters, dsl_parser)
    # push into queue
    try:
        JobQueue.create_event(job_id=job_id, initiator_role=initiator_role,
                              initiator_party_id=initiator_party_id)
    except Exception as e:
        raise Exception(f'push job into queue failed:\n{e}')
    schedule_logger(job_id).info('submit job successfully, job id is {}, model id is {}'.format(
        job.f_job_id, job_parameters.model_id))
    board_url = "http://{}:{}{}".format(
        ServiceUtils.get_item("fateboard", "host"),
        ServiceUtils.get_item("fateboard", "port"),
        FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = job_utils.get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
        {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url
def adapt_job_parameters(cls, job_parameters: RunParameters):
    if job_parameters.task_parallelism is None:
        job_parameters.task_parallelism = DEFAULT_TASK_PARALLELISM
    if job_parameters.federated_status_collect_type is None:
        job_parameters.federated_status_collect_type = DEFAULT_FEDERATED_STATUS_COLLECT_TYPE
    ResourceManager.job_engine_support_parameters(job_parameters=job_parameters)
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    schedule_logger(job_id).info('try to start job {} task {} {} on {} {} executor subprocess'.format(
        job_id, task_id, task_version, role, party_id))
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id,
                                component_name, task_id, task_version)
        os.makedirs(task_dir, exist_ok=True)
        task_parameters_path = os.path.join(task_dir, 'task_parameters.json')
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        with open(task_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))
        run_parameters = RunParameters(**run_parameters_dict)
        schedule_logger(job_id=job_id).info(f"use computing engine {run_parameters.computing_engine}")
        if run_parameters.computing_engine in {ComputingEngine.EGGROLL, ComputingEngine.STANDALONE}:
            process_cmd = [
                sys.executable,
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ]
        elif run_parameters.computing_engine == ComputingEngine.SPARK:
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]
            # additional configs
            spark_submit_config = run_parameters.spark_run
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")
            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ])
        else:
            raise ValueError(f"computing engine {run_parameters.computing_engine} is not supported")
        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id),
                                    role, party_id, component_name)
        schedule_logger(job_id).info('job {} task {} {} on {} {} executor subprocess is ready'.format(
            job_id, task_id, task_version, role, party_id))
        p = job_utils.run_subprocess(job_id=job_id, config_dir=task_dir,
                                     process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_info["party_status"] = TaskStatus.RUNNING
            # task_info["run_pid"] = p.pid
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        else:
            task_info["party_status"] = TaskStatus.FAILED
    except Exception as e:
        schedule_logger(job_id).exception(e)
        task_info["party_status"] = TaskStatus.FAILED
    finally:
        try:
            cls.update_task(task_info=task_info)
            cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info('job {} task {} {} on {} {} executor subprocess start {}'.format(
            job_id, task_id, task_version, role, party_id,
            "success" if task_executor_process_start_status else "failed"))
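# Illustrative sketch (not part of the original module): the shape of the
# spark-submit command assembled in the SPARK branch above, from a
# hypothetical spark_run config. Only the "conf" key expands into repeated
# --conf flags; every other key becomes a single --key=value argument.
# The binary path and task name below are made up.
spark_run_example = {"num-executors": 2, "executor-cores": 2, "conf": {"spark.driver.memory": "2g"}}
process_cmd_example = ["/opt/spark/bin/spark-submit", "--name=task_1#guest"]
for k, v in spark_run_example.items():
    if k != "conf":
        process_cmd_example.append(f"--{k}={v}")
for ck, cv in spark_run_example.get("conf", {}).items():
    process_cmd_example.extend(["--conf", f"{ck}={cv}"])
# -> [..., "--num-executors=2", "--executor-cores=2", "--conf", "spark.driver.memory=2g"]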
def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
    schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
    dest_user = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {}).get("user", '')
    user = {}
    src_party_id = int(job_info.get('src_party_id')) if job_info.get('src_party_id') else 0
    src_user = dsl_parser.get_job_parameters().get(job_info.get('src_role'), {}) \
        .get(src_party_id, {}).get("user", '')
    for _role, party_id_item in dsl_parser.get_job_parameters().items():
        user[_role] = {}
        for _party_id, _parameters in party_id_item.items():
            user[_role][_party_id] = _parameters.get("user", "")
    schedule_logger(job_id).info('job user:{}'.format(user))
    if USE_DATA_AUTHENTICATION:
        job_args = dsl_parser.get_args_input()
        schedule_logger(job_id).info('job args:{}'.format(job_args))
        dataset_dict = cls.get_dataset(False, role, party_id, runtime_conf.get("role"), job_args)
        dataset_list = []
        if dataset_dict.get(role, {}).get(party_id):
            for k, v in dataset_dict[role][party_id].items():
                dataset_list.append({"namespace": v.split('.')[0], "table_name": v.split('.')[1]})
        data_authentication_check(src_role=job_info.get('src_role'),
                                  src_party_id=job_info.get('src_party_id'),
                                  src_user=src_user,
                                  dest_user=dest_user,
                                  dataset_list=dataset_list)
    job_parameters = RunParameters(**job_parameters)
    # save new job into db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.WAITING
    job_info["user_id"] = dest_user
    job_info["src_user"] = src_user
    job_info["user"] = user
    # this party configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
    engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
    cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id,
                         engines_info=engines_info)
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            job_dsl=dsl,
                            job_runtime_conf=runtime_conf,
                            job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id, run_on_this_party=True,
                         initiator_role=job_info["initiator_role"],
                         initiator_party_id=job_info["initiator_party_id"],
                         job_parameters=job_parameters, dsl_parser=dsl_parser)
    job_parameters = job_info['runtime_conf_on_party']['job_parameters']
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id,
                               job_parameters=job_parameters, roles=roles,
                               is_initiator=is_initiator, dsl_parser=dsl_parser)
    JobSaver.create_job(job_info=job_info)
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    PrivilegeAuth.authentication_component(job_dsl,
                                           src_party_id=kwargs.get('src_party_id'),
                                           src_role=kwargs.get('src_role'),
                                           party_id=party_id,
                                           component_name=component_name)
    schedule_logger(job_id).info('try to start job {} task {} {} on {} {} executor subprocess'.format(
        job_id, task_id, task_version, role, party_id))
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
        "party_status": TaskStatus.RUNNING
    }
    is_failed = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id,
                                component_name, task_id, task_version)
        os.makedirs(task_dir, exist_ok=True)
        task_parameters_path = os.path.join(task_dir, 'task_parameters.json')
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        run_parameters_dict["src_user"] = kwargs.get("src_user")
        with open(task_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))
        run_parameters = RunParameters(**run_parameters_dict)
        schedule_logger(job_id=job_id).info(f"use computing engine {run_parameters.computing_engine}")
        subprocess = True
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}
        backend_engine = build_engine(run_parameters.computing_engine)
        status = backend_engine.run(job_id=job_id,
                                    component_name=component_name,
                                    task_id=task_id,
                                    task_version=task_version,
                                    role=role,
                                    party_id=party_id,
                                    task_parameters_path=task_parameters_path,
                                    run_parameters=run_parameters,
                                    task_info=task_info,
                                    user_name=kwargs.get("user_id"))
        if status:
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        else:
            is_failed = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        is_failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            cls.update_task_status(task_info=task_info)
            if is_failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info('job {} task {} {} on {} {} executor subprocess start {}'.format(
            job_id, task_id, task_version, role, party_id,
            "success" if task_executor_process_start_status else "failed"))
def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
    schedule_logger(job_id=job_id).info(
        f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
    jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    if jobs:
        job = jobs[0]
    else:
        raise RuntimeError(f"cannot find job {job_id} on initiator {initiator_role} {initiator_party_id}")
    if component_name != job_utils.job_virtual_component_name():
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id,
                                    component_name=component_name)
    else:
        tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
    job_can_rerun = False
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                   runtime_conf=job.f_runtime_conf,
                                                   train_runtime_conf=job.f_train_runtime_conf)
    for task in tasks:
        if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
            if task.f_status == TaskStatus.WAITING:
                job_can_rerun = True
            schedule_logger(job_id=job_id).info(
                f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} "
                f"is {task.f_status}, pass rerun")
        else:
            # stop old version task
            FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
            FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
            # create new version task
            task.f_task_version = task.f_task_version + 1
            task.f_run_pid = None
            task.f_run_ip = None
            FederatedScheduler.create_task(job=job, task=task)
            # save the status information of all participants in the initiator for scheduling
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version}")
            for _role, _party_ids in job.f_runtime_conf["role"].items():
                for _party_id in _party_ids:
                    if _role == initiator_role and _party_id == initiator_party_id:
                        continue
                    JobController.initialize_tasks(job_id, _role, _party_id, False,
                                                   job.f_runtime_conf["initiator"],
                                                   RunParameters(**job.f_runtime_conf["job_parameters"]),
                                                   dsl_parser,
                                                   component_name=task.f_component_name,
                                                   task_version=task.f_task_version)
            schedule_logger(job_id=job_id).info(
                f"create task {task.f_task_id} new version {task.f_task_version} successfully")
            job_can_rerun = True
    if job_can_rerun:
        if EndStatus.contains(job.f_status):
            job.f_status = JobStatus.WAITING
            job.f_end_time = None
            job.f_elapsed = None
            job.f_progress = 0
            schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
            status, response = FederatedScheduler.sync_job_status(job=job)
            if status == FederatedSchedulingStatusCode.SUCCESS:
                FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                JobQueue.create_event(job_id=job_id, initiator_role=initiator_role,
                                      initiator_party_id=initiator_party_id)
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
            else:
                schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
        else:
            # status updates may be delayed, and with a very small probability
            # they will be executed after the rerun command
            schedule_logger(job_id=job_id).info(
                f"job {job_id} status is {job.f_status}, will run new version of waiting task")
    else:
        schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
def run_task(cls):
    task_info = {}
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-v', '--task_version', required=True, type=int, help="task version")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=int, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task parameters")
        parser.add_argument('--run_ip', help="run ip", type=str)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(JOB_SERVER_HOST=args.job_server.split(':')[0],
                                      HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        task_version = args.task_version
        role = args.role
        party_id = args.party_id
        executor_pid = os.getpid()
        task_info.update({
            "job_id": job_id,
            "component_name": component_name,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
            "run_ip": args.run_ip,
            "run_pid": executor_pid
        })
        start_time = current_timestamp()
        job_conf = job_utils.get_job_conf(job_id, role)
        job_dsl = job_conf["job_dsl_path"]
        job_runtime_conf = job_conf["job_runtime_conf_path"]
        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=job_conf["train_runtime_conf_path"],
                                                       pipeline_dsl=job_conf["pipeline_dsl_path"])
        party_index = job_runtime_conf["role"][role].index(party_id)
        job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser, job_runtime_conf, role, party_id)
        component = dsl_parser.get_component_info(component_name=component_name)
        component_parameters = component.get_role_parameters()
        component_parameters_on_party = component_parameters[role][party_index] \
            if role in component_parameters else {}
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()
        component_parameters_on_party['output_data_name'] = task_output_dsl.get('data')
        task_parameters = RunParameters(**file_utils.load_json_conf(args.config))
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task_info["party_status"] = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                        append_to_parent_log=True, force=True)
        tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                          component_name=component_name,
                          task_id=task_id, task_version=task_version,
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version,
                          component_module_name=module_name,
                          job_parameters=job_parameters)
        tracker_client = TrackerClient(job_id=job_id, role=role, party_id=party_id,
                                       component_name=component_name,
                                       task_id=task_id, task_version=task_version,
                                       model_id=job_parameters.model_id,
                                       model_version=job_parameters.model_version,
                                       component_module_name=module_name,
                                       job_parameters=job_parameters)
        run_class_paths = component_parameters_on_party.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_info["party_status"] = TaskStatus.RUNNING
        cls.report_task_update_to_driver(task_info=task_info)
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters.work_mode,
                                  COMPUTING_ENGINE=job_parameters.computing_engine,
                                  FEDERATION_ENGINE=job_parameters.federation_engine,
                                  FEDERATED_MODE=job_parameters.federated_mode)
        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
        else:
            session_options = {}
        sess = session.Session(computing_type=job_parameters.computing_engine,
                               federation_type=job_parameters.federation_engine)
        computing_session_id = job_utils.generate_session_id(task_id, task_version, role, party_id)
        sess.init_computing(computing_session_id=computing_session_id, options=session_options)
        federation_session_id = job_utils.generate_task_version_id(task_id, task_version)
        component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
        sess.init_federation(federation_session_id=federation_session_id,
                             runtime_conf=component_parameters_on_party,
                             service_conf=job_parameters.engines_address.get(EngineType.FEDERATION, {}))
        sess.as_default()
        schedule_logger().info('Run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info("Component parameters on party {}".format(component_parameters_on_party))
        schedule_logger().info("Task input dsl {}".format(task_input_dsl))
        task_run_args = cls.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                              task_id=task_id,
                                              task_version=task_version,
                                              job_args=job_args_on_party,
                                              job_parameters=job_parameters,
                                              task_parameters=task_parameters,
                                              input_dsl=task_input_dsl)
        if module_name in {"Upload", "Download", "Reader", "Writer"}:
            task_run_args["job_parameters"] = job_parameters
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker_client)
        run_object.set_task_version_id(
            task_version_id=job_utils.generate_task_version_id(task_id, task_version))
        # add profile logs
        profile.profile_start()
        run_object.run(component_parameters_on_party, task_run_args)
        profile.profile_ends()
        output_data = run_object.save_data()
        if not isinstance(output_data, list):
            output_data = [output_data]
        for index in range(0, len(output_data)):
            data_name = task_output_dsl.get('data')[index] if task_output_dsl.get('data') \
                else '{}'.format(index)
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=output_data[index],
                output_storage_engine=job_parameters.storage_engine,
                output_storage_address=job_parameters.engines_address.get(EngineType.STORAGE, {}))
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(data_name=data_name,
                                             table_namespace=persistent_table_namespace,
                                             table_name=persistent_table_name)
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task_info["party_status"] = TaskStatus.SUCCESS
    except Exception as e:
        task_info["party_status"] = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        try:
            task_info["end_time"] = current_timestamp()
            task_info["elapsed"] = task_info["end_time"] - start_time
            cls.report_task_update_to_driver(task_info=task_info)
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(
            task_id, role, party_id, timestamp_to_date(start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(
            task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
        schedule_logger().info('task {} {} {} takes {}s'.format(
            task_id, role, party_id, int(task_info["elapsed"]) / 1000))
        schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
        print('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id, task_info["party_status"]))
    return task_info
def backend_compatibility(cls, job_parameters: RunParameters):
    # compatible with previous 1.5 versions
    if job_parameters.computing_engine is None or job_parameters.federation_engine is None:
        if job_parameters.work_mode is None or job_parameters.backend is None:
            raise RuntimeError("unable to find compatible backend engines")
        work_mode = WorkMode(job_parameters.work_mode)
        backend = Backend(job_parameters.backend)
        if backend == Backend.EGGROLL:
            if work_mode == WorkMode.CLUSTER:
                job_parameters.computing_engine = ComputingEngine.EGGROLL
                job_parameters.federation_engine = FederationEngine.EGGROLL
                job_parameters.storage_engine = StorageEngine.EGGROLL
            else:
                job_parameters.computing_engine = ComputingEngine.STANDALONE
                job_parameters.federation_engine = FederationEngine.STANDALONE
                job_parameters.storage_engine = StorageEngine.STANDALONE
        elif backend == Backend.SPARK_PULSAR:
            job_parameters.computing_engine = ComputingEngine.SPARK
            job_parameters.federation_engine = FederationEngine.PULSAR
            job_parameters.storage_engine = StorageEngine.HDFS
        elif backend == Backend.SPARK_RABBITMQ:
            job_parameters.computing_engine = ComputingEngine.SPARK
            job_parameters.federation_engine = FederationEngine.RABBITMQ
            job_parameters.storage_engine = StorageEngine.HDFS
            # add mq info
            federation_info = {
                'union_name': string_utils.random_string(4),
                'policy_id': string_utils.random_string(10)
            }
            job_parameters.federation_info = federation_info
        elif backend == Backend.LINKIS_SPARK_RABBITMQ:
            job_parameters.computing_engine = ComputingEngine.LINKIS_SPARK
            job_parameters.federation_engine = FederationEngine.RABBITMQ
            job_parameters.storage_engine = StorageEngine.LINKIS_HIVE
            # add mq info
            federation_info = {
                'union_name': string_utils.random_string(4),
                'policy_id': string_utils.random_string(10)
            }
            job_parameters.federation_info = federation_info
    if job_parameters.federated_mode is None:
        if job_parameters.computing_engine in [ComputingEngine.EGGROLL, ComputingEngine.SPARK,
                                               ComputingEngine.LINKIS_SPARK]:
            job_parameters.federated_mode = FederatedMode.MULTIPLE
        elif job_parameters.computing_engine in [ComputingEngine.STANDALONE]:
            job_parameters.federated_mode = FederatedMode.SINGLE
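# Illustrative sketch (not part of the original module): the backend-to-engine
# mapping applied by the compatibility shim above, written out as plain data.
# Enum members are replaced with strings here for readability; values are
# (computing_engine, federation_engine, storage_engine).
BACKEND_ENGINE_MAP_EXAMPLE = {
    ("EGGROLL", "CLUSTER"):    ("EGGROLL", "EGGROLL", "EGGROLL"),
    ("EGGROLL", "STANDALONE"): ("STANDALONE", "STANDALONE", "STANDALONE"),
    "SPARK_PULSAR":            ("SPARK", "PULSAR", "HDFS"),
    "SPARK_RABBITMQ":          ("SPARK", "RABBITMQ", "HDFS"),       # also generates mq federation_info
    "LINKIS_SPARK_RABBITMQ":   ("LINKIS_SPARK", "RABBITMQ", "LINKIS_HIVE"),  # also generates mq federation_info
}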