def gen_updated_parameters(cls, job_id, initiator_role, initiator_party_id,
                           input_job_parameters, input_component_parameters):
    # todo: check can not update job parameters
    job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                        role=initiator_role,
                                                        party_id=initiator_party_id)
    updated_job_parameters = job_configuration.runtime_conf["job_parameters"]
    updated_component_parameters = job_configuration.runtime_conf["component_parameters"]
    if input_job_parameters:
        if input_job_parameters.get("common"):
            common_job_parameters = RunParameters(**input_job_parameters["common"])
            cls.create_common_job_parameters(job_id=job_id,
                                             initiator_role=initiator_role,
                                             common_job_parameters=common_job_parameters)
            for attr in {"model_id", "model_version"}:
                setattr(common_job_parameters, attr, updated_job_parameters["common"].get(attr))
            updated_job_parameters["common"] = common_job_parameters.to_dict()
        # updating role-specific job parameters is not supported
    updated_components = set()
    if input_component_parameters:
        cls.merge_update(input_component_parameters, updated_component_parameters)
    return updated_job_parameters, updated_component_parameters, list(updated_components)

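# Illustrative sketch only (hypothetical values, plain dicts instead of RunParameters):
# the submitted "common" block replaces the stored one, except that model_id and
# model_version are always carried over from the stored configuration.
stored_common = {"model_id": "guest-9999#host-10000#model", "model_version": "202201010000000000", "task_parallelism": 1}
submitted_common = {"task_parallelism": 2, "model_version": "should_be_ignored"}
new_common = {**submitted_common,
              "model_id": stored_common["model_id"],
              "model_version": stored_common["model_version"]}
assert new_common == {"task_parallelism": 2,
                      "model_id": "guest-9999#host-10000#model",
                      "model_version": "202201010000000000"}
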
def update_parameter(cls, job_id, role, party_id, updated_parameters: dict):
    job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                        role=role,
                                                        party_id=party_id)
    job_parameters = updated_parameters.get("job_parameters")
    component_parameters = updated_parameters.get("component_parameters")
    if job_parameters:
        job_configuration.runtime_conf["job_parameters"] = job_parameters
        job_parameters = RunParameters(**job_parameters["common"])
        cls.create_job_parameters_on_party(role=role,
                                           party_id=party_id,
                                           job_parameters=job_parameters)
        job_configuration.runtime_conf_on_party["job_parameters"] = job_parameters.to_dict()
    if component_parameters:
        job_configuration.runtime_conf["component_parameters"] = component_parameters
        job_configuration.runtime_conf_on_party["component_parameters"] = component_parameters
    job_info = {
        "job_id": job_id,
        "role": role,
        "party_id": party_id,
        "runtime_conf": job_configuration.runtime_conf,
        "runtime_conf_on_party": job_configuration.runtime_conf_on_party,
    }
    JobSaver.update_job(job_info)

def update_common_parameters(self, common_parameters: RunParameters):
    if int(self.job_runtime_conf.get("dsl_version", 1)) == 2:
        self.job_runtime_conf["job_parameters"]["common"] = common_parameters.to_dict()
    else:
        self.job_runtime_conf["job_parameters"] = common_parameters.to_dict()
    return self.job_runtime_conf

def get_job_parameters_dict(self, job_parameters: RunParameters = None):
    if job_parameters:
        if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
            self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
        else:
            self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
    return self.job_runtime_conf['job_parameters']

def clean_task(cls, job_id, task_id, task_version, role, party_id,
               content_type: TaskCleanResourceType):
    status = set()
    if content_type == TaskCleanResourceType.METRICS:
        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          task_id=task_id,
                          task_version=task_version)
        status.add(tracker.clean_metrics())
    elif content_type == TaskCleanResourceType.TABLE:
        jobs = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)
        if jobs:
            job = jobs[0]
            job_parameters = RunParameters(**job.f_runtime_conf_on_party["job_parameters"])
            tracker = Tracker(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              task_id=task_id,
                              task_version=task_version,
                              job_parameters=job_parameters)
            status.add(tracker.clean_task(job.f_runtime_conf_on_party))
    return len(status) == 1 and True in status

def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(job_id=task.f_job_id,
                                                        role=task.f_role,
                                                        party_id=task.f_party_id))
    session_manager_id = generate_session_id(task.f_task_id,
                                             task.f_task_version,
                                             task.f_role,
                                             task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(
            f'start subprocess to stop task sessions {session_manager_id}')
    else:
        schedule_logger(task.f_job_id).info(
            f'task is waiting, skip stopping sessions {session_manager_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id), task.f_role,
                            task.f_party_id, task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        sys.executable or 'python3',
        sys.modules[session_utils.SessionStop.__module__].__file__,
        '--session', session_manager_id,
        '--computing', job_parameters.computing_engine,
        '--federation', job_parameters.federation_engine,
        '--storage', job_parameters.storage_engine,
        '-c', 'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=task_dir,
                                     process_cmd=process_cmd)
    p.wait()
    p.poll()

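# A minimal sketch of the stop-mode choice made above (the statuses are hypothetical strings
# standing in for TaskStatus/JobStatus members, and session_stop_mode is not a fate_flow helper):
def session_stop_mode(task_status):
    if task_status == "waiting":
        return None                 # waiting tasks are skipped, no subprocess is started
    return "stop" if task_status == "success" else "kill"

assert session_stop_mode("success") == "stop"
assert session_stop_mode("failed") == "kill"
assert session_stop_mode("waiting") is None
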
def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    module_name = get_component_module(component_name, job.f_dsl)
    if 'reader' in module_name.lower():
        return job.f_runtime_conf.get("component_parameters", {}).get("role", {}).get(
            job.f_role, {}).get(
            str(job.f_roles.get(job.f_role).index(int(job.f_party_id)))).get(component_name)
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser=dsl_parser,
                                                           job_runtime_conf=job.f_runtime_conf,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role, job.f_party_id)
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(
        job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id,
        task_id=None, task_version=None,
        job_args=job_args_on_party,
        job_parameters=job_parameters,
        task_parameters=task_parameters,
        input_dsl=task_input_dsl,
        get_input_table=True)
    return component_input_table

def calculate_job_resource(cls, job_parameters: RunParameters = None,
                           job_id=None, role=None, party_id=None):
    if not job_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=job_id,
                                                      role=role,
                                                      party_id=party_id)
        job_parameters = RunParameters(**job_parameters)
    cores = 0
    memory = 0
    if not (job_parameters.computing_engine in IGNORE_RESOURCE_COMPUTING_ENGINE or
            (role in IGNORE_RESOURCE_ROLES and
             job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
        cores = (int(job_parameters.adaptation_parameters["task_cores_per_node"] or 0) *
                 int(job_parameters.adaptation_parameters["task_nodes"] or 0) *
                 int(job_parameters.task_parallelism or 0))
        memory = (int(job_parameters.adaptation_parameters["task_memory_per_node"] or 0) *
                  int(job_parameters.adaptation_parameters["task_nodes"] or 0) *
                  int(job_parameters.task_parallelism or 0))
    return job_parameters.computing_engine, cores, memory

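# A minimal sketch (hypothetical numbers) of the resource arithmetic above: a job reserves
# task_cores_per_node * task_nodes per task, multiplied by how many tasks may run in parallel.
adaptation_parameters = {"task_cores_per_node": 4, "task_nodes": 2, "task_memory_per_node": 0}
task_parallelism = 2
cores = adaptation_parameters["task_cores_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism
memory = adaptation_parameters["task_memory_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism
assert (cores, memory) == (16, 0)
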
def adapt_job_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    ResourceManager.adapt_engine_parameters(role=role,
                                            job_parameters=job_parameters,
                                            create_initiator_baseline=create_initiator_baseline)
    if create_initiator_baseline:
        if job_parameters.task_parallelism is None:
            job_parameters.task_parallelism = JobDefaultConfig.task_parallelism
        if job_parameters.federated_status_collect_type is None:
            job_parameters.federated_status_collect_type = JobDefaultConfig.federated_status_collect_type
    if create_initiator_baseline and not job_parameters.computing_partitions:
        job_parameters.computing_partitions = (
            job_parameters.adaptation_parameters["task_cores_per_node"] *
            job_parameters.adaptation_parameters["task_nodes"])

def get_job_engines_address(cls, job_parameters: RunParameters):
    engines_info = {}
    engine_list = [
        (EngineType.COMPUTING, job_parameters.computing_engine),
        (EngineType.FEDERATION, job_parameters.federation_engine),
        (EngineType.STORAGE, job_parameters.storage_engine),
    ]
    for engine_type, engine_name in engine_list:
        engine_info = ResourceManager.get_engine_registration_info(engine_type=engine_type,
                                                                   engine_name=engine_name)
        job_parameters.engines_address[engine_type] = engine_info.f_engine_config if engine_info else {}
        engines_info[engine_type] = engine_info
    return engines_info

def get_common_parameters(self):
    if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
        job_parameters = RunParameters(
            **self.job_runtime_conf.get("job_parameters", {}).get("common", {}))
        self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
    else:
        if "processors_per_node" in self.job_runtime_conf['job_parameters']:
            self.job_runtime_conf['job_parameters']["eggroll_run"] = {
                "eggroll.session.processors.per.node":
                    self.job_runtime_conf['job_parameters']["processors_per_node"]
            }
        job_parameters = RunParameters(**self.job_runtime_conf['job_parameters'])
        self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
    return job_parameters

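# Illustrative sketch (hypothetical values) of the two layouts handled above: DSL v2 nests
# the common job parameters under "common", while DSL v1 keeps them flat and its legacy
# "processors_per_node" is translated into an eggroll_run option before building RunParameters.
runtime_conf_v2 = {"dsl_version": 2,
                   "job_parameters": {"common": {"task_parallelism": 1}}}
runtime_conf_v1 = {"dsl_version": 1,
                   "job_parameters": {"processors_per_node": 4}}
# v1 conversion performed above:
# job_parameters["eggroll_run"] = {"eggroll.session.processors.per.node": 4}
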
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                      role=task_info["role"],
                                                      party_id=task_info["party_id"])
        task_parameters = RunParameters(**job_parameters)
    if task_parameters.computing_engine in IGNORE_RESOURCE_COMPUTING_ENGINE:
        cores_per_task = 0
        memory_per_task = 0
    elif task_info["role"] in IGNORE_RESOURCE_ROLES and \
            task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
        cores_per_task = 0
        memory_per_task = 0
    else:
        cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                         task_parameters.adaptation_parameters["task_nodes"]
        memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                          task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task

def create_new_version_task(cls, job, task, dsl_parser, auto):
    # stop the old version of the task
    FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
    FederatedScheduler.clean_task(job=job, task=task,
                                  content_type=TaskCleanResourceType.METRICS)
    # create the new version of the task
    task.f_task_version = task.f_task_version + 1
    if auto:
        task.f_auto_retries = task.f_auto_retries - 1
    task.f_run_pid = None
    task.f_run_ip = None
    # todo: FederatedScheduler.create_task and JobController.initialize_tasks will create the task twice
    status_code, response = FederatedScheduler.create_task(job=job, task=task)
    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        raise Exception(f"create {task.f_task_id} new version failed")
    # create task records in the db so the initiator keeps information about all participants for scheduling
    for _role in response:
        for _party_id in response[_role]:
            if _role == job.f_initiator_role and _party_id == job.f_initiator_party_id:
                continue
            JobController.initialize_tasks(
                job_id=job.f_job_id,
                role=_role,
                party_id=_party_id,
                run_on_this_party=False,
                initiator_role=job.f_initiator_role,
                initiator_party_id=job.f_initiator_party_id,
                job_parameters=RunParameters(**job.f_runtime_conf_on_party["job_parameters"]),
                dsl_parser=dsl_parser,
                components=[task.f_component_name],
                task_version=task.f_task_version,
                auto_retries=task.f_auto_retries)
    schedule_logger(job.f_job_id).info(
        f"create task {task.f_task_id} new version {task.f_task_version} successfully")

def start_task_worker(cls, worker_name, task: Task, task_parameters: RunParameters = None,
                      executable: list = None, extra_env: dict = None, **kwargs):
    worker_id, config_dir, log_dir = cls.get_process_dirs(worker_name=worker_name,
                                                          job_id=task.f_job_id,
                                                          role=task.f_role,
                                                          party_id=task.f_party_id,
                                                          task=task)
    session_id = job_utils.generate_session_id(task.f_task_id, task.f_task_version,
                                               task.f_role, task.f_party_id)
    federation_session_id = job_utils.generate_task_version_id(task.f_task_id, task.f_task_version)
    info_kwargs = {}
    specific_cmd = []
    if worker_name is WorkerName.TASK_EXECUTOR:
        from fate_flow.worker.task_executor import TaskExecutor
        module_file_path = sys.modules[TaskExecutor.__module__].__file__
    else:
        raise Exception(f"not support {worker_name} worker")
    if task_parameters is None:
        task_parameters = RunParameters(
            **job_utils.get_job_parameters(task.f_job_id, task.f_role, task.f_party_id))
    config = task_parameters.to_dict()
    config["src_user"] = kwargs.get("src_user")
    config_path, result_path = cls.get_config(config_dir=config_dir, config=config, log_dir=log_dir)
    if executable:
        process_cmd = executable
    else:
        process_cmd = [sys.executable or "python3"]
    common_cmd = [
        module_file_path,
        "--job_id", task.f_job_id,
        "--component_name", task.f_component_name,
        "--task_id", task.f_task_id,
        "--task_version", task.f_task_version,
        "--role", task.f_role,
        "--party_id", task.f_party_id,
        "--config", config_path,
        "--result", result_path,
        "--log_dir", log_dir,
        "--parent_log_dir", os.path.dirname(log_dir),
        "--worker_id", worker_id,
        "--run_ip", RuntimeConfig.JOB_SERVER_HOST,
        "--job_server", f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
        "--session_id", session_id,
        "--federation_session_id", federation_session_id,
    ]
    process_cmd.extend(common_cmd)
    process_cmd.extend(specific_cmd)
    env = cls.get_env(task.f_job_id, task.f_provider_info)
    if extra_env:
        env.update(extra_env)
    schedule_logger(task.f_job_id).info(
        f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} "
        f"{worker_name} worker subprocess is ready")
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=config_dir,
                                     process_cmd=process_cmd,
                                     added_env=env,
                                     log_dir=log_dir,
                                     cwd_dir=config_dir,
                                     process_name=worker_name.value,
                                     process_id=worker_id)
    cls.save_worker_info(task=task,
                         worker_name=worker_name,
                         worker_id=worker_id,
                         run_ip=RuntimeConfig.JOB_SERVER_HOST,
                         run_pid=p.pid,
                         config=config,
                         cmd=process_cmd,
                         **info_kwargs)
    return {"run_pid": p.pid, "worker_id": worker_id, "cmd": process_cmd}

def create_job(cls, job_id, role, party_id, job_info):
    # parse job configuration
    dsl = job_info['dsl']
    runtime_conf = job_info['runtime_conf']
    train_runtime_conf = job_info['train_runtime_conf']
    if USE_AUTHENTICATION:
        authentication_check(src_role=job_info.get('src_role', None),
                             src_party_id=job_info.get('src_party_id', None),
                             dsl=dsl,
                             runtime_conf=runtime_conf,
                             role=role,
                             party_id=party_id)
    dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                   runtime_conf=runtime_conf,
                                                   train_runtime_conf=train_runtime_conf)
    job_parameters = dsl_parser.get_job_parameters(runtime_conf)
    schedule_logger(job_id).info('job parameters: {}'.format(job_parameters))
    dest_user = job_parameters.get(role, {}).get(party_id, {}).get('user', '')
    user = {}
    src_party_id = int(job_info['src_party_id']) if job_info.get('src_party_id') else 0
    src_role = job_info.get('src_role', '')
    src_user = job_parameters.get(src_role, {}).get(src_party_id, {}).get('user', '') if src_role else ''
    for _role, party_id_item in job_parameters.items():
        user[_role] = {}
        for _party_id, _parameters in party_id_item.items():
            user[_role][_party_id] = _parameters.get("user", "")
    schedule_logger(job_id).info('job user: {}'.format(user))
    if USE_DATA_AUTHENTICATION:
        job_args = dsl_parser.get_args_input()
        schedule_logger(job_id).info('job args: {}'.format(job_args))
        dataset_dict = cls.get_dataset(False, role, party_id, runtime_conf.get("role"), job_args)
        dataset_list = []
        if dataset_dict.get(role, {}).get(party_id):
            for k, v in dataset_dict[role][party_id].items():
                dataset_list.append({"namespace": v.split('.')[0],
                                     "table_name": v.split('.')[1]})
        data_authentication_check(src_role=job_info.get('src_role'),
                                  src_party_id=job_info.get('src_party_id'),
                                  src_user=src_user,
                                  dest_user=dest_user,
                                  dataset_list=dataset_list)
    job_parameters = RunParameters(**job_parameters.get(role, {}).get(party_id, {}))

    # save the new job into the db
    if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
        is_initiator = True
    else:
        is_initiator = False
    job_info["status"] = JobStatus.READY
    job_info["user_id"] = dest_user
    job_info["src_user"] = src_user
    job_info["user"] = user
    # this party's configuration
    job_info["role"] = role
    job_info["party_id"] = party_id
    job_info["is_initiator"] = is_initiator
    job_info["progress"] = 0
    cls.create_job_parameters_on_party(role=role, party_id=party_id, job_parameters=job_parameters)
    # update job parameters on this party
    job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
    JobSaver.create_job(job_info=job_info)
    schedule_logger(job_id).info("start initializing tasks")
    initialized_result, provider_group = cls.initialize_tasks(
        job_id=job_id,
        role=role,
        party_id=party_id,
        run_on_this_party=True,
        initiator_role=job_info["initiator_role"],
        initiator_party_id=job_info["initiator_party_id"],
        job_parameters=job_parameters,
        dsl_parser=dsl_parser)
    schedule_logger(job_id).info("initialize tasks successfully")
    for provider_key, group_info in provider_group.items():
        for cpn in group_info["components"]:
            dsl["components"][cpn]["provider"] = provider_key
    roles = job_info['roles']
    cls.initialize_job_tracker(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               job_parameters=job_parameters,
                               roles=roles,
                               is_initiator=is_initiator,
                               dsl_parser=dsl_parser)
    job_utils.save_job_conf(job_id=job_id,
                            role=role,
                            party_id=party_id,
                            dsl=dsl,
                            runtime_conf=runtime_conf,
                            runtime_conf_on_party=job_info["runtime_conf_on_party"],
                            train_runtime_conf=train_runtime_conf,
                            pipeline_dsl=None)
    return {"components": initialized_result}

def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    PrivilegeAuth.authentication_component(job_dsl,
                                           src_party_id=kwargs.get('src_party_id'),
                                           src_role=kwargs.get('src_role'),
                                           party_id=party_id,
                                           component_name=component_name)
    schedule_logger(job_id).info(
        f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess")
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    is_failed = False
    try:
        task = JobSaver.query_task(task_id=task_id,
                                   task_version=task_version,
                                   role=role,
                                   party_id=party_id)[0]
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        run_parameters_dict["src_user"] = kwargs.get("src_user")
        run_parameters = RunParameters(**run_parameters_dict)

        config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                  component_name, task_id, task_version)
        os.makedirs(config_dir, exist_ok=True)
        run_parameters_path = os.path.join(config_dir, 'task_parameters.json')
        with open(run_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))

        schedule_logger(job_id).info(f"use computing engine {run_parameters.computing_engine}")
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}
        backend_engine = build_engine(run_parameters.computing_engine)
        run_info = backend_engine.run(
            task=task,
            run_parameters=run_parameters,
            run_parameters_path=run_parameters_path,
            config_dir=config_dir,
            log_dir=job_utils.get_job_log_directory(job_id, role, party_id, component_name),
            cwd_dir=job_utils.get_job_directory(job_id, role, party_id, component_name),
            user_name=kwargs.get("user_id"))
        task_info.update(run_info)
        task_info["start_time"] = current_timestamp()
        task_executor_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        is_failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            task_info["party_status"] = TaskStatus.RUNNING
            cls.update_task_status(task_info=task_info)
            if is_failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            "task {} {} on {} {} executor subprocess start {}".format(
                task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))

def adapt_engine_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    computing_engine_info = ResourceManager.get_engine_registration_info(
        engine_type=EngineType.COMPUTING, engine_name=job_parameters.computing_engine)
    if not job_parameters.adaptation_parameters or create_initiator_baseline:
        job_parameters.adaptation_parameters = {
            "task_nodes": 0,
            "task_cores_per_node": 0,
            "task_memory_per_node": 0,
            # request_task_cores is based on the initiator and distributed to all parties,
            # using the job conf parameters or the initiator fateflow server default settings
            "request_task_cores": int(job_parameters.task_cores) if job_parameters.task_cores else JobDefaultConfig.task_cores,
            "if_initiator_baseline": True
        }
    else:
        # use the initiator baseline
        if role == "arbiter":
            job_parameters.adaptation_parameters["request_task_cores"] = 1
        elif "request_task_cores" not in job_parameters.adaptation_parameters:
            # compatibility with 1.5.0
            job_parameters.adaptation_parameters["request_task_cores"] = \
                job_parameters.adaptation_parameters["task_nodes"] * \
                job_parameters.adaptation_parameters["task_cores_per_node"]
        job_parameters.adaptation_parameters["if_initiator_baseline"] = False
    adaptation_parameters = job_parameters.adaptation_parameters

    if job_parameters.computing_engine in {ComputingEngine.STANDALONE, ComputingEngine.EGGROLL}:
        adaptation_parameters["task_nodes"] = computing_engine_info.f_nodes
        if int(job_parameters.eggroll_run.get("eggroll.session.processors.per.node", 0)) > 0:
            adaptation_parameters["task_cores_per_node"] = int(
                job_parameters.eggroll_run["eggroll.session.processors.per.node"])
        else:
            adaptation_parameters["task_cores_per_node"] = max(
                1, int(adaptation_parameters["request_task_cores"] / adaptation_parameters["task_nodes"]))
        if not create_initiator_baseline:
            # set the adaptation parameters as the actual engine operation parameters
            job_parameters.eggroll_run["eggroll.session.processors.per.node"] = \
                adaptation_parameters["task_cores_per_node"]
    elif job_parameters.computing_engine in {ComputingEngine.SPARK, ComputingEngine.LINKIS_SPARK}:
        adaptation_parameters["task_nodes"] = int(
            job_parameters.spark_run.get("num-executors", computing_engine_info.f_nodes))
        if int(job_parameters.spark_run.get("executor-cores", 0)) > 0:
            adaptation_parameters["task_cores_per_node"] = int(job_parameters.spark_run["executor-cores"])
        else:
            adaptation_parameters["task_cores_per_node"] = max(
                1, int(adaptation_parameters["request_task_cores"] / adaptation_parameters["task_nodes"]))
        if not create_initiator_baseline:
            # set the adaptation parameters as the actual engine operation parameters
            job_parameters.spark_run["num-executors"] = adaptation_parameters["task_nodes"]
            job_parameters.spark_run["executor-cores"] = adaptation_parameters["task_cores_per_node"]

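# A minimal sketch (hypothetical numbers) of the core-splitting rule above: the initiator
# baseline requests a total number of cores, and each party divides that request across the
# node count reported by its own computing engine registration.
request_task_cores = 20
task_nodes = 4                                                    # e.g. computing_engine_info.f_nodes
task_cores_per_node = max(1, int(request_task_cores / task_nodes))
assert task_cores_per_node == 5
# For EGGROLL this is written back as eggroll.session.processors.per.node = 5;
# for Spark it becomes spark_run["num-executors"] = 4 and spark_run["executor-cores"] = 5.
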
def set_federated_mode(cls, job_parameters: RunParameters):
    if not job_parameters.federated_mode:
        job_parameters.federated_mode = ENGINES["federated_mode"]

def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info(f"start to save pipeline model on {role} {party_id}")
    job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                        role=role,
                                                        party_id=party_id)
    runtime_conf_on_party = job_configuration.runtime_conf_on_party
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    if job_type == 'predict':
        return
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=job_configuration.dsl,
        runtime_conf=job_configuration.runtime_conf,
        train_runtime_conf=job_configuration.train_runtime_conf)

    components_parameters = {}
    tasks = JobSaver.query_task(job_id=job_id, role=role, party_id=party_id, only_latest=True)
    for task in tasks:
        components_parameters[task.f_component_name] = task.f_component_parameters
    predict_dsl = schedule_utils.fill_inference_dsl(
        dsl_parser,
        origin_inference_dsl=job_configuration.dsl,
        components_parameters=components_parameters)

    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_configuration.dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_configuration.runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)

    tracker = Tracker(job_id=job_id,
                      role=role,
                      party_id=party_id,
                      model_id=model_id,
                      model_version=model_version,
                      job_parameters=RunParameters(**job_parameters))
    tracker.save_pipeline_model(pipeline_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info(f"save pipeline on {role} {party_id} successfully")

def _run_(self):
    # todo: all function calls where errors should be thrown
    args = self.args
    start_time = current_timestamp()
    try:
        LOGGER.info(
            f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task')
        self.report_info.update({
            "job_id": args.job_id,
            "component_name": args.component_name,
            "task_id": args.task_id,
            "task_version": args.task_version,
            "role": args.role,
            "party_id": args.party_id,
            "run_ip": args.run_ip,
            "run_pid": self.run_pid
        })
        operation_client = OperationClient()
        job_configuration = JobConfiguration(**operation_client.get_job_conf(
            args.job_id, args.role, args.party_id, args.component_name,
            args.task_id, args.task_version))
        task_parameters_conf = args.config
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job_configuration.dsl,
            runtime_conf=job_configuration.runtime_conf,
            train_runtime_conf=job_configuration.train_runtime_conf,
            pipeline_dsl=None)

        job_parameters = dsl_parser.get_job_parameters(job_configuration.runtime_conf)
        user_name = job_parameters.get(args.role, {}).get(args.party_id, {}).get("user", '')
        LOGGER.info(f"user name: {user_name}")
        src_user = task_parameters_conf.get("src_user")
        task_parameters = RunParameters(**task_parameters_conf)
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()

        job_args_on_party = TaskExecutor.get_job_args_on_party(
            dsl_parser, job_configuration.runtime_conf_on_party, args.role, args.party_id)
        component = dsl_parser.get_component_info(component_name=args.component_name)
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()

        kwargs = {
            'job_id': args.job_id,
            'role': args.role,
            'party_id': args.party_id,
            'component_name': args.component_name,
            'task_id': args.task_id,
            'task_version': args.task_version,
            'model_id': job_parameters.model_id,
            'model_version': job_parameters.model_version,
            'component_module_name': module_name,
            'job_parameters': job_parameters,
        }
        tracker = Tracker(**kwargs)
        tracker_client = TrackerClient(**kwargs)
        checkpoint_manager = CheckpointManager(**kwargs)

        self.report_info["party_status"] = TaskStatus.RUNNING
        self.report_task_info_to_driver()

        previous_components_parameters = tracker_client.get_model_run_parameters()
        LOGGER.info(
            f"previous_components_parameters:\n{json_dumps(previous_components_parameters, indent=4)}")
        component_provider, component_parameters_on_party, user_specified_parameters = \
            ProviderManager.get_component_run_info(
                dsl_parser=dsl_parser,
                component_name=args.component_name,
                role=args.role,
                party_id=args.party_id,
                previous_components_parameters=previous_components_parameters)
        RuntimeConfig.set_component_provider(component_provider)
        LOGGER.info(
            f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}")
        flow_feeded_parameters = {"output_data_name": task_output_dsl.get("data")}

        # init environment, the process is shared globally
        RuntimeConfig.init_config(COMPUTING_ENGINE=job_parameters.computing_engine,
                                  FEDERATION_ENGINE=job_parameters.federation_engine,
                                  FEDERATED_MODE=job_parameters.federated_mode)

        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
            session_options["python.path"] = os.getenv("PYTHONPATH")
            session_options["python.venv"] = os.getenv("VIRTUAL_ENV")
        else:
            session_options = {}

        sess = session.Session(session_id=args.session_id)
        sess.as_global()
        sess.init_computing(computing_session_id=args.session_id, options=session_options)
        component_parameters_on_party["job_parameters"] = job_parameters.to_dict()
        roles = job_configuration.runtime_conf["role"]
        if set(roles) == {"local"}:
            LOGGER.info("only local roles, skip federation initialization")
        else:
            sess.init_federation(
                federation_session_id=args.federation_session_id,
                runtime_conf=component_parameters_on_party,
                service_conf=job_parameters.engines_address.get(EngineType.FEDERATION, {}))
        LOGGER.info(
            f'run {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} task')
        LOGGER.info(
            f"component parameters on party:\n{json_dumps(component_parameters_on_party, indent=4)}")
        LOGGER.info(f"task input dsl {task_input_dsl}")
        task_run_args, input_table_list = self.get_task_run_args(
            job_id=args.job_id, role=args.role, party_id=args.party_id,
            task_id=args.task_id, task_version=args.task_version,
            job_args=job_args_on_party,
            job_parameters=job_parameters,
            task_parameters=task_parameters,
            input_dsl=task_input_dsl,
        )
        if module_name in {"Upload", "Download", "Reader", "Writer", "Checkpoint"}:
            task_run_args["job_parameters"] = job_parameters
        LOGGER.info(f"task input args {task_run_args}")

        need_run = component_parameters_on_party.get("ComponentParam", {}).get("need_run", True)
        provider_interface = provider_utils.get_provider_interface(provider=component_provider)
        run_object = provider_interface.get(
            module_name,
            ComponentRegistry.get_provider_components(
                provider_name=component_provider.name,
                provider_version=component_provider.version)).get_run_obj(self.args.role)
        flow_feeded_parameters.update({"table_info": input_table_list})
        cpn_input = ComponentInput(
            tracker=tracker_client,
            checkpoint_manager=checkpoint_manager,
            task_version_id=job_utils.generate_task_version_id(args.task_id, args.task_version),
            parameters=component_parameters_on_party["ComponentParam"],
            datasets=task_run_args.get("data", None),
            caches=task_run_args.get("cache", None),
            models=dict(
                model=task_run_args.get("model"),
                isometric_model=task_run_args.get("isometric_model"),
            ),
            job_parameters=job_parameters,
            roles=dict(
                role=component_parameters_on_party["role"],
                local=component_parameters_on_party["local"],
            ),
            flow_feeded_parameters=flow_feeded_parameters,
        )
        profile_log_enabled = False
        try:
            if int(os.getenv("FATE_PROFILE_LOG_ENABLED", "0")) > 0:
                profile_log_enabled = True
        except Exception as e:
            LOGGER.warning(e)
        if profile_log_enabled:
            # add profile logs
            LOGGER.info("profile logging is enabled")
            profile.profile_start()
            cpn_output = run_object.run(cpn_input)
            sess.wait_remote_all_done()
            profile.profile_ends()
        else:
            LOGGER.info("profile logging is disabled")
            cpn_output = run_object.run(cpn_input)
            sess.wait_remote_all_done()

        output_table_list = []
        LOGGER.info(f"task output data {cpn_output.data}")
        for index, data in enumerate(cpn_output.data):
            data_name = task_output_dsl.get('data')[index] if task_output_dsl.get('data') else '{}'.format(index)
            # todo: the token depends on the engine type, maybe it should be in job parameters
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=data,
                output_storage_engine=job_parameters.storage_engine,
                token={"username": user_name})
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(data_name=data_name,
                                             table_namespace=persistent_table_namespace,
                                             table_name=persistent_table_name)
                output_table_list.append({"namespace": persistent_table_namespace,
                                          "name": persistent_table_name})
        self.log_output_data_table_tracker(args.job_id, input_table_list, output_table_list)

        # there is only one model output at the current dsl version
        tracker_client.save_component_output_model(
            model_buffers=cpn_output.model,
            model_alias=task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default',
            user_specified_run_parameters=user_specified_parameters)
        if cpn_output.cache is not None:
            for i, cache in enumerate(cpn_output.cache):
                if cache is None:
                    continue
                name = task_output_dsl.get("cache")[i] if "cache" in task_output_dsl else str(i)
                if isinstance(cache, DataCache):
                    tracker.tracking_output_cache(cache, cache_name=name)
                elif isinstance(cache, tuple):
                    tracker.save_output_cache(
                        cache_data=cache[0],
                        cache_meta=cache[1],
                        cache_name=name,
                        output_storage_engine=job_parameters.storage_engine,
                        output_storage_address=job_parameters.engines_address.get(EngineType.STORAGE, {}),
                        token={"username": user_name})
                else:
                    raise RuntimeError(f"cannot support type {type(cache)} module run object output cache")
        if need_run:
            self.report_info["party_status"] = TaskStatus.SUCCESS
        else:
            self.report_info["party_status"] = TaskStatus.PASS
    except PassError as e:
        self.report_info["party_status"] = TaskStatus.PASS
    except Exception as e:
        traceback.print_exc()
        self.report_info["party_status"] = TaskStatus.FAILED
        LOGGER.exception(e)
    finally:
        try:
            self.report_info["end_time"] = current_timestamp()
            self.report_info["elapsed"] = self.report_info["end_time"] - start_time
            self.report_task_info_to_driver()
        except Exception as e:
            self.report_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            LOGGER.exception(e)
    msg = f"finish {args.component_name} {args.task_id} {args.task_version} on {args.role} {args.party_id} with {self.report_info['party_status']}"
    LOGGER.info(msg)
    print(msg)
    return self.report_info