def upload_spark_dependence(
        cls, job, upload_details,
        storage_engine=FateDependenceStorageEngine.HDFS.value):
    """Register spark dependence packages and launch upload workers.

    :param job: job object; only f_job_id/f_role/f_party_id are read
    :param upload_details: mapping of {version: {dependence_type: provider}}
    :param storage_engine: target storage engine name (defaults to HDFS)
    """
    schedule_logger(
        job.f_job_id).info(f"start upload dependence: {upload_details}")
    for version, type_provider in upload_details.items():
        for dependence_type, provider in type_provider.items():
            # Mark the dependence as uploading before the worker starts so
            # concurrent jobs do not schedule a duplicate upload
            # (status_check=True makes the registry honor existing state).
            storage_meta = {
                "f_storage_engine": storage_engine,
                "f_type": dependence_type,
                "f_version": version,
                "f_upload_status": True
            }
            schedule_logger(job.f_job_id).info(
                f"update dependence storage meta:{storage_meta}")
            DependenceRegistry.save_dependencies_storage_meta(
                storage_meta, status_check=True)
            # The actual upload runs in a separate general worker process;
            # record_upload_process is invoked as the callback with the
            # listed callback_param fields.
            WorkerManager.start_general_worker(
                worker_name=WorkerName.DEPENDENCE_UPLOAD,
                job_id=job.f_job_id,
                role=job.f_role,
                party_id=job.f_party_id,
                provider=provider,
                dependence_type=dependence_type,
                callback=cls.record_upload_process,
                callback_param=["dependence_type", "pid", "provider"])
def insert_metrics_into_db(self, metric_namespace: str, metric_name: str,
                           data_type: int, kv, job_level=False):
    """Bulk-insert metric key/value pairs into the tracking metric table.

    :param metric_namespace: namespace grouping the metrics
    :param metric_name: metric name within the namespace
    :param data_type: numeric type tag stored as f_type
    :param kv: iterable of (key, value) pairs; both sides are b64-serialized
    :param job_level: when True the rows are attributed to the pipeline
        component instead of this tracker's component
    """
    try:
        model_class = self.get_model_class()
        tracking_metric = model_class()
        tracking_metric.f_job_id = self.job_id
        tracking_metric.f_component_name = (
            self.component_name if not job_level
            else job_utils.job_pipeline_component_name())
        tracking_metric.f_task_id = self.task_id
        tracking_metric.f_task_version = self.task_version
        tracking_metric.f_role = self.role
        tracking_metric.f_party_id = self.party_id
        tracking_metric.f_metric_namespace = metric_namespace
        tracking_metric.f_metric_name = metric_name
        tracking_metric.f_type = data_type
        # Template row shared by every pair; the per-pair fields
        # (key/value/create_time) are overridden on a copy below.
        default_db_source = tracking_metric.to_dict()
        tracking_metric_data_source = []
        for k, v in kv:
            db_source = default_db_source.copy()
            db_source['f_key'] = serialize_b64(k)
            db_source['f_value'] = serialize_b64(v)
            db_source['f_create_time'] = current_timestamp()
            tracking_metric_data_source.append(db_source)
        db_utils.bulk_insert_into_db(model_class,
                                     tracking_metric_data_source,
                                     schedule_logger(self.job_id))
    except Exception as e:
        # Best effort: metric persistence failures are logged, not raised.
        schedule_logger(self.job_id).exception(
            "An exception where inserted metric {} of metric namespace: {} to database:\n{}"
            .format(metric_name, metric_namespace, e))
def is_task_executor_process(task: Task, process: psutil.Process):
    """
    check the process if task executor or not by command
    :param task:
    :param process:
    :return:
    """
    try:
        cmdline = process.cmdline()
    except Exception as e:
        # Not sure whether the process is a task executor process, operations processing is required
        schedule_logger(task.f_job_id).warning(e)
        return False

    schedule_logger(task.f_job_id).info(cmdline)
    # A worker id embedded in the command line identifies the process directly.
    if task.f_worker_id and task.f_worker_id in cmdline:
        return True
    # Otherwise the whole command line must match the recorded task command,
    # element by element (task.f_cmd entries may not be strings yet).
    return cmdline == [str(arg) for arg in task.f_cmd]
def report_task_to_initiator(cls, task: Task):
    """Report this party's task state to the job initiator.

    Only non-initiator parties report; the initiator already holds its
    own state locally.

    :param task: task record to report
    :return: True when the initiator acknowledged the report, else False
    """
    # BUGFIX: a party IS the initiator only when BOTH role and party_id
    # match, so "not the initiator" is an OR of the inequalities. The
    # previous `and` silently skipped reporting for any party that shared
    # either the initiator's role or its party id.
    if task.f_role != task.f_initiator_role or task.f_party_id != task.f_initiator_party_id:
        try:
            response = federated_api(
                job_id=task.f_job_id,
                method='POST',
                endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                    task.f_job_id, task.f_component_name, task.f_task_id,
                    task.f_task_version, task.f_role, task.f_party_id),
                src_party_id=task.f_party_id,
                dest_party_id=task.f_initiator_party_id,
                src_role=task.f_role,
                json_body=task.to_human_model_dict(
                    only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                federated_mode=task.f_federated_mode)
        except Exception as e:
            schedule_logger(task.f_job_id).error(
                f"report task to initiator error: {e}")
            return False
        if response["retcode"] != RetCode.SUCCESS:
            retmsg = response["retmsg"]
            schedule_logger(task.f_job_id).error(
                f"report task to initiator error: {retmsg}")
            return False
        else:
            return True
    else:
        return False
def start_session_stop(task):
    """Stop (or kill) a task's computing/federation/storage sessions by
    launching the SessionStop helper script in a subprocess.

    :param task: task whose sessions should be torn down
    """
    job_parameters = RunParameters(**get_job_parameters(
        job_id=task.f_job_id, role=task.f_role, party_id=task.f_party_id))
    session_manager_id = generate_session_id(task.f_task_id,
                                             task.f_task_version,
                                             task.f_role, task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(
            f'start run subprocess to stop task sessions {session_manager_id}')
    else:
        # A waiting task never opened any sessions; nothing to stop.
        schedule_logger(task.f_job_id).info(
            f'task is waiting, pass stop sessions {session_manager_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                            task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        sys.executable or 'python3',
        # Path of the module that defines SessionStop, run as a script.
        sys.modules[session_utils.SessionStop.__module__].__file__,
        '--session', session_manager_id,
        '--computing', job_parameters.computing_engine,
        '--federation', job_parameters.federation_engine,
        '--storage', job_parameters.storage_engine,
        # Graceful 'stop' only on success, otherwise 'kill'.
        # NOTE(review): compares a *task* status against JobStatus.SUCCESS —
        # presumably the enums share the "success" value; confirm.
        '-c', 'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=task_dir,
                                     process_cmd=process_cmd)
    # Block until the helper finishes before returning.
    p.wait()
    p.poll()
def kill_task_all_workers(cls, task: Task):
    """Kill every worker process registered for the given task.

    Failures to kill an individual worker are logged and skipped so the
    remaining workers are still cleaned up.
    """
    logger = schedule_logger(task.f_job_id)
    logger.info(start_log("kill all workers", task=task))
    workers_info = WorkerInfo.query(task_id=task.f_task_id,
                                    task_version=task.f_task_version,
                                    role=task.f_role,
                                    party_id=task.f_party_id)
    for info in workers_info:
        desc = f"kill {info.f_worker_name}({info.f_run_pid})"
        logger.info(start_log(desc, task=task))
        try:
            cls.kill_worker(info)
        except Exception as e:
            # Best effort: a worker that is already gone must not abort
            # the cleanup of the remaining workers.
            logger.warning(failed_log(desc, task=task), exc_info=True)
        else:
            logger.info(successful_log(desc, task=task))
    logger.info(successful_log("kill all workers", task=task))
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    """Pull task status from every party and sync it into the local DB.

    Used by the initiator when pushed status updates may have been missed.
    When a party is unreachable (FEDERATED_ERROR) and set_status is given,
    the local record is forced to set_status.

    :param job: job the task belongs to
    :param initiator_task: the initiator's copy of the task
    :param set_status: status to force for unreachable parties (optional)
    """
    tasks_on_all_party = JobSaver.query_task(
        task_id=initiator_task.f_task_id,
        task_version=initiator_task.f_task_version)
    tasks_status_on_all = set(
        [task.f_status for task in tasks_on_all_party])
    # Nothing to reconcile when all local records agree on a single
    # status and none of them is RUNNING.
    if not len(tasks_status_on_all
               ) > 1 and TaskStatus.RUNNING not in tasks_status_on_all:
        return
    status, federated_response = FederatedScheduler.collect_task(
        job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job.f_job_id).warning(
            f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
        )
    for _role in federated_response.keys():
        for _party_id, party_response in federated_response[_role].items():
            if party_response["retcode"] == RetCode.SUCCESS:
                JobSaver.update_task_status(
                    task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
            elif party_response[
                    "retcode"] == RetCode.FEDERATED_ERROR and set_status:
                tmp_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING
                }
                # Move to RUNNING first so the subsequent transition to
                # the final status passes the status state machine.
                JobSaver.update_task_status(task_info=tmp_task_info)
                tmp_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=tmp_task_info)
def read_metrics_from_db(self, metric_namespace: str, metric_name: str,
                         data_type, job_level=False):
    """Yield deserialized (key, value) metric pairs for this component.

    This is a generator: rows are streamed from the tracking metric table
    and b64-deserialized one at a time.

    :param metric_namespace: namespace to filter on
    :param metric_name: metric name to filter on
    :param data_type: f_type value to filter on
    :param job_level: query the pipeline component instead of this one
    :raises Exception: re-raised after logging on any query failure
    """
    # Removed the dead `metrics = []` / `return metrics` pair: inside a
    # generator the list was never populated and the return value is
    # invisible to normal iteration.
    try:
        tracking_metric_model = self.get_model_class()
        tracking_metrics = tracking_metric_model.select(
            tracking_metric_model.f_key,
            tracking_metric_model.f_value).where(
                tracking_metric_model.f_job_id == self.job_id,
                tracking_metric_model.f_component_name == (
                    self.component_name if not job_level
                    else job_utils.job_pipeline_component_name()),
                tracking_metric_model.f_role == self.role,
                tracking_metric_model.f_party_id == self.party_id,
                tracking_metric_model.f_metric_namespace == metric_namespace,
                tracking_metric_model.f_metric_name == metric_name,
                tracking_metric_model.f_type == data_type)
        for tracking_metric in tracking_metrics:
            yield deserialize_b64(tracking_metric.f_key), deserialize_b64(
                tracking_metric.f_value)
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        raise e
def insert_summary_into_db(self, summary_data: dict, need_serialize=True):
    """Create or update this task's component summary row.

    :param summary_data: summary payload to persist
    :param need_serialize: when True, b64-serialize before storing
        (NOTE(review): the create branch always serializes regardless of
        this flag — preserved as-is, confirm intent)
    """
    try:
        summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
        DB.create_tables([summary_model])
        summary_obj = summary_model.get_or_none(
            summary_model.f_job_id == self.job_id,
            summary_model.f_component_name == self.component_name,
            summary_model.f_role == self.role,
            summary_model.f_party_id == self.party_id,
            summary_model.f_task_id == self.task_id,
            summary_model.f_task_version == self.task_version
        )
        if summary_obj:
            summary_obj.f_summary = serialize_b64(summary_data, to_str=True) if need_serialize else summary_data
            summary_obj.f_update_time = current_timestamp()
            summary_obj.save()
        else:
            # Reuse the dynamic model class resolved above instead of
            # resolving it a second time.
            summary_model.create(
                f_job_id=self.job_id,
                f_component_name=self.component_name,
                f_role=self.role,
                f_party_id=self.party_id,
                f_task_id=self.task_id,
                f_task_version=self.task_version,
                f_summary=serialize_b64(summary_data, to_str=True),
                f_create_time=current_timestamp()
            )
    except Exception as e:
        # Best effort: summary persistence failures are logged, not raised.
        schedule_logger(self.job_id).exception("An exception where querying summary job id: {} "
                                               "component name: {} to database:\n{}".format(
                                                   self.job_id, self.component_name, e)
                                               )
def get_table_meta(self, table_info):
    """Fetch a storage table's meta dict, b64-serializing the fields that
    must survive transport (part_of_data, schema).

    :param table_info: dict with "namespace" and "table_name" keys
    :return: table meta dict ready to send to another party
    """
    schedule_logger(self.job_id).info(f'start get table meta:{table_info}')
    table_meta_dict = storage.StorageTableMeta(
        namespace=table_info.get("namespace"),
        name=table_info.get("table_name"),
        create_address=False,
    ).to_dict()
    schedule_logger(self.job_id).info(f'get table meta success: {table_meta_dict}')
    for field in ("part_of_data", "schema"):
        table_meta_dict[field] = serialize_b64(table_meta_dict[field], to_str=True)
    return table_meta_dict
def save_metric_data(self, metric_namespace: str, metric_name: str,
                     metrics: typing.List[Metric], job_level=False):
    """Persist a list of Metric objects as key/value rows via the
    metric manager.

    :param metric_namespace: namespace grouping the metrics
    :param metric_name: metric name within the namespace
    :param metrics: Metric objects providing .key and .value
    :param job_level: attribute the rows to the pipeline component
    """
    schedule_logger(self.job_id).info(
        'save component {} on {} {} {} {} metric data'.format(
            self.component_name, self.role, self.party_id,
            metric_namespace, metric_name))
    kv = [(metric.key, metric.value) for metric in metrics]
    self.metric_manager.insert_metrics_into_db(
        metric_namespace, metric_name, 1, kv, job_level)
def run(self, task: Task, run_parameters, run_parameters_path, config_dir,
        log_dir, cwd_dir, **kwargs):
    """Submit the task executor to Spark via spark-submit.

    Builds the spark-submit command line from run_parameters.spark_run,
    optionally wires in the distributed python-env/dependence archives,
    and delegates process start to WorkerManager.

    :param task: task to execute
    :param run_parameters: RunParameters-like object; spark_run is read here
    :return: result of WorkerManager.start_task_worker
    :raises RuntimeError: when no spark home is configured and pyspark
        cannot be imported
    :raises ValueError: for unsupported deploy modes (only "client")
    """
    spark_home = ServiceRegistry.FATE_ON_SPARK.get("spark", {}).get("home")
    if not spark_home:
        # Fall back to the pyspark package bundled with this interpreter.
        try:
            import pyspark
            spark_home = pyspark.__path__[0]
        except ImportError as e:
            raise RuntimeError("can not import pyspark")
        except Exception as e:
            raise RuntimeError("can not import pyspark")
        # else:
        #     raise ValueError(f"spark home must be configured in conf/service_conf.yaml when run on cluster mode")

    # additional configs
    spark_submit_config = run_parameters.spark_run
    deploy_mode = spark_submit_config.get("deploy-mode", "client")
    if deploy_mode not in ["client"]:
        raise ValueError(f"deploy mode {deploy_mode} not supported")

    spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
    executable = [
        spark_submit_cmd, f"--name={task.f_task_id}#{task.f_role}"
    ]
    # Plain keys become "--key=value"; the nested "conf" dict expands to
    # repeated "--conf key=value" pairs.
    for k, v in spark_submit_config.items():
        if k != "conf":
            executable.append(f"--{k}={v}")
    if "conf" in spark_submit_config:
        for ck, cv in spark_submit_config["conf"].items():
            executable.append(f"--conf")
            executable.append(f"{ck}={cv}")
    extra_env = {}
    extra_env["SPARK_HOME"] = spark_home
    if DEPENDENT_DISTRIBUTION:
        # Ship the archived python env / fate code to executors and point
        # both driver and executors at the shipped interpreters.
        dependence = Dependence()
        dependence.init(provider=ComponentProvider(**task.f_provider_info))
        executor_env_pythonpath, executor_python_env, driver_python_env, archives = dependence.get_task_dependence_info(
        )
        schedule_logger(task.f_job_id).info(
            f"executor_env_python {executor_python_env},"
            f"driver_env_python {driver_python_env}, archives {archives}")
        executable.append(f'--archives')
        executable.append(archives)
        executable.append(f'--conf')
        executable.append(f'spark.pyspark.python={executor_python_env}')
        executable.append(f'--conf')
        executable.append(
            f'spark.executorEnv.PYTHONPATH={executor_env_pythonpath}')
        executable.append(f'--conf')
        executable.append(
            f'spark.pyspark.driver.python={driver_python_env}')
    return WorkerManager.start_task_worker(
        worker_name=WorkerName.TASK_EXECUTOR,
        task=task,
        task_parameters=run_parameters,
        executable=executable,
        extra_env=extra_env)
def federated_coordination_on_grpc(job_id, method, host, port, endpoint,
                                   src_party_id, src_role, dest_party_id,
                                   json_body, api_version=API_VERSION,
                                   overall_timeout=None, try_times=3):
    """Send a federated command to another party over grpc, with retries.

    :param overall_timeout: per-call timeout in milliseconds; defaults to
        JobDefaultConfig.remote_request_timeout
    :param try_times: number of attempts with linear backoff
    :return: decoded json response from the remote party
    :raises Exception: when all attempts fail (wraps the last error)
    """
    overall_timeout = JobDefaultConfig.remote_request_timeout if overall_timeout is None else overall_timeout
    endpoint = f"/{api_version}{endpoint}"
    json_body['src_fate_ver'] = RuntimeConfig.get_env('FATE')
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id
    if CHECK_NODES_IDENTITY:
        get_node_identity(json_body, src_party_id)
    _packet = wrap_grpc_packet(json_body, method, endpoint, src_party_id,
                               dest_party_id, job_id,
                               overall_timeout=overall_timeout)
    _routing_metadata = gen_routing_metadata(src_party_id=src_party_id,
                                             dest_party_id=dest_party_id)
    exception = None
    for t in range(try_times):
        channel = None
        try:
            channel, stub = get_command_federation_channel(host, port)
            _return, _call = stub.unaryCall.with_call(
                _packet, metadata=_routing_metadata,
                timeout=(overall_timeout / 1000))
            audit_logger(job_id).info("grpc api response: {}".format(_return))
            response = json_loads(_return.body.value)
            return response
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(
                f"remote request {endpoint} error, sleep and try again")
            time.sleep(2 * (t + 1))
        finally:
            # BUGFIX: the channel used to be closed only on the success
            # path, leaking a grpc channel on every failed attempt.
            if channel is not None:
                channel.close()
    else:
        tips = 'Please check rollSite and fateflow network connectivity'
        # if 'Error received from peer' in str(exception):
        #     tips = 'Please check if the fate flow server of the other party is started. '
        # if 'failed to connect to all addresses' in str(exception):
        #     tips = 'Please check whether the rollsite service(port: 9370) is started. '
        raise Exception('{}rpc request error: {}'.format(tips, exception))
def finish(cls, job, end_status):
    """Finalize a job: stop it with its end status, then clean it up on
    all parties via the federated scheduler."""
    logger = schedule_logger(job.f_job_id)
    logger.info(f"job finished with {end_status}, do something...")
    cls.stop_job(job_id=job.f_job_id,
                 role=job.f_initiator_role,
                 party_id=job.f_initiator_party_id,
                 stop_status=end_status)
    FederatedScheduler.clean_job(job=job)
    logger.info(f"job finished with {end_status}, done")
def check_job_is_timeout(job: Job):
    """Return True when the job has been alive longer than its configured
    timeout (seconds), False otherwise."""
    conf = job.f_runtime_conf_on_party["job_parameters"]
    timeout = conf.get("timeout", JobDefaultConfig.job_timeout)
    # f_create_time is in milliseconds; convert the age to seconds.
    running_time = (current_timestamp() - job.f_create_time) / 1000
    if running_time <= timeout:
        return False
    schedule_logger(job.f_job_id).info(f'run time {running_time}s timeout')
    return True
def save_table_meta(self, meta):
    """Persist a storage table meta record received from another party.

    :param meta: table meta dict; "address" is rebuilt into an address
        object and "part_of_data"/"schema" are b64-deserialized before the
        record is created
    """
    schedule_logger(self.job_id).info(f'start save table meta:{meta}')
    # Rebuild the engine-specific address object from its dict form.
    address = storage.StorageTableMeta.create_address(
        storage_engine=meta.get("engine"), address_dict=meta.get("address"))
    table_meta = storage.StorageTableMeta(name=meta.get("name"),
                                          namespace=meta.get("namespace"),
                                          new=True)
    table_meta.set_metas(**meta)
    # NOTE(review): meta is mutated AFTER set_metas(**meta) — presumably
    # create() reads these fields through shared state; confirm the
    # ordering is intentional.
    meta["address"] = address
    meta["part_of_data"] = deserialize_b64(meta["part_of_data"])
    meta["schema"] = deserialize_b64(meta["schema"])
    table_meta.create()
    schedule_logger(self.job_id).info(f'save table meta success')
def tracking_output_cache(self, cache: DataCache, cache_name: str) -> str:
    """Record an output cache for this task and return its cache key.

    :param cache: cache object to register
    :param cache_name: logical name for the cache
    :return: the cache key assigned by CacheManager
    """
    cache_key = CacheManager.record(
        cache=cache,
        job_id=self.job_id,
        role=self.role,
        party_id=self.party_id,
        component_name=self.component_name,
        task_id=self.task_id,
        task_version=self.task_version,
        cache_name=cache_name,
    )
    schedule_logger(self.job_id).info(
        f"tracking {self.task_id} {self.task_version} output cache, cache key is {cache_key}")
    return cache_key
def job_reload(cls, job):
    """Reload an inherited job: copy logs, then outputs, then statuses."""
    logger = schedule_logger(job.f_job_id)
    logger.info(f"start job reload")
    cls.log_reload(job)
    source_inheritance_tasks, target_inheritance_tasks = \
        cls.load_source_target_tasks(job)
    logger.info(
        f"source_inheritance_tasks:{source_inheritance_tasks}, target_inheritance_tasks:{target_inheritance_tasks}"
    )
    cls.output_reload(job, source_inheritance_tasks, target_inheritance_tasks)
    cls.status_reload(job, source_inheritance_tasks, target_inheritance_tasks)
def read_component_metrics(self):
    """Return all metric rows recorded for this component task version.

    :return: list of tracking metric model instances
    :raises Exception: re-raised after logging on any query failure
    """
    try:
        tracking_metric_model = self.get_model_class()
        tracking_metrics = tracking_metric_model.select().where(
            tracking_metric_model.f_job_id == self.job_id,
            tracking_metric_model.f_component_name == self.component_name,
            tracking_metric_model.f_role == self.role,
            tracking_metric_model.f_party_id == self.party_id,
            tracking_metric_model.f_task_version == self.task_version)
        # list() materializes the query directly; the previous identity
        # list comprehension was a redundant copy.
        return list(tracking_metrics)
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        raise e
def stop_job():
    """HTTP handler: stop a job on this party, then ask all parties to stop.

    Reads 'job_id' and optional 'stop_status' (default "canceled") from the
    request json and returns a json result describing both outcomes.
    """
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info(f"stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(
            job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(
            f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(f"request stop job to {stop_status}")
        # Fan the stop request out to every party via the federated
        # scheduler, using the first matching job record.
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0], stop_status=stop_status,
            command_body=jobs[0].to_dict())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(
                retcode=RetCode.SUCCESS,
                retmsg=
                f"stop job on this party {kill_status}; stop job on all party success"
            )
        else:
            # Local stop result is still reported even when the federated
            # stop failed; the raw per-party responses go into data.
            return get_json_result(
                retcode=RetCode.OPERATING_ERROR,
                retmsg=f"stop job on this party {kill_status}",
                data=response)
    else:
        schedule_logger(job_id).info(f"can not found job to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
def load_task_tracker(cls, tasks: dict):
    """Build a Tracker for each task, keyed like the input dict.

    :param tasks: mapping of key -> task record
    :return: mapping of key -> Tracker
    """
    tracker_dict = {}
    for key, task in tasks.items():
        # Logging is a side effect, so a plain loop is used instead of a
        # dict comprehension.
        schedule_logger(task.f_job_id).info(
            f"task:{task.f_job_id}, {task.f_role}, {task.f_party_id},{task.f_component_name},{task.f_task_version}"
        )
        tracker_dict[key] = Tracker(job_id=task.f_job_id,
                                    role=task.f_role,
                                    party_id=task.f_party_id,
                                    component_name=task.f_component_name,
                                    task_id=task.f_task_id,
                                    task_version=task.f_task_version)
    return tracker_dict
def bulk_insert_into_db(self, model, data_source):
    """Insert rows in batches inside transactions.

    :param model: peewee model class to insert into
    :param data_source: list of row dicts
    :return: number of rows submitted, or 0 when the insert failed
    """
    try:
        try:
            DB.create_tables([model])
        except Exception as e:
            # Table may already exist; log and keep going.
            schedule_logger(self.job_id).exception(e)
        # Local (sqlite) databases choke on large batches.
        batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
        for offset in range(0, len(data_source), batch_size):
            chunk = data_source[offset:offset + batch_size]
            with DB.atomic():
                model.insert_many(chunk).execute()
        return len(data_source)
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        return 0
def update_task(cls, task_info):
    """
    Save to local database and then report to Initiator
    :param task_info: task fields to persist
    :return: True when the local update succeeded, else False
    """
    update_status = False
    try:
        update_status = JobSaver.update_task(task_info=task_info)
        cls.report_task_to_initiator(task_info=task_info)
    except Exception as e:
        # Best effort: report failures must not fail the local update.
        schedule_logger(task_info["job_id"]).exception(e)
    # BUGFIX: the return used to live in a `finally` block, which would
    # also swallow non-Exception throwables (e.g. KeyboardInterrupt).
    return update_status
def get_remaining_resource(cls, resource_model: typing.Union[EngineRegistry, Job], filters):
    """Query remaining (cores, memory) for an engine/job resource record.

    :param resource_model: EngineRegistry or Job model class to query
    :param filters: peewee filter expressions for the where clause
    :return: (remaining_cores, remaining_memory); both None on error or
        when no record matches
    """
    remaining_cores, remaining_memory = None, None
    try:
        objs = resource_model.select(
            resource_model.f_remaining_cores,
            resource_model.f_remaining_memory).where(*filters)
        if objs:
            remaining_cores, remaining_memory = objs[
                0].f_remaining_cores, objs[0].f_remaining_memory
    except Exception as e:
        schedule_logger().exception(e)
    # BUGFIX: the return used to live in a `finally` block, which would
    # also swallow non-Exception throwables raised inside the try.
    return remaining_cores, remaining_memory
def status_reload(cls, job, source_tasks, target_tasks):
    """Copy inherited task statuses, then mark the job's inheritance as
    successful."""
    logger = schedule_logger(job.f_job_id)
    logger.info("start reload status")
    # update task status
    for key, source_task in source_tasks.items():
        JobSaver.reload_task(source_task, target_tasks[key])
    # update job status
    job_info = {
        "job_id": job.f_job_id,
        "role": job.f_role,
        "party_id": job.f_party_id,
        "inheritance_status": JobInheritanceStatus.SUCCESS,
    }
    JobSaver.update_job(job_info=job_info)
    logger.info("reload status success")
def federated_coordination_on_http(job_id, method, host, port, endpoint,
                                   src_party_id, src_role, dest_party_id,
                                   json_body, api_version=API_VERSION,
                                   overall_timeout=None, try_times=3):
    """Send a federated command to another party over http, with retries.

    :param overall_timeout: defaults to JobDefaultConfig.remote_request_timeout
        (NOTE(review): computed but not passed to the request — confirm)
    :param try_times: number of attempts with linear backoff
    :return: decoded json response from the remote party
    :raises Exception: the last error when all attempts fail
    """
    overall_timeout = JobDefaultConfig.remote_request_timeout if overall_timeout is None else overall_timeout
    endpoint = f"/{api_version}{endpoint}"
    exception = None
    json_body['src_fate_ver'] = RuntimeConfig.get_env('FATE')
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id
    for t in range(try_times):
        try:
            url = "http://{}:{}{}".format(host, port, endpoint)
            audit_logger(job_id).info(
                'remote http api request: {}'.format(url))
            # Routing info travels both in the headers and in the json body.
            headers = HEADERS.copy()
            headers["dest-party-id"] = str(dest_party_id)
            headers["src-fate-ver"] = RuntimeConfig.get_env("FATE")
            headers["src-party-id"] = str(src_party_id)
            headers["src-role"] = str(src_role)
            response = request(method=method, url=url, json=json_body,
                               headers=headers)
            audit_logger(job_id).info(response.text)
            audit_logger(job_id).info('remote http api response: {} {}'.format(
                endpoint, response.json()))
            return response.json()
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(
                f"remote http request {endpoint} error, sleep and try again")
            # Linear backoff: 2s, 4s, 6s, ...
            time.sleep(2 * (t + 1))
    else:
        raise exception
def log_reload(cls, job):
    """Copy the inherited components' log directories from the source job
    into this job's log tree, replacing any existing target directories."""
    schedule_logger(job.f_job_id).info("start reload job log")
    if job.f_inheritance_info:
        base_dir = log_utils.get_logger_base_dir()
        for component_name in job.f_inheritance_info.get("component_list"):
            source_path = os.path.join(base_dir,
                                       job.f_inheritance_info.get("job_id"),
                                       job.f_role, job.f_party_id,
                                       component_name)
            target_path = os.path.join(base_dir, job.f_job_id, job.f_role,
                                       job.f_party_id, component_name)
            if not os.path.exists(source_path):
                continue
            # Replace, never merge, the target component log directory.
            if os.path.exists(target_path):
                shutil.rmtree(target_path)
            shutil.copytree(source_path, target_path)
    schedule_logger(job.f_job_id).info("reload job log success")
def check_job_inherit_dependence(cls, job):
    """Check whether the job's inheritance prerequisite is satisfied.

    :return: True when there is nothing to inherit or inheritance already
        succeeded; False while inheritance is pending/running
    :raises Exception: when inheritance previously failed
    """
    schedule_logger(job.f_job_id).info(
        f"check job inherit dependence: {job.f_inheritance_info}, {job.f_inheritance_status}"
    )
    if not job.f_inheritance_info:
        return True
    status = job.f_inheritance_status
    if status == JobInheritanceStatus.WAITING:
        # Kick off inheritance now; the caller retries on a later pass.
        cls.start_inheriting_job(job)
        return False
    if status == JobInheritanceStatus.RUNNING:
        return False
    if status == JobInheritanceStatus.FAILED:
        raise Exception("job inheritance failed")
    return True
def fill_default_job_parameters(cls, job_id, job_parameters: RunParameters):
    """Fill unset scheduling parameters from JobDefaultConfig.

    Only attributes that exist on job_parameters and are currently None
    are touched; a missing default is logged as a warning.
    """
    for key in ("task_parallelism", "auto_retries", "auto_retry_delay",
                "federated_status_collect_type"):
        if not hasattr(job_parameters, key) or getattr(job_parameters, key) is not None:
            continue
        if hasattr(JobDefaultConfig, key):
            setattr(job_parameters, key, getattr(JobDefaultConfig, key))
        else:
            schedule_logger(job_id).warning(
                f"can not found {key} job parameter default value from job_default_settings"
            )
def read_summary_from_db(self, need_deserialize=True):
    """Read this component's summary row; returns "" when none exists.

    :param need_deserialize: b64-deserialize the stored summary payload
    :raises Exception: re-raised after logging on any failure
    """
    try:
        summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
        summary = summary_model.get_or_none(
            summary_model.f_job_id == self.job_id,
            summary_model.f_component_name == self.component_name,
            summary_model.f_role == self.role,
            summary_model.f_party_id == self.party_id
        )
        if not summary:
            cpn_summary = ""
        elif need_deserialize:
            cpn_summary = deserialize_b64(summary.f_summary)
        else:
            cpn_summary = summary.f_summary
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        raise e
    return cpn_summary