def update_job_status(cls, job_info):
    """Persist a job status change and free its resources on a terminal status.

    :param job_info: dict carrying at least job_id/role/party_id and the
        new "status" value.
    :return: True when the status row was actually updated, False otherwise
        (e.g. another scheduler already moved the job on).
    """
    updated = JobSaver.update_job_status(job_info=job_info)
    if not updated:
        return updated
    # Only a job that reached an end status hands its resources back.
    if EndStatus.contains(job_info.get("status")):
        ResourceManager.return_job_resource(
            job_id=job_info["job_id"],
            role=job_info["role"],
            party_id=job_info["party_id"])
    return updated
def start_task(cls, job, task):
    """Apply resources for a task, mark it RUNNING, and start it on all parties.

    :param job: job model object (passed through to FederatedScheduler).
    :param task: task model object; f_status is mutated in place.
    :return: a SchedulingStatusCode value:
        NO_RESOURCE - this party has no free resource for the task;
        PASS        - another scheduler instance won the race for this task;
        SUCCESS     - every party acknowledged the start request;
        FAILED      - at least one party failed to start the task.
    """
    schedule_logger(task.f_job_id).info(
        "try to start task {} {} on {} {}".format(task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
    # Reserve resources first; a task that cannot get resources is simply
    # left in its current state for a later scheduling round.
    apply_status = ResourceManager.apply_for_task_resource(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not apply_status:
        return SchedulingStatusCode.NO_RESOURCE
    task.f_status = TaskStatus.RUNNING
    # The status update doubles as an optimistic lock: if it fails, a
    # concurrent scheduler already transitioned this task.
    update_status = JobSaver.update_task_status(
        task_info=task.to_human_model_dict(only_primary_with=["status"]))
    if not update_status:
        # Another scheduler scheduling the task
        schedule_logger(task.f_job_id).info(
            "task {} {} start on another scheduler".format(
                task.f_task_id, task.f_task_version))
        # Rollback: undo the local status change and give back the
        # resources reserved above.
        task.f_status = TaskStatus.WAITING
        ResourceManager.return_task_resource(
            task_info=task.to_human_model_dict(
                only_primary_with=["status"]))
        return SchedulingStatusCode.PASS
    schedule_logger(task.f_job_id).info("start task {} {} on {} {}".format(
        task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
    # Broadcast the RUNNING status before actually launching the task on
    # the participating parties.
    FederatedScheduler.sync_task_status(job=job, task=task)
    status_code, response = FederatedScheduler.start_task(job=job, task=task)
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return SchedulingStatusCode.SUCCESS
    else:
        return SchedulingStatusCode.FAILED
def load(cls):
    """Load service configuration and bring up the runtime managers.

    :return: dict with the loaded job default config and service registry.
    """
    # Load configuration first, then initialize the managers that
    # depend on it (order preserved deliberately).
    job_default_config = JobDefaultConfig.load()
    service_registry = ServiceRegistry.load()
    ResourceManager.initialize()
    RuntimeConfig.load_config_manager()
    return {
        "job_default_config": job_default_config,
        "service_registry": service_registry,
    }
def update_task_status(cls, task_info):
    """Persist a task status change; on a terminal status, release the task's
    resources and clean its intermediate tables.

    The new status is always reported back to the initiator, whether or not
    the local update succeeded.

    :param task_info: dict with job_id/task_id/task_version/role/party_id
        and the new "status" value.
    :return: True when the status row was actually updated.
    """
    updated = JobSaver.update_task_status(task_info=task_info)
    if updated and EndStatus.contains(task_info.get("status")):
        # Terminal status: free reserved resources and drop working tables.
        ResourceManager.return_task_resource(task_info=task_info)
        cls.clean_task(
            job_id=task_info["job_id"],
            task_id=task_info["task_id"],
            task_version=task_info["task_version"],
            role=task_info["role"],
            party_id=task_info["party_id"],
            content_type=TaskCleanResourceType.TABLE,
        )
    cls.report_task_to_initiator(task_info=task_info)
    return updated
def adapt_job_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    """Adapt engine-specific parameters and, when building the initiator
    baseline, fill unset scheduling parameters with defaults.

    :param role: the party role the parameters are adapted for.
    :param job_parameters: RunParameters mutated in place.
    :param create_initiator_baseline: True only on the initiator's first pass.
    """
    ResourceManager.adapt_engine_parameters(
        role=role,
        job_parameters=job_parameters,
        create_initiator_baseline=create_initiator_baseline)
    if not create_initiator_baseline:
        return
    if job_parameters.task_parallelism is None:
        job_parameters.task_parallelism = JobDefaultConfig.task_parallelism
    if job_parameters.federated_status_collect_type is None:
        job_parameters.federated_status_collect_type = JobDefaultConfig.federated_status_collect_type
    if not job_parameters.computing_partitions:
        # Default partition count: total cores available to one task.
        adaptation = job_parameters.adaptation_parameters
        job_parameters.computing_partitions = (
            adaptation["task_cores_per_node"] * adaptation["task_nodes"])
def adapt_job_parameters(cls, role, job_parameters: RunParameters, create_initiator_baseline=False):
    """Adapt engine-specific parameters and, when building the initiator
    baseline, fill unset scheduling parameters with module-level defaults.

    :param role: the party role the parameters are adapted for.
    :param job_parameters: RunParameters mutated in place.
    :param create_initiator_baseline: True only on the initiator's first pass.
    """
    ResourceManager.adapt_engine_parameters(
        role=role,
        job_parameters=job_parameters,
        create_initiator_baseline=create_initiator_baseline)
    if not create_initiator_baseline:
        return
    if job_parameters.task_parallelism is None:
        job_parameters.task_parallelism = DEFAULT_TASK_PARALLELISM
    if job_parameters.federated_status_collect_type is None:
        job_parameters.federated_status_collect_type = DEFAULT_FEDERATED_STATUS_COLLECT_TYPE
    if not job_parameters.computing_partitions:
        # Default partition count: total cores available to one task.
        adaptation = job_parameters.adaptation_parameters
        job_parameters.computing_partitions = (
            adaptation["task_cores_per_node"] * adaptation["task_nodes"])
def return_resource(job_id, role, party_id):
    """HTTP handler: release the resources held by a job on one party.

    :param job_id: job identifier.
    :param role: party role.
    :param party_id: party id (string from the URL, cast to int).
    :return: JSON result; retcode 0 on success, OPERATING_ERROR on failure.
    """
    status = ResourceManager.return_job_resource(
        job_id=job_id, role=role, party_id=int(party_id))
    if status:
        return get_json_result(retcode=0, retmsg='success')
    # Fixed message: this endpoint returns (releases) resources; the old
    # text said "apply for job ... resource failed", copy-pasted from the
    # apply endpoint, which would mislead operators reading the error.
    return get_json_result(
        retcode=RetCode.OPERATING_ERROR,
        retmsg=f"return job {job_id} resource failed")
def query_resource():
    """HTTP handler: report which jobs hold resources and the computing
    engine's overall resource usage, filtered by the JSON request body.
    """
    in_use_jobs, engine_resource = ResourceManager.query_resource(**request.json)
    payload = {
        "use_resource_job": in_use_jobs,
        "computing_engine_resource": engine_resource,
    }
    return get_json_result(retcode=0, retmsg='success', data=payload)
def get_job_engines_address(cls, job_parameters: RunParameters):
    """Resolve registration info for the job's computing, federation and
    storage engines, filling job_parameters.engines_address in place.

    :param job_parameters: RunParameters naming the three engines; its
        engines_address mapping is mutated in place.
    :return: dict mapping engine type to its registration record (or None
        when the engine is not registered).
    """
    selected_engines = {
        EngineType.COMPUTING: job_parameters.computing_engine,
        EngineType.FEDERATION: job_parameters.federation_engine,
        EngineType.STORAGE: job_parameters.storage_engine,
    }
    engines_info = {}
    for engine_type, engine_name in selected_engines.items():
        info = ResourceManager.get_engine_registration_info(
            engine_type=engine_type, engine_name=engine_name)
        # Fall back to an empty address dict for unregistered engines.
        job_parameters.engines_address[engine_type] = info.f_engine_config if info else {}
        engines_info[engine_type] = info
    return engines_info
def get_archives(cls, storage_engine=FateDependenceStorageEngine.HDFS.value):
    """Build the comma-separated archive URI list (source code + python env)
    for Spark dependence distribution.

    :param storage_engine: storage engine name whose name_node prefixes
        each archive path (defaults to HDFS).
    :return: comma-joined archive URIs.
    """
    storage_info = ResourceManager.get_engine_registration_info(
        engine_type=EngineType.STORAGE, engine_name=storage_engine)
    name_node = storage_info.f_engine_config.get("name_node")
    dependence_types = (
        FateDependenceName.Fate_Source_Code.value,
        FateDependenceName.Python_Env.value,
    )
    archives = [
        name_node + cls.dependence_config.get(dep).get("f_dependencies_conf").get("archives")
        for dep in dependence_types
    ]
    return ','.join(archives)
def check_parameters(cls, job_parameters: RunParameters, role, party_id, engines_info):
    """Validate the job's resource request against the party's limits.

    :raises RuntimeError: when the requested cores exceed the per-job
        maximum, with an engine-specific hint on how to tune the request.
    """
    status, cores_submit, max_cores_per_job = ResourceManager.check_resource_apply(
        job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
    if status:
        return
    # Pick a tuning hint matching the computing engine in use.
    engine = job_parameters.computing_engine
    if engine in {ComputingEngine.EGGROLL, ComputingEngine.STANDALONE}:
        msg = "please use task_cores job parameters to set request task cores or you can customize it with eggroll_run job parameters"
    elif engine in {ComputingEngine.SPARK}:
        msg = "please use task_cores job parameters to set request task cores or you can customize it with spark_run job parameters"
    else:
        msg = ""
    msg2 = "default value is fate_flow/settings.py#DEFAULT_TASK_CORES_PER_NODE, refer fate_flow/examples/simple/simple_job_conf.json"
    raise RuntimeError(
        f"max cores per job is {max_cores_per_job} base on (fate_flow/settings#MAX_CORES_PERCENT_PER_JOB * conf/service_conf.yaml#nodes * conf/service_conf.yaml#cores_per_node), expect {cores_submit} cores, {msg}, {msg2}"
    )
def detect_resource_record(cls):
    """Periodic detector: reclaim resources from jobs that have held them
    for more than 10 minutes while waiting or already ended.

    WAITING jobs are stopped (as start-timeout); ended jobs get their
    resources returned directly. Errors on one job never abort the sweep.
    """
    detect_logger().info('start detect resource recycle')
    try:
        filter_status = EndStatus.status_list()
        filter_status.append(JobStatus.WAITING)
        # peewee query: `== True` and `<<` (IN) are peewee expression
        # operators, not plain Python comparisons — do not "simplify".
        # f_apply_resource_time is compared in milliseconds (10 min).
        jobs = Job.select().where(
            Job.f_resource_in_use == True,
            current_timestamp() - Job.f_apply_resource_time > 10 * 60 * 1000,
            Job.f_status << filter_status)
        stop_jobs = set()
        for job in jobs:
            if job.f_status == JobStatus.WAITING:
                # Still waiting after 10 minutes: stop it as a timeout below.
                stop_jobs.add(job)
            else:
                # Already ended but resources not returned: return them now.
                try:
                    detect_logger(job_id=job.f_job_id).info(
                        f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                    )
                    flag = ResourceManager.return_job_resource(
                        job_id=job.f_job_id, role=job.f_role, party_id=job.f_party_id)
                    if flag:
                        detect_logger(job_id=job.f_job_id).info(
                            f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                        )
                    else:
                        detect_logger(job_id=job.f_job_id).info(
                            f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                        )
                except Exception as e:
                    # Best-effort: log and continue with the next job.
                    detect_logger(job_id=job.f_job_id).exception(e)
        cls.request_stop_jobs(jobs=stop_jobs, stop_msg="start timeout", stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect resource recycle')
def return_resource():
    """HTTP handler: release the resources of the job named in the JSON
    request body and return the operation status."""
    job_id = request.json.get("job_id")
    status = ResourceManager.return_resource(job_id=job_id)
    return get_json_result(data=status)
# init db init_flow_db() init_arch_db() # init runtime config import argparse parser = argparse.ArgumentParser() parser.add_argument('--standalone_node', default=False, help="if standalone node mode or not ", action='store_true') args = parser.parse_args() RuntimeConfig.init_env() RuntimeConfig.set_process_role(ProcessRole.DRIVER) PrivilegeAuth.init() ServiceUtils.register() ResourceManager.initialize() Detector(interval=5 * 1000).start() DAGScheduler(interval=2 * 1000).start() thread_pool_executor = ThreadPoolExecutor( max_workers=GRPC_SERVER_MAX_WORKERS) stat_logger.info( f"start grpc server thread pool by {thread_pool_executor._max_workers} max workers" ) server = grpc.server( thread_pool=thread_pool_executor, options=[(cygrpc.ChannelArgKey.max_send_message_length, -1), (cygrpc.ChannelArgKey.max_receive_message_length, -1)]) proxy_pb2_grpc.add_DataTransferServiceServicer_to_server( UnaryService(), server) server.add_insecure_port("{}:{}".format(IP, GRPC_PORT))