コード例 #1
0
 def update_job_status(cls, job_info):
     """Persist a job status change and, when the new status is terminal,
     release the resources the job holds on this party.

     :param job_info: dict with at least job_id/role/party_id/status keys.
     :return: bool result of the status update (falsy when the update
              did not take effect, e.g. lost a race with another writer).
     """
     updated = JobSaver.update_job_status(job_info=job_info)
     if not updated:
         return updated
     if EndStatus.contains(job_info.get("status")):
         # Job reached an end status: give its resources back.
         ResourceManager.return_job_resource(job_id=job_info["job_id"],
                                             role=job_info["role"],
                                             party_id=job_info["party_id"])
     return updated
コード例 #2
0
 def start_task(cls, job, task):
     """Try to start one task: apply for resources, claim the task by
     flipping it to RUNNING, then dispatch the start to all parties.

     Returns a SchedulingStatusCode:
       - NO_RESOURCE when the resource apply fails;
       - PASS when another scheduler already claimed the task (the local
         state and resources are rolled back);
       - SUCCESS / FAILED depending on the federated start result.
     """
     schedule_logger(task.f_job_id).info(
         "try to start task {} {} on {} {}".format(task.f_task_id,
                                                   task.f_task_version,
                                                   task.f_role,
                                                   task.f_party_id))
     # Reserve resources first; nothing to roll back if this fails.
     apply_status = ResourceManager.apply_for_task_resource(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not apply_status:
         return SchedulingStatusCode.NO_RESOURCE
     task.f_status = TaskStatus.RUNNING
     # The status update doubles as an optimistic lock: only one
     # scheduler's WAITING->RUNNING transition can succeed.
     update_status = JobSaver.update_task_status(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not update_status:
         # Another scheduler scheduling the task
         schedule_logger(task.f_job_id).info(
             "task {} {} start on another scheduler".format(
                 task.f_task_id, task.f_task_version))
         # Rollback
         task.f_status = TaskStatus.WAITING
         ResourceManager.return_task_resource(
             task_info=task.to_human_model_dict(
                 only_primary_with=["status"]))
         return SchedulingStatusCode.PASS
     schedule_logger(task.f_job_id).info("start task {} {} on {} {}".format(
         task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
     # Propagate the RUNNING status, then ask the parties to start.
     FederatedScheduler.sync_task_status(job=job, task=task)
     status_code, response = FederatedScheduler.start_task(job=job,
                                                           task=task)
     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         return SchedulingStatusCode.SUCCESS
     else:
         return SchedulingStatusCode.FAILED
コード例 #3
0
 def load(cls):
     """Load the job-default and service-registry configuration, then
     initialize resource accounting and the runtime config manager.

     :return: dict with the loaded configuration objects.
     """
     loaded = dict(
         job_default_config=JobDefaultConfig.load(),
         service_registry=ServiceRegistry.load(),
     )
     ResourceManager.initialize()
     RuntimeConfig.load_config_manager()
     return loaded
コード例 #4
0
 def update_task_status(cls, task_info):
     """Persist a task status change; on a terminal status, release the
     task's resources and clean its table-type outputs. The task is
     always reported back to the initiator, even if the update失 --
     even if the update did not take effect.

     :param task_info: dict with job_id/task_id/task_version/role/
                       party_id/status keys.
     :return: bool result of the status update.
     """
     updated = JobSaver.update_task_status(task_info=task_info)
     reached_end = EndStatus.contains(task_info.get("status"))
     if updated and reached_end:
         ResourceManager.return_task_resource(task_info=task_info)
         cls.clean_task(
             job_id=task_info["job_id"],
             task_id=task_info["task_id"],
             task_version=task_info["task_version"],
             role=task_info["role"],
             party_id=task_info["party_id"],
             content_type=TaskCleanResourceType.TABLE,
         )
     cls.report_task_to_initiator(task_info=task_info)
     return updated
コード例 #5
0
 def adapt_job_parameters(cls,
                          role,
                          job_parameters: RunParameters,
                          create_initiator_baseline=False):
     """Adapt engine parameters for *role*; when building the initiator
     baseline, also fill unset scheduling defaults and derive the
     computing partition count from the adapted core/node figures.
     """
     ResourceManager.adapt_engine_parameters(
         role=role,
         job_parameters=job_parameters,
         create_initiator_baseline=create_initiator_baseline)
     if not create_initiator_baseline:
         return
     if job_parameters.task_parallelism is None:
         job_parameters.task_parallelism = JobDefaultConfig.task_parallelism
     if job_parameters.federated_status_collect_type is None:
         job_parameters.federated_status_collect_type = JobDefaultConfig.federated_status_collect_type
     if not job_parameters.computing_partitions:
         # Default partitions = total task cores across the cluster.
         adaptation = job_parameters.adaptation_parameters
         job_parameters.computing_partitions = (
             adaptation["task_cores_per_node"] * adaptation["task_nodes"])
コード例 #6
0
 def adapt_job_parameters(cls,
                          role,
                          job_parameters: RunParameters,
                          create_initiator_baseline=False):
     """Adapt engine parameters for *role*; when building the initiator
     baseline, fill unset defaults from module-level constants and
     derive the computing partition count.
     """
     ResourceManager.adapt_engine_parameters(
         role=role,
         job_parameters=job_parameters,
         create_initiator_baseline=create_initiator_baseline)
     if create_initiator_baseline:
         # Only the initiator baseline gets defaulted fields.
         if job_parameters.task_parallelism is None:
             job_parameters.task_parallelism = DEFAULT_TASK_PARALLELISM
         if job_parameters.federated_status_collect_type is None:
             job_parameters.federated_status_collect_type = DEFAULT_FEDERATED_STATUS_COLLECT_TYPE
         if not job_parameters.computing_partitions:
             # Default partitions = total task cores across the cluster.
             adaptation = job_parameters.adaptation_parameters
             job_parameters.computing_partitions = (
                 adaptation["task_cores_per_node"]
                 * adaptation["task_nodes"])
コード例 #7
0
File: party_app.py — Project: zark7777/FATE
def return_resource(job_id, role, party_id):
    """HTTP handler: release the resources held by a job on one party.

    :param job_id: job identifier.
    :param role: party role within the job.
    :param party_id: party identifier (coerced to int).
    :return: JSON result -- retcode 0 on success, OPERATING_ERROR with an
             explanatory message on failure.
    """
    status = ResourceManager.return_job_resource(job_id=job_id,
                                                 role=role,
                                                 party_id=int(party_id))
    if status:
        return get_json_result(retcode=0, retmsg='success')
    # BUG FIX: the failure message previously said "apply for job ...
    # resource failed", but this endpoint RETURNS (releases) resources.
    return get_json_result(
        retcode=RetCode.OPERATING_ERROR,
        retmsg=f"return job {job_id} resource failed")
コード例 #8
0
File: resource_app.py — Project: FederatedAI/FATE-Flow
def query_resource():
    """HTTP handler: report which jobs are using resources and the
    computing engine's resource capacity, filtered by the JSON body."""
    jobs_in_use, engine_resource = ResourceManager.query_resource(
        **request.json)
    payload = {
        "use_resource_job": jobs_in_use,
        "computing_engine_resource": engine_resource,
    }
    return get_json_result(retcode=0, retmsg='success', data=payload)
コード例 #9
0
 def get_job_engines_address(cls, job_parameters: RunParameters):
     """Look up registration info for the job's computing, federation and
     storage engines.

     Side effect: fills job_parameters.engines_address per engine type
     (empty dict when the engine has no registration record).

     :return: dict mapping engine type to its registration record
              (or None when unregistered).
     """
     engines_info = {}
     requested = {
         EngineType.COMPUTING: job_parameters.computing_engine,
         EngineType.FEDERATION: job_parameters.federation_engine,
         EngineType.STORAGE: job_parameters.storage_engine,
     }
     for engine_type, engine_name in requested.items():
         info = ResourceManager.get_engine_registration_info(
             engine_type=engine_type, engine_name=engine_name)
         job_parameters.engines_address[engine_type] = (
             info.f_engine_config if info else {})
         engines_info[engine_type] = info
     return engines_info
コード例 #10
0
 def get_archives(cls,
                  storage_engine=FateDependenceStorageEngine.HDFS.value):
     """Build the comma-separated archives string for job submission,
     prefixing each dependency archive path with the storage name node.

     :param storage_engine: storage engine whose registration supplies
                            the name node (defaults to HDFS).
     :return: str, ','-joined "name_node + archive" entries.
     """
     storage_info = ResourceManager.get_engine_registration_info(
         engine_type=EngineType.STORAGE, engine_name=storage_engine)
     name_node = storage_info.f_engine_config.get("name_node")
     dependence_types = (
         FateDependenceName.Fate_Source_Code.value,
         FateDependenceName.Python_Env.value,
     )
     archives = [
         name_node + cls.dependence_config.get(dep_type).get(
             "f_dependencies_conf").get("archives")
         for dep_type in dependence_types
     ]
     return ','.join(archives)
コード例 #11
0
 def check_parameters(cls, job_parameters: RunParameters, role, party_id,
                      engines_info):
     """Validate that the requested task cores fit the per-job limit.

     :raises RuntimeError: when the resource check fails, with
             engine-specific guidance on how to adjust the request.
     """
     ok, cores_submit, max_cores_per_job = ResourceManager.check_resource_apply(
         job_parameters=job_parameters,
         role=role,
         party_id=party_id,
         engines_info=engines_info)
     if ok:
         return
     # Engine-specific hint appended to the error message.
     hints = {
         ComputingEngine.EGGROLL: "please use task_cores job parameters to set request task cores or you can customize it with eggroll_run job parameters",
         ComputingEngine.STANDALONE: "please use task_cores job parameters to set request task cores or you can customize it with eggroll_run job parameters",
         ComputingEngine.SPARK: "please use task_cores job parameters to set request task cores or you can customize it with spark_run job parameters",
     }
     msg = hints.get(job_parameters.computing_engine, "")
     msg2 = "default value is fate_flow/settings.py#DEFAULT_TASK_CORES_PER_NODE, refer fate_flow/examples/simple/simple_job_conf.json"
     raise RuntimeError(
         f"max cores per job is {max_cores_per_job} base on (fate_flow/settings#MAX_CORES_PERCENT_PER_JOB * conf/service_conf.yaml#nodes * conf/service_conf.yaml#cores_per_node), expect {cores_submit} cores, {msg}, {msg2}"
     )
コード例 #12
0
 def detect_resource_record(cls):
     """Periodic sweep that reclaims leaked resource grants.

     Finds jobs still flagged as holding resources whose apply time is
     more than 10 minutes old and whose status is terminal or WAITING:
     WAITING jobs are asked to stop as TIMEOUT; terminal jobs get their
     resources returned directly. Best-effort: every failure is logged
     and the sweep continues.
     """
     detect_logger().info('start detect resource recycle')
     try:
         filter_status = EndStatus.status_list()
         filter_status.append(JobStatus.WAITING)
         # Peewee query expressions: `== True` and `<<` (IN) are the
         # ORM's operator overloads, not plain Python comparisons.
         jobs = Job.select().where(
             Job.f_resource_in_use == True,
             current_timestamp() - Job.f_apply_resource_time >
             10 * 60 * 1000, Job.f_status << filter_status)
         stop_jobs = set()
         for job in jobs:
             if job.f_status == JobStatus.WAITING:
                 # Still waiting after the grace period: stop it below.
                 stop_jobs.add(job)
             else:
                 try:
                     detect_logger(job_id=job.f_job_id).info(
                         f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                     )
                     flag = ResourceManager.return_job_resource(
                         job_id=job.f_job_id,
                         role=job.f_role,
                         party_id=job.f_party_id)
                     if flag:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                         )
                     else:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                         )
                 except Exception as e:
                     # Keep sweeping the remaining jobs on failure.
                     detect_logger(job_id=job.f_job_id).exception(e)
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="start timeout",
                               stop_status=JobStatus.TIMEOUT)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info('finish detect resource recycle')
コード例 #13
0
File: resource_app.py — Project: FederatedAI/FATE-Flow
def return_resource():
    """HTTP handler: release the resources held by the job named in the
    request body and return the outcome as JSON."""
    job_id = request.json.get("job_id")
    released = ResourceManager.return_resource(job_id=job_id)
    return get_json_result(data=released)
コード例 #14
0
File: fate_flow_server.py — Project: zark7777/FATE
    # init db
    init_flow_db()
    init_arch_db()
    # init runtime config
    import argparse
    parser = argparse.ArgumentParser()
    # NOTE(review): with action='store_true', argparse already defaults the
    # flag to False; the explicit default=False is redundant but harmless.
    parser.add_argument('--standalone_node',
                        default=False,
                        help="if standalone node mode or not ",
                        action='store_true')
    args = parser.parse_args()
    RuntimeConfig.init_env()
    # This process acts as the scheduling driver.
    RuntimeConfig.set_process_role(ProcessRole.DRIVER)
    PrivilegeAuth.init()
    ServiceUtils.register()
    ResourceManager.initialize()
    # Background loops: detector every 5s, DAG scheduler every 2s
    # (intervals are in milliseconds).
    Detector(interval=5 * 1000).start()
    DAGScheduler(interval=2 * 1000).start()
    thread_pool_executor = ThreadPoolExecutor(
        max_workers=GRPC_SERVER_MAX_WORKERS)
    stat_logger.info(
        f"start grpc server thread pool by {thread_pool_executor._max_workers} max workers"
    )
    # -1 disables gRPC's send/receive message size limits.
    server = grpc.server(
        thread_pool=thread_pool_executor,
        options=[(cygrpc.ChannelArgKey.max_send_message_length, -1),
                 (cygrpc.ChannelArgKey.max_receive_message_length, -1)])

    proxy_pb2_grpc.add_DataTransferServiceServicer_to_server(
        UnaryService(), server)
    server.add_insecure_port("{}:{}".format(IP, GRPC_PORT))