Esempio n. 1
0
    def check_component(cls, job, check_type="inheritance"):
        """Check the job's inherited components with the other parties.

        Asks ``FederatedScheduler.check_component`` for a per-party answer,
        intersects the job's inheritance component set with every party's
        ``"data"`` entry, and, when the result no longer matches the
        ``component_list`` stored in ``job.f_inheritance_info``, recomputes the
        list through the DSL parser and sends it out with
        ``FederatedScheduler.align_args``.

        Args:
            job: Job record; ``f_job_id``, ``f_dsl``, ``f_runtime_conf``,
                ``f_train_runtime_conf`` and ``f_inheritance_info`` are read.
            check_type: Forwarded unchanged to
                ``FederatedScheduler.check_component``.

        Returns:
            None.
        """
        schedule_logger(job.f_job_id).info("component check")
        # Only the payload is used below; the status code is not inspected.
        _, check_response = FederatedScheduler.check_component(
            job=job, check_type=check_type)
        schedule_logger(job.f_job_id).info(
            f"component check response: {check_response}")

        parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)
        usable = {
            node.name for node in parser.get_source_connect_sub_graph(
                job.f_inheritance_info.get("component_list"))
        }

        # The response is indexed as {role: {party_id: {"data": iterable}}};
        # keep only the names every party reported.
        for party_responses in check_response.values():
            for party_response in party_responses.values():
                usable = usable & set(party_response.get("data"))

        if usable == set(job.f_inheritance_info.get("component_list")):
            schedule_logger(job.f_job_id).info("check success")
            return

        schedule_logger(job.f_job_id).info(
            f"dsl parser components:{usable}")

        realigned = [
            node.name
            for node in parser.get_source_connect_sub_graph(list(usable))
        ]
        schedule_logger(job.f_job_id).info(f"parser result:{realigned}")

        # The object read from job.f_inheritance_info is itself updated, so
        # the job's inheritance info carries the new component list as well.
        inheritance_info = job.f_inheritance_info
        inheritance_info.update({"component_list": realigned})
        align_body = {"inheritance_info": inheritance_info}
        schedule_logger(job.f_job_id).info(
            f"start align job info:{align_body}")

        align_status, align_response = FederatedScheduler.align_args(
            job, command_body=align_body)
        schedule_logger(job.f_job_id).info(
            f"align result:{align_status}, {align_response}")
        schedule_logger(job.f_job_id).info("check success")
Esempio n. 2
0
 def get_rerun_component(cls, component_name, job, dsl_parser, force):
     """Work out which components to rerun and whether to force the rerun.

     When ``component_name`` is falsy or equals
     ``job_utils.job_pipeline_component_name()``, both inputs are returned
     untouched. Otherwise a "rerun" component check is sent to the parties,
     the names found in their ``"data"`` entries are unioned, and
     ``dsl_parser.get_need_revisit_nodes`` is called with this party's latest
     successful task components and that union.

     Args:
         component_name: A single component name (``str``) or an iterable of
             names.
         job: Job record; ``f_job_id``, ``f_party_id`` and ``f_role`` are read.
         dsl_parser: Parser object providing ``get_need_revisit_nodes``.
         force: Incoming force flag; switched to ``True`` when any party
             reported at least one component.

     Returns:
         Tuple ``(component_name, force)``. On the non-trivial path
         ``component_name`` is a ``set`` combining the requested name(s) with
         the names returned by ``get_need_revisit_nodes``.
     """
     # Guard clause: nothing to compute for an empty or pipeline-level request.
     if not component_name or component_name == job_utils.job_pipeline_component_name():
         return component_name, force

     # Only the payload is used; the status code is not inspected.
     _, check_response = FederatedScheduler.check_component(
         job=job, check_type="rerun")

     succeeded = [
         task.f_component_name
         for task in JobSaver.query_task(job_id=job.f_job_id,
                                         party_id=job.f_party_id,
                                         role=job.f_role,
                                         status=TaskStatus.SUCCESS,
                                         only_latest=True)
     ]

     # The response is indexed as {role: {party_id: {"data": iterable}}}.
     reported = set()
     for party_responses in check_response.values():
         for party_response in party_responses.values():
             reported = reported | set(party_response.get("data"))
     schedule_logger(job.f_job_id).info(
         f"success task list: {succeeded}, check failed component list: {list(reported)}"
     )

     need_rerun = [
         node.name
         for node in dsl_parser.get_need_revisit_nodes(succeeded,
                                                       list(reported))
     ]
     schedule_logger(job.f_job_id).info(
         f"need rerun success component: {need_rerun}")

     if reported:
         force = True

     # A bare string is one name; anything else is treated as a collection.
     requested = ({component_name}
                  if isinstance(component_name, str) else set(component_name))
     return set(need_rerun) | requested, force