Example #1
 def get_common_parameters(self):
     if int(self.job_runtime_conf.get('dsl_version', 1)) == 2:
         if "common" not in self.job_runtime_conf["job_parameters"]:
             raise RuntimeError("the configuration format for v2 version must be job_parameters:common")
         job_parameters = RunParameters(**self.job_runtime_conf['job_parameters']['common'])
         self.job_runtime_conf['job_parameters']['common'] = job_parameters.to_dict()
     else:
         if "processors_per_node" in self.job_runtime_conf['job_parameters']:
             self.job_runtime_conf['job_parameters']["eggroll_run"] = \
                 {"eggroll.session.processors.per.node": self.job_runtime_conf['job_parameters']["processors_per_node"]}
         job_parameters = RunParameters(**self.job_runtime_conf['job_parameters'])
         self.job_runtime_conf['job_parameters'] = job_parameters.to_dict()
     return job_parameters
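
The method above accepts two runtime_conf layouts: DSL v2 requires the parameters to be wrapped under job_parameters.common, while DSL v1 keeps them at the top level and may carry a processors_per_node value that is rewritten into eggroll_run. A minimal sketch of both layouts, assuming only the keys the method actually reads; task_parallelism is just an illustrative parameter name:

# Hedged sketch of the two configurations handled by get_common_parameters.
# Only "dsl_version", "job_parameters", "common" and "processors_per_node"
# are keys the method inspects; the other values are illustrative.
dsl_v2_runtime_conf = {
    "dsl_version": 2,
    "job_parameters": {
        "common": {"task_parallelism": 2},  # v2 parameters must sit under "common"
    },
}
dsl_v1_runtime_conf = {
    "dsl_version": 1,
    "job_parameters": {
        "task_parallelism": 2,
        "processors_per_node": 4,  # copied into "eggroll_run" by the v1 branch
    },
}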
Example #2
def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(
        job_id=task.f_job_id, role=task.f_role, party_id=task.f_party_id))
    computing_session_id = generate_session_id(task.f_task_id,
                                               task.f_task_version,
                                               task.f_role, task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(
            f'start run subprocess to stop task session {computing_session_id}'
        )
    else:
        schedule_logger(task.f_job_id).info(
            f'task is waiting, pass stop session {computing_session_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                            task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        'python3', sys.modules[session_utils.SessionStop.__module__].__file__,
        '-j', computing_session_id, '--computing',
        job_parameters.computing_engine, '--federation',
        job_parameters.federation_engine, '--storage',
        job_parameters.storage_engine, '-c',
        'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = run_subprocess(job_id=task.f_job_id,
                       config_dir=task_dir,
                       process_cmd=process_cmd,
                       log_dir=None)
Example #3
 def clean_task(cls, job_id, task_id, task_version, role, party_id,
                content_type):
     status = set()
     if content_type == "metrics":
         tracker = Tracker(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           task_id=task_id,
                           task_version=task_version)
         status.add(tracker.clean_metrics())
     elif content_type == "table":
         jobs = JobSaver.query_job(job_id=job_id,
                                   role=role,
                                   party_id=party_id)
         if jobs:
             job = jobs[0]
             job_parameters = RunParameters(
                 **job.f_runtime_conf_on_party["job_parameters"])
             tracker = Tracker(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               task_id=task_id,
                               task_version=task_version,
                               job_parameters=job_parameters)
             status.add(tracker.clean_task(job.f_runtime_conf_on_party))
     if len(status) == 1 and True in status:
         return True
     else:
         return False
Example #4
def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    if 'reader' in component_name:
        component_parameters = component.get_role_parameters()
        return component_parameters[job.f_role][0]['ReaderParam']
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(
        dsl_parser=dsl_parser,
        job_runtime_conf=job.f_runtime_conf,
        role=job.f_role,
        party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role,
                                          job.f_party_id)
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(
        job_id=job.f_job_id,
        role=job.f_role,
        party_id=job.f_party_id,
        task_id=None,
        task_version=None,
        job_args=job_args_on_party,
        job_parameters=job_parameters,
        task_parameters=task_parameters,
        input_dsl=task_input_dsl,
        get_input_table=True)
    return component_input_table
Example #5
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get(
                                     'src_party_id', None),
                                 dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 role=role,
                                 party_id=party_id)
        job_parameters = RunParameters(**runtime_conf['job_parameters'])
        job_initiator = runtime_conf['initiator']

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=dsl,
            runtime_conf=runtime_conf,
            train_runtime_conf=train_runtime_conf)

        # save new job into db
        if role == job_initiator['role'] and party_id == job_initiator[
                'party_id']:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.WAITING
        roles = job_info['roles']
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        engines_info = cls.get_job_engines_address(
            job_parameters=job_parameters)
        cls.special_role_parameters(role=role, job_parameters=job_parameters)
        cls.check_parameters(job_parameters=job_parameters,
                             engines_info=engines_info)
        runtime_conf["job_parameters"] = job_parameters.to_dict()

        JobSaver.create_job(job_info=job_info)
        job_utils.save_job_conf(job_id=job_id,
                                job_dsl=dsl,
                                job_runtime_conf=runtime_conf,
                                train_runtime_conf=train_runtime_conf,
                                pipeline_dsl=None)

        cls.initialize_tasks(job_id, role, party_id, True, job_initiator,
                             job_parameters, dsl_parser)
        cls.initialize_job_tracker(job_id=job_id,
                                   role=role,
                                   party_id=party_id,
                                   job_info=job_info,
                                   is_initiator=is_initiator,
                                   dsl_parser=dsl_parser)
Example #6
 def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
     if not job_parameters:
         dsl, runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                 role=role,
                                                                                 party_id=party_id)
         job_parameters = RunParameters(**runtime_conf["job_parameters"])
     cores = job_parameters.adaptation_parameters["task_cores_per_node"] * job_parameters.adaptation_parameters[
         "task_nodes"] * job_parameters.task_parallelism
     memory = job_parameters.adaptation_parameters["task_memory_per_node"] * job_parameters.adaptation_parameters[
         "task_nodes"] * job_parameters.task_parallelism
     return job_parameters.computing_engine, cores, memory
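
The job resource here is simply per-node cores (or memory) multiplied by the node count and the task parallelism. A short worked example with made-up numbers, assuming the adaptation_parameters dict shown above:

# Hypothetical values, only to illustrate the arithmetic used by calculate_job_resource.
adaptation_parameters = {"task_cores_per_node": 4, "task_memory_per_node": 1024, "task_nodes": 2}
task_parallelism = 3
cores = adaptation_parameters["task_cores_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism    # 4 * 2 * 3 = 24
memory = adaptation_parameters["task_memory_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism  # 1024 * 2 * 3 = 6144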
Example #7
 def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
     if not task_parameters:
         dsl, runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=task_info["job_id"],
                                                                                 role=task_info["role"],
                                                                                 party_id=task_info["party_id"])
         task_parameters = RunParameters(**runtime_conf["job_parameters"])
     cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                      task_parameters.adaptation_parameters["task_nodes"]
     memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                       task_parameters.adaptation_parameters["task_nodes"]
     return cores_per_task, memory_per_task
Example #8
 def calculate_task_resource(cls,
                             task_parameters: RunParameters = None,
                             task_info: dict = None):
     if not task_parameters:
         job_parameters = job_utils.get_job_parameters(
             job_id=task_info["job_id"],
             role=task_info["role"],
             party_id=task_info["party_id"])
         task_parameters = RunParameters(**job_parameters)
     cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                      task_parameters.adaptation_parameters["task_nodes"]
     memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                       task_parameters.adaptation_parameters["task_nodes"]
     return cores_per_task, memory_per_task
Example #9
 def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
     if not task_parameters:
         job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                       role=task_info["role"],
                                                       party_id=task_info["party_id"])
         task_parameters = RunParameters(**job_parameters)
     if task_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
         cores_per_task = 0
         memory_per_task = 0
     elif task_info["role"] in IGNORE_RESOURCE_ROLES and task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
         cores_per_task = 0
         memory_per_task = 0
     else:
         cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                          task_parameters.adaptation_parameters["task_nodes"]
         memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                           task_parameters.adaptation_parameters["task_nodes"]
     return cores_per_task, memory_per_task
Example #10
 def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
     if not job_parameters:
         job_parameters = job_utils.get_job_parameters(job_id=job_id,
                                                       role=role,
                                                       party_id=party_id)
         job_parameters = RunParameters(**job_parameters)
     if job_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
         cores = 0
         memory = 0
     elif role in IGNORE_RESOURCE_ROLES and job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
         cores = 0
         memory = 0
     else:
         cores = job_parameters.adaptation_parameters["task_cores_per_node"] * job_parameters.adaptation_parameters[
             "task_nodes"] * job_parameters.task_parallelism
         memory = job_parameters.adaptation_parameters["task_memory_per_node"] * job_parameters.adaptation_parameters[
             "task_nodes"] * job_parameters.task_parallelism
     return job_parameters.computing_engine, cores, memory
Example #11
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None), src_party_id=job_info.get('src_party_id', None),
                                 dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=dsl,
                                                       runtime_conf=runtime_conf,
                                                       train_runtime_conf=train_runtime_conf)
        job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(party_id, {})
        schedule_logger(job_id).info('job parameters:{}'.format(job_parameters))
        job_parameters = RunParameters(**job_parameters)

        # save new job into db
        if role == job_info["initiator_role"] and party_id == job_info["initiator_party_id"]:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.WAITING
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
        engines_info = cls.get_job_engines_address(job_parameters=job_parameters)
        cls.check_parameters(job_parameters=job_parameters, role=role, party_id=party_id, engines_info=engines_info)
        job_info["runtime_conf_on_party"]["job_parameters"] = job_parameters.to_dict()
        job_utils.save_job_conf(job_id=job_id,
                                role=role,
                                job_dsl=dsl,
                                job_runtime_conf=runtime_conf,
                                job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
                                train_runtime_conf=train_runtime_conf,
                                pipeline_dsl=None)

        cls.initialize_tasks(job_id=job_id, role=role, party_id=party_id, run_on_this_party=True, initiator_role=job_info["initiator_role"], initiator_party_id=job_info["initiator_party_id"], job_parameters=job_parameters, dsl_parser=dsl_parser)
        job_parameters = job_info['runtime_conf_on_party']['job_parameters']
        roles = job_info['roles']
        cls.initialize_job_tracker(job_id=job_id, role=role, party_id=party_id, job_parameters=job_parameters, roles=roles, is_initiator=is_initiator, dsl_parser=dsl_parser)
        JobSaver.create_job(job_info=job_info)
Example #12
    def run_task(cls):
        task_info = {}
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-v',
                                '--task_version',
                                required=True,
                                type=int,
                                help="task version")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=int,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task parameters")
            parser.add_argument('--run_ip', help="run ip", type=str)
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger(args.job_id).info('enter task process')
            schedule_logger(args.job_id).info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    JOB_SERVER_HOST=args.job_server.split(':')[0],
                    HTTP_PORT=args.job_server.split(':')[1])
                RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            task_version = args.task_version
            role = args.role
            party_id = args.party_id
            executor_pid = os.getpid()
            task_info.update({
                "job_id": job_id,
                "component_name": component_name,
                "task_id": task_id,
                "task_version": task_version,
                "role": role,
                "party_id": party_id,
                "run_ip": args.run_ip,
                "run_pid": executor_pid
            })
            start_time = current_timestamp()
            job_conf = job_utils.get_job_conf(job_id, role)
            job_dsl = job_conf["job_dsl_path"]
            job_runtime_conf = job_conf["job_runtime_conf_path"]
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_dsl,
                runtime_conf=job_runtime_conf,
                train_runtime_conf=job_conf["train_runtime_conf_path"],
                pipeline_dsl=job_conf["pipeline_dsl_path"])
            party_index = job_runtime_conf["role"][role].index(party_id)
            job_args_on_party = TaskExecutor.get_job_args_on_party(
                dsl_parser, job_runtime_conf, role, party_id)
            component = dsl_parser.get_component_info(
                component_name=component_name)
            component_parameters = component.get_role_parameters()
            component_parameters_on_party = component_parameters[role][
                party_index] if role in component_parameters else {}
            module_name = component.get_module()
            task_input_dsl = component.get_input()
            task_output_dsl = component.get_output()
            component_parameters_on_party[
                'output_data_name'] = task_output_dsl.get('data')
            task_parameters = RunParameters(
                **file_utils.load_json_conf(args.config))
            job_parameters = task_parameters
            if job_parameters.assistant_role:
                TaskExecutor.monkey_patch()
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
            task_info["party_status"] = TaskStatus.FAILED
            return
        try:
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log.LoggerFactory.set_directory(directory=task_log_dir,
                                            parent_log_dir=job_log_dir,
                                            append_to_parent_log=True,
                                            force=True)

            tracker = Tracker(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              component_name=component_name,
                              task_id=task_id,
                              task_version=task_version,
                              model_id=job_parameters.model_id,
                              model_version=job_parameters.model_version,
                              component_module_name=module_name,
                              job_parameters=job_parameters)
            tracker_client = TrackerClient(
                job_id=job_id,
                role=role,
                party_id=party_id,
                component_name=component_name,
                task_id=task_id,
                task_version=task_version,
                model_id=job_parameters.model_id,
                model_version=job_parameters.model_version,
                component_module_name=module_name,
                job_parameters=job_parameters)
            run_class_paths = component_parameters_on_party.get(
                'CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_info["party_status"] = TaskStatus.RUNNING
            cls.report_task_update_to_driver(task_info=task_info)

            # init environment, process is shared globally
            RuntimeConfig.init_config(
                WORK_MODE=job_parameters.work_mode,
                COMPUTING_ENGINE=job_parameters.computing_engine,
                FEDERATION_ENGINE=job_parameters.federation_engine,
                FEDERATED_MODE=job_parameters.federated_mode)

            if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
                session_options = task_parameters.eggroll_run.copy()
            else:
                session_options = {}

            sess = session.Session(
                computing_type=job_parameters.computing_engine,
                federation_type=job_parameters.federation_engine)
            computing_session_id = job_utils.generate_session_id(
                task_id, task_version, role, party_id)
            sess.init_computing(computing_session_id=computing_session_id,
                                options=session_options)
            federation_session_id = job_utils.generate_task_version_id(
                task_id, task_version)
            component_parameters_on_party[
                "job_parameters"] = job_parameters.to_dict()
            sess.init_federation(
                federation_session_id=federation_session_id,
                runtime_conf=component_parameters_on_party,
                service_conf=job_parameters.engines_address.get(
                    EngineType.FEDERATION, {}))
            sess.as_default()

            schedule_logger().info('Run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger().info("Component parameters on party {}".format(
                component_parameters_on_party))
            schedule_logger().info("Task input dsl {}".format(task_input_dsl))
            task_run_args = cls.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                task_id=task_id,
                task_version=task_version,
                job_args=job_args_on_party,
                job_parameters=job_parameters,
                task_parameters=task_parameters,
                input_dsl=task_input_dsl,
            )
            if module_name in {"Upload", "Download", "Reader", "Writer"}:
                task_run_args["job_parameters"] = job_parameters
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker_client)
            run_object.set_task_version_id(
                task_version_id=job_utils.generate_task_version_id(
                    task_id, task_version))
            # add profile logs
            profile.profile_start()
            run_object.run(component_parameters_on_party, task_run_args)
            profile.profile_ends()
            output_data = run_object.save_data()
            if not isinstance(output_data, list):
                output_data = [output_data]
            for index in range(0, len(output_data)):
                data_name = task_output_dsl.get(
                    'data')[index] if task_output_dsl.get(
                        'data') else '{}'.format(index)
                persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                    computing_table=output_data[index],
                    output_storage_engine=job_parameters.storage_engine,
                    output_storage_address=job_parameters.engines_address.get(
                        EngineType.STORAGE, {}))
                if persistent_table_namespace and persistent_table_name:
                    tracker.log_output_data_info(
                        data_name=data_name,
                        table_namespace=persistent_table_namespace,
                        table_name=persistent_table_name)
            output_model = run_object.export_model()
            # There is only one model output at the current dsl version.
            tracker.save_output_model(
                output_model, task_output_dsl['model'][0]
                if task_output_dsl.get('model') else 'default')
            task_info["party_status"] = TaskStatus.SUCCESS
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            schedule_logger().exception(e)
        finally:
            try:
                task_info["end_time"] = current_timestamp()
                task_info["elapsed"] = task_info["end_time"] - start_time
                cls.report_task_update_to_driver(task_info=task_info)
            except Exception as e:
                task_info["party_status"] = TaskStatus.FAILED
                traceback.print_exc()
                schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(
            task_id, role, party_id, timestamp_to_date(start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(
            task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
        schedule_logger().info('task {} {} {} takes {}s'.format(
            task_id, role, party_id,
            int(task_info["elapsed"]) / 1000))
        schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id,
            task_info["party_status"]))

        print('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id,
            task_info["party_status"]))
        return task_info
Example #13
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        schedule_logger(job_id).info(
            'try to start job {} task {} {} on {} {} executor subprocess'.
            format(job_id, task_id, task_version, role, party_id))
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name, task_id,
                                    task_version)
            os.makedirs(task_dir, exist_ok=True)
            task_parameters_path = os.path.join(task_dir,
                                                'task_parameters.json')
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            with open(task_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            run_parameters = RunParameters(**run_parameters_dict)

            schedule_logger(job_id=job_id).info(
                f"use computing engine {run_parameters.computing_engine}")

            if run_parameters.computing_engine in {
                    ComputingEngine.EGGROLL, ComputingEngine.STANDALONE
            }:
                process_cmd = [
                    sys.executable,
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ]
            elif run_parameters.computing_engine == ComputingEngine.SPARK:
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = run_parameters.spark_run

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ])
            else:
                raise ValueError(
                    f"${run_parameters.computing_engine} is not supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess is ready'.
                format(job_id, task_id, task_version, role, party_id))
            p = job_utils.run_subprocess(job_id=job_id,
                                         config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_info["party_status"] = TaskStatus.RUNNING
                #task_info["run_pid"] = p.pid
                task_info["start_time"] = current_timestamp()
                task_executor_process_start_status = True
            else:
                task_info["party_status"] = TaskStatus.FAILED
        except Exception as e:
            schedule_logger(job_id).exception(e)
            task_info["party_status"] = TaskStatus.FAILED
        finally:
            try:
                cls.update_task(task_info=task_info)
                cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess start {}'.
                format(
                    job_id, task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
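
In the Spark branch above, run_parameters.spark_run is forwarded to spark-submit: every top-level key except "conf" becomes a --key=value flag, and each entry under "conf" becomes a repeated --conf k=v pair. A hedged example of such a block; only "deploy-mode" and "conf" are keys the code treats specially, the other option names are ordinary spark-submit options shown purely for illustration:

# Illustrative spark_run section; only "client" deploy mode is accepted above.
spark_run = {
    "deploy-mode": "client",
    "num-executors": 2,           # forwarded as --num-executors=2
    "executor-memory": "2g",      # forwarded as --executor-memory=2g
    "conf": {
        "spark.driver.memory": "1g",  # forwarded as --conf spark.driver.memory=1g
    },
}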
Example #14
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
        PrivilegeAuth.authentication_component(
            job_dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            'try to start job {} task {} {} on {} {} executor subprocess'.
            format(job_id, task_id, task_version, role, party_id))
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
            "party_status": TaskStatus.RUNNING
        }
        is_failed = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name, task_id,
                                    task_version)
            os.makedirs(task_dir, exist_ok=True)
            task_parameters_path = os.path.join(task_dir,
                                                'task_parameters.json')
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            run_parameters_dict["src_user"] = kwargs.get("src_user")
            with open(task_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            run_parameters = RunParameters(**run_parameters_dict)

            schedule_logger(job_id=job_id).info(
                f"use computing engine {run_parameters.computing_engine}")
            subprocess = True
            task_info["engine_conf"] = {
                "computing_engine": run_parameters.computing_engine
            }
            backend_engine = build_engine(run_parameters.computing_engine)
            status = backend_engine.run(
                job_id=job_id,
                component_name=component_name,
                task_id=task_id,
                task_version=task_version,
                role=role,
                party_id=party_id,
                task_parameters_path=task_parameters_path,
                run_parameters=run_parameters,
                task_info=task_info,
                user_name=kwargs.get("user_id"))
            if status:
                task_info["start_time"] = current_timestamp()
                task_executor_process_start_status = True
            else:
                is_failed = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
            is_failed = True
        finally:
            try:
                cls.update_task(task_info=task_info)
                cls.update_task_status(task_info=task_info)
                if is_failed:
                    task_info["party_status"] = TaskStatus.FAILED
                    cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess start {}'.
                format(
                    job_id, task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example #15
 def rerun_job(cls, job_id, initiator_role, initiator_party_id,
               component_name):
     schedule_logger(job_id=job_id).info(
         f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}"
     )
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(
             f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}"
         )
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id,
                                     role=initiator_role,
                                     party_id=initiator_party_id,
                                     component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id,
                                     role=initiator_role,
                                     party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(
         dsl=job.f_dsl,
         runtime_conf=job.f_runtime_conf_on_party,
         train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(
                 f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun"
             )
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job,
                                          task=task,
                                          stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job,
                                           task=task,
                                           content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(
                 f"create task {task.f_task_id} new version {task.f_task_version}"
             )
             for _role, _party_ids in job.f_runtime_conf_on_party[
                     "role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(
                         job_id,
                         _role,
                         _party_id,
                         False,
                         job.f_initiator_role,
                         job.f_initiator_party_id,
                         RunParameters(
                             **
                             job.f_runtime_conf_on_party["job_parameters"]),
                         dsl_parser,
                         component_name=task.f_component_name,
                         task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(
                 f"create task {task.f_task_id} new version {task.f_task_version} successfully"
             )
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(
             job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(
                 f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(
                 f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(
             job_id=job_id).info(f"job {job_id} no task to rerun")
Example #16
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # Save the status information of all participants in the initiator for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed, and in a very small probability they will be executed after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, will be run new version waiting task")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Example #17
    def submit(cls, job_data, job_id=None):
        if not job_id:
            job_id = job_utils.generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_initiator = job_runtime_conf['initiator']
        job_parameters = RunParameters(**job_runtime_conf['job_parameters'])
        cls.backend_compatibility(job_parameters=job_parameters)

        job_utils.check_job_runtime_conf(job_runtime_conf)
        if job_parameters.job_type != 'predict':
            # generate job model info
            job_parameters.model_id = model_utils.gen_model_id(job_runtime_conf['role'])
            job_parameters.model_version = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters.to_dict(), ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            tracker = Tracker(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                              model_id=job_parameters.model_id, model_version=job_parameters.model_version)
            pipeline_model = tracker.get_output_model('pipeline')
            if not job_dsl:
                job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)

        path_dict = job_utils.save_job_conf(job_id=job_id,
                                            job_dsl=job_dsl,
                                            job_runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf,
                                            pipeline_dsl=None)

        job = Job()
        job.f_job_id = job_id
        job.f_dsl = job_dsl
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf
        job.f_train_runtime_conf = train_runtime_conf
        job.f_roles = job_runtime_conf['role']
        job.f_work_mode = job_parameters.work_mode
        job.f_initiator_role = job_initiator['role']
        job.f_initiator_party_id = job_initiator['party_id']

        initiator_role = job_initiator['role']
        initiator_party_id = job_initiator['party_id']
        if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
            raise Exception("initiator party id error {}".format(initiator_party_id))

        dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                                       runtime_conf=job_runtime_conf,
                                                       train_runtime_conf=train_runtime_conf)

        cls.adapt_job_parameters(job_parameters=job_parameters)

        # update runtime conf
        job_runtime_conf["job_parameters"] = job_parameters.to_dict()
        job.f_runtime_conf = job_runtime_conf

        status_code, response = FederatedScheduler.create_job(job=job)
        if status_code != FederatedSchedulingStatusCode.SUCCESS:
            raise Exception("create job failed: {}".format(response))

        if job_parameters.work_mode == WorkMode.CLUSTER:
            # Save the status information of all participants in the initiator for scheduling
            for role, party_ids in job_runtime_conf["role"].items():
                for party_id in party_ids:
                    if role == job_initiator['role'] and party_id == job_initiator['party_id']:
                        continue
                    JobController.initialize_tasks(job_id, role, party_id, False, job_initiator, job_parameters, dsl_parser)

        # push into queue
        try:
            JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
        except Exception as e:
            raise Exception(f'push job into queue failed:\n{e}')

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters.model_id))
        board_url = "http://{}:{}{}".format(
            ServiceUtils.get_item("fateboard", "host"),
            ServiceUtils.get_item("fateboard", "port"),
            FATE_BOARD_DASHBOARD_ENDPOINT).format(job_id, job_initiator['role'], job_initiator['party_id'])
        logs_directory = job_utils.get_job_log_directory(job_id)
        return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
               {'model_id': job_parameters.model_id, 'model_version': job_parameters.model_version}, board_url
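
The submit classmethod returns a six-element tuple. A hedged usage sketch, assuming it is exposed on a scheduler class; the DAGScheduler name and the job_data contents below are assumptions for illustration, not values taken from the listing:

# Hypothetical caller; class name and payload are illustrative only.
job_data = {
    "job_dsl": {},  # for predict jobs, falls back to the pipeline model's inference dsl
    "job_runtime_conf": {
        "initiator": {"role": "guest", "party_id": 10000},
        "role": {"guest": [10000], "host": [9999]},
        "job_parameters": {"work_mode": 1, "job_type": "train"},
    },
}
job_id, dsl_path, conf_path, log_dir, model_info, board_url = DAGScheduler.submit(job_data)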
Example #18
    def create_job(cls, job_id, role, party_id, job_info):
        # parse job configuration
        dsl = job_info['dsl']
        runtime_conf = job_info['runtime_conf']
        train_runtime_conf = job_info['train_runtime_conf']
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get(
                                     'src_party_id', None),
                                 dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 role=role,
                                 party_id=party_id)

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=dsl,
            runtime_conf=runtime_conf,
            train_runtime_conf=train_runtime_conf)
        job_parameters = dsl_parser.get_job_parameters().get(role, {}).get(
            party_id, {})
        schedule_logger(job_id).info(
            'job parameters:{}'.format(job_parameters))
        dest_user = dsl_parser.get_job_parameters().get(role, {}).get(
            party_id, {}).get("user", '')
        user = {}
        src_party_id = int(job_info.get('src_party_id')) if job_info.get(
            'src_party_id') else 0
        src_user = dsl_parser.get_job_parameters().get(
            job_info.get('src_role'), {}).get(src_party_id,
                                              {}).get("user", '')
        for _role, party_id_item in dsl_parser.get_job_parameters().items():
            user[_role] = {}
            for _party_id, _parameters in party_id_item.items():
                user[_role][_party_id] = _parameters.get("user", "")
        schedule_logger(job_id).info('job user:{}'.format(user))
        if USE_DATA_AUTHENTICATION:
            job_args = dsl_parser.get_args_input()
            schedule_logger(job_id).info('job args:{}'.format(job_args))
            dataset_dict = cls.get_dataset(False, role, party_id,
                                           runtime_conf.get("role"), job_args)
            dataset_list = []
            if dataset_dict.get(role, {}).get(party_id):
                for k, v in dataset_dict[role][party_id].items():
                    dataset_list.append({
                        "namespace": v.split('.')[0],
                        "table_name": v.split('.')[1]
                    })
            data_authentication_check(
                src_role=job_info.get('src_role'),
                src_party_id=job_info.get('src_party_id'),
                src_user=src_user,
                dest_user=dest_user,
                dataset_list=dataset_list)
        job_parameters = RunParameters(**job_parameters)

        # save new job into db
        if role == job_info["initiator_role"] and party_id == job_info[
                "initiator_party_id"]:
            is_initiator = True
        else:
            is_initiator = False
        job_info["status"] = JobStatus.WAITING
        job_info["user_id"] = dest_user
        job_info["src_user"] = src_user
        job_info["user"] = user
        # this party configuration
        job_info["role"] = role
        job_info["party_id"] = party_id
        job_info["is_initiator"] = is_initiator
        job_info["progress"] = 0
        cls.adapt_job_parameters(role=role, job_parameters=job_parameters)
        engines_info = cls.get_job_engines_address(
            job_parameters=job_parameters)
        cls.check_parameters(job_parameters=job_parameters,
                             role=role,
                             party_id=party_id,
                             engines_info=engines_info)
        job_info["runtime_conf_on_party"][
            "job_parameters"] = job_parameters.to_dict()
        job_utils.save_job_conf(
            job_id=job_id,
            role=role,
            job_dsl=dsl,
            job_runtime_conf=runtime_conf,
            job_runtime_conf_on_party=job_info["runtime_conf_on_party"],
            train_runtime_conf=train_runtime_conf,
            pipeline_dsl=None)

        cls.initialize_tasks(job_id=job_id,
                             role=role,
                             party_id=party_id,
                             run_on_this_party=True,
                             initiator_role=job_info["initiator_role"],
                             initiator_party_id=job_info["initiator_party_id"],
                             job_parameters=job_parameters,
                             dsl_parser=dsl_parser)
        job_parameters = job_info['runtime_conf_on_party']['job_parameters']
        roles = job_info['roles']
        cls.initialize_job_tracker(job_id=job_id,
                                   role=role,
                                   party_id=party_id,
                                   job_parameters=job_parameters,
                                   roles=roles,
                                   is_initiator=is_initiator,
                                   dsl_parser=dsl_parser)
        JobSaver.create_job(job_info=job_info)