Example #1
 def get_process_dirs(cls,
                      worker_name: WorkerName,
                      job_id=None,
                      role=None,
                      party_id=None,
                      task: Task = None):
     worker_id = base_utils.new_unique_id()
     party_id = str(party_id)
     if task:
         config_dir = job_utils.get_job_directory(job_id, role, party_id,
                                                  task.f_component_name,
                                                  task.f_task_id,
                                                  str(task.f_task_version),
                                                  worker_name.value,
                                                  worker_id)
         log_dir = job_utils.get_job_log_directory(job_id, role, party_id,
                                                   task.f_component_name)
     elif job_id and role and party_id:
         config_dir = job_utils.get_job_directory(job_id, role, party_id,
                                                  worker_name.value,
                                                  worker_id)
         log_dir = job_utils.get_job_log_directory(job_id, role, party_id,
                                                   worker_name.value,
                                                   worker_id)
     else:
         config_dir = job_utils.get_general_worker_directory(
             worker_name.value, worker_id)
         log_dir = job_utils.get_general_worker_log_directory(
             worker_name.value, worker_id)
     os.makedirs(config_dir, exist_ok=True)
     return worker_id, config_dir, log_dir
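All of the examples in this section resolve their working directories through job_utils.get_job_directory and job_utils.get_job_log_directory. As a point of reference, here is a minimal sketch of what such helpers might look like, assuming they simply join a configured base directory with the given path segments; the base paths below are illustrative, not taken from the examples:

import os

# Hypothetical base directories; a real deployment would read these
# from its service configuration.
JOBS_DIR = "/data/projects/fate/jobs"
LOGS_DIR = "/data/projects/fate/logs"

def get_job_directory(job_id, *args):
    # <JOBS_DIR>/<job_id>/<optional extra segments>
    return os.path.join(JOBS_DIR, job_id, *args)

def get_job_log_directory(job_id, *args):
    # <LOGS_DIR>/<job_id>/<optional extra segments>
    return os.path.join(LOGS_DIR, job_id, *args)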
Example #2
def download_upload(access_module):
    job_id = job_utils.generate_job_id()
    if access_module == "upload" and UPLOAD_DATA_FROM_CLIENT and not (request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        filename = os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp', file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            shutil.rmtree(os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        job_config = request.args.to_dict()
    if "namespace" not in job_config or "table_name" not in job_config:
        # clients newer than 1.5.1 send the whole config (including
        # eggroll run parameters) as a single JSON-encoded query key
        job_config = json_loads(list(job_config.keys())[0])
        job_config['file'] = filename
    else:
        job_config = request.json
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('cannot support this operation: {}'.format(access_module))
    detect_utils.check_config(job_config, required_arguments=required_arguments)
    data = {}
    # compatibility: newer code reads "name", so mirror table_name into it
    if "table_name" in job_config:
        job_config["name"] = job_config["table_name"]
    if "backend" not in job_config:
        job_config["backend"] = 0
    for key in ["work_mode", "backend", "head", "partition", "drop"]:
        if key in job_config:
            job_config[key] = int(job_config[key])
    if access_module == "upload":
        job_config["destroy"] = job_config.get('drop', 0) == 1
        data['table_name'] = job_config["table_name"]
        data['namespace'] = job_config["namespace"]
        data_table_meta = storage.StorageTableMeta(name=job_config["table_name"], namespace=job_config["namespace"])
        if data_table_meta and not job_config["destroy"]:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter -drop.'
                                          ' 0 means not to delete and continue uploading, '
                                          '1 means to upload again after deleting the table')
    job_dsl, job_runtime_conf = gen_data_access_job_config(job_config, access_module)
    submit_result = DAGScheduler.submit({'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}, job_id=job_id)
    data.update(submit_result)
    return get_json_result(job_id=job_id, data=data)
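The re-parsing branch above depends on a convention where clients newer than 1.5.1 pack the entire job config into a single JSON-encoded query key, while older clients send plain key/value parameters. A small standalone sketch of that parsing trick, using a simulated request.args dict with illustrative values:

import json

# Simulated request.args.to_dict() from a newer client: the whole
# config arrives as one JSON-encoded key with an empty value.
args = {json.dumps({"namespace": "ns", "table_name": "t1"}): ""}

if "namespace" in args and "table_name" in args:
    job_config = args                              # legacy key/value style
else:
    job_config = json.loads(list(args.keys())[0])  # JSON-in-key style

assert job_config["table_name"] == "t1"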
Example #3
 def start_task(job_id, component_name, task_id, role, party_id,
                task_config):
     schedule_logger.info('job {} {} {} {} task subprocess is ready'.format(
         job_id, component_name, role, party_id))
     task_process_start_status = False
     try:
         task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                 role, party_id, component_name)
         os.makedirs(task_dir, exist_ok=True)
         task_config_path = os.path.join(task_dir, 'task_config.json')
         with open(task_config_path, 'w') as fw:
             json.dump(task_config, fw)
         process_cmd = [
             'python3', sys.modules[TaskExecutor.__module__].__file__, '-j',
             job_id, '-n', component_name, '-t', task_id, '-r', role, '-p',
             party_id, '-c', task_config_path
         ]
         task_log_dir = os.path.join(
             job_utils.get_job_log_directory(job_id=job_id), role, party_id,
             component_name)
         schedule_logger.info(
             'job {} {} {} {} task subprocess start'.format(
                 job_id, component_name, role, party_id))
         p = job_utils.run_subprocess(config_dir=task_dir,
                                      process_cmd=process_cmd,
                                      log_dir=task_log_dir)
         if p:
             task_process_start_status = True
     except Exception as e:
         schedule_logger.exception(e)
     finally:
         schedule_logger.info(
             'job {} component {} on {} {} start task subprocess {}'.format(
                 job_id, component_name, role, party_id,
                 'success' if task_process_start_status else 'failed'))
Example #4
 def start_inheriting_job(cls, job):
     JobSaver.update_job(
         job_info={
             "job_id": job.f_job_id,
             "role": job.f_role,
             "party_id": job.f_party_id,
             "inheritance_status": JobInheritanceStatus.RUNNING
         })
     conf_dir = job_utils.get_job_directory(job_id=job.f_job_id)
     os.makedirs(conf_dir, exist_ok=True)
     process_cmd = [
         sys.executable or 'python3',
         sys.modules[JobInherit.__module__].__file__,
         '--job_id',
         job.f_job_id,
         '--role',
         job.f_role,
         '--party_id',
         job.f_party_id,
     ]
     log_dir = os.path.join(
         job_utils.get_job_log_directory(job_id=job.f_job_id),
         "job_inheritance")
     p = process_utils.run_subprocess(job_id=job.f_job_id,
                                      config_dir=conf_dir,
                                      process_cmd=process_cmd,
                                      log_dir=log_dir,
                                      process_name="job_inheritance")
Example #5
 def upload_file_block(self, file_list, data_head, table_list):
     if data_head:
         self.update_table_meta(data_head)
     upload_process = []
     for block_index, block_file in enumerate(file_list):
         task_dir = os.path.join(
             job_utils.get_job_directory(job_id=self.tracker.job_id),
             self.tracker.role, str(self.tracker.party_id),
             self.tracker.component_name, 'upload')
         os.makedirs(task_dir, exist_ok=True)
         process_cmd = [
             sys.executable or 'python3',
             sys.modules[upload_utils.UploadFile.__module__].__file__,
             '--session_id', self.session_id, '--storage',
             self.storage_engine, '--file', block_file, '--namespace',
             table_list[block_index].get("namespace"), '--name',
             table_list[block_index].get("name"), '--partitions',
             self.parameters.get('partition')
         ]
         LOGGER.info(process_cmd)
         job_log_dir = os.path.join(
             job_utils.get_job_log_directory(job_id=self.tracker.job_id),
             self.tracker.role, str(self.tracker.party_id))
         task_log_dir = os.path.join(job_log_dir,
                                     self.tracker.component_name,
                                     f'block_{block_index}')
         p = process_utils.run_subprocess(job_id=self.tracker.job_id,
                                          config_dir=task_dir,
                                          process_cmd=process_cmd,
                                          log_dir=task_log_dir)
         upload_process.append(p)
     self.check_upload_process(upload_process)
     self.union_table(table_list)
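check_upload_process is not shown in this example; a plausible minimal version (written as a standalone function), assuming it only needs to wait for every block subprocess and surface a nonzero exit code:

def check_upload_process(upload_process):
    # Wait for each block uploader; fail on the first bad exit code.
    for p in upload_process:
        if p.wait() != 0:
            raise RuntimeError(
                f"upload block subprocess {p.pid} exited with code {p.returncode}")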
Example #6
 def run(job_id, component_name, task_id, task_version, role, party_id,
         task_parameters_path, task_info, **kwargs):
     process_cmd = [
         sys.executable,
         sys.modules[TaskExecutor.__module__].__file__,
         '-j',
         job_id,
         '-n',
         component_name,
         '-t',
         task_id,
         '-v',
         task_version,
         '-r',
         role,
         '-p',
         party_id,
         '-c',
         task_parameters_path,
         '--run_ip',
         RuntimeConfig.JOB_SERVER_HOST,
         '--job_server',
         '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                        RuntimeConfig.HTTP_PORT),
     ]
     task_log_dir = os.path.join(
         job_utils.get_job_log_directory(job_id=job_id), role, party_id,
         component_name)
     task_job_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                 role, party_id, component_name)
     schedule_logger(job_id).info(
         'job {} task {} {} on {} {} executor subprocess is ready'.format(
             job_id, task_id, task_version, role, party_id))
     task_dir = os.path.dirname(task_parameters_path)
     p = job_utils.run_subprocess(job_id=job_id,
                                  config_dir=task_dir,
                                  process_cmd=process_cmd,
                                  log_dir=task_log_dir,
                                  job_dir=task_job_dir)
     task_info["run_pid"] = p.pid
     return p
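Every example hands process launching to a run_subprocess helper. A minimal sketch of such a wrapper, assuming it redirects stdout/stderr into files under log_dir, runs the worker from config_dir, and returns the Popen handle; the real FATE helper also records pid files and handles platform details:

import os
import subprocess

def run_subprocess(process_cmd, config_dir, log_dir, **kwargs):
    # Capture the worker's output under its per-task log directory.
    os.makedirs(log_dir, exist_ok=True)
    std_log = open(os.path.join(log_dir, "std.log"), "a")
    error_log = open(os.path.join(log_dir, "error.log"), "a")
    return subprocess.Popen(process_cmd,
                            cwd=config_dir,
                            stdout=std_log,
                            stderr=error_log)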
Example #7
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('cannot support this operation: {}'.format(module))
    detect_utils.check_config(request_config, required_arguments=required_arguments)
    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(), request_config["file"])
    try:
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=request_config.get('local', {}).get("role"),
                                          party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = ["python3",
                 os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
                 "-j", _job_id,
                 "-c", conf_file_path
                 ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101), job_id=_job_id,
                               data={'table_name': request_config['table_name'],
                                     'namespace': request_config['namespace'], 'pid': p.pid if p else ''})
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
Example #8
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
        PrivilegeAuth.authentication_component(
            job_dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess"
        )
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        is_failed = False
        try:
            task = JobSaver.query_task(task_id=task_id,
                                       task_version=task_version,
                                       role=role,
                                       party_id=party_id)[0]
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            run_parameters_dict["src_user"] = kwargs.get("src_user")
            run_parameters = RunParameters(**run_parameters_dict)

            config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                      component_name, task_id,
                                                      task_version)
            os.makedirs(config_dir, exist_ok=True)

            run_parameters_path = os.path.join(config_dir,
                                               'task_parameters.json')
            with open(run_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            schedule_logger(job_id).info(
                f"use computing engine {run_parameters.computing_engine}")
            task_info["engine_conf"] = {
                "computing_engine": run_parameters.computing_engine
            }
            backend_engine = build_engine(run_parameters.computing_engine)
            run_info = backend_engine.run(
                task=task,
                run_parameters=run_parameters,
                run_parameters_path=run_parameters_path,
                config_dir=config_dir,
                log_dir=job_utils.get_job_log_directory(
                    job_id, role, party_id, component_name),
                cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                    component_name),
                user_name=kwargs.get("user_id"))
            task_info.update(run_info)
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
            is_failed = True
        finally:
            try:
                cls.update_task(task_info=task_info)
                task_info["party_status"] = TaskStatus.RUNNING
                cls.update_task_status(task_info=task_info)
                if is_failed:
                    task_info["party_status"] = TaskStatus.FAILED
                    cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                "task {} {} on {} {} executor subprocess start {}".format(
                    task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example #9
    def run_task(job_id, component_name, task_id, role, party_id, task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning("failed to get backend, set as 0")

            backend = Backend(backend)

            if backend.is_eggroll():
                process_cmd = [
                    'python3', sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                    '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--job_server',
                    '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ])
            else:
                raise ValueError(f"{backend} is not supported")

            task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
            p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(job_id, component_name, role, party_id,
                                                                               'success' if task_process_start_status else 'failed'))
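Tracing the Spark branch above for a concrete spark_submit_config makes the resulting command easier to see; the paths and values here are illustrative only:

spark_submit_config = {
    "deploy-mode": "client",
    "num-executors": 2,
    "conf": {"spark.executor.memory": "2g"},
}

process_cmd = ["/opt/spark/bin/spark-submit", "--name=task_0#guest"]
for k, v in spark_submit_config.items():
    if k != "conf":
        process_cmd.append(f"--{k}={v}")
for ck, cv in spark_submit_config.get("conf", {}).items():
    process_cmd.extend(["--conf", f"{ck}={cv}"])

print(" ".join(process_cmd))
# /opt/spark/bin/spark-submit --name=task_0#guest --deploy-mode=client
# --num-executors=2 --conf spark.executor.memory=2g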
Example #10
def download_upload(access_module):
    job_id = generate_job_id()
    if access_module == "upload" and USE_LOCAL_DATA and not (
            request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        filename = os.path.join(get_job_directory(job_id), 'fate_upload_tmp',
                                file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            shutil.rmtree(os.path.join(get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        request_config = request.args.to_dict()
        request_config['file'] = filename
    else:
        request_config = request.json
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module == 'download':
        required_arguments.extend(['output_path'])
    elif access_module == 'download_test':
        required_arguments.extend(['output_path'])
    else:
        raise Exception(
            'cannot support this operation: {}'.format(access_module))
    detect_utils.check_config(request_config,
                              required_arguments=required_arguments)
    data = {}
    if access_module == "upload":
        data['table_name'] = request_config["table_name"]
        data['namespace'] = request_config["namespace"]
        if WORK_MODE != 0:
            data_table = session.get_data_table(
                name=request_config["table_name"],
                namespace=request_config["namespace"])
            count = data_table.count()
            if count and int(request_config.get('drop', 2)) == 2:
                return get_json_result(
                    retcode=100,
                    retmsg='The data table already exists, table data count: {}. '
                    'If you still want to continue uploading, please add the parameter -drop. '
                    '0 means not to delete and continue uploading, '
                    '1 means to upload again after deleting the table'.format(
                        count))
            elif count and int(request_config.get('drop', 2)) == 1:
                data_table.destroy()
    job_dsl, job_runtime_conf = gen_data_access_job_config(
        request_config, access_module)
    job_id, job_dsl_path, job_runtime_conf_path, logs_directory, model_info, board_url = JobController.submit_job(
        {
            'job_dsl': job_dsl,
            'job_runtime_conf': job_runtime_conf
        },
        job_id=job_id)
    data.update({
        'job_dsl_path': job_dsl_path,
        'job_runtime_conf_path': job_runtime_conf_path,
        'board_url': board_url,
        'logs_directory': logs_directory
    })
    return get_json_result(job_id=job_id, data=data)
Example #11
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(
        _job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('cannot support this operation: {}'.format(module))
    detect_utils.check_config(request_config,
                              required_arguments=required_arguments)
    job_work_mode = request_config['work_mode']
    # TODO: this duplicates job_app/submit_job; a future version of this function will route through job_app/submit_job
    if job_work_mode != RuntimeConfig.WORK_MODE:
        if RuntimeConfig.WORK_MODE == WorkMode.CLUSTER and job_work_mode == WorkMode.STANDALONE:
            # use cluster standalone job server to execute standalone job
            return request_execute_server(
                request=request,
                execute_host='{}:{}'.format(
                    request.remote_addr, CLUSTER_STANDALONE_JOB_SERVER_PORT))
        else:
            raise Exception(
                'server run on standalone can not support cluster mode job')

    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(
                file_utils.get_project_base_directory(),
                request_config["file"])
    try:
        conf_file_path = new_runtime_conf(
            job_dir=_job_dir,
            method=data_func,
            module=module,
            role=request_config.get('local', {}).get("role"),
            party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = [
            "python3",
            os.path.join(file_utils.get_project_base_directory(),
                         JOB_MODULE_CONF[module]["module_path"]), "-j",
            _job_id, "-c", conf_file_path
        ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101),
                               job_id=_job_id,
                               data={
                                   'table_name': request_config['table_name'],
                                   'namespace': request_config['namespace'],
                                   'pid': p.pid if p else ''
                               })
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
Example #12
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id):
        """
        Start task, update status and party status
        :param job_id:
        :param component_name:
        :param task_id:
        :param task_version:
        :param role:
        :param party_id:
        :return:
        """
        schedule_logger(job_id).info(
            'try to start job {} task {} {} on {} {} executor subprocess'.
            format(job_id, task_id, task_version, role, party_id))
        task_executor_process_start_status = False
        task_info = {
            "job_id": job_id,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
        }
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name, task_id,
                                    task_version)
            os.makedirs(task_dir, exist_ok=True)
            task_parameters_path = os.path.join(task_dir,
                                                'task_parameters.json')
            run_parameters_dict = job_utils.get_job_parameters(
                job_id, role, party_id)
            with open(task_parameters_path, 'w') as fw:
                fw.write(json_dumps(run_parameters_dict))

            run_parameters = RunParameters(**run_parameters_dict)

            schedule_logger(job_id=job_id).info(
                f"use computing engine {run_parameters.computing_engine}")

            if run_parameters.computing_engine in {
                    ComputingEngine.EGGROLL, ComputingEngine.STANDALONE
            }:
                process_cmd = [
                    sys.executable,
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ]
            elif run_parameters.computing_engine == ComputingEngine.SPARK:
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = run_parameters.spark_run

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.append(f'--conf')
                        process_cmd.append(f'{ck}={cv}')
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-v',
                    task_version,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_parameters_path,
                    '--run_ip',
                    RuntimeConfig.JOB_SERVER_HOST,
                    '--job_server',
                    '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST,
                                   RuntimeConfig.HTTP_PORT),
                ])
            else:
                raise ValueError(
                    f"{run_parameters.computing_engine} is not supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess is ready'.
                format(job_id, task_id, task_version, role, party_id))
            p = job_utils.run_subprocess(job_id=job_id,
                                         config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_info["party_status"] = TaskStatus.RUNNING
                #task_info["run_pid"] = p.pid
                task_info["start_time"] = current_timestamp()
                task_executor_process_start_status = True
            else:
                task_info["party_status"] = TaskStatus.FAILED
        except Exception as e:
            schedule_logger(job_id).exception(e)
            task_info["party_status"] = TaskStatus.FAILED
        finally:
            try:
                cls.update_task(task_info=task_info)
                cls.update_task_status(task_info=task_info)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            schedule_logger(job_id).info(
                'job {} task {} {} on {} {} executor subprocess start {}'.
                format(
                    job_id, task_id, task_version, role, party_id, "success"
                    if task_executor_process_start_status else "failed"))
Example #13
    def start_task(job_id, component_name, task_id, role, party_id,
                   task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready'.format(
                job_id, component_name, role, party_id))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning(
                    "failed to get backend, set as 0")

            backend = Backend(backend)

            if backend.is_eggroll() or backend.is_eggroll2():
                process_cmd = [
                    'python3',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_submit_config = task_config['job_parameters'].get(
                    "spark_submit_config", dict())
                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                queue = spark_submit_config.get("queue", "default")
                driver_memory = spark_submit_config.get("driver-memory", "1g")
                num_executors = spark_submit_config.get("num-executors", 2)
                executor_memory = spark_submit_config.get(
                    "executor-memory", "1g")
                executor_cores = spark_submit_config.get("executor-cores", 1)

                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")
                spark_home = os.environ["SPARK_HOME"]
                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [
                    spark_submit_cmd,
                    f'--name={task_id}#{role}',
                    f'--deploy-mode={deploy_mode}',
                    f'--queue={queue}',
                    f'--driver-memory={driver_memory}',
                    f'--num-executors={num_executors}',
                    f'--executor-memory={executor_memory}',
                    f'--executor-cores={executor_cores}',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            else:
                raise ValueError(f"{backend} is not supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start'.format(
                    job_id, component_name, role, party_id))
            p = job_utils.run_subprocess(config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(
                    job_id, component_name, role, party_id,
                    'success' if task_process_start_status else 'failed'))
Example #14
    def run_task():
        # signal.signal(signal.SIGTERM, job_utils.onsignal_term)
        task = Task()
        task.f_create_time = current_timestamp()
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=str,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task config")
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger(args.job_id).info('enter task process')
            schedule_logger(args.job_id).info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    HTTP_PORT=args.job_server.split(':')[1])
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            role = args.role
            party_id = int(args.party_id)
            executor_pid = os.getpid()
            #job_utils.task_killed_detector(job_id, role, party_id, component_name, executor_pid)
            task_config = file_utils.load_json_conf(args.config)
            job_parameters = task_config['job_parameters']
            job_initiator = task_config['job_initiator']
            job_args = task_config['job_args']
            task_input_dsl = task_config['input']
            task_output_dsl = task_config['output']
            parameters = TaskExecutor.get_parameters(job_id, component_name,
                                                     role, party_id)
            # parameters = task_config['parameters']
            module_name = task_config['module_name']
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
            task.f_status = TaskStatus.FAILED
            return
        try:
            # init environment, process is shared globally
            RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                      BACKEND=2)
            # BACKEND=job_parameters.get('backend', 2))
            session.init(job_id='{}_{}_{}'.format(task_id, role, party_id),
                         mode=RuntimeConfig.WORK_MODE,
                         backend=RuntimeConfig.BACKEND)
            federation.init(job_id=task_id, runtime_conf=parameters)
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                                  parent_log_dir=job_log_dir,
                                                  append_to_parent_log=True,
                                                  force=True)

            task.f_job_id = job_id
            task.f_component_name = component_name
            task.f_task_id = task_id
            task.f_role = role
            task.f_party_id = party_id
            task.f_operator = 'python_operator'
            tracker = Tracking(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               component_name=component_name,
                               task_id=task_id,
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'],
                               module_name=module_name)
            task.f_start_time = current_timestamp()
            task.f_run_ip = get_lan_ip()
            task.f_run_pid = executor_pid
            run_class_paths = parameters.get('CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_run_args = TaskExecutor.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                job_parameters=job_parameters,
                job_args=job_args,
                input_dsl=task_input_dsl)
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker)
            run_object.set_taskid(taskid=task_id)
            task.f_status = TaskStatus.RUNNING
            TaskExecutor.sync_task_status(
                job_id=job_id,
                component_name=component_name,
                task_id=task_id,
                role=role,
                party_id=party_id,
                initiator_party_id=job_initiator.get('party_id', None),
                initiator_role=job_initiator.get('role', None),
                task_info=task.to_json())

            schedule_logger().info('run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger().info(parameters)
            schedule_logger().info(task_input_dsl)
            run_object.run(parameters, task_run_args)
            output_data = run_object.save_data()
            tracker.save_output_data_table(
                output_data,
                task_output_dsl.get('data')[0]
                if task_output_dsl.get('data') else 'component')
            output_model = run_object.export_model()
            # There is only one model output at the current dsl version.
            tracker.save_output_model(
                output_model, task_output_dsl['model'][0]
                if task_output_dsl.get('model') else 'default')
            task.f_status = TaskStatus.SUCCESS
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
            task.f_status = TaskStatus.FAILED
        finally:
            sync_success = False
            try:
                task.f_end_time = current_timestamp()
                task.f_elapsed = task.f_end_time - task.f_start_time
                task.f_update_time = current_timestamp()
                TaskExecutor.sync_task_status(
                    job_id=job_id,
                    component_name=component_name,
                    task_id=task_id,
                    role=role,
                    party_id=party_id,
                    initiator_party_id=job_initiator.get('party_id', None),
                    initiator_role=job_initiator.get('role', None),
                    task_info=task.to_json())
                sync_success = True
            except Exception as e:
                traceback.print_exc()
                schedule_logger().exception(e)
        schedule_logger().info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id,
            task.f_status if sync_success else TaskStatus.FAILED))
        print('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id,
            task.f_status if sync_success else TaskStatus.FAILED))
        while True:
            time.sleep(0.5)
            kill_path = os.path.join(job_utils.get_job_directory(job_id),
                                     str(role), str(party_id), component_name,
                                     'kill')
            if os.path.exists(kill_path):
                try:
                    session.stop()
                except Exception:
                    pass
                break
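The polling loop that closes the last example is one half of a file-based kill handshake. A sketch of the other half, reusing the hypothetical get_job_directory helper sketched after Example #1: the scheduler asks the executor to stop by touching the sentinel file it polls for.

import os

def request_task_kill(job_id, role, party_id, component_name):
    # Create the 'kill' sentinel that the executor's loop watches for.
    kill_path = os.path.join(get_job_directory(job_id), str(role),
                             str(party_id), component_name, 'kill')
    os.makedirs(os.path.dirname(kill_path), exist_ok=True)
    open(kill_path, 'w').close()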